#### # function prototype #### # myginv(a, tol=1.e-10) # GetDirection(y1, y2, alpha=0.03, alpha0=0.05, ITMAX=10) # GetDirection2(mu1, Sigma1, mu2, Sigma2, ITMAX=10) # LowerJstar(n1, m1, tau1, n2, m2, tau2, alpha, alpha0) # LowerJstar2(a, n1, mu1, Sigma1, n2, mu2, Sigma2, alpha, alpha0) # SepIndex(mu1, tau1, mu2, tau2, alpha) # SepIndex2(a, mu1, Sigma1, mu2, Sigma2, alpha) # SepIndex2.data(a, n1, mu1, Sigma1, n2, mu2, Sigma2, alpha) # Proj.MeanCov(a, mu1, Sigma1, mu2, Sigma2) # Proj.MeanCov.data(a, n1, mu1, Sigma1, n2, mu2, Sigma2) # GetTau(n1, mu1, tau1, n2, mu2, tau2, alpha) # GetJMatrix(y, cl, th.Jstar=0.03, alpha=0.03, alpha0=0.05, ITMAX=10, ifoutlierdetect=F) # MergeOrNot2(y1, y2, th.Jstar=0.03, alpha=0.03, alpha0=0.05, ITMAX=10, ifoutlierdetect=F) # SepIndex.nonnormal(x1, x2, alpha) # OneCluster(n,p) ############################################################## # Iteratively get projection directions by maximizing the separation index of # projected data, given data. # y1 and y2 are n1xp and n2xp matrices respectively. p>1. # alpha -- tuning parameter reflecting the percentage in the two # tails that might be outlying. # ITMAX -- maximum iteration allowed when to iteratively calculate the # optimal projection direction. GetDirection<-function(y1, y2, alpha=0.03, alpha0=0.05, ITMAX=10) { y1<-as.matrix(y1); y2<-as.matrix(y2); p<-ncol(y1); myeps<-(1.0e-4); n1<-nrow(y1); n2<-nrow(y2); #cat("n1=", n1, " n2=", n2, "\n") # obtain mean vectors and covariance matrices if(n1>1) { mu1<-apply(y1, 2, mean); Sigma1<-cov(y1); } else { mu1<-as.vector(y1); Sigma1<-matrix(0, nrow=p, ncol=p); } if(n2>1) { mu2<-apply(y2, 2, mean); Sigma2<-cov(y2); } else { mu2<-as.vector(y2); Sigma2<-matrix(0, nrow=p, ncol=p); } # obtain the projection direction tmp<-GetDirection2(mu1,Sigma1,mu2,Sigma2,ITMAX) a<-as.vector(tmp$a); a.ini<-as.vector(tmp$a.ini); x1<-as.vector(y1%*%a); x2<-as.vector(y2%*%a); # obtain the normal version of the separation index Jstar and its lower bound tmp2<-LowerJstar2(a, n1, mu1, Sigma1, n2, mu2, Sigma2, alpha=alpha, alpha0=alpha0) # obtain the quantile version of the separation index Jstar tmp3<-SepIndex.nonnormal(x1, x2, alpha) return(list(a=a,a.ini=a.ini,Jstar=tmp2$Jstar,JL=tmp2$JL, Jstar.q=tmp3$Jstar, x1=x1,x2=x2)) } # Iteratively get projection direction by maximizing the separation index of # projected data, given mean vectors and covariance matrices (theoretical # version). # mu1, Sigma1 -- mean vector and covariance matrix of cluster 1 # mu2, Sigma2 -- mean vector and covariance matrix of cluster 2 # ITMAX -- maximum iteration allowed when to iteratively calculate the # optimal projection direction. GetDirection2<-function(mu1, Sigma1, mu2, Sigma2, ITMAX=10) { myeps<-(1.0e-4); p<-length(mu1); # dimension of data mu1<-as.vector(mu1) Sigma1<-as.matrix(Sigma1) mu2<-as.vector(mu2) Sigma2<-as.matrix(Sigma2) # get initial projection direction a=(mu2-mu1)/||mu2-mu1|| denom<-sqrt(sum((mu2-mu1)^2)) if(denom<10e-10) # two clusters stack together { # any direction is okay a<-rep(1, p) return(list(a=a, a.ini=rep(0, p))) } #cat("denom=", denom, "\n") a.ini<-as.vector(mu2-mu1)/denom theta<-a.ini; a<-a.ini; if(sum(diag(Sigma1))<1.0e-10) { return(list(a=a,a.ini=a.ini)) } if(sum(diag(Sigma2))<1.0e-10) { return(list(a=a,a.ini=a.ini)) } # iteratively find projection direction loop<-0; diff<-100; while(loopmyeps) { loop<-loop+1 # obtain tmp1=a^T*Sigma1*a and tmp2=a^T*Sigma2*a s1a<-as.vector(Sigma1%*%a); as1a<-sum(a*s1a); if(abs(as1a)<1.0e-6) { tmp1<- 0 } else { tmp1<-sqrt(as1a) } s2a<-as.vector(Sigma2%*%a); as2a<-sum(a*s2a); #cat("as2a=", as2a, "\n") if(abs(as2a)<1.0e-6) { tmp2<- 0 } else { tmp2<-sqrt(as2a) } if(abs(tmp1)<1.0e-6 || abs(tmp2)<1.0e-10) { a<-a.ini; break; } tmp3<-tmp1+tmp2; tmp4<-Sigma1/tmp1+Sigma2/tmp2; # obtain the generalized inverse if tmp4 is singluar. if(p>1){ itmp4<-myginv(tmp4); } else if(p==1) { if(abs(tmp4)<1.0e-10) { stop("error in GetDirection2()! tmp4=0!\n") } else { itmp4<-1/tmp4 } } tmp5<-as.vector(itmp4%*%theta); a.new<-as.vector(tmp3*tmp5); # normalize a.new a.new<-as.vector(a.new/sqrt(sum(a.new^2))); diff<-sum((a.new-a)^2); a<-a.new; } a<-a/max(abs(a)) a.ini<-a.ini/max(abs(a.ini)) res<-list(a=a, a.ini=a.ini) return(res) } # Calculate (1-alpha0)% lower CI of J* given projected means and variances # make sure the projected cluster 1 is in the leftside of the projected # cluster 2 # n1, m1, tau1 -- size, MLE of the mean and standard devitation of cluster 1 # n2, m2, tau2 -- size, MLE of the mean and standard devitation of cluster 2 # alpha -- tuning parameter reflecting the percentage in the two # tails that might be outlying. # alpha0 -- confidence level when calcuating the confidence lower bound of # the separation index J* LowerJstar<-function(n1, m1, tau1, n2, m2, tau2, alpha, alpha0) { # get estimate of J^* tmp<-SepIndex(m1, tau1, m2, tau2, alpha) Jstar<-tmp$Jstar # get estimate standard error of hat{J}^* tauhat<-GetTau(n1, m1, tau1, n2, m2, tau2, alpha) # obtain the lower bound n<-n1+n2; tmp1<-pi*Jstar/2.0; tmp2<-tan(tmp1); Za0<-qnorm(1-alpha0) # alpha0 upper percentile of the standard normal distr. tmp3<-Za0*pi*tauhat/(2.0*sqrt(n)*(cos(tmp1))^2) tmp4<-2.0/pi # JL is the asymptotic (1-alpha0)% confidence lower bound of J^* JL<-tmp4*atan(tmp2-tmp3) res<-list(Jstar=Jstar, JL=JL) return(res) } # Calculate (1-alpha0)% lower CI of J*, given projection direction, mean # vectors and covariance matrices. # a -- projection direction # n1, mu1, Sigma1 -- size, mean vector and covariance matrix of cluster 1 # n2, mu2, Sigma2 -- size, mean vector and covariance matrix of cluster 2 # alpha -- tuning parameter reflecting the percentage in the two # tails that might be outlying. # alpha0 -- confidence level when calcuating the confidence lower bound of # the separation index J* LowerJstar2<-function(a, n1, mu1, Sigma1, n2, mu2, Sigma2, alpha, alpha0) { # project mean vectors and covariance matrices tmp<-Proj.MeanCov.data(a, n1, mu1, Sigma1, n2, mu2, Sigma2) # tau1 and tau2 are mle of the variance of projected cluster 1 and 2 m1<-tmp$m1; tau1<-tmp$tau1; m2<-tmp$m2; tau2<-tmp$tau2; # obtain Jstar and its lower bound res<-LowerJstar(n1, m1, tau1, n2, m2, tau2, alpha=alpha, alpha0=alpha0) return(res) } # Calculate separation index, given projected means and variances. # make sure mu2>mu1 before using this function # mu1, tau1 -- mle of the mean and standard devitation of cluster 1 # mu2, tau2 -- mle of mean and standard devitation of cluster 2 # alpha -- tuning parameter SepIndex<-function(mu1, tau1, mu2, tau2, alpha) { Za<-qnorm(1-alpha/2) L1<-mu1-Za*tau1; U1<-mu1+Za*tau1; L2<-mu2-Za*tau2; U2<-mu2+Za*tau2; Jstar<-(L2-U1)/(U2-L1); intercept<-(U1+L2)/2; return(list(Jstar=Jstar,intercept=intercept,L1=L1,U1=U1,L2=L2,U2=U2)) } # Calculate separation index given the projection direction and mean vectors # and covariance matrices of two clusters. (Theoretical version) # a -- projection direction # mu1, Sigma1 -- mean vector and covariance matrix of cluster 1 # mu2, Sigma2 -- mean vector and covariance matrix of cluster 2 # alpha -- tuning parameter SepIndex2<-function(a, mu1, Sigma1, mu2, Sigma2, alpha) { Za<-qnorm(1-alpha/2) # project mean vectors and covariance matrices tmp<-Proj.MeanCov(a, mu1, Sigma1, mu2, Sigma2) # obtain separation index res<-SepIndex(tmp$m1, tmp$tau1, tmp$m2, tmp$tau2, alpha) return( res ) } # Calculate separation index given the projection direction and mean vectors # and covariance matrices of two clusters. (data version) # a -- projection direction # n1, mu1, Sigma1 -- size, mean vector and covariance matrix of cluster 1 # n2, mu2, Sigma2 -- size, mean vector and covariance matrix of cluster 2 # alpha -- tuning parameter SepIndex2.data<-function(a, n1, mu1, Sigma1, n2, mu2, Sigma2, alpha) { Za<-qnorm(1-alpha/2) # project mean vectors and covariance matrices tmp<-Proj.MeanCov.data(a, n1, mu1, Sigma1, n2, mu2, Sigma2) # obtain separation index res<-SepIndex(tmp$m1, tmp$tau1, tmp$m2, tmp$tau2, alpha) return( res ) } # Projection mean vectors and covariance matrices (theoretical version) # a -- projection direction # mu1, Sigma1 -- mean vector and covariance matrix of cluster 1 # mu2, Sigma2 -- mean vector and covariance matrix of cluster 2 Proj.MeanCov<-function(a, mu1, Sigma1, mu2, Sigma2) { # project mean vectors and covariance matrices m1<-sum(a*mu1) tau1<-sqrt(sum(a%*%Sigma1%*%a)) m2<-sum(a*mu2) tau2<-sqrt(sum(a%*%Sigma2%*%a)) if(m1>m2) { cat("************ mu1>mu2 in MergeOrNot *****************\n") } res<-list(m1=m1, tau1=tau1, m2=m2, tau2=tau2) return(res) } # Projection mean vectors and covariance matrices (data version) # a -- projection direction # n1, mu1, Sigma1 -- size, mean vector and covariance matrix of cluster 1 # n2, mu2, Sigma2 -- size, mean vector and covariance matrix of cluster 2 Proj.MeanCov.data<-function(a, n1, mu1, Sigma1, n2, mu2, Sigma2) { # project mean vectors and covariance matrices m1<-sum(a*mu1) s1<-sum(a%*%Sigma1%*%a) # sample variance of cluster 1 (not mle) m2<-sum(a*mu2) s2<-sum(a%*%Sigma2%*%a) # sample variance of cluster 1 (not mle) # obtain sample standard deviations of y1 and y2 (MLE) tau1<-sqrt((n1-1)*s1/n1) tau2<-sqrt((n2-1)*s2/n2) if(m1>m2) { cat("************ mu1>mu2 in MergeOrNot *****************\n") } res<-list(m1=m1, tau1=tau1, m2=m2, tau2=tau2) return(res) } # Calculate tau -- the asymptotic standard deviation of hat{Jstar} # make sure mu2>mu1 before using this function # n1, mu1, tau1 -- size, mean and standard devitation of cluster 1 # n2, mu2, tau2 -- size, mean and standard devitation of cluster 2 GetTau<-function(n1, mu1, tau1, n2, mu2, tau2, alpha) { tmp1<-mu2-mu1 tmp2<-tau1+tau2 Za<-qnorm(1-alpha/2) # alpha/2 upper percentile of the standard normal distr. denom<-(tmp1+Za*tmp2)^4 numer<-4*Za^2*(tau1^2/n1+tau2^2/n2)*(tmp1^2/2+tmp2^2) res<-sqrt(numer/denom) return(res) } # Given a partition, calculate the separation index matrix # y -- nxp data matrix # cl -- a partition of the data points # alpha -- tuning parameter reflecting the percentage in the two # tails that might be outlying. # ITMAX -- maximum iteration allowed when to iteratively calculate the # optimal projection direction. GetJMatrix<-function(y, cl, th.Jstar=0.03, alpha=0.03, alpha0=0.05, ITMAX=10, ifoutlierdetect=F) { y<-as.matrix(y); n<-nrow(y); p<-ncol(y); out.label<-which(cl==0) if(length(out.label)>0) { cl2<-cl[-out.label]; y2<-y[-out.label,]; } else { cl2<-cl; y2<-y; } k0<-length(unique(cl2)); n2<-nrow(y2); if(k0==1) { return(OneCluster(n,p)) } size<-tapply(rep(1,n2), cl2, sum); mat<-matrix(1, nrow=k0, ncol=k0); diag(mat)<-0; # merge matrix Jmat<-matrix(1, nrow=k0, ncol=k0); diag(Jmat)<- -1; # separation index mat # separation index mat (quantile version) Jmat.q<-matrix(1, nrow=k0, ncol=k0); diag(Jmat.q)<- -1; proj.direct<-array(0,c(k0,k0,p)) # projection directions for(k1 in 1:(k0-1)) { nk1<-size[k1]; if(nk1==1) { next }; yk1<-y2[cl2==k1,]; for(k2 in (k1+1):k0) { nk2<-size[k2]; if(nk2==1) { next }; yk2<-y2[cl2==k2,]; tmp<-MergeOrNot2(yk1, yk2, th.Jstar=th.Jstar, alpha=alpha, ITMAX=ITMAX, alpha0=alpha0,ifoutlierdetect=ifoutlierdetect) Jmat[k1, k2]<-tmp$Jstar; Jmat[k2, k1]<-Jmat[k1, k2]; Jmat.q[k1, k2]<-tmp$Jstar.q; Jmat.q[k2, k1]<-Jmat.q[k1, k2]; mat[k1, k2]<-tmp$M; mat[k2, k1]<-mat[k1, k2]; proj.direct[k1, k2,]<-tmp$a; proj.direct[k2, k1,]<- -tmp$a; } } res<-list(k0=k0, cl=cl, Jmat=Jmat, mat=mat, proj.direct=proj.direct, Jmat.q=Jmat.q) return(res) } # Check if two clusters should be merged or not (data version) # the covariance matices of the two clusters s1 and s2 can be singular. # # y1, y2 are two clusters # alpha -- tuning parameter reflecting the percentage in the two # tails that might be outlying. # ITMAX -- maximum iteration allowed when to iteratively calculate the # optimal projection direction. # ifoutlierdetect -- indicates if we should detect outliers during the # sequential clustering process. MergeOrNot2<-function(y1, y2, th.Jstar=0.03, alpha=0.03, alpha0=0.05, ITMAX=10, ifoutlierdetect=F) { y1<-as.matrix(y1); y2<-as.matrix(y2); # get projection direction tmp<-GetDirection(y1=y1, y2=y2, alpha=alpha, ITMAX=ITMAX, alpha0=alpha0); a<-as.vector(tmp$a); # the projection direction Jstar<-tmp$Jstar Jstar.q<-tmp$Jstar.q JL<-tmp$JL if(JL>0) { Jstar<-Jstar; M<-1; } else if(Jstar.q>th.Jstar) { Jstar<-Jstar.q; M<-1; } # not merge else { M<-0; Jstar<-min(Jstar, Jstar.q) } # merge return(list(Jstar=Jstar, Jstar.q=Jstar.q, M=M, a=as.vector(a))) } # Calculate non-normal version of the separation index # make sure the median of x1 is on the left side of that of x2 # x1 -- 1 dimensional cluster1 # x2 -- 1 dimensional cluster2 # alpha -- tuning parameter SepIndex.nonnormal<-function(x1, x2, alpha) { Za<-qnorm(1-alpha) # alpha upper percentile # project mean vectors and covariance matrices L1<-quantile(x1, probs=alpha/2) U1<-quantile(x1, probs=1-alpha/2) L2<-quantile(x2, probs=alpha/2) U2<-quantile(x2, probs=1-alpha/2) Jstar<-(L2-U1)/(U2-L1) intercept<-(U1+L2)/2; return(list(Jstar=Jstar,intercept=intercept,L1=L1,U1=U1,L2=L2,U2=U2)) } # return the setting of one cluster structure # n -- the number of data points # p -- the number of variables OneCluster<-function(n,p) { k0<-1; mat<- 0; Jmat<- -1; cl<-rep(1, n); proj.direct<-array(0,c(1,1, p)); return(list(k0=k0, cl=cl, mat=mat, Jmat=Jmat, proj.direct=proj.direct)) } # moore penrose generalized inverse # If A=UDV', m*n matrix with r non-zero singular values, then # A=U_1 D_r V'_1 where U_1 is m*r, V_1 is n*r, D_r is r*r diagonal matrix # moore penrose inverse is # A^+ = V_1 D_r^{-1} U'_1 # Proof that A* A^+ *A = A and A^+ *A *A^+ =A^+ is straightforward # since A^+ *A =V_1 V'_1 myginv<-function(a,tol=1.e-10) { asvd<-svd(a) dd<-asvd$d dnz<-dd[dd>tol] r<-length(dnz) #cat("number of non-zero singular values is ", r,"\n") ur<-asvd$u[,1:r] vr<-asvd$v[,1:r] d1<-diag(1/dnz) ainv<-vr%*%d1%*%t(ur) ainv }