library("mda") #library for mars local_linear<-function(x,h,dat,beta){ Y<-dat[,1] X<-dat[,-1] N<-length(Y) X<-X%*%beta x<-x%*%beta#beta%*%x D_mat<-cbind(rep(1,N),X) if (is.vector(X)){ dim<-1 d<-abs(X-rep(x,N)) } else{ dim<-length(X[1,]) d<-sqrt(apply(X-t(matrix(rep(x,N),dim,N)),1,norm2)) } K<-diag(dnorm(d/h)/dnorm(0)) pred<-c(1,x)%*%solve(t(D_mat)%*%K%*%D_mat)%*%t(D_mat)%*%K%*%Y return(pred) } ##### performs estimation of small dimesnion by CV with local linear forward regression est_dim_CV<-function(Blist,dat,h_loclin=NULL,dim.max,median_use=F,method_mars=F){ #standardize regressors by symmetric root of inverse covariance mat Sig<-est_varmat(dat[,-1]) eig_dec<-eigen(Sig) Sroot_inv<-eig_dec$vectors%*%((diag(eig_dec$values^(-1/2))))%*%t(eig_dec$vectors) dat[,-1]<-as.matrix(dat[,-1])%*%Sroot_inv N<-length(dat[,1]) dim<-length(dat[1,-1]) MSE<-mat.or.vec(N,dim.max) for(u in 1:dim.max){ beta<-Blist[[u]] if(is.null(h_loclin)){ #h_loclin<-(1/N)^(1/(3+2*u)) h_loclin<-1.2*N^(-1/(4+u))#(1/N)^(1/(3+2*j)) } for(i in 1:N){ x<-dat[i,-1] if(method_mars==F){MSE[i,u]<-(dat[i,1]-local_linear(x,h_loclin,as.matrix(dat[-i,]),beta))^2} #predict with local linear if(method_mars==T){ dat_fit<-dat[-i,-1]%*%beta X_new<-dat[i,-1]%*%beta mars_mod<-mars(dat_fit,dat[-i,1]) #fit mars model MSE[i,u]<-(dat[i,1]-predict(mars_mod,X_new))^2 #predict with mars model }#predict with mars } } if(median_use){MSE_ave<-apply(MSE,2,median)} else{MSE_ave<-colMeans(MSE)} #apply(MSE,2,median) est_dim<-which.min(MSE_ave) ret<-list(est_dim = est_dim, MSE_ave = MSE_ave, MSE = MSE) return(ret) } ########### test_for_dim<-function(Lmat,dim.max=NULL,alpha=0.1,method='greater'){ #Lmat... matrix with dimension N times dim.max with columns corresponding to aov_dat for dim 1,2,3,....,max.dim if(is.null(dim.max)){dim.max<-length(Lmat[1,])} pval<-mat.or.vec(dim.max-1,1) est_dim<-dim.max # for (j in 1:(dim.max-1)){ j<-1 while(est_dim==dim.max&jalpha){est_dim<-j} } j<-j+1 } ret<-list(est_dim,pval) names(ret)<-c('estdim','pval') return(ret) } #### ## test_for_dim_elbow<-function(Lmat){ dim.max<-length(Lmat[1,]) ave<-colMeans(Lmat) boxplot(Lmat,xlab='k') lines(seq(1,dim.max),ave,col='red') # tmp<-cbind(ave,seq(1,dim.max)) # colnames(tmp)<-c('response','k') # diff<-lm(response~k,data=as.data.frame(tmp))$coefficients[2] # # est_dim<-dim.max # i<-1 # while(idiff){est_dim<-i} # i<-i+1 # } return(which.min(ave)) } ###### #Small simulation example for truedim =1 set.seed(21) dim<-6 truedim<-1 N<-50 b<-c(1,0,0,0,0,0) m<-20 est_dim<-mat.or.vec(m,7) dim.max<-4 for(i in 1:m){ dat<-creat_sample(b,N,fsquare,0.5) Blist<-list() Lmat<-mat.or.vec(N,dim.max) for(u in 1:dim.max){ #calculate B for different possible truedim's m1<-stiefl_opt(dat,k=(dim-u)) #original bandwidth selection rule used that controlls number of points in a slice!!!!!!, also choose_h_2 Blist[[u]]<-fill_base(m1$est_base)[,1:u] Lmat[,u]<-m1$aov_dat } #estimate truedim with different methods est_dim[i,1]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = T)$est_dim est_dim[i,2]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F)$est_dim est_dim[i,3]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F,method_mars = T)$est_dim est_dim[i,4]<-test_for_dim(Lmat)$estdim est_dim[i,5]<-test_for_dim(Lmat,method = 'lower')$estdim est_dim[i,6]<-test_for_dim_elbow(cbind((dat[,1]-mean(dat[,1]))^2,Lmat))-1 mod_t<-mave(Y~.,data=as.data.frame(dat),method = 'meanMAVE') est_dim[i,7]<-which.min(mave.dim(mod_t)$cv) print(i) } length(which(est_dim[,1]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (CV with median) #0.5 length(which(est_dim[,2]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (CV with mean) #0.9 length(which(est_dim[,3]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (CV with mars and mean) #0.8 length(which(est_dim[,4]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (t.test method='greater') #0.0 length(which(est_dim[,5]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (t.test method='lower') #0.05 length(which(est_dim[,6]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (elbow) #1 length(which(est_dim[,7]==truedim))/m #fraction of where dimension is estimated correctly with mave #0.95 ########## #Small simulation example for truedim =2 set.seed(21) dim<-6 truedim<-2 N<-100 b<-cbind(c(1,rep(0,dim-1)),c(0,1,rep(0,dim-2))) m<-20 est_dim<-mat.or.vec(m,7) dim.max<-4 for(i in 1:m){ dat<-creat_sample(b,N,function(x){return(x[1]*x[2])},0.5) Blist<-list() Lmat<-mat.or.vec(N,dim.max) for(u in 1:dim.max){ m1<-stiefl_opt(dat,k=(dim-u)) #original bandwidth selection used !!!!!!!!!!!!!!!!!!!!!, also choose_h_2 Blist[[u]]<-fill_base(m1$est_base)[,1:u] Lmat[,u]<-m1$aov_dat } est_dim[i,1]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = T)$est_dim est_dim[i,2]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F)$est_dim est_dim[i,3]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F,method_mars = T)$est_dim est_dim[i,4]<-test_for_dim(Lmat)$estdim est_dim[i,5]<-test_for_dim(Lmat,method = 'lower')$estdim est_dim[i,6]<-test_for_dim_elbow(cbind((dat[,1]-mean(dat[,1]))^2,Lmat))-1 mod_t<-mave(Y~.,data=as.data.frame(dat),method = 'meanMAVE') est_dim[i,7]<-which.min(mave.dim(mod_t)$cv) print(i) } length(which(est_dim[,1]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (CV with median) length(which(est_dim[,2]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (CV with mean) length(which(est_dim[,3]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (CV with mars and mean) length(which(est_dim[,4]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (t.test method='greater') length(which(est_dim[,5]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (t.test method='lower') length(which(est_dim[,6]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (elbow)