190 lines
6.8 KiB
R
190 lines
6.8 KiB
R
library("mda") #library for mars
|
|
|
|
local_linear<-function(x,h,dat,beta){
|
|
Y<-dat[,1]
|
|
X<-dat[,-1]
|
|
N<-length(Y)
|
|
X<-X%*%beta
|
|
x<-x%*%beta#beta%*%x
|
|
D_mat<-cbind(rep(1,N),X)
|
|
if (is.vector(X)){
|
|
dim<-1
|
|
d<-abs(X-rep(x,N))
|
|
}
|
|
else{
|
|
dim<-length(X[1,])
|
|
d<-sqrt(apply(X-t(matrix(rep(x,N),dim,N)),1,norm2))
|
|
}
|
|
K<-diag(dnorm(d/h)/dnorm(0))
|
|
pred<-c(1,x)%*%solve(t(D_mat)%*%K%*%D_mat)%*%t(D_mat)%*%K%*%Y
|
|
return(pred)
|
|
}
|
|
##### performs estimation of small dimesnion by CV with local linear forward regression
|
|
est_dim_CV<-function(Blist,dat,h_loclin=NULL,dim.max,median_use=F,method_mars=F){
|
|
#standardize regressors by symmetric root of inverse covariance mat
|
|
Sig<-est_varmat(dat[,-1])
|
|
eig_dec<-eigen(Sig)
|
|
Sroot_inv<-eig_dec$vectors%*%((diag(eig_dec$values^(-1/2))))%*%t(eig_dec$vectors)
|
|
dat[,-1]<-as.matrix(dat[,-1])%*%Sroot_inv
|
|
|
|
N<-length(dat[,1])
|
|
dim<-length(dat[1,-1])
|
|
|
|
MSE<-mat.or.vec(N,dim.max)
|
|
for(u in 1:dim.max){
|
|
beta<-Blist[[u]]
|
|
if(is.null(h_loclin)){
|
|
#h_loclin<-(1/N)^(1/(3+2*u))
|
|
h_loclin<-1.2*N^(-1/(4+u))#(1/N)^(1/(3+2*j))
|
|
|
|
}
|
|
|
|
for(i in 1:N){
|
|
x<-dat[i,-1]
|
|
if(method_mars==F){MSE[i,u]<-(dat[i,1]-local_linear(x,h_loclin,as.matrix(dat[-i,]),beta))^2} #predict with local linear
|
|
if(method_mars==T){
|
|
dat_fit<-dat[-i,-1]%*%beta
|
|
X_new<-dat[i,-1]%*%beta
|
|
mars_mod<-mars(dat_fit,dat[-i,1]) #fit mars model
|
|
MSE[i,u]<-(dat[i,1]-predict(mars_mod,X_new))^2 #predict with mars model
|
|
}#predict with mars
|
|
}
|
|
}
|
|
if(median_use){MSE_ave<-apply(MSE,2,median)}
|
|
else{MSE_ave<-colMeans(MSE)}
|
|
#apply(MSE,2,median)
|
|
est_dim<-which.min(MSE_ave)
|
|
ret<-list(est_dim = est_dim, MSE_ave = MSE_ave, MSE = MSE)
|
|
return(ret)
|
|
}
|
|
###########
|
|
test_for_dim<-function(Lmat,dim.max=NULL,alpha=0.1,method='greater'){
|
|
#Lmat... matrix with dimension N times dim.max with columns corresponding to aov_dat for dim 1,2,3,....,max.dim
|
|
if(is.null(dim.max)){dim.max<-length(Lmat[1,])}
|
|
|
|
pval<-mat.or.vec(dim.max-1,1)
|
|
est_dim<-dim.max
|
|
# for (j in 1:(dim.max-1)){
|
|
j<-1
|
|
while(est_dim==dim.max&j<dim.max){
|
|
if (method=='greater'){
|
|
pval[(j)]<-t.test(Lmat[,(j)],Lmat[,(j+1)],alternative = 'greater',paired=T)$p.value#mod[[1]][,5][1]
|
|
if(pval[j]<alpha){est_dim<-j}
|
|
}
|
|
if (method=='lower'){
|
|
pval[(j)]<-t.test(Lmat[,(j)],Lmat[,(j+1)],alternative = 'less',paired=F)$p.value#mod[[1]][,5][1]
|
|
if(pval[j]>alpha){est_dim<-j}
|
|
}
|
|
j<-j+1
|
|
|
|
}
|
|
ret<-list(est_dim,pval)
|
|
names(ret)<-c('estdim','pval')
|
|
return(ret)
|
|
}
|
|
####
|
|
|
|
##
|
|
test_for_dim_elbow<-function(Lmat){
|
|
dim.max<-length(Lmat[1,])
|
|
ave<-colMeans(Lmat)
|
|
boxplot(Lmat,xlab='k')
|
|
lines(seq(1,dim.max),ave,col='red')
|
|
# tmp<-cbind(ave,seq(1,dim.max))
|
|
# colnames(tmp)<-c('response','k')
|
|
# diff<-lm(response~k,data=as.data.frame(tmp))$coefficients[2]
|
|
#
|
|
# est_dim<-dim.max
|
|
# i<-1
|
|
# while(i<dim.max&est_dim==dim.max){
|
|
# if(ave[i+1]-ave[i]>diff){est_dim<-i}
|
|
# i<-i+1
|
|
# }
|
|
return(which.min(ave))
|
|
}
|
|
######
|
|
#Small simulation example for truedim =1
|
|
|
|
set.seed(21)
|
|
dim<-6
|
|
truedim<-1
|
|
N<-50
|
|
b<-c(1,0,0,0,0,0)
|
|
m<-20
|
|
est_dim<-mat.or.vec(m,7)
|
|
dim.max<-4
|
|
for(i in 1:m){
|
|
dat<-creat_sample(b,N,fsquare,0.5)
|
|
Blist<-list()
|
|
Lmat<-mat.or.vec(N,dim.max)
|
|
for(u in 1:dim.max){ #calculate B for different possible truedim's
|
|
m1<-stiefl_opt(dat,k=(dim-u)) #original bandwidth selection rule used that controlls number of points in a slice!!!!!!, also choose_h_2
|
|
Blist[[u]]<-fill_base(m1$est_base)[,1:u]
|
|
Lmat[,u]<-m1$aov_dat
|
|
}
|
|
#estimate truedim with different methods
|
|
est_dim[i,1]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = T)$est_dim
|
|
est_dim[i,2]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F)$est_dim
|
|
est_dim[i,3]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F,method_mars = T)$est_dim
|
|
est_dim[i,4]<-test_for_dim(Lmat)$estdim
|
|
est_dim[i,5]<-test_for_dim(Lmat,method = 'lower')$estdim
|
|
est_dim[i,6]<-test_for_dim_elbow(cbind((dat[,1]-mean(dat[,1]))^2,Lmat))-1
|
|
mod_t<-mave(Y~.,data=as.data.frame(dat),method = 'meanMAVE')
|
|
est_dim[i,7]<-which.min(mave.dim(mod_t)$cv)
|
|
|
|
print(i)
|
|
}
|
|
|
|
length(which(est_dim[,1]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (CV with median)
|
|
#0.5
|
|
length(which(est_dim[,2]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (CV with mean)
|
|
#0.9
|
|
length(which(est_dim[,3]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (CV with mars and mean)
|
|
#0.8
|
|
length(which(est_dim[,4]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (t.test method='greater')
|
|
#0.0
|
|
length(which(est_dim[,5]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (t.test method='lower')
|
|
#0.05
|
|
length(which(est_dim[,6]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (elbow)
|
|
#1
|
|
length(which(est_dim[,7]==truedim))/m #fraction of where dimension is estimated correctly with mave
|
|
#0.95
|
|
##########
|
|
#Small simulation example for truedim =2
|
|
|
|
set.seed(21)
|
|
dim<-6
|
|
truedim<-2
|
|
N<-100
|
|
b<-cbind(c(1,rep(0,dim-1)),c(0,1,rep(0,dim-2)))
|
|
m<-20
|
|
est_dim<-mat.or.vec(m,7)
|
|
dim.max<-4
|
|
for(i in 1:m){
|
|
dat<-creat_sample(b,N,function(x){return(x[1]*x[2])},0.5)
|
|
Blist<-list()
|
|
Lmat<-mat.or.vec(N,dim.max)
|
|
for(u in 1:dim.max){
|
|
m1<-stiefl_opt(dat,k=(dim-u)) #original bandwidth selection used !!!!!!!!!!!!!!!!!!!!!, also choose_h_2
|
|
Blist[[u]]<-fill_base(m1$est_base)[,1:u]
|
|
Lmat[,u]<-m1$aov_dat
|
|
}
|
|
est_dim[i,1]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = T)$est_dim
|
|
est_dim[i,2]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F)$est_dim
|
|
est_dim[i,3]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F,method_mars = T)$est_dim
|
|
est_dim[i,4]<-test_for_dim(Lmat)$estdim
|
|
est_dim[i,5]<-test_for_dim(Lmat,method = 'lower')$estdim
|
|
est_dim[i,6]<-test_for_dim_elbow(cbind((dat[,1]-mean(dat[,1]))^2,Lmat))-1
|
|
mod_t<-mave(Y~.,data=as.data.frame(dat),method = 'meanMAVE')
|
|
est_dim[i,7]<-which.min(mave.dim(mod_t)$cv)
|
|
|
|
print(i)
|
|
}
|
|
|
|
length(which(est_dim[,1]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (CV with median)
|
|
length(which(est_dim[,2]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (CV with mean)
|
|
length(which(est_dim[,3]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (CV with mars and mean)
|
|
length(which(est_dim[,4]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (t.test method='greater')
|
|
length(which(est_dim[,5]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (t.test method='lower')
|
|
length(which(est_dim[,6]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (elbow)
|