2
0
Fork 0
CVE/CVE_legacy/estimation_of_dimension.R

190 lines
6.8 KiB
R

library("mda") #library for mars
local_linear<-function(x,h,dat,beta){
Y<-dat[,1]
X<-dat[,-1]
N<-length(Y)
X<-X%*%beta
x<-x%*%beta#beta%*%x
D_mat<-cbind(rep(1,N),X)
if (is.vector(X)){
dim<-1
d<-abs(X-rep(x,N))
}
else{
dim<-length(X[1,])
d<-sqrt(apply(X-t(matrix(rep(x,N),dim,N)),1,norm2))
}
K<-diag(dnorm(d/h)/dnorm(0))
pred<-c(1,x)%*%solve(t(D_mat)%*%K%*%D_mat)%*%t(D_mat)%*%K%*%Y
return(pred)
}
##### performs estimation of small dimesnion by CV with local linear forward regression
est_dim_CV<-function(Blist,dat,h_loclin=NULL,dim.max,median_use=F,method_mars=F){
#standardize regressors by symmetric root of inverse covariance mat
Sig<-est_varmat(dat[,-1])
eig_dec<-eigen(Sig)
Sroot_inv<-eig_dec$vectors%*%((diag(eig_dec$values^(-1/2))))%*%t(eig_dec$vectors)
dat[,-1]<-as.matrix(dat[,-1])%*%Sroot_inv
N<-length(dat[,1])
dim<-length(dat[1,-1])
MSE<-mat.or.vec(N,dim.max)
for(u in 1:dim.max){
beta<-Blist[[u]]
if(is.null(h_loclin)){
#h_loclin<-(1/N)^(1/(3+2*u))
h_loclin<-1.2*N^(-1/(4+u))#(1/N)^(1/(3+2*j))
}
for(i in 1:N){
x<-dat[i,-1]
if(method_mars==F){MSE[i,u]<-(dat[i,1]-local_linear(x,h_loclin,as.matrix(dat[-i,]),beta))^2} #predict with local linear
if(method_mars==T){
dat_fit<-dat[-i,-1]%*%beta
X_new<-dat[i,-1]%*%beta
mars_mod<-mars(dat_fit,dat[-i,1]) #fit mars model
MSE[i,u]<-(dat[i,1]-predict(mars_mod,X_new))^2 #predict with mars model
}#predict with mars
}
}
if(median_use){MSE_ave<-apply(MSE,2,median)}
else{MSE_ave<-colMeans(MSE)}
#apply(MSE,2,median)
est_dim<-which.min(MSE_ave)
ret<-list(est_dim = est_dim, MSE_ave = MSE_ave, MSE = MSE)
return(ret)
}
###########
test_for_dim<-function(Lmat,dim.max=NULL,alpha=0.1,method='greater'){
#Lmat... matrix with dimension N times dim.max with columns corresponding to aov_dat for dim 1,2,3,....,max.dim
if(is.null(dim.max)){dim.max<-length(Lmat[1,])}
pval<-mat.or.vec(dim.max-1,1)
est_dim<-dim.max
# for (j in 1:(dim.max-1)){
j<-1
while(est_dim==dim.max&j<dim.max){
if (method=='greater'){
pval[(j)]<-t.test(Lmat[,(j)],Lmat[,(j+1)],alternative = 'greater',paired=T)$p.value#mod[[1]][,5][1]
if(pval[j]<alpha){est_dim<-j}
}
if (method=='lower'){
pval[(j)]<-t.test(Lmat[,(j)],Lmat[,(j+1)],alternative = 'less',paired=F)$p.value#mod[[1]][,5][1]
if(pval[j]>alpha){est_dim<-j}
}
j<-j+1
}
ret<-list(est_dim,pval)
names(ret)<-c('estdim','pval')
return(ret)
}
####
##
test_for_dim_elbow<-function(Lmat){
dim.max<-length(Lmat[1,])
ave<-colMeans(Lmat)
boxplot(Lmat,xlab='k')
lines(seq(1,dim.max),ave,col='red')
# tmp<-cbind(ave,seq(1,dim.max))
# colnames(tmp)<-c('response','k')
# diff<-lm(response~k,data=as.data.frame(tmp))$coefficients[2]
#
# est_dim<-dim.max
# i<-1
# while(i<dim.max&est_dim==dim.max){
# if(ave[i+1]-ave[i]>diff){est_dim<-i}
# i<-i+1
# }
return(which.min(ave))
}
######
#Small simulation example for truedim =1
set.seed(21)
dim<-6
truedim<-1
N<-50
b<-c(1,0,0,0,0,0)
m<-20
est_dim<-mat.or.vec(m,7)
dim.max<-4
for(i in 1:m){
dat<-creat_sample(b,N,fsquare,0.5)
Blist<-list()
Lmat<-mat.or.vec(N,dim.max)
for(u in 1:dim.max){ #calculate B for different possible truedim's
m1<-stiefl_opt(dat,k=(dim-u)) #original bandwidth selection rule used that controlls number of points in a slice!!!!!!, also choose_h_2
Blist[[u]]<-fill_base(m1$est_base)[,1:u]
Lmat[,u]<-m1$aov_dat
}
#estimate truedim with different methods
est_dim[i,1]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = T)$est_dim
est_dim[i,2]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F)$est_dim
est_dim[i,3]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F,method_mars = T)$est_dim
est_dim[i,4]<-test_for_dim(Lmat)$estdim
est_dim[i,5]<-test_for_dim(Lmat,method = 'lower')$estdim
est_dim[i,6]<-test_for_dim_elbow(cbind((dat[,1]-mean(dat[,1]))^2,Lmat))-1
mod_t<-mave(Y~.,data=as.data.frame(dat),method = 'meanMAVE')
est_dim[i,7]<-which.min(mave.dim(mod_t)$cv)
print(i)
}
length(which(est_dim[,1]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (CV with median)
#0.5
length(which(est_dim[,2]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (CV with mean)
#0.9
length(which(est_dim[,3]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (CV with mars and mean)
#0.8
length(which(est_dim[,4]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (t.test method='greater')
#0.0
length(which(est_dim[,5]==truedim))/m #fraction of where dimension is estimated correctly with mwthod 1 (t.test method='lower')
#0.05
length(which(est_dim[,6]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (elbow)
#1
length(which(est_dim[,7]==truedim))/m #fraction of where dimension is estimated correctly with mave
#0.95
##########
#Small simulation example for truedim =2
set.seed(21)
dim<-6
truedim<-2
N<-100
b<-cbind(c(1,rep(0,dim-1)),c(0,1,rep(0,dim-2)))
m<-20
est_dim<-mat.or.vec(m,7)
dim.max<-4
for(i in 1:m){
dat<-creat_sample(b,N,function(x){return(x[1]*x[2])},0.5)
Blist<-list()
Lmat<-mat.or.vec(N,dim.max)
for(u in 1:dim.max){
m1<-stiefl_opt(dat,k=(dim-u)) #original bandwidth selection used !!!!!!!!!!!!!!!!!!!!!, also choose_h_2
Blist[[u]]<-fill_base(m1$est_base)[,1:u]
Lmat[,u]<-m1$aov_dat
}
est_dim[i,1]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = T)$est_dim
est_dim[i,2]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F)$est_dim
est_dim[i,3]<-est_dim_CV(Blist,dat,dim.max = dim.max,median_use = F,method_mars = T)$est_dim
est_dim[i,4]<-test_for_dim(Lmat)$estdim
est_dim[i,5]<-test_for_dim(Lmat,method = 'lower')$estdim
est_dim[i,6]<-test_for_dim_elbow(cbind((dat[,1]-mean(dat[,1]))^2,Lmat))-1
mod_t<-mave(Y~.,data=as.data.frame(dat),method = 'meanMAVE')
est_dim[i,7]<-which.min(mave.dim(mod_t)$cv)
print(i)
}
length(which(est_dim[,1]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (CV with median)
length(which(est_dim[,2]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (CV with mean)
length(which(est_dim[,3]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (CV with mars and mean)
length(which(est_dim[,4]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (t.test method='greater')
length(which(est_dim[,5]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (t.test method='lower')
length(which(est_dim[,6]==truedim))/m #fraction of where dimension is estimated correctly with mwthod (elbow)