library("Matrix")
# library("fields")
# library('dr')
# library('mvtnorm')
# library(mgcv)
# library(pracma)
# library(RVAideMemoire)
library("MAVE")
# library(pls)
# library(LaplacesDemon)
# library(earth)
# library("latex2exp")

#################################################

##############################

################################


fsquare<-function(x){
  ret<-0
  for (h in 1:length(x)){ret<-ret+x[h]^2}
  return(ret)
}
f_id<-function(x){return(x[1])}
vfun<-function(x)return(c(cos(x),sin(x)))
vfun_grad<-function(x)return(c(-sin(x),cos(x)))
f_true<-function(x,sigma){return(sigma^2 + cos(x)^2)}
###############################
##creat sample for M2, b is B matrix of model, N..sample size,
#f...link function, sigma..standard deviation of error term
#produces sample with X standard normal with dimension to columnlength of b an
# epsilon centered normal with standard deviation sigma
creat_sample<-function(b,N,f,sigma){
  if (is.vector(b)==1){dim<-length(b)}
  if (is.vector(b)==0){dim<-length(b[,1])}
  
  X<-mat.or.vec(N,dim)
  Y<-mat.or.vec(N,1)
  for (i in 1:N){
    X[i,]<-rnorm(dim,0,1)
    Y[i]<-f(t(b)%*%X[i,])+rnorm(1,0,sigma)
  }
  dat<-cbind(Y,X)
  return(dat)
}
#####
creat_b<-function(dim,n){
  temp<-mat.or.vec(dim,n)
  
  if (n>1){
    while (rankMatrix(temp,method = 'qr')< min(dim,n)){
      for (i in 1:n){
        temp[,i]<-rnorm(dim,0,1)
        temp[,i]<-temp[,i]/sqrt(sum(temp[,i]^2))
      }
    }
  }
  if (n==1) {temp<-rnorm(dim,0,1)
  temp<-temp/sqrt(sum(temp^2))}
  return(temp)
}
###

### creat sample for M1,M3
## b is B matrix of model, N..sample size,
#f...link function, sigma..standard deviation of error term
#produces sample with X normal with Covariancematrix SS' with dimension to columnlength of b an
# epsilon centered normal with standard deviation sigma
creat_sample_nor_nonstand<-function(b,N,f,S,sigma){
  #Var(x)= SS'
  if (is.vector(b)==1){dim<-length(b)}
  if (is.vector(b)==0){dim<-length(b[,1])}
  
  X<-mat.or.vec(N,dim)
  Y<-mat.or.vec(N,1)
  for (i in 1:N){
    X[i,]<-rnorm(dim,0,1)%*%t(S)
    Y[i]<-f(t(b)%*%X[i,])+rnorm(1,0,sigma)
  }
  dat<-cbind(Y,X)
  return(dat)
}
####

####

###
####

####
#### creat sample for M4
##creat sample for M2, b is B matrix of model, N..sample size,
#f...link function, sigma..standard deviation of error term
#produces sample with X Gausian mixture 
#(i.e. X ~ (Zrep(-1,dim) + (1_Z)rep(1,dim))*dispers_para + N(0, I_p) and Z~B(pi)
#with dimension to columnlength of b an
# epsilon centered normal with standard deviation sigma
creat_sample_noneliptic_gausmixture<-function(b,N,f,sigma,pi,dispers_para=1){
  if (is.vector(b)==1){dim<-length(b)}
  if (is.vector(b)==0){dim<-length(b[,1])}
  
  X<-mat.or.vec(N,dim)
  Y<-mat.or.vec(N,1)
  for (i in 1:N){
    p<-rbinom(1,1,pi)
    X[i,]<-(-rep(1,dim)*p+rep(1,dim)*(1-p))*dispers_para+rnorm(dim,0,1)
    Y[i]<-f(t(b)%*%X[i,])+rnorm(1,0,sigma)
  }
  dat<-cbind(Y,X)
  return(dat)
}
###
###############################
#estimates covariance matrix by ML estimate
# X...n times dim data matrix with rows corresponding to X_i
est_varmat<-function(X){
  dim<-length(X[1,])
  Sig<-mat.or.vec(dim,dim)#Cov(X)
  X<-scale(X,center = T,scale = F)
  Sig<-t(X)%*%X/length(X[,1])
  return(Sig)
}
####
# normalizes a vector x such that it sums to 1
column_normalize<-function(x){return(x/sum(x))}
####
#squared 2-norm of a vector x
norm2<-function(x){return(sum(x^2))}
###
# fills up the matrix estb (dim times q) (dim > q) to a dim times dim orthonormal matrix
fill_base<-function(estb){
  dim<-length(estb[,1])
  k<-length(estb[1,])
  P<-(diag(rep(1,dim))-estb%*%t(estb))
  rk<-0
  while (rk < dim){
    tmp1<-P%*%creat_b(dim,1)
    nor<-sqrt(sum(tmp1^2))
    if(nor>10^(-4)){
      estb<-cbind(tmp1/nor,estb)
      rkn<-rankMatrix(estb,method = 'qr')
      if (rkn > rk){P<-(diag(rep(1,dim))-estb%*%t(estb))
      rk<-rkn}
    }
    if(nor <=10^(-4)){rk<-0}
    
  }
  
  return(estb)
}
##################
###new stiefel optimization
#draws from the uniform measure of Stiefel(p,k)
#returns a p times k dimensional semiorthogonal matrix
stiefl_startval<-function(p,k){return(qr.Q(qr(matrix(rnorm(p*k,0,1),p,k))))}
##### chooses shifting points
#auxiliary function for stielfel_opt
# X... N times dim data matrix
#S... sample covariance matrix
#0 < p <=1 fraction of data point used as shifting points
# if p=1 all data points used as shifting points
q_ind<-function(X,S,p){
  dim<-length(X[1,])
  N<-length(X[,1])
  if (p < 1){
    Sinv<-solve(S)
    ind<-c()
    bound<-qchisq(p,dim)
    for (j in 1:N){if(t(X[j,])%*%Sinv%*%X[j,]<=bound){ind<-c(ind,j)}}#maybe demean
    q<-length(ind)
    if (q<=2){
      q<-N
      ind<-seq(1,N)}
  }
  else {
    q<-N
    ind<-seq(1,N)
  }
  ret<-list(q,ind)
  names(ret)<-c('q','ind')
  return(ret)
}
####
#calculates the trace of a quadratic matrix S
var_tr<-function(x){return(sum(diag(x)))}
####
#chooses bandwith h according to formula in paper
#dim...dimension of X vector
#k... row dim of V (dim times q matrix) corresponding to a basis of orthogonal complement of B in model
# N...sample size
#nObs... nObs in bandwith formula
#tr...trace of sample covariance matrix of X
choose_h_2<-function(dim,k,N,nObs,tr){
  #h1<-qchisq(nObs/N,2*(dim-k))
  h2<-qchisq((nObs-1)/(N-1),(dim-k))
  return(h2*2*tr/dim)
}
####
#auxiliary function for stiefel_opt
#X... data matrix of random variable X
#q,ind... output of function q_ind
Xl_fun<-function(X,q,ind){
  N<-length(X[,1])
  Xl<-(kronecker(rep(1,q),X)-kronecker(X[ind,],rep(1,N)))
  return(Xl)
}
###########
# evaluates L(V) and returns L_n(V),(L_tilde_n(V,X_i))_{i=1,..,n} and grad_V L_n(V) (p times k) 
# V... (dim times q) matrix 
# Xl... output of Xl_fun
# dtemp...vector with pairwise distances |X_i - X_j|
# q...output of q_ind function
# Y... vector with N Y_i values
# if grad=T, gradient of L(V) also returned
LV<-function(V,Xl,dtemp,h,q,Y,grad=T){
  N<-length(Y)
  if(is.vector(V)){k<-1}
  else{k<-length(V[1,])}
  Xlv<-Xl%*%V
  d<-dtemp-((Xlv^2)%*%rep(1,k))
  w<-dnorm(d/h)/dnorm(0)
  w<-matrix(w,N,q)
  w<-apply(w,2,column_normalize)
  mY<-t(w)%*%Y
  sig<-t(w)%*%(Y^2)-(mY)^2 # \tilde{L}_n(V, s_0) mit s_0 coresponds to q_ind
  if(grad==T){
    grad<-mat.or.vec(dim,k)
    tmp1<-(kronecker(sig,rep(1,N))-(as.vector(kronecker(rep(1,q),Y))-kronecker(mY,rep(1,N)))^2)
    if(k==1){
      grad_d<- -2*Xl*as.vector(Xlv)
      grad<-(1/h^2)*(1/q)*t(grad_d*as.vector(d)*as.vector(w))%*%tmp1
    }
    else{
      for (j in 1:(k)){
        grad_d<- -2*Xl*as.vector(Xlv[,j])
        grad[,j]<- (1/h^2)*(1/q)*t(grad_d*as.vector(d)*as.vector(w))%*%tmp1
      }
    }
    ret<-list(mean(sig),sig,grad)
    names(ret)<-c('var','sig','grad')
  }
  else{
    ret<-list(mean(sig),sig)
    names(ret)<-c('var','sig')
  }
  
  return(ret)
}
#### performs stiefle optimization of argmin_{V : V'V=I_k} L_n(V)
#through curvilinear search with k0 starting values drawn uniformly on stiefel maniquefold
#dat...(N times dim+1) matrix with first column corresponding to Y values, the other columns 
#consists of X data matrix, (i.e. dat=cbind(Y,X))
#h... bandwidth
#k...row dimension of V that is calculated, corresponds to dimension of orthogonal complement of B
#k0... number of arbitrary starting values
#p...fraction of data points used as shifting point
#maxit... number of maximal iterations in curvilinear search
#nObs.. nObs parameter for choosing bandwidth if no h is supplied
#lambda_0...initial stepsize
#tol...tolerance for stoping iterations
#sclack_para...if relative improvment is worse than sclack_para the stepsize is reduced
#output:
#est_base...Vhat_k= argmin_V:V'V=I_k L_n(V) a (dim times k) matrix where dim is row-dimension of X data matrix
#var...value of L_n(Vhat_k)
#aov_dat... (L_tilde_n(Vhat_k,X_i))_{i=1,..,N}
#count...number of iterations 
#h...bandwidth
#count2...number of times sclack_para is active
stiefl_opt<-function(dat,h=NULL,k,k0=30,p=1,maxit=50,nObs=sqrt(length(dat[,1])),lambda_0=1,tol=10^(-3),sclack_para=0){

  history <- matrix(NA, maxit, k0)

  Y<-dat[,1]
  X<-dat[,-1]
  N<-length(Y)
  dim<-length(X[1,])
  S<-est_varmat(X)
  if(p<1){
    
    tmp1<-q_ind(X,S,p)
    q<-tmp1$q
    ind<-tmp1$ind
  }
  else{
    q<-N
    ind<-1:N
  }
  Xl<-(kronecker(rep(1,q),X)-kronecker(X[ind,],rep(1,N)))
  dtemp<-apply(Xl,1,norm2)
  tr<-var_tr(S)
  if(is.null(h)){h<-choose_h(dim,k,N,nObs,tr)}
  best<-exp(10000)
  Vend<-mat.or.vec(dim,k)
  sig<-mat.or.vec(q,1)
  for(u in 1:k0){
    Vnew<-Vold<-stiefl_startval(dim,k)
    #print(Vold)
    #print(LV(Vold,Xl,dtemp,h,q,Y)$var)
    Lnew<-Lold<-exp(10000)
    lambda<-lambda_0
    err<-10
    count<-0
    count2<-0
    while(err>tol&count<maxit){
      #print(Vold)
      tmp2<-LV(Vold,Xl,dtemp,h,q,Y)
      G<-tmp2$grad
      Lold<-tmp2$var
      W<-G%*%t(Vold)-Vold%*%t(G)
      stepsize<-lambda#/(2*sqrt(count+1))
      Vnew<-solve(diag(1,dim)+stepsize*W)%*%(diag(1,dim)-stepsize*W)%*%Vold
      # print(Vnew)
      tmp3<-LV(Vnew,Xl,dtemp,h,q,Y,grad=F)
      Lnew<-tmp3$var
      err<-sqrt(sum((Vold%*%t(Vold)-Vnew%*%t(Vnew))^2))/sqrt(2*k)#sqrt(sum(tmp3$grad^2))/(dim*k)#
      #print(err)
      if(((Lnew-Lold)/Lold) > sclack_para){#/(count+1)^(0.5)
        lambda=lambda/2
        err<-10
        count2<-count2+1
        count<-count-1
        Vnew<-Vold #!!!!!
        
      }
      Vold<-Vnew
      count<-count+1
      #print(count)

      # Write to history
      history[count, u] <- Lnew

    }
    if(best>Lnew){
      best<-Lnew
      Vend<-Vnew
      sig<-tmp3$sig
    }

  }
  ret<-list(Vend,best,sig,count,h,count2, history)
  names(ret)<-c('est_base','var','aov_dat','count','h','count2', 'history') 
  return(ret)
}
##### calculates distance between two subspaces 
#i.e. |b1%*%(b1%*%b1')^{-1}%*%b1' - b2%*%(b2%*%b2')^{-1}%*%b2'|_2
subspace_dist<-function(b1,b2){
  #b1<-orth(b1)
  #b2<-orth(b2)
  P1<-b1%*%solve(t(b1)%*%b1)%*%t(b1)
  P2<-b2%*%solve(t(b2)%*%b2)%*%t(b2)
  return(sqrt(sum((P1-P2)^2)))
}
####performs local linaer regression
#x...datapoint at where estimated function should be evaluated
#h...bandwidth 
#dat..(N times dim+1) matrix with first column corresponding to Y values, the other columns 
#consists of X data matrix, (i.e. dat=cbind(Y,X))
#beta...projection matrix
local_linear<-function(x,h,dat,beta){
  Y<-dat[,1]
  X<-dat[,-1]
  N<-length(Y)
  X<-X%*%beta
  x<-x%*%beta#beta%*%x
  D_mat<-cbind(rep(1,N),X)
  if (is.vector(X)){
    dim<-1
    d<-abs(X-rep(x,N))
  }
  else{
    dim<-length(X[1,])
    d<-sqrt(apply(X-t(matrix(rep(x,N),dim,N)),1,norm2))
  }
  K<-diag(dnorm(d/h)/dnorm(0))
  pred<-c(1,x)%*%solve(t(D_mat)%*%K%*%D_mat)%*%t(D_mat)%*%K%*%Y
  return(pred)
}