diff --git a/simulations/simulation_poi.R b/simulations/simulation_poi.R
new file mode 100644
index 0000000..49c3873
--- /dev/null
+++ b/simulations/simulation_poi.R
@@ -0,0 +1,111 @@
+
+################################################################################
+###                 LDA (sparse Linear Discrimina Analysis)                  ###
+################################################################################
+devtools::load_all('tensorPredictors/')
+
+C <- function(rho, p) {
+    res <- matrix(rho, p, p)
+    diag(res) <- 1
+    res
+}
+R <- function(rho, p) {
+    rho^abs(outer(1:p, 1:p, `-`))
+}
+
+dataset <- function(nr) {
+    K <- 3          # Nr. Groups
+    n.i <- 30       # Sample group size for each of the K groups
+    n <- K * n.i    # Sample size
+    p <- 200        # Nr. of predictors
+
+    # Generate test data
+    V <- cbind(matrix(c(
+        2, 1, 2, 1, 2,
+        1,-1, 1,-1, 1,
+        0, 1,-1, 1, 0
+    ), 3, 5, byrow = TRUE),
+        matrix(0, 3, p - 5)
+    )
+    W <- cbind(matrix(c(
+    -1, 1, 1, 1, 1,
+        1,-1, 1,-1, 1,
+        1, 1,-1, 1, 0
+    ), 3, 5, byrow = TRUE),
+        matrix(0, 3, p - 5)
+    )
+
+    if (nr == 1) {          # Model 1
+        y <- factor(rep(1:K, each = n.i))
+        X <- rmvnorm(n, mu = rep(0, p)) + V[y, ]
+        B <- cbind(V[1, ] - V[2, ], V[2, ] - V[3, ])
+    } else if (nr == 2) {   # Model 2
+        y <- factor(rep(1:K, each = n.i))
+        X <- rmvnorm(n, sigma = C(0.5, p)) + (V %*% C(0.5, p))[y, ]
+        B <- cbind(V[1, ] - V[2, ], V[2, ] - V[3, ])
+    } else if (nr == 3) {   # Model 3
+        y <- factor(rep(1:K, each = n.i))
+        X <- rmvnorm(n, sigma = R(0.5, p)) + (V %*% R(0.5, p))[y, ]
+        B <- cbind(V[1, ] - V[2, ], V[2, ] - V[3, ])
+    } else if (nr == 4) {   # Model 4
+        y <- factor(rep(1:K, each = n.i))
+        X <- rmvnorm(n, sigma = C(0.5, p)) + (W %*% C(0.5, p))[y, ]
+        B <- cbind(W[1, ] - W[2, ], W[2, ] - W[3, ])
+    } else if (nr == 5) {   # Model 5
+        K <- 4
+        n <- K * n.i
+
+        W.tilde <- 2 * rbind(W, colMeans(W))
+        mu.tilde <- W.tilde %*% C(0.5, p)
+
+        y <- factor(rep(1:K, each = n.i))
+        X <- rmvnorm(n, sigma = C(0.5, p)) + mu.tilde[y, ]
+
+        B <- cbind(W[1, ] - W[2, ], W[2, ] - W[3, ])
+    } else {
+        stop("Unknown model nr.")
+    }
+
+    list(X = X, y = y, B = qr.Q(qr(B)))
+}
+
+# # Model 1
+# fit <- with(dataset(1), {
+#     with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C'))
+# })
+# fit <- with(dataset(1), {
+#     with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C'))
+# })
+# fit <- with(dataset(1), {
+#     with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = TRUE))
+# })
+# fit <- with(dataset(1), {
+#     with(GEP(X, y, 'lda'), POI(lhs, rhs, ncol(B), method = 'FastPOI-C', use.C = TRUE))
+# })
+
+# head(fit$vectors, 10)
+
+count <- 0
+nr.reps <- 100
+sim <- replicate(nr.reps, {
+    res <- double(0)
+    for (model.nr in 1:5) {
+        for (method in c('POI-C', 'FastPOI-C')) {
+            for (use.C in c(FALSE, TRUE)) {
+                dist <- with(dataset(model.nr), {
+                    fit <- with(GEP(X, y, 'lda'), {
+                        POI(lhs, rhs, ncol(B), method = 'POI-C', use.C = use.C)
+                    })
+                    # dist.subspace(B, fit$vectors, is.ortho = FALSE, normalize = TRUE)
+                    dist.projection(B, fit$vectors)
+                })
+                names(dist) <- paste('M', model.nr, '-', method, '-', use.C)
+                res <- c(res, dist)
+            }
+        }
+    }
+    cat("Counter", (count <<- count + 1), "/", nr.reps, "\n")
+    res
+})
+
+(stats <- as.matrix(rowMeans(sim)))
diff --git a/tensorPredictors/R/GEP.R b/tensorPredictors/R/GEP.R
index a0639c5..c079bcf 100644
--- a/tensorPredictors/R/GEP.R
+++ b/tensorPredictors/R/GEP.R
@@ -14,7 +14,7 @@
 #' Reduction and Variable Selection" By Xin Chen, Changliang Zou and
 #' R. Dennis Cook.
 #'
-GEP <- function(X, y, method = c('pfc', 'pca', 'sir', 'save'), ...,
+GEP <- function(X, y, method = c('pfc', 'pca', 'sir', 'lda'), ...,
     nr.slices = 10, ensamble = list(abs, identity, function(x) x^2)
 ) {
     method <- match.arg(method)
@@ -52,6 +52,21 @@ GEP <- function(X, y, method = c('pfc', 'pca', 'sir', 'save'), ...,
         })))
         # Sample covariance
         rhs <- cov(X)
+    } else if (method == 'lda') {
+        # TODO: check this properly!!! (Maybe a bit better implementation and/or
+        # some theoretical inaccuracies)
+
+        y <- as.factor(y)
+
+        # group means
+        mu <- as.matrix(aggregate(X, list(y), mean)[, -1])
+
+        # between group covariance Sigma.B = Cov(E[X | y])
+        lhs <- ((nrow(mu) - 1) / nrow(mu)) * cov(mu)
+
+        # within group covariance (Sigma.T = Sigma.B + Sigma.W)
+        # with Sigma.W = E(Cov(X | y)) and Sigma.T = Cov(X)
+        rhs <- (((nrow(X) - 1) / nrow(X)) * cov(X)) - lhs
     } else {
         stop('Not implemented!')
     }
diff --git a/tensorPredictors/R/POI.R b/tensorPredictors/R/POI.R
index 8c73e79..ef5235b 100644
--- a/tensorPredictors/R/POI.R
+++ b/tensorPredictors/R/POI.R
@@ -1,13 +1,13 @@
 solve.gep <- function(A, B, d = nrow(A)) {
     isrB <- matpow(B, -0.5)
 
-    if (requireNamespace("RSpectra", quietly = TRUE) && d < nrow(A)) {
+    if ((d < nrow(A)) && requireNamespace("RSpectra", quietly = TRUE)) {
         eig <- RSpectra::eigs_sym(isrB %*% A %*% isrB, d)
     } else {
         eig <- eigen(isrB %*% A %*% isrB, symmetric = TRUE)
     }
 
-    list(vectors = isrB %*% eig$vectors, values = eig$values)
+    list(vectors = isrB %*% eig$vectors[, 1:d, drop = FALSE], values = eig$values)
 }
 
 POI.lambda.max <- function(A, d = 1L, method = c('POI-C', 'POI-L', 'FastPOI-C', 'FastPOI-L')) {
@@ -37,16 +37,28 @@ POI.lambda.max <- function(A, d = 1L, method = c('POI-C', 'POI-L', 'FastPOI-C',
 
 #' Penalysed Orthogonal Iteration.
 #'
-#' @param lambda Default: 0.75 * lambda_max for FastPOI-C method.
-#'
-#' @note use.C required 'poi.so' beeing dynamicaly loaded.
-#'  dyn.load('../tensor_predictors/poi.so')
+#' @param A Left hand side of GEP
+#' @param B right hand side of GEP
+#' @param d number of eigen-vectors, -values to be computed coresponding to the
+#'  largest \eqn{d} eigenvalues of the penalized GEP
+#' @param sparsity scaling for max penalty term in [0, 1) where 0 corresponds to
+#'  no penalization and 1 leads to the trivial solution. (default: 1 / 2)
+#' @param method ether \code{"POI-C"} or \code{"FastPOI-C"} where
+#'      POI-C: Penalized Orthogonal Iteration with Coordinate-wise Lasso penalty
+#'      FastPOI-C: Fast POI with Coordinate-wise Lasso penalty
+#'  where the Coordinate-wise Lasso is a group Lasso penalty.
+#' @param iter.outer maximum number of orthogonal iterations (ignored by Fast
+#'  methods)
+#' @param iter.inner maximum number of inner iterations
+#' @param tol numerical tolerance. Absolute values smaller than \code{tol} are
+#'  treated as 0.
 #'
 #' @export
 POI <- function(A, B, d = 1L, sparsity = 0.5,
-    method = c('POI-C', 'FastPOI-C'), # TODO: Maybe implement the the lasso loss too
+    method = c('POI-C', 'FastPOI-C'), # TODO: Maybe implement Lasso penalty too
     iter.outer = 100L, iter.inner = 500L,
-    tol = sqrt(.Machine$double.eps)
+    tol = sqrt(.Machine$double.eps),
+    use.C = FALSE
 ) {
     method <- match.arg(method)
 
@@ -78,29 +90,36 @@ POI <- function(A, B, d = 1L, sparsity = 0.5,
     } else {
         Delta <- eigen(A, symmetric = TRUE)$vectors[, 1:d, drop = FALSE]
     }
-    Q <- Delta
-    Z <- matrix(0, nrow(Q), ncol(Q))
 
-    # Outer loop (iteration)
-    for (i in seq_len(iter.outer)) {
-        Q.last <- Q # for break condition
+    # In case of fast POI, only one iteration
+    if (startsWith(method, 'Fast')) {
+        # Step 1: Optimize (inner loop, a.k.a. coordinate wise penalization)
+        if (use.C) {
+            Z <- .Call('FastPOI_C_sub', B, Delta, lambda,
+                    as.integer(iter.inner), tol,
+                    PACKAGE = 'tensorPredictors')
+        } else {
+            # Initial value
+            Z <- Delta
 
-        # Step 1: Solve B Z_i = A Q_{i-1} for Z_i
-        Delta <- crossprod(A, Q)
-        # Inner Loop
-        for (j in seq_len(iter.inner)) {
-            Z.last <- Z # for break condition
+            # Note, the R implementation does NOT use a cyclic update instead
+            # performs coordinate penalization in parallel
+            for (j in seq_len(iter.inner)) {
+                Z.last <- Z # for break condition
 
-            traces <- Delta - B %*% Z + diag(B) * Z
-            Z <- traces * (pmax(1 - lambda / sqrt(rowSums(traces^2)), 0) / diag(B))
+                # TODO: it seems (in general) that the cyclic update is actually needed!
+                traces <- Delta - B %*% Z + diag(B) * Z
+                Z <- traces * (pmax(1 - lambda / sqrt(rowSums(traces^2)), 0) / diag(B))
 
-            # Inner break condition
-            if (norm(Z.last - Z, 'F') < tol) {
-                break
+                # Inner break condition (second condition safeguards against devergence)
+                diff <- norm(Z.last - Z, 'F')
+                if (diff < tol || 1 < tol * diff) {
+                    break
+                }
             }
         }
 
-        # Step 2: QR decomposition of Z_i = Q_i R_i.
+        # Step 2: QR decomposition (same as below)
         if (d == 1L) {
             Z.norm <- norm(Z, 'F')
             if (Z.norm < tol) {
@@ -120,94 +139,92 @@ POI <- function(A, B, d = 1L, sparsity = 0.5,
                 Q <- qr.Q(qr(Z))
             }
         }
+    } else { # POI-C
+        Q <- Delta
+        Z <- matrix(0, nrow(Q), ncol(Q))
 
-        # In case of fast POI, only one iteration
-        if (startsWith(method, 'Fast')) {
-            break
-        }
-        if (norm(tcrossprod(Q, Q) - tcrossprod(Q.last, Q.last), 'F') < tol) {
-            break
+        # Outer loop (iteration)
+        for (i in seq_len(iter.outer)) {
+            Q.last <- Q # for break condition
+
+            # Step 1: Solve B Z_i = A Q_{i-1} for Z_i
+            Delta <- crossprod(A, Q)
+
+            # Inner Loop
+            if (use.C) {
+                Z <- .Call('FastPOI_C_sub', B, Delta, lambda,
+                        as.integer(iter.inner), tol,
+                        PACKAGE = 'tensorPredictors')
+            } else {
+                # Note, the R implementation does NOT use a cyclic update instead
+                # performs coordinate penalization in parallel
+                for (j in seq_len(iter.inner)) {
+                    Z.last <- Z # for break condition
+
+                    # TODO: it seems (in general) that the cyclic update is actually needed!
+                    traces <- Delta - B %*% Z + diag(B) * Z
+                    Z <- traces * (pmax(1 - lambda / sqrt(rowSums(traces^2)), 0) / diag(B))
+
+                    # Inner break condition (second condition safeguards against devergence)
+                    diff <- norm(Z.last - Z, 'F')
+                    if (diff < tol || 1 < tol * diff) {
+                        break
+                    }
+                }
+            }
+
+            # Step 2: QR decomposition of Z_i = Q_i R_i.
+            if (d == 1L) {
+                Z.norm <- norm(Z, 'F')
+                if (Z.norm < tol) {
+                    Q <- matrix(0, p, d)
+                } else {
+                    Q <- Z / Z.norm
+                }
+            } else {
+                # Detect zero columns.
+                zero.col <- colSums(abs(Z)) < tol
+                if (all(zero.col)) {
+                    Q <- matrix(0, nrow(Z), ncol(Z))
+                } else if (any(zero.col)) {
+                    Q <- matrix(0, nrow(Z), ncol(Z))
+                    Q[, !zero.col] <- qr.Q(qr(Z[, !zero.col]))
+                } else {
+                    Q <- qr.Q(qr(Z))
+                }
+            }
+
+            # Outer break condition || Q Q' - Q.last Q.last' || < tol
+            # The used form is equivalent but much faster for d << p
+            # The following holds in general for two matrices A, B of dim p x d
+            #   || A A' - B B' ||^2 = || A' A ||^2 - 2 || A' B ||^2 + || B' B ||^2
+            # for the Frobenius norm ||.||. The computational cost of the left side
+            # is O(p^2 d) while the right side has O(p d^2).
+            tr <- sum(crossprod(Q)^2) -
+            2 * sum(crossprod(Q, Q.last)^2) +
+                sum(crossprod(Q.last)^2)
+            if (sqrt(max(0, tr)) < tol) {
+                break
+            }
         }
     }
 
-    # TODO: Finish with transformation to original solution U of
-    #   A U = B U Lambda.
+    # Reconstruct solution of the original GEP by solving
+    #   (Q' A Q) T = (Q' B Q) T D
+    # for T and D which gives the solution U = Q T and Lambda = D.
+    if (1 < d) {
+        eig <- solve.gep(crossprod(Q, A) %*% Q, crossprod(Q, B) %*% Q)
+        vectors <- Q %*% eig$vectors
+        values <- eig$values
+    } else {
+        vectors <- Q
+        values <- c((crossprod(Q, A) %*% Q) / (crossprod(Q, B) %*% Q))
+    }
 
     structure(list(
-        Z = Z, Q = Q,
+        vectors = vectors,
+        values = values,
         lambda = lambda,
         call = match.call()
     ), class = c("tensor_predictor", "POI"))
 }
-
-
-# POI.bak <- function(A, B, d,
-#                 lambda = 0.75 * sqrt(max(rowSums(Delta^2))),
-#                 update.tol = 1e-3,
-#                 tol = 100 * .Machine$double.eps,
-#                 maxit = 400L,
-#                 # maxit.outer = maxit,
-#                 maxit.inner = maxit,
-#                 use.C = FALSE,
-#                 method = 'FastPOI-C') {
-
-#     # TODO:
-#     stopifnot(method == 'FastPOI-C')
-
-#     if (requireNamespace("RSpectra", quietly = TRUE)) {
-#         Delta <- RSpectra::eigs_sym(A, d)$vectors
-#     } else {
-#         Delta <- eigen(A, symmetric = TRUE)$vectors[, 1:d, drop = FALSE]
-#     }
-
-#     # Set initial value.
-#     Z <- Delta
-
-#     # Step 1: Optimization.
-#     # The "inner" optimization loop, aka repeated coordinate optimization.
-#     if (use.C) {
-#         Z <- .Call('FastPOI_C_sub', A, B, Delta, lambda, as.integer(maxit.inner),
-#                    PACKAGE = 'tensorPredictors')
-#     } else {
-#         p <- nrow(Z)
-#         for (iter.inner in 1:maxit.inner) {
-#             Zold <- Z
-#             for (g in 1:p) {
-#                 a <- Delta[g, ] - B[g, ] %*% Z + B[g, g] * Z[g, ]
-#                 a_norm <- sqrt(sum(a^2))
-#                 if (a_norm > lambda) {
-#                     Z[g, ] <- a * ((1 - lambda / a_norm) / B[g, g])
-#                 } else {
-#                     Z[g, ] <- 0
-#                 }
-#             }
-#             if (norm(Z - Zold, 'F') < update.tol) {
-#                 break
-#             }
-#         }
-#     }
-
-#     # Step 2: QR decomposition.
-#     if (d == 1L) {
-#         Z_norm <- sqrt(sum(Z^2))
-#         if (Z_norm < tol) {
-#             Q <- matrix(0, p, d)
-#         } else {
-#             Q <- Z / Z_norm
-#         }
-#     } else {
-#         # Detect zero columns.
-#         zeroColumn <- colSums(abs(Z)) < tol
-#         if (all(zeroColumn)) {
-#             Q <- matrix(0, p, d)
-#         } else if (any(zeroColumn)) {
-#             Q <- matrix(0, p, d)
-#             Q[, !zeroColumn] <- qr.Q(qr(Z))
-#         } else {
-#             Q <- qr.Q(qr(Z))
-#         }
-#     }
-
-#     list(Z = Z, Q = Q, iter.inner = if (use.C) NA else iter.inner,
-#          lambda = lambda)
-# }
diff --git a/tensorPredictors/R/dist_projection.R b/tensorPredictors/R/dist_projection.R
new file mode 100644
index 0000000..7a5811d
--- /dev/null
+++ b/tensorPredictors/R/dist_projection.R
@@ -0,0 +1,27 @@
+#' Porjection Distance of two matrices
+#'
+#' Defined as sine of the maximum principal angle between the column spaces
+#' of the matrices
+#'      max{ sin theta_i, i = 1, ..., min(d1, d2) }
+#' 
+#' @param A,B matrices of size \eqn{p\times d_1} and \eqn{p\times d_2}.
+#'
+#' @export
+dist.projection <- function(A, B, is.ortho = FALSE,
+    tol = sqrt(.Machine$double.eps)
+) {
+    if (!is.ortho) {
+        qrA <- qr(A, tol)
+        A <- qr.Q(qrA)[, seq_len(qrA$rank), drop = FALSE]
+        qrB <- qr(B, tol)
+        B <- qr.Q(qrB)[, seq_len(qrB$rank), drop = FALSE]
+    }
+
+    if (ncol(A) == 0L && ncol(B) == 0L) {
+        0
+    } else if (ncol(A) == 0L || ncol(B) == 0L) {
+        1
+    } else {
+        sin(acos(min(c(La.svd(crossprod(A, B), 0, 0)$d, 1))))
+    }
+}
diff --git a/tensorPredictors/R/dist_subspace.R b/tensorPredictors/R/dist_subspace.R
index cb4dc6c..19d23f8 100644
--- a/tensorPredictors/R/dist_subspace.R
+++ b/tensorPredictors/R/dist_subspace.R
@@ -14,18 +14,30 @@
 #'  subspaces of different dimensions" <arXiv:1407.0900>
 #' 
 #' @export
-dist.subspace <- function (A, B, is.ortho = FALSE, normalize = FALSE) {
+dist.subspace <- function (A, B, is.ortho = FALSE, normalize = FALSE,
+    tol = sqrt(.Machine$double.eps)
+) {
     if (!is.matrix(A)) A <- as.matrix(A)
     if (!is.matrix(B)) B <- as.matrix(B)
 
-    if (is.ortho) {
-        PA <- tcrossprod(A, A)
-        PB <- tcrossprod(B, B)
-    } else {
-        PA <- A %*% solve(t(A) %*% A, t(A))
-        PB <- B %*% solve(t(B) %*% B, t(B))
+    if (!is.ortho) {
+        qrA <- qr(A, tol)
+        if (qrA$rank < ncol(A)) {
+            A <- qr.Q(qrA)[, abs(diag(qr.R(qrA))) > tol, drop = FALSE]
+        } else {
+            A <- qr.Q(qrA)
+        }
+        qrB <- qr(B, tol)
+        if (qrB$rank < ncol(B)) {
+            B <- qr.Q(qrB)[, abs(diag(qr.R(qrB))) > tol, drop = FALSE]
+        } else {
+            B <- qr.Q(qrB)
+        }
     }
 
+    PA <- tcrossprod(A, A)
+    PB <- tcrossprod(B, B)
+
     if (normalize) {
         rankSum <- ncol(A) + ncol(B)
         c <- 1 / sqrt(min(rankSum, 2 * nrow(A) - rankSum))
diff --git a/tensorPredictors/src/poi.c b/tensorPredictors/src/poi.c
index 30fa8a9..de3dbc1 100644
--- a/tensorPredictors/src/poi.c
+++ b/tensorPredictors/src/poi.c
@@ -1,10 +1,18 @@
 #include <math.h>
 
+/**
+ * 
+ * NOTE: CURRENTLY NOT IN USE!
+ * 
+ */
+
 #include <R.h>
 #include <Rinternals.h>
 
 /* invoced by .Call */
-extern SEXP FastPOI_C_sub(SEXP in_A, SEXP in_B, SEXP in_Delta, SEXP in_lambda, SEXP in_maxit) {
+extern SEXP FastPOI_C_sub(SEXP in_B, SEXP in_Delta,
+    SEXP in_lambda, SEXP in_maxit, SEXP in_tol
+) {
     int i, j, k, g;
 
     int p = nrows(in_Delta);
@@ -16,10 +24,10 @@ extern SEXP FastPOI_C_sub(SEXP in_A, SEXP in_B, SEXP in_Delta, SEXP in_lambda, S
     double* Zold = (double*)R_alloc(p * d, sizeof(double));
     double* Delta = REAL(in_Delta);
     double* a = (double*)R_alloc(d, sizeof(double));
-    double* A = REAL(in_A);
     double* B = REAL(in_B);
     double a_norm;
     double lambda = asReal(in_lambda);
+    double tol = asReal(in_tol);
     double scale;
     double res;
 
@@ -55,7 +63,6 @@ extern SEXP FastPOI_C_sub(SEXP in_A, SEXP in_B, SEXP in_Delta, SEXP in_lambda, S
             for (j = 0; j < d; ++j) {
                 Z[j * p + g] = scale * a[j];
             }
-
         }
 
         // Copy Z to Zold and check break condition.
@@ -64,7 +71,7 @@ extern SEXP FastPOI_C_sub(SEXP in_A, SEXP in_B, SEXP in_Delta, SEXP in_lambda, S
             res += (Z[j] - Zold[j]) * (Z[j] - Zold[j]);
             Zold[j] = Z[j];
         }
-        if (res < 1e-6) {
+        if (res < tol) {
             break;
         }