library(CVE)
library(reticulate)
library(tensorflow)

#' Null space basis of given matrix `V`
#'
#' @param V `(p, q)` matrix
#' @return Semi-orthogonal `(p, p - q)` matrix spaning the null space of `V`.
#' @keywords internal
#' @export
null <- function(V) {
    tmp <- qr(V)
    set <- if(tmp$rank == 0L) seq_len(ncol(V)) else -seq_len(tmp$rank)
    return(qr.Q(tmp, complete = TRUE)[, set, drop = FALSE])
}

subspace_dist <- function(A, B) {
    P <- A %*% solve(t(A) %*% A, t(A))
    Q <- B %*% solve(t(B) %*% B, t(B))
    norm(P - Q, 'F') / sqrt(ncol(A) + ncol(B))
}

estimate.bandwidth <- function (X, k, nObs = sqrt(nrow(X)), version = 1L) {
    n <- nrow(X)
    p <- ncol(X)
    X_c <- scale(X, center = TRUE, scale = FALSE)

    if (version == 1) {
        (2 * sum(X_c^2) / (n * p)) * (1.2 * n^(-1 / (4 + k)))^2
    } else if (version == 2) {
        2 * qchisq((nObs - 1) / (n - 1), k) * sum(X_c^2) / (n * p)
    } else {
        stop("Unknown version.")
    }
}

tf_Variable <- function(obj, dtype = "float32", ...) {
    tf$Variable(obj, dtype = dtype, ...)
}
tf_constant <- function(obj, dtype = "float32", ...) {
    tf$constant(obj, dtype = dtype, ...)
}

cve.tf <- function(X, Y, k, h = estimate.bandwidth(X, k, sqrt(nrow(X))),
    V.init = NULL, optimizer_initialier = tf$optimizers$RMSprop, attempts = 10L,
    nr.projections = nrow(X)^(3 / 2),
    sd_noise = 0, method = c("simple", "weighted")
) {
    method <- match.arg(method)

    `-0.5` <- tf_constant(-0.5)
    `1` <- tf_constant(1)
    `2` <- tf_constant(2)
    n <- nrow(X)
    p <- ncol(X)
    k <- as.integer(k)
    q <- p - k

    if (!is.matrix(Y))
        Y <- as.matrix(Y)

    # Projective resampling.
    if (ncol(Y) > 1L) {
        R <- matrix(rnorm(ncol(Y) * nr.projections), ncol(Y))
        R <- t(t(R) / sqrt(colSums(R^2)))
        Y <- Y %*% R
    }

    X <- tf_constant(scale(X))
    Y <- tf_constant(scale(Y))
    I <- tf_constant(diag(1, p))
    h <- tf_Variable(h)

    loss <- tf_function(function(V) {
        Q <- I - tf$matmul(V, V, transpose_b = TRUE)
        if (sd_noise > 0)
            XQ <- tf$matmul(X + tf$random$normal(list(n, p), stddev = sd_noise), Q)
        else
            XQ <- tf$matmul(X, Q)
        S <- tf$matmul(XQ, XQ, transpose_b = TRUE)
        d <- tf$linalg$diag_part(S)
        D <- tf$reshape(d, list(n, 1L)) + tf$reshape(d, list(1L, n)) - `2` * S
        K <- tf$exp((`-0.5` / h) * tf$pow(D, 2L))
        w <- tf$reduce_sum(K, 1L, keepdims = TRUE)
        y1 <- tf$divide(tf$matmul(K, Y), w)
        y2 <- tf$divide(tf$matmul(K, tf$pow(Y, 2L)), w)
        if (method == "simple") {
            l <- tf$reduce_mean(y2 - tf$pow(y1, 2L))
        } else {# weighted
            w <- w - `1`
            w <- w / tf$reduce_sum(w)
            l <- tf$reduce_sum(w * (y2 - tf$pow(y1, 2L)))
            l <- l / tf$cast(tf$shape(Y)[2], "float32")
        }
        l
    })

    if (is.null(V.init))
        V.init <- qr.Q(qr(matrix(rnorm(p * q), p, q)))
    else
        attempts <- 1L
    V <- tf_Variable(V.init, constraint = function(w) { tf$linalg$qr(w)$q })

    min.loss <- Inf
    for (attempt in seq_len(attempts)) {
        optimizer = optimizer_initialier()

        out <- tf$while_loop(
            cond = tf_function(function(i, L) i < 400L),
            body = tf_function(function(i, L) {
                with(tf$GradientTape() %as% tape, {
                    tape$watch(V)
                    L <- loss(V)
                })
                grad <- tape$gradient(L, V)
                optimizer$apply_gradients(list(list(grad, V)))

                list(i + 1L, L)
            }),
            loop_vars = list(tf_constant(0L, "int32"), tf_constant(Inf))
        )

        if (as.numeric(out[[2]]) < min.loss) {
            min.loss <- as.numeric(out[[2]])
            min.V <- as.matrix(V)
        }
        V$assign(qr.Q(qr(matrix(rnorm(p * q), p, q))))
    }

    list(B = null(min.V), V = min.V, loss = min.loss)
}
# ds <- dataset(1)
# out <- cve.call2(ds$X, ds$Y, ncol(ds$B))

plot.sim <- function(sim) {
    name <- deparse(substitute(sim))
    ssd <- sapply(sim, function(s) subspace_dist(s$B.true, s$B.est))
    print(summary(ssd))
    h <- hist(ssd, freq = FALSE, breaks = seq(0, 1, 0.1), main = name,
              xlab = "Subspace Distance")
    lines(density(ssd, from = 0, to = 1))
    stat <- c(Median = median(ssd), Mean = mean(ssd))
    abline(v = stat, lty = 2)
    text(stat, max(h$density), names(stat),
         pos = if(diff(stat) > 0) c("2", "4") else c("4", "2"))
}

multivariate.dataset <- function(n = 100, p = 6, q = 4) {
    CVE <- getNamespace('CVE')

    X <- matrix(rnorm(n * p), n, p)
    Delta <- diag(1, q, q)
    Delta[1, 2] <- Delta[2, 1] <- -0.5
    epsilon <- CVE$rmvnorm(n, sigma = Delta)
    B <- matrix(0, p, q)
    B[1, 1] <- 1
    B[2, 2] <- 2 / sqrt(5)
    B[3, 2] <- 1 / sqrt(5)

    Y <- X %*% B + epsilon

    list(X = X, Y = Y, B = B[, 1:2])
}


set.seed(42)
reps <- 10L
sim.cve <- vector("list", reps)
sim.tf1  <- vector("list", reps)
sim.tf2  <- vector("list", reps)

start <- Sys.time()
for (i in 1:reps) {
    # ds <- dataset(1)
    ds <- multivariate.dataset()

    # sim.cve[[i]] <- list(
    #     B.est = coef(CVE::cve.call(ds$X, ds$Y, k = ncol(ds$B)), ncol(ds$B)),
    #     B.true = ds$B
    # )

    sim.tf1[[i]] <- list(
        B.est = cve.tf(ds$X, ds$Y, ncol(ds$B),
            optimizer_initialier = tf$optimizers$Adam)$B,
        B.true = ds$B
    )

    sim.tf2[[i]] <- list(
        B.est = cve.tf(ds$X, ds$Y, ncol(ds$B),
            optimizer_initialier = tf$optimizers$Adam,
            method = "weighted")$B,
        B.true = ds$B
    )

    cat(sprintf("\r%4d/%d -", i, reps), format(Sys.time() - start), '\n')
}

# pdf('subspace_comp.pdf')
par(mfrow = c(3, 1))
plot.sim(sim.cve)
plot.sim(sim.tf1)
plot.sim(sim.tf2)
# dev.off()