CVE/simulations/hitters.R

library(ISLR) # for Hitters dataset
library(MAVE)
library(CVE)

# Set global parameters.
seed <- 21
max.dim <- 5L
attempts <- 25L
max.iter <- 100L
# momentum <- 0.0

# Prepair data for analysis.
cols <- c("Salary", "AtBat", "Hits", "HmRun", "Runs", "RBI",
          "Walks", "Years", "CAtBat","CHits", "CHmRun","CRuns",
          "CRBI", "CWalks", "PutOuts", "Assists", "Errors")
outliers <- c(92,  # Gary Pettis
              120, # John Moses
              173, # Milt Thompson
              189, # Rick Burleson
              220, # Scott Fletcher
              230, # Tom Foley
              241) # Terry Puhl
# Subselect as a matrix without outliers as well as reordered
# and filtered columns.
ds <- na.omit(Hitters[, cols])[-outliers, ]
ds$Salary <- log(ds$Salary)
ds <- scale(ds, center = TRUE, scale = TRUE)
# Split into data and responce.
X <- as.matrix(ds[, colnames(ds) != "Salary"])
Y <- as.matrix(ds[, "Salary"])


pdf("results/hitters.pdf", width = 15, height = 5)
layout(matrix(c(1, 2), nrow = 1))
cat("Seed: ", seed, "\n\n")
################################################################################
###                                 meanMAVE                                 ###
################################################################################
set.seed(seed)

dr <- mave(Y ~ X, method = "meanMAVE", max.dim = max.dim)
dim <- which.min(mave.dim(dr)$cv)
B <- coef(dr, 2L)
# train linear model on reduced data
data <- as.data.frame(cbind(Y, X %*% B))
colnames(data) <- c("Y", "dir1", "dir2")
model <- lm(Y ~ dir1 + I(dir1^2) + dir2, data = data)
# Print and log.
cat("### meanMAVE\n\nest.dim: ", dim, '\n\n')
summary(model)
plot(x = data[, "dir1"], y = data[, "Y"],
     main = "meanMave", xlab = "dir1", ylab = "Y")
plot(x = data[, "dir2"], y = data[, "Y"],
     main = "meanMave", xlab = "dir2", ylab = "Y")

################################################################################
###                                   CVE                                    ###
################################################################################
set.seed(seed)

dr <- cve(Y ~ X, max.dim = max.dim, max.iter = max.iter, attempts = attempts)
dim <- predict_dim(dr, method = "cv")$k
B <- coef(dr, 2L)
# Determine 1st direction (cause CVE does not care for column order).
if (which.max(abs(t(coef(dr, 1L)) %*% B)) == 2) {
     B <- B[, c(2, 1)] # switch directions
}
# train linear model on reduced data
data <- as.data.frame(cbind(Y, X %*% B))
colnames(data) <- c("Y", "dir1", "dir2")
model <- lm(Y ~ dir1 + I(dir1^2) + dir2, data = data)
# Print and log.
cat("### CVE\n\nest.dim: ", dim, '\n\n')
summary(model)
plot(x = data[, "dir1"], y = data[, "Y"],
     main = "CVE", xlab = "dir1", ylab = "Y")
plot(x = data[, "dir2"], y = data[, "Y"],
     main = "CVE", xlab = "dir2", ylab = "Y")

dev.off()