81 lines
2.8 KiB
R
81 lines
2.8 KiB
R
|
library(ISLR) # for Hitters dataset
|
||
|
library(MAVE)
|
||
|
library(CVE)
|
||
|
|
||
|
# Set global parameters.
|
||
|
seed <- 21
|
||
|
max.dim <- 5L
|
||
|
attempts <- 25L
|
||
|
max.iter <- 100L
|
||
|
# momentum <- 0.0
|
||
|
|
||
|
# Prepair data for analysis.
|
||
|
cols <- c("Salary", "AtBat", "Hits", "HmRun", "Runs", "RBI",
|
||
|
"Walks", "Years", "CAtBat","CHits", "CHmRun","CRuns",
|
||
|
"CRBI", "CWalks", "PutOuts", "Assists", "Errors")
|
||
|
outliers <- c(92, # Gary Pettis
|
||
|
120, # John Moses
|
||
|
173, # Milt Thompson
|
||
|
189, # Rick Burleson
|
||
|
220, # Scott Fletcher
|
||
|
230, # Tom Foley
|
||
|
241) # Terry Puhl
|
||
|
# Subselect as a matrix without outliers as well as reordered
|
||
|
# and filtered columns.
|
||
|
ds <- na.omit(Hitters[, cols])[-outliers, ]
|
||
|
ds$Salary <- log(ds$Salary)
|
||
|
ds <- scale(ds, center = TRUE, scale = TRUE)
|
||
|
# Split into data and responce.
|
||
|
X <- as.matrix(ds[, colnames(ds) != "Salary"])
|
||
|
Y <- as.matrix(ds[, "Salary"])
|
||
|
|
||
|
|
||
|
pdf("results/hitters.pdf", width = 15, height = 5)
|
||
|
layout(matrix(c(1, 2), nrow = 1))
|
||
|
cat("Seed: ", seed, "\n\n")
|
||
|
################################################################################
|
||
|
### meanMAVE ###
|
||
|
################################################################################
|
||
|
set.seed(seed)
|
||
|
|
||
|
dr <- mave(Y ~ X, method = "meanMAVE", max.dim = max.dim)
|
||
|
dim <- which.min(mave.dim(dr)$cv)
|
||
|
B <- coef(dr, 2L)
|
||
|
# train linear model on reduced data
|
||
|
data <- as.data.frame(cbind(Y, X %*% B))
|
||
|
colnames(data) <- c("Y", "dir1", "dir2")
|
||
|
model <- lm(Y ~ dir1 + I(dir1^2) + dir2, data = data)
|
||
|
# Print and log.
|
||
|
cat("### meanMAVE\n\nest.dim: ", dim, '\n\n')
|
||
|
summary(model)
|
||
|
plot(x = data[, "dir1"], y = data[, "Y"],
|
||
|
main = "meanMave", xlab = "dir1", ylab = "Y")
|
||
|
plot(x = data[, "dir2"], y = data[, "Y"],
|
||
|
main = "meanMave", xlab = "dir2", ylab = "Y")
|
||
|
|
||
|
################################################################################
|
||
|
### CVE ###
|
||
|
################################################################################
|
||
|
set.seed(seed)
|
||
|
|
||
|
dr <- cve(Y ~ X, max.dim = max.dim, max.iter = max.iter, attempts = attempts)
|
||
|
dim <- predict_dim(dr, method = "cv")$k
|
||
|
B <- coef(dr, 2L)
|
||
|
# Determine 1st direction (cause CVE does not care for column order).
|
||
|
if (which.max(abs(t(coef(dr, 1L)) %*% B)) == 2) {
|
||
|
B <- B[, c(2, 1)] # switch directions
|
||
|
}
|
||
|
# train linear model on reduced data
|
||
|
data <- as.data.frame(cbind(Y, X %*% B))
|
||
|
colnames(data) <- c("Y", "dir1", "dir2")
|
||
|
model <- lm(Y ~ dir1 + I(dir1^2) + dir2, data = data)
|
||
|
# Print and log.
|
||
|
cat("### CVE\n\nest.dim: ", dim, '\n\n')
|
||
|
summary(model)
|
||
|
plot(x = data[, "dir1"], y = data[, "Y"],
|
||
|
main = "CVE", xlab = "dir1", ylab = "Y")
|
||
|
plot(x = data[, "dir2"], y = data[, "Y"],
|
||
|
main = "CVE", xlab = "dir2", ylab = "Y")
|
||
|
|
||
|
dev.off()
|