add: real_data scripts
parent 6e4f3d30a7, commit 2c5a87ccfe

@@ -0,0 +1,79 @@
#!/usr/bin/env Rscript
## data source: https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data

library(mda)                            # provides `mars()`, used below as a reference method
Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3")  # Suppress `tensorflow` notes/warnings
suppressPackageStartupMessages({
    library(NNSDR)
})

## Configuration
d <- 4L                 # reduction dimension
epochs <- c(2L, 3L)     # training epochs (OPG, Refinement)

## Load the "Beijing Air Quality" data set (all site CSV files found in the
## directory) and drop incomplete rows
files <- list.files('data/Beijing Multi Site Air Quality Data/',
                    pattern = '\\.csv$', full.names = TRUE)
ds <- na.omit(Reduce(rbind, lapply(files, read.csv)))

## Create model matrix with dummy variables for factors (One-Hot encoded) for
## regression of PM2.5 (and dropping PM10)
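## (`+ 0` drops the intercept column from the model matrix)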
X <- model.matrix(~ year + month + day + hour + SO2 + NO2 + CO + O3 + TEMP +
                    PRES + DEWP + RAIN + wd + WSPM + station + 0, ds)
Y <- as.matrix(ds$PM2.5)

## Build Dimension Reduction Neural Network model (matching the data)
nn <- nnsdr$new(
    input_shapes = list(x = ncol(X)),
    d = d,                  # Reduction dimension
    hidden_units = 512L,
    activation = 'relu'
)

## Open simulation log file, write simulation configuration and header
log <- file(format(Sys.time(), "results/Beijing_Air_Quality.csv"), "w", blocking = FALSE)
cat('# d = ', d, '\n# epochs = ', epochs[1], ',', epochs[2], '\n',
    'method,fold,mse,var(Y.test),time.user,time.system,time.elapsed\n',
    sep = '', file = log, append = TRUE)

## K-Fold Cross Validation
K <- 10
for (i in 1:K) {
    ## Split into train/test sets
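    ## (the length-K logical mask below is recycled over the rows of `X`,
    ##  so rows i, i+K, i+2K, ... form the test fold)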
    train <- (1:K) != i
    X.train <- scale(X[train, ])
    Y.train <- Y[train, , drop = FALSE]
    X.test <- scale(X[!train, ], center = attr(X.train, 'scaled:center'),
                                 scale = attr(X.train, 'scaled:scale'))
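    ## (test predictors are standardized using the training set's center and scale)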
    Y.test <- Y[!train, , drop = FALSE]

    ## Training
    time <- system.time(nn$fit(X.train, Y.train, epochs = epochs, initializer = 'fromOPG'))
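    ## (as configured above: `epochs[1]` for the OPG initialization, `epochs[2]` for the refinement)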

    mse <- mean((nn$predict(X.test) - Y.test)^2)
    cat('"nn.ref",', i, ',', mse, ',', c(var(Y.test)), ',',
        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
        sep = '', file = log, append = TRUE)

    ## Linear Model
    time <- system.time(lm.mod <- lm(y ~ ., data.frame(X.train, y = Y.train)))

    mse <- mean((predict(lm.mod, data.frame(X.test, y = Y.test)) - Y.test)^2)
    cat('"lm",', i, ',', mse, ',', c(var(Y.test)), ',',
        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
        sep = '', file = log, append = TRUE)

    ## MARS
    time <- system.time(mars.mod <- mars(X.train, Y.train))

    mse <- mean((predict(mars.mod, X.test) - Y.test)^2)
    cat('"mars",', i, ',', mse, ',', c(var(Y.test)), ',',
        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
        sep = '', file = log, append = TRUE)

    ## Reset model
    nn$reset()
}

## Finished, close simulation log file
close(log)

@@ -0,0 +1,70 @@
#!/usr/bin/env Rscript

library(MAVE)    # provides `mave.compute()`, used below as a reference method
library(CVarE)   # provides `cve.call()`, used below as a reference method
library(mlbench) # provides the `BostonHousing` data set
Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
suppressPackageStartupMessages({
    library(NNSDR)
})

## Configuration
d <- 1L     # reduction dimension

## Load Boston Housing data set
data('BostonHousing')
ds <- BostonHousing[, names(BostonHousing) != 'chas']
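## (`chas` is the only factor column; the remaining columns are numeric)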

## Build Dimension Reduction Neural Network model (matching the data)
nn <- nnsdr$new(
    input_shapes = list(x = ncol(ds) - 1L),
    d = d,                  # Reduction dimension
    hidden_units = 512L,
    activation = 'relu'
)

## Open simulation log file, write simulation configuration and header
log <- file(format(Sys.time(), "results/BostonHousing.csv"), "w", blocking = FALSE)
cat('# d = ', d, '\n', 'method,fold,mse\n', sep = '', file = log, append = TRUE)

## K-Fold Cross Validation
K <- nrow(ds)   # with K == nrow(ds) -> LOO CV
for (i in 1:K) {
    ds.train <- ds[(1:K) != i, ]
    ds.test <- ds[(1:K) == i, , drop = FALSE]
    X.train <- as.matrix(ds.train[, names(ds) != 'medv'])
    Y.train <- as.matrix(ds.train[, 'medv'])
    X.test <- as.matrix(ds.test[, names(ds) != 'medv'])
    Y.test <- as.matrix(ds.test[, 'medv'])

    ## Fit `DR` Neural Network model
    nn$fit(X.train, Y.train, epochs = c(200L, 400L), initializer = 'fromOPG')
    Y.pred <- nn$predict(X.test)
    mse <- mean((Y.pred - Y.test)^2)
    cat('"nn.ref",', i, ',', mse, '\n',
        sep = '', file = log, append = TRUE)

    ## MAVE as reference
    dr <- mave.compute(X.train, Y.train, method = 'meanMAVE', max.dim = d)
    Y.pred <- predict(dr, X.test, d)
    mse <- mean((Y.pred - Y.test)^2)
    cat('"mave",', i, ',', mse, '\n',
        sep = '', file = log, append = TRUE)

    ## and CVE
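    ## (CVE is fit on standardized predictors, so the test set is transformed
    ##  with the training center/scale before prediction)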
    X.scaled <- scale(X.train)
    dr <- cve.call(X.scaled, Y.train, k = d)
    Y.pred <- predict(dr, scale(X.test,
                                scale = attr(X.scaled, 'scaled:scale'),
                                center = attr(X.scaled, 'scaled:center')),
                      k = d)
    mse <- mean((Y.pred - Y.test)^2)
    cat('"cve",', i, ',', mse, '\n',
        sep = '', file = log, append = TRUE)

    ## Reset model
    nn$reset()
}

## Finished, close simulation log file
close(log)

@@ -0,0 +1,86 @@
#!/usr/bin/env Rscript
## data source: the `MAVE` R-package

library(MAVE)    # also provides the `kc_house_data` data set
library(CVarE)
Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings
suppressPackageStartupMessages({
    library(NNSDR)
})

## Configuration
d <- 1L                 # reduction dimension
epochs <- c(50L, 100L)  # training epochs (OPG, Refinement)
dropped <- c('id', 'date', 'zipcode') #, 'sqft_basement') # columns to be dropped

## Load the "House Price in King County, USA" data set provided by MAVE
data('kc_house_data')
ds <- kc_house_data[, !(names(kc_house_data) %in% dropped)]
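## (`price` is the response; all remaining columns are used as predictors)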

## Build Dimension Reduction Neural Network model (matching the data)
nn <- nnsdr$new(
    input_shapes = list(x = ncol(ds) - 1L),
    d = d,                  # Reduction dimension
    hidden_units = 512L,
    activation = 'relu'
)

## Open simulation log file, write simulation configuration and header
log <- file(format(Sys.time(), "results/kc_house_data.csv"), "w", blocking = FALSE)
cat('# d = ', d, '\n# epochs = ', epochs[1], ',', epochs[2], '\n',
    '# dropped = ', paste(dropped, collapse = ', '), '\n',
    'method,fold,mse,var(Y.test),time.user,time.system,time.elapsed\n',
    sep = '', file = log, append = TRUE)

## K-Fold Cross Validation
K <- 10
for (i in 1:K) {
    ds.train <- ds[(1:K) != i, ]
    ds.test <- ds[(1:K) == i, , drop = FALSE]
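    ## (the length-K comparisons are recycled over the rows of `ds`,
    ##  so rows i, i+K, i+2K, ... form the test fold)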
    X.train <- as.matrix(ds.train[, names(ds) != 'price'])
    Y.train <- as.matrix(ds.train[, 'price'])
    X.test <- as.matrix(ds.test[, names(ds) != 'price'])
    Y.test <- as.matrix(ds.test[, 'price'])

    ## Fit `DR` Neural Network model
    time <- system.time(nn$fit(X.train, Y.train, epochs = epochs, initializer = 'fromOPG'))

    mse <- mean((nn$predict(X.test) - Y.test)^2)
    cat('"nn.ref",', i, ',', mse, ',', c(var(Y.test)), ',',
        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
        sep = '', file = log, append = TRUE)

    ## `MAVE`
    time <- system.time(dr <- mave.compute(X.train, Y.train, method = 'meanMAVE', max.dim = d))

    # Sometimes prediction via the `mda` package fails with an NA/NaN/Inf value error.
    mse <- tryCatch(mean((predict(dr, X.test, d) - Y.test)^2),
                    error = function(err) NA)
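    ## (on failure `NA` is logged for this fold and the loop continues)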
    cat('"mave",', i, ',', mse, ',', c(var(Y.test)), ',',
        time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
        sep = '', file = log, append = TRUE)

    # The current implementation requires too much memory (CVarE v1.1); run on the `VSC` instead.
    # ## and CVE
    # X.scaled <- scale(X.train)
    # time <- system.time(dr <- cve.call(X.scaled, Y.train, k = d))

    # # Might have the same problem as MAVE since `mda` is used as well.
    # mse <- tryCatch({
    #     Y.pred <- predict(dr, scale(X.test,
    #                                 scale = attr(X.scaled, 'scaled:scale'),
    #                                 center = attr(X.scaled, 'scaled:center')),
    #                       k = d)
    #     mean((Y.pred - Y.test)^2)
    # },
    # error = function(err) NA)
    # cat('"cve",', i, ',', mse, ',', c(var(Y.test)), ',',
    #     time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n',
    #     sep = '', file = log, append = TRUE)

    ## Reset model
    nn$reset()
}

## Finished, close simulation log file
close(log)