From 2c5a87ccfef971967ea623326d49d42ededd7dc6 Mon Sep 17 00:00:00 2001 From: daniel Date: Tue, 20 Apr 2021 19:42:33 +0200 Subject: [PATCH] add: real_data scripts --- .../Beijing_Multi_Site_Air_Quality_Data.R | 79 +++++++++++++++++ real_data/BostonHousing.R | 70 +++++++++++++++ real_data/kc_house_data.R | 86 +++++++++++++++++++ 3 files changed, 235 insertions(+) create mode 100644 real_data/Beijing_Multi_Site_Air_Quality_Data.R create mode 100644 real_data/BostonHousing.R create mode 100644 real_data/kc_house_data.R diff --git a/real_data/Beijing_Multi_Site_Air_Quality_Data.R b/real_data/Beijing_Multi_Site_Air_Quality_Data.R new file mode 100644 index 0000000..5957f5c --- /dev/null +++ b/real_data/Beijing_Multi_Site_Air_Quality_Data.R @@ -0,0 +1,79 @@ +#!/usr/bin/env Rscript +## data source: https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data + +library(mda) +Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings +suppressPackageStartupMessages({ + library(NNSDR) +}) + +## Configuration +d <- 4L # reduction dimension +epochs = c(2L, 3L) # training epochs (OPG, Refinement) + +## Loading one site of the "Beijing Air Quality" data set +files <- list.files('data/Beijing\ Multi\ Site\ Air\ Quality\ Data/', + pattern = '*.csv', full.names = TRUE) +ds <- na.omit(Reduce(rbind, lapply(files, read.csv))) + +## Create model matrix with dummy variables for factors (One-Hot encoded) for +## regression of PM2.5 (and dropping PM10) +X <- model.matrix(~ year + month + day + hour + SO2 + NO2 + CO + O3 + TEMP + + PRES + DEWP + RAIN + wd + WSPM + station + 0, ds) +Y <- as.matrix(ds$PM2.5) + +## Build Dimension Reduction Neuronal Network model (matching the data) +nn <- nnsdr$new( + input_shapes = list(x = ncol(X)), + d = d, # Reduction dimension + hidden_units = 512L, + activation = 'relu' +) + +## Open simulation log file, write simulation configuration and header +log <- file(format(Sys.time(), "results/Beijing_Air_Quality.csv"), "w", blocking = FALSE) +cat('# d = ', d, '\n# epochs = ', epochs[1], ',', epochs[2], '\n', + 'method,fold,mse,var(Y.test),time.user,time.system,time.elapsed\n', + sep = '', file = log, append = TRUE) + +## K-Fold Cross Validation +K <- 10 +for (i in 1:K) { + ## Split into train/test sets + train <- (1:K) != i + X.train <- scale(X[train, ]) + Y.train <- Y[train, , drop = FALSE] + X.test <- scale(X[!train, ], center = attr(X.train, 'scaled:center'), + scale = attr(X.train, 'scaled:scale')) + Y.test <- Y[!train, , drop = FALSE] + + ## Training + time <- system.time(nn$fit(X.train, Y.train, epochs = epochs, initializer = 'fromOPG')) + + mse <- mean((nn$predict(X.test) - Y.test)^2) + cat('"nn.ref",', i, ',', mse, ',', c(var(Y.test)), ',', + time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', + sep = '', file = log, append = TRUE) + + ## Linear Model + time <- system.time(lm.mod <- lm(y ~ ., data.frame(X.train, y = Y.train))) + + mse <- mean((predict(lm.mod, data.frame(X.test, y = Y.test)) - Y.test)^2) + cat('"lm",', i, ',', mse, ',', c(var(Y.test)), ',', + time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', + sep = '', file = log, append = TRUE) + + ## MARS + time <- system.time(mars.mod <- mars(X.train, Y.train)) + + mse <- mean((predict(mars.mod, X.test) - Y.test)^2) + cat('"mars",', i, ',', mse, ',', c(var(Y.test)), ',', + time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', + sep = '', file = log, append = TRUE) + + ## Reset model + nn$reset() +} + +## Finished, close simulation log file +close(log) diff --git a/real_data/BostonHousing.R b/real_data/BostonHousing.R new file mode 100644 index 0000000..9268dee --- /dev/null +++ b/real_data/BostonHousing.R @@ -0,0 +1,70 @@ +#!/usr/bin/env Rscript + +library(MAVE) +library(CVarE) +library(mlbench) +Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings +suppressPackageStartupMessages({ + library(NNSDR) +}) + +## Configuration +d <- 1L + +## Load Boston Housing data set +data('BostonHousing') +ds <- BostonHousing[, names(BostonHousing) != 'chas'] + +## Build Dimension Reduction Neuronal Network model (matching the data) +nn <- nnsdr$new( + input_shapes = list(x = ncol(ds) - 1L), + d = d, # Reduction dimension + hidden_units = 512L, + activation = 'relu' +) + +## Open simulation log file, write simulation configuration and header +log <- file(format(Sys.time(), "results/BostonHousing.csv"), "w", blocking = FALSE) +cat('# d = ', d, '\n', 'method,fold,mse\n', sep = '', file = log, append = TRUE) + +## K-Fold Cross Validation +K <- nrow(ds) # with K == nrow(ds) -> LOO CV +for (i in 1:K) { + ds.train <- ds[(1:K) != i, ] + ds.test <- ds[(1:K) == i, , drop = FALSE] + X.train <- as.matrix(ds.train[, names(ds) != 'medv']) + Y.train <- as.matrix(ds.train[, 'medv']) + X.test <- as.matrix(ds.test[, names(ds) != 'medv']) + Y.test <- as.matrix(ds.test[, 'medv']) + + ## Fit `DR` Neuronal Network model + nn$fit(X.train, Y.train, epochs = c(200L, 400L), initializer = 'fromOPG') + Y.pred <- nn$predict(X.test) + mse <- mean((Y.pred - Y.test)^2) + cat('"nn.ref",', i, ',', mse, '\n', + sep = '', file = log, append = TRUE) + + ## MAVE as reference + dr <- mave.compute(X.train, Y.train, method = 'meanMAVE', max.dim = d) + Y.pred <- predict(dr, X.test, d) + mse <- mean((Y.pred - Y.test)^2) + cat('"mave",', i, ',', mse, '\n', + sep = '', file = log, append = TRUE) + + ## and CVE + X.scaled <- scale(X.train) + dr <- cve.call(X.scaled, Y.train, k = d) + Y.pred <- predict(dr, scale(X.test, + scale = attr(X.scaled, 'scaled:scale'), + center = attr(X.scaled, 'scaled:center')), + k = d) + mse <- mean((Y.pred - Y.test)^2) + cat('"cve",', i, ',', mse, '\n', + sep = '', file = log, append = TRUE) + + ## Reset model + nn$reset() +} + +## Finished, close simulation log file +close(log) diff --git a/real_data/kc_house_data.R b/real_data/kc_house_data.R new file mode 100644 index 0000000..4fccf3a --- /dev/null +++ b/real_data/kc_house_data.R @@ -0,0 +1,86 @@ +#!/usr/bin/env Rscript +## data source: the `MAVE` R-package + +library(MAVE) +library(CVarE) +Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "3") # Suppress `tensorflow` notes/warnings +suppressPackageStartupMessages({ + library(NNSDR) +}) + +## Configuration +d <- 1L # reduction dimension +epochs = c(50L, 100L) # training epochs +dropped <- c('id', 'date', 'zipcode') #, 'sqft_basement') # columns to be dropped + +## Loading the "House Price in King Counte, USA" data set provided by MAVE +data('kc_house_data') +ds <- kc_house_data[, !(names(kc_house_data) %in% dropped)] + +## Build Dimension Reduction Neuronal Network model (matching the data) +nn <- nnsdr$new( + input_shapes = list(x = ncol(ds) - 1L), + d = d, # Reduction dimension + hidden_units = 512L, + activation = 'relu' +) + +## Open simulation log file, write simulation configuration and header +log <- file(format(Sys.time(), "results/kc_house_data.csv"), "w", blocking = FALSE) +cat('# d = ', d, '\n# epochs = ', epochs[1], ',', epochs[2], '\n', + '# dropped = ', paste(dropped, collapse = ', '), '\n', + 'method,fold,mse,var(Y.test),time.user,time.system,time.elapsed\n', + sep = '', file = log, append = TRUE) + +## K-Fold Cross Validation +K <- 10 +for (i in 1:K) { + ds.train <- ds[(1:K) != i, ] + ds.test <- ds[(1:K) == i, , drop = FALSE] + X.train <- as.matrix(ds.train[, names(ds) != 'price']) + Y.train <- as.matrix(ds.train[, 'price']) + X.test <- as.matrix(ds.test[, names(ds) != 'price']) + Y.test <- as.matrix(ds.test[, 'price']) + + ## Fit `DR` Neuronal Network model + time <- system.time(nn$fit(X.train, Y.train, epochs = epochs, initializer = 'fromOPG')) + + mse <- mean((nn$predict(X.test) - Y.test)^2) + cat('"nn.ref",', i, ',', mse, ',', c(var(Y.test)), ',', + time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', + sep = '', file = log, append = TRUE) + + ## `MAVE` + time <- system.time(dr <- mave.compute(X.train, Y.train, method = 'meanMAVE', max.dim = d)) + + # Sometimes the `mda` package fails -> predict with NA/NaN/Inf value error. + mse <- tryCatch(mean((predict(dr, X.test, d) - Y.test)^2), + error = function(err) NA) + cat('"mave",', i, ',', mse, ',', c(var(Y.test)), ',', + time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', + sep = '', file = log, append = TRUE) + + # Current implementation requires too much memory (CVarE v1.1). Run on `VSC`. + # ## and CVE + # X.scaled <- scale(X.train) + # time <- system.time(dr <- cve.call(X.scaled, Y.train, k = d)) + + # # Might have the same problem as MAVE since we use `mda` as well. + # mse <- tryCatch({ + # Y.pred <- predict(dr, scale(X.test, + # scale = attr(X.scaled, 'scaled:scale'), + # center = attr(X.scaled, 'scaled:center')), + # k = d) + # mean((Y.pred - Y.test)^2) + # }, + # error = function(err) NA) + # cat('"cve",', i, ',', mse, ',', c(var(Y.test)), ',', + # time['user.self'], ',', time['sys.self'], ',', time['elapsed'], '\n', + # sep = '', file = log, append = TRUE) + + ## Reset model + nn$reset() +} + +## Finished, close simulation log file +close(log)