```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
####Analytics Class####
## R programming skillset 6 _ essential pipeline of cross-validation (CV)
```{r}
# Step 1 -> Read data into R workstation
#### Read data from a CSV file
#### Example: Alzheimer's Disease
# filename
# RCurl is the R package to read csv file using a link
library(RCurl)
AD <- read.csv(text=getURL("https://raw.githubusercontent.com/shuailab/ind_498/master/resource/data/AD2.csv"))
# str(data)
```
```{r}
# Step 2 -> Data preprocessing
# Create your X matrix (predictors) and Y vector (outcome variable)
X <- AD[,1:15]
Y <- AD$MMSCORE
data <- data.frame(X,Y)
names(data)[16] <- c("MMSCORE")
```
```{r}
# Step 3 -> gather a list of candidate models
# Linear regression: often to compare models with different predictors
# Decision tree: often to compare models with different depth
# Random forest: often to compare models with different number of trees, depth of individual tress, and number of features to split
# Use linear regression model as an example
model1 <- "MMSCORE ~ ."
model2 <- "MMSCORE ~ AGE + PTEDUCAT + FDG + AV45 + HippoNV + rs3865444"
model3 <- "MMSCORE ~ AGE + PTEDUCAT"
model4 <- "MMSCORE ~ FDG + AV45 + HippoNV"
```
```{r}
# Step 4 -> Use 10-fold cross-validation to evaluate all the models
# First, let me use 10-fold cross-validation to evaluate the performance of model1
n_folds = 10 # number of fold (the parameter K as we say, K-fold cross validation)
N <- dim(data)[1] # the sample size, N, of the dataset
folds_i <- sample(rep(1:n_folds, length.out = N)) # This randomly creates a labeling vector (1 X N) for the N samples. For example, here, N = 16, and I run this function and it returns the value as 5 4 4 10 6 7 6 8 3 2 1 5 3 9 2 1. That means, the first sample is allocated to the 5th fold, the 2nd and 3rd samples are allocated to the 4th fold, etc.
cv_mse <- NULL # cv_mse aims to make records of the mean squared error (MSE) for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train <- data[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test <- data[test_i, ] # This is the testing data, from the ith fold
lm.AD <- lm(model1, data = data.train) # Fit the linear model with the training data
y_hat <- predict(lm.AD, data.test) # Predict on the testing data using the trained model
true_y <- data.test$MMSCORE # get the true y values for the testing data
cv_mse[k] <- mean((true_y - y_hat)^2) # mean((true_y - y_hat)^2): mean squared error (MSE). The small this error, the better your model is
}
mean(cv_mse)
# Then, evaluate model2
cv_mse <- NULL # cv_mse aims to make records of the mean squared error (MSE) for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train <- data[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test <- data[test_i, ] # This is the testing data, from the ith fold
lm.AD <- lm(model2, data = data.train) # Fit the linear model with the training data
y_hat <- predict(lm.AD, data.test) # Predict on the testing data using the trained model
true_y <- data.test$MMSCORE # get the true y values for the testing data
cv_mse[k] <- mean((true_y - y_hat)^2) # mean((true_y - y_hat)^2): mean squared error (MSE). The small this error, the better your model is
}
mean(cv_mse)
# Then, evaluate model3
cv_mse <- NULL # cv_mse aims to make records of the mean squared error (MSE) for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train <- data[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test <- data[test_i, ] # This is the testing data, from the ith fold
lm.AD <- lm(model3, data = data.train) # Fit the linear model with the training data
y_hat <- predict(lm.AD, data.test) # Predict on the testing data using the trained model
true_y <- data.test$MMSCORE # get the true y values for the testing data
cv_mse[k] <- mean((true_y - y_hat)^2) # mean((true_y - y_hat)^2): mean squared error (MSE). The small this error, the better your model is
}
mean(cv_mse)
# Then, evaluate model4
cv_mse <- NULL # cv_mse aims to make records of the mean squared error (MSE) for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train <- data[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test <- data[test_i, ] # This is the testing data, from the ith fold
lm.AD <- lm(model4, data = data.train) # Fit the linear model with the training data
y_hat <- predict(lm.AD, data.test) # Predict on the testing data using the trained model
true_y <- data.test$MMSCORE # get the true y values for the testing data
cv_mse[k] <- mean((true_y - y_hat)^2) # mean((true_y - y_hat)^2): mean squared error (MSE). The small this error, the better your model is
}
mean(cv_mse)
# Thus, we conclude that, the model2, as it achieves minimum mse by the 10-fold cross-validation, is the best model
```
```{r}
# Extension
# The above pipeline with 4 steps could be also applied on other models. Here, let's consider another application on decision tree model.
# Use 10-fold cross-validation to select the best tree size.
# We can directly use the code for 10-fold cross-validation of the linear regression model. Only need to replace the lm.AD <- lm(MMSCORE ~ ., data = data.train) with tree.MMSCORE <- rpart( MMSCORE ~ ., data = data.train, minbucket = 1)
# Note that, as we did in linear regression model, we need to specify a list of candidate models (usually from simple to complex), then use 10-fold cross-validation to evaluate each of them. Then we pick up the best model with minimum prediciton error. Here, as the complexity of a decision tree is largely determined by the depth (also directly related to the minimum # of samples in the leaf nodes), we can evaluate a list of candidate models by trying a tree with minbucket = 1, a tree with minbucket = 10, a tree with minbucket = 20, and a tree with minbucket = 30 (this model only has one root node, thus, it is the simplest model without any split)
require(rpart)
require(rpart.plot)
# First, let me use 10-fold cross-validation to evaluate the performance of the full model
n_folds = 10 # number of fold (the parameter K as we say, K-fold cross validation)
N <- dim(data)[1] # the sample size, N, of the dataset
folds_i <- sample(rep(1:n_folds, length.out = N)) # This randomly creates a labeling vector (1 X N) for the N samples. For example, here, N = 16, and I run this function and it returns the value as 5 4 4 10 6 7 6 8 3 2 1 5 3 9 2 1. That means, the first sample is allocated to the 5th fold, the 2nd and 3rd samples are allocated to the 4th fold, etc.
cv_mse <- NULL # cv_mse aims to make records of the mean squared error (MSE) for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train <- data[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test <- data[test_i, ] # This is the testing data, from the ith fold
tree.AD <- tree.MMSCORE <- rpart(MMSCORE ~ ., data = data.train, minbucket = 1) # Fit the linear model with the training data
y_hat <- predict(tree.AD, data.test) # Predict on the testing data using the trained model
true_y <- data.test$MMSCORE # get the true y values for the testing data
cv_mse[k] <- mean((true_y - y_hat)^2) # mean((true_y - y_hat)^2): mean squared error (MSE). The small this error, the better your model is
}
mean(cv_mse)
cv_mse <- NULL # cv_mse aims to make records of the mean squared error (MSE) for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train <- data[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test <- data[test_i, ] # This is the testing data, from the ith fold
tree.AD <- tree.MMSCORE <- rpart(MMSCORE ~ ., data = data.train, minbucket = 10) # Fit the linear model with the training data
y_hat <- predict(tree.AD, data.test) # Predict on the testing data using the trained model
true_y <- data.test$MMSCORE # get the true y values for the testing data
cv_mse[k] <- mean((true_y - y_hat)^2) # mean((true_y - y_hat)^2): mean squared error (MSE). The small this error, the better your model is
}
mean(cv_mse)
cv_mse <- NULL # cv_mse aims to make records of the mean squared error (MSE) for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train <- data[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test <- data[test_i, ] # This is the testing data, from the ith fold
tree.AD <- tree.MMSCORE <- rpart(MMSCORE ~ ., data = data.train, minbucket = 20) # Fit the linear model with the training data
y_hat <- predict(tree.AD, data.test) # Predict on the testing data using the trained model
true_y <- data.test$MMSCORE # get the true y values for the testing data
cv_mse[k] <- mean((true_y - y_hat)^2) # mean((true_y - y_hat)^2): mean squared error (MSE). The small this error, the better your model is
}
mean(cv_mse)
cv_mse <- NULL # cv_mse aims to make records of the mean squared error (MSE) for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train <- data[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test <- data[test_i, ] # This is the testing data, from the ith fold
tree.AD <- tree.MMSCORE <- rpart(MMSCORE ~ ., data = data.train, minbucket = 30) # Fit the linear model with the training data
y_hat <- predict(tree.AD, data.test) # Predict on the testing data using the trained model
true_y <- data.test$MMSCORE # get the true y values for the testing data
cv_mse[k] <- mean((true_y - y_hat)^2) # mean((true_y - y_hat)^2): mean squared error (MSE). The small this error, the better your model is
}
mean(cv_mse)
# Thus, we conclude that, the minbucket = 20, as it achieves minimum mse by the 10-fold cross-validation, is the best model
```