```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
####Analytics Class####
## R programming skillset _ essential pipeline of Support Vector Machine (SVM)
```{r}
# Step 1 -> Read data into R workstation
#### Read data from a CSV file
#### Example: Alzheimer's Disease
# filename
# RCurl is the R package to read csv file using a link
library(RCurl)
data <- read.csv(text=getURL("https://raw.githubusercontent.com/shuailab/ind_498/master/resource/data/AD.csv"))
# str(data)
```
```{r}
# Step 2 -> Data preprocessing
# Create your X matrix (predictors) and Y vector (outcome variable)
X <- data[,2:16]
Y <- data$DX_bl
# Make sure the outcome variable is legitimate. If it is a continuous variable (regression problem), it should be defined as a "num" variable in R. If it is a binary or a more genernal categorical variable (classification problem), it should be defined as a "factor" variable in R.
# Here, we focus on the binary outcome "DX_bl" (two classes: normal, diseases). The following code makes sure the variable "DX_bl" is a "factor".
Y <- paste0("c", Y) # This line is to "factorize" the variable "DX_bl". It denotes "0" as "c0" and "1" as "c1", to highlight the fact that "DX_bl" is a factor variable, not a numerical variable
Y <- as.factor(Y) # as.factor is to convert any variable into the format as "factor" variable.
# Then, we integrate everything into a data frame
data <- data.frame(X,Y)
names(data)[16] = c("DX_bl")
# Create a training data (half the original data size)
train.ix <- sample(nrow(data),floor( nrow(data)/2) )
data.train <- data[train.ix,]
# Create a testing data (half the original data size)
data.test <- data[-train.ix,]
```
```{r}
# Step 3 -> gather a list of candidate models
# SVM: often to compare models with different kernels, different values of C, different set of variables
# Use different set of variables
model1 <- as.formula(DX_bl ~ .)
model2 <- as.formula(DX_bl ~ AGE + PTEDUCAT + FDG + AV45 + HippoNV + rs3865444)
model3 <- as.formula(DX_bl ~ AGE + PTEDUCAT)
model4 <- as.formula(DX_bl ~ FDG + AV45 + HippoNV)
```
```{r}
# Step 4 -> Use 10-fold cross-validation to evaluate all the models
# First, let me use 10-fold cross-validation to evaluate the performance of model1
n_folds = 10 # number of fold (the parameter K as we say, K-fold cross validation)
N <- dim(data.train)[1] # the sample size, N, of the dataset
folds_i <- sample(rep(1:n_folds, length.out = N)) # This randomly creates a labeling vector (1 X N) for the N samples. For example, here, N = 16, and I run this function and it returns the value as 5 4 4 10 6 7 6 8 3 2 1 5 3 9 2 1. That means, the first sample is allocated to the 5th fold, the 2nd and 3rd samples are allocated to the 4th fold, etc.
cv_err <- NULL # cv_mse aims to make records of the prediction error for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train.cv <- data.train[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test.cv <- data.train[test_i, ] # This is the testing data, from the ith fold
require( 'kernlab' )
linear.svm <- ksvm(model1, data=data.train.cv, type='C-svc', kernel='vanilladot', C=10) # Fit the linear SVM model with the training data
y_hat <- predict(linear.svm, data.test.cv) # Predict on the testing data using the trained model
true_y <- data.test.cv$DX_bl # get the true y values for the testing data
cv_err[k] <-length(which(y_hat != true_y))/length(y_hat) # 1) which(pred.tree != data$DX_bl) identifies the locations of the incorrect predictions; 2)length(any vector) returns the length of that vector; 3) thus, the ratio of incorrect prediction over the total prediction is the prediction error
}
mean(cv_err)
cv_err <- NULL # cv_mse aims to make records of the prediction error for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train.cv <- data.train[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test.cv <- data.train[test_i, ] # This is the testing data, from the ith fold
require( 'kernlab' )
linear.svm <- ksvm(model2, data=data.train.cv, type='C-svc', kernel='vanilladot', C=10) # Fit the linear SVM model with the training data
y_hat <- predict(linear.svm, data.test.cv) # Predict on the testing data using the trained model
true_y <- data.test.cv$DX_bl # get the true y values for the testing data
cv_err[k] <-length(which(y_hat != true_y))/length(y_hat) # 1) which(pred.tree != data$DX_bl) identifies the locations of the incorrect predictions; 2)length(any vector) returns the length of that vector; 3) thus, the ratio of incorrect prediction over the total prediction is the prediction error
}
mean(cv_err)
cv_err <- NULL # cv_mse aims to make records of the prediction error for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train.cv <- data.train[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test.cv <- data.train[test_i, ] # This is the testing data, from the ith fold
require( 'kernlab' )
linear.svm <- ksvm(model3, data=data.train.cv, type='C-svc', kernel='vanilladot', C=10) # Fit the linear SVM model with the training data
y_hat <- predict(linear.svm, data.test.cv) # Predict on the testing data using the trained model
true_y <- data.test.cv$DX_bl # get the true y values for the testing data
cv_err[k] <-length(which(y_hat != true_y))/length(y_hat) # 1) which(pred.tree != data$DX_bl) identifies the locations of the incorrect predictions; 2)length(any vector) returns the length of that vector; 3) thus, the ratio of incorrect prediction over the total prediction is the prediction error
}
mean(cv_err)
cv_err <- NULL # cv_mse aims to make records of the prediction error for each fold
for (k in 1:n_folds) {
test_i <- which(folds_i == k) # In each iteration of the 10 iterations, remember, we use one fold of data as the testing data
data.train.cv <- data.train[-test_i, ] # Then, the remaining 9 folds' data form our training data
data.test.cv <- data.train[test_i, ] # This is the testing data, from the ith fold
require( 'kernlab' )
linear.svm <- ksvm(model4, data=data.train.cv, type='C-svc', kernel='vanilladot', C=10) # Fit the linear SVM model with the training data
y_hat <- predict(linear.svm, data.test.cv) # Predict on the testing data using the trained model
true_y <- data.test.cv$DX_bl # get the true y values for the testing data
cv_err[k] <-length(which(y_hat != true_y))/length(y_hat) # 1) which(pred.tree != data$DX_bl) identifies the locations of the incorrect predictions; 2)length(any vector) returns the length of that vector; 3) thus, the ratio of incorrect prediction over the total prediction is the prediction error
}
mean(cv_err)
```
```{r}
# Remark 1: You can also try to use the above chunk of code to select the best kernel, i.e., by specifying the argument in ksvm()
# kernel='rbfdot': Radial Basis kernel "Gaussian"
# kernel='polydot': Polynomial kernel
# kernel='vanilladot': Linear kernel
# kernel='tanhdot': Hyperbolic tangent kernel
# kernel='laplacedot': Laplacian kernel
# kernel='besseldot': Bessel kernel
# kernel='anovadot': ANOVA RBF kernel
# kernel='splinedot': Spline kernel
# kernel='stringdot': String kernel
# Remark 1: You can also try to use the above chunk of code to select the best value of C, i.e., by specifying the argument C in ksvm()
```
```{r}
# Step 5 -> After model selection, use ksvm() function to build your final model
linear.svm <- ksvm(model2, data=data.train, type='C-svc', kernel='vanilladot', C=10) # (1) The argument, kernel='vanilladot', means that we are going to build a linear SVM model; (2) C=10 is the tolerance parameter (similarly as the penalty parameter in LASSO, C in SVM is to balance two objectives - one to maximize margin, another one to reduce errors)
```
```{r}
# Step 6 -> Predict using your SVM model
y_hat <- predict(linear.svm, data.test) # a few comments: 1) predict() is a function that you can find in many R pacakges. R package developers tend to agree to write such a function predict(obj, data), where obj is the obj of the created model by that package, and data is the data points you want to predict on. 2) While in many cases this is not needed, sometimes you do need to specify the argument "type". We use the default in predict.svm, type = 'response', which means, y_hat are the classification (c0 or c1).
```
```{r}
# Step 7 -> Evaluate the prediction performance of your SVM model
# (1) Three main metrics for classification: Accuracy, Sensitivity (1- False Positive), Specificity (1 - False Negative)
library(caret) # confusionMatrix() in the package "caret" is a powerful function to summerize the prediction performance of a classification model, reporting metrics such as Accuracy, Sensitivity (1- False Positive), Specificity (1 - False Negative), to name a few.
confusionMatrix(y_hat, data.test$DX_bl)
# (2) ROC curve is another commonly reported metric for classification models
library(pROC) # pROC has the roc() function that is very useful here
y_hat <- predict(linear.svm, data.test, type = 'decision') # Here, we re-predict the y_hat with type = 'decision'. The reason is that, in order to use roc(), we need to generate the predictions as numerical values. Then, roc() can try different cut-off values to convert the numerical scaled y_hat into different sets of binary classification. Recall how ROC is defined.
plot(roc(data.test$DX_bl, y_hat),
col="green", main="ROC Curve")
```