```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
####Analytics Class####
## R programming skillset 3 _ essential pipeline of logistic regression analysis
```{r}
# Step 1 -> Read data into R workstation
#### Read data from a CSV file
#### Example: Alzheimer's Disease
# filename
# RCurl is the R package to read csv file using a link
library(RCurl)
data <- read.csv(text=getURL("https://raw.githubusercontent.com/shuailab/ind_498/master/resource/data/AD.csv"))
str(data)
```
```{r}
# Step 2 -> Data preprocessing
# Create your X matrix (predictors) and Y vector (outcome variable)
X <- data[,2:16]
Y <- data$DX_bl
# Make sure the outcome variable is legitimate. If it is a continuous variable (regression problem), it should be defined as a "num" variable in R. If it is a binary or a more genernal categorical variable (classification problem), it should be defined as a "factor" variable in R.
# Here, we focus on the binary outcome "DX_bl" (two classes: normal, diseases). The following code makes sure the variable "DX_bl" is a "factor".
Y <- paste0("c", Y) # This line is to "factorize" the variable "DX_bl". It denotes "0" as "c0" and "1" as "c1", to highlight the fact that "DX_bl" is a factor variable, not a numerical variable
Y <- as.factor(Y) # as.factor is to convert any variable into the format as "factor" variable.
# Then, we integrate everything into a data frame
data <- data.frame(X,Y)
names(data)[16] = c("DX_bl")
# Create a training data (half the original data size)
train.ix <- sample(nrow(data),floor( nrow(data)/2) )
data.train <- data[train.ix,]
# Create a testing data (half the original data size)
data.test <- data[-train.ix,]
```
```{r}
# Step 3 -> Use glm() function to build a full model with all predictors
logit.AD.full <- glm(DX_bl ~ ., data = data.train, family = "binomial")
summary(logit.AD.full)
# Important knowledge point: 1) The use of glm() function, including the correct specification of the regression formula (e.g., DX_bl ~ .), arguments used in the glm() (help(glm)) such as family = "binomial" means that we use glm() for logistic regression model; 2) interpretation of the results, including the significant predictors, p-values, t-tests, and R-squares, etc.
```
```{r}
# Step 4 -> use step() to automatically delete all the insignificant variables
# Automatic model selection
logit.AD.reduced <- step(logit.AD.full, direction="both", trace = 0) # 1) direction="backward" was used in the example of linear regression model; here, we use direction="both", which means we start with the full model, then sequentially both remove insignificant variables and also recruit new variables (from the ones that have been removed previously - why? remember that a variable is significant depends on what are other variables on the model already). 2) trace = 0 is to disable the presentation of showing all the models that have been evaluated along the process. If you want to see the process, simply set trace = 1 or not specify this argument (by default, trace = 1 in the function step()..)
anova(logit.AD.reduced,logit.AD.full,test = "LRT") # anova(model1,model2) is to compare if the two models, model1 and model2, are statistically different. If not statistically different, we prefer the simpler model. The argument, test = "LRT", means that a p-value will be reported via the Likelihood Ratio Test (LRT). In this example, p-value is 0.7794, indicates that the two models are not statistically different - which means the reduced model does the same good job as the full model
summary(logit.AD.reduced)
```
```{r}
# Step 5 -> test the significance of the logistic model
# Test residual deviance for lack-of-fit (if > 0.10, little-to-no lack-of-fit)
dev.p.val <- 1 - pchisq(logit.AD.reduced$deviance, logit.AD.reduced$df.residual)
# 1) The argument logit.AD.reduced$deviance is a similar idea as residual in regression model;
# 2) logit.AD.reduced$df.residual is the degree of freedom (df) of this deviance.
# 3) Providing these two arguments to pchisq (chi-square distribution), which calculates the left hand side of the area under the curve. Thus, 1 - pchisq gives you the right hand side of the area under the curve, therefore, the p-value.
dev.p.val
```
```{r}
# Step 6 -> Predict using your logistic regression model
y_hat <- predict(logit.AD.reduced, data.test) # a few comments: 1) predict() is a function that you can find in many R packages. R package developers usually write such a function predict(obj, data), where obj is the obj of the created model by that package, and data is the data points you want to predict on. 2) While in many cases this is not needed, sometimes you do need to specify the argument "type". Here, type = c("link", "response", "terms"). We use the default, type = "link", which means, y_hat are the values from the linear equation part of the logistic regression model. Indeed, in this way, y_hat are the intermediate values. Supposely, in this option, 0 is a cut-off value (only by default, not optimal though), i.e., if y_hat < 0, we name it as one class, and if y_hat > 0, it is another class.
```
```{r}
# Step 7 -> Evaluate the prediction performance of your logistic regression model
# (1) Three main metrics for classification: Accuracy, Sensitivity (1- False Positive), Specificity (1 - False Negative)
y_hat2 <- y_hat
y_hat2[which(y_hat > 0)] = "c1" # Since y_hat here is the values from the linear equation part of the logistic regression model, by default, we should use 0 as a cut-off value (only by default, not optimal though), i.e., if y_hat < 0, we name it as one class, and if y_hat > 0, it is another class.
y_hat2[which(y_hat < 0)] = "c0"
library(caret) # confusionMatrix() in the package "caret" is a powerful function to summerize the prediction performance of a classification model, reporting metrics such as Accuracy, Sensitivity (1- False Positive), Specificity (1 - False Negative), to name a few.
library(e1071)
confusionMatrix(table(y_hat2, data.test$DX_bl))
# (2) ROC curve is another commonly reported metric for classification models
library(pROC) # pROC has the roc() function that is very useful here
plot(roc(data.test$DX_bl, y_hat),
col="green", main="ROC Curve")
```
```{r}
# Remark: how to obtain the 95% CI of the predictions
# predict() uses all the temp values in dataset, including appended values
y_hat <- predict(logit.AD.reduced, data.test, type = "link", se.fit = TRUE)
data.test$fit <- y_hat$fit
data.test$se.fit <- y_hat$se.fit
# We can readily convert these information into the 95\% CIs of the predictions (the way these 95\% CIs are derived are again, only in approximated sense).
# CI for fitted values
data.test <- within(data.test, {
# added "fitted" to make predictions at appended temp values
fitted = exp(fit) / (1 + exp(fit))
fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})
```
```{r}
# Remark: draw the prediction figure
logit.AD.FDG <- glm(DX_bl ~ FDG, data = data.train, family = "binomial")
y_hat <- predict(logit.AD.FDG, data.test, type = "link", se.fit = TRUE)
data.test$fit <- y_hat$fit
data.test$se.fit <- y_hat$se.fit
# We can readily convert these information into the 95\% CIs of the predictions (the way these 95\% CIs are derived are again, only in approximated sense).
# CI for fitted values
data.test <- within(data.test, {
# added "fitted" to make predictions at appended temp values
fitted = exp(fit) / (1 + exp(fit))
fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})
library(ggplot2)
newData <- data.test[order(data.test$FDG),]
newData$DX_bl = as.numeric(newData$DX_bl)
newData$DX_bl[which(newData$DX_bl==1)] = 0
newData$DX_bl[which(newData$DX_bl==2)] = 1
newData$DX_bl = as.numeric(newData$DX_bl)
p <- ggplot(newData, aes(x = FDG, y = DX_bl))
# predicted curve and point-wise 95\% CI
p <- p + geom_ribbon(aes(x = FDG, ymin = fit.lower, ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = FDG, y = fitted), colour="red")
# fitted values
p <- p + geom_point(aes(y = fitted), size=2, colour="red")
# observed values
p <- p + geom_point(size = 2)
p <- p + ylab("Probability")
p <- p + labs(title = "Observed and predicted probability of disease")
print(p)
require(reshape2)
data.train$ID = c(1:dim(data.train)[1])
AD.long <- melt(data.train[,c(1,3,4,5,6,16,17)], id.vars = c("ID", "DX_bl"))
# Plot the data using ggplot
require(ggplot2)
p <- ggplot(AD.long, aes(x = factor(DX_bl), y = value))
# boxplot, size=.75 to stand out behind CI
p <- p + geom_boxplot(size = 0.75, alpha = 0.5)
# points for observed data
p <- p + geom_point(position = position_jitter(w = 0.05, h = 0), alpha = 0.1)
# diamond at mean for each group
p <- p + stat_summary(fun.y = mean, geom = "point", shape = 18, size = 6,
alpha = 0.75, colour = "red")
# confidence limits based on normal distribution
p <- p + stat_summary(fun.data = "mean_cl_normal", geom = "errorbar",
width = .2, alpha = 0.8)
p <- p + facet_wrap( ~ variable, scales = "free_y", ncol = 3)
p <- p + labs(title = "Boxplots of variables by diagnosis (0 - normal; 1 - patient)")
print(p)
tempData = cbind(data.test$fit,data.test$DX_bl)
require(ggplot2)
qplot(factor(data.test$DX_bl), data.test$fit, data = data.test,
geom=c("boxplot"), fill = factor(data.test$DX_bl))
```