```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ####Analytics Class#### ####Chapter 9#### ```{r} # Write a nice simulator to generate dataset with one predictor and one outcome # from a polynomial regression model require(splines) seed <- rnorm(1) set.seed(seed) gen_data <- function(n, coef, v_noise) { eps <- rnorm(n, 0, v_noise) x <- sort(runif(n, 0, 100)) X <- cbind(1,ns(x, df = (length(coef) - 1))) y <- as.numeric(X %*% coef + eps) return(data.frame(x = x, y = y)) } ``` ```{r} n_train <- 30 coef <- c(1,0.5) v_noise <- 3 tempData <- gen_data(n_train, coef, v_noise) tempData[31,] = c(200,200) # Fit the data using linear regression model x <- tempData[, "x"] y <- tempData[, "y"] fit <- lm(y~x,data=tempData) # Plot the data x <- tempData$x X <- cbind(1, x) y <- tempData$y plot(y ~ x, col = "gray", lwd = 2) lines(x, X %*% coef, lwd = 3, col = "black") lines(x, fitted(fit), lwd = 3, col = "darkorange") legend(x = "topleft", legend = c("True function", "Fitted linear model"), lwd = rep(4, 4), col = c("black", "darkorange"), text.width = 100, cex = 1.5) ``` ```{r} # Simulate one batch of data n_train <- 100 coef <- c(-0.68,0.82,-0.417,0.32,-0.68) v_noise <- 0.2 n_df <- 20 df <- 1:n_df tempData <- gen_data(n_train, coef, v_noise) # Fit different KNN models x <- tempData$x X <- cbind(1, ns(x, df = (length(coef) - 1))) y <- tempData$y # install.packages("FNN") require(FNN) xy.knn3<- knn.reg(train = x, y = y, k=3) xy.knn10<- knn.reg(train = x, y = y, k=10) xy.knn50<- knn.reg(train = x, y = y, k=50) # Plot the data plot(y ~ x, col = "gray", lwd = 2) lines(x, X %*% coef, lwd = 3, col = "black") lines(x, xy.knn3$pred, lwd = 3, col = "darkorange") lines(x, xy.knn10$pred, lwd = 3, col = "dodgerblue4") lines(x, xy.knn50$pred, lwd = 3, col = "forestgreen") legend(x = "topleft", legend = c("True function", "KNN (k = 3)", "KNN (k = 10)", "KNN (k = 50)"), lwd = rep(3, 4), col = c("black", "darkorange", "dodgerblue4", "forestgreen"), text.width = 32, cex = 0.85) ``` ```{r} # Repeat the above experiments with kernel smoother # Plot the data plot(y ~ x, col = "gray", lwd = 2) lines(x, X %*% coef, lwd = 3, col = "black") lines(ksmooth(x,y, "normal", bandwidth=2),lwd = 3, col = "darkorange") lines(ksmooth(x,y, "normal", bandwidth=5),lwd = 3, col = "dodgerblue4") lines(ksmooth(x,y, "normal", bandwidth=15),lwd = 3, col = "forestgreen") legend(x = "topright", legend = c("True function", "Kernel Reg (bw = 2)", "Kernel Reg (bw = 5)", "Kernel Reg (bw = 15)"), lwd = rep(3, 4), col = c("black", "darkorange", "dodgerblue4", "forestgreen"), text.width = 32, cex = 0.85) ``` ```{r} # Write a simulator to generate dataset with one predictor and one outcome # from a linear regression model with an outlier seed <- rnorm(1) set.seed(seed) gen_data <- function(n, coef, v_noise) { eps <- rnorm(n, 0, v_noise) x <- sort(runif(n, 0, 100)) X <- cbind(1,x) y <- as.numeric(X %*% coef + eps) return(data.frame(x = x, y = y)) } ``` ```{r} n_train <- 30 coef <- c(1,0.5) v_noise <- 3 tempData <- gen_data(n_train, coef, v_noise) tempData[31,] = c(200,200) # Fit the data using linear regression model x <- tempData[, "x"] y <- tempData[, "y"] fit <- lm(y~x,data=tempData) # Plot the data x <- tempData$x X <- cbind(1, x) y <- tempData$y plot(y ~ x, col = "gray", lwd = 2) lines(x, X %*% coef, lwd = 3, col = "black") lines(x, fitted(fit), lwd = 3, col = "darkorange") legend(x = "topleft", legend = c("True function", "Fitted linear model"), lwd = rep(4, 4), col = c("black", "darkorange"), text.width = 120, cex = 1.5) ``` ```{r} # Conditional variance function # Simulate a regression model with heterogeneous variance gen_data <- function(n, coef, v_noise) { x <- rnorm(100,0,2) eps <- rnorm(100,0,sapply(x,function(x){0.5+0.8*x^2})) X <- cbind(1,x) y <- as.numeric(X %*% coef + eps) return(data.frame(x = x, y = y)) } n_train <- 100 coef <- c(1,0.5) v_noise <- 2.5 tempData <- gen_data(n_train, coef, v_noise) ``` ```{r} # Fit the data using linear regression model (OLS) x <- tempData[, "x"] y <- tempData[, "y"] fit.ols <- lm(y~x,data=tempData) # Plot the data and the models x <- tempData$x X <- cbind(1, x) y <- tempData$y plot(y ~ x, col = "gray", lwd = 2) # Plot the true model lines(x, X %*% coef, lwd = 3, col = "black") # Plot the linear regression model (OLS) lines(x, fitted(fit.ols), lwd = 3, col = "darkorange") legend(x = "topleft", legend = c("True function", "Linear model (OLS)"), lwd = rep(4, 4), col = c("black", "darkorange"), text.width = 4, cex = 1) ``` ```{r} # Plot the residual estimated from the linear regression model (OLS) plot(x,residuals(fit.ols)^2,ylab="squared residuals",col = "gray", lwd = 2) # Plot the true model underlying the variance of the error term curve((1+0.8*x^2)^2,col = "black", lwd = 3, add=TRUE) # Fit a nonlinear regression model for residuals # install.packages("np") require(np) var1 <- npreg(residuals(fit.ols)^2 ~ x) grid.x <- seq(from=min(x),to=max(x),length.out=300) lines(grid.x,predict(var1,exdat=grid.x), lwd = 3, col = "darkorange") legend(x = "topleft", legend = c("True function", "Fitted nonlinear model (1st iter)"), lwd = rep(4, 4), col = c("black", "darkorange"), text.width = 5, cex = 1.2) ``` ```{r} # Fit a linear regression model (WLS) with the weights specified # by the fitted nonlinear model of the residuals fit.wls <- lm(y~x,weights=1/fitted(var1)) plot(y ~ x, col = "gray", lwd = 2,ylim = c(-20,20)) # Plot the true model lines(x, X %*% coef, lwd = 3, col = "black") # Plot the linear regression model (OLS) lines(x, fitted(fit.ols), lwd = 3, col = "darkorange") # Plot the linear regression model (WLS) with estimated variance function lines(x, fitted(fit.wls), lwd = 3, col = "forestgreen") legend(x = "topleft", legend = c("True function", "Linear (OLS)", "Linear (WLS) + estimated variance"), lwd = rep(4, 4), col = c("black", "darkorange","forestgreen"), text.width = 5, cex = 1) ``` ```{r} # Plot the residual estimated from the linear regression model (OLS) plot(x,residuals(fit.ols)^2,ylab="squared residuals",col = "gray", lwd = 2) # Plot the true model underlying the variance of the error term curve((1+0.8*x^2)^2,col = "black", lwd = 3, add=TRUE) # Fit a nonlinear regression model for residuals # install.packages("np") require(np) var2 <- npreg(residuals(fit.wls)^2 ~ x) grid.x <- seq(from=min(x),to=max(x),length.out=300) lines(grid.x,predict(var1,exdat=grid.x), lwd = 3, col = "darkorange") lines(grid.x,predict(var2,exdat=grid.x), lwd = 3, col = "forestgreen") legend(x = "topleft", legend = c("True function", "Fitted nonlinear model (1st iter)", "Fitted nonlinear model (2nd iter)"), lwd = rep(4, 4), col = c("black", "darkorange", "forestgreen"), text.width = 6, cex = 1.2) ``` ```{r} #### Dataset of Alzheimer's Disease #### Objective: prediction of MMSCORE # filename # setwd("C:/Users/shuai/Google Drive/Shuai/Paper/Working/2017 Analytics Book/analytics book/data/ADNI") library(RCurl) AD <- read.csv(text=getURL("https://raw.githubusercontent.com/shuailab/ind_498/master/resource/data/AD.csv")) str(AD) # Fit the data using linear regression model (OLS) x <- AD$HippoNV y <- AD$MMSCORE fit.ols <- lm(y~x,data=AD) ``` ```{r} # Fit a linear regression model (WLS) with the weights specified # by the fitted nonlinear model of the residuals var1 <- npreg(residuals(fit.ols)^2 ~ HippoNV, data = AD) fit.wls <- lm(y~x,weights=1/fitted(var1)) plot(y ~ x, col = "gray", lwd = 2) # Plot the linear regression model (OLS) lines(x, fitted(fit.ols), lwd = 3, col = "darkorange") # Plot the linear regression model (WLS) with estimated variance function lines(x, fitted(fit.wls), lwd = 3, col = "forestgreen") legend(x = "topleft", legend = c("Linear (OLS)", "Linear (WLS) + estimated variance"), lwd = rep(4, 4), col = c("darkorange","forestgreen"), text.width = 0.2, cex = 1) ``` ```{r} # Plot the residual estimated from the linear regression model (OLS) plot(x,residuals(fit.ols)^2,ylab="squared residuals",col = "gray", lwd = 2) # Fit a nonlinear regression model for residuals # install.packages("np") require(np) var2 <- npreg(residuals(fit.wls)^2 ~ x) grid.x <- seq(from=min(x),to=max(x),length.out=300) lines(grid.x,predict(var1,exdat=grid.x), lwd = 3, col = "darkorange") lines(grid.x,predict(var2,exdat=grid.x), lwd = 3, col = "forestgreen") legend(x = "topleft", legend = c("Fitted nonlinear model (1st iter)", "Fitted nonlinear model (2nd iter)"), lwd = rep(4, 4), col = c( "darkorange", "forestgreen"), text.width = 0.25, cex = 1.2) ```