library(devtools)

library(dplyr)

library(randomForest)

library(ggplot2)

# Fetch data from the UCI Machine Learning Repository

url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Missing horsepower values are recorded as "?" in the raw file
mpg <- read.table(url, stringsAsFactors = FALSE, na.strings = "?")
names(mpg) <- c("mpg", "cyl", "disp", "hp", "weight", "accel",
                "year", "origin", "name")

# Quick sanity checks on the freshly loaded data
head(mpg)
dim(mpg)
summary(mpg)
sapply(mpg, class)

# hp arrives as character (because of the "?" markers); year and
# origin are categorical, so encode them as factors
mpg <- mpg %>%
  mutate(hp = as.numeric(hp),
         year = as.factor(year),
         origin = as.factor(origin))

head(mpg)

# Function to divide data into training and test sets

# Split the row indices of a data set into training and test sets.
#
# data:     a data frame (or anything nrow() works on) to split
# pctTrain: fraction of rows assigned to the training set,
#           strictly between 0 and 1 (default 0.7)
#
# Returns a list with integer index vectors $train and $test.
#
# Fixes: the original default `data = data` was a self-referential
# promise (an error if the argument were ever omitted), and a
# fractional sample size was silently truncated by sample().
index <- function(data, pctTrain = 0.7) {
  stopifnot(is.numeric(pctTrain), pctTrain > 0, pctTrain < 1)
  N <- nrow(data)
  # floor() makes the training-set size an explicit whole number
  train <- sample(N, floor(pctTrain * N))
  test <- setdiff(seq_len(N), train)
  list(train = train, test = test)
}

# Reproducible 80/20 train/test split
set.seed(123)
ind <- index(mpg, 0.8)

length(ind$train)
length(ind$test)

# Model formula: mpg as a function of the seven predictors
form <- mpg ~ cyl + disp + hp + weight + accel + year + origin

# Fit the random forest on the complete (NA-free) training rows
rf_fit <- randomForest(formula = form, data = na.omit(mpg[ind$train, ]),
                       keep.inbag = TRUE)

# Plot the out-of-bag error as the number of trees increases
plot(rf_fit)

# Number of predictors, derived from the model formula rather than
# hard-coded, so it stays correct if the formula changes
k <- length(labels(terms(form)))

# Out-of-bag MSE of the full forest. The original hard-coded value
# (7.477594) was copied by hand from one particular run's output and
# would be silently wrong for any other seed or sample.
MSE <- tail(rf_fit$mse, 1)

######################### Confidence intervals #########################
# Approximate 95% CIs for the fitted values, using the classical
# linear-model standard error sqrt(MSE * x_i' (X'X)^{-1} x_i).

# Compute the NA-free training rows once instead of three times
train_data <- na.omit(mpg[ind$train, ])
n <- nrow(train_data)

# Predictor matrix: convert every column (including the factors year
# and origin) to the numeric value of its label, as the original
# apply/as.numeric conversion did
X <- vapply(train_data[, 2:8],
            function(col) as.numeric(as.character(col)),
            numeric(n))

XtXi <- solve(crossprod(X)) # (X'X)^{-1}

df <- data.frame(X,
                 actual = train_data$mpg,
                 predicted = rf_fit$predicted)

# Vectorized x_i' (X'X)^{-1} x_i for every row — the diagonal of
# X XtXi X' — replacing the original loop that grew df one element at
# a time and recomputed the same quadratic form and qt() twice per row
se <- sqrt(MSE * rowSums((X %*% XtXi) * X))
t_crit <- qt(p = 0.975, df = n - k - 1)

df$lower_limit <- df$predicted - t_crit * se
df$upper_limit <- df$predicted + t_crit * se

# Observed vs fitted MPG, each point carrying its 95% confidence bar;
# the dashed identity line marks perfect prediction
p11 <- ggplot(df, aes(x = actual, y = predicted))

p11 +
  geom_errorbar(aes(ymin = lower_limit, ymax = upper_limit), width = 0.1) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = 2) +
  labs(x = "Observed MPG",
       y = "Fitted MPG",
       title = "Confidence Intervals for Random Forests")