-
Notifications
You must be signed in to change notification settings - Fork 0
/
Species_Acres_Modeling.R
174 lines (153 loc) · 6.09 KB
/
Species_Acres_Modeling.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
### Focus: Species & Acres
## Analyze species count and acres
# This dataset had to be analyzed separately the data would repeat in each row
# Thus adding more weight and significance
library(caret)
library(ggplot2)
psva_count <- read.csv("species_parks_subset.csv")
str(psva_count)
#############################################################
### Simple Analysis (Without CV)
lm.subset <- lm(Visitors ~ Acres+Species, data=psva_count)
summary(lm.subset)
# Remove acres
lm.subset2 <- lm(Visitors ~ Species, data=psva_count)
summary(lm.subset2)
# Interaction effect
lm.subset3 <- lm(Visitors ~ Acres+Species+Acres*Species, data=psva_count)
summary(lm.subset3)
# Compare AIC
AIC(lm.subset)
AIC(lm.subset2) # Best model: Species only
AIC(lm.subset3)
# Plot the species vs. visitors
plot(psva_count$Species, psva_count$Visitors,
xlab='Number of Species', ylab="Number of Visitors")
abline(lm.subset2, col="red")
#############################################################
### Advanced Analysis (With CV)
# Split the data
set.seed(1)
train.index <- sample(nrow(psva_count), nrow(psva_count) * 0.8) # 80% is training data
# Create test and training data frames
# Removed acres
NP.train <- psva_count[train.index, c(2,4)] # Model with this
NP.test <- psva_count[-train.index, c(2,4)] # We don't touch this while training
################
## Diagnostics
# Identify possible skewed variables
library(psych)
skewValues <- apply(NP.train, 2, skew)
skewSE <- sqrt(6/nrow(NP.train)) # Standard error of skewness
# Anything over 2 SEs in skew is potentially problematic
abs(skewValues)/skewSE > 2
## Visualize data distributions of variables with high skew identified above
multi.hist(NP.train[,abs(skewValues)/skewSE > 2])
# Identify correlated predictors
library(corrplot)
NP.cor <- cor(NP.train[,1:2])
NP.cor
corrplot(NP.cor, order="hclust") # Visualize and cluster by high correlation
# No correlation between acres and visitors
# Let visualize the variables and their distributions and possible outliers
boxplot(x = as.list(NP.train)) # Limit has an extremely large range and outliers that could influence things
# Lets remove limit to see the Species distribution better
boxplot(x = as.list(NP.train[1]))
################
## Modeling
# K-fold CV (k = 10)
ctrl <- trainControl(method="cv", number=10)
# Train the data
set.seed(1)
np.lm <- train(Visitors ~ ., data=NP.train, method = "lm", trControl=ctrl)
summary(np.lm)
# Diagnostic plots
par(mfrow=c(2,1))
plot(NP.train$Visitors ~ predict(np.lm), xlab="Predict", ylab="Actual",
main="Actual vs. Predicted")
plot(resid(np.lm) ~ predict(np.lm), xlab="Predict", ylab="Residuals",
main="Predicted Residuals vs. Predicted Values")
# Review what variables were most important
varImp(np.lm) # Species is important
# Lets fit a model that preprocess predictors on each resample fold based on our earlier findings
set.seed(1)
np.lm.pp <- train(Visitors ~ ., data=NP.train,
preProcess=c("BoxCox", "scale", "center"),
method = "lm", trControl=ctrl)
summary(np.lm.pp)
# Let create a model removing correlated predictors
set.seed(1)
# Cutoff correlation at >0.75
highlyCorDescr <- findCorrelation(NP.cor, cutoff=0.75)
highlyCorDescr # No correlation
#############################################################
## kNN
# Set values of k to search through, K 1 to 10
k.grid <- expand.grid(k=1:10)
set.seed(1)
np.knn.pp <- train(Visitors ~ ., data=NP.train, method = "knn",
preProcess=c("BoxCox", "center", "scale"),
tuneGrid=k.grid, trControl=ctrl)
np.knn.pp # k=9
plot(np.knn.pp)
#############################################################
## GAM Smoothing Spline
gamGrid <- expand.grid(df = c(1:10))
set.seed(1)
np.ss.pp <- train(Visitors ~ ., data=NP.train, method="gamSpline", trControl=ctrl,
preProcess=c("BoxCox", "scale", "center"),
tuneGrid=gamGrid)
np.ss.pp # df=2
# Plot the RMSE over df
plot(np.ss.pp)
#############################################################
### COMPARE MODEL PERFORMANCE
# Let's compare all of the model resampling performance
# First lets put all trained models in a list object
c.models<- list("LM Base"=np.lm, "LM PreProc"=np.lm.pp,
"kNN PreProc"=np.knn.pp, "GAM Smoothing Spline"=np.ss.pp)
NP.resamples <- resamples(c.models)
summary(NP.resamples)
# Plot training RMSE performance
bwplot(NP.resamples, metric="RMSE")
# Plot training R^2 performance
bwplot(NP.resamples, metric="Rsquared")
# Evaluate differences in model and see if their are statistical significant differnces
NP.diff <- diff(NP.resamples)
summary(NP.diff) # Lower diagonal is p-value of differences
### TEST DATA EVALUATION
# Creating function called testPerformance to run against each model
# To measure test performance
testPerformance <- function(model) {
postResample(predict(model, NP.test), NP.test$Visitors)
}
# Apply the test performance function against each of the models in the list
test.mod <- lapply(c.models, testPerformance)
# Variable setup
model_test <- c("LM Base", "LM PreProc", "kNN PreProc", "GAM SS")
RMSE_test <- c(test.mod$`LM Base`[[1]], test.mod$`LM PreProc`[[1]],
test.mod$`kNN PreProc`[[1]],
test.mod$`GAM Smoothing Spline`[[1]])
R2_test <- c(test.mod$`LM Base`[[2]], test.mod$`LM PreProc`[[2]],
test.mod$`kNN PreProc`[[2]],
test.mod$`GAM Smoothing Spline`[[2]])
# Plot training RMSE performance
ggplot(data=data.frame(model_test, RMSE_test),
aes(x=model_test, y=RMSE_test, fill=model_test)) +
geom_bar(stat="identity", colour="black") +
labs(y="RMSE", x="Models") +
labs(fill = "Models") +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.title.x=element_blank()) +
scale_fill_brewer(palette="OrRd")
# Plot training R^2 performance
ggplot(data=data.frame(model_test, R2_test),
aes(x=model_test, y=R2_test, fill=model_test)) +
geom_bar(stat="identity", colour="black") +
labs(y="R^2", x="Models") +
labs(fill = "Models") +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.title.x=element_blank()) +
scale_fill_brewer(palette="PuBuGn")