-
Notifications
You must be signed in to change notification settings - Fork 0
/
sdm_demo.R
325 lines (248 loc) · 11.3 KB
/
sdm_demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# This script demonstrates some methods of using R in SDM, and attempts to teach
# SDM concepts through demonstration this is more or less the first draft of the
# RMarkdown File in this repo
# For downloading occurrence data from multiple sources
library(spocc)
# For data manipulation and presentation
library(tidyr)
library(dplyr)
library(ggplot2)
# Provides various functions relating to spatial analysis
library(maptools)
library(sp)
# Reads and manipulates vector data
library(rgdal)
# Reads and manipulates raster data
library(raster)
# SDM packages
library(dismo)
library(sdm)
# Let's ready some political boundaries in for visualization
usa.shp <- readOGR("USA")
CA_OR.shp <- usa.shp[usa.shp@data$NAME %in% c("California", "Oregon"),]
# Source occurrence data from iNaturalist
dc <- occ(query = "Darlingtonia californica", from = 'inat')
# Could just as easily source from gbif:
# dc <- occ(query = "Darlingtonia californica", from = 'gbif')
# Horrible nasty function that makes species occurrence data into lon, lat dataframe
dc.df <- as.data.frame(dc$inat$data$Darlingtonia_californica%>%
dplyr::filter(identifications_most_agree == TRUE) %>%
dplyr::select(location)) %>%
filter(!is.na(location)) %>%
separate(location, c("lat", "lon"), ",") %>%
select(lon, lat)
# Lat and Lon columns must be specified as numeric values
dc.df$lon <- as.numeric(dc.df$lon)
dc.df$lat <- as.numeric(dc.df$lat)
# Let's take a look at our raw occurrence points
sp::plot(CA_OR.shp, main = "Darlingtonia californica (iNaturalist)")
points(dc.df$lon, dc.df$lat)
# Download bioclimatic variables from worldclim
# run ??getData to see other options for climate data
bioclim_global.stack <- raster::getData(name = "worldclim",
var = 'bio',
res = 2.5)
# This replaces bioX name with more descriptive acronym
# We choose 4 climatic variables here more for examplary purposes than biologically
# sound reason
names(bioclim_global.stack)[names(bioclim_global.stack) %in% c("bio1", "bio10", "bio11", "bio12")] <-
c("MAT", "MTWQ", "MTCQ", "MAP")
# This crops the extent to Oregon and California
bioclim.stack <- crop(bioclim_global.stack, extent(CA_OR.shp))
# Let's only look at these 4 variables
predictors <- raster::subset(bioclim.stack, c("MAT", "MTWQ", "MTCQ", "MAP"))
# bio1 = Mean Annual Temperature (MAT)
# bio10 = Mean Temperature of Warmest Quarter (MTWQ)
# bio11 = Mean Temperature of Coldest Quarter (MTCQ)
# bio12 = Annual Precipitation (MAP)
# etc.
# Let's see what Mean Annual temperature raster looks like
sp::plot(predictors$MAT, main = "Mean Annual Temperature")
# Organizing species presence and environmental predictors into data.frames
# These are the predictor values at locations of species presence
presvals <- raster::extract(predictors, dc.df)
# These are 500 random locations, used as in place of absence values as
# 'pseudoabsences' (the species probably doesn't occur at any random point)
backgr <- randomPoints(predictors, 500)
# predictor values at random locations
absvals <- raster::extract(predictors, backgr)
# We know that expected habitat suitability (Ey) is 1 for areas where the species
# was found, and we assume it's 0 for the random background points
Ey <- c(rep(1, nrow(presvals)), rep(0, nrow(absvals)))
# Now here we have a dataframe with the response variable (Ey) and corresponding
# predictor values
sdmdata <- data.frame(cbind(Ey, rbind(presvals, absvals)))
View(sdmdata)
# Look for collinearity using a scatterplot matrix
pairs(sdmdata[,2:length(sdmdata)], cex = 0.1, fig=TRUE)
# MAT and MTWQ are collinear across this area!
# So let's remove bio1 and try again
predictors <- raster::subset(bioclim.stack, c("MTWQ", "MTCQ", "MAP"))
presvals <- raster::extract(predictors, dc.df)
backgr <- randomPoints(predictors, 500)
absvals <- raster::extract(predictors, backgr)
Ey <- c(rep(1, nrow(presvals)), rep(0, nrow(absvals)))
sdmdata <- data.frame(cbind(Ey, rbind(presvals, absvals)))
pairs(sdmdata[,2:length(sdmdata)], cex = 0.1, fig=TRUE)
# Better! A popular metric of collinearity analysis is Variance Inlation Factor
# (see https://www.rdocumentation.org/packages/car/versions/3.0-9/topics/vif)
# Let's start Modeling! Here's the most basic SDM
## Linear Model SDM
# E(y) = mx + b + e
# E(y) is expected habitat suitability
# b = intercept
# m = coefficient
# x = predictor
# e = error
# Assumes: Normal distributed error
# mean = 0
# Variance(y) is constant
sdm_lm <- lm(Ey ~ MTWQ, data = sdmdata)
plot(sdmdata$MTWQ, sdmdata$Ey,
main = "D. californica Linear SDM",
xlab = "Mean Temp of Warmest Quarter",
ylab = "Habitat Suitability"
)
abline(sdm_lm)
# The simplest SDM ever! Do this for each predictor and average habitat
# suitability in each cell (I think?)
## Generalized Linear Model SDM
# g(E(y)) = LP = mx + b + e
# g() is link function
# LP = Linear Predictor
# Does not have to have normal distribution of error
# does not have to have homogenous variance. Options of different link functions.
# Different link functions have their own assumptions
plot(sdmdata$MTWQ, sdmdata$Ey,
main = "D. californica GLM SDM",
xlab = "Mean Temp of Warmest Quarter",
ylab = "Habitat Suitability"
)
# A GLM with a Gaussian Link Function is a Linear Model
sdm_lm <- glm(Ey ~ MTWQ, data = sdmdata, family = gaussian)
abline(sdm_lm, lwd = 3, col = "blue")
legend("topright", legend = c("Linear Model"), col = c("blue"), lty=1, lwd = 3)
# In SDM we use the logit link function, to turn the binary presence/absence
# into a continuous response
# GLM: g(E(y)) = LP = log10(Ey / (1 - Ey)) = mx + b + e
# Solve for Ey:
# Ey = e^(mx + b + e) / (1 + e^(mx + b + e))
# Go to slide >>>
sdm_glm <- glm(Ey ~ MTWQ, data = sdmdata, family = binomial)
# This is the equation of line 142, written as an R function
Ey_calc <- function(x){
y_int <- as.numeric(sdm_glm$coefficients["(Intercept)"])
m <- as.numeric(sdm_glm$coefficients["MTWQ"])
LP = m*x + y_int
return(exp(LP) / (1 + exp(LP)))
}
# Need to sort in order for plot to add line properly
MTWQ_sorted <- sort(sdmdata$MTWQ)
Ey <- Ey_calc(MTWQ_sorted)
lines(MTWQ_sorted, Ey, lwd = 3, col = "red")
legend("topright", legend = c("Linear Model", "GLM Model"), col = c("blue", "red"), lty=1, lwd = 3)
# Here's a function we'll use to plot SDM projections
project.sdm <- function(prediction, plotName){
sp::plot(prediction, main = plotName)
sp::plot(CA_OR.shp, add = T)
points(dc.df, pch = 16, cex = .2)
#legend("bottomright", legend = "obs. occurrences", pch = 16, cex=.2)
}
# Here's a map of projection of the SDM model
prediction_glm <- raster::predict(bioclim.stack, sdm_glm)
# we need to log transform it to give us the expected presence
# We have g(E(x)), where E(x) = e^(LP) / (1 + e^LP)
# where E(x) is expected presence or absence of species
prediction_glm.Ey <- exp(prediction_glm) / (1 + exp(prediction_glm))
par(mfrow=c(1,2))
plot(MTWQ_sorted, Ey, lwd = 3, col = "green", type = 'l',
main = "GLM",
xlab = "Mean Temp of Warmest Quarter",
ylab = "Habitat Suitability")
project.sdm(prediction_glm.Ey, "GLM SDM (D. californica), MTWQ only")
# Let's make GLM with all 3 variables
par(mfrow=c(1,1))
sdm_glm3 <- glm(Ey ~ MTWQ + MTCQ + MAP, data = sdmdata, family = binomial)
prediction_glm3 <- raster::predict(bioclim.stack, sdm_glm3)
prediction_glm3.Ey <- exp(prediction_glm3) / (1+exp(prediction_glm3))
project.sdm(prediction_glm3.Ey, "GLM SDM (D. californica)")
# Now let's use real software based SDMs
# Here's Machine Learning GLM
# Let's try sdm package
# sdm packages can do GLM, GAM, GBM, RF, TREE, MARS, SVM
# Prep data format for the sdm package
sdm.pkg.df_pres <- cbind(dc.df, presvals)
sdm.pkg.df_pres$Ey <- 1
names(sdm.pkg.df_pres)[1:2] <- c("x", "y")
sdm.pkg.df_abs <- data.frame(cbind(backgr, absvals))
sdm.pkg.df_abs$Ey <- 0
sdmdf_sdmpkg <- rbind(sdm.pkg.df_pres, sdm.pkg.df_abs)
sdmdata_sdmpkg <- sdmData(Ey ~ MTWQ + MTCQ + MAP, train = sdmdf_sdmpkg)
# Run the model and project
sdm_ml.glm <- sdm::sdm(Ey ~ MTWQ + MTCQ + MAP, data = sdmdata_sdmpkg, methods=c("glm"))
prediction_ml.glm <- raster::predict(sdm_ml.glm, bioclim.stack)
project.sdm(prediction_ml.glm, "ML GLM SDM (D. californica)")
# Use this function to determine which variables were more important
getVarImp(sdm_ml.glm)
# Let's try GAM
# Run the model and project
sdm_ml.gam <- sdm::sdm(Ey ~ MTWQ + MTCQ + MAP, data = sdmdata_sdmpkg, methods=c("gam"))
prediction_ml.gam <- raster::predict(sdm_ml.gam, bioclim.stack)
project.sdm(prediction_ml.gam, "ML GAM SDM (D. californica)")
getVarImp(sdm_ml.gam)
# Now Let's try more strictly Machine Learning: Random Forest
# Run the model and project
sdm_rf <- sdm::sdm(Ey ~ MTWQ + MTCQ + MAP, data = sdmdata_sdmpkg, methods=c("rf"))
prediction_rf <- raster::predict(sdm_rf, bioclim.stack)
project.sdm(prediction_rf, "Random Forest SDM (D. californica)")
getVarImp(sdm_rf)
# ENSEMBLE let's make an ensemble model of all of them
# a number of methods can be used, see documentation, but they include
# weighted mean, unweighted mean, median, entropy, etc
sdm_glm.gam.rf <- sdm::sdm(Ey ~ MTWQ + MTCQ + MAP, data = sdmdata_sdmpkg, methods=c("glm","gam","rf"))
sdm_ensemble <- sdm::ensemble(sdm_glm.gam.rf, bioclim.stack,
setting=list(method="weighted", stat="TSS"))
project.sdm(sdm_ensemble, "Ensemble (D. californica)")
getVarImp(sdm_glm.gam.rf)
# Let's see what dismo has
# BIOCLIM, DOMAIN, MaxEnt
# but powerful model evaluation software, and software to test the importance of
# particular variables, producing bioclim variables for monthly climate data (CMIP5 is great)
# niche equivalency, niche overlap, a tool that finds the best threshold based on
# particular accuracy metrics
# Here's BIOCLIM, one of the oldest models, and EE
# Puts a convex hull around all the points in n-dimensional niche space
# Not very powerful at extrapolation
sdm_bioclim <- bioclim(presvals)
response(sdm_bioclim)
prediction_bioclim <- dismo::predict(sdm_bioclim, bioclim.stack)
project.sdm(prediction_bioclim, "BIOCLIM SDM (D. californica)")
# Maxent, need to install maxent (https://biodiversityinformatics.amnh.org/open_source/maxent/)
# and place it here:
system.file("java", package="dismo")
sdm_maxent <- maxent(predictors, dc.df)
prediction_maxent <- dismo::predict(sdm_maxent, bioclim.stack)
project.sdm(prediction_maxent, "MaxEnt SDM (D. californica)")
# Look at response for each predictor
response(sdm_maxent)
# Look at variable contribution for maxent
plot(sdm_maxent)
# Make an ensemble model!!
ensemble_bioclim.maxent <- mean(prediction_bioclim, prediction_maxent)
# Seems less sophisticated than the 'sdm' package!
plot(ensemble_bioclim.maxent, main = "MaxEnt BIOCLIM ensemble")
points(dc.df, pch = 16, cex = .2)
legend("bottomright", legend = "obs. occurrences", pch = 16)
# Amalgamation of all models:
par(mfrow=c(3,3))
project.sdm(prediction_glm.Ey, "GLM SDM (D. californica), MTWQ only")
project.sdm(prediction_glm3.Ey, "GLM SDM (D. californica)")
project.sdm(prediction_ml.glm, "ML GLM SDM (D. californica)")
project.sdm(prediction_ml.gam, "ML GAM SDM (D. californica)")
project.sdm(prediction_rf, "Random Forest SDM (D. californica)")
project.sdm(sdm_ensemble, "Ensemble (D. californica)")
project.sdm(prediction_bioclim, "BIOCLIM SDM (D. californica)")
project.sdm(prediction_maxent, "MaxEnt SDM (D. californica)")
plot(ensemble_bioclim.maxent, main = "MaxEnt BIOCLIM ensemble")
points(dc.df, pch = 16, cex = .2)
par(mfrow=c(1,1))