-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data cleaning.R
208 lines (141 loc) · 8.43 KB
/
Data cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
library(reshape2)
library(dplyr)
library(ggplot2)
library(vegan)
library(stringr)
library(rgbif)
library(tidyverse)
library(ape)
library(V.PhyloMaker)
library(phytools)
library(PhyloMeasures)
library(ggtree)
library(treeio)
library(picante)
veg.ini <- read.csv("veg.csv", header = TRUE, check.names = FALSE)
# melt to dataframe to long format
veg.ini_2 <- reshape2::melt(veg.ini, id.vars ="species", na.rm = FALSE)
write.csv(veg.ini_2,"species1.csv") # save as cvs file
# GBIF authentication of species names
ns<-read.csv('gbif.csv',sep=';')
sp.ex<-name_backbone(name='Solidago gigantea') # view the complete of the species
View(sp.ex)
# run the gbif command
check_gbif_fun<-function(x) name_backbone(name = x, kingdom = 'plants')
gbifkey <- lapply(ns$species, check_gbif_fun)
for(i in 1:length(gbifkey))gbifkey[[i]]<-as.data.frame(gbifkey[[i]])
gbifkey <- bind_rows(gbifkey)
gbifkey$orig<-ns$species
write.table(gbifkey,'gbif_names.csv', sep = ',', row.names = TRUE)
# sum and group species in layer
species <- read.csv("species_data.csv")
species_merged <- species%>% group_by(plotid,species)%>%summarise(cover)
write.csv(species_merged,"species_sum.csv")
## Use dcast function to transpose the data into wide format
species_sum <-read.csv("species_sum.csv")
species_trans <- dcast(species_sum, plotid~ species,
value.var="cover", fill=0,
fun.aggregate = sum)
write.csv(species_trans,"community.csv")
species_layer <- read.csv("species_data.csv")
species_trans <- dcast(species_layer, plotid + layer~ species,
value.var="cover", fill=0,
fun.aggregate = sum)
write.csv(species_layer,"community_layer.csv")
## generate a phylogeny tree for community data
comm.phylo <- read.csv("community_phylo_revised.csv")
## Create phylogenetic tree
comm.phylo1 = phylo.maker(comm.phylo)
comm.tree = comm.phylo1$scenario.3
# Plot the tree
plotTree(comm.tree)
windows()
ggtree(comm.tree, branch.length='none', layout='circular') + geom_tiplab()
# export the tree for manual editing or use in other software
write.tree(comm.tree, "commmuity_revised.tre")
# Import the tree back into R
# Or import a tree made with other software into R
tree <- read.tree("tree.tre")
ggplot(tree, aes(y)) + geom_tree() + theme_tree()
# To show all it internal node/tips
ggtree(comm.tree) + geom_nodepoint(color="#b5e521", alpha=1/4, size=10) + geom_tippoint(color="#FDAC4F", shape=8, size=3)
# Displaying labels
p + geom_tiplab(size=2, color="purple")
ggtree(comm.tree) + geom_tiplab(as_ylab = FALSE, color = "firebrick")
ggtree(comm.tree, layout = "circular") + geom_tiplab(aes(angle=angle),size=2, color="blue")
#traits imputation
#in this file we get a dataset with species traits and we will conduct phylogenetic-based imputaion
#load required libraries
require(tidyverse)# for data processing and ggplots
require(V.PhyloMaker)# source of a phylogenetic megatree
require(PVR)#for calculation of phylogenetic distance matrix eigenvalues
require(missForest)# for random forest-based imputation
require(reshape2)#for melting data.frames
require(ggthemes)#for nice plots layout
traits<-read.csv('traits.csv') #.csv file with traits and taxonomic information (species/genus/family)
summary(traits)
#traits are columns 10-25
?read.csv
#phylogenetic tree-----
#using V.phylomaker package we need to extract tree from a megatree, we are going to specify as an argument dataframe with three columns - family, genera and species. We use scenario.3 as default, recommended scenario in the V.phylomaker package
phylo<-phylo.maker(traits[,1:3])$scenario.3
plot(phylo, type='fan', cex=.8)
#quick check of cladograme
#phylo.imputations----------
# following Penone et al. (2014) Meth Ecol Evol
#https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.12232
deco<-PVRdecomp(phylo) #let's decompose the matrix of phylogenetic distances into eigenvalues
cumsum(deco@Eigen$values[1:100]/sum(deco@Eigen$values))#we need to know how much of phylogenetic distance variability is explained by particular eigenvectors, Penone et al. used 15 which explained 60% of variability, here we can see that 16 first eigenvectors explain 60.16% of variability
#as some of variables were presented as character, we need to code them as factors, i.e. Ellenberg's EIVs, pollination or flowering months
for(i in c(14:16)) traits[,i]<-factor(traits[,i], ordered = T)
set.seed(111) #to maintain replicability let's set seed of pseudorandom numbers generator
base.miss<-missForest(cbind(traits[,c(4:21)],deco@Eigen$vectors[,1:16]), ntree=500, maxiter = 15)
traits$liform<-factor(traits$liform)
traits$Alien.status<-factor(traits$Alien.status)
traits$Invasive.status<-factor(traits$Invasive.status)
traits$Residence.time.status<-factor(traits$Residence.time.status)
traits$EIV.T<-as.numeric(traits$EIV.T)
traits$EIV.M<-as.numeric(traits$EIV.M)
traits$EIV.SR<-as.numeric(traits$EIV.SR)
traits$EIV.N<-as.numeric(traits$EIV.N)
#this may keep some time
#we bind our traits matrix with eignecectors, and then every single column containing missing values is predicted based on remaining columns. This literally relies on phylogenetic similarity of species and known traits values
#let's see how good is the fitness
base.miss$OOBerror
#NRMSE is not bad - this is standardized RMSE for numeric traits
#PFC is fraction of false classified cases - here 40.5% is relatively low value, assuming multilevel traits, e.g. Ellenberg's indicators
# in case of prevalence of known trait values, only not-known are imputed
###comparison for appendix pictures----
#we need to bind original and predicted results
comparison<-rbind(traits[,c(4:21)],base.miss$ximp[,1:18])
#only columns with traits
comparison$type<-c(rep(c('orig','imputed'),each=nrow(traits)))
#new column divideds original and imputed values
#we can compare density of distributions
ggplot(comparison, aes(x=SLA,col=type))+geom_density()
ggplot(comparison, aes(x=H,col=type))+geom_density()+scale_x_log10()
#even within the groups
ggplot(comparison, aes(x=liform,y=LDMC,col=type))+geom_boxplot()#+scale_y_log10()
#separately we have to show the continuous traits
#firstly, let's change factors into numeric backward
for(i in c(1:3,11:14)) comparison[,i]<-as.numeric(as.factor(comparison[,i]))
app.imp.cont<-comparison[c(1:18,19)]%>%melt(id.vars='type',variable.name='trait',value.name='val')
#here we melted wide table into long table, to make easy facet-based ggplot
ggplot(app.imp.cont, aes(x=val,col=type))+facet_wrap(~trait,scales='free_x',ncol=4)+geom_density(aes(y=..scaled..))+theme_few()+theme(legend.position = 'bottom')+scale_color_manual(values=c('red','blue'))
#here we can see the overlap between predicted and real values and low errors in EIV-N and LDMC
#
ggplot(app.imp.cont, aes(x=val,col=type))+facet_wrap(~trait,scales='free_x',ncol=4)+geom_density(aes(y=..scaled..))+theme_few()+theme(legend.position = 'bottom')+scale_color_manual(values=c('red','blue'))+scale_x_log10()
#some variables are better to be shown with log scales
#and for factors:
#we melt data.frame with traits twice - first with number of all species with particular trait level, excluding NAs, second - with counts of all species with known information about trait level. This will show us how proportions of traits levels vary between original and imputed datasets
app.imp.fac<-comparison[,c(1:3,9:14,19)]%>%melt(id.vars='type',variable.name='trait',value.name='val')%>%filter(!is.na(val))%>%group_by(type,trait,val)%>%summarise(count=n())
#here we first melted data.frame, then we filter out NAs, groupbed by variables and summarized
app.imp.fcount<-comparison[,c(1:3,9:14,19)]%>%melt(id.vars='type',variable.name='trait',value.name='val')%>%filter(!is.na(val))%>%group_by(type,trait)%>%summarise(counttrait=n())
app.imp.fac<-left_join(app.imp.fac,app.imp.fcount)
#let's join these data.frames
app.imp.fac$perc<-app.imp.fac$count/app.imp.fac$counttrait
#And calculate the percentage fraction (perc)
ggplot(app.imp.fac,aes(y=perc,x=type,fill=val))+geom_col(position='stack',col='black')+facet_wrap(~trait, ncol=4)+theme_few()+theme(legend.position = 'bottom')
#save results
#joint columns with no traits (1:8) from original database and imputed traits dataset (i.e. original values + NAs replaced by estimation)
write.table(cbind(traits[,1:3],base.miss$ximp[,1:18]),'traits_impute.csv', quote = FALSE, sep = ",")