McD_DSP_anal.Rmd

---
title: "DSP anal"
author: "Skye Wills"
date: "July, 2018"
output:
  html_document: default
  pdf_document: default
classoption: landscape
---


```{r setup, include=FALSE}
# The workspace is deliberately NOT cleared here: `rm(list = ls())` deletes
# every object in the calling environment when this chunk is run interactively
# (or when the report is rendered from the console), and every object the
# report uses is created by the chunks below before it is read.

# Hide the R code in the rendered report unless a chunk overrides it.
knitr::opts_chunk$set(echo = FALSE)

# Packages this report depends on (each listed once).
list.of.packages <- c(
  "knitr", "tidyverse", "aqp", "readxl", "kableExtra", "purrr", "sharpshootR"
)

# Install whichever of them are not yet present in the library paths.
new.packages <- setdiff(list.of.packages, rownames(installed.packages()))
if (length(new.packages) > 0) {
  install.packages(new.packages)
}


library(knitr)
library(tidyverse)
library(purrr)
library(aqp)
library(readxl)
library(sharpshootR)
library(kableExtra)



```



```{r data, echo = F, message=F, warning=F}

# set DSP file location (change this one line if the workbook is moved)
#
# data can be downloaded here:
# https://nrcs.box.com/s/tomz9nl719v1e7r4y84d9rv7tn1vnnpz
# LIMS samples collected as part of a DSP project with limited use and info
dsp_file <- "DSP_project_master_Mar2018.xlsx"
if (!file.exists(dsp_file)) {
  stop("DSP workbook not found: ", dsp_file, call. = FALSE)
}

# Column types for the "All Data Assembled" sheet, in sheet order:
#   columns  1-3   text
#   column   4     date
#   columns  5-7   text
#   column   8     numeric
#   columns  9-20  text
#   columns 21-24  numeric, text, numeric, text
#   columns 25-27  numeric
#   column  28     text
#   columns 29-96  numeric
dsp_col_types <- c(
  rep("text", 3), "date", rep("text", 3), "numeric",
  rep("text", 12),
  "numeric", "text", "numeric", "text",
  rep("numeric", 3), "text",
  rep("numeric", 68)
)
# Guard against an accidental edit changing the number of declared columns.
stopifnot(length(dsp_col_types) == 96L)

DSP <- read_excel(
  dsp_file,
  sheet = "All Data Assembled",
  col_types = dsp_col_types
)

#data entry using best guess for each column (use if columns added)
#dsp_data <- read_excel("DSP_project_master_Mar2018.xlsx", sheet = "All Data Assembled" )

# includes original LIMS names and explanation of columns/properties
dsp_labels <- read_excel(dsp_file, sheet = "dsp_coredata_label")

# data from NASIS
# NOTE(review): this reads the same "dsp_coredata_label" sheet as dsp_labels,
# which looks like a copy/paste slip -- confirm the name of the NASIS site
# sheet. The sheet name is left unchanged here.
dsp_site <- read_excel(dsp_file, sheet = "dsp_coredata_label")

# fix names so that there are no spaces
names(DSP) <- gsub(" ", "_", names(DSP))
names(dsp_labels) <- gsub(" ", "_", names(dsp_labels))
names(dsp_site) <- gsub(" ", "_", names(dsp_site))

```

```{r prop_prep, include=FALSE, message=F, warning=F}

# Print, for every property whose name starts with "BD", the number of
# non-missing values it has after the data are reshaped to long format and
# joined to the label table.
# NOTE(review): `Comp_layer` is given as a positive selection after the
# negative `-c(Name:Moist)`, so it is gathered together with the measurement
# columns -- confirm this is intended rather than -c(Name:Moist, Comp_layer).
DSP %>%
  gather(key = "Property", value = "Value", -c(Name:Moist), Comp_layer) %>%
  left_join(
    dsp_labels %>%
      mutate(Property = Anal) %>%
      select(Property, Simple_explanation, Label)
  ) %>%
  filter(!is.na(Value), grepl("^BD", Property)) %>%
  group_by(Property) %>%
  count()

# Order of bulk density selection - change order if desired.
# bd_1 is tried first and bd_7 last.
bd_1 <- "BD_core_fld"
bd_2 <- "BD_fieldcore"
bd_3 <- "BD_clod_13"
bd_4 <- "BD_compcav"
bd_5 <- "BD_recon13"
bd_6 <- "BD_recon_moist"
bd_7 <- "BD_whole_moist"

```


```{r data_prep, include= F, message=F, warning=F}

#Currently the columns are ID'd directly - eventually these should be changeable

##################
# # #check that BD assignments are correct
# # str(c(dsp[, bd_1], dsp[, bd_2],dsp[, bd_3],dsp[, bd_4],dsp[, bd_5],dsp[, bd_6], dsp[, bd_7]))
# # 
#   dspB <- dsp %>%
#     select_(bd_1,  bd_2, bd_3, bd_4, bd_5, bd_6, bd_7)
# 
#  #create new data element that combines all bulk density methods
#   dspB <- dspB %>%
#          # mutate_(BulkDensity = if_else(!is.na( print(bd_1, quote=FALSE)), 
#          #                               print(bd_1, quote=FALSE),
#          #                       if_else(!is.na(print(bd_2, quote=FALSE)),  
#          #                               print(bd_2, quote=FALSE),
#          #                           if_else(!is.na(print(bd_3, quote=FALSE)),  
#          #                                   print(bd_3, quote=FALSE),
#          #                           if_else(!is.na( print(bd_4, quote=FALSE)),  
#          #                                   print(bd_4, quote=FALSE),
#          #                              if_else(is.na(print(bd_5, quote=FALSE)), 
#          #                                      print(bd_5, quote=FALSE),
#          #                                if_else(!is.na(print(bd_6, quote=FALSE)),
#          #                                        print(bd_6, quote=FALSE),
#          #                                   print(bd_7, quote=FALSE))
#          #                            )))))
#          # )
#        mutate(bd_source = if_else(!is.na(BD_core_fld), 'BD_core_fld',
#                               if_else(!is.na(BD_fieldcore), 'BD_fieldcore',
#                                  if_else(!is.na(BD_clod_13),  'BD_clod_13',
#                                        if_else(!is.na(BD_compcav),  'BD_compcav',
#                                                if_else(is.na(BD_recon13), 'BD_recon13',
#                                                  if_else(!is.na(BD_recon_moist),'BD_recon_moist',
#                                                                 'BD_whole_moist')
#                                   ))))))       %>%
#       mutate_(BulkDensity = if_else(!is.na(dsp$BD_core_fld), dsp$BD_core_fld,
#                                if_else(!is.na(dsp$BD_fieldcore), dsp$BD_fieldcore,
#                                   if_else(!is.na(dsp$BD_clod_13),  dsp$BD_clod_13,
#                                         if_else(!is.na(dsp$BD_compcav),  dsp$BD_compcav,
#                                                 if_else(is.na(dsp$BD_recon13), dsp$BD_recon13,                                                  if_else(!is.na(dsp$BD_recon_moist),dsp$BD_recon_moist,
#                                                                  dsp$BD_whole_moist)
#                                    ))))))
#      
# 
#    
#    %>%
#    mutate_(BulkDensity = if_else(!is.na(bd_1), bd_1,
#                               if_else(!is.na(bd_2), bd_2,
#                                      if_else(!is.na(bd_3), bd_3,
#                                            if_else(!is.na(bd_4), bd_4,
#                                                    if_else(is.na(bd_5),bd_5,
#                                                        if_else(!is.na(bd_6), bd_6,
#                                                                 bd_7)                                   )))))
#    )
#####################

dsp <- data.frame(DSP)

# Bulk density columns in order of preference (first = most preferred).
bd_priority <- c(bd_1, bd_2, bd_3, bd_4, bd_5, bd_6, bd_7)
missing_bd <- setdiff(bd_priority, names(dsp))
if (length(missing_bd) > 0) {
  stop(
    "Bulk density column(s) not found: ",
    paste(missing_bd, collapse = ", "),
    call. = FALSE
  )
}

# BulkDensity holds, for each row, the value from the most preferred method
# that has one; bd_source records which column that value came from.
# Rows with no bulk density in any method stay NA in BOTH columns, so they are
# no longer reported as coming from the last method in the list.
dsp$BulkDensity <- rep(NA_real_, nrow(dsp))
dsp$bd_source <- rep(NA_character_, nrow(dsp))
# Walk from least to most preferred so that preferred methods overwrite.
for (bd_col in rev(bd_priority)) {
  measured <- !is.na(dsp[[bd_col]])
  dsp$BulkDensity[measured] <- dsp[[bd_col]][measured]
  dsp$bd_source[measured] <- bd_col
}

table(dsp$bd_source)

str(dsp$BulkDensity)
summary(dsp$BulkDensity)

# change NA's to zero for Calcium carbonate
dsp$CaCarb[is.na(dsp$CaCarb)] <- 0

# change negative values to zero for Calcium carbonate
dsp$CaCarb[dsp$CaCarb < 0] <- 0

summary(dsp$CaCarb)


# calculate SOC with updated carbonates: total carbon minus 0.12 * CaCarb,
# floored at zero (rows with missing Tot_C remain NA)
dsp$SOC <- pmax(dsp$Tot_C - 0.12 * dsp$CaCarb, 0)

summary(dsp$SOC)

#############
```



```{r proj, include = F, message=F, warning=F}
# Number of rows available for each project name.
table(dsp$Name)

# Project to report on: alter the project code inside the quotation marks.
PROJECT <- "NE_DSP_McD"

# Keep only the rows that belong to the selected project.
dsp <- filter(dsp, Name == PROJECT)

```

```{r , echo = F, message=F, warning=F}
# Column used to compare conditions within each project.
COMPARE <- "COND"

# Label for the comparison made - usually management system, state phase or
# condition.
x_label <- "Management System"

# Column that stratifies the data by spatial collection distribution
# (use the unique plot id).
PLOT <- "PlotID"

# Plot numbers: not unique across COND, but shorter labels.
PLOT_NO <- "Plot"
```

### Comparable Layers
It is helpful to group horizons into similar layers for analysis. 
There were two general kinds of horizons sampled.  The names are based on long-term soil forming factors.

*A horizons* - zones of organic matter accumulation and moderate clay depletion (for this project we include Ap horizons, which are A horizons modified substantially by disturbance like tillage)

*Bt horizons* - zones of clay accumulation


_*Lists of Horizons Used*_
```{r comp, echo = F, message=F, warning=F }
# Horizon designations present in the selected project.
table(dsp$hor_desg)

# Comparable layers: horizons are grouped under these names for comparisons
# and statistical analysis. Most horizons are covered by this list, but not all.
cl <- c(
  "A horizons",
  "Bt horizons"
)

# One regular expression per comparable layer, in the same order as `cl`;
# adjust as needed. Inside a pattern `|` separates alternatives, `^` anchors
# a match to the start of the designation and `$` anchors it to the end.
cl_hor <- c(
  'A|^A|A$',
  'Bt|^Bt|Bt$|Bt|B'
)

# Assign every horizon designation to a comparable layer.
dsp$Comp_layer <- generalize.hz(dsp$hor_desg, cl, cl_hor)

table(dsp$Comp_layer)

# Row count for each soil / horizon designation / comparable layer combination.
k <- dsp %>%
  group_by(Soil, hor_desg, Comp_layer) %>%
  tally()

# Cross-tabulate comparable layers against the original designations and
# print the table with row and column totals.
tab <- table(dsp$Comp_layer, dsp$hor_desg)
addmargins(tab)

# Draw the layer-to-designation relationships as a directed graph using
# functions from the sharpshootR package.
m <- genhzTableToAdjMat(tab)
par(mar = c(1, 1, 1, 1))
plotSoilRelationGraph(m, graph.mode = "directed", edge.arrow.size = 0.5)




```


  

```{r dataprep, include =FALSE}

# Column names available for selecting the primary analysis fields.
names(dsp)
names(dsp_labels)

# Reshape from wide to long format (one row per sample and property), attach
# the label information for each property, then fix the column types.
# bd_source is kept only on rows whose property name starts with "BD" and is
# blanked ("") everywhere else.
dsp <- dsp %>%
  gather(key = "Property", value = "Value", -c(Name:Moist, bd_source)) %>%
  left_join(
    dsp_labels %>%
      mutate(Property = Anal) %>%
      select(Property, Simple_explanation, Label)
  ) %>%
  mutate(
    Value = as.numeric(Value),
    Property = as.factor(Property),
    bd_source = ifelse(grepl("^BD", Property), bd_source, "")
  )

names(dsp)

```



<br>

### Carbon PLOTS

Soil carbon was measured for all samples.  A subset had POX-C and B-glucosidase measured. The first plots evaluate the values for each metric by depth of the sample.  The second set of plots shows the range of low - average - high values for each general management system.

__Soil Organic Carbon__ is all soil carbon in a soil sample that is not associated with minerals (like calcium carbonate or lime).  It includes multiple ages and pools of organic carbon, and thus it takes many years for values to change due to management. Many measurements have been made for soil survey, and there is a lot of information about expected or typical values for any given soil.

__POX-C__ is Permanganate Oxidizable Carbon, sometimes called 'Active Carbon'.  It is thought to be related to pools of carbon that change quickly with management.

__B-glucosidase__ is an extra-cellular enzyme produced by soil microbes.  It is an indicator of general microbial activity.  When comparing management systems or sites on the same soil, higher levels mean higher microbial activity.   

```{r explore_plots, echo = F, warning=F, message=F}

# check labels: per-property counts of values that have a label (l) and of
# all non-missing values (n)
l <- dsp %>%
  filter(!is.na(Label), !is.na(Value)) %>%
  group_by(Property) %>%
  count()

n <- dsp %>%
  filter(!is.na(Value)) %>%
  group_by(Property) %>%
  count()


### create summary plots
###############
# Select properties that you want to be evaluated
prop <- c("Clay", "pH", "SOC", "POX_C", "Bgluc")

###########################
# Depth plot: one panel per property, horizon top depth increasing downwards
d <- dsp %>%
  filter(Property %in% prop & !is.na(Value)) %>%
  ggplot(aes(x = Value, y = hor_top, color = COND)) +
  geom_point(size = 6) +
  scale_y_reverse() +
  labs(y = "Top Depth (cm)") +
  facet_wrap(~Property, scales = "free_x", nrow = 1)

d

# Box plots by management system: one figure per property, one panel per
# comparable layer
prop <- c("SOC", "POX_C", "Bgluc")

for (i in unique(prop)) {
  # label text stored for this property, used as the y axis title
  Label <- dsp %>%
    filter(Property == i) %>%
    select(Label) %>%
    unique()

  dp <- dsp %>%
    filter(Property == i) %>%
    group_by(Pedon, Comp_layer) %>%
    ggplot(aes(y = Value, x = COND, color = Comparison)) +
    geom_boxplot() +
    geom_jitter(alpha = 0.5) +
    facet_wrap(~Comp_layer) +
    labs(x = "Management System", y = paste(Label), color = "Field")

  print(dp)
}

```

<br>

<br>

<br>

<br>
<br>

## Organic Matter

For soil survey analysis, soil carbon is measured through the CO2 evolved during high-temperature combustion. Any mineral (carbonate) carbon is then subtracted to get soil organic carbon, which is converted to organic matter.

``` {r om_plots1, echo = F, warning=F, message=F} 

# Organic matter against horizon top depth, coloured by condition.
# Organic matter is taken as SOC * 1.72.
d_soc <- dsp %>%
  filter(Property == "SOC") %>%
  mutate(om = Value * 1.72) %>%
  ggplot(aes(x = om, y = hor_top, color = COND)) +
  geom_point(size = 6) +
  scale_y_reverse() +
  labs(
    y = "Top Depth (cm)",
    title = "Organic Matter by Depth",
    x = "Organic Matter %"
  )

d_soc

# Organic matter by management system, one panel per comparable layer.
dp <- dsp %>%
  filter(Property == "SOC") %>%
  mutate(om = Value * 1.72) %>%
  group_by(Pedon, Comp_layer) %>%
  ggplot(aes(y = om, x = COND, color = Comparison)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.5) +
  facet_wrap(~Comp_layer) +
  labs(x = "Management System", y = "Organic Matter %")

dp
  
  
 
 
```

<br>

<br>

<br>

## McDonald Sites

``` {r om_plots, echo = F, warning=F, message=F} 

# SOC rows for the McDonald comparison, with organic matter (SOC * 1.72) and
# the horizon top depth converted to inches (hor_top / 2.54). Columns are
# renamed to the terms used in the figures.
McD <- dsp %>%
  filter(
    Comparison == "McDonald Soil Health Management" & Property == "SOC"
  ) %>%
  mutate(
    om = Value * 1.72,
    Label = "Organic Matter (%)",
    top_in = hor_top / 2.54
  ) %>%
  rename(
    MapUnit = Region.strata,
    Season = AgronFeat,
    Field = Comparison,
    Site = Plot
  )

# Organic matter by site, one panel per comparable layer.
Mcd_om <- McD %>%
  ggplot(aes(y = om, x = Site, color = Soil)) +
  geom_boxplot() +
  geom_jitter(size = 6, aes(shape = Season)) +
  labs(
    title = "Organic Matter by Site, Season, and Layer",
    y = "Organic Matter %"
  ) +
  facet_wrap(~Comp_layer)

Mcd_om

# Organic matter against depth. The axis is labelled in inches, so the
# converted depth (top_in) is plotted rather than hor_top.
md <- McD %>%
  ggplot(aes(x = om, y = top_in, color = PlotID)) +
  geom_point(aes(shape = Season), size = 6) +
  scale_y_reverse() +
  labs(
    y = "Depth (inches)",
    title = "Organic Matter by Depth",
    x = "Organic Matter %",
    color = "Site"
  )

md

# Same depth plot split into one panel per site.
Smd <- McD %>%
  ggplot(aes(x = om, y = top_in)) +
  geom_point(aes(color = Season, shape = Season), size = 6) +
  scale_y_reverse() +
  scale_color_brewer(type = "qual") +
  labs(
    y = "Depth (inches)",
    title = "Organic Matter by Depth",
    x = "Organic Matter %"
  ) +
  facet_wrap(~Site, labeller = label_both)

Smd

# Same panels in two columns, with each point labelled by its pedon.
Smd +
  geom_text(aes(label = Pedon)) +
  facet_wrap(~Site, labeller = label_both, ncol = 2)

# Mean organic matter per plot within depth classes.
Dmd <- McD %>%
  mutate(
    # include.lowest = TRUE keeps horizons whose top is exactly 0 inches (the
    # surface layers); without it they fall outside the first interval,
    # become NA and are removed by the filter below.
    D = cut(
      top_in,
      breaks = c(0, 3, 6, 12, 24, 48),
      labels = c(
        "0 - 3 inches", "3 - 6 inches", "6 - 12 inches",
        "12 - 24 inches", "24 - 48 inches"
      ),
      include.lowest = TRUE
    ),
    # reverse the levels so the shallowest class is drawn at the top once the
    # axes are flipped
    D = factor(D, levels = rev(levels(D)))
  ) %>%
  filter(!is.na(D)) %>%
  group_by(PlotID, Field, Season, Soil, MapUnit, D) %>%
  summarize(
    OMD = mean(om, na.rm = TRUE),
    SOCD = mean(Value, na.rm = TRUE)
  ) %>%
  ggplot(aes(y = OMD, x = D)) +
  geom_jitter(aes(shape = Season, color = MapUnit), size = 4, alpha = 0.75) +
  geom_boxplot(color = "grey", alpha = 0.75) +
  coord_flip() +
  theme(
    plot.caption = element_text(hjust = 0, size = 9),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 24)
  ) +
  # labels follow the aesthetics, not the flipped screen position:
  # x is the depth class (D) and y is organic matter (OMD)
  labs(
    x = "Depth",
    y = "Organic Matter %",
    title = "Organic Matter by Depth",
    color = "Map Unit"
  )

Dmd






```

<br>

## Summary Data

Averaging sites allows us to better compare general trends and ignore within field variability.

```{r data_summary, echo = F, warning=F, message=F}

# Pedon level: thickness-weighted mean of each selected property over the
# horizons of a pedon (weights = hor_thick).
ped <- dsp %>%
  filter(Property %in% prop) %>%
  rename(MapUnit = Region.strata, Season = AgronFeat, Field = Comparison) %>%
  group_by(
    Pedon_ID, PlotID, MapUnit, Property, Field, COND, Soil, Season
  ) %>%
  summarize(
    wt.avg = weighted.mean(Value, hor_thick, na.rm = TRUE)
    #add quantiles
  )

# Plot level: lowest, mean and highest pedon value within each plot.
plot <- ped %>%
  filter(Property %in% prop) %>%
  group_by(PlotID, MapUnit, Property, Field, COND, Soil, Season) %>%
  summarize(
    Plot.min = min(wt.avg, na.rm = TRUE),
    Plot.mean = mean(wt.avg, na.rm = TRUE),
    Plot.max = max(wt.avg, na.rm = TRUE)
  )
#add quantiles?

# One wide table per statistic (a column for each property), with organic
# matter added as SOC * 1.72.
PLOT.mean <- plot %>%
  select(PlotID, MapUnit, Property, Field, COND, Soil, Season, Plot.mean) %>%
  spread(Property, Plot.mean) %>%
  mutate(OM = SOC * 1.72)

PLOT.min <- plot %>%
  select(PlotID, MapUnit, Property, Field, COND, Soil, Season, Plot.min) %>%
  spread(Property, Plot.min) %>%
  mutate(OM = SOC * 1.72)

PLOT.max <- plot %>%
  select(PlotID, MapUnit, Property, Field, COND, Soil, Season, Plot.max) %>%
  spread(Property, Plot.max) %>%
  mutate(OM = SOC * 1.72)


# Print the three tables.
landscape(
  kable(
    PLOT.mean,
    format = "html",
    digits = 2,
    caption = "Average Pedon Value for each Plot"
  ),
  margin = NULL
)
landscape(
  kable(PLOT.min, digits = 2, caption = "Pedon Minimum Value for each Plot")
)
kable(PLOT.max, digits = 2, caption = "Pedon Maximum Value for each Plot")
```
 
```{r summary_plot, echo = F, warning=F, message=F}

# Stack the plot-level statistics (Plot.min / Plot.mean / Plot.max) into long
# format: one row per plot, property and statistic.
PLOT <- plot %>%
  gather(key = "Stat", value = "Value", -c(PlotID:Season))

# One figure per property: box plots by management system, one panel per
# statistic, with the individual plots jittered on top and coloured by season.
prop <- c("SOC", "POX_C", "Bgluc")
for (i in unique(prop)) {
  # label text stored for this property, used as the y axis title
  Label <- dsp %>%
    filter(Property == i) %>%
    select(Label) %>%
    unique()

  pp <- PLOT %>%
    filter(Property == i) %>%
    group_by(Stat, PlotID, Field) %>%
    ggplot(aes(y = Value, x = COND)) +
    geom_boxplot() +
    geom_jitter(aes(color = Season), alpha = 0.5) +
    facet_wrap(~Stat, nrow = 1) +
    labs(x = "Management System", y = paste(Label))

  print(pp)
}
 

```