125_Calculate_trends_leftadjusted.Rmd

---
title: "125 Calculate time trends using package leftcensored"
output: html_document
---

This procedure uses data from script 109    

**Basis has now been added to all relevant "series" files, the plot result files, and functions (scr. 125...functions + 402..functions)** 
**This was done for results in folder 07, see APPENDIX 2 at the bottom of this script for modifying and combining files** 
**HOWEVER: The code in this file should be changed so that series are also selected using 'Basis'** 


## Install leftcensored   

- And make sure it's the right version! See comment below.    

```{r}

# Reinstall 'leftcensored' from GitHub to make sure the installed version is current.
# remove.packages() throws an error if the package is not installed,
# so it is only called when the package is actually present.
if ("leftcensored" %in% rownames(installed.packages())) {
  remove.packages("leftcensored")
}
devtools::install_github("DagHjermann/leftcensored", upgrade = "never", force = TRUE)

# Check model syntax
cat(leftcensored:::get_jags_model_code())

# LINE 10 SHOULD have 'min(max(...', as this:
# prob[j] <- min(max(pnorm(cut[j], mu[n+j], tau), 0.01),0.99)

# Source the model-code script directly from GitHub
# (presumably defines get_jags_model_code() in the current environment, as it is
#  called unqualified below - TODO confirm)
source("https://raw.githubusercontent.com/DagHjermann/leftcensored/main/R/jags_model_code.R")  
# Check model syntax: installed package version, then the sourced version
cat(leftcensored:::get_jags_model_code())
cat(get_jags_model_code())

# dir("../../../../../opt/R")
# dir("../../../../../opt/R/leftcensored")
# dir("../../../../../opt/R/leftcensored/R/leftcensored")
# file.remove


```

## 0a. Constants (update each year)     

```{r}

# Last year of data; later chunks use it to keep only series with data in this year
last_year <- 2021

# Not used:
series_lastyear <- 2018   # The series must last at least until this year    
                          # Series without data in any of the three last years will be excluded

#
# Folders for saving input files and results
#
# folder_5: full new run of all series, JAGS model including "prob[j] <- min(max(..." (see above)  
# folder_6: time trends of length-adjusted series, only HG so far  
#
# Alternatives (pick exactly one):
# folder_results <- "Data/125_results_2021_06"        # Basis WWa (length-adjusted), HG only
# folder_results <- "Data/125_results_2021_07"        # combining folder 05 and 06, see APPENDIX 2 below
# folder_results <- paste0("Data/125_results_", last_year)
folder_results <- "Data/125_results_2021_05"          # Basis WW, all

# The input folder has the same name as the results folder, plus "_input"
folder_input <- paste0(folder_results, "_input")

# Create whichever of the two folders does not exist yet
for (folder in c(folder_results, folder_input)) {
  if (!dir.exists(folder)) {
    dir.create(folder)
  }
}

```


### Type of value (wet-weight, wet-weight length-adjusted, etc.)


```{r}


# Select the column holding the values to analyse, based on the input folder:
#   folder 05 -> wet-weight values (VALUE_WW)
#   folder 06 -> length-adjusted wet-weight values (VALUE_WWa)
# Any other folder stops the script.
if (folder_input == "Data/125_results_2021_05_input"){
  # For runs up until folder_5:
  value_column <- "VALUE_WW"
} else if (folder_input == "Data/125_results_2021_06_input"){
  # For folder_6 (this branch previously tested the folder-5 name again,
  # so it could never be reached):
  value_column <- "VALUE_WWa"
} else {
  stop("Latest results are folder 5 or 6 - select one of these")
}
message("Selected value column: ", value_column)

```

  
## 1. Libraries and functions  

### The rest
```{r, results='hide', message=FALSE, warning=FALSE}

# install.packages("lubridate")

# General purpose
library(dplyr)
library(tidyr)
library(purrr)
library(mgcv)    #  mgcv_1.8-39
# install.packages("mgcv")  # for mgcv_1.8-40, b
# packageVersion("mgcv")
packageVersion("mgcv")
library(ggplot2)

# Specific for the analysis
library(rjags)
library(runjags)
library(leftcensored)

# For parallel computing
if (!"doParallel" %in% installed.packages())
  install.packages("doParallel")
library(doParallel)

# Load functions defined in other scripts  
source("125_Calculate_trends_leftadjusted_functions.R")
source("110_Medians_and_PROREF_functions.R")  # for homogenize_series
source("002_Utility_functions.R")

```

## 2. Read data  

### a. Main data  
Read and reformat the most recent data (by default)  

```{r, collapse=TRUE}

# File name pattern = entire file name except date and extension.
# If we have NOT length-adjusted the last year's data, use instead:
# filepattern <- "105_data_with_uncertainty_"
# Normally (last year's data have been length-adjusted):
filepattern <- "109_adjusted_data_"

# Position in the reversed file listing; 1 means "read the newest file"
filenumber <- 1

# Matching files in 'Data', in reverse listing order
files <- rev(dir("Data", pattern = filepattern))

dat_all <- read_rds_file(
  "Data",
  files,
  filenumber = filenumber,
  get_date = FALSE,
  time_since_modified = TRUE
)

# dat_all <- data_list$data

# The file_date text will be used in part 10, when we save the resulting file
# cat("File date text:", data_list$file_date, "\n")

```


### b. Homogenize time series  
Change STATION_CODE, in order to make a single time series for data with different STATION_CODE that in practice should count as the same station   
* Fixing station 227G1 and 227G2 (shall count as 227G)  
* Fixing station 36A and 36A1 (36A1 shall count as 36A)  
  
Also combines PARAM = VDSI and PARAM = Intersex to PARAM = "VDSI/Intersex" for station 71G  

```{r}

# Merge station codes (and VDSI/Intersex) that should count as one series - see text above
dat_all <- dat_all %>% homogenize_series()

```

### c. Prepare data   

- **Important**: set 'value_column' (WW or WWa)

- NOTE: part c-d1 (d1 at least) are slow and can be skipped by going to d2 and read the saved data  

```{r}

# These warnings can be ignored:
# Warning: Unknown or uninitialised column: `threshold`.
# Warning: Unknown or uninitialised column: `uncensored`.

# Isotope parameters (PARAM containing "Delta"): prepared without log-transformation
dat_all_isotopes <- dat_all %>%
  filter(grepl("Delta", PARAM))

# All other parameters, with the per-series count of non-positive wet-weight values.
# The data stay grouped by series, so N_zeros is constant within each series.
# Note: sum() is used without na.rm, so a series with a missing VALUE_WW gets
# N_zeros = NA and ends up in neither of the two subsets below.
dat_all_nonisotopes <- dat_all %>%
  filter(!grepl("Delta", PARAM)) %>%
  group_by(PARAM, STATION_CODE, TISSUE_NAME, LATIN_NAME) %>%
  mutate(N_zeros = sum(VALUE_WW <= 0))

# Series with at least one value <= 0: log-transformed with const = 1
dat_all_with_zeros <- dat_all_nonisotopes %>%
  filter(N_zeros > 0)

# Series without values <= 0: plain log-transformation
dat_all_sans_zeros <- dat_all_nonisotopes %>%
  filter(N_zeros == 0)

dat_all_prep1_isotopes <- lc_prepare(
  dat_all_isotopes,
  x = "MYEAR",
  y = value_column,
  censored = "FLAG1",
  log = FALSE,
  keep_original_columns = TRUE
)

dat_all_prep1_with_zeros <- lc_prepare(
  dat_all_with_zeros,
  x = "MYEAR",
  y = value_column,
  censored = "FLAG1",
  log = TRUE, const = 1,
  keep_original_columns = TRUE
)

dat_all_prep1_sans_zeros <- lc_prepare(
  dat_all_sans_zeros,
  x = "MYEAR",
  y = value_column,
  censored = "FLAG1",
  log = TRUE,
  keep_original_columns = TRUE
)

# Stack the three prepared data sets
dat_all_prep1 <- bind_rows(
  dat_all_prep1_isotopes,
  dat_all_prep1_with_zeros,
  dat_all_prep1_sans_zeros
)

# For folder_results = 125_results_2021_06:
# dat_all_prep1 <- dat_all_prep1_sans_zeros

# Test
# lc_plot(dat_all_prep1 %>% filter(PARAM %in% "CB153" & STATION_CODE %in% "15A"))
# lc_plot(dat_all_prep1 %>% filter(PARAM %in% "SN" & STATION_CODE %in% "80B"))
```


### c2. HG ONLY  

**NOTE**

```{r}

# dat_all_prep1 <- dat_all_prep1 %>%
#   filter(PARAM %in% "HG")

```

### c3. Sum parameters  


*Not used*   
  
Idea was to use the leftcensored method for sums, setting trends based on agreement between trends for "without LOQ" and "with LOQ" data.   
    - E.g. set a downward trend if both the "without LOQ" and the "with LOQ" data have a downward trend, otherwise set the trend to unknown  


```{r}

if (FALSE){
  
  # Not used - experiment with trends for sum parameters (see text above)
  
  source("101_Combine_with_legacy_data_functions.R")  # list 'sum_parameters'  
  
  names(sum_parameters)
  
  # sum_parameters[["DDTEP"]]
  # sum_parameters[["CB_S7"]]
  sum_parameters[["BDE6S"]]
  
  year1 <- 2011
  sumpar <- "BDE6S"
  
  # Rows of the parameters making up the sum, from 'year1' onwards
  df_sumpar_members <- dat_all_prep1 %>%
    filter(PARAM %in% sum_parameters[[sumpar]] & MYEAR >= year1)
  
  # "Without LOQ": flagged values (FLAG1 not NA) count as zero in the sum
  df_without_loq <- df_sumpar_members %>%
    mutate(VALUE_WW = if_else(is.na(FLAG1), VALUE_WW, 0)) %>%
    group_by(STATION_CODE, TISSUE_NAME, LATIN_NAME, x) %>%
    summarize(VALUE_WW = sum(VALUE_WW)) %>%
    ungroup() %>%
    mutate(
      PARAM = paste0(sumpar, "_without_LOQ"),
      FLAG1 = NA_character_
    )
  
  # "With LOQ": all values are summed as they are
  df_with_loq <- df_sumpar_members %>%
    group_by(STATION_CODE, TISSUE_NAME, LATIN_NAME, x) %>%
    summarize(VALUE_WW = sum(VALUE_WW)) %>%
    ungroup() %>%
    mutate(
      PARAM = paste0(sumpar, "_with_LOQ"),
      FLAG1 = NA_character_
    )
  
  df_with_and_without_loq <- bind_rows(df_without_loq, df_with_loq)
  
}


# dat_all_prep1 <- dat_all_prep1 %>%
#   filter(PARAM %in% "HG")

```


### d1. Add flags for rule 1 and rule 2 (columns Rule1 and Rule2)   

```{r}

if (FALSE){
 # Testing
 test1 <- dat_all_prep1 %>% filter(PARAM %in% "SN" & STATION_CODE %in% "80B")
 test2 <- leftcensored:::lc_flag1(test1)
 test3 <- leftcensored:::lc_flag2(test2)
}


#
# Create 'dat_all_prep3' (= 'dat_all_prep1' with flags added by lc_flag1 and lc_flag2,
# for series that have data in 'last_year')
#

# File name for saving data
fn_dat_all <- paste0(folder_input, "/125_dat_all_prep3.rds")

if (!file.exists(fn_dat_all)){
  
  # If file doesn't exist, we create it and store it
  
  # Rows of 'data' belonging to series number i (= row i of 'seriesdata')
  get_series_rows <- function(i, seriesdata, data){
    data %>%
      filter(PARAM %in% seriesdata$PARAM[i],
             STATION_CODE %in% seriesdata$STATION_CODE[i],
             LATIN_NAME %in% seriesdata$LATIN_NAME[i],
             TISSUE_NAME %in% seriesdata$TISSUE_NAME[i])
  }
  
  # Apply lc_flag1 to series number i
  lc_flag1_seriesno <- function(i, seriesdata, data){
    get_series_rows(i, seriesdata, data) %>%
      leftcensored:::lc_flag1(show_result = FALSE)
  }
  
  # Apply lc_flag2 to series number i
  lc_flag2_seriesno <- function(i, seriesdata, data){
    get_series_rows(i, seriesdata, data) %>%
      leftcensored:::lc_flag2(show_result = FALSE)
  }
  
  # Only series existing in the last year
  dat_series_for_flag <- dat_all_prep1 %>%
    filter(MYEAR == last_year) %>%     
    distinct(PARAM, STATION_CODE, LATIN_NAME, TISSUE_NAME) %>%
    as.data.frame()
  
  # seq_len() (not 1:nrow) so that zero series gives an empty index instead of c(1, 0)
  series_index <- seq_len(nrow(dat_series_for_flag))
  dat_series_for_flag$Rowno <- series_index    # only for interactive checking
  
  if (FALSE){
    # Testing
    test2 <- lc_flag1_seriesno(1537, dat_series_for_flag, dat_all_prep1)
    test3 <- lc_flag2_seriesno(1537, dat_series_for_flag, dat_all_prep1)
    xtabs(~Rule2 + MYEAR, test3)
  }
  
  # SLOW! 9 minutes for flag1, 4 minutes for flag2 - but it works, at least
  dat_all_prep2 <- map_dfr(
    series_index, lc_flag1_seriesno,
    seriesdata = dat_series_for_flag,
    data = dat_all_prep1)
  
  dat_all_prep3 <- map_dfr(
    series_index, lc_flag2_seriesno,
    seriesdata = dat_series_for_flag,
    data = dat_all_prep2)
  
  # test4 <- dat_all_prep3 %>% filter(PARAM %in% "SN" & STATION_CODE %in% "80B")
  
  saveRDS(dat_all_prep3, fn_dat_all)
  
} else {
  
  # If file does exist, we just read it
  
  dat_all_prep3 <- readRDS(fn_dat_all)
  
}

```


### e. Make dat_series  

- One line per time series  

```{r}

# Versions of max() and min() that tolerate a zero-length x without warning:
# they return NA for zero-length input, otherwise max(x) / min(x).
# Plain if/else is used instead of ifelse(), since ifelse() on a scalar test
# strips attributes from the result (e.g. a Date would come back as a number).
max_warningless <- function(x) {
  if (length(x) == 0) NA else max(x)
}
min_warningless <- function(x) {
  if (length(x) == 0) NA else min(x)
}

# TEST
# sel <- c(F,F,T,T,F)
# max_warningless((1:5)[sel])
# sel <- rep(FALSE,5)
# max_warningless((1:5)[sel])

  
 # Step 1: one row per series and year (x), with number of measurements,
 # number/proportion of uncensored measurements, and the two rule flags
 dat_by_series_year <- dat_all_prep3 %>%
   group_by(PARAM, STATION_CODE, TISSUE_NAME, LATIN_NAME, x) %>%
   summarise(
     N = n(),
     N_over_LOQ = sum(uncensored == 1),
     P_over_LOQ = N_over_LOQ/N,
     Rule1 = first(Rule1),
     Rule2 = first(Rule2),
     .groups = "drop")

 # Step 2: one row per series. 'Rules_ok' marks years passing both Rule1 and Rule2;
 # it is only a helper column and is not part of the result
 dat_series_1 <- dat_by_series_year %>%
   mutate(Rules_ok = Rule1 & Rule2) %>%
   group_by(PARAM, STATION_CODE, TISSUE_NAME, LATIN_NAME) %>%
   summarise(
     First_year_overall = min(x),
     Last_year_overall = max(x),
     First_year = min_warningless(x[Rules_ok]),
     Last_year = max_warningless(x[Rules_ok]),
     N_years = length(unique(x[Rules_ok])),
     N_years_10yr = length(unique(x[Rules_ok & x >= (last_year-10)])),
     Years_over_LOQ = sum(Rules_ok & N_over_LOQ > 0),
     Last_year_over_LOQ = max_warningless(x[Rules_ok & N_over_LOQ > 0]),
     .groups = "drop")

 if (FALSE){
   # Interactive checks
   dat_series_1 %>% filter(PARAM %in% "CB28") %>% View()
   dat_all_prep3 %>% filter(PARAM %in% "CB28" & STATION_CODE %in% "98A2") %>% lc_plot()
 }
 
# Levels of 'Trend_model', ordered from simplest to most flexible
trend_model_levels <- c("No model", "Mean", "Linear",
                        "Smooth, k_max=3", "Smooth, k_max=4", "Smooth, k_max=5")

# Keep series with data in 'last_year' (or later) and choose the trend model
# from the number of years with data over LOQ
dat_series <- dat_series_1 %>%
  filter(Last_year_overall >= last_year) %>%
  mutate(
    Trend_model = case_when(
      Years_over_LOQ %in% 0:1 ~ "No model",
      Years_over_LOQ %in% 2 & N_years %in% 2 ~ "No model",
      Years_over_LOQ %in% 2:4 & N_years >= 3 ~ "Mean",
      Years_over_LOQ %in% 5:6 ~ "Linear",
      Years_over_LOQ %in% 7:9 ~ "Smooth, k_max=3",
      Years_over_LOQ %in% 10:14 ~ "Smooth, k_max=4",
      Years_over_LOQ >= 15 ~ "Smooth, k_max=5"),
    Trend_model = factor(Trend_model, levels = trend_model_levels),
    # k_max looked up by factor level: NA for "No model", 1 for "Mean",
    # 2 for "Linear", 3-5 for the smooths (NA if Trend_model is NA)
    k_max = c(NA, 1, 2, 3, 4, 5)[as.integer(Trend_model)]
  )

nrow(dat_series)

table(dat_series$Trend_model)
# xtabs(~Substance.Group + Trend_model,  dat_series)
  

```


### f. Statistics per substance  

- Selected substances  

```{r}

# Number of series per trend model for selected substances:
# one row per substance, one column per trend model (0 where there are no series).
# 'id_cols' is passed by name; pivot_wider() expects arguments after 'data' to be named.
dat_series %>%
  filter(PARAM %in% c("HG", "CD", "CB118", "BAP", "PFOS", "PFOA")) %>%
  count(PARAM, Trend_model) %>%
  pivot_wider(id_cols = PARAM, names_from = Trend_model, values_from = n,
              values_fill = 0, names_sort = TRUE)

```


### g. Last_year_over_LOQ  

```{r}

# Cross-table of series: Last_year (rows) against Last_year_over_LOQ (columns)
xtabs(~ Last_year + Last_year_over_LOQ, data = dat_series)

```

### h. Data series to run  

#### For first-time run  
```{r}

# Series for which a trend model will be fitted (everything except "No model")
dat_series_trend <- dat_series %>%
  filter(!Trend_model %in% "No model") #%>%
#  filter(PARAM %in% "HG")

# 'series_no' decides name of result file that will be (over)written  
# seq_len() instead of 1:nrow(), so an empty data frame gets an empty column
# rather than an error from assigning c(1, 0)
dat_series_trend$series_no <- seq_len(nrow(dat_series_trend))

```

#### For finishing run that has started, but was interrupted   

See APPENDIX 1 below


### i. Save series (and read saved files)   

```{r}

# Save the series tables in the input folder
fn_series <- paste0(folder_input, "/125_dat_series.rds")
fn_series_trend <- paste0(folder_input, "/125_dat_series_trend.rds")

saveRDS(dat_series, fn_series)
saveRDS(dat_series_trend, fn_series_trend)

if (FALSE){
  # Read back the saved data and series tables
  dat_all_prep3 <- readRDS(paste0(folder_input, "/125_dat_all_prep3.rds"))
  dat_series <- readRDS(fn_series)
  dat_series_trend <- readRDS(fn_series_trend)
  
  # If series lacking:
  # dat_series_trend <- readRDS(paste0(folder_input, "/125_dat_series_trend_lacking.rds"))

}

```


### x. add "new" "No model" series to 'dat_series' of results folder 7  
```{r}

if (FALSE){

  # ONE TIME ONLY:
  # add "new" "No model" series to 'dat_series' of results folder 7
  folder_input_5 <- "Data/125_results_2021_05_input"
  folder_input_7 <- "Data/125_results_2021_07_input"
  
  # Folder 5 series are all wet-weight basis; folder 7 is read as saved
  dat_series_5 <- paste0(folder_input_5, "/125_dat_series.rds") %>%
    readRDS() %>%
    mutate(Basis = "WW")
  dat_series_7 <- paste0(folder_input_7, "/125_dat_series.rds") %>%
    readRDS()

  # Backup of old version:  
  # file.copy(
  #   paste0(folder_input_7, "/125_dat_series.rds"),
  #   paste0(folder_input_7, "/125_dat_series_OLD.rds")
  # )
  
  # Series found in folder 5 but not in folder 7
  series_keys <- c("PARAM", "STATION_CODE", "TISSUE_NAME", "LATIN_NAME", "Basis")
  dat_series_to_add <- anti_join(dat_series_5, dat_series_7, by = series_keys)
  nrow(dat_series_to_add)

  # get and add Substance.Group (looked up by the columns shared with the lookup table)
  lookup_substgroup <- distinct(dat_series_7, PARAM, Substance.Group)
  dat_series_to_add <- left_join(dat_series_to_add, lookup_substgroup)

  nrow(dat_series_5)
  nrow(dat_series_to_add)
  nrow(dat_series_7)                           # 2393
  nrow(dat_series_to_add) + nrow(dat_series_7) # 3022 - larger than 'dat_series_5', as 'dat_series_7' also contains WWa for HG
  
  dat_series_7_new <- bind_rows(dat_series_7, dat_series_to_add)
  
  nrow(dat_series_7)
  nrow(dat_series_7_new)

  # Overwrite old file:  
  # ONE TIME ONLY!
  # saveRDS(dat_series_7_new, 
  #         paste0(folder_input_7, "/125_dat_series.rds"))

}


```


## 3. Test  

### Single series  

```{r}

# Interactive test of the model fitting for one series.
# Set 'param' and 'station_code' to the series to test. Series used earlier:
#   "AG" / "30A"
#   "HG" / "30B"
#   "PB" / "02B"
#   "MCCP eksl. LOQ" / "71B"
#   "CB101" / "30B"
#   "HG" / "36A"   (Example 1: MCMC stops - Compilation error on line 22.  Unknown variable zero)

# Fairly fast
param <- "CB52"
station_code <- "65A"

# Data for the selected series
data_prep <- dat_all_prep3 %>%
  filter(PARAM == param & STATION_CODE == station_code)

# Row number(s) of the series in 'dat_series_trend'
i <- with(dat_series_trend, which(PARAM == param & STATION_CODE == station_code))
i
dat_series_trend[i,]

lc_plot(data_prep)

last_year_over_LOQ <- dat_series %>%
  filter(PARAM == param & STATION_CODE == station_code) %>% # View
  pull(Last_year_over_LOQ)

# The 'k' values to try: from 1 up to the k_max set for the series
k_max <- dat_series %>%
  filter(PARAM == param & STATION_CODE == station_code) %>%
  pull(k_max)
k_values <- 1:k_max
# k_values <- 2:k_max

raftery <- FALSE

if (FALSE){
  
  # Test for a single 'k' value 
  
  # debugonce(leftcensored::lc_fixedsplines_tp)
  check <- leftcensored::lc_fixedsplines_tp(
    data = data_prep, 
    k = 2, 
    normalize = FALSE, raftery = raftery, measurement_error = "Uncertainty", 
    predict_x = seq(min(data_prep$x), max(data_prep$x), by = 0.25), 
    reference_x = last_year_over_LOQ, set_last_equal_x = last_year_over_LOQ)
  View(check$plot_data)
  
}

if (FALSE){
  
  # Test for all 'k' values ('k_values') 
  
  results_all <- purrr::map(
    k_values, 
    ~leftcensored::lc_fixedsplines_tp(
      data = data_prep, 
      k = .x, 
      normalize = FALSE, raftery = raftery, measurement_error = "Uncertainty", 
      predict_x = seq(min(data_prep$x), max(data_prep$x)), 
      reference_x = last_year_over_LOQ, set_last_equal_x = last_year_over_LOQ)
  )
  
  names(results_all) <- k_values
  str(results_all, 1) 
  str(results_all[[1]], 1) 
  str(results_all[[1]]$deviance, 1) 
  # Summed deviance per k. map() has no '.id' argument (that belongs to map_dfr());
  # given here it would be passed on as an extra extraction index, so it is left out
  dev <- map(results_all, "deviance") %>% map_dbl(sum) 
  # Drop in deviance for each step up in k, tested against chi-square with 1 df
  ddev <- -diff(dev)
  p_values <- 1-pchisq(ddev, 1)
  data.frame(k = names(dev), dev=dev, ddev = c(NA,ddev), p = c(NA,p_values))
  
}

# Plot the data together with the fitted models.
# 'results_all' is only created by the (disabled) block above, so the plot is
# skipped instead of failing when that block has not been run.
if (exists("results_all")) {
  lc_plot(data_prep, results = results_all, facet = "wrap")
} else {
  message("'results_all' not found - run the test for all 'k' values above first")
}

```

### get_splines_results_seriesno  

```{r, results='hide', message=FALSE, warning=FALSE}

# Test of get_splines_results_seriesno_s() for a single series.
# 'param' and 'station_code' are assigned twice; only the second pair
# ("HG" / "43B2") is in effect when get_seriesno() is called.

# Fairly fast
param <- "CB52"
# param <- "HG"
station_code <- "65A"

param <- "HG"
station_code <- "43B2"

# Look up the series in 'dat_series_trend'
# (presumably returns the series number - TODO confirm against the functions script)
get_seriesno(param, station_code, data_series = dat_series_trend)

# NOTE(review): the series number below (11) is hard-coded and is not taken from
# the get_seriesno() result above - check that it is the intended series
# debugonce(get_splines_results_seriesno)
get_splines_results_seriesno_s(11, 
                             data = dat_all_prep3, 
                             data_series = dat_series_trend, 
                             foldername = folder_results,  # test site 
                             raftery = FALSE)

```

### Check result  

```{r}

# Read the result file of one series; files are named "trend_" + 4-digit number + ".rda"
i <- 409
fn <- sprintf("trend_%04i.rda", i)
check <- paste0(folder_results, "/", fn) %>%
  readRDS()

# Top-level structure of the result
str(check, max.level = 1)

# Get the rows of 'data' that belong to one series.
#   seriesno:    row number of the series in 'data_series'
#   data:        data frame with columns PARAM, STATION_CODE, TISSUE_NAME, LATIN_NAME
#   data_series: data frame with one row per series and the same four columns
# Returns the matching rows of 'data', with all columns.
get_pointdata <- function(seriesno, data, data_series){
  in_series <-
    data$PARAM %in% data_series$PARAM[seriesno] &
    data$STATION_CODE %in% data_series$STATION_CODE[seriesno] &
    data$TISSUE_NAME %in% data_series$TISSUE_NAME[seriesno] &
    data$LATIN_NAME %in% data_series$LATIN_NAME[seriesno]
  data[in_series, , drop = FALSE]
}

# debugonce(get_pointdata)
# Observations of series number 'i'
df_points <- get_pointdata(i, data = dat_all_prep3, data_series = dat_series_trend)

# Fitted line with a ribbon between 'y_q2.5' and 'y_q97.5', observations as points
# and 'threshold' values as triangles (shape 6)
gg_check <- ggplot(check$plot_data, aes(x, y)) +
  geom_ribbon(aes(ymin = y_q2.5, ymax = y_q97.5), fill = "lightblue") +
  geom_point(data = df_points) +
  geom_point(data = df_points, aes(y = threshold), shape = 6) +
  geom_line()
gg_check

```


## 4. Analysing time trends and saving results    

- We save results as R files as we go, in case something goes wrong  

### Run analyses using doParallel  

```{r, results='hide'}

# One-time set-up of the parallel backend (disabled by default; run manually once)
if (FALSE){
# if (T){
  
  # Do only once
  future::availableCores()
  
  # 4 / 16 / 64 
  # NOTE(review): 'cl' is created but the backend is registered with 'cores = 4',
  # not with 'cl' - it looks like the cluster object itself is never used by
  # foreach; confirm which backend is intended before changing this
  cl <- makeCluster(4)
  registerDoParallel(cores = 4)
  
  # stopCluster(cl)
  
}

#
# Parallel 
#

# Series numbers to run
# NEW: seriesno = column in the 'dat_series' data
series_no <- dat_series_trend$series_no 
# series_no <- c(6,7,8)

range(series_no)
length(series_no) # 1507

# Note folder name!

# Warn if the results folder already contains files
check_contents <- dir(folder_results)

if (length(check_contents) > 0){
  warning("You are writing to a folder which already contains ", length(check_contents), " files. THESE MAY BE OVERWRITTEN.")
}

# Run analyses
# For following the number of files from the terminal, use
#   ls -lrt
#   find -newermt '2 hours ago' | wc -l
# The first lists files with the newest file last, the second counts the number of files younger than 2 hours

# One call of get_splines_results_seriesno_s() per series number, run via %dopar%;
# the elapsed time is printed afterwards
t0 <- Sys.time()
result <- foreach(i = series_no, 
                  .export = c("dat_all_prep3", "dat_series_trend")) %dopar%
                   get_splines_results_seriesno_s(i, 
                                 dat_all_prep3, dat_series_trend, foldername = folder_results,
                                 raftery = TRUE)
t1 <- Sys.time()
t1-t0
# 30 min for 100 on 16 cores  
# 4.8 hours for 1960 on 64 cores


```

### Run analyses without doParallel
```{r}

# Run series 15, 16 and 17 one after the other; 'result' keeps the last one
for (i in 15:17) {
  result <- get_splines_results_seriesno_s(
    i,
    data = dat_all_prep3,
    data_series = dat_series_trend,
    foldername = folder_results,  # test site
    raftery = TRUE
  )
}

# i = 14: HG	71B
# no non-missing arguments to max; returning -Inf>
# dat_series_trend[14,]

```


## 5. Check results  


### Read and quick check of files  

```{r}

# folder_results <- "Data/125_results_2021_06"

  # Paths of all files in the results folder, sorted
  fns <- sort(dir(folder_results, full.names = TRUE))
  length(fns)

  # The 4-digit file number starts 8 characters after the folder name
  # (i.e. after "/trend_"); it is used as the name of each path
  digits_start <- nchar(folder_results) + 8
  fileinfo_no <- substr(fns, digits_start, digits_start + 3)
  fns <- setNames(fns, fileinfo_no)

  # Read the content of every file (a list named by file number)
  result_list <- lapply(fns, readRDS)

  # Number of elements in each result, and how many results have each length
  result_length <- lengths(result_list)
  table(result_length)
  
```

### Extract data frame for "successes"  

- Meaning that fitting worked (DIC exists)

```{r}

# One row per result file, with flags for how far the fitting got:
#   jags_finished = the result contains 'k_values'
#   ok            = the result contains 'DIC'
jags_finished <- map_lgl(result_list, ~!is.null(.x$k_values))
ok <- map_lgl(result_list, ~!is.null(.x$DIC))

plotno <- map_dbl(result_list, "seriesno")
PARAM <- map_chr(result_list, "PARAM")
STATION_CODE <- map_chr(result_list, "STATION_CODE")
TISSUE_NAME <- map_chr(result_list, "TISSUE_NAME")
LATIN_NAME <- map_chr(result_list, "LATIN_NAME")
k_max <- map_chr(result_list, "k_max")

# 'k_sel' is only extracted for the successful fits. The vector is preallocated
# to one NA per result, instead of growing a single NA through logical indexing,
# so it always has the same length as the other columns
k_sel <- rep(NA_integer_, length(ok))
k_sel[ok] <- map_int(result_list[ok], "k_sel")

dat_success <- data.frame(ok, jags_finished, plotno, PARAM, STATION_CODE, TISSUE_NAME, LATIN_NAME, k_max, k_sel)

xtabs(~jags_finished + ok, dat_success)
xtabs(~k_max + ok, dat_success)

```


### Plot a single series  
```{r}

# Call tsplot_seriesno() for series number 58, using the files in 'folder_results'
# (presumably plots the fitted time series - TODO confirm against the functions script)
# NOTE(review): the call is repeated verbatim - looks unintentional; confirm before removing
tsplot_seriesno(58, folder = folder_results)
tsplot_seriesno(58, folder = folder_results)

```

### Check one group of compounds  

```{r}

# Series numbers of the successfully fitted "MCCP eksl. LOQ" series
pno <- dat_success %>%
  filter(PARAM == "MCCP eksl. LOQ", ok) %>%
  pull(plotno)

# Model fit and raw data for these series, stacked into one data frame each
df_modelfit <- map_dfr(pno, extract_modelfit_data, folder = folder_results)
df_rawdata <- map_dfr(pno, extract_raw_data)

# Rows with a value (plotted as points) and rows with a threshold (plotted as triangles)
df_points <- filter(df_rawdata, !is.na(y))
df_thresholds <- filter(df_rawdata, !is.na(threshold))

# One panel per station: fitted line with ribbon between 'y_q2.5' and 'y_q97.5'
gg_group <- ggplot(df_modelfit, aes(x, y)) +
  geom_ribbon(aes(ymin = y_q2.5, ymax = y_q97.5), fill = "lightblue") +
  geom_point(data = df_points) +
  geom_point(data = df_thresholds, aes(y = threshold), shape = 6) +
  geom_line() +
  facet_wrap(vars(STATION_CODE), scales = "free_y")
gg_group

```

## 6. Check Rule1 and Rule2   

- Rule 1. Time series should be truncated from the left until Nplus/N >= 0.5      
- Rule 2. If a linear/smooth trend is fitted, the first year must be non-censored     
- We forgot to take these rules into account when the estimations were run  

```{r}

# Columns identifying a series
series_keys <- c("PARAM", "STATION_CODE", "TISSUE_NAME", "LATIN_NAME")

# Value of Rule1 in the first row of each series
df_rule1 <- dat_all_prep3 %>%
  group_by(across(all_of(series_keys))) %>%
  summarise(Rule1_first = first(Rule1))
table(df_rule1$Rule1_first)

# Value of Rule2 in the first row of each series
df_rule2 <- dat_all_prep3 %>%
  group_by(across(all_of(series_keys))) %>%
  summarise(Rule2_first = first(Rule2))
table(df_rule2$Rule2_first)

# Add both to the series that were run
# (note that this replaces the 'dat_success' made in part 5)
dat_success <- dat_series_trend %>%
  left_join(df_rule1) %>%
  left_join(df_rule2)

table(dat_success$Rule1_first)  # 6   - should be run again
table(dat_success$Rule2_first)  # 20  - should be run again

```

## APPENDIX 1

#### For finishing run that has started, but was interrupted  
```{r}

if (FALSE){
  
  # FIRST, RE-MAKE 'dat_series_trend'  
  # THEN:

  # File names and numbers  
  fns <- dir(folder_results, full.names = TRUE) %>% sort()
  length(fns)
  fileinfo_no <- substr(fns, nchar(folder_results) + 8, nchar(folder_results) + 11)
  names(fns) <- fileinfo_no 
  
  # Read file content
  result_list <-lapply(fns, readRDS)
  
  result_length <-  map_int(result_list, length)
  table(result_length)
  #  5    7   20 
  # 38  405 1233 

  if (FALSE){
    # Examples
    i <- which(result_length == 5)[1]  
    str(result_list[[i]], 1)
    i <- which(result_length == 7)[1]  
    str(result_list[[i]], 1)
    i <- which(result_length == 20)[1]  
    str(result_list[[i]], 1)
  }
  
  # Series to run (again): result files with only 5 elements, plus series without a file.
  # Both are kept as series numbers. (which() was used for the second one, giving row
  # positions, which only equal series numbers when series_no is 1, 2, ..., nrow.)
  serno_done <- as.numeric(fileinfo_no)
  serno_incomplete <- serno_done[result_length == 5]
  serno_lacking <- dat_series_trend$series_no[!dat_series_trend$series_no %in% serno_done]
  serno_torun <- c(serno_incomplete, serno_lacking)
  
  # Update dat_series_trend: pick rows by matching on 'series_no',
  # in the order given by 'serno_torun'
  nrow(dat_series_trend)
  dat_series_trend <- dat_series_trend[match(serno_torun, dat_series_trend$series_no),]
  nrow(dat_series_trend)
  
  # DON'T save dat_series_trend (i.e.. don't run 2.i)
  
  # GO DIRECTLY TO 4

}

```

## APPENDIX 2 - Combine WW and WWa runs

- folder 07  
- This will be used for plotting etc.  
- combine files from folders 05 (WW) and 06 (WWa)  
- As we didn't include 'Basis' anywhere, some changes need to be made  

```{r}

# folder_results <- "Data/125_results_2021_05"          # WW (wet-weight), all parameters  
# folder_results <- "Data/125_results_2021_06"          # WWa (wet-weight length-adjusted), HG only


#
# 1. Made folder 'Data/125_results_2021_07'  
#

#
# 2. Set these
#

# Results folder for the combined WW + WWa results, and its input folder
folder_results <- "Data/125_results_2021_07"        # length-adjusted HG
folder_input <- paste0(folder_results, "_input")

# Create whichever of the two folders does not exist yet
for (folder in c(folder_results, folder_input)) {
  if (!dir.exists(folder)) {
    dir.create(folder)
  }
}


#
# Parts 3-5: Read files from folder 06 (WWa), modify them, and write to folder 07
#

# Modification:
# 1. add 3000 to file names, so the numbers will not overlap with the files in 05 (the WW files) that has numbers to a bit over 2000
# 2. add 3000 to 'seriesno' inside result files (list element number 1) 
# 3. Add Basis = 'WWa' as list element after 'LATIN_NAME' (which is element no. 5)

# Number added to the file numbers and series numbers of the WWa results
add_constant <- 3000

#
# 3. Make new file names/paths
#

# Original folder and file names  
folder_results_orig <- "Data/125_results_2021_06"   
fns_orig <- sort(dir(folder_results_orig, full.names = TRUE))
fns_orig

# Original file number: the 4 digits starting 8 characters after the folder name
digits_start <- nchar(folder_results_orig) + 8
fileinfo_no_orig <- substr(fns_orig, digits_start, digits_start + 3)

# New file names: file number increased by 'add_constant', placed in the new folder
fns_new <- paste0(
  folder_results, "/",
  sprintf("trend_%04i.rda", as.numeric(fileinfo_no_orig) + add_constant)
)

# Check
# 2 things differ: folder (06 vs 07), and file number  
cbind(fns_orig[1:3], fns_new[1:3])

#
# 4. Read all original files  
#

# Read file contents of all files  
result_list_orig <- lapply(fns_orig, readRDS)

# Check structure of files no. 1 and 5
str(result_list_orig[[1]], max.level = 1)
str(result_list_orig[[5]], max.level = 1)

#
# 5. Make modifications 2-3 and save   
#

# Add 'Basis' and change 'seriesno' in all results, and save new versions
# - must only be done once (that goes for several of these steps)
for (i in seq_along(fns_orig)){
  X <- result_list_orig[[i]]
  # Modification 3: Add Basis after list element 5.
  # append() is used instead of c(X[1:5], ..., X[6:length(X)]), as seq(6, length(X))
  # runs backwards (6, 5) for a result with only 5 elements, which would add an
  # empty element and a duplicate of element 5
  X2 <- append(X, list(Basis = "WWa"), after = 5)
  # Modification 2: Increase seriesno by 3000
  X2$seriesno <- X2$seriesno + add_constant
  # Save to the new file name/path
  saveRDS(X2, fns_new[i])      
}


# Checks
fns_07 <- dir(folder_results, full.names = TRUE)
fns_07_no <- substr(fns_07, nchar(folder_results_orig) + 8, nchar(folder_results_orig) + 11) %>% as.numeric()
fns_07[fns_07_no >= 3000]

result_list_new <-lapply(fns_07[fns_07_no >= 3000], readRDS)

str(result_list_new[[1]], 1)
str(result_list_new[[5]], 1)


#
# Parts 6-8: Read files from folder 05 (WW), modify them, and write to folder 07
#

# Only modification number 3: Add Basis = 'WW' as list element after 'LATIN_NAME' (which is element no. 5)

#
# 6. Make new file names/paths
#
#
# Changes from part3 are commented above
#

# Original folder and file names  
folder_results_orig <- "Data/125_results_2021_05"   
fns_orig <- sort(dir(folder_results_orig, full.names = TRUE))
head(fns_orig)

# Original file number: the 4 digits starting 8 characters after the folder name
digits_start <- nchar(folder_results_orig) + 8
fileinfo_no_orig <- substr(fns_orig, digits_start, digits_start + 3)

# New file names: same file number, new folder (no 'add_constant' here, cf. part 3 above)
fns_new <- paste0(
  folder_results, "/",
  sprintf("trend_%04i.rda", as.numeric(fileinfo_no_orig))
)
head(fns_new)

# Check
# 1 thing differs: folder (05 vs 07) - cf. part 3 above
cbind(fns_orig[1:3], fns_new[1:3])

#
# 7. Read all original files
#
# Same as part 4 above
#

# Read file contents of all files  
result_list_orig <- lapply(fns_orig, readRDS)

# Check file no. 1
str(result_list_orig[[1]], max.level = 1)

#
# 8. Make modification (just add Basis) and save   
#

# Add 'Basis' to all results and save new versions ('seriesno' is not changed here)
# - must only be done once (that goes for several of these steps)
for (i in seq_along(fns_orig)){
  X <- result_list_orig[[i]]
  # Modification: Add Basis after list element 5 (cf. part 5 above).
  # append() is used instead of c(X[1:5], ..., X[6:length(X)]), as seq(6, length(X))
  # runs backwards (6, 5) for a result with only 5 elements, which would add an
  # empty element and a duplicate of element 5
  X2 <- append(X, list(Basis = "WW"), after = 5)
  # Modification 2: Increase seriesno by 3000 - not needed (cf. part 5 above)
  # Save to the new file name/path
  saveRDS(X2, fns_new[i])      
}


#
# 9. Combine 'series_trend' files for 'input'   
#

# Read
# Read the 'series_trend' tables of the two runs, adding Basis to each
dat_series_trend_1 <- "Data/125_results_2021_05_input/125_dat_series_trend.rds" %>%  # WW
  readRDS()
dat_series_trend_2 <- "Data/125_results_2021_06_input/125_dat_series_trend.rds" %>%  # WWa
  readRDS()

# Check the series numbers of each run
range(dat_series_trend_1$series_no)
range(dat_series_trend_2$series_no)

# For the second (WWa) file, 'add_constant' (3000) is added to series_no
dat_series_trend_ww <- mutate(dat_series_trend_1, Basis = "WW")
dat_series_trend_wwa <- mutate(dat_series_trend_2,
                               Basis = "WWa",
                               series_no = series_no + add_constant)

# Combine
dat_series_trend <- bind_rows(dat_series_trend_ww, dat_series_trend_wwa)

# Save
saveRDS(dat_series_trend, "Data/125_results_2021_07_input/125_dat_series_trend.rds")  # WW + WWa

# Check that series number and STATION_CODE in series file is the same as in results files
# (for the WWa series, i.e. series number >= 3000)
# Check - series file
check_seriesfile <- readRDS("Data/125_results_2021_07_input/125_dat_series_trend.rds") %>%
  filter(series_no >= 3000) %>%
  rename(series_no_seriesfile = series_no) %>%
  select(series_no_seriesfile, STATION_CODE)
# Check - results file
fns_07 <- dir(folder_results, full.names = TRUE)
# The file number is found from the length of the folder the files were listed from
# ('folder_results'). 'folder_results_orig' was used before, which only worked
# because the folder names have the same length
fns_07_no <- substr(fns_07, nchar(folder_results) + 8, nchar(folder_results) + 11) %>% as.numeric()
fns_07_sel <- fns_07[fns_07_no >= 3000]
result_list_new <- lapply(fns_07_sel, readRDS)
result_seriesno <- map_dbl(result_list_new, "seriesno")
result_station <- map_chr(result_list_new, "STATION_CODE")
check_resultfiles <- data.frame(fns_07_sel, result_seriesno, result_station) 
# Columns: 1-2 from the series file, 3-5 from the result files, combined row by row
check <- bind_cols(check_seriesfile, check_resultfiles) 
# Visual check
check %>% View("Check series vs results")
# Formal check: every row must match (a mismatch or a missing value stops the script)
if (!isTRUE(all(check[[1]] == check[[4]])))
  stop("Series numbers not equal")
if (!isTRUE(all(check[[2]] == check[[5]])))
  stop("Station codes not equal")

#
# 10. Combine 'series' files  
#

# Needed for script 126

# Read
# Read the 'series' tables of the two runs
dat_series_1 <- "Data/125_results_2021_05_input/125_dat_series.rds" %>% readRDS()  # WW
dat_series_2 <- "Data/125_results_2021_06_input/125_dat_series.rds" %>% readRDS()  # WWa

# These files have no 'series_no' column, so only 'Basis' is added before combining
dat_series <- bind_rows(
  mutate(dat_series_1, Basis = "WW"),
  mutate(dat_series_2, Basis = "WWa")
)

# Save
saveRDS(dat_series, "Data/125_results_2021_07_input/125_dat_series.rds")  # WW + WWa


#
# 11. Combine raw data files
#

# Need to add 'Basis'  

dat_1 <- "Data/125_results_2021_05_input/125_dat_all_prep3.rds" %>% readRDS()  # WW
dat_2 <- "Data/125_results_2021_06_input/125_dat_all_prep3.rds" %>% readRDS()  # WWa

# Combine
dat_all_prep3 <- bind_rows(
  mutate(dat_1, Basis = "WW"),
  mutate(dat_2, Basis = "WWa")
)

# Save
saveRDS(dat_all_prep3, "Data/125_results_2021_07_input/125_dat_all_prep3.rds")  # WW + WWa


```