From 7e7064569f4c352817757f00d96c3b99a528002e Mon Sep 17 00:00:00 2001
From: holdenharris-NOAA <holden.harris@noaa.gov>
Date: Fri, 10 Nov 2023 15:28:37 -0500
Subject: [PATCH] First commit. Code works for one Ecospace scenario.

---
 Comparison-objective-function.R | 202 ++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 Comparison-objective-function.R

diff --git a/Comparison-objective-function.R b/Comparison-objective-function.R
new file mode 100644
index 0000000..2980d96
--- /dev/null
+++ b/Comparison-objective-function.R
@@ -0,0 +1,202 @@
+## Naming conventions
+## 'sim' --> Related to Ecosim
+## 'spa' --> Related to Ecospace
+## 'obs' --> Related to observed timeseries data, i.e., that Ecosim was fitted to. 
+## 'B'   --> Denotes biomass
+## 'C'   --> Denotes catch
+
+## Setup -----------------------------------------------------------------------
+rm(list=ls())
+source("./functions.R") ## Pull in functions
+library(dplyr)
+
+## Input set up ----------------------------------------------------------------
+ewe_name     = "EwE_Outputs"
+sim_scenario = "sim-spa_01"
+obs_TS_name  = "TS_updated_IB13"
+srt_year     = 1980
+spa_scenarios  = c("spa_00", "spa_01", "spa_02_MOM6-ISIMIP3a", "spa_03_MOM6-ISIMIP3a_PP-phyc-vint")
+spa_scen_names = c("01 Base", "02 MODIS-ChlA", "03 MOM6-ChlA", "04 MOM6-Vint")
+
+## User-defined output parameters ----------------------------------------------
+today_date <- format(Sys.Date(), "%Y-%m-%d")
+out_file_notes = "multispa"
+plot_name_xY = paste0("B_scaled_xY_", out_file_notes)
+#plot_name_xM = paste0("B_scaled_xM_", out_file_notes)
+
+## Set scaling parameters 
+#init_years_toscale = 5
+init_years_toscale  = 2016-1980 + 1 ## 36. This will scale all outputs to the global average
+init_months_toscale = init_years_toscale * 12
+
+## -----------------------------------------------------------------------------
+##
+## Read-in ANNUAL Observed, Ecosim, and Ecospace TS
+dir_sim = paste0("./", ewe_name, "/ecosim_", sim_scenario, "/")
+
+## Read-in Ecosim annual biomass 
+filename = paste0(dir_sim, "biomass_annual.csv")
+num_skip_sim = f.find_start_line(filename, flag = srt_year)
+
+simB_xY <- read.csv(paste0(dir_sim, "biomass_annual.csv"), skip = num_skip_sim)
+years = simB_xY$year.group ## Get date range from Ecosim
+simB_xY$year.group = NULL
+
+## Read-in Ecosim annual catches
+simC_xY <- read.csv(paste0(dir_sim, "catch_annual.csv"), skip = num_skip_sim)
+simC_xY$year.group = NULL
+
+## Read-in Ecospace annual biomass and catches ---------------------------------
+ls_spaB_xY <- list()
+ls_spaC_xY <- list()
+for (i in 1:length(spa_scenarios)) {
+  dir_spa = paste0("./", ewe_name, "/", spa_scenarios[i], "/")
+  filename <- paste0(dir_spa, "Ecospace_Annual_Average_Biomass.csv")
+  num_skip_spa <- f.find_start_line(filename, flag = "Year")
+  spaB_xY <- read.csv(paste0(dir_spa, "Ecospace_Annual_Average_Biomass.csv"), 
+                      skip = num_skip_spa, header = TRUE)
+  spaB_xY$Year = NULL
+  
+  ## Standardize FG names ------------------------------
+  fg_names = f.standardize_group_names(colnames(spaB_xY))
+  num_fg = length(fg_names)
+  fg_df <- data.frame(
+    pool_code  = 1:num_fg,
+    group_name = paste(sprintf("%02d", 1:num_fg),
+                       gsub("_", " ", fg_names))
+  )
+  
+  ## Set row and column names
+  rownames(spaB_xY) = rownames(simB_xY) = years
+  colnames(spaB_xY) = colnames(simB_xY) = fg_df$group_name
+  
+  ## Add current spaB_xY reading into the list object ----
+  ls_spaB_xY[[i]] <- spaB_xY
+  
+  ## Read-in Ecospace annual catch
+  spaC_xY <- read.csv(paste0(dir_spa, "Ecospace_Annual_Average_Catch.csv"), 
+                      skip = num_skip_spa, header = TRUE)
+  spaC_xY$Year = NULL
+  ls_spaC_xY[[i]] <- spaC_xY
+}
+
+## -----------------------------------------------------------------------------
+## Prepare months and date series objects
+start_y <- min(years)
+end_y   <- max(years)
+date_series <- seq(as.Date(paste0(start_y, "-01-01")), as.Date(paste0(end_y,   "-12-01")), by = "1 month")
+year_series <- seq(as.Date(paste0(start_y, "-01-01")), as.Date(paste0(end_y,   "-12-01")), by = "1 year")
+ym_series <- format(date_series, "%Y-%m")
+
+## Read in MONTHLY biomasses
+simB_xM <- read.csv(paste0(dir_sim, "biomass_monthly.csv"), skip = num_skip_sim); simB_xM$timestep.group = NULL
+simC_xM <- read.csv(paste0(dir_sim, "catch_monthly.csv"), skip = num_skip_sim); simC_xM$timestep.group = NULL
+spaB_xM <- read.csv(paste0(dir_spa, "Ecospace_Average_Biomass.csv"), skip = num_skip_spa, header = TRUE); spaB_xM$TimeStep = NULL
+rownames(spaB_xM) = rownames(simB_xM) = ym_series
+
+## Read in "observed" timeseries -----------------------------------------------
+dir_obs = paste0("./", ewe_name, "/", obs_TS_name, ".csv")
+obs.list = f.read_ecosim_timeseries(dir_obs, num_row_header = 4)
+for(i in 1:length(obs.list)){assign(names(obs.list)[i],obs.list[[i]])} #make separate dataframe for each list element
+
+obsB.head <- merge(obsB.head, fg_df, by = "pool_code", all.x = TRUE)
+obsC.head <- merge(obsC.head, fg_df, by = "pool_code", all.x = TRUE)
+
+colnames(obsB) = obsB.head$group_name
+colnames(obsC) = obsC.head$group_name
+
+## -----------------------------------------------------------------------------
+##
+## 
+## 
+
+## -----------------------------------------------------------------------------
+## 
+
+init_years_toscale = 5
+
+fit_metrics <- data.frame(
+  nll_spa_obs = NA, nll_sim_obs = NA, nll_spa_sim = NA,
+  pb_spa_obs = NA, pb_sim_obs = NA, pb_spa_sim = NA,
+  mae_spa_obs = NA, mae_sim_obs = NA, mae_spa_sim = NA)
+
+for(i in 1:num_fg){
+  # i = 6
+  ## Get biomass for individual FG: Observed, Ecosim, and Ecospace
+  (grp  = fg_df$group_name[i])
+  simB = simB_xY[,i] 
+  spaB_ls <- lapply(ls_spaB_xY, function(df) df[, i]) ## Extract the i column from each data frame in the list
+  
+  ## Check to see if observed data is available
+  obsB_scaled=NULL
+  if(i %in% obsB.head$pool_code){
+    obs.idx     = which(obsB.head$pool_code==i)
+    obs_df      = suppressWarnings( ## Suppress warnings thrown when obs not available
+      data.frame(year_series, obsB = as.numeric(obsB[ ,obs.idx]))
+    )
+    non_na_obsB = obs_df$obsB[!is.na(obs_df$obsB)] # Extract non-NA values from obs_df$obsB
+    if_else (length(non_na_obsB) < init_years_toscale,
+             years_to_scale <- length(non_na_obsB),
+             years_to_scale <- init_years_toscale)
+    mean_init_years = mean(non_na_obsB[1:years_to_scale]) # Calculate the mean of the first 'init_years_toscale' non-NA values
+    obsB_scaled = obs_df$obsB / mean_init_years # Scale the entire obs_df$obsB by this mean
+  } else obsB_scaled=rep(NA, length(simB))  
+  
+  ## Scale to the average of a given timeframe
+  simB_scaled = simB / mean(simB[1:init_years_toscale], na.rm = TRUE) ## Ecosim scaled
+  spaB_scaled_ls = list() ## List of Ecospace scaled
+  for(j in 1:length(spa_scenarios)){
+    spaB               <- spaB_ls[[j]]
+    spaB_scaled        <- spaB / mean(spaB[1:init_years_toscale], na.rm = TRUE)
+    spaB_scaled_ls[[j]] <- spaB_scaled
+  }
+  
+  ## Create data frame to compare observed, Ecosim, and Ecospace
+  comp_df <- data.frame(obs = obsB_scaled, sim = simB_scaled, spa = spaB_scaled)
+  
+  ## Calculate log-liklihood ---------------------------------------------------
+  # Calculate log-likelihood for spa vs obs
+  resids <- comp_df$obs - comp_df$spa
+  nll_spa_obs <- -(-length(comp_df$obs)/2 * log(2*pi*var(resids, na.rm=TRUE)) - 1/(2*var(resids, na.rm=TRUE)) * sum(resids^2, na.rm=TRUE))
+  
+  # Calculate log-likelihood for sim vs obs
+  resids <- comp_df$obs - comp_df$sim
+  nll_sim_obs <- -(-length(comp_df$obs)/2 * log(2*pi*var(resids, na.rm=TRUE)) - 1/(2*var(resids, na.rm=TRUE)) * sum(resids^2, na.rm=TRUE))
+  
+  # Calculate log-likelihood for spa vs sim
+  resids <- comp_df$sim - comp_df$spa
+  nll_spa_sim <- -(-length(comp_df$sim)/2 * log(2*pi*var(resids, na.rm=TRUE)) - 1/(2*var(resids, na.rm=TRUE)) * sum(resids^2, na.rm=TRUE))
+  
+  ## Calculate percent bias
+  ## Note: This measure of percent bias aggregates all prediction errors, 
+  ## both positive and negative, into a single number. This can mask the 
+  ## variability of the errors across different observations.
+  pbi_spa_obs <- 100 * (sum(comp_df$spa - comp_df$obs, na.rm=TRUE) / sum(comp_df$obs,na.rm=TRUE))
+  pbi_sim_obs <- 100 * (sum(comp_df$sim - comp_df$obs, na.rm=TRUE) / sum(comp_df$obs,na.rm=TRUE))
+  pbi_spa_sim <- 100 * (sum(comp_df$spa - comp_df$sim, na.rm=TRUE) / sum(comp_df$sim,na.rm=TRUE))
+  
+  ## Calculate mean absolute error (MAE) ---------------------------------------
+  ## Average magnitude of errors between the predictions and observations, 
+  ## treating all errors with equal weight regardless of their size.
+  mae_spa_obs = mean(abs(comp_df$spa - comp_df$obs), na.rm = TRUE)
+  mae_sim_obs = mean(abs(comp_df$sim - comp_df$obs), na.rm = TRUE)
+  mae_spa_sim = mean(abs(comp_df$spa - comp_df$sim), na.rm = TRUE)
+  
+  ## Calculate root mean absolute error (RMSE) ---------------------------------
+  rmse_spa_obs <- sqrt(mean((comp_df$spa - comp_df$obs)^2, na.rm = TRUE))
+  rmse_sim_obs <- sqrt(mean((comp_df$sim - comp_df$obs)^2, na.rm = TRUE))
+  rmse_spa_sim <- sqrt(mean((comp_df$spa - comp_df$sim)^2, na.rm = TRUE))
+  
+  ## Store calculations
+  fit <-  c(nll_spa_obs, nll_sim_obs, nll_spa_sim, 
+            pbi_spa_obs, pbi_sim_obs, pbi_spa_sim,
+            mae_spa_obs, mae_sim_obs, mae_spa_sim); fit
+  
+  fit_metrics[i,] <- fit
+}
+rownames(fit_metrics) = fg_df$group_name
+fit_metrics = round(fit_metrics, 2)
+(fit_sums <- colSums(fit_metrics, na.rm = TRUE))
+
+  ## 
+  
\ No newline at end of file