This repository has been archived by the owner on May 28, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
_targets.R
140 lines (113 loc) · 5.54 KB
/
_targets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
library(targets)
# Set the `tidyverse.quiet` option before the tidyverse is attached.
options(tidyverse.quiet = TRUE)

# Register the packages used by the pipeline with targets.
tar_option_set(
  packages = c(
    "tidyverse", "lubridate", "rmarkdown", "knitr",
    "dataRetrieval", "nhdplusTools", "sbtools",
    "leaflet", "sf", "USAboundaries", "cowplot",
    "ggspatial", "patchwork", "streamMetabolizer",
    "reticulate", "yaml"
  )
)
# Source the per-phase scripts, in order. The target lists combined at the end
# of this file are presumably defined by these scripts.
invisible(lapply(
  c("1_fetch.R", "2_process.R", "2a_model.R", "3_visualize.R"),
  source
))
# Create the output and log directories used by the pipeline phases.
# `recursive = TRUE` also creates any missing parent directory, so a directory
# is not silently skipped when its parent folder is absent.
# `showWarnings = FALSE` keeps re-runs quiet when a directory already exists.
invisible(lapply(
  c(
    "1_fetch/out/",
    "1_fetch/log/",
    "2_process/out/",
    "2a_model/out/",
    "2_process/log/",
    "3_visualize/out/",
    "3_visualize/out/nhdv2_attr_png/",
    "3_visualize/log/"
  ),
  dir.create,
  showWarnings = FALSE,
  recursive = TRUE
))
# 1) CONFIGURE DATA PIPELINE INPUTS/VARIABLES

# USGS parameter codes of interest:
#   00300 = "dissolved oxygen, in milligrams per liter"
pcode_select <- "00300"

# Minor HUCs (hydrologic unit codes) that make up the DRB.
# Lower Delaware: 020402 accounting code.
drb_huc8s <- c(
  "02040201", "02040202", "02040203", "02040204",
  "02040205", "02040206", "02040207"
)

# USGS site types for which NWIS data are downloaded
# (https://maps.waterdata.usgs.gov/mapper/help/sitetype.html)
site_tp_select <- c("ST", "ST-CA", "SP")

# NWIS sites to leave out.
# Sites 01412350 and 01484272 are coded as site type "ST" but appear to be
# tidally-influenced.
# NOTE(review): the reason for omitting the remaining sites is not stated here.
omit_nwis_sites <- c(
  "01412350", "01484272", "01477050",
  "01467200", "014670261", "01464600"
)

# USGS stat codes for continuous sites that only report daily statistics
# (https://help.waterdata.usgs.gov/stat_code)
stat_cd_select <- c("00001", "00002", "00003")

# Earliest startDate and latest endDate for NWIS data retrievals.
earliest_date <- "1979-10-01"
latest_date <- "2021-10-01"

# Minimum number of unique observation-days a site needs in order to be
# considered "well-observed" and therefore included in the model.
# If this value is changed from 100, the train/test model splits may need to
# be reconsidered.
min_obs_days <- 100

# Change this dummy date to force a re-build of the NWIS DO sites and data
# download.
dummy_date <- "2023-03-02"
# 2) CONFIGURE MODEL INPUTS/VARIABLES

# Validation sites, kept as separate urban and non-urban groups.
validation_sites_urban <- c("01475530", "01475548")
validation_sites_nonurban <- c(
  "01472104", "014721254", "014721259", "01473500",
  "01480617", "01480870", "01481000", "01481500"
)

# Global input variables for the "baseline" deep learning model.
x_vars_global <- c(
  "tmmn", "tmmx", "pr", "srad", "SLOPE", "TOTDASQKM",
  "CAT_BASIN_SLOPE", "TOT_BASIN_SLOPE", "CAT_ELEV_MEAN",
  "CAT_RDX", "CAT_BFI", "CAT_EWT", "CAT_TWI",
  "CAT_PPT7100_ANN", "TOT_PPT7100_ANN", "CAT_RUN7100",
  "CAT_CNPY11_BUFF100", "CAT_IMPV11", "TOT_IMPV11",
  "CAT_NLCD11_wetland", "TOT_NLCD11_wetland",
  "CAT_SANDAVE", "CAT_PERMAVE", "TOT_PERMAVE",
  "CAT_RFACT", "CAT_WTDEP", "TOT_WTDEP",
  "CAT_NPDES_MAJ", "CAT_NDAMS2010", "CAT_NORM_STORAGE2010"
)

# Model parameters gathered into a single list, which is used to write a base
# model config file for the snakemake modeling workflow.
base_config_options <- list(
  out_dir = "../../../out/models",
  # Random seed for training: FALSE means no seed; otherwise give the seed.
  seed = FALSE,
  num_replicates = 10,
  trn_offset = 1,
  tst_val_offset = 1,
  epochs = 100,
  hidden_size = 10,
  dropout = 0.2,
  recurrent_dropout = 0.2,
  finetune_learning_rate = 0.01,
  early_stopping = FALSE,
  # Train/val/test split information, using the validation sites defined above.
  validation_sites_urban = validation_sites_urban,
  validation_sites_nonurban = validation_sites_nonurban,
  train_start_date = "2007-10-01",
  train_end_date_temporal_holdout = "2015-10-01",
  train_end_date_spatial_holdout = "2021-10-01",
  val_start_date_temporal_holdout = "2015-10-01",
  val_start_date_spatial_holdout = NULL,
  val_end_date_temporal_holdout = "2021-10-01",
  val_end_date_spatial_holdout = NULL,
  x_vars = x_vars_global
)
# Per-model configuration lists. To use different input variables for a model,
# add `x_vars = [vector of attribute names]` to its list below; that entry
# will override `x_vars_global` in `base_config_options`.
# Each list pairs `y_vars` with a `lambdas` vector of the same length
# (presumably one weight per y variable, matched by position -- TODO confirm).

# Model 0: inputs for the "baseline" deep learning model.
model_config_options <- list(
  y_vars = c("do_min", "do_mean", "do_max"),
  lambdas = rep(1, 3)
)

# Model 1: inputs for the metab_multitask model.
metab_multitask_config_options <- list(
  y_vars = c(
    "do_min", "do_mean", "do_max",
    "GPP", "ER", "K600", "depth", "temp.water"
  ),
  lambdas = rep(1, 8)
)

# Model 1a: inputs for the 1a_metab_multitask model
# (the last three lambdas are 0).
metab_1a_multitask_config_options <- list(
  y_vars = c(
    "do_min", "do_mean", "do_max",
    "GPP", "ER", "K600", "depth", "temp.water"
  ),
  lambdas = c(rep(1, 5), rep(0, 3))
)

# Model 1b: inputs for the 1b_metab_multitask model
# (the last four lambdas are 0).
metab_1b_multitask_config_options <- list(
  y_vars = c(
    "do_min", "do_mean", "do_max",
    "GPP", "ER", "K600", "depth", "temp.water"
  ),
  lambdas = c(rep(1, 4), rep(0, 4))
)

# Model 2: inputs for the metab_dense model.
multitask_dense_config_options <- list(
  y_vars = c(
    "do_min", "do_mean", "do_max",
    "GPP", "ER", "K600", "depth", "temp.water"
  ),
  lambdas = rep(1, 8)
)
# Return the complete list of targets: the per-phase target lists (presumably
# defined by the phase scripts sourced earlier in this file) concatenated in
# phase order.
c(
  p1_targets_list,
  p2_targets_list,
  p2a_targets_list,
  p3_targets_list
)