-
Notifications
You must be signed in to change notification settings - Fork 1
/
111_Nstring_SD_DDI.Rmd
585 lines (458 loc) · 17.9 KB
/
111_Nstring_SD_DDI.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
---
title: "111_Nstring_and_SD"
output:
html_document:
keep_md: true
toc: true
---
Creates information needed for the creation of the big Excel file:
* 'N_string' for last year and current year
- "8 (6-3)" = 8 samples in total, 6 of them are pooled, max. individuals per pooled sample = 3
* 'SD' (standard deviation) for last year and current year
* 'D.d.i.' = detectable data information = "N>LOQ [min - maks]"
- Example: 7 [0.11 - 0.29] = 7 values >LOQ, these measurements varied from 0.11 to 0.29
As in script 110, some data series are 'homogenized'
* *Input:* Uses the same input file as script 110, and in a way works in the same way (summarises sample-level data into one line per year/station/tissue/parameter)
- Uses labware data in addition
* *Output* to three different files
- These are used by '201_Make_big_excel_file.Rmd'
## 0. Current year
The year of the last data
```{r}
selected_year <- 2021
if (as.numeric(substr(Sys.Date(), 1, 4)) - selected_year != 1){
warning("NOTE: 'selected_year' should usually be last year. Is it set correctly?")
} else {
message("OK ('selected_year' is one year ago)")
}
```
## 1. Packages
```{r, message=FALSE, warning=FALSE, results='hide'}
library(readxl)
library(tidyr)
#
# Dropping dtplyr (it only saves 20 seconds or so in this script)
#
# install.packages("dtplyr")
# library(data.table)
# library(dtplyr)
library(dplyr, warn.conflicts = FALSE)
library(purrr)
library(safejoin)
source("002_Utility_functions.R")
source("110_Medians_and_PROREF_functions.R") # homogenize_series
source("111_Nstring_SD_DDI_functions.R")
```
## 2. Read data
### Main data
Read and reformat the most recent data (by default)
```{r, collapse=TRUE}
# If we have NOT length-adjusted the last year's data:
# filepattern <- "101_data_updated_" # entire file name except date and extension
# Normally, if we have length-adjusted the last year's data:
filepattern <- "109_adjusted_data_" # entire file name except date and extension
filenumber <- 1 # filenumber = 1 means "read the newest file"
files <- dir("Data", pattern = filepattern) %>% rev()
data_list <- read_rds_file("Data",
files,
filenumber = filenumber, # "1" gets the newest file
get_date = TRUE, time_since_modified = TRUE)
data_all <- data_list$data
file_date <- data_list$file_date
# 'file_date' will be used in part 10, when we save the resulting file
```
### Homogenize time series
Change STATION_CODE, in order to make a single time series for data with different STATION_CODE that in practice should count as the same station
* Fixing station 227G1 and 227G2 (shall count as 227G)
* Fixing station 36A and 36A1 (36A1 shall count as 36A)
Also combines PARAM = VDSI and PARAM = Intersex to PARAM = "VDSI/Intersex" for station 71G
```{r}
data_all <- homogenize_series(data_all)
```
### Labware data
Uses the most recent data (by default)
* Made on the PC, project 'Milkys2_pc', using '801_Download_Labware_sample_data.Rmd'
```{r, results='hold'}
# Penultimate year:
files <- list_files(
folder = "Input_data",
pattern = paste0("Labware_samples_", selected_year-1))
df_samples_labware_raw_previous <- read_rds_file(
folder = "Input_data",
files,
filenumber=1)
labware_yr <- lubridate::year(df_samples_labware_raw_previous$SAMPLED_DATE)
labware_yr_same <- mean(labware_yr == (selected_year-1), na.rm = TRUE)
cat(100*labware_yr_same, "% of Labware years are in the expected year\n")
if (labware_yr_same < 0.75){
stop("Too few samples seems to be from the expected year")
}
# Ultimate year:
cat("\n")
files <- list_files(
folder = "Input_data",
pattern = paste0("Labware_samples_", selected_year))
df_samples_labware_raw_last <- read_rds_file(
folder = "Input_data",
files,
filenumber=1)
labware_yr <- lubridate::year(df_samples_labware_raw_last$SAMPLED_DATE)
labware_yr_same <- mean(labware_yr == selected_year, na.rm = TRUE)
cat(100*labware_yr_same, "% of Labware years are in the expected year\n")
if (labware_yr_same < 0.75){
stop("Too few samples seems to be from the expected year")
}
# Combine
df_samples_labware_raw <- bind_rows(
df_samples_labware_raw_previous %>% mutate(MYEAR = selected_year-1),
df_samples_labware_raw_last %>% mutate(MYEAR = selected_year)
)
cat("\n")
cat("Combined labware file - number of lines per year: \n")
xtabs(~MYEAR, df_samples_labware_raw)
```
### Fix BIOTA_SAMPLENO = 0
BIOTA_SAMPLENO corresponds to SAMPLE_NO2 in the data (i.e., sample number within station and tissue)
```{r, results='hold'}
# xtabs(~BIOTA_SAMPLENO, df_samples_labware_raw)
sel <- df_samples_labware_raw$BIOTA_SAMPLENO == 0
cat(sum(sel), "samples have BIOTA_SAMPLENO == 0 \n")
if (sum(sel) > 0){
# All are BI-Galle
cat("\nTissue of samples with BIOTA_SAMPLENO == 0: \n")
xtabs(~TISSUE, df_samples_labware_raw[sel,])
# Exctract sample from DESCRIPTION
# First, remove AQUAMONITOR_CODE part from the DESCRIPTION strings
description_clipped <- df_samples_labware_raw$DESCRIPTION
line_numbers <- which(sel)
# Loop through each line and remove AQUAMONITOR_CODE part
for (i in line_numbers){
description_clipped[i] <- sub(
df_samples_labware_raw$AQUAMONITOR_CODE[i],
replacement = "",
x = description_clipped[i])
}
extract_first_match <- function(string, pattern){
m <- gregexpr(pattern, string)
matches <- regmatches(string, m)
sapply(matches, tail, n = 1)
# tail(matches, 1)
}
# Test (vectorised)
# extract_first_match(c("36B Færder area - torsk lever 7 og 11", "36B Færder area - torsk muskel 5"), "[0-9]+")
# for checking the result
if (FALSE){
# The clipped descriptions
description_clipped[sel]
# Extract number characters
description_clipped[sel] %>%
extract_first_match("[0-9]+")
}
# Extract number characters
df_samples_labware_raw$BIOTA_SAMPLENO[sel] <- description_clipped[sel] %>%
extract_first_match("[0-9]+") %>%
as.numeric()
cat("\n")
cat("Values of BIOTA_SAMPLENO after fix (should no zeros or NAs): \n")
xtabs(~addNA(BIOTA_SAMPLENO), df_samples_labware_raw)
}
```
## 3. N_string (sample size last year)
Example: N_string "8 (6-3)" means that:
- number of samples (pooled or unpooled) = 8
- number of pooled samples = 6
- maximum number of individuals per pooled sample = 3
### Get number of individuals per pooled sample
Uses function 'number_of_ind()' to get number of individuals per pooled sample
```{r, results='hold'}
# Get number of individuals per pooled sample
# Use df_samples (table from LABWARE)
# NOTE: Differs from code in 201 by extracting MYEAR and using it in grouping/summarising
df_samples_labware <- df_samples_labware_raw %>%
rename(STATION_CODE = AQUAMONITOR_CODE,
LATIN_NAME = SPECIES,
SAMPLE_NO2 = BIOTA_SAMPLENO
) %>%
mutate(
TISSUE_NAME = substr(TISSUE, start = 4, stop = 100),
No_individuals = number_of_ind(X_BULK_BIO)
) %>% # get number of individuals per pooled sample
select(MYEAR, STATION_CODE, LATIN_NAME, TISSUE_NAME, SAMPLE_NO2, X_BULK_BIO, No_individuals) %>%
group_by(MYEAR, STATION_CODE, LATIN_NAME, TISSUE_NAME, SAMPLE_NO2) %>%
summarise_all(first) %>%
ungroup() %>%
# Special case for molluscs, which have no X_BULK_BIO value:
mutate(No_individuals = case_when(
LATIN_NAME %in% "Mytilus edulis" ~ 50,
LATIN_NAME %in% c("Littorina littorea", "Nucella lapillus") ~ 30,
is.na(No_individuals) ~ 1, # in the case of fish, no X_BULK_BIO value means 1 individual
TRUE ~ No_individuals)
)
xtabs(~is.na(No_individuals) + MYEAR, df_samples_labware)
```
### Fixing STATION_CODE
Same as in above (and in script 110)
```{r, results = 'hold'}
# Fixing station 227G1 and 227G2 (shall count as 227G)
sel <- df_samples_labware$STATION_CODE %in% c("227G1","227G2")
# xtabs(~MYEAR, data_all[sel,])
df_samples_labware$STATION_CODE[sel] <- "227G"
cat("Fixing station 227G1 and 227G2 (shall count as 227G) \n")
cat("- STATION_CODE changed for", sum(sel), "rows \n")
# Fixing station 36A and 36A1 (36A1 shall count as 36A)
sel <- df_samples_labware$STATION_CODE %in% "36A1"
# xtabs(~MYEAR, data_all[sel,])
df_samples_labware$STATION_CODE[sel] <- "36A"
cat("Fixing station 36A and 36A1 (36A1 shall count as 36A) \n")
cat("- STATION_CODE changed for", sum(sel), "rows \n")
```
### Raw data with 'No_individuals' (info on pooled sample) added
```{r, results='hold'}
data_all_samples <- data_all %>%
filter(MYEAR %in% c(selected_year-1, selected_year)) %>%
safe_left_join(subset(df_samples_labware, select = -X_BULK_BIO),
by = c("MYEAR", "STATION_CODE", "LATIN_NAME", "TISSUE_NAME", "SAMPLE_NO2"),
na_matches = "never",
check = "BCVm") %>%
mutate(No_individuals = case_when(
PARAM %in% c("ALAD", "EROD", "D4", "D5", "D6") ~ 1, # these parameters are from single cod
LATIN_NAME %in% "Mytilus edulis" ~ 50, # industry blue mussel stations I964, I965, I969
TRUE ~ No_individuals)
)
xtabs(~is.na(No_individuals), data_all_samples)
xtabs(~is.na(No_individuals) + MYEAR, data_all_samples)
```
### Check data with 'No_individuals' (info on pooled sample) added
Only left without are some very pretty strange parameters in cod liver in two stations
```{r, results='hold'}
data_all_samples %>%
filter(is.na(No_individuals)) %>%
xtabs(~STATION_CODE + MYEAR, .)
data_all_samples %>%
filter(is.na(No_individuals) & LATIN_NAME == "Gadus morhua") %>%
xtabs(~STATION_CODE + PARAM, .)
if (FALSE){
data_all_samples %>%
filter(is.na(No_individuals)) %>%
View()
}
```
### Make N_string data set, per parameter
- 'dat_sample_string' - one line per station/tissue/parameter/year (same value for all Basis)
- used in the big excel file
```{r, results='hold'}
# dat_sample_string - contains 'N_string' which will be added to data by left-join
# - equals raw data summarised per station x parameter
dat_sample_string_temporary <- data_all_samples %>%
# Don't include siloxans (D4-D6) because some eider duck have exra samples for them
filter(!is.na(VALUE_WW) & !PARAM %in% c("D4","D5","D6")) %>%
group_by(MYEAR, STATION_CODE, LATIN_NAME, TISSUE_NAME, PARAM) %>%
summarise(
N = n(),
N_pooled = sum(No_individuals > 1),
Max_ind = max(No_individuals),
.groups = "drop"
) %>% # str()
# Some special cases
mutate(
N_pooled = case_when(
TISSUE_NAME %in% c("Galle", "Liver - microsome") ~ as.integer(0),
STATION_CODE %in% "71G" ~ as.integer(1),
STATION_CODE %in% "33F" ~ as.integer(3),
TRUE ~ N_pooled),
Max_ind = case_when(
TISSUE_NAME %in% c("Galle", "Liver - microsome") ~ 1,
STATION_CODE %in% "71G" ~ 60,
STATION_CODE %in% "33F" ~ 5,
TRUE ~ Max_ind)
)
dat_sample_string <- dat_sample_string_temporary %>%
mutate(
N_string = paste0(N, " (", N_pooled, "-", Max_ind, ")") # Make string
)
if (FALSE){
View(dat_sample_string)
dat_sample_string %>%
filter(MYEAR == 2021 & STATION_CODE == "45B2") %>%
View("45B2")
data_all_samples %>%
filter(MYEAR == 2021 & STATION_CODE == "45B2") %>%
View("45B2 all")
sel <- grepl("NA", dat_sample_string$N_string)
View(dat_sample_string[sel,])
table(is.na(dat_sample_string$N_string))
table(substr(addNA(dat_sample_string$N_string), 1, 1))
table(dat_sample_string$N_string)
}
sel <- grepl("NA", dat_sample_string$N_string)
cat(sum(sel), "N_string contains 'NA' \n")
```
### Make N_string data set, per tissue
- 'dat_sample_string_tissue' - as 'dat_sample_string', but not one line per parameter
- using max value
- used for tables in report (made by ELU)
```{r, results='hold'}
# dat_sample_string - contains 'N_string' which will be added to data by left-join
# - equals raw data summarised per station x parameter
dat_sample_string_tissue <- dat_sample_string_temporary %>%
group_by(MYEAR, STATION_CODE, LATIN_NAME, TISSUE_NAME) %>%
summarise(
N = max(N, na.rm = TRUE),
N_pooled = max(N_pooled, na.rm = TRUE),
Max_ind = max(Max_ind),
.groups = "drop"
) %>%
mutate(
N_string = paste0(N, " (", N_pooled, "-", Max_ind, ")") # Make string
) %>%
filter(LATIN_NAME != "N. lapillus / L. littorea")
sel <- grepl("NA", dat_sample_string_tissue$N_string)
cat(sum(sel), "N_string contains 'NA' \n")
```
### Imposex and intersex - TO INCLUDE LATER?
```{r}
#
# Imposex and intersex - TO INCLUDE LATER
#
if (FALSE){
data_imposex <- readRDS("Data/32_data_imposex.rds")
df_numbers_imposex <- data_imposex %>%
filter(Sex %in% "f" & PARAM %in% c("VDSI","Intersex") & !is.na(VALUE_WW)) %>%
group_by(STATION_CODE) %>%
summarise(Max_ind = n()) %>%
ungroup() %>%
mutate(
N_string_new = paste0(1, " (", 1, "-", Max_ind, ")")
) %>%
select(STATION_CODE, N_string_new)
#
# Replace N_string values for imposex and intersex
#
dat_sample_string <- dat_sample_string %>%
safe_left_join(df_numbers_imposex,
by = c("STATION_CODE"),
na_matches = "never",
check = "BCV") %>%
mutate(N_string = ifelse(is.na(N_string_new), N_string, N_string_new)) %>%
select(-N_string_new)
}
# end of "Imposex and intersex" part
```
## 4. Standard deviation (SD)
* 'dat_all_sd' - one line per station/tissue/parameter/year/basis (different values for different Basis)
* No values for VDSI because they are given by a single value
```{r, results='hold'}
dat_sd <- data_all %>%
filter(MYEAR %in% c(selected_year-1, selected_year)) %>%
select(MYEAR, STATION_CODE, LATIN_NAME, TISSUE_NAME, PARAM, SAMPLE_NO2,
VALUE_WW, VALUE_DW, VALUE_FB, VALUE_WWa, VALUE_DWa, VALUE_FBa) %>%
tidyr::pivot_longer(cols = c(VALUE_WW, VALUE_DW, VALUE_FB, VALUE_WWa, VALUE_DWa, VALUE_FBa),
names_to = "Basis", values_to = "VALUE", values_drop_na = TRUE) %>%
mutate(Basis = sub("VALUE_", "", Basis)) %>%
group_by(MYEAR, STATION_CODE, LATIN_NAME, TISSUE_NAME, PARAM, Basis) %>%
summarise(SD = sd(VALUE, na.rm = TRUE))
xtabs(~is.na(SD) + Basis + MYEAR, dat_sd)
```
## 5. D.d.i. (detectable data information)
D.d.i. = detectable data information = "N>LOQ [min - maks]"
Example:
7 [0.11 - 0.29] means 7 measurements over LOQ, and these measurements varied from 0.11 to 0.29
```{r, results='hold'}
# Get LOQ flags (one row per station*parameter*sample)
data_all_flag <- data_all %>%
filter(MYEAR %in% selected_year) %>%
select(STATION_CODE, LATIN_NAME, TISSUE_NAME, MYEAR, SAMPLE_NO2, UNIT, PARAM,
FLAG1)
# Get values by making data set with one row per station*parameter*sample*basis
data_all_long <- data_all %>%
filter(MYEAR %in% selected_year) %>%
select(STATION_CODE, LATIN_NAME, TISSUE_NAME, MYEAR, SAMPLE_NO2, UNIT, PARAM,
VALUE_WW, VALUE_DW, VALUE_FB, VALUE_WWa, VALUE_DWa, VALUE_FBa) %>%
tidyr::pivot_longer(cols = starts_with("VALUE"),
names_to = "Basis", values_to = "Value", values_drop_na = TRUE) %>%
mutate(Basis = sub("VALUE_", "", Basis)) %>%
# Add LOQ flags to values (creating one row per parameter*sample*basis)
safe_left_join(
data_all_flag,
by = c("STATION_CODE", "LATIN_NAME", "TISSUE_NAME", "MYEAR",
"SAMPLE_NO2", "UNIT", "PARAM"),
na_matches = "never",
check = "BCV")
# Get min & max values, for adding to df_ddi later (one row per parameter*basis)
df_ddi_minmax <- data_all_long %>%
filter(is.na(FLAG1)) %>%
# Appropriate rounding of values:
mutate(
Value = case_when(
Value < 0.001 ~ round(Value, 4),
Value < 0.01 ~ round(Value, 3),
Value < 2 ~ round(Value, 2),
Value < 20 ~ round(Value, 1),
TRUE ~ round(Value, 0)
)) %>%
group_by(STATION_CODE,LATIN_NAME,TISSUE_NAME,PARAM,Basis) %>%
summarise(
ddi_min = min(Value, na.rm = TRUE),
ddi_max = max(Value, na.rm = TRUE),
.groups = "drop"
) %>%
ungroup()
# If you get an error with the following message:
# "Error: y is not unique on STATION_CODE, LATIN_NAME, TISSUE_NAME, MYEAR, SAMPLE_NO2, UNIT and PARAM "
# run the following
# check <- data_all_flag %>%
# add_count(STATION_CODE, LATIN_NAME, TISSUE_NAME, MYEAR, SAMPLE_NO2, UNIT, PARAM) %>%
# filter(n > 1)
#
# Create D.d.i
#
# Start DDI data set (df_ddi) by counting values over LOQ #
dat_ddi <- data_all_long %>%
group_by(STATION_CODE,LATIN_NAME,TISSUE_NAME,PARAM,Basis) %>%
summarise(
Over_LOQ = sum(is.na(FLAG1)), .groups = "drop"
) %>%
# Add min and max values
safe_left_join(
df_ddi_minmax,
by = c("STATION_CODE", "LATIN_NAME", "TISSUE_NAME", "PARAM", "Basis"),
na_matches = "never",
check = "BCV") %>%
# Create the D.d.i. string
mutate(DDI = case_when(
Over_LOQ > 1 & is.finite(ddi_min) & !is.na(ddi_min) ~ paste0(Over_LOQ, " [", ddi_min, "-", ddi_max, "]"),
Over_LOQ == 1 & is.finite(ddi_min) & !is.na(ddi_min) ~ paste0(Over_LOQ, " [", ddi_min, "]"),
Over_LOQ == 0 ~ paste0(Over_LOQ, " [n.a.]")
)
)
dat_ddi %>%
select(STATION_CODE,LATIN_NAME,TISSUE_NAME,PARAM,Basis,DDI) %>%
head()
```
## 6. Save
```{r, results= 'hold'}
filename <- paste0("Data/111_Nstring_updated_", file_date, ".rds")
saveRDS(dat_sample_string, filename)
cat("Data of N_string saved as: \n")
cat(" ", filename, "\n")
cat("\n")
filename <- paste0("Data/111_Nstring_tissue_updated_", file_date, ".xlsx")
writexl::write_xlsx(dat_sample_string_tissue, filename)
cat("Data of N_string, tissue-wise, saved to Excel (for ELU) as: \n")
cat(" ", filename, "\n")
cat("\n")
filename <- paste0("Data/111_SD_updated_", file_date, ".rds")
saveRDS(dat_sd, filename)
cat("Data of SD (standard deviation) saved as: \n")
cat(" ", filename, "\n")
cat("\n")
filename <- paste0("Data/111_DDI_updated_", file_date, ".rds")
saveRDS(dat_ddi, filename)
cat("Data of D.d.i. (detectable data information) saved as: \n")
cat(" ", filename, "\n")
```
```{r}
if (FALSE){
dat_sample_string <- readRDS("Data/111_Nstring_updated_2021-09-15.rds")
}
```