Beginning on TR for v2024, modifying process_UNWTO_arrivals.R to incl…

…ude both international and domestic to determine if aggregating makes sense.
OHI-Science · Aug 16, 2024 · f3c268a · f3c268a
1 parent 56dda20
commit f3c268a
Show file tree

Hide file tree

Showing 24 changed files with 7,814 additions and 0 deletions.
diff --git a/globalprep/tr/v2024/R/process_UNWTO_arrivals.R b/globalprep/tr/v2024/R/process_UNWTO_arrivals.R
@@ -0,0 +1,288 @@
+library(ohicore)
+library(tidyverse)
+library(stringr)
+library(WDI)
+library(here)
+library(janitor)
+library(plotly)
+library(readxl)
+library(naniar)
+library(countrycode)
+
+# ---- sources! ----
+source(here("workflow", "R", "common.R")) # file creates objects to process data
+
+region_data() # for rgns_all and rgns_eez
+
+regions_shape() # returns spatial shape object named 'regions' which includes land, eez, highseas, and antarctica regions
+
+# ---- set year and file path info ----
+current_year <- 2024 # Update this in the future!!
+version_year <- paste0("v",current_year)
+data_dir_version_year <- paste0("d", current_year)
+prev_ver_yr <- paste0("d", (current_year - 1))
+
+# ---- data directories ----
+
+# Raw data directory (on Mazu)
+raw_data_dir <- here::here(dir_M, "git-annex", "globalprep", "_raw_data")
+
+# UNWTO (UN World Tourism) raw data directory
+unwto_dir <- here(raw_data_dir, "UNWTO", data_dir_version_year)
+
+# final output dir
+output_dir <- here("globalprep","tr", version_year, "output")
+
+# process UNWTO arrivals in tourism data
+file_path_unwto <- here::here(unwto_dir, "unwto-inbound-arrivals-data.xlsx")
+unwto_arrivals <- readxl::read_xlsx(file_path_unwto, skip = 2) # read in the raw data
+
+unwto_clean <- unwto_arrivals %>% 
+  select(country = `Basic data and indicators`, total = `...6`, subdivision_1 = `...7`, subdivision_2 = `...8`, `1995`:`2021`) %>% # select relevant columns
+  fill(country, .direction = "down") %>% # add country name to all data associated with that country
+  pivot_longer(cols = c("total", "subdivision_1", "subdivision_2"),
+               values_to = "metric",
+               values_drop_na = TRUE) %>% # make the metrics into one column
+  select(-name) %>% # get rid of the name column since it's just the titles of the metrics which are already there
+  select(country, metric, everything()) %>% # reorder things
+  replace_with_na_all(condition = ~.x == "..") %>% # swap .. with NAs
+  pivot_longer(cols = 3:ncol(.), names_to = "year",
+               values_to = "tourism_arrivals_ct") %>% # make the years not columns anymore
+  pivot_wider(names_from = metric, values_from = tourism_arrivals_ct) %>%
+  mutate(overnights = as.numeric(`Overnights visitors (tourists)`), 
+         same_day = as.numeric(`Same-day visitors (excursionists)`), 
+         total = as.numeric(`Total arrivals`),
+         tourism_arrivals_ct = as.numeric(NA)) %>% # rename metrics so easier to work with, make numeric, and add a new column to fill with the new calculated values later
+  select(country, year, overnights, same_day, total, tourism_arrivals_ct) %>% # select columns needed for analysis (cruise passengers seem to be included in same-day)
+  group_by(country, year) %>% # group by county and year
+  mutate(
+    tourism_arrivals_ct = case_when(
+      !is.na(overnights) ~ overnights,
+      is.na(overnights) & !is.na(same_day) & !is.na(total) ~ total - same_day,
+      TRUE ~ tourism_arrivals_ct
+    ),
+    total = case_when(
+      !is.na(total) ~ total,
+      is.na(total) & !is.na(same_day) & !is.na(overnights) ~ overnights + same_day,
+      TRUE ~ total
+    )
+  ) %>% # v2023: NAs are 601 out of 6021
+  mutate(arrivals_method = ifelse(is.na(overnights) & !is.na(same_day) & !is.na(total), "UNWTO - subtraction", NA)) %>%
+  mutate(arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - subtraction", "gapfilled", NA)) %>% # prepare a "gapfilled" column to indicate "gapfilled" or NA
+  ungroup() %>% # ungroup since not needed anymore
+  select(country, year, tourism_arrivals_ct, total, arrivals_method, arrivals_gapfilled) %>% # select only needed columns
+  mutate(country = str_to_title(country), # make countries look nice
+         tourism_arrivals_ct = round(as.numeric(tourism_arrivals_ct) * 1000),
+         total = round(as.numeric(total)*1000)) # since the units were in thousands
+
+# Macquerie, Andaman and Nicobar, Azores, Madeira, Prince Edwards Islands, Oecussi Ambeno, Canary Islands all duplicated with their governing regions. Aside from the uninhabited ones, I think it actually makes sense to give them the same score as their vassal states, given that places like Azores and Canary Islands probably make up a decent chunk of Portugal and Spain tourism...
+# get UNWTO data to have OHI region names
+unwto_match_iso3c <- unwto_clean %>%
+  mutate(iso3c = countrycode::countrycode(sourcevar = country, origin = "country.name", destination = "iso3c")) %>%
+  left_join(rgns_eez, by = c("iso3c" = "eez_iso3")) %>%
+  dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total) %>%
+  filter(!is.na(rgn_id))
+
+unwto_clean_names_bonaire <- name_2_rgn(df_in = unwto_clean %>% filter(country == "Bonaire"), # do this just for Bonaire since it is the only region not matching above
+                                fld_name = 'country',
+                                # flds_unique = c('year'),
+                                keep_fld_name = TRUE) %>%
+  dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total)#### losing lots of regions here for some reason... most concerndely USA
+
+
+unwto_clean_names <- rbind(unwto_clean_names_bonaire, unwto_match_iso3c) %>% # rbind back together. I would've used the name_2_rgns fxn for everything, but it was excluding a lot of regiosn for some reason...
+  left_join(rgns_eez) %>%
+  dplyr::select(rgn_id, rgn_name, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total)
+
+# fix duplicates if there are any
+unwto_dupe_fix <- unwto_clean_names %>%
+  group_by(rgn_id, year, arrivals_method, arrivals_gapfilled) %>%
+  summarize(sum_fix = ifelse(all(is.na(tourism_arrivals_ct)), NA, sum(tourism_arrivals_ct, na.rm = TRUE)),
+            sum_fix_2 = ifelse(all(is.na(total)), NA, sum(total, na.rm = TRUE))) %>%
+  mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(sum_fix), "UNWTO", arrivals_method)) %>%
+  rename(tourism_arrivals_ct = sum_fix,
+         total = sum_fix_2)
+
+# check out things so far
+summary(unwto_dupe_fix) # v2023: 828 NAs in arrivals (before filtering the years down and gapfilling), 1708 in `total`
+
+# gapfill arrivals
+# downfill then upfill missing values using a linear model of the average increase per years across all years of data for 1995-2019
+# for 2020 and 2021, use the global average proportion increase or decrease and add to the previous years value
+
+test <- unwto_dupe_fix %>%
+  filter(year %in% c(2020, 2021)) %>%
+  filter(!is.na(tourism_arrivals_ct) & !is.na(total)) %>%
+  pivot_wider(names_from = year, values_from = c(tourism_arrivals_ct, total)) %>%
+  mutate(tourism_ct_diff = (tourism_arrivals_ct_2021 - tourism_arrivals_ct_2020)/tourism_arrivals_ct_2020) %>%
+  mutate(total_diff = (total_2021 - total_2020)/total_2020)
+
+gf_2021_tourism <- mean(test$tourism_ct_diff, na.rm = TRUE) # global average increase for 2021 tourist
+
+gf_2021_total <- mean(test$total_diff, na.rm = TRUE) # global average increase for 2021 total
+
+# 
+# plot(test$total_diff, test$tourism_ct_diff) # looks pretty linear, so thats good, we can use the same method of gapfilling
+# mean(test$total_diff, na.rm = TRUE) # 0.08301438 average increase
+# mean(test$tourism_ct_diff, na.rm = TRUE) # 0.2832816 average increase
+# ## ok, lets use these values to gapfill 2021 if it is NA and 2020 exists. So increase by X proportion (0.08 or 0.28)
+
+test <- unwto_dupe_fix %>%
+  filter(year %in% c(2019, 2020)) %>%
+  filter(!is.na(tourism_arrivals_ct) & !is.na(total)) %>%
+  pivot_wider(names_from = year, values_from = c(tourism_arrivals_ct, total)) %>%
+  mutate(tourism_ct_diff = (tourism_arrivals_ct_2020 - tourism_arrivals_ct_2019)/tourism_arrivals_ct_2019) %>%
+  mutate(total_diff = (total_2020 - total_2019)/total_2019)
+
+gf_2020_tourism <- mean(test$tourism_ct_diff, na.rm = TRUE) # global average decerease for 2020 tourst
+gf_2020_total <- mean(test$total_diff, na.rm = TRUE) # global average decrease for 2020 total
+
+# 
+# plot(test$total_diff, test$tourism_ct_diff) # looks pretty linear, so thats good, we can use the same method of gapfilling
+# mean(test$total_diff, na.rm = TRUE) # -0.7015468 average decrease
+# mean(test$tourism_ct_diff, na.rm = TRUE) # -0.7030777 average decrease
+# ## ok, lets use these values to gapfill 2020 if it is NA and 2019 exists. So decrease by X proportion ~70%
+
+
+unwto_upfill <-  unwto_dupe_fix %>%
+  filter(year < 2020) %>%
+  group_by(rgn_id) %>%
+  arrange(rgn_id, year) %>%
+  tidyr::fill(tourism_arrivals_ct, .direction = "up") %>%
+  tidyr::fill(total, .direction = "up") %>% # fill in any values that are empty from early years with values from the nearest year. Doing this because doesn't make sense to add earlier years based on a trend 
+  mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "nearby year", arrivals_method)) %>%
+  mutate(arrivals_gapfilled = ifelse(arrivals_method == "nearby year", "gapfilled", arrivals_gapfilled))
+
+## calculate regional average increase or decrease in number of total arrivals
+lm_coef_data_total <- unwto_dupe_fix %>%
+  filter(!(year %in% c(2020, 2021))) %>%
+  group_by(rgn_id) %>%
+  filter(!is.na(total)) %>%
+  summarize(lm_coef_total = if (n() > 1) lm(total ~ year)$coefficients[2] else 0, .groups = 'drop') # give it an addition of 0 if it is stagnant
+
+## calculate regional average increase or decrease in number of tourism arrivals
+lm_coef_data_tourism <- unwto_dupe_fix %>%
+  filter(!(year %in% c(2020, 2021))) %>%
+  group_by(rgn_id) %>%
+  filter(!is.na(tourism_arrivals_ct)) %>%
+  summarize(lm_coef_tourism = lm(tourism_arrivals_ct ~ year)$coefficients[2])
+
+
+# Initialize a flag to check if there are still NAs
+na_flag <- TRUE
+
+# filter out any regions with all nas for each year, as these can't be gapfilled
+all_nas_tourism <- unwto_upfill %>%
+  group_by(rgn_id) %>%
+  filter(all(is.na(tourism_arrivals_ct))) %>%
+  dplyr::select(rgn_id, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled)
+
+unwto_gapfill_lm_2019_tourism <- unwto_upfill %>%
+  left_join(lm_coef_data_tourism) %>% 
+   ungroup() %>%
+  filter(!(rgn_id %in% c(all_nas_tourism$rgn_id))) %>%
+  dplyr::select(rgn_id, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled, lm_coef_tourism)
+
+
+## now lets fill in any values down with the linear model average increase per year for tourists
+while(na_flag) {
+
+  unwto_gapfill_lm_2019_tourism <- unwto_gapfill_lm_2019_tourism %>%
+    group_by(rgn_id) %>%
+    arrange(year) %>%
+    mutate(
+      tourism_arrivals_ct = case_when(
+        is.na(tourism_arrivals_ct) & !is.na(lag(tourism_arrivals_ct)) ~ lag(tourism_arrivals_ct) + lm_coef_tourism,
+        TRUE ~ tourism_arrivals_ct
+      )
+    ) %>%
+    mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "linear model", arrivals_method)) %>%
+    mutate(arrivals_gapfilled = ifelse(arrivals_method == "linear model", "gapfilled", arrivals_gapfilled)) %>%
+    ungroup()
+
+  # Check if there are still NAs left in either column
+  na_flag <- any(is.na(unwto_gapfill_lm_2019_tourism$tourism_arrivals_ct))
+}
+
+unwto_gapfill_lm_2019_tourism_all <- unwto_gapfill_lm_2019_tourism %>%
+  dplyr::select(-lm_coef_tourism) %>%
+  rbind(all_nas_tourism)
+
+### Now do the same thing for total column
+# Initialize a flag to check if there are still NAs
+na_flag <- TRUE
+
+all_nas_total <- unwto_upfill %>%
+  group_by(rgn_id) %>%
+  filter(all(is.na(total))) %>%
+  dplyr::select(rgn_id, year, total)
+
+unwto_gapfill_lm_2019_total <- unwto_upfill %>%
+  filter(year < 2020) %>%
+  group_by(rgn_id) %>%
+  arrange(rgn_id, year) %>%
+  tidyr::fill(total, .direction = "up") %>% # fill in any values that are empty from early years with values from the nearest year
+  left_join(lm_coef_data_total) %>%
+  ungroup() %>%
+  filter(!(rgn_id %in% c(all_nas_total$rgn_id))) %>%
+  dplyr::select(rgn_id, year, total, lm_coef_total)
+
+
+## now lets fill in any values down with the linear model average increase per year for tourists
+while(na_flag) {
+
+  unwto_gapfill_lm_2019_total <- unwto_gapfill_lm_2019_total %>%
+    group_by(rgn_id) %>%
+    arrange(year) %>%
+    mutate(
+      total = case_when(
+        is.na(total) & !is.na(lag(total)) ~ lag(total) + lm_coef_total,
+        TRUE ~ total
+      )
+    ) %>%
+    ungroup()
+
+  # Check if there are still NAs left in either column
+  na_flag <- any(is.na(unwto_gapfill_lm_2019_total$total))
+}
+
+unwto_gapfill_lm_2019_total_all <- unwto_gapfill_lm_2019_total %>%
+  dplyr::select(-lm_coef_total) %>%
+  mutate(total = ifelse(rgn_id == 67 & total < 0, unwto_gapfill_lm_2019_total %>%
+                          filter(rgn_id == 67, year == 2009 ) %>% pull(total) , total)) %>% # fix libya, as it was being given negative values with the gapfill. Just give it its latest year (downfill)
+  rbind(all_nas_total)
+
+
+unwto_2020_2021 <- unwto_dupe_fix %>%
+  filter(year > 2019) # lets fix 2020 and 2021 now
+
+unwto_all_gf <- unwto_gapfill_lm_2019_tourism_all %>%
+    left_join(unwto_gapfill_lm_2019_total_all) %>%
+  rbind(unwto_2020_2021) %>%
+  group_by(rgn_id) %>%
+  arrange(rgn_id, year) %>%
+  # apply global average proportional increase or decrease for 2020 and 2021, because of covid pandemic messing up trends...
+  mutate(tourism_arrivals_ct = ifelse(year == 2020 & is.na(tourism_arrivals_ct), lag(tourism_arrivals_ct, n = 1) + lag(tourism_arrivals_ct, n = 1)*gf_2020_tourism, tourism_arrivals_ct),
+         total = ifelse(year == 2020 & is.na(total), lag(total, n = 1) + lag(total, n = 1)*gf_2020_total, total)) %>%
+
+  mutate(tourism_arrivals_ct = ifelse(year == 2021 & is.na(tourism_arrivals_ct), lag(tourism_arrivals_ct, n = 1) + lag(tourism_arrivals_ct, n = 1)*gf_2021_tourism, tourism_arrivals_ct),
+         total = ifelse(year == 2021 & is.na(total), lag(total, n = 1) + lag(total, n = 1)*gf_2021_total, total)) %>%
+  mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "2020 and 2021 gapfill method", arrivals_method)) %>%
+  mutate(arrivals_gapfilled = ifelse(arrivals_method == "2020 and 2021 gapfill method", "gapfilled", arrivals_gapfilled)) %>%
+  filter(year >= 2008) %>% # get only the year we need and beyond
+  drop_na(tourism_arrivals_ct)  # remove any remaining NAs (any remaining have all NAs for that region)
+ # drop_na(total) # keep these NAs, we will just give these regions a perfect score...
+
+
+
+## old way
+# unwto_dupe_fix_downup_gf <- unwto_dupe_fix %>%
+#   fill(tourism_arrivals_ct, .direction = "downup") %>%
+#   fill(total, .direction = "downup") %>%
+#   mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "nearby year", arrivals_method)) %>%
+#   mutate(arrivals_gapfilled = ifelse(arrivals_method == "nearby year", "gapfilled", arrivals_gapfilled)) %>%
+#   filter(year >= 2008) %>% # get only the year we need and beyond
+#   drop_na(tourism_arrivals_ct) # remove any remaining NAs (any remaining have all NAs for that region)
+# 
+# # check out things so far
+# summary(unwto_dupe_fix_downup_gf) # NAs should be 0 now
diff --git a/globalprep/tr/v2024/R/process_WB_generalized_fxn.R b/globalprep/tr/v2024/R/process_WB_generalized_fxn.R
@@ -0,0 +1,30 @@
+# v2023: THIS ENDED UP NOT BEING USED OTHER THAN IN EXPLORATION, BUT WAS LEFT HERE IN CASE CODE BECOMES RELEVANT IN THE FUTURE
+# the WDI function that we use functions similarly to this
+
+process_wb_data <- function(file_path, value_name, df_name) {
+  # read in the csv needing processing
+  df <- read_csv(file_path, skip = 4)
+
+  # select needed columns
+  df_clean <- df %>%
+    select(country = `Country Name`, `1960`:`2022`)
+
+  # get in long format
+  df_clean_long <- df_clean %>%
+    pivot_longer(cols = -country,
+                 names_to = "year",
+                 values_to = "values") %>%
+    mutate(country = str_to_title(country))
+
+  # run name_2_rgn to get correct names
+  df_clean_names <- name_2_rgn(df_in = df_clean_long,
+                               fld_name = "country",
+                               flds_unique = "year")
+
+  # fix any duplicates
+  df_dupe_fix <- df_clean_names %>%
+    group_by(rgn_id, year) %>%
+    summarize({{ value_name }} := ifelse(all(is.na(values)), NA, sum(values, na.rm = TRUE)))
+
+  assign(df_name, df_dupe_fix, envir = .GlobalEnv) # add to environment
+}
diff --git a/globalprep/tr/v2024/R/process_area_of_coastline.R b/globalprep/tr/v2024/R/process_area_of_coastline.R
@@ -0,0 +1,14 @@
+# read in area of coastline data
+inland_filepath <- file.path("globalprep", "lsp", paste0("v", version_year), "int", "area_protected_1km.csv")
+inland_data <- read_csv(inland_filepath)
+offshore_filepath <- file.path("globalprep", "lsp", paste0("v", version_year), "int", "area_protected_3nm.csv")
+offshore_data <- read_csv(offshore_filepath)
+
+# get combined value of inland and offshore for each ohi region
+inland_offshore <- inland_data %>%
+  left_join(offshore_data, by = join_by(rgn_id, year, rgn_name)) %>%
+  select(rgn_id, year, a_tot_km2.x, a_tot_km2.y) %>%
+  group_by(rgn_id, year) %>%
+  mutate(total_inland_offshore_area = a_tot_km2.x + a_tot_km2.y,
+         year = as.character(year)) %>%
+  select(rgn_id, year, total_inland_offshore_area)