Skip to content

Commit

Permalink
Beginning on TR for v2024, modifying process_UNWTO_arrivals.R to incl…
Browse files Browse the repository at this point in the history
…ude both international and domestic to determine if aggregating makes sense.
  • Loading branch information
sophialecuona committed Aug 16, 2024
1 parent 56dda20 commit f3c268a
Show file tree
Hide file tree
Showing 24 changed files with 7,814 additions and 0 deletions.
288 changes: 288 additions & 0 deletions globalprep/tr/v2024/R/process_UNWTO_arrivals.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
library(ohicore)
library(tidyverse)
library(stringr)
library(WDI)
library(here)
library(janitor)
library(plotly)
library(readxl)
library(naniar)
library(countrycode)

# ---- sources! ----
source(here("workflow", "R", "common.R")) # file creates objects to process data

region_data() # for rgns_all and rgns_eez

regions_shape() # returns spatial shape object named 'regions' which includes land, eez, highseas, and antarctica regions

# ---- set year and file path info ----
current_year <- 2024 # Update this in the future!!
version_year <- paste0("v",current_year)
data_dir_version_year <- paste0("d", current_year)
prev_ver_yr <- paste0("d", (current_year - 1))

# ---- data directories ----

# Raw data directory (on Mazu)
raw_data_dir <- here::here(dir_M, "git-annex", "globalprep", "_raw_data")

# UNWTO (UN World Tourism) raw data directory
unwto_dir <- here(raw_data_dir, "UNWTO", data_dir_version_year)

# final output dir
output_dir <- here("globalprep","tr", version_year, "output")

# process UNWTO arrivals in tourism data
file_path_unwto <- here::here(unwto_dir, "unwto-inbound-arrivals-data.xlsx")
unwto_arrivals <- readxl::read_xlsx(file_path_unwto, skip = 2) # read in the raw data

unwto_clean <- unwto_arrivals %>%
select(country = `Basic data and indicators`, total = `...6`, subdivision_1 = `...7`, subdivision_2 = `...8`, `1995`:`2021`) %>% # select relevant columns
fill(country, .direction = "down") %>% # add country name to all data associated with that country
pivot_longer(cols = c("total", "subdivision_1", "subdivision_2"),
values_to = "metric",
values_drop_na = TRUE) %>% # make the metrics into one column
select(-name) %>% # get rid of the name column since it's just the titles of the metrics which are already there
select(country, metric, everything()) %>% # reorder things
replace_with_na_all(condition = ~.x == "..") %>% # swap .. with NAs
pivot_longer(cols = 3:ncol(.), names_to = "year",
values_to = "tourism_arrivals_ct") %>% # make the years not columns anymore
pivot_wider(names_from = metric, values_from = tourism_arrivals_ct) %>%
mutate(overnights = as.numeric(`Overnights visitors (tourists)`),
same_day = as.numeric(`Same-day visitors (excursionists)`),
total = as.numeric(`Total arrivals`),
tourism_arrivals_ct = as.numeric(NA)) %>% # rename metrics so easier to work with, make numeric, and add a new column to fill with the new calculated values later
select(country, year, overnights, same_day, total, tourism_arrivals_ct) %>% # select columns needed for analysis (cruise passengers seem to be included in same-day)
group_by(country, year) %>% # group by county and year
mutate(
tourism_arrivals_ct = case_when(
!is.na(overnights) ~ overnights,
is.na(overnights) & !is.na(same_day) & !is.na(total) ~ total - same_day,
TRUE ~ tourism_arrivals_ct
),
total = case_when(
!is.na(total) ~ total,
is.na(total) & !is.na(same_day) & !is.na(overnights) ~ overnights + same_day,
TRUE ~ total
)
) %>% # v2023: NAs are 601 out of 6021
mutate(arrivals_method = ifelse(is.na(overnights) & !is.na(same_day) & !is.na(total), "UNWTO - subtraction", NA)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "UNWTO - subtraction", "gapfilled", NA)) %>% # prepare a "gapfilled" column to indicate "gapfilled" or NA
ungroup() %>% # ungroup since not needed anymore
select(country, year, tourism_arrivals_ct, total, arrivals_method, arrivals_gapfilled) %>% # select only needed columns
mutate(country = str_to_title(country), # make countries look nice
tourism_arrivals_ct = round(as.numeric(tourism_arrivals_ct) * 1000),
total = round(as.numeric(total)*1000)) # since the units were in thousands

# Macquerie, Andaman and Nicobar, Azores, Madeira, Prince Edwards Islands, Oecussi Ambeno, Canary Islands all duplicated with their governing regions. Aside from the uninhabited ones, I think it actually makes sense to give them the same score as their vassal states, given that places like Azores and Canary Islands probably make up a decent chunk of Portugal and Spain tourism...
# get UNWTO data to have OHI region names
unwto_match_iso3c <- unwto_clean %>%
mutate(iso3c = countrycode::countrycode(sourcevar = country, origin = "country.name", destination = "iso3c")) %>%
left_join(rgns_eez, by = c("iso3c" = "eez_iso3")) %>%
dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total) %>%
filter(!is.na(rgn_id))

unwto_clean_names_bonaire <- name_2_rgn(df_in = unwto_clean %>% filter(country == "Bonaire"), # do this just for Bonaire since it is the only region not matching above
fld_name = 'country',
# flds_unique = c('year'),
keep_fld_name = TRUE) %>%
dplyr::select(rgn_id, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total)#### losing lots of regions here for some reason... most concerndely USA


unwto_clean_names <- rbind(unwto_clean_names_bonaire, unwto_match_iso3c) %>% # rbind back together. I would've used the name_2_rgns fxn for everything, but it was excluding a lot of regiosn for some reason...
left_join(rgns_eez) %>%
dplyr::select(rgn_id, rgn_name, year, arrivals_method, arrivals_gapfilled, tourism_arrivals_ct, total)

# fix duplicates if there are any
unwto_dupe_fix <- unwto_clean_names %>%
group_by(rgn_id, year, arrivals_method, arrivals_gapfilled) %>%
summarize(sum_fix = ifelse(all(is.na(tourism_arrivals_ct)), NA, sum(tourism_arrivals_ct, na.rm = TRUE)),
sum_fix_2 = ifelse(all(is.na(total)), NA, sum(total, na.rm = TRUE))) %>%
mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(sum_fix), "UNWTO", arrivals_method)) %>%
rename(tourism_arrivals_ct = sum_fix,
total = sum_fix_2)

# check out things so far
summary(unwto_dupe_fix) # v2023: 828 NAs in arrivals (before filtering the years down and gapfilling), 1708 in `total`

# gapfill arrivals
# downfill then upfill missing values using a linear model of the average increase per years across all years of data for 1995-2019
# for 2020 and 2021, use the global average proportion increase or decrease and add to the previous years value

test <- unwto_dupe_fix %>%
filter(year %in% c(2020, 2021)) %>%
filter(!is.na(tourism_arrivals_ct) & !is.na(total)) %>%
pivot_wider(names_from = year, values_from = c(tourism_arrivals_ct, total)) %>%
mutate(tourism_ct_diff = (tourism_arrivals_ct_2021 - tourism_arrivals_ct_2020)/tourism_arrivals_ct_2020) %>%
mutate(total_diff = (total_2021 - total_2020)/total_2020)

gf_2021_tourism <- mean(test$tourism_ct_diff, na.rm = TRUE) # global average increase for 2021 tourist

gf_2021_total <- mean(test$total_diff, na.rm = TRUE) # global average increase for 2021 total

#
# plot(test$total_diff, test$tourism_ct_diff) # looks pretty linear, so thats good, we can use the same method of gapfilling
# mean(test$total_diff, na.rm = TRUE) # 0.08301438 average increase
# mean(test$tourism_ct_diff, na.rm = TRUE) # 0.2832816 average increase
# ## ok, lets use these values to gapfill 2021 if it is NA and 2020 exists. So increase by X proportion (0.08 or 0.28)

test <- unwto_dupe_fix %>%
filter(year %in% c(2019, 2020)) %>%
filter(!is.na(tourism_arrivals_ct) & !is.na(total)) %>%
pivot_wider(names_from = year, values_from = c(tourism_arrivals_ct, total)) %>%
mutate(tourism_ct_diff = (tourism_arrivals_ct_2020 - tourism_arrivals_ct_2019)/tourism_arrivals_ct_2019) %>%
mutate(total_diff = (total_2020 - total_2019)/total_2019)

gf_2020_tourism <- mean(test$tourism_ct_diff, na.rm = TRUE) # global average decerease for 2020 tourst
gf_2020_total <- mean(test$total_diff, na.rm = TRUE) # global average decrease for 2020 total

#
# plot(test$total_diff, test$tourism_ct_diff) # looks pretty linear, so thats good, we can use the same method of gapfilling
# mean(test$total_diff, na.rm = TRUE) # -0.7015468 average decrease
# mean(test$tourism_ct_diff, na.rm = TRUE) # -0.7030777 average decrease
# ## ok, lets use these values to gapfill 2020 if it is NA and 2019 exists. So decrease by X proportion ~70%


unwto_upfill <- unwto_dupe_fix %>%
filter(year < 2020) %>%
group_by(rgn_id) %>%
arrange(rgn_id, year) %>%
tidyr::fill(tourism_arrivals_ct, .direction = "up") %>%
tidyr::fill(total, .direction = "up") %>% # fill in any values that are empty from early years with values from the nearest year. Doing this because doesn't make sense to add earlier years based on a trend
mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "nearby year", arrivals_method)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "nearby year", "gapfilled", arrivals_gapfilled))

## calculate regional average increase or decrease in number of total arrivals
lm_coef_data_total <- unwto_dupe_fix %>%
filter(!(year %in% c(2020, 2021))) %>%
group_by(rgn_id) %>%
filter(!is.na(total)) %>%
summarize(lm_coef_total = if (n() > 1) lm(total ~ year)$coefficients[2] else 0, .groups = 'drop') # give it an addition of 0 if it is stagnant

## calculate regional average increase or decrease in number of tourism arrivals
lm_coef_data_tourism <- unwto_dupe_fix %>%
filter(!(year %in% c(2020, 2021))) %>%
group_by(rgn_id) %>%
filter(!is.na(tourism_arrivals_ct)) %>%
summarize(lm_coef_tourism = lm(tourism_arrivals_ct ~ year)$coefficients[2])


# Initialize a flag to check if there are still NAs
na_flag <- TRUE

# filter out any regions with all nas for each year, as these can't be gapfilled
all_nas_tourism <- unwto_upfill %>%
group_by(rgn_id) %>%
filter(all(is.na(tourism_arrivals_ct))) %>%
dplyr::select(rgn_id, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled)

unwto_gapfill_lm_2019_tourism <- unwto_upfill %>%
left_join(lm_coef_data_tourism) %>%
ungroup() %>%
filter(!(rgn_id %in% c(all_nas_tourism$rgn_id))) %>%
dplyr::select(rgn_id, year, tourism_arrivals_ct, arrivals_method, arrivals_gapfilled, lm_coef_tourism)


## now lets fill in any values down with the linear model average increase per year for tourists
while(na_flag) {

unwto_gapfill_lm_2019_tourism <- unwto_gapfill_lm_2019_tourism %>%
group_by(rgn_id) %>%
arrange(year) %>%
mutate(
tourism_arrivals_ct = case_when(
is.na(tourism_arrivals_ct) & !is.na(lag(tourism_arrivals_ct)) ~ lag(tourism_arrivals_ct) + lm_coef_tourism,
TRUE ~ tourism_arrivals_ct
)
) %>%
mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "linear model", arrivals_method)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "linear model", "gapfilled", arrivals_gapfilled)) %>%
ungroup()

# Check if there are still NAs left in either column
na_flag <- any(is.na(unwto_gapfill_lm_2019_tourism$tourism_arrivals_ct))
}

unwto_gapfill_lm_2019_tourism_all <- unwto_gapfill_lm_2019_tourism %>%
dplyr::select(-lm_coef_tourism) %>%
rbind(all_nas_tourism)

### Now do the same thing for total column
# Initialize a flag to check if there are still NAs
na_flag <- TRUE

all_nas_total <- unwto_upfill %>%
group_by(rgn_id) %>%
filter(all(is.na(total))) %>%
dplyr::select(rgn_id, year, total)

unwto_gapfill_lm_2019_total <- unwto_upfill %>%
filter(year < 2020) %>%
group_by(rgn_id) %>%
arrange(rgn_id, year) %>%
tidyr::fill(total, .direction = "up") %>% # fill in any values that are empty from early years with values from the nearest year
left_join(lm_coef_data_total) %>%
ungroup() %>%
filter(!(rgn_id %in% c(all_nas_total$rgn_id))) %>%
dplyr::select(rgn_id, year, total, lm_coef_total)


## now lets fill in any values down with the linear model average increase per year for tourists
while(na_flag) {

unwto_gapfill_lm_2019_total <- unwto_gapfill_lm_2019_total %>%
group_by(rgn_id) %>%
arrange(year) %>%
mutate(
total = case_when(
is.na(total) & !is.na(lag(total)) ~ lag(total) + lm_coef_total,
TRUE ~ total
)
) %>%
ungroup()

# Check if there are still NAs left in either column
na_flag <- any(is.na(unwto_gapfill_lm_2019_total$total))
}

unwto_gapfill_lm_2019_total_all <- unwto_gapfill_lm_2019_total %>%
dplyr::select(-lm_coef_total) %>%
mutate(total = ifelse(rgn_id == 67 & total < 0, unwto_gapfill_lm_2019_total %>%
filter(rgn_id == 67, year == 2009 ) %>% pull(total) , total)) %>% # fix libya, as it was being given negative values with the gapfill. Just give it its latest year (downfill)
rbind(all_nas_total)


unwto_2020_2021 <- unwto_dupe_fix %>%
filter(year > 2019) # lets fix 2020 and 2021 now

unwto_all_gf <- unwto_gapfill_lm_2019_tourism_all %>%
left_join(unwto_gapfill_lm_2019_total_all) %>%
rbind(unwto_2020_2021) %>%
group_by(rgn_id) %>%
arrange(rgn_id, year) %>%
# apply global average proportional increase or decrease for 2020 and 2021, because of covid pandemic messing up trends...
mutate(tourism_arrivals_ct = ifelse(year == 2020 & is.na(tourism_arrivals_ct), lag(tourism_arrivals_ct, n = 1) + lag(tourism_arrivals_ct, n = 1)*gf_2020_tourism, tourism_arrivals_ct),
total = ifelse(year == 2020 & is.na(total), lag(total, n = 1) + lag(total, n = 1)*gf_2020_total, total)) %>%

mutate(tourism_arrivals_ct = ifelse(year == 2021 & is.na(tourism_arrivals_ct), lag(tourism_arrivals_ct, n = 1) + lag(tourism_arrivals_ct, n = 1)*gf_2021_tourism, tourism_arrivals_ct),
total = ifelse(year == 2021 & is.na(total), lag(total, n = 1) + lag(total, n = 1)*gf_2021_total, total)) %>%
mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "2020 and 2021 gapfill method", arrivals_method)) %>%
mutate(arrivals_gapfilled = ifelse(arrivals_method == "2020 and 2021 gapfill method", "gapfilled", arrivals_gapfilled)) %>%
filter(year >= 2008) %>% # get only the year we need and beyond
drop_na(tourism_arrivals_ct) # remove any remaining NAs (any remaining have all NAs for that region)
# drop_na(total) # keep these NAs, we will just give these regions a perfect score...



## old way
# unwto_dupe_fix_downup_gf <- unwto_dupe_fix %>%
# fill(tourism_arrivals_ct, .direction = "downup") %>%
# fill(total, .direction = "downup") %>%
# mutate(arrivals_method = ifelse(is.na(arrivals_method) & !is.na(tourism_arrivals_ct), "nearby year", arrivals_method)) %>%
# mutate(arrivals_gapfilled = ifelse(arrivals_method == "nearby year", "gapfilled", arrivals_gapfilled)) %>%
# filter(year >= 2008) %>% # get only the year we need and beyond
# drop_na(tourism_arrivals_ct) # remove any remaining NAs (any remaining have all NAs for that region)
#
# # check out things so far
# summary(unwto_dupe_fix_downup_gf) # NAs should be 0 now
30 changes: 30 additions & 0 deletions globalprep/tr/v2024/R/process_WB_generalized_fxn.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# v2023: THIS ENDED UP NOT BEING USED OTHER THAN IN EXPLORATION, BUT WAS LEFT HERE IN CASE CODE BECOMES RELEVANT IN THE FUTURE
# the WDI function that we use functions similarly to this

process_wb_data <- function(file_path, value_name, df_name) {
# read in the csv needing processing
df <- read_csv(file_path, skip = 4)

# select needed columns
df_clean <- df %>%
select(country = `Country Name`, `1960`:`2022`)

# get in long format
df_clean_long <- df_clean %>%
pivot_longer(cols = -country,
names_to = "year",
values_to = "values") %>%
mutate(country = str_to_title(country))

# run name_2_rgn to get correct names
df_clean_names <- name_2_rgn(df_in = df_clean_long,
fld_name = "country",
flds_unique = "year")

# fix any duplicates
df_dupe_fix <- df_clean_names %>%
group_by(rgn_id, year) %>%
summarize({{ value_name }} := ifelse(all(is.na(values)), NA, sum(values, na.rm = TRUE)))

assign(df_name, df_dupe_fix, envir = .GlobalEnv) # add to environment
}
14 changes: 14 additions & 0 deletions globalprep/tr/v2024/R/process_area_of_coastline.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# read in area of coastline data
inland_filepath <- file.path("globalprep", "lsp", paste0("v", version_year), "int", "area_protected_1km.csv")
inland_data <- read_csv(inland_filepath)
offshore_filepath <- file.path("globalprep", "lsp", paste0("v", version_year), "int", "area_protected_3nm.csv")
offshore_data <- read_csv(offshore_filepath)

# get combined value of inland and offshore for each ohi region
inland_offshore <- inland_data %>%
left_join(offshore_data, by = join_by(rgn_id, year, rgn_name)) %>%
select(rgn_id, year, a_tot_km2.x, a_tot_km2.y) %>%
group_by(rgn_id, year) %>%
mutate(total_inland_offshore_area = a_tot_km2.x + a_tot_km2.y,
year = as.character(year)) %>%
select(rgn_id, year, total_inland_offshore_area)
Loading

0 comments on commit f3c268a

Please sign in to comment.