From c86aac9c1706d155f589a37fffd00d37aa0bbb23 Mon Sep 17 00:00:00 2001 From: dustin-duncan Date: Thu, 27 Jun 2024 21:42:50 +0000 Subject: [PATCH 1/2] Final commit today --- globalprep/le/v2024/Dustin_wb_service_dataprep.Rmd | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/globalprep/le/v2024/Dustin_wb_service_dataprep.Rmd b/globalprep/le/v2024/Dustin_wb_service_dataprep.Rmd index fb4e1e9b..d11606bd 100644 --- a/globalprep/le/v2024/Dustin_wb_service_dataprep.Rmd +++ b/globalprep/le/v2024/Dustin_wb_service_dataprep.Rmd @@ -186,6 +186,16 @@ line_plot <- plotly::plot_ly(proportional_employement_nona, x = ~year, y = ~prop yaxis = list(title = "Number of people employed by the tourism sector")) line_plot + + +penona_pivot <- proportional_employement_nona %>% + pivot_longer(cols = c(proportional_tourism_employment, yearly_wages_ppp_adjusted_by_year), names_to = "metric", values_to = "value") +# %>% + filter(admin_country_name.x %in% c("Indonesia", "Brazil", "Cyprus", "Argentina", "India")) + +ggplot(penona_pivot, aes(x = as.numeric(year), y = value, color = admin_country_name.x)) + + geom_line() + + facet_wrap(~metric, scales = "free", ncol = 1) ``` From 412c29298dbb59e1e68e272af7d5db6601945091 Mon Sep 17 00:00:00 2001 From: annaramji Date: Thu, 27 Jun 2024 22:16:47 +0000 Subject: [PATCH 2/2] putting all dataprep in one doc, end of livelihood dataprep for the day --- globalprep/le/v2024/jobs_dataprep_anna.qmd | 49 +++ globalprep/le/v2024/livelihood_dataprep.qmd | 438 ++++++++++++++++++++ globalprep/le/v2024/wb_service_dataprep.Rmd | 50 ++- 3 files changed, 534 insertions(+), 3 deletions(-) create mode 100644 globalprep/le/v2024/livelihood_dataprep.qmd diff --git a/globalprep/le/v2024/jobs_dataprep_anna.qmd b/globalprep/le/v2024/jobs_dataprep_anna.qmd index d1c652af..e7128f64 100644 --- a/globalprep/le/v2024/jobs_dataprep_anna.qmd +++ b/globalprep/le/v2024/jobs_dataprep_anna.qmd @@ -52,6 +52,7 @@ un_pivot_clean <- un_jobs_pivot %>% mutate(year = str_remove_all(year, "x")) # using countrycode::countrycode +# and following this example on Rpubs: https://rpubs.com/Teal_Emery/cleaning_intl_data_tips_and_tricks library(countrycode) @@ -125,3 +126,51 @@ ggplot(jobs_plot_df) + ``` +```{r} +plotly::plot_ly(jobs_plot_df, x = ~year, y = ~thousand_jobs, color = ~rgn_name, type = "scatter", mode = "lines") %>% + layout(title = "All Regions: Number of Tourism Jobs per Country over Time", + xaxis = list(title = "Year"), + yaxis = list(title = "Number of people working in the tourism sector")) +``` + + +India is seriously skewing the scales of this plot -- this, among general curiosity and conversation about how we should weight the number of jobs by the size of countries led us to create a proportional employment variable. + + + + +```{r} + +jobs_wages_join <- left_join(jobs_rgn_2012, pop_ppp_converted, by = c("rgn_id", "year")) + +jobs_wages_proportional_employment <- jobs_wages_join %>% + mutate(total_tourism_employees = thousand_employees*1000) %>% + dplyr::relocate(total_tourism_employees, .before = total_population) %>% + mutate(proportional_tourism_employment = total_tourism_employees/total_population) %>% + select(-c(notes.x, notes.y)) + +proportional_employement_nona <- jobs_wages_proportional_employment %>% + na.omit() %>% + mutate(year = as.numeric(year)) + +ggplot(proportional_employement_nona, aes(x = year, y = proportional_tourism_employment, color = rgn_id)) + + geom_line() + +line_plot <- plotly::plot_ly(proportional_employement_nona, x = ~year, y = ~proportional_tourism_employment, color = ~admin_country_name.x, type = "scatter", mode = "lines") + layout(title = "All Regions: Proportional Employment Line Chart", + xaxis = list(title = "Year"), + yaxis = list(title = "Number of people employed by the tourism sector")) + +line_plot + + +penona_pivot <- proportional_employement_nona %>% + pivot_longer(cols = c(proportional_tourism_employment, yearly_wages_ppp_adjusted_by_year), names_to = "metric", values_to = "value") +# %>% + filter(admin_country_name.x %in% c("Indonesia", "Brazil", "Cyprus", "Argentina", "India")) + +ggplot(penona_pivot, aes(x = as.numeric(year), y = value, color = admin_country_name.x)) + + geom_line() + + facet_wrap(~metric, scales = "free", ncol = 1) +``` + diff --git a/globalprep/le/v2024/livelihood_dataprep.qmd b/globalprep/le/v2024/livelihood_dataprep.qmd new file mode 100644 index 00000000..4c703bdf --- /dev/null +++ b/globalprep/le/v2024/livelihood_dataprep.qmd @@ -0,0 +1,438 @@ +--- +title: "Livelihood Dataprep: Tourism Jobs & Service Sector Wages" +format: html +--- + +--- +title: "World Bank Service Earnings Data Prep" + +--- + + +#### Setup +```{r} +# load packages ---- +library(tidyverse) # <3 +library(readxl) # for reading in excel data +library(here) # for reproducible file paths +library(janitor) # for cleaning column names +library(countrycode) # for cleaning country codes + + +# source common.R +source(here("workflow/R/common.R")) + +# set file paths to data in Mazu ---- +raw_data_path <- file.path(here(dir_M, #"/home/shares/ohi" + "git-annex", + "globalprep", + "_raw_data")) + +un_wto_path <- file.path(here(raw_data_path, "UNWTO", "d2024")) + +service_data <- file.path(un_wto_path, "join_database_w_definitions.xlsx") + +un_job_data <- file.path(dir_M, "git-annex/globalprep/_raw_data/UNWTO/d2024/unwto-all-data-download_0.xlsx") + +``` + + +# Wage Data + +### Read in wage data from World Bank + +[](https://www.unwto.org/tourism-statistics/key-tourism-statistics) + +Downloaded June 26, 2024. Last updated (according to the website): January 31, 2024 + + +```{r} +# read in .xlsx wage data from Mazu +service_data_raw <- read_xlsx(service_data, skip = 3, col_names = TRUE) + +# quick view: +head(service_data_raw) +``` + +The income column we're interested in is: +Median Earnings for wage workers per month in service, local nominal currency + + +First, let's tidy the data and select the columns we're interested in. Then we'll join with OHI regions to see what we're working with. + +```{r} +# ---------- Tidy Data ------------------------ +service_data_clean <- service_data_raw %>% + select("Country Name", "Country Code", "Region Code", "Year of survey", "Subsample", + "Total population", "Median Earnings for wage workers per month in service, local nominal currency") %>% + janitor::clean_names() %>% + mutate(year_of_survey = as.integer(year_of_survey), + country_name = as.factor(country_name)) %>% + filter(subsample %in% c("All")) %>% # filtering to All because data also contains "Urban", "Young", etc. + select(-subsample) + +# quick look +head(service_data_clean) + +# ----------- Join with OHI Regions ----------- + +# read in OHI regions for joining +region_names <- read_csv("https://raw.githubusercontent.com/OHI-Science/ohi-global/draft/eez/spatial/regions_list.csv") %>% + janitor::clean_names() + + +# prep column names for joining +service_clean <- service_data_clean %>% + rename(eez_iso3 = country_code, + admin_country_name = country_name) + + +# join by country_code and eez_iso3 +service_join <- inner_join(x = service_clean, region_names, by = c("eez_iso3", "admin_country_name")) + +# note: many-to-many relationship bc. admin country name applies to multiple regions from region_names -- would need to note this in analysis etc. + + +service_join + + + +# Filter out NAs for service wages + +service_no_na <- service_join %>% + drop_na(median_earnings_for_wage_workers_per_month_in_service_local_nominal_currency) + +service_no_na +``` + +Preliminary plot of wage data + +```{r} +# find most recent year of data +service_newest <- service_join %>% + group_by(admin_country_name) %>% + summarize(max_year = max(year_of_survey)) + +# filter to years >= 2013 + +post_2013_data <- service_join %>% + filter((year_of_survey) >= 2013) + + + +ggplot(data = post_2013_data) + + geom_line(aes(x = year_of_survey, y = median_earnings_for_wage_workers_per_month_in_service_local_nominal_currency)) + +# facet_wrap(~admin_country_name) + + theme_bw() + +``` + + + +Adjust service wage data to be PPP USD using OECD data +```{r} +# read in ppp data +oecd_data <- read_csv(file.path(here(un_wto_path, "SNA_TABLE4_27062024190625956.csv"))) %>% + janitor::clean_names() + +``` + +Join oecd and joined service data +```{r} + +service_yearly <- service_no_na %>% + mutate(median_annual_service_local_wages = median_earnings_for_wage_workers_per_month_in_service_local_nominal_currency +* 12) %>% + select(-notes) + +ppp_service_join <- left_join(service_yearly, oecd_data, by = c("year_of_survey" = "year", "eez_iso3" = "location")) + + +# filter to ones we have "value" (conversion) for + +ppp_service <- ppp_service_join %>% + drop_na(value) %>% + rename(year = year_of_survey) + +# selecting relevant columns +ppp_service_clean <- ppp_service %>% + select( + # country info + eez_iso3, admin_country_name, rgn_name, rgn_id, admin_rgn_id, + # currency info + unit_code, unit, + # year + year, + # wages info + median_earnings_for_wage_workers_per_month_in_service_local_nominal_currency, + median_annual_service_local_wages, + # conversion per country + value, + + # type of PPP operation + transact, + transaction, + + # socio-demographic info + total_population + ) + +ppp_service_clean + + + +# --------- Calculating Adjusted Wages ------------- + +# --------- PPP GDP ------------- +# make PPP subset +# using PPP GDP: covers both final consumption expenditure (household and government) and gross capital formation +ppp_gdp <- ppp_service_clean %>% + filter(transact %in% c("PPPGDP")) + +# check: number of unique countries, regions: +unique(ppp_gdp$admin_country_name) +cat("Number of unique countries in this subset", length(unique(ppp_gdp$admin_country_name))) +# 37 + +unique(ppp_gdp$rgn_name) +cat("Number of unique regions in this subset", length(unique(ppp_gdp$rgn_name))) +# 43 + + +# make new column of adjusted wages (by year by country)! + +# divide annual wage in local currency by PPP (local / USD, adjusted) + +adjusted_gdp <- ppp_gdp %>% + mutate(adjusted_wages = median_annual_service_local_wages / value) + + +# subset for viewing + +adjusted_gdp %>% select(admin_country_name, rgn_name, year, value, median_annual_service_local_wages, adjusted_wages) + + +max(adjusted_gdp$adjusted_wages) +mean(adjusted_gdp$adjusted_wages) + +``` + + +Super basic exploratory plots + +```{r} + +gdp_plot <- adjusted_gdp %>% + ggplot() + + geom_line(aes(x = year, y = adjusted_wages, + color = admin_country_name)) + + theme_bw() + + labs(title = "Adjusted Service Wages using PPP GDP", + x = "Year", + y = "Adjusted Wages") + +gdp_plot +``` + + +There is a clear, significant drop off in data points following 2016. We are still investigating as to why this is the case. + + + +```{r} + +gdp_year_subset <- adjusted_gdp %>% + filter(year >=2013) + + +gdp_subset_plot <- gdp_year_subset %>% + ggplot() + + geom_line(aes(x = year, y = adjusted_wages, + color = admin_country_name)) + + theme_bw() + + labs(title = "Adjusted Service Wages using PPP GDP (2013-)", + x = "Year", + y = "Adjusted Wages") + +gdp_subset_plot +``` + + +```{r} +library(plotly) + +line_plot <- plot_ly(gdp_year_subset, x = ~as.numeric(year), + y = ~adjusted_wages, color = ~admin_country_name, + type = "scatter", mode = "lines") %>% + layout(title = "All Regions: Median Wages Line Chart", + xaxis = list(title = "Year"), + yaxis = list(title = "PPP GDP Adjusted Median Annual Wages (USD)")) + +line_plot +``` + + + + + +# Employment Data + +```{r} + + +un_jobs_data_raw <- read_xlsx(un_job_data, col_names = TRUE, sheet = "Employment", skip = 2, na = c("", "..")) %>% + janitor::clean_names() + +un_jobs_clean <- un_jobs_data_raw %>% + select(-c(1:3)) %>% + fill(1, .direction = "down") %>% + select(-c(x5)) %>% + select(-c(3:6)) %>% + filter(x6 %in% c("Total")) %>% + select(-c(x6, x38)) + +# Now we can pivot our data from wide to long -- get yeras in 1 column, values in another! +un_jobs_pivot <- un_jobs_clean %>% + pivot_longer(cols = 2:28, + names_to = "year", + values_to = "thousand_jobs" + ) + + +# We still have a lot of tidying to do.... + +# -------- Tidy ---------- + +un_pivot_clean <- un_jobs_pivot %>% + # clean country names + rename(country = basic_data_and_indicators) %>% + mutate(country = str_to_title(country)) %>% + # clean up years (still in the format of x2020) + mutate(year = str_remove_all(year, "x")) + +# using countrycode::countrycode +# and following this example on Rpubs: https://rpubs.com/Teal_Emery/cleaning_intl_data_tips_and_tricks + +country_regex_to_iso3c <- function(country_string) { + country_string %>% + countrycode::countrycode(origin = "country.name", destination = "iso3c", origin_regex = TRUE) +} + + +# adding iso3 codes +un_pivot_clean_codes <- un_pivot_clean %>% + mutate(iso3c = country_regex_to_iso3c(country)) + + +# read in OHI regions for joining +region_names <- read_csv("https://raw.githubusercontent.com/OHI-Science/ohi-global/draft/eez/spatial/regions_list.csv") %>% + janitor::clean_names() + + +jobs_rgn_join <- left_join(region_names, un_pivot_clean_codes, by = c("eez_iso3" = "iso3c")) %>% + select(-c(notes)) + + +# test: drop rows where thousand_jobs is na + +jobs_no_na <- jobs_rgn_join %>% + drop_na(thousand_jobs) + + +length(unique(jobs_no_na$country)) +length(unique(jobs_no_na$rgn_name)) + + +# drop country column +jobs_rgn_join_ohi <- jobs_no_na %>% + select(-country) +``` + + +Preliminary plot + +```{r} +jobs_data_final <- jobs_rgn_join_ohi %>% + mutate(year = as.numeric(year)) + + +ggplot(jobs_data_final) + + geom_line(aes(x = year, y = thousand_jobs, + color = rgn_name)) + + theme_bw() + + theme(legend.position = "none") + + +jobs_plot_df <- jobs_data_final %>% + filter(year >= 2013) %>% + mutate(year = as.integer(year)) + +length(unique(jobs_plot_df$rgn_id)) + +ggplot(jobs_plot_df) + + geom_line(aes(x = year, y = thousand_jobs, + color = rgn_name)) + + theme_bw() + + labs( + title = "Preliminary plot of jobs (thousands) over time (2013-2021)", + x = "Year", + y = "Jobs (Thousands)" + ) + + theme(legend.position = "none") + +``` + + +Interactive `plotly` version: +```{r} +plotly::plot_ly(jobs_plot_df, x = ~year, y = ~thousand_jobs, color = ~admin_country_name, type = "scatter", mode = "lines") %>% + layout(title = "All Regions: Number of Tourism Jobs per Country over Time", + xaxis = list(title = "Year"), + yaxis = list(title = "Number of people working in the tourism sector")) +``` + + + +India is seriously skewing the scales of this plot -- this, among general curiosity and conversation about how we should weight the number of jobs by the size of countries led us to create a proportional employment variable. + + + +# COME BACK +(update names -- copied from other doc w diff variable names so this chunk won't work) + + +```{r} + +jobs_wages_join <- left_join(jobs_rgn_2012, pop_ppp_converted, by = c("rgn_id", "year")) + +jobs_wages_proportional_employment <- jobs_wages_join %>% + mutate(total_tourism_employees = thousand_employees*1000) %>% + dplyr::relocate(total_tourism_employees, .before = total_population) %>% + mutate(proportional_tourism_employment = total_tourism_employees/total_population) %>% + select(-c(notes.x, notes.y)) + +proportional_employement_nona <- jobs_wages_proportional_employment %>% + na.omit() %>% + mutate(year = as.numeric(year)) + +ggplot(proportional_employement_nona, aes(x = year, y = proportional_tourism_employment, color = rgn_id)) + + geom_line() + +line_plot <- plotly::plot_ly(proportional_employement_nona, x = ~year, y = ~proportional_tourism_employment, color = ~admin_country_name.x, type = "scatter", mode = "lines") + layout(title = "All Regions: Proportional Employment Line Chart", + xaxis = list(title = "Year"), + yaxis = list(title = "Number of people employed by the tourism sector")) + +line_plot + + +penona_pivot <- proportional_employement_nona %>% + pivot_longer(cols = c(proportional_tourism_employment, yearly_wages_ppp_adjusted_by_year), names_to = "metric", values_to = "value") +# %>% + filter(admin_country_name.x %in% c("Indonesia", "Brazil", "Cyprus", "Argentina", "India")) + +ggplot(penona_pivot, aes(x = as.numeric(year), y = value, color = admin_country_name.x)) + + geom_line() + + facet_wrap(~metric, scales = "free", ncol = 1) +``` + diff --git a/globalprep/le/v2024/wb_service_dataprep.Rmd b/globalprep/le/v2024/wb_service_dataprep.Rmd index 85091d2b..b56e4565 100644 --- a/globalprep/le/v2024/wb_service_dataprep.Rmd +++ b/globalprep/le/v2024/wb_service_dataprep.Rmd @@ -284,21 +284,28 @@ prc_plot <- adjusted_prc %>% geom_line(aes(x = year, y = adjusted_wages, color = admin_country_name)) + theme_bw() + - labs(title = "Adjusted Service Wages using PPP PRC") + labs(title = "Adjusted Service Wages using PPP PRC", + x = "Year", + y = "Adjusted Wages" + ) gdp_plot <- adjusted_gdp %>% ggplot() + geom_line(aes(x = year, y = adjusted_wages, color = admin_country_name)) + theme_bw() + - labs(title = "Adjusted Service Wages using PPP GDP") + labs(title = "Adjusted Service Wages using PPP GDP", + x = "Year", + y = "Adjusted Wages") p41_plot <- adjusted_p41 %>% ggplot() + geom_line(aes(x = year, y = adjusted_wages, color = admin_country_name)) + theme_bw() + - labs(title = "Adjusted Service Wages using PPP P41") + labs(title = "Adjusted Service Wages using PPP P41", + x = "Year", + y = "Adjusted Wages") @@ -307,3 +314,40 @@ gdp_plot p41_plot ``` + +There is a clear, significant drop off in data points following 2016. We are still investigating as to why this is the case. + + + +```{r} + +gdp_year_subset <- adjusted_gdp %>% + filter(year >=2013) + + +gdp_subset_plot <- gdp_year_subset %>% + ggplot() + + geom_line(aes(x = year, y = adjusted_wages, + color = admin_country_name)) + + theme_bw() + + labs(title = "Adjusted Service Wages using PPP GDP (2013-)", + x = "Year", + y = "Adjusted Wages") + +gdp_subset_plot +``` + + +```{r} +library(plotly) + +line_plot <- plot_ly(gdp_year_subset, x = ~as.numeric(year), + y = ~adjusted_wages, color = ~admin_country_name, + type = "scatter", mode = "lines") %>% + layout(title = "All Regions: Median Wages Line Chart", + xaxis = list(title = "Year"), + yaxis = list(title = "PPP GDP Adjusted Median Annual Wages (USD)")) + +line_plot +``` +