Skip to content

Latest commit

 

History

History
311 lines (227 loc) · 9.3 KB

readme.md

File metadata and controls

311 lines (227 loc) · 9.3 KB

Employed Status

Please note that the #DuboisChallenge is still on going! It might be interesting to revisit that data from last week OR apply the same visual techniques to this modern data.

The data this week comes from the BLS, specifically table cpsaat17 across several years.

Employed persons by industry, sex, race, and occupation

I went ahead and downloaded the data by year for the past 6 years (2015 to 2020).

I also scraped some earnings data from their self-service data tool, which provided an interesting challenge! If you want to play around with some "offline" web scraping it's worth a shot.

Weekly and hourly earnings data from the Current Population Survey

The BLS has a corresponding article with similar summary data.

Get the data here

# Get the Data

# Read in with tidytuesdayR package 
# Install from CRAN via: install.packages("tidytuesdayR")
# This loads the readme and all the datasets for the week of interest

# Either ISO-8601 date or year/week works!

tuesdata <- tidytuesdayR::tt_load('2021-02-23')
tuesdata <- tidytuesdayR::tt_load(2021, week = 9)

employed <- tuesdata$employed

# Or read in the data manually

employed <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv')

earn <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/earn.csv')

Data Dictionary

employed.csv

Employed persons by industry, sex, race, and occupation

variable class description
industry character Industry group
major_occupation character Major Occupation category
minor_occupation character Minor Occupation Categoru
race_gender character Race or gender group (total, men/women, race)
industry_total double Industry total
employ_n double Employed number
year integer Year

earn.csv

Weekly median earnings and number of persons employed by race/gender/age group over time

variable class description
sex character Gender
race character Racial group
ethnic_origin character Ethnic origin (hispanic or non-hispanic)
age character Age group
year integer Year
quarter integer Quarter
n_persons double Number of persons employed by group
median_weekly_earn integer Median weekly earning in current dollars

Cleaning Script

This was an interesting problem, and I think I have a decent reproducible pipeline for this one!

Note the data is already cleaned, but feel free to test the below out for yourself.

#basic url
# https://www.bls.gov/cps/aa2020/cpsaat17.xlsx

library(tidyverse)
library(glue)


get_bls_report <- function(year){
  
  report_url <- glue::glue("https://www.bls.gov/cps/aa{year}/cpsaat17.xlsx")
  
  download.file(report_url, destfile = glue("2021/2021-02-23/bls-{year}.xlsx"))
}

ex_2019 <- readxl::read_excel("2021/2021-02-23/bls-2019.xlsx")

2015:2019 %>% 
  walk(get_bls_report)

# 2020 has no year in front of it
download.file(
  "https://www.bls.gov/cps/cpsaat17.xlsx", 
  destfile = "2021/2021-02-23/bls-2020.xlsx"
  )

# Raw BLS -----------------------------------------------------------------

raw_2020 <- readxl::read_excel("2021/2021-02-23/bls-2020.xlsx", skip = 3) %>% 
  slice(1:(n()-2))

major_grp <- raw_2020 %>% 
  slice(1) %>% 
  select(3:last_col()) %>% 
  set_names(nm = glue::glue("...{1:ncol(.)}")) %>% 
  pivot_longer(cols = everything(), values_to = "major_grp") 

minor_grp <- raw_2020 %>% 
  slice(2) %>% 
  select(3:last_col()) %>% 
  set_names(nm = glue::glue("...{1:ncol(.)}")) %>% 
  pivot_longer(cols = everything(), values_to = "minor_grp") 

combo_grp <- left_join(major_grp, minor_grp, by = "name") %>% 
  mutate(across(.fns = ~str_replace_all(.x, "\n", " "))) %>% 
  mutate(across(.fns = ~str_remove_all(.x, "\r"))) %>% 
  mutate(across(.fns = ~str_replace_all(.x, "- ", ""))) %>% 
  tidyr::fill(major_grp)

name_fill <- c("race_gender", "category", "total", glue("...{1:11}"))

clean_2020 <- raw_2020 %>% 
  rename(category = 1) %>% 
  mutate(
    race_gender = if_else(
      str_detect(category, "Agriculture and related"),
      lag(category),
      NA_character_
      ),
    .before = category
    ) %>% 
  fill(race_gender) %>% 
  slice(5:n()) %>% 
  set_names(nm = name_fill) %>% 
  pivot_longer(cols = contains("..."), names_to = "name", values_to = "employ_n") %>% 
  left_join(combo_grp, by = "name") %>% 
  mutate(year = 2020) %>% 
  select(category, major_grp, minor_grp, race_gender, cat_total = total, employ_n)

# Make a function!

clean_bls <- function(year){
  
  raw_df <- readxl::read_excel(glue("2021/2021-02-23/bls-{year}.xlsx"), skip = 3) %>% 
    slice(1:(n()-2))
  
  major_grp <- raw_df %>% 
    slice(1) %>% 
    select(3:last_col()) %>% 
    set_names(nm = glue::glue("...{1:ncol(.)}")) %>% 
    pivot_longer(cols = everything(), values_to = "major_grp") 
  
  minor_grp <- raw_df %>% 
    slice(2) %>% 
    select(3:last_col()) %>% 
    set_names(nm = glue::glue("...{1:ncol(.)}")) %>% 
    pivot_longer(cols = everything(), values_to = "minor_grp") 
  
  combo_grp <- left_join(major_grp, minor_grp, by = "name") %>% 
    mutate(across(.fns = ~str_replace_all(.x, "\n", " "))) %>% 
    mutate(across(.fns = ~str_remove_all(.x, "\r"))) %>% 
    mutate(across(.fns = ~str_replace_all(.x, "- ", ""))) %>% 
    tidyr::fill(major_grp)
  
  name_fill <- c("race_gender", "category", "total", glue("...{1:11}"))
  
  clean_df <- raw_df %>% 
    rename(category = 1) %>% 
    mutate(
      race_gender = if_else(
        str_detect(category, "Agriculture and related"),
        lag(category),
        NA_character_
      ),
      .before = category
    ) %>% 
    fill(race_gender) %>% 
    slice(5:n()) %>% 
    set_names(nm = name_fill) %>% 
    pivot_longer(cols = contains("..."), names_to = "name", values_to = "employ_n") %>% 
    left_join(combo_grp, by = "name") %>% 
    select(industry = category, major_occupation = major_grp, minor_occupation = minor_grp, race_gender, industry_total = total, employ_n) %>% 
    mutate(year = year)
   
  clean_df
  
}

# combine the data

all_bls <- 2015:2020 %>% 
  map_dfr(clean_bls) %>% 
  arrange(desc(year)) %>% 
  mutate(
    industry_total = as.integer(industry_total) * 1000,
    employ_n = as.integer(employ_n) * 1000
    )

# sanity check plot

all_bls %>% 
  filter(race_gender == "Black or African American") %>% 
  filter(minor_occupation == "Sales and related occupations") %>% 
  ggplot(aes(x = year, y = employ_n, group = industry)) +
  geom_line()

all_bls %>% 
  write_csv("2021/2021-02-23/employed.csv")
  
### Here I'm reading against the HTML file included

library(rvest)

raw_html <- read_html("2021/2021-02-23/bls-all.htm")

raw_html %>% 
  html_nodes("table.catalog") %>% 
  html_table()

all_catalog_raw <- raw_html %>% 
  html_nodes("table.catalog") %>% 
  html_table() 


all_catalog_clean <- all_catalog_raw %>% 
  map(clean_catalog)

clean_catalog <- function(table){
  
  table %>% 
    pivot_wider(names_from = X1, values_from = X2) %>% 
    janitor::clean_names()
  
}

clean_catalog(all_catalog_raw[[2]])


all_table_raw <- raw_html %>% 
  html_nodes("table.regular-data")%>% 
  html_table() 

all_table_clean <- all_table_raw %>% 
  map_dfr(clean_table, .id = "id") %>% 
  mutate(id = as.integer(id))

clean_table <- function(table){
  
  table %>% 
    filter(str_detect(Year, "Corrected", negate = TRUE)) %>% 
    mutate(across(everything(), ~str_remove(.x, "\\(C\\)"))) %>% 
    pivot_longer(Qtr1:Qtr4, names_to = "quarter", values_to = "value") %>% 
    rename(year = Year)
  
}

combine_tables <- bind_rows(all_table_clean) %>% 
  mutate(id = 1:n())

combined_data <- all_catalog_clean %>% 
  bind_rows() %>% 
  mutate(id = row_number()) %>% 
  left_join(all_table_clean, by = "id") %>% 
  mutate(
    year = as.integer(year), 
    quarter = str_remove(quarter, "Qtr") %>% as.integer(),
    value = as.integer(value)
  ) 

data_earn <- combined_data %>% 
  filter(earnings != "Person counts (number in thousands)") %>% 
  rename(median_weekly_earn = value) %>% 
  select(industry:last_col(), -id)

data_earn 

final_bls_earn <- combined_data %>%
  filter(earnings == "Person counts (number in thousands)") %>%
  rename(n_persons = value) %>%
  mutate(n_persons = n_persons * 1000) %>%
  select(industry:quarter, n_persons, -id) %>%
  left_join(
    data_earn,
    by = c(
      "industry", "occupation", "sex", "race", "ethnic_origin", "age", 
      "education", "class_of_worker", "labor_force_status", "year", "quarter"
      )
  ) %>% 
  select(sex, race, ethnic_origin, age, year:median_weekly_earn)

final_bls_earn %>% 
  write_csv("2021/2021-02-23/earn.csv")

# sanity check
final_bls_earn %>% 
  filter(quarter == 2, sex == "Both Sexes", race != "All Races") %>% 
  ggplot(aes(x = year, y = median_weekly_earn, color = race)) +
  geom_line() +
  facet_wrap(~age)
  distinct(quarter)