diff --git a/.Rbuildignore b/.Rbuildignore index c0d769a..d17fd8e 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,3 +11,4 @@ ^images_for_guide$ ^doc$ ^Meta$ +^CRAN-RELEASE$ diff --git a/CRAN-RELEASE b/CRAN-RELEASE new file mode 100644 index 0000000..0c7fce6 --- /dev/null +++ b/CRAN-RELEASE @@ -0,0 +1,2 @@ +This package was submitted to CRAN on 2019-07-30. +Once it is accepted, delete this file and tag the release (commit bb6db3fa30). diff --git a/DESCRIPTION b/DESCRIPTION index 6ae5cc3..2f910d2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,7 +4,7 @@ Description: Datasets and code published by the data journalism website 'FiveThirtyEight' available at . Note that while we received guidance from editors at 'FiveThirtyEight', this package is not officially published by 'FiveThirtyEight'. -Version: 0.4.0.9000 +Version: 0.5.0 Authors@R: c( person("Albert Y.", "Kim", email = "albert.ys.kim@gmail.com", role = c("aut", "cre")), person("Chester", "Ismay", email = "chester.ismay@gmail.com", role = "aut"), @@ -33,6 +33,7 @@ Suggests: ggplot2, dplyr, tidyr, + curl, readr, tibble, lubridate, @@ -44,15 +45,5 @@ Suggests: scales, broom, magrittr, - rmarkdown, - slam (>= 0.1-42), - highcharter (>= 0.7), - tidytext, - textdata, - hunspell, - fmsb, - wordcloud, - corrplot, - ggraph, - igraph + rmarkdown VignetteBuilder: knitr diff --git a/NEWS.md b/NEWS.md index f6ec534..6f85b18 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# fivethirtyeight 0.4.0.9000 +# fivethirtyeight 0.5.0 * Added vignette corresponding to Technology Innovations in Statistics Education [paper](https://escholarship.org/uc/item/0rx1231m#main) * Removed tidyverse from `DESCRIPTION` Depends, Imports, or Suggests fields diff --git a/R/data_meredith.R b/R/data_meredith.R index 7edce10..6412740 100644 --- a/R/data_meredith.R +++ b/R/data_meredith.R @@ -284,20 +284,18 @@ #' Data was collected on August 27, 2017 between 12:05 a.m. and 1:15 a.m. EDT #' using the Twitter streaming API. \url{https://github.com/fivethirtyeight/data/tree/master/mayweather-mcgregor} #' @examples -#' # To obtain the entire dataset, run the code inside the following if statement: -#' if(FALSE){ -#' library(dplyr) -#' library(tidyr) -#' library(readr) -#' url <- -#' "https://raw.githubusercontent.com/fivethirtyeight/data/master/mayweather-mcgregor/tweets.csv" -#' mayweather_mcgregor_tweets <- read_csv(url) %>% -#' mutate( -#' emojis = as.logical(emojis), -#' retweeted = as.logical(retweeted), -#' id = as.character(id) -#' ) -#' } +#' # To obtain the entire dataset, run the following code: +#' library(dplyr) +#' library(readr) +#' +#' mayweather_mcgregor_tweets <- +#' "https://raw.githubusercontent.com/fivethirtyeight/data/master/mayweather-mcgregor/tweets.csv" %>% +#' read_csv() %>% +#' mutate( +#' emojis = as.logical(emojis), +#' retweeted = as.logical(retweeted), +#' id = as.character(id) +#' ) "mayweather_mcgregor_tweets" diff --git a/R/data_starry.R b/R/data_starry.R index 801d75f..ebe4a83 100644 --- a/R/data_starry.R +++ b/R/data_starry.R @@ -150,9 +150,9 @@ #' \url{https://projects.fivethirtyeight.com/2018-midterm-election-forecast/house/} #' #' @format Because of R package size restrictions, only a preview of the -#' first 10 rows of this dataset is included; to obtain the entire dataset -#' see Examples below. The preview is a data frame with 10 rows representing -#' district-level results of the classic, lite, and deluxe house forecasts +#' first 10 rows of this dataset is included; to obtain the entire dataset +#' see Examples below. 
The preview is a data frame with 10 rows representing +#' district-level results of the classic, lite, and deluxe house forecasts #' since 2018/08/01 and 11 variables. #' \describe{ #' \item{forecastdate}{date of the forecast} @@ -172,14 +172,16 @@ #' Methodology: #' \url{https://fivethirtyeight.com/methodology/how-fivethirtyeights-house-and-senate-models-work/} #' @seealso \code{\link{house_national_forecast}} -#' @examples +#' @examples +#' if(FALSE){ +#' #' # To obtain the entire dataset, run the following code: #' library(readr) #' library(dplyr) #' library(janitor) -#' -#' house_district_forecast <- -#' "https://projects.fivethirtyeight.com/congress-model-2018/house_district_forecast.csv" %>% +#' +#' house_district_forecast <- +#' "https://projects.fivethirtyeight.com/congress-model-2018/house_district_forecast.csv" %>% #' read_csv() %>% #' clean_names() %>% #' mutate( @@ -189,6 +191,8 @@ #' model = as.factor(model) #' ) %>% #' select(-special) +#' +#' } "house_district_forecast" #' 2018 House Forecast @@ -197,7 +201,7 @@ #' 'Forecasting the race for the House' #' \url{https://projects.fivethirtyeight.com/2018-midterm-election-forecast/house/} #' -#' @format A dataframe with 588 rows representing district-level results of the +#' @format A dataframe with 588 rows representing district-level results of the #' classic, lite, and deluxe house forecasts since 2018/08/01 and 11 variables. #' \describe{ #' \item{forecastdate}{date of the forecast} diff --git a/README.Rmd b/README.Rmd index d1ac49e..74977b6 100755 --- a/README.Rmd +++ b/README.Rmd @@ -10,6 +10,7 @@ knitr::opts_chunk$set( comment = "#>", fig.path = "README-" ) +library(fivethirtyeight) ``` @@ -53,15 +54,13 @@ library(fivethirtyeight) data(bechdel) head(bechdel) ?bechdel + # If using RStudio: View(bechdel) +``` -# To see a list of all data sets: -data(package = "fivethirtyeight") +To see a detailed list of all `r nrow(data(package = "fivethirtyeight")[[3]])` datasets, including information on the corresponding articles published on FiveThirtyEight.com, click [here](https://fivethirtyeight-r.netlify.com/articles/fivethirtyeight.html). -# To see a more detailed list of all data sets, see the package vignette: -vignette("fivethirtyeight", package = "fivethirtyeight") -``` ## Article in "Technology Innovations in Statistics Education" diff --git a/README.md b/README.md index 79fde01..393f789 100644 --- a/README.md +++ b/README.md @@ -42,16 +42,15 @@ library(fivethirtyeight) data(bechdel) head(bechdel) ?bechdel + # If using RStudio: View(bechdel) - -# To see a list of all data sets: -data(package = "fivethirtyeight") - -# To see a more detailed list of all data sets, see the package vignette: -vignette("fivethirtyeight", package = "fivethirtyeight") ``` +To see a detailed list of all 127 datasets, including information on the +corresponding articles published on FiveThirtyEight.com, click +[here](https://fivethirtyeight-r.netlify.com/articles/fivethirtyeight.html). + ## Article in “Technology Innovations in Statistics Education” The `fivethirtyeight` package was featured in [The fivethirtyeight R diff --git a/cran-comments.md b/cran-comments.md index 23f2e61..0cbd431 100755 --- a/cran-comments.md +++ b/cran-comments.md @@ -2,15 +2,25 @@ This is a resubmission. 
In this version I have: -* Fixed the DESCRIPTION file: -* Lowered the size of the package tarball -* Replaced all instances of FiveThirtyEight and 538 with `FiveThirtyEight` +* Fixed the NOTEs involving URLs with spaces +* Fixed the ERROR involving reading a CSV file off the web + ## Test environments -* local OS X install, R 3.4.1 -* win-builder (release) +* local OS X install, R 3.6.1 +* Ubuntu 14.04 (on travis-ci), R 3.6.1 +* Rhub + + Windows Server 2008 R2 SP1, R-devel, 32/64 bit + + Ubuntu Linux 16.04 LTS, R-release, GCC + + Fedora Linux, R-devel, clang, gfortran +* win-builder (devel and release) + ## R CMD check results -There were no ERRORs, only two NOTES on package size. +There were no ERRORs or WARNINGs. There were 3 NOTEs: + +* From Rhub (R-devel): one example had CPU or elapsed time > 5s (`comic_characters`), but this NOTE did not occur in any other test environment. +* From Rhub (R-release): one example had CPU or elapsed time > 5s (`ratings`), but this NOTE did not occur in any other test environment. +* From win-builder (both devel and release): 4 URLs were flagged as possibly invalid; however, upon testing them individually, they all worked fine. diff --git a/data-raw/process_data_sets_meredith.R b/data-raw/process_data_sets_meredith.R index 2c1daff..c088312 100644 --- a/data-raw/process_data_sets_meredith.R +++ b/data-raw/process_data_sets_meredith.R @@ -120,7 +120,8 @@ mayweather_mcgregor_tweets <- read_csv("data-raw/mayweather-mcgregor/tweets.csv" emojis = as.logical(emojis), retweeted = as.logical(retweeted), id = as.character(id) - ) + ) %>% + slice(1:10) usethis::use_data(mayweather_mcgregor_tweets, overwrite = TRUE) diff --git a/man/house_district_forecast.Rd b/man/house_district_forecast.Rd index e38639a..45b44e9 100644 --- a/man/house_district_forecast.Rd +++ b/man/house_district_forecast.Rd @@ -5,9 +5,9 @@ \alias{house_district_forecast} \title{2018 House Forecast} \format{Because of R package size restrictions, only a preview of the -first 10 rows of this dataset is included; to obtain the entire dataset -see Examples below. The preview is a data frame with 10 rows representing -district-level results of the classic, lite, and deluxe house forecasts +first 10 rows of this dataset is included; to obtain the entire dataset +see Examples below. The preview is a data frame with 10 rows representing +district-level results of the classic, lite, and deluxe house forecasts since 2018/08/01 and 11 variables. \describe{ \item{forecastdate}{date of the forecast} @@ -39,13 +39,15 @@ The raw data behind the story the original dataset included an empty column "special", which was removed.
} \examples{ +if(FALSE){ + # To obtain the entire dataset, run the following code: library(readr) library(dplyr) library(janitor) -house_district_forecast <- - "https://projects.fivethirtyeight.com/congress-model-2018/house_district_forecast.csv" \%>\% +house_district_forecast <- + "https://projects.fivethirtyeight.com/congress-model-2018/house_district_forecast.csv" \%>\% read_csv() \%>\% clean_names() \%>\% mutate( @@ -55,6 +57,8 @@ house_district_forecast <- model = as.factor(model) ) \%>\% select(-special) + +} } \seealso{ \code{\link{house_national_forecast}} diff --git a/man/house_national_forecast.Rd b/man/house_national_forecast.Rd index f7bd783..3bef992 100644 --- a/man/house_national_forecast.Rd +++ b/man/house_national_forecast.Rd @@ -4,7 +4,7 @@ \name{house_national_forecast} \alias{house_national_forecast} \title{2018 House Forecast} -\format{A dataframe with 588 rows representing district-level results of the +\format{A dataframe with 588 rows representing district-level results of the classic, lite, and deluxe house forecasts since 2018/08/01 and 11 variables. \describe{ \item{forecastdate}{date of the forecast} diff --git a/man/mayweather_mcgregor_tweets.Rd b/man/mayweather_mcgregor_tweets.Rd index 5a3f5f5..258c603 100644 --- a/man/mayweather_mcgregor_tweets.Rd +++ b/man/mayweather_mcgregor_tweets.Rd @@ -32,19 +32,17 @@ The raw data behind the story \url{https://fivethirtyeight.com/?post_type=fte_features&p=161615}. } \examples{ -# To obtain the entire dataset, run the code inside the following if statement: -if(FALSE){ - library(dplyr) - library(tidyr) - library(readr) - url <- - "https://raw.githubusercontent.com/fivethirtyeight/data/master/mayweather-mcgregor/tweets.csv" - mayweather_mcgregor_tweets <- read_csv(url) \%>\% - mutate( - emojis = as.logical(emojis), - retweeted = as.logical(retweeted), - id = as.character(id) - ) -} +# To obtain the entire dataset, run the following code: +library(dplyr) +library(readr) + +mayweather_mcgregor_tweets <- + "https://raw.githubusercontent.com/fivethirtyeight/data/master/mayweather-mcgregor/tweets.csv" \%>\% + read_csv() \%>\% + mutate( + emojis = as.logical(emojis), + retweeted = as.logical(retweeted), + id = as.character(id) + ) } \keyword{datasets} diff --git a/vignettes/NBA.Rmd b/vignettes/NBA.Rmd deleted file mode 100644 index a34c7bb..0000000 --- a/vignettes/NBA.Rmd +++ /dev/null @@ -1,72 +0,0 @@ ---- -title: "2015 NBA player predictions analysis using the `tidyverse`" -author: "G. Elliott Morris and Chester Ismay" -date: "`r Sys.Date()`" -output: - rmarkdown::html_vignette: - df_print: kable -vignette: | - %\VignetteIndexEntry{NBA player predictions tidyverse analysis} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -This vignette is based on data collected for the 538 story entitled "Projecting The Top 50 Players In The 2015 NBA Draft Class" by Neil Paine and Zach Bradshaw available [here](https://fivethirtyeight.com/features/projecting-the-top-50-players-in-the-2015-nba-draft-class/). - -First, we load the required packages to reproduce analysis. - -```{r, message=FALSE, warning=FALSE} -library(fivethirtyeight) -library(ggplot2) -library(dplyr) -library(ggthemes) -library(knitr) -# Turn off scientific notation -options(scipen = 99) -``` - -# Group the projected statistical plus-minus by year - -The `nba_draft_2015` data frame features National Basketball Association players and prospects. Each player has a `draft_year` that corresponds to the year the player was drafted into the NBA, if at all. 
We are interested in analyzing the average `projected_spm` for each draft year. Here, `projected_spm` corresponds to FiveThirtyEight's model of projected statistical plus-minus over years 2-5 of the player's NBA career. Plus-minus is defined by [BasketballReference.com](https://www.basketball-reference.com/about/bpm.html) as a "box score-based metric for evaluating basketball players' quality and contribution to the team." It is measured on a per 100 possessions basis to factor out playing time and adjusted so that the score is relative to the average NBA player. Further, from BasketballReference: "0.0 is league average, +5 means the player is 5 points better than an average player over 100 possessions (which is about All-NBA level), -2 is replacement level, and -5 is really bad." - -```{r data_year} -nba_yearly <- nba_draft_2015 %>% - group_by(draft_year) %>% - summarise(mean_proj_spm = mean(projected_spm)) -nba_yearly -``` - -# Graph it! - -Now that we have calculated the mean projected plus-minus for each draft year, let's plot it to better understand which draft class was projected to have the most impact on team success. - -```{r graph, fig.width=7} -ggplot(nba_yearly, aes(x = draft_year, y = mean_proj_spm, fill = mean_proj_spm)) + - geom_col() + - theme_fivethirtyeight() + - labs(title = "Which NBA draft class was best?", - subtitle = "As measured by the mean player/prospect's projected plus-minus", - caption = "Data from FiveThirtyEight") + - theme(legend.position = "none", - plot.title = element_text(face = "bold", size = 20), - plot.subtitle = element_text(size = 12), - plot.caption = element_text(hjust = 0, size = 10)) -``` - -The overall trend here is that NBA draft classes, on average, have shown an increase in mean projected plus-minus since 2003. Based on this analysis, 2015 is the best draft class since it has the highest mean projected plus-minus of any year. Another way to think of this is that 2015 was the most balanced based on this metric. This plot also shows that the draft classes have tended to be projected as having more impact on team success over time. - -# So what about 2003 and 2004? - -So who were the players in that 2003 NBA draft class? And what about 2004, which has the second-largest (in magnitude) average projected plus-minus? Let's explore the top three `projected_spm` values for 2003 and 2004: - -```{r} -nba_draft_2015 %>% - filter(draft_year %in% c(2003, 2004)) %>% - group_by(draft_year) %>% - top_n(projected_spm, n = 3) %>% - select(player, position, draft_year, projected_spm) -``` - -If you are familiar with NBA basketball, these names will stick out to you. Dwyane Wade and Carmelo Anthony are perennial All-Stars and Andre Iguodala was the 2015 NBA Finals MVP. (Note that LeBron James is not in this data.) - -The surprising thing here is that these great players were not able to counterbalance the players with low `projected_spm`. Remember that the mean was used here, so even if the great players were outliers (in the positive direction) they weren't able to pull the mean in their direction. Also note that not all of the players listed in this data set ended up playing in the NBA or, if they did, may have played only a small amount of time.
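Since this argument leans on the mean, a quick robustness check is to compare it against the median, which outliers cannot drag around. A minimal sketch using the packages already loaded above (not part of the original analysis):

```{r}
# Compare the outlier-sensitive mean with the robust median for 2003 and 2004
nba_draft_2015 %>%
  filter(draft_year %in% c(2003, 2004)) %>%
  group_by(draft_year) %>%
  summarise(
    mean_proj_spm = mean(projected_spm),
    median_proj_spm = median(projected_spm)
  )
```

If the median sits well above the mean, that confirms a handful of very low `projected_spm` values are pulling the average down.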
diff --git a/vignettes/bechdel.Rmd b/vignettes/bechdel.Rmd deleted file mode 100644 index fa33807..0000000 --- a/vignettes/bechdel.Rmd +++ /dev/null @@ -1,291 +0,0 @@ ---- -title: "Bechdel analysis using the `tidyverse`" -author: "Chester Ismay" -date: "`r Sys.Date()`" -output: - rmarkdown::html_vignette: - df_print: kable -vignette: | - %\VignetteIndexEntry{Bechdel analysis using the `tidyverse`} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -This vignette is based on tidyverse-ifying the R code [here](https://raw.githubusercontent.com/fivethirtyeight/data/master/bechdel/analyze-bechdel.R) and reproducing some of the plots and analysis done in the 538 story entitled "The Dollar-And-Cents Case Against Hollywood's Exclusion of Women" by Walt Hickey available [here](https://fivethirtyeight.com/features/the-dollar-and-cents-case-against-hollywoods-exclusion-of-women/). - -Load required packages to reproduce analysis. Also load the `bechdel` dataset for analysis. - -```{r, message=FALSE, warning=FALSE} -library(fivethirtyeight) -library(ggplot2) -library(dplyr) -library(knitr) -library(magrittr) -library(broom) -library(stringr) -library(ggthemes) -library(scales) -# Turn off scientific notation -options(scipen = 99) -``` - - -## Filter to only 1990 - 2013 - -Focus only on films from 1990 to 2013 - -```{r bechdel90_13} -bechdel90_13 <- bechdel %>% filter(between(year, 1990, 2013)) -``` - -## Calculate variables - -Create international gross only and return on investment (ROI) columns and add to `bechdel_90_13` data frame - -```{r mutate} -bechdel90_13 %<>% - mutate(int_only = intgross_2013 - domgross_2013, - roi_total = intgross_2013 / budget_2013, - roi_dom = domgross_2013 / budget_2013, - roi_int = int_only / budget_2013) -``` - -## Create `generous` variable - -```{r generous} -bechdel90_13 %<>% - mutate(generous = ifelse(test = clean_test %in% c("ok", "dubious"), - yes = TRUE, - no = FALSE)) -``` - -## Determine median ROI and budget based on categories - -```{r summary_ROI} -ROI_by_binary <- bechdel90_13 %>% - group_by(binary) %>% - summarize(median_ROI = median(roi_total, na.rm = TRUE)) -ROI_by_binary -bechdel90_13 %>% - summarize( - `Median Overall Return on Investment` = median(roi_total, na.rm = TRUE)) -``` - -```{r summary_budget} -budget_by_binary <- bechdel90_13 %>% - group_by(binary) %>% - summarize(median_budget = median(budget_2013, na.rm = TRUE)) -budget_by_binary -bechdel90_13 %>% - summarize(`Median Overall Budget` = median(budget_2013, na.rm = TRUE)) -``` - -## View Distributions - -Look at the distributions of budget, international gross, ROI, and their logarithms - -```{r budget-plot, fig.width = 5, warning = FALSE} -ggplot(data = bechdel90_13, mapping = aes(x = budget)) + - geom_histogram(color = "white", bins = 20) + - labs(title = "Histogram of budget") -``` - -```{r log-budget-plot, fig.width = 5, warning = FALSE} -ggplot(data = bechdel90_13, mapping = aes(x = log(budget))) + - geom_histogram(color = "white", bins = 20) + - labs(title = "Histogram of Logarithm of Budget") -``` - -```{r intgross-plot, fig.width = 5, warning = FALSE} -ggplot(data = bechdel90_13, mapping = aes(x = intgross_2013)) + - geom_histogram(color = "white", bins = 20) + - labs(title = "Histogram of International Gross") -``` - -```{r log-intgross-plot, fig.width = 5, warning = FALSE} -ggplot(data = bechdel90_13, mapping = aes(x = log(intgross_2013))) + - geom_histogram(color = "white", bins = 20) + - labs(title = "Histogram of Logarithm of International Gross") -``` - 
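The right skew visible in the histograms above can also be checked numerically: for a right-skewed variable the mean sits well above the median. A small sketch reusing the `bechdel90_13` data frame built earlier (not part of the original analysis):

```{r}
# Mean far above median indicates right skew in international gross
bechdel90_13 %>%
  summarize(
    mean_intgross = mean(intgross_2013, na.rm = TRUE),
    median_intgross = median(intgross_2013, na.rm = TRUE)
  )
```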
-```{r roi-plot, fig.width = 5, warning = FALSE} -ggplot(data = bechdel90_13, mapping = aes(x = roi_total)) + - geom_histogram(color = "white", bins = 20) + - labs(title = "Histogram of ROI") -``` - -The previous distributions were skewed, but ROI is so skewed that purposefully limiting the x-axis may reveal a bit more information about the distribution: (Suggested by [Mustafa Ascha](https://github.com/mustafaascha).) - -```{r roi-plot2, fig.width = 5, warning = FALSE} -ggplot(data = bechdel90_13, mapping = aes(x = roi_total)) + - geom_histogram(color = "white", bins = 20) + - labs(title = "Histogram of ROI") + - xlim(0, 25) -``` - -```{r log-roi-plot, fig.width = 5, warning = FALSE} -ggplot(data = bechdel90_13, mapping = aes(x = log(roi_total))) + - geom_histogram(color = "white", bins = 20) + - labs(title = "Histogram of Logarithm of ROI") -``` - -## Linear Regression Models - -### Movies with higher budgets make more international gross revenues using logarithms on both variables - -```{r scatplot1, fig.width = 5, warning=FALSE} -ggplot(data = bechdel90_13, - mapping = aes(x = log(budget_2013), y = log(intgross_2013))) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) -``` - -```{r reg1} -gross_vs_budget <- lm(log(intgross_2013) ~ log(budget_2013), - data = bechdel90_13) -tidy(gross_vs_budget) -``` - -### Bechdel dummy is not a significant predictor of `log(intgross_2013)` assuming `log(budget_2013)` is in the model - -Note that the regression lines nearly completely overlap. - -```{r scatplot2, fig.width = 5, warning=FALSE} -ggplot(data = bechdel90_13, - mapping = aes(x = log(budget_2013), y = log(intgross_2013), - color = binary)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) -``` - -```{r reg2} -gross_vs_budget_binary <- lm(log(intgross_2013) ~ log(budget_2013) + factor(binary), - data = bechdel90_13) -tidy(gross_vs_budget_binary) -``` - -Note the $p$-value on `factor(binary)PASS` here that is around 0.40. - -### Movies with higher budgets have lower ROI - -```{r scatplot3, warning=FALSE} -ggplot(data = bechdel90_13, - mapping = aes(x = log(budget_2013), y = log(roi_total))) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) -``` - -```{r reg3} -roi_vs_budget <- lm(log(roi_total) ~ log(budget_2013), - data = bechdel90_13) -tidy(roi_vs_budget) -``` - -Note the negative coefficient here on `log(budget_2013)` and its corresponding small $p$-value. - -### Bechdel dummy is not a significant predictor of `log(roi_total)` assuming `log(budget_2013)` is in the model - -Note that the regression lines nearly completely overlap. - -```{r scatplot4, warning=FALSE} -ggplot(data = bechdel90_13, - mapping = aes(x = log(budget_2013), y = log(roi_total), - color = binary)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) -``` - -```{r reg4} -roi_vs_budget_binary <- lm(log(roi_total) ~ log(budget_2013) + factor(binary), - data = bechdel90_13) -tidy(roi_vs_budget_binary) -``` - -Note the $p$-value on `factor(binary)PASS` here that is around 0.40. 
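Rather than reading the $p$-value off the printed table, it can be pulled out programmatically from the `tidy()` output. A small sketch using the `broom` and `dplyr` functions already loaded (the term name `factor(binary)PASS` matches the coefficient shown above):

```{r}
# Extract the p-value for the Bechdel-test dummy from the ROI model
tidy(roi_vs_budget_binary) %>%
  filter(term == "factor(binary)PASS") %>%
  pull(p.value)
```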
- -## Dollars Earned for Every Dollar Spent graphic - -Calculating the values and creating a tidy data frame - -```{r roi-graphic} -passes_bechtel_rom <- bechdel90_13 %>% - filter(generous == TRUE) %>% - summarize(median_roi = median(roi_dom, na.rm = TRUE)) -median_groups_dom <- bechdel90_13 %>% - filter(clean_test %in% c("men", "notalk", "nowomen")) %>% - group_by(clean_test) %>% - summarize(median_roi = median(roi_dom, na.rm = TRUE)) -pass_bech_rom <- tibble(clean_test = "pass", - median_roi = passes_bechtel_rom$median_roi) -med_groups_dom_full <- bind_rows(pass_bech_rom, median_groups_dom) %>% - mutate(group = "U.S. and Canada") -``` - -```{r roi-graphic2, fig.width=5} -passes_bechtel_int <- bechdel90_13 %>% - filter(generous == TRUE) %>% - summarize(median_roi = median(roi_int, na.rm = TRUE)) -median_groups_int <- bechdel90_13 %>% - filter(clean_test %in% c("men", "notalk", "nowomen")) %>% - group_by(clean_test) %>% - summarize(median_roi = median(roi_int, na.rm = TRUE)) -pass_bech_int <- tibble(clean_test = "pass", - median_roi = passes_bechtel_int$median_roi) -med_groups_int_full <- bind_rows(pass_bech_int, median_groups_int) %>% - mutate(group = "International") -med_groups <- bind_rows(med_groups_dom_full, med_groups_int_full) %>% - mutate(clean_test = str_replace_all(clean_test, - "pass", - "Passes Bechdel Test"), - clean_test = str_replace_all(clean_test, "men", - "Women only talk about men"), - clean_test = str_replace_all(clean_test, "notalk", - "Women don't talk to each other"), - clean_test = str_replace_all(clean_test, "nowoWomen only talk about men", - "Fewer than two women")) -med_groups %<>% mutate(clean_test = factor(clean_test, - levels = c("Fewer than two women", - "Women don't talk to each other", - "Women only talk about men", - "Passes Bechdel Test"))) %>% - mutate(group = factor(group, levels = c("U.S. 
and Canada", "International"))) %>% - mutate(median_roi_dol = dollar(median_roi)) -``` - -Using only a few functions to plot - -```{r basic-538, fig.width=8} -ggplot(data = med_groups, mapping = aes(x = clean_test, y = median_roi, - fill = group)) + - geom_bar(stat = "identity") + - facet_wrap(~ group) + - coord_flip() + - labs(title = "Dollars Earned for Every Dollar Spent", subtitle = "2013 dollars") + - scale_fill_fivethirtyeight() + - theme_fivethirtyeight() -``` - -Attempt to fully reproduce **Dollars Earned for Every Dollar Spent** plot using `ggplot` - -```{r roi-plot-538, fig.width=8} -ggplot(data = med_groups, mapping = aes(x = clean_test, y = median_roi, - fill = group)) + - geom_bar(stat = "identity") + - geom_text(aes(label = median_roi_dol), hjust = -0.1) + - scale_y_continuous(expand = c(.25, 0)) + - coord_flip() + - facet_wrap(~ group) + - scale_fill_manual(values = c("royalblue", "goldenrod")) + - labs(title = "Dollars Earned for Every Dollar Spent", subtitle = "2013 dollars") + - theme_fivethirtyeight() + - theme(plot.title = element_text(hjust = -1.6), - plot.subtitle = element_text(hjust = -0.4), - strip.text.x = element_text(face = "bold", size = 16), - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank()) + - guides(fill = FALSE) -``` - diff --git a/vignettes/bibliography.bib b/vignettes/bibliography.bib index ccdbf79..3f8ff06 100644 --- a/vignettes/bibliography.bib +++ b/vignettes/bibliography.bib @@ -46,7 +46,7 @@ @TECHREPORT{GAISE_2016 INSTITUTION = {American Statistical Association}, ADDRESS = {Alexandria, VA}, YEAR = {2016}, - URL = {http://www.amstat.org/education/gaise, last accessed August 7, 2017} + URL = {https://www.amstat.org/asa/education/Guidelines-for-Assessment-and-Instruction-in-Statistics-Education-Reports.aspx} } @TECHREPORT{ASA_guidelines_2014, @@ -55,7 +55,7 @@ @TECHREPORT{ASA_guidelines_2014 INSTITUTION = {American Statistical Association}, ADDRESS = {Alexandria, VA}, YEAR = {2014}, - URL = {http://www.amstat.org/asa/files/pdfs/EDU-guidelines2014-11-15.pdf, last accessed August 7, 2017} + URL = {https://www.amstat.org/asa/files/pdfs/EDU-guidelines2014-11-15.pdf} } @article{nolan_lang_2010, @@ -148,7 +148,7 @@ @Article{lubridate volume = {40}, number = {3}, pages = {1--25}, - url = {http://www.jstatsoft.org/v40/i03/}, + url = {https://www.jstatsoft.org/v40/i03/}, } @@ -167,15 +167,17 @@ @article{mckinsey_2011 Author = {James Manyika and Michael Chui and Brad Brown and Jacques Bughin and Richard Dobbs and Charles Roxburgh and Angela Hung Byers}, tile = {{Big Data: The Next Frontier for Innovation, Competition, and Productivity}}, year = {2011}, - url = {http://www.mckinsey.com/business-functions/digital-mckinsey/our-insights/big-data-the-next-frontier-for-innovation, last accessed August 7, 2017} + url = {https://www.mckinsey.com/business-functions/digital-mckinsey/our-insights/big-data-the-next-frontier-for-innovation} } @article{NYT_2014, author = "Steve Lohr", title = {{For Big-Data Scientists, 'Janitor Work' Is Key Hurdle to Insights}}, year = "2014", + month = "8", + day = "17", journal = "New York Times", - url = {https://www.nytimes.com/2014/08/18/technology/for-big-data-scientists-hurdle-to-insights-is-janitor-work.html, last accessed August 7, 2017} + url = {https://www.nytimes.com/2014/08/18/technology/for-big-data-scientists-hurdle-to-insights-is-janitor-work.html} } @Book{R_packages, @@ -208,7 +210,7 @@ 
@article{wickham_2017 author = {Hadley Wickham}, title = {{The tidy tools manifesto}}, year = "2017", - URL = {https://cran.r-project.org/web/packages/tidyverse/vignettes/manifesto.html, last accessed August 7, 2015} + URL = {https://cran.r-project.org/package=tidyverse/vignettes/manifesto.html} } @Book{R4DS, @@ -244,7 +246,7 @@ @inproceedings{Guzdial_2006 location = {Canterbury, United Kingdom}, pages = {51--58}, numpages = {8}, - url = {http://doi.acm.org/10.1145/1151588.1151597}, + url = {https://doi.acm.org/10.1145/1151588.1151597}, doi = {10.1145/1151588.1151597}, acmid = {1151597}, publisher = {ACM}, @@ -318,7 +320,7 @@ @article{DAVIDSON:2014 journal = {AMSTAT News}, title = {{``Aren't We Data Science?''}}, year = "2013", - URL = {http://magazine.amstat.org/blog/2013/07/01/datascience/, last accessed April 11, 2015} + URL = {https://magazine.amstat.org/blog/2013/07/01/datascience/, last accessed April 11, 2015} } @article{Pando:2014, @@ -326,7 +328,7 @@ @article{Pando:2014 title = {{``Did the mathematician who hacked {OkCupid} violate federal computer laws?''}}, journal = {Pando Daily}, year = "2014", - URL = {http://pando.com/2014/01/22/did-the-mathematician-who-hacked-okcupid-violate-federal-computer-laws/, last accessed April 11, 2015} + URL = {https://pando.com/2014/01/22/did-the-mathematician-who-hacked-okcupid-violate-federal-computer-laws/, last accessed April 11, 2015} } @article{Strata, @@ -342,7 +344,7 @@ @article{Wired title = {{``How a Math Genius Hacked OkCupid to Find True Love''}}, journal = {WIRED}, year = {2014}, - URL = {http://www.wired.com/wiredscience/2014/01/how-to-hack-okcupid/, last accessed April 11, 2015} + URL = {https://www.wired.com/wiredscience/2014/01/how-to-hack-okcupid/, last accessed April 11, 2015} } @article{OkTrendsLies, @@ -350,7 +352,7 @@ @article{OkTrendsLies title = {{``The Biggest Lies in Online Data''}}, journal = {{OkTrends}: dating research from {OkCupid}}, year = {2010}, - URL = {http://blog.okcupid.com/index.php/the-biggest-lies-in-online-dating, last accessed April 11, 2015} + URL = {https://blog.okcupid.com/index.php/the-biggest-lies-in-online-dating, last accessed April 11, 2015} } % Software @@ -369,7 +371,7 @@ @Manual{mosaic author = {Randall Pruim and Daniel Kaplan and Nicholas Horton}, year = {2014}, version = {R package version 0.9.1-3}, - url = {http://CRAN.R-project.org/package=mosaic, last accessed April 11, 2015}, + url = {https://CRAN.R-project.org/package=mosaic, last accessed April 11, 2015}, } % web links @@ -438,7 +440,7 @@ @online{moderndive author = {Chester Ismay and Albert Y Kim}, title = {An Introduction to Statistical and Data Sciences via R}, year = 2017, - url = {http://www.moderndive.com/}, + url = {https://www.moderndive.com/}, urldate = {2017-08-02} } diff --git a/vignettes/biopics.Rmd b/vignettes/biopics.Rmd deleted file mode 100644 index 5b88b04..0000000 --- a/vignettes/biopics.Rmd +++ /dev/null @@ -1,426 +0,0 @@ ---- -title: "Looking at Biopics of Hollywood" -author: "Pradeep Adhokshaja" -date: "`r format(Sys.time(), '%d %B, %Y')`" -output: - rmarkdown::html_vignette: - df_print: kable -vignette: | - %\VignetteIndexEntry{Looking at Biopics of Hollywood} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Data Import - -For this project, we will be using the `tidyverse` library for data analysis. 
This is an R package that consists of tools such as `ggplot2` (data visualization), `tidyr` (data tidying) and `dplyr` (data manipulation). The data set is obtained by calling the `fivethirtyeight` package using the command `library(fivethirtyeight)`. - - -After calling the necessary libraries, we look at the structure of the data. The command `glimpse()` allows us to get a brief summary of the data we are dealing with. This saves us from displaying the entire dataset. A custom theme is set which will be used for all the graphs in this project. - -```{r include=FALSE} -plot_theme <- function(base_size = 12) { - theme( - text = element_text( color = "black"), - plot.title = element_text(size = 10,colour = "black",hjust=0.5), - plot.subtitle = element_text(face="italic"), - plot.caption = element_text(hjust=0), - axis.ticks = element_blank(), - panel.background = element_blank(), - panel.grid.major = element_line("grey80", size = 0.1), - panel.grid.minor = element_blank(), - strip.background = element_rect(fill = "grey80", color = "white"), - strip.text = element_text(size=12), - axis.title = element_text(size=8), - axis.text = element_text(size=8), - axis.title.x = element_text(hjust=1), - axis.title.y = element_text(hjust=1), - plot.background = element_blank(), - legend.background = element_blank(), - legend.title = element_text(colour = "black", face = "bold"), - legend.text = element_text(colour = "black", face = "bold")) -} -``` - -```{r message=FALSE, warning=FALSE} -library(fivethirtyeight) -library(ggplot2) -library(dplyr) -library(tidyr) -library(scales) -library(stringr) -library(highcharter) -glimpse(biopics) -``` - - -The data consists of 14 variables (columns) and 764 observations (rows). We will dive deeper into the data by asking ourselves a few questions about: - - -1. Frequency of Biopics Across the Years -2. Data Visualization by Gender, Subject and Person of Color/White Protagonist -3. Box Office Earnings - - -## Number of Biopics Released by Year - - -```{r fig.width=7 } -biopics %>% - group_by(year_release) %>% - summarise(n=n()) %>% - ggplot(aes(x=year_release,y=n)) + - geom_bar(stat = "identity") + - plot_theme() + - labs(x="Year",y="Number of Releases",title="Number of Releases by Year", - subtitle="", caption="Source:FiveThirtyEight") -``` - -The number of biopics has been increasing over time. - - -## Actors of Color and White Actors - - -### Percentage Plot By Decade - - -```{r fig.width=7} - -poc <- biopics %>% - mutate(time_period=cut(year_release,breaks=11,label=c("1910-1920","1920-1930","1930-1940","1940-1950","1950-1960","1960-1970","1970-1980","1980-1990","1990-2000","2000-2010","2010-2014"))) %>% group_by(time_period,person_of_color) %>% - summarise(n=n()) %>% - mutate(person_of_color=ifelse(person_of_color==0,"White","Person of Color")) %>% - mutate(n=n/sum(n)) - -ggplot(poc, aes(x=time_period, y=n, fill=person_of_color)) + - geom_bar(stat="identity") + - plot_theme() + - theme(axis.text.x = element_text(vjust=1,angle=90)) + - scale_y_continuous(labels = percent) + - labs(x="Decade",y="Percentage Composition",title="How does the percentage of movies involving actors of color change by decade?",caption="Data From: FiveThirtyEight") -``` - -The percentage of movies that had actors of color was the highest in the period *2010-2014*. Biopics that had actors of color were nonexistent during the period *1910-1930*.
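The shares plotted above live in the `poc` data frame, so they can also be inspected directly as a table. A quick sketch reusing the objects and the `scales` package already loaded (the column `n` holds the within-decade share computed above):

```{r}
# Tabulate the share of biopics with subjects of color per decade
poc %>%
  filter(person_of_color == "Person of Color") %>%
  mutate(share = percent(n)) %>%
  select(time_period, share)
```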
- - - ### Type of Subjects covered - - -*White Actors* - -```{r fig.width=7, eval=FALSE} -biopics %>% - mutate(person_of_color=ifelse(person_of_color==0,"White","Person of Color")) %>% - group_by(person_of_color,type_of_subject) %>% - summarise(n=n()) %>% - mutate(n=(n/sum(n))*100) %>% - filter(person_of_color !="Person of Color") %>% - mutate(n=round(n,2)) %>% - hchart("treemap", hcaes(x = type_of_subject, value = n, color = n)) %>% - hc_title(text="Type of Subjects of Movies involving White Actors (%)") -``` - - - - - -*Actors of Color* - - -```{r fig.width=7, eval=FALSE} -biopics %>% - mutate(person_of_color=ifelse(person_of_color==0,"White","Person of Color")) %>% - group_by(person_of_color,type_of_subject) %>% - summarise(n=n()) %>% - mutate(n=(n/sum(n))*100) %>% - filter(person_of_color =="Person of Color") %>% - mutate(n=round(n,2)) %>% - hchart("treemap", hcaes(x = type_of_subject, value = n, color = n)) %>% - hc_title(text="Type of Subjects of Movies involving Actors of Color (%)") -``` - - - -*A side-by-side comparison* - -```{r fig.width=7,fig.height=10} -biopics %>% - mutate(person_of_color = ifelse(person_of_color==0,"White","Person of Color")) %>% - group_by(person_of_color,type_of_subject) %>% - summarise(n=n()) %>% - mutate(n=n/sum(n)) %>% - ggplot(aes(x=type_of_subject,y=n,fill=person_of_color))+ - geom_bar(stat="identity")+ - scale_y_continuous(labels=percent)+ - plot_theme()+ - facet_wrap(~person_of_color)+ - coord_flip()+ - labs(title="How does subject coverage differ in biopics involving white actors and actors of color?",caption="Data From : FiveThirtyEight",x="Type of Subject",y="%") -``` - - -* Within biopics that had actors of color, topics like **Athlete**, **Activist**, **Singer**, **World Leader** and **Actor** had higher coverage than in biopics that had white actors. - - - - -### Frequency of Movies Involving White Actors and Actors of Color - - -```{r fig.width=7,fig.height=8} -biopics %>% - mutate(person_of_color=ifelse(person_of_color==0,"White Actor","Actor of Color")) %>% - group_by(year_release,person_of_color) %>% - summarise(n=n()) %>% - ggplot(aes(x=year_release,y=n,colour=person_of_color))+ - geom_line()+ - plot_theme()+ - geom_vline(xintercept=1964,linetype=2)+ - labs(x="Year",y="Number",title="Number of Biopics by White Actors and Actors of Color", - subtitle="", - caption="Data from FiveThirtyEight")+ - geom_text(aes(1964,0),label="Civil Rights Act",show.legend = F,hjust=-1,angle=90,vjust=1,inherit.aes = F,size=3)+ - geom_vline(xintercept=1974,linetype=2)+ - geom_text(aes(1974,0),label="First Successful African American-themed Sitcom",show.legend = F, - hjust=0,vjust=1,inherit.aes = F,angle=90,size=3,colour="black") -``` - - -The vertical dashed line above indicates the year 1964, when the Civil Rights Act was passed. The number of biopics depicting people of color went up in the 1970s. The first successful African American-themed sitcom, "Good Times", was released in 1974.
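To attach numbers to that jump around the 1970s, a short sketch (assuming `person_of_color` is the 0/1 indicator used throughout this vignette) counts biopics featuring subjects of color by decade:

```{r}
# Count biopics with subjects of color per decade
biopics %>%
  filter(person_of_color == 1) %>%
  count(decade = 10 * (year_release %/% 10))
```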
- - -## Gender Wise - - -### Number of Biopics for each gender - -```{r fig.width=7} -biopics %>% - group_by(year_release,subject_sex) %>% - summarise(n=n()) %>% - rename(gender=subject_sex) %>% - ggplot(aes(x=year_release,y=n,colour=gender))+ - geom_line()+ - plot_theme()+ - labs(x="Year",y="Number",title="Number of Biopics By Gender", - subtitle="",caption="Data from FiveThirtyEight")+ - scale_x_continuous(breaks = seq(1920,2014,5))+ - theme(plot.title=element_text(size=18),axis.text.x = element_text(angle=90, vjust=1)) -``` - - -### Change in the number of movies from previous years - -Before we start plotting, we compute each year's change from the previous year using `lag()`, separately for each gender, and then combine the two data frames with `bind_rows()`. - -```{r fig.width=7,warning=FALSE} -year_wise_female <- biopics %>% - group_by(subject_sex,year_release) %>% - summarise(n=n()) %>% - filter(subject_sex=="Female") %>% - mutate(n_new=lag(n,1)) %>% - mutate(change=n-n_new) -year_wise_male <- biopics %>% - group_by(subject_sex,year_release) %>% - summarise(n=n()) %>% - filter(subject_sex=="Male") %>% - mutate(n_new=lag(n,1)) %>% - mutate(change=n-n_new) -year_wise <- bind_rows(year_wise_male,year_wise_female) -year_wise %>% - ggplot(aes(x=as.numeric(year_release),y=change)) + - geom_line() + - plot_theme()+ - labs(x="Year",y="Number",title="Change in Number of Biopic Movies From Previous Year", - subtitle="", caption="Data from FiveThirtyEight")+ - facet_wrap(~subject_sex,scales = "fixed")+ - scale_x_continuous(breaks = seq(1920,2014,5)) + - theme(plot.title=element_text(size=18),axis.text.x = element_text(angle=90, vjust=1)) - -``` - -The change in the number of biopics that had male subjects shot up in 2014. - - -### What kind of subjects do movies about male and female protagonists deal with? - -```{r fig.width=7,fig.height=7} -biopics$type_of_subject <- gsub(" ","",biopics$type_of_subject) -biopics <- biopics %>% - mutate(type_of_subject = strsplit(as.character(type_of_subject), "/")) %>% - unnest(type_of_subject) -biopics$type_of_subject <- tolower(biopics$type_of_subject) -women_percent <-biopics %>% - filter(subject_sex=="Female") %>% - group_by(type_of_subject) %>% - summarise(n=n()) %>% - mutate(percent_women=n/sum(n)) -men_percent <- biopics %>% - filter(subject_sex=="Male") %>% - group_by(type_of_subject) %>% - summarise(n=n()) %>% - mutate(percent_man=n/sum(n)) - -percent_overall <- full_join(women_percent,men_percent,by="type_of_subject") %>% - select(-c(n.x,n.y)) %>% - mutate(percent_women = ifelse(is.na(percent_women),0,percent_women),percent_man=ifelse(is.na(percent_man),0,percent_man)) - -ggplot(percent_overall, aes(x = percent_women, y = percent_man, color = (percent_women - percent_man))) + - geom_abline(color = "gray40", lty = 2) + - geom_jitter(alpha = 0.1, size = 1, width = 0.3, height = 0.3) + - geom_text(aes(label = type_of_subject), check_overlap = TRUE, vjust = 1.5) + - scale_x_continuous(labels = percent_format(),limits = c(0,0.322)) + - scale_y_continuous(labels = percent_format(),limits=c(0,0.322)) + - scale_color_gradient( low = "red", high = "gray75") + - theme(legend.position="none") + - labs(y = "Biopics Involving Men", x ="Biopics Involving Women",text=element_text(size=10), - plot.title=element_text(hjust=0.5))+ - ggtitle("Relative Frequency of Subjects (Gender Wise)") -``` - - -Topics that are close to the line indicate topics that have similar frequencies in both sets of data.
These topics include *government*, *academic* and *activist*. Topics that are far from this line are found frequently in one set but not the other. For example, a larger percentage of biopics that involved women portrayed authors. On the other side, topics like *military* had a higher percentage of coverage in biopics involving men than women. The redder the text, the more frequently the topic appears in biopics related to men. - - -### Race and Gender - - -```{r fig.width=10,fig.height=10} -biopics %>% mutate(cuts=cut(year_release,breaks=5,label=c("1910-1930","1930-1950","1950-1970","1970-1990","1990-2010")))%>% - mutate(subject_race=ifelse(grepl("\\bHispanic\\b",subject_race),"Hispanic",subject_race)) %>% - group_by(cuts,subject_sex,subject_race)%>% filter(subject_race!="")%>% - summarise(n=n()) %>% - mutate(n=n/sum(n)) %>% - ggplot(aes(x=cuts, y=n*100, fill=subject_race)) + - geom_bar(stat="identity", position="dodge",width = 0.7)+ - plot_theme()+ - ylab("%")+ - ggtitle("Gender and Race")+ - theme(plot.title = element_text(hjust = 0.5))+ - xlab("Year")+ - scale_fill_manual(values = c("#24576D", "#A113E2","#000000", "#D91460", - "#28AADC", "#40cc49","#F2583F", - "#96503F","#ffc100","#918d58","#e98000","#d2f4d2", - "#cdc8b1","#a87582"))+ - facet_grid(subject_sex~.)+ - theme(plot.title=element_text(size=18),axis.text.x = element_text(angle=90, vjust=1))+ - labs(caption="Data From FiveThirtyEight") -``` - -Most of the biopics depict White Americans, followed by African Americans. - -## Box Office Information - -The box office information suggests how successful the movie was. In this section, we do not take inflation into account. - - -### Distributions of earnings per year - -```{r fig.width=12,fig.height=12} -options(scipen=999) -biopics %>% - filter(box_office!="-") %>% - mutate( - box_office=gsub("$","",box_office,fixed=T), - denom=str_sub(box_office,nchar(box_office),nchar(box_office)), - box_office=gsub("M","",box_office), - box_office=gsub("K","",box_office), - box_office=as.numeric(box_office), - box_office=ifelse(denom=="M",box_office*1000000,box_office), - box_office=ifelse(denom=="K",box_office*1000,box_office), - box_office= box_office/1000000 - ) %>% - ggplot(aes(x=as.factor(year_release), y=box_office)) + - geom_boxplot() + - stat_summary(fun.y="mean", geom="point", shape=23, size=2, fill="white")+ - plot_theme()+ - labs(title="Distributions of Earning Per Year",x="Year",y="In Millions", - subtitle="Inflation not taken into account (Diamond Point Represents Mean)",caption="Data From FiveThirtyEight")+ - facet_wrap(~subject_sex,ncol=1)+ - theme(axis.text.x = element_text(angle=90,vjust=1)) -``` - -A large number of movies earned below \$100M. As we progress through the '80s, these numbers go higher, especially in the case of biopics based on men. The highest-earning movie based on a female protagonist was released in 1964. This was "The Sound of Music", which earned approximately \$163M. - - -### How do the distributions of box office earnings change by gender of subject? 
- - -```{r fig.width=7} -options(scipen=999) -biopics %>% - filter(box_office!="-") %>% - mutate( - box_office=gsub("$","",box_office,fixed=T), - denom=str_sub(box_office,nchar(box_office),nchar(box_office)), - box_office=gsub("M","",box_office), - box_office=gsub("K","",box_office), - box_office=as.numeric(box_office), - box_office=ifelse(denom=="M",box_office*1000000,box_office), - box_office=ifelse(denom=="K",box_office*1000,box_office), - box_office=box_office/1000000) %>% - ggplot(aes(x=subject_sex, y=box_office,fill=subject_sex)) + - geom_violin(color = "grey50")+ - xlab("Box Office") + - ylab("Count") + - stat_summary(fun.y="mean", geom="point", size=2, colour="white") + - plot_theme() + - theme(legend.position="none")+ - labs(x="Gender",y="In Millions",title="Distributions of Box Office Earnings", - subtitle="Inflation Not Taken into Account", - caption="Source: FiveThirtyEight") -``` - -The mean earning is a little higher for biopics with male subjects. The mean is higher probably due to the outliers present. - - -### How do the distributions of box office earnings change by race of subject? - - - -```{r fig.width=7,fig.height=7} -biopics %>% - filter(box_office!="-") %>% - mutate( - box_office=gsub("$","",box_office,fixed=T), - denom=str_sub(box_office,nchar(box_office),nchar(box_office)), - box_office=gsub("M","",box_office), - box_office=gsub("K","",box_office), - box_office=as.numeric(box_office), - box_office=ifelse(denom=="M",box_office*100000,box_office), - box_office=ifelse(denom=="K",box_office*1000,box_office), - box_office=box_office/1000000 - ) %>% - ggplot(aes(x=as.factor(person_of_color), y=box_office,fill=as.factor(person_of_color))) + - geom_violin(color = "grey50")+ - xlab("Person of Color") + - ylab("Count") + - stat_summary(fun.y="mean", geom="point", size=2, colour="white") + - plot_theme() + - theme(legend.position="none")+ - labs(x="Person of Color",y="In Millions",title="Distributions of Box Office Earnings", - subtitle="Inflation Not Taken into Account", - caption="Source: FiveThirtyEight") -``` - - -More number of movies with protaganists of color earned higher in the box office. This is probably why the average box office earnings are higher. - - - -## Results and Conclusion - -1. We see that there are more movies depicting non colored protaganists than colored protaganists -2. Biopics based on male characters tend to be more military and sports themed. -3. The biopics pertaining to non colored main characters shot up after 1974. -3. The highest grossing biopics based on a woman is The Sound of Music. -4. Biopics involving colored main characters earned higher on average in the box office. - -This shows that there needs to be more involvement by directors and actors in producing movies that depict important personalities that belong to other races. - diff --git a/vignettes/bob_ross.Rmd b/vignettes/bob_ross.Rmd deleted file mode 100644 index 598b4bf..0000000 --- a/vignettes/bob_ross.Rmd +++ /dev/null @@ -1,195 +0,0 @@ ---- -title: "Bob Ross - Joy of Painting" -author: "Jonathan Bouchet" -date: "`r Sys.Date()`" -output: -rmarkdown::html_vignette: -df_print: kable -vignette: | - %\VignetteIndexEntry{Bob Ross' Joy of Painting} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -This vignette is based on [538 study : A statistical analysis of the work of Bob Ross](https://fivethirtyeight.com/features/a-statistical-analysis-of-the-work-of-bob-ross/). 
Bob Ross was an American painter and host of _The Joy of Painting_, an instructional television program that aired from 1983 to 1994 on PBS in the United States. - -Load the required packages to reproduce the analysis, as well as the dataset. - -```{r, message = FALSE, warning = FALSE} -library(fivethirtyeight) -library(ggplot2) -library(dplyr) -library(tibble) -library(tidyr) -library(ggthemes) -library(knitr) -library(corrplot) -library(ggraph) -library(igraph) -``` - -## Data explanation and cleaning -The author of the article (W. Hickey) went through all of Bob Ross's paintings and coded the describing elements (trees, water, mountain, etc.): when an element is present in a painting, it is encoded as 1 in the relevant column. He wasn't able to analyze 3 paintings. There are also 2 episodes with the same title, so one of them is renamed to avoid errors when grouping by title. In addition, there are 22 episodes where Bob Ross did not paint. - -```{r load data} -df <- bob_ross -#define incomplete paintings -incomplete <-c("PURPLE MOUNTAIN RANGE","COUNTRY CHARM","PEACEFUL REFLECTIONS") -df <- df %>% filter(guest==0 & !(title %in% incomplete)) -#check the 2 episodes with same name -#df %>% filter(title=="LAKESIDE CABIN") -df[df$episode=='S08E02','title']<-'LAKESIDE CABIN 2' -``` - -After removing the missing paintings, the dataframe consists of 66 features describing 378 paintings. - -Given the structure of the dataframe: - -## Study by Features - -* `colSums()` can provide the total number and percentage (`tot`, `feature_percentage`) of features through all the paintings, as well as their frequency (`feature_freq`). -* `rowSums()` can provide the distribution of features present per painting. - -## Frequency -```{r feature frequency prep.} -#calculate the colSums for numeric columns and transpose the result -temp <- as.data.frame(df %>% - select(-episode, -season, -episode_num ,-title) %>% - summarise_all(funs(sum)) %>% t()) - -#rename, switch columns and calculate percentage over all paintings and frequency through all episodes -per_features <- temp %>% rownames_to_column() %>% - select(feature=rowname, tot = V1) %>% - mutate( - feature_percentage = (tot / sum(tot))*100, - feature_percentage_Label = paste0(round(feature_percentage,1),"%"), - feature_freq = tot/ nrow(df)*100, - feature_freq_label = paste0(round(feature_freq,1),"%")) -``` - -```{r feature frequency plot} -feature_freq_cut <- 10 #10% most present features -ggplot(data=filter(per_features,feature_freq>feature_freq_cut), aes(x=reorder(feature,feature_freq),y=feature_freq)) + - geom_bar(stat='identity') + geom_text(aes(label=feature_freq_label), position=position_dodge(width=0.9), vjust=.5,hjust=0,size=2.5,color='red') + - coord_flip() + - theme_fivethirtyeight() + - ggtitle('Features\' appearance (%) through all episodes') -``` - -* `tree` and `trees` features appear in more than 90% of all the paintings. - -## Correlation -Since a feature with no entries has a standard deviation of 0 (which breaks the correlation computation), features are selected based on their counts.
-```{r feature correlation} -#find features present -top<-c(per_features %>% filter(tot>1) %>% arrange(-tot) %>% select(feature)) -num_data<-df %>% select_(.dots = top$feature) -num_cols <- sapply(num_data, is.numeric) -corrplot(cor(num_data[,num_cols]), method='square',order="AOE") -``` - -* we see positive correlation for the expected cases, like `tree` / `trees`, or `night` / `moon` -* we also see negative correlation for totally different features, such as `waves` / `tree` -* a negative correlation means that as one of the variables increases, the other tends to decrease, and vice versa, so it makes sense to find an anti-correlation in a case like `waves` / `tree`, for example. - -## Study by Episodes -### Episodes having the greatest number of features -```{r episode prep.} -per_episode <- df %>% - select(-episode,-season,-episode_num ,-title) %>% - select_if(is.numeric) %>% - mutate(episode=1:n()) %>% - gather(item, count, -episode) %>% - group_by(episode) %>% - summarise(sum = sum(count)) %>% - arrange(-sum) - -#select a cut -cut_features<-11 -ggplot(data=filter(per_episode,sum>cut_features), aes(x=reorder(episode,sum),y=sum)) + - geom_bar(stat='identity') + - coord_flip() + theme_fivethirtyeight() + - ggtitle(paste0('Paintings having more than ', cut_features,' features')) -``` - -### Episode distribution vs. number of features -```{r episode plot} -per_episode_summary <- per_episode %>% - group_by(sum) %>% - summarise(tot_features=n()) %>% - mutate( - percent = (tot_features/ sum(tot_features))*100, - label = paste0(round(percent,1),"%")) - -ggplot(data=per_episode_summary, aes(x=sum,y=tot_features)) + - geom_bar(stat='identity') + - geom_text(aes(label=label), position=position_dodge(width=0.9), vjust=-1,hjust=.5,size=2.5,color='red')+ - theme_fivethirtyeight() + ggtitle('Distribution of paintings vs. \n number of features') -``` - -* the mean number of features among all paintings is: -```{r} -mean(per_episode$sum) -``` - - -## Network analysis -### Motivation -To further study the features' correlation, a network analysis can be performed. In this case, for each painting an object `feature_i, feature_j` is built, where i, j are indexes for a given painting. The `ggraph` package takes as input a dataframe with 2 columns and makes a graph network based on these 2 columns. -The function below loops over all features in a given painting and makes the graph connections.
```{r network func.} -#function to loop over an array of features and return a DF with feature_1 | feature_2 -make_connection<-function(x){ - feature_1<-c() - feature_2<-c() - cnt<-1 - for(i in 1:(nrow(x)-1)){ - for(j in (i+1):(nrow(x))){ - feature_1[cnt]<-(x[i,1]) - feature_2[cnt]<-(x[j,1]) - cnt<-cnt+1 - } - } - res<-data.frame("feature_1"=feature_1,"feature_2"=feature_2) - return(res) -} -``` - -### Result with all paintings for the first season -```{r network season 1} -#create empty DF to store the results -season_1 <- df %>% filter(season==1) - -#empty dataframe to save all the connections -season1_res <- data.frame("feature_1"= character(),"feature_2"=character()) - -#loop over paintings in season 1 -for(i in 1:nrow(season_1)){ - #select features of ith painting and make a dataframe - temp <- as.data.frame(season_1 %>% select(-episode, -season, -episode_num ,-title) %>% slice(i) %>% t()) - pos_data <- temp %>% rownames_to_column() %>% select(feature=rowname, number = V1) %>% filter(number>0) - res<-make_connection(pos_data) - season1_res<-rbind(season1_res,res) -} -``` - -The interesting thing is that we can apply weights to the graph. The weights are based on the frequency of the connection between 2 features. - -```{r network plot} -graph_s1 <- season1_res %>% - group_by(feature_1, feature_2) %>% - summarise(freq=n()) - -colnames(graph_s1)[3]<-'weight' - -g1<-graph.data.frame(graph_s1) -ggraph(g1,layout='circle') + - geom_edge_fan(aes(width=E(g1)$weight),alpha=.25,show.legend = FALSE) + - geom_node_point(size=6,color="red",alpha=1) + - geom_node_text(aes(label = name)) + theme_fivethirtyeight() + ggtitle('Features network of all paintings of season 1') -``` - -* larger width indicates a more frequent connection -* the most frequent connections are `tree | trees`, `tree | lake`, `lake | mountain`, which makes sense given the correlation plot. diff --git a/vignettes/comics_gender.Rmd b/vignettes/comics_gender.Rmd deleted file mode 100644 index 13068ed..0000000 --- a/vignettes/comics_gender.Rmd +++ /dev/null @@ -1,272 +0,0 @@ ---- -title: "Gender in Comic Books" -author: "Jonathan Bouchet" -date: "`r Sys.Date()`" -output: -rmarkdown::html_vignette: -df_print: kable -vignette: | - %\VignetteIndexEntry{Comic gender analysis} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -This vignette is based on the 538 study [Comic Books Are Still Made By Men, For Men And About Men](https://fivethirtyeight.com/features/women-in-comic-books/), which covers Marvel and DC characters from ~1939 until August 24, 2014, and aims at investigating the features of comic book characters according to their gender and publisher. - -```{r, message = FALSE, warning = FALSE} -#load packages and csv file -library(fivethirtyeight) -library(ggplot2) -library(dplyr) -library(readr) -library(tidyr) -library(lubridate) -library(janitor) -library(knitr) -library(grid) -library(fmsb) -library(wordcloud) -library(gridExtra) -``` - - -#### Loading the full dataset - -Load the full dataset using the code in the `?comic_characters` help file. Note we need to do this since `fivethirtyeight::comic_characters` only contains a -preview of the first 10 rows of the full dataset.
```{r, warning = FALSE, message = FALSE} -# Get DC characters: -comic_characters_dc <- - "https://github.com/fivethirtyeight/data/raw/master/comic-characters/dc-wikia-data.csv" %>% - read_csv() %>% - clean_names() %>% - mutate(publisher = "DC") - -# Get Marvel characters: -comic_characters_marvel <- - "https://github.com/fivethirtyeight/data/raw/master/comic-characters/marvel-wikia-data.csv" %>% - read_csv() %>% - clean_names() %>% - mutate(publisher = "Marvel") - -# Merge the two datasets and perform further data wrangling: -comic_characters <- - comic_characters_dc %>% - bind_rows(comic_characters_marvel) %>% - separate(first_appearance, c("year2", "month"), ", ", remove = FALSE) %>% - mutate( - # If month was missing, set as January and day as 01: - month = ifelse(is.na(month), "01", month), - day = "01", - # Note some years missing: - date = ymd(paste(year, month, day, sep = "-")), - align = factor( - align, - levels = c("Bad Characters", "Reformed Criminals", "Neutral Characters", "Good Characters"), - ordered = TRUE) - ) %>% - select(publisher, everything(), -c(year2, day)) -``` - - -#### Overview plots - -* percentage of each gender per publisher. -* raw number of characters per publisher. - -```{r, warning = FALSE, message = FALSE, fig.width = 10} -#calculate raw number and percentage of each gender for each publisher -raw_number_per_publisher <- comic_characters %>% - group_by(publisher) %>% - summarise(number = n()) %>% - arrange(-number) - -percent_gen_pub <- comic_characters %>% - group_by(sex, publisher) %>% - summarise(number = n()) %>% - arrange(-number) %>% - group_by(publisher) %>% - mutate(countT = sum(number)) %>% - group_by(sex) %>% - mutate( - percentage = (100*number/countT), - label = paste0(round(percentage, 2))) - -#plot percentage of each gender for each publisher -percentage_per_publisher <- ggplot(data = percent_gen_pub,aes(x = sex,y = percentage, fill = publisher)) + - geom_bar(width = 0.9, stat = "identity", position = 'dodge') + - theme(axis.text.x = element_text(angle = 45, hjust = 1),legend.position = 'none') + - geom_text(aes(label = label), position=position_dodge(width = 0.9), vjust = -0.25,size = 2.5) + - scale_fill_manual(values = c("#3B9AB2","#EBCC2A")) + - xlab('') + - ylab('Percentage') - -raw_number_per_publisher <- ggplot(data = raw_number_per_publisher, aes(x = publisher, y = number, fill = publisher)) + - geom_bar(width = 0.1, stat = "identity") + - coord_flip() + - scale_fill_manual(values = c("#3B9AB2","#EBCC2A")) + - xlab('') + - ylab('') + - theme(legend.position = 'None') - -grid.arrange(percentage_per_publisher, raw_number_per_publisher, ncol=2) -``` - -* Marvel has more than double the number of characters compared to DC. -* `Male` characters are more present than `Female` characters for both publishers, at a ~2.5:1 ratio, while LGBT characters represent less than 1 percent. - -#### Number of Characters vs. 
-
-#### Number of Characters vs. Time
-
-```{r, message = FALSE, warning = FALSE, fig.width = 10}
-# keep rows with no NA's for month and sex, then count by (year, month, sex)
-
-# define the list of gender categories per publisher
-gender_list_marvel <- c("Female Characters", "Male Characters", "Genderfluid Characters", "Agender Characters")
-gender_list_dc <- c("Female Characters", "Male Characters", "Genderless Characters", "Transgender Characters")
-
-marvel_vs_time <- comic_characters %>%
-  filter(publisher == 'Marvel' & !is.na(month) & !is.na(sex)) %>%
-  group_by(year, month, sex) %>%
-  summarise(number = n()) %>%
-  mutate(
-    sex_ordered = factor(sex, levels = gender_list_marvel),
-    month_ordered = factor(month, levels = month.name))
-
-dc_vs_time <- comic_characters %>%
-  filter(publisher == 'DC' & month != "Holiday" & !is.na(month) & !is.na(sex)) %>%
-  mutate(month = ifelse(month == "01", "January", month)) %>%
-  group_by(year, month, sex) %>%
-  summarise(number = n()) %>%
-  mutate(
-    sex_ordered = factor(sex, levels = gender_list_dc),
-    month_ordered = factor(month, levels = month.name))
-
-plot_marvel_time <- ggplot(data = marvel_vs_time, aes(year, month_ordered)) +
-  geom_tile(aes(fill = number), colour = "white") +
-  scale_fill_gradient(low = "#EBCC2A", high = "black") +
-  facet_wrap(~ sex_ordered, ncol = 4) +
-  theme(axis.title.x = element_blank(), axis.ticks.x = element_blank(), axis.title.y = element_blank(), axis.ticks.y = element_blank(), legend.position = 'right', legend.title = element_blank(), legend.key.size = unit(.2, "cm")) +
-  xlim(1935, 2015)
-
-plot_dc_time <- ggplot(data = dc_vs_time, aes(year, month_ordered)) +
-  geom_tile(aes(fill = number), colour = "white") +
-  scale_fill_gradient(low = "#3B9AB2", high = "black") +
-  facet_wrap(~ sex_ordered, ncol = 4) +
-  theme(axis.title.x = element_blank(), axis.ticks.x = element_blank(), axis.title.y = element_blank(), axis.ticks.y = element_blank(), legend.position = 'right', legend.title = element_blank(), legend.key.size = unit(.2, "cm")) +
-  xlim(1935, 2015)
-
-grid.arrange(rbind(ggplotGrob(plot_marvel_time), ggplotGrob(plot_dc_time), size = "last"))
-```
-
-* Most of the characters are either `Female` or `Male` for both publishers.
-* There is a gap in character creation for both publishers around 1955-1960.
-* The increase in `Female` characters appears earlier for Marvel (~1970) than for DC (late 1970s).
-* `LGBT` characters appear in the late 1970s (DC) and in the late 1960s (Marvel).
-
-#### Characteristics per Gender, publisher
-
-To represent the average characteristics, a radar chart (from the `fmsb` package) will be used.
-
-* The characteristics examined are `id`, `align`, `eye`, `hair`, and `alive`.
-* The idea is to group the rows by these features and compute the percentage of characters at each level.
-* Then, by taking the highest percentages, we can deduce a `profile` representative of each publisher.
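-
-Note that `fmsb::radarchart()` expects a data frame whose first two rows hold the axis maxima and minima; a toy sketch with made-up percentages:
-
-```{r radar_toy}
-toy <- data.frame(Black = c(50, 0, 30), Blond = c(50, 0, 15), Red = c(50, 0, 5))
-row.names(toy) <- c("max", "min", "example")
-radarchart(toy)
-```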
-
-##### Example: the `Hair` feature
-```{r, message = FALSE, warning = FALSE}
-# function to aggregate the data by (gender, feature, publisher);
-# note it is currently specialized to the `hair` feature
-aggregateFeature <- function(current_publisher, feature){
-  # empty list to keep one data frame per (gender, feature)
-  currentFeature <- list()
-  if(current_publisher == 'Marvel'){
-    gender_list <- gender_list_marvel
-  } else {
-    gender_list <- gender_list_dc
-  }
-
-  for(i in 1:length(gender_list)){
-    currentFeature[[i]] <- comic_characters %>%
-      filter(publisher == current_publisher) %>%
-      select(hair, sex) %>%
-      na.omit() %>%
-      group_by(hair) %>%
-      filter(sex == gender_list[i]) %>%
-      summarise(number = n()) %>%
-      arrange(-number) %>%
-      mutate(countT = sum(number)) %>%
-      mutate(percentage = round(100*number/countT, 1)) %>%
-      select(hair, percentage, number)
-
-    if(current_publisher == 'Marvel'){
-      colnames(currentFeature[[i]])[2] <- 'percentage_marvel'
-    } else{
-      colnames(currentFeature[[i]])[2] <- 'percentage_dc'
-    }
-
-    if(feature == 'hair'){
-      # strip the word 'Hair' for a better display on the radar chart
-      currentFeature[[i]]$hair <- sapply(currentFeature[[i]]$hair, function(x) gsub(' Hair', '', x))}
-  }
-
-  names(currentFeature) <- gender_list
-  return(currentFeature)
-}
-```
-
-```{r, message = FALSE, warning = FALSE}
-# outer join of the two data frames for (feature = hair, gender = Male);
-# [[2]] selects "Male Characters", the second entry of each gender list
-merged <- full_join(aggregateFeature('DC', 'hair')[[2]], aggregateFeature('Marvel', 'hair')[[2]], by = 'hair')
-# set min/max percentages for the radar chart limits
-min <- rep(0, length(merged$hair))
-max <- rep(50, length(merged$hair))
-
-maleHair <- data.frame(rbind(max, min, merged$percentage_dc, merged$percentage_marvel))
-colnames(maleHair) <- merged$hair
-row.names(maleHair) <- c('max', 'min', 'percentage_dc', 'percentage_marvel')
-maleHair[is.na(maleHair)] <- 0
-```
-
-```{r, message = FALSE, warning = FALSE}
-# cosmetics
-radarchart(maleHair,                   # dataframe (first two rows = max/min)
-           axistype = 2,               # axis label type
-           pcol = c("#3B9AB2", "#EBCC2A"), # colors (Wes Anderson 'Zissou' palette)
-           plwd = 1,                   # line width of the data polygons
-           pty = 19,                   # marker type
-           plty = 3,                   # line type
-           cglcol = "grey",            # grid line color
-           cglty = 2,                  # grid line type
-           axislabcol = "grey",        # color of the axis labels
-           cglwd = .6,                 # grid line width
-           vlcex = .6,                 # font size of the variable labels
-           palcex = 1.)                # font size of the axis values
-legend(x = 1, y = 1.3,                 # position
-       legend = c('DC male characters', 'Marvel male characters'), # labels
-       bty = "n",                      # no box around the legend
-       pch = 16,                       # marker type
-       text.col = "black",             # label colors
-       col = c("#3B9AB2", "#EBCC2A"),  # marker colors
-       cex = .8,                       # label size
-       pt.cex = 1)                     # marker size
-```
-
-#### Popularity
-By looking at the number of appearances of each character, we can make a visualization (a `wordcloud`) representing the most popular characters.
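-
-The `name` column embeds each character's secret identity in parentheses; the chunk below strips it with a regex. A one-line illustration of the pattern:
-
-```{r regex_example}
-gsub(" *\\(.*?\\) *", "", "Spider-Man (Peter Parker)")
-#> [1] "Spider-Man"
-```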
-
-```{r, message = FALSE, warning = FALSE, fig.width = 10}
-# remove NA's from the appearances column and drop the parenthesized
-# secret identity from each name
-marvel_appearances <- comic_characters %>%
-  filter(publisher == 'Marvel') %>%
-  select(name, appearances) %>%
-  na.omit() %>%
-  mutate(name = gsub(" *\\(.*?\\) *", "", name))
-dc_appearances <- comic_characters %>%
-  filter(publisher == 'DC') %>%
-  select(name, appearances) %>%
-  na.omit() %>%
-  mutate(name = gsub(" *\\(.*?\\) *", "", name))
-
-color_marvel <- colorRampPalette(c("#EBCC2A", "black"))
-color_dc <- colorRampPalette(c("#3B9AB2", "black"))
-
-op <- par(mar = c(1, 2, 2, 1), mfrow = c(1, 2))
-wordcloud(dc_appearances$name, dc_appearances$appearances, min.freq = 100, colors = color_dc(10), scale = c(1.75, 0.2))
-wordcloud(marvel_appearances$name, marvel_appearances$appearances, min.freq = 250, colors = color_marvel(10), scale = c(1.5, 0.2))
-par(op)  # restore graphics parameters
-```
-
-* `Batman` (DC) and `Spider-Man` (Marvel) end up being the characters with the most appearances.
diff --git a/vignettes/fivethirtyeight.Rmd b/vignettes/fivethirtyeight.Rmd
index d00a68b..d2212b5 100644
--- a/vignettes/fivethirtyeight.Rmd
+++ b/vignettes/fivethirtyeight.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "`fivethirtyeight` Package"
+title: "fivethirtyeight Package"
 author: "Albert Y. Kim, Chester Ismay, and Jennifer Chunn"
 date: "`r Sys.Date()`"
 output: rmarkdown::html_vignette
@@ -15,14 +15,13 @@ library(ggplot2)
 library(dplyr)
 library(readr)
 library(knitr)
-```
 
-```{r, message=FALSE, warning=FALSE, echo=FALSE}
+# Import the master Google Sheet of all 538 data available here:
+# https://docs.google.com/spreadsheets/d/1IMWAHNPIDzplafWW6AGnGyHmB1BMjohEw_V5HmT70Gs/edit#gid=840984416
 datasets <-
   "https://goo.gl/OT8iHa" %>%
   read_csv() %>%
   filter(!is.na(DATAFRAME_NAME)) %>%
-  mutate(DATAFRAME_NAME = paste("`", DATAFRAME_NAME, "`", sep="")) %>%
   transmute(
     `Data Frame Name` = DATAFRAME_NAME,
     `Article Title` = ARTICLE_TITLE,
@@ -42,73 +41,37 @@ datasets <-
   ) %>%
   arrange(`Data Frame Name`)
 
-preview <- datasets %>%
+preview_only_datasets <- datasets %>%
   filter(Preview == "Y") %>%
   pull(`Data Frame Name`)
 ```
 
 ## Note
 
-Given CRAN package size restrictions, the following `r length(preview)` datasets only consist of a preview of the first 10 rows of the full dataset. Code to load the full dataset are provided in the respective help file examples. For example, type `?house_district_forecast`.
+All `r nrow(data(package = "fivethirtyeight")[[3]])` datasets included in the `fivethirtyeight` package are listed in the next section. However, due to CRAN-hosted R package size restrictions, the following `r length(preview_only_datasets)` of these datasets only consist of a preview of the first 10 rows of the full dataset. Code to load the full dataset into R is provided in the respective help file example. For example, type `?house_district_forecast` into R.
 ```{r, message=FALSE, warning=FALSE, echo=FALSE}
-preview %>%
-  matrix(ncol=3) %>%
-  kable()
+preview_only_datasets
 ```
 
 
-## Datasets
-
-Here are all `r nrow(datasets)` datasets included in the `fivethirtyeight` package:
+## All datasets
 
 ```{r, message=FALSE, warning=FALSE, echo=FALSE}
 datasets %>%
+  mutate(`Data Frame Name` = paste("`", `Data Frame Name`, "`", sep="")) %>%
   select(-Preview) %>%
   kable()
 ```
 
-## Guidelines
-
-In order to make the data easily accessible to R novices, we pre-process the
-original data sets as they exist in the [538 GitHub
-repository](https://github.com/fivethirtyeight/data) to adhere to the following
-guidelines:
-
-1. **Naming conventions for data frame and variable names**:
-    1. Whenever possible, all names should be no more than 20 characters long. Exceptions to this rule exist when shortening the names to less than 20 characters would lead to a loss of information.
-    1. Use only lower case characters and replace all spaces with underscores. This format is known as `snake_case` and is an alternative to `camelCase`.
-    1. In the case of variable (column) names within a data frame, use underscores instead of spaces.
-1. **Variables identifying observational units**:
-    1. Any variables uniquely identifying each observational unit should be in the left-hand columns.
-1. **Dates**:
-    1. If only a `year` variable exists, then it should be represented as a numerical variable.
-    1. If there are `year` and `month` variables, then convert them to `Date` objects as `year-month-01`. In other words, associate all observations from the same month to have a `day` of `01` so that a correct `Date` object can be assigned.
-    1. If there are `year`, `month`, and `day` variables, then convert them to `Date` objects as `year-month-day`.
-1. **Ordered Factors, Factors, Characters, and Logicals**:
-    1. Ordinal categorical variables are represented as `ordered` factors.
-    1. Categorical variables with a fixed and known set of levels are represented as regular `factor`s.
-    1. Categorical variables whose possible levels are either unknown or of a very large number are represented as `character`s.
-    1. Any "yes/no" character encoding of binary variables is converted to `TRUE/FALSE` logical variables.
-1. **Tidy data format**:
-    1. Whenever possible, save all data frames in ["tidy" data](https://vita.had.co.nz/papers/tidy-data.html) format:
-        a) Each variable forms a column.
-        a) Each observation forms a row.
-        a) Each type of observational unit forms a table.
-    1. If converting the raw data to "tidy" data format alters the dataset too much, then make the code to convert to tidy format easily accessible in the help file.
-
-**Note**: The code used to pre-process the data can be found on the [GitHub repository](https://github.com/rudeboybert/fivethirtyeight/tree/master/data-raw) for the package in the `process_data_sets.R` files. These can serve as data manipulation/wrangling examples and exercises for more advanced students.
-
-
-
 ## Motivation
 
+The motivation for creating this package is articulated in [The fivethirtyeight R Package: "Tame Data" Principles for Introductory Statistics and Data Science Courses](https://escholarship.org/uc/item/0rx1231m) by Kim, Ismay, and Chunn (2018), published in Volume 11, Issue 1 of the journal "Technology Innovations in Statistics Education". Here is an executive summary.
+
 We are involved in statistics and data science education, in particular at the
 introductory undergraduate level.
 As such, we are always looking for data sets that
-balance being
+balance being:
 
 1. **Rich enough** to answer meaningful questions with, **real enough** to ensure that there is context, and **realistic enough** to convey to students that data as it exists "in the wild" often needs processing.
 1. Easily and quickly accessible to novices, so that we [minimize the prerequisites to research](https://arxiv.org/abs/1507.05346).
@@ -126,6 +89,38 @@ It is along these lines that we present `fivethirtyeight`: an R package of data
 With consultation from [Andrew Flowers](https://fivethirtyeight.com/contributors/andrew-flowers/) and [Andrei Scheinkman](https://fivethirtyeight.com/contributors/andrei-scheinkman/) of FiveThirtyEight, we go one step further by:
 
-1. Doing just enough pre-processing so that statistics and data science novices can sink their teeth into the data right away.
+1. Doing just enough pre-processing (i.e. data "taming") so that statistics and data science novices can sink their teeth into the data right away.
 2. Packaging it all in an easy to load format: package installation instead of working with CSV files.
 3. Providing easily accessible documentation: The help file for each data set includes a thorough description of the observational unit and all variables, a link to the original article, and (if listed) the data sources.
+
+
+## "Tame" data principles
+
+In order to make the data easily accessible to R novices, we pre-process the
+original data sets as they exist in the [538 GitHub
+repository](https://github.com/fivethirtyeight/data) to adhere to the following
+"tame" data guidelines:
+
+1. **Naming conventions for data frame and variable names**:
+    1. Whenever possible, all names should be no more than 20 characters long. Exceptions to this rule exist when shortening the names to less than 20 characters would lead to a loss of information.
+    1. Use only lower case characters and replace all spaces with underscores. This format is known as `snake_case` and is an alternative to `camelCase`, where successive words are delineated with upper case characters.
+    1. In the case of variable (column) names within a data frame, use underscores instead of spaces.
+1. **Variables identifying observational units**:
+    1. Any variables uniquely identifying each observational unit should be in the left-hand columns.
+1. **Dates** (see the sketch after this list):
+    1. If only a `year` variable exists, then it should be represented as a numerical variable.
+    1. If there are `year` and `month` variables, then convert them to `Date` objects as `year-month-01`. In other words, associate all observations from the same month to have a `day` of `01` so that a correct `Date` object can be assigned.
+    1. If there are `year`, `month`, and `day` variables, then convert them to `Date` objects as `year-month-day`.
+1. **Ordered Factors, Factors, Characters, and Logicals**:
+    1. Ordinal categorical variables are represented as `ordered` factors.
+    1. Categorical variables with a fixed and known set of levels are represented as regular `factor`s.
+    1. Categorical variables whose possible levels are either unknown or of a very large number are represented as `character`s.
+    1. Any "yes/no" character encoding of binary variables is converted to `TRUE/FALSE` logical variables.
+1. **Tidy data format**:
+    1. Whenever possible, save all data frames in "tidy" data format as defined by [Wickham (2014)](https://vita.had.co.nz/papers/tidy-data.html):
+        a) Each variable forms a column.
+        a) Each observation forms a row.
+        a) Each type of observational unit forms a table.
+    1. If converting the raw data to "tidy" data format alters the dataset too much, then make the code to convert to tidy format easily accessible.
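+
+For instance, the month-level date rule can be applied with `lubridate` (a minimal sketch; the values are made up):
+
+```{r, eval=FALSE}
+library(lubridate)
+# year = 1974, month = "October": pin the day to "01" to get a valid Date
+ymd(paste(1974, "October", "01", sep = "-"))
+#> [1] "1974-10-01"
+```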
+
+**Note**: The code used to pre-process the data can be found on the [GitHub repository](https://github.com/rudeboybert/fivethirtyeight/tree/master/data-raw) for the package in the `process_data_sets.R` files. These can serve as data manipulation/wrangling examples and exercises for more advanced students.
diff --git a/vignettes/tarantino_swears.Rmd b/vignettes/tarantino_swears.Rmd
deleted file mode 100644
index a07b23d..0000000
--- a/vignettes/tarantino_swears.Rmd
+++ /dev/null
@@ -1,195 +0,0 @@
----
-title: "How Many Fucks Does Tarantino Give?"
-author: "Olivia Barrows, Jojo Miller, and Jayla Nakayama"
-date: "`r Sys.Date()`"
-output:
-  rmarkdown::html_vignette:
-    df_print: kable
-vignette: |
-  %\VignetteIndexEntry{How Many Fucks Does Tarantino Give?}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.width = 7.2)
-```
-
-This vignette is based on the data collected for the FiveThirtyEight study [A Complete Catalog Of Every Time Someone Cursed Or Bled Out In A Quentin Tarantino Movie](https://fivethirtyeight.com/features/complete-catalog-curses-deaths-quentin-tarantino-films/), with a focus on the use of swear words in his films.
-
-```{r libraries, message = FALSE, warning = FALSE}
-library(fivethirtyeight)
-library(ggplot2)
-library(dplyr)
-library(stringr)
-library(knitr)
-library(ggthemes)
-
-# Set the default number of digits printed
-options(digits = 2)
-```
-
-
-## Points of Analysis
-* comparison of total swear word usage over the course of Tarantino's career
-* number of uses of the word 'fuck' and its variations, per movie
-
-The `tarantino` dataframe in the `fivethirtyeight` package presents data on the number of deaths and swear words in each of the seven movies noted via the `movie` variable. The `profane` variable notes whether the instance being examined is a swear word or not, and the `word` variable gives the actual form of the swear word used. The `minutes_in` variable records when exactly each death or instance of swearing occurs in the movie. This analysis focuses on the usage of swear words in Tarantino's movies, and does not deal with the number of deaths tallied for his works.
-
-
-### Preparing the Data
-For this analysis, we first create a new dataframe called `tarantino_year`, which includes the year of release for each film. Once created, the `tarantino_year` dataframe is merged with the existing `tarantino` dataframe, so the `year` variable is available for analysis in the `tarantino_plus_year` data frame.
-
-```{r merge}
-# Create a new dataframe assigning the year of release to each movie
-# (spellings must match the values of the `movie` variable in `tarantino`)
-movie <- c("Reservoir Dogs", "Pulp Fiction", "Jackie Brown", "Kill Bill: Vol. 1", "Kill Bill: Vol. 2", "Inglorious Basterds", "Django Unchained")
-year <- c(1992, 1994, 1997, 2003, 2004, 2009, 2012)
-tarantino_year <- tibble(movie, year)
-
-# Combine with the existing `tarantino` dataframe
-tarantino_plus_year <- inner_join(x = tarantino, y = tarantino_year, by = "movie")
-```
-
-The final step in preparing the `tarantino_plus_year` dataframe for this analysis is to filter out unnecessary information, keeping only entries with the value of `TRUE` under the `profane` variable.
-
-```{r profane}
-tarantino_swears <- tarantino_plus_year %>% filter(profane == TRUE)
-```
-
-
-### Total Swear Word Usage Over Tarantino's Career
-Using the filtered `tarantino_swears` dataframe, we can see how many times curse words are used in each of the movies. To better visualize how Tarantino's usage of swearing has changed over the years, we first order the movies by year of release, and then create a graphic with this re-ordered data. A table is provided below for referencing the year of release.
-
-```{r plot1, fig.height = 8}
-# Order the movies by release year
-by_year <- c("Reservoir Dogs", "Pulp Fiction", "Jackie Brown", "Kill Bill: Vol. 1", "Kill Bill: Vol. 2",
-             "Inglorious Basterds", "Django Unchained")
-tarantino_factor <- tarantino_swears %>%
-  mutate(movie = factor(movie, levels = by_year))
-
-# Plot the number of swear words used in each movie
-ggplot(data = tarantino_factor, mapping = aes(x = movie, fill = movie)) +
-  geom_bar(color = "white") +
-  theme_fivethirtyeight() +
-  labs(x = "Movie", y = "Swear Count", title = "Total Swear Word Usage Per Movie") +
-  theme(axis.text.x = element_text(angle = 45, hjust = 1))
-```
-
-```{r echo = FALSE}
-# Present a table of movie and year of release, for reference
-tarantino_year
-```
-
-To better represent the data available on the profanities used in Tarantino's movies, it is helpful to first divide the swear words into categories via the `word` variable. For the purposes of this analysis, eight categories were defined and stored in the `swear_category` variable:
-
-* swears including the word 'ass'
-* swears including the word 'bastard'
-* swears including the word 'dick' or 'cock'
-* swears including the word 'fuck'
-* swears including the word 'shit', as well as 'merde'
-* swears referring to damnation, including the words 'damn' and 'hell'
-* swears with derogatory gender implications
-* swears with derogatory racial implications
-
-```{r case_when}
-# Create the categories
-tarantino_swears <- tarantino_swears %>%
-  mutate(swear_category =
-    case_when(grepl("ass", word) ~ "ass",
-              grepl("shit|merde", word) ~ "shit",
-              grepl("fuck", word) ~ "fuck",
-              grepl("damn|hell", word) ~ "damnation",
-              grepl("bastard", word) ~ "bastard",
-              grepl("dick|cock", word) ~ "dick",
-              grepl("bitch|cunt|pussy|faggot|slut", word) ~ "gender",
-              grepl("gook|jap|jew|n-word|negro|slope|wetback|squaw", word) ~ "race"))
-```
-
-With these categories defined, we can produce a table called `Profanity_Sum` showing the percentage of each movie's swear words that falls into each category.
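-
-The summaries below lean on a standard R idiom: the mean of a logical vector is the proportion of `TRUE` values, so `mean(swear_category == "ass") * 100` gives a percentage. A one-line illustration:
-
-```{r mean_logical_example}
-mean(c(TRUE, FALSE, FALSE, TRUE)) * 100  # 50: half of the values are TRUE
-```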
-
-```{r Profanity_Sum}
-Profanity_Sum <- tarantino_swears %>%
-  group_by(movie) %>%
-  summarize(Ass = mean(swear_category == "ass") * 100,
-            Shit = mean(swear_category == "shit") * 100,
-            Fuck = mean(swear_category == "fuck") * 100,
-            Dick = mean(swear_category == "dick") * 100,
-            Damnation = mean(swear_category == "damnation") * 100,
-            Bastard = mean(swear_category == "bastard") * 100,
-            Gender = mean(swear_category == "gender") * 100,
-            Race = mean(swear_category == "race") * 100,
-            Unspeakable = Gender + Race)
-Profanity_Sum
-```
-
-This table allows us to conclude the following:
-
-* Despite what the title would suggest, _Inglorious Basterds_ does not use the word 'bastard' at all
-* Apart from _Django Unchained_, the word 'fuck' and its variations are the most commonly used swear words in Tarantino's movies
-* _Django Unchained_ features `Unspeakable` as its category with the highest percentage of swear words
-* Overall, swear words falling in the `Bastard` and `Dick` categories are used the least often in Tarantino's movies
-
-You may notice the `Unspeakable` variable, which is a combination of the `gender` and `race` categories from the earlier code. It is possible to break this variable down relative to each movie, showing how often either category of swear words was used.
-
-```{r Unspeakable_Sum}
-Unspeakable_Sum <- tarantino_swears %>%
-  group_by(movie) %>%
-  summarize(gen_por = mean(swear_category == "gender"),
-            race_por = mean(swear_category == "race")) %>%
-  mutate(Gender_Derogatory = gen_por * 100, Race_Derogatory = race_por * 100) %>%
-  select(-gen_por, -race_por)
-Unspeakable_Sum
-```
-
-The resulting table allows us to conclude the following:
-
-* _Django Unchained_ has the highest percentage of racially derogatory swear words of all seven movies
-* _Kill Bill: Vol. 1_ has the highest percentage of gender derogatory swear words of all seven movies
-* _Kill Bill: Vol. 1_ is the only one of Tarantino's movies to not use any racially derogatory swear words
-
-The observation that _Django Unchained_ has the highest percentage of racially derogatory swear words of the movies analyzed may seem unusual, but it makes sense in context: the plot takes place while slavery still existed, and the titular character is a former slave, so it is easier to see why this extreme value appears as an outlier in these tables.
-
-Based on the above information as a whole, there is a general trend of less swearing over time in Tarantino's movies, apart from _Django Unchained_. This is somewhat unexpected, as swearing has generally become more accepted over time; the later rise in swearing for _Django Unchained_ better reflects that fact. The use of such a large number of racially derogatory swear words in a movie released in 2012 is also of note.
-
-### How Many Fucks Does Tarantino Actually Give?
-Looking at the dataframe `tarantino_swears`, the next question we have to ask is: how many fucks _exactly_ does Tarantino give in each of his movies? First, however, it is necessary to filter for only usages of the word 'fuck' and its variations.
-
-```{r fbomb}
-tarantino_fuck <- tarantino_swears %>% filter(swear_category == "fuck")
-```
-
-At this point, we are able to create a preliminary graph, which details the number of fucks given throughout each movie, spread out over the movie's runtime. We chose to bin the swearing by time in this manner because it allows us to examine which parts of each movie include the most swearing.
-
-```{r given, fig.height = 8}
-ggplot(data = tarantino_fuck, mapping = aes(x = minutes_in)) +
-  geom_histogram(binwidth = 10, color = "white", fill = "springgreen2") +
-  facet_wrap(~movie) +
-  theme_fivethirtyeight() +
-  labs(x = "Minutes In", y = "Fucks Given", title = "Fucks Tarantino Gives Per Movie") +
-  theme(axis.text.x = element_text(angle = 45, hjust = 1))
-```
-
-This can be further broken down to show what _types_ of fucks Tarantino gives.
-
-```{r type_given, fig.height = 8}
-ggplot(data = tarantino_fuck, mapping = aes(x = minutes_in, fill = word)) +
-  geom_histogram(binwidth = 10, color = "white") +
-  facet_wrap(~movie) +
-  theme_fivethirtyeight() +
-  labs(x = "Minutes In", y = "Fucks Given", title = "Fucks Tarantino Gives Per Movie (by Type)") +
-  theme(axis.text.x = element_text(angle = 45, hjust = 1))
-```
-
-From this information, it is possible to see that in terms of fucks given:
-
-* Tarantino most often uses the variations 'fuck' and 'fucking'
-* _Kill Bill: Vol. 1_ had relatively few uses of the word 'fuck' and its variations
-* _Reservoir Dogs_ sees a spike in 'fuck'-related swearing about midway through the movie
-
-The spike in 'fuck'-related swearing about halfway through _Reservoir Dogs_ coincides with the rising action of the plot nearing the climax, and as such is an understandable increase. _Pulp Fiction_ also follows this pattern, with the 'fuck'-related swearing weighted more heavily toward the end of the movie, around the time the plot's climax takes place. Overall, the above plots show that 'fuck' has a number of variations which are used with varying frequency.
-
-## Conclusions
-With the above data analyzed, we can see that Quentin Tarantino has used a large number of swear words throughout his career, arguably to great effect in some cases. Although the general trend seems to be a decline in the amount of swearing in his movies over time, his 2012 movie _Django Unchained_ broke free of that trend and also used a staggeringly large number of racially derogatory swear words. Other than this statistical outlier, it is easy to notice that the word 'fuck' and its variations form by far the most commonly used category of swear words in most of Tarantino's movies. A very versatile word, 'fuck' can be used in a variety of ways, and it would not be much of a stretch, based on this data, to say that Tarantino has capitalized on its different uses throughout his career.
-
-Sociologically, it is interesting to analyze the different patterns of swear word usage over time, particularly for the gender and racially derogatory categories. For the most part, Tarantino does not use many gender derogatory swear words: for all of his movies apart from _Kill Bill: Vol. 1_ and _Kill Bill: Vol. 2_, they amounted to less than ten percent of the total swear words used. The two outliers, however, showed gender derogatory terms being used nearly twenty and sixteen percent of the time, respectively.
-Likewise, racially derogatory swear words accounted for about ten percent or less of the swear words used in Tarantino's movies, apart from _Django Unchained_. Furthermore, the word 'fuck' and its variations stand out as a very versatile method of swearing, though it is worth noting that even their use has declined over the course of Tarantino's career.
-
-At the end of the day, it can be said with a fair amount of confidence that Tarantino did, indeed, give quite a few fucks.
diff --git a/vignettes/trump_twitter.Rmd b/vignettes/trump_twitter.Rmd
deleted file mode 100644
index efbdd15..0000000
--- a/vignettes/trump_twitter.Rmd
+++ /dev/null
@@ -1,180 +0,0 @@
----
-title: "Trump Twitter analysis using the `tidyverse`"
-author: "Adam Spannbauer and Jennifer Chunn"
-date: "`r Sys.Date()`"
-output:
-  rmarkdown::html_vignette:
-    df_print: kable
-vignette: |
-  %\VignetteIndexEntry{Trump Twitter tidyverse analysis}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-This vignette is based on the data collected for the 538 story ["The World's Favorite Donald Trump Tweets"](https://fivethirtyeight.com/features/the-worlds-favorite-donald-trump-tweets/) by Leah Libresco.
-
-Load the required packages to reproduce the analysis.
-
-```{r, message=FALSE, warning=FALSE}
-library(fivethirtyeight)
-library(ggplot2)
-library(dplyr)
-library(readr)
-library(tidytext)
-library(textdata)
-library(stringr)
-library(lubridate)
-library(knitr)
-library(hunspell)
-# Turn off scientific notation
-options(scipen = 99)
-```
-
-## Check date range of tweets
-
-```{r date_range}
-## check out the date range ------------------------------------------------
-(minDate <- min(date(trump_twitter$created_at)))
-(maxDate <- max(date(trump_twitter$created_at)))
-```
-
-
-# Create a vectorised stemming function using hunspell
-```{r hunspell}
-# hunspell_stem() returns a list of candidate stems for each token;
-# keep the first stem, or the token itself if no stem is found
-my_hunspell_stem <- function(token) {
-  stem_token <- hunspell_stem(token)[[1]]
-  if (length(stem_token) == 0) return(token) else return(stem_token[1])
-}
-vec_hunspell_stem <- Vectorize(my_hunspell_stem, "token")
-```
-
-
-# Clean text by tokenizing & removing urls/stopwords
-We first remove URLs and digits, tokenize the tweets, stem each token using hunspell, and finally remove stopwords (common English words) as listed in the `tidytext` package's `stop_words` data.
-```{r tokens}
-trump_tokens <- trump_twitter %>%
-  mutate(text = str_replace_all(text,
-                                pattern = regex("(www|https?[^\\s]+)"),
-                                replacement = "")) %>% # rm urls
-  mutate(text = str_replace_all(text,
-                                pattern = "[[:digit:]]",
-                                replacement = "")) %>% # rm digits
-  unnest_tokens(tokens, text) %>% # tokenize
-  mutate(tokens = vec_hunspell_stem(tokens)) %>% # stem
-  filter(!(tokens %in% stop_words$word)) # rm stopwords
-```
-
-
-# Sentiment analysis
-To measure the sentiment of tweets, we look up each (non-stop) word in the AFINN lexicon, which scores words from -5 (very negative) to +5 (very positive). We then sum the scores across all words in a tweet to get a total tweet sentiment score.
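-
-A toy illustration of the scoring idea, using made-up rows in the AFINN two-column (`word`, `value`) format:
-
-```{r afinn_toy}
-toy_afinn <- tibble::tibble(word = c("great", "terrible"), value = c(3, -3))
-tibble::tibble(id = 1, tokens = c("great", "great", "terrible")) %>%
-  inner_join(toy_afinn, by = c("tokens" = "word")) %>%
-  summarise(score = sum(value))  # 3 + 3 - 3 = 3
-```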
-```{r sentiment}
-afinn_sentiment <- system.file("extdata", "afinn.csv", package = "fivethirtyeight") %>%
-  read_csv()
-trump_sentiment <- trump_tokens %>%
-  inner_join(afinn_sentiment, by = c("tokens" = "word"))
-
-trump_full_text_sent <- trump_sentiment %>%
-  group_by(id) %>%
-  summarise(score = sum(value, na.rm = TRUE)) %>%
-  ungroup() %>%
-  right_join(trump_twitter, by = "id") %>%
-  mutate(score_factor = ifelse(is.na(score), "Missing score",
-                        ifelse(score < 0, "-.Negative",
-                        ifelse(score == 0, "0", "+.Pos"))))
-```
-
-## Distribution of sentiment scores
-
-```{r}
-trump_full_text_sent %>%
-  count(score_factor) %>%
-  mutate(prop = prop.table(n))
-```
-
-46.4% of tweets did not have sentiment scores. 15.4% were net negative and 36.6% were net positive; the remaining ~1.6% had a net score of exactly 0.
-
-```{r sentiment_hist, fig.width=7, warning=FALSE}
-ggplot(data = trump_full_text_sent, aes(score)) +
-  geom_histogram(bins = 10)
-```
-
-
-# Plot sentiment over time
-```{r plot_time, fig.width=7}
-sentOverTimeGraph <- ggplot(data = filter(trump_full_text_sent, !is.na(score)), aes(x = created_at, y = score)) +
-  geom_line() +
-  geom_point() +
-  xlab("Date") +
-  ylab("Sentiment (afinn)") +
-  ggtitle(paste0("Trump Tweet Sentiment (", minDate, " to ", maxDate, ")"))
-sentOverTimeGraph
-```
-
-
-# Examine the top 5 most positive tweets
-```{r pos_tweets}
-most_pos_trump <- trump_full_text_sent %>%
-  arrange(desc(score)) %>%
-  head(n = 5) %>%
-  .[["text"]]
-
-kable(most_pos_trump, format = "html")
-```
-
-# Examine the top 5 most negative tweets
-```{r neg_tweets}
-most_neg_trump <- trump_full_text_sent %>%
-  arrange(score) %>%
-  head(n = 5) %>%
-  .[["text"]]
-kable(most_neg_trump, format = "html")
-```
-
-
-# When is Trump's favorite time to tweet?
-Total number of tweets and average sentiment (when available) by hour of the day, day of the week, and month.
-```{r tweet_time}
-trump_tweet_times <- trump_full_text_sent %>%
-  mutate(weekday = wday(created_at, label = TRUE),
-         month = month(created_at, label = TRUE),
-         hour = hour(created_at),
-         month_over_time = round_date(created_at, "month"))
-
-# callers pass the grouping variable as a string, e.g. "hour"
-plotSentByTime <- function(trump_tweet_times, timeGroupVar) {
-  timeVar <- substitute(timeGroupVar)
-  timeVarLabel <- str_to_title(timeVar)
-
-  trump_tweet_time_sent <- trump_tweet_times %>%
-    rename(timeGroup = !! timeVar) %>%
-    group_by(timeGroup) %>%
-    summarise(score = mean(score, na.rm = TRUE), Count = n()) %>%
-    ungroup()
-
-  ggplot(trump_tweet_time_sent, aes(x = timeGroup, y = Count, fill = score)) +
-    geom_bar(stat = "identity") +
-    xlab(timeVarLabel) +
-    ggtitle(paste("Trump Tweet Count & Sentiment by", timeVarLabel))
-}
-```
-
-```{r plot_hour, fig.width=7, warning=FALSE}
-plotSentByTime(trump_tweet_times, "hour")
-```
-
-* Trump tweets the least between 4 and 10 am.
-* Trump's tweets are most positive during the 10 am hour.
-
-```{r plot_weekday, fig.width=7, warning=FALSE}
-plotSentByTime(trump_tweet_times, "weekday")
-```
-
-* Trump tweeted the most on Tuesday and Wednesday.
-* Trump was most positive in the second part of the work week (Wednesday through Friday).
-
-```{r plot_month, fig.width=7, warning=FALSE}
-plotSentByTime(trump_tweet_times, "month_over_time")
-```
-
-* In this dataset, the number of tweets decreased after November 2015 and dropped off drastically after March 2016. It is unclear whether this reflects an actual decrease in tweeting frequency or the data collection process.
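-
-As a rough cross-check of that last point (a minimal sketch), one could count tweets by month directly:
-
-```{r monthly_volume}
-trump_twitter %>%
-  count(month = round_date(created_at, "month")) %>%
-  arrange(month)
-```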