From 80e830c96d5611de106ff785c4a80d054efe13f3 Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Thu, 14 Sep 2023 21:55:48 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- materials/1_hello_arrow-exercises.html | 92 +++---- materials/1_hello_arrow.html | 62 ++--- .../2_data_manipulation_1-exercises.html | 3 +- materials/2_data_manipulation_1.html | 128 ++++----- materials/4_data_manipulation_2.html | 251 ++++++++++-------- materials/images/datasets.png | Bin 34804 -> 0 bytes materials/images/nyc_taxi_dataset.png | Bin 0 -> 58492 bytes search.json | 95 ++++--- sitemap.xml | 34 +-- 10 files changed, 354 insertions(+), 313 deletions(-) delete mode 100644 materials/images/datasets.png create mode 100644 materials/images/nyc_taxi_dataset.png diff --git a/.nojekyll b/.nojekyll index 048e7b9..b4099ba 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -c93ab8b4 \ No newline at end of file +409ec87a \ No newline at end of file diff --git a/materials/1_hello_arrow-exercises.html b/materials/1_hello_arrow-exercises.html index a3840f9..6052b4c 100644 --- a/materials/1_hello_arrow-exercises.html +++ b/materials/1_hello_arrow-exercises.html @@ -264,62 +264,50 @@

Hello Arrow Exercises

    -
  1. Calculate the total number of rides for every month in 2019

  2. -
  3. About how long did this query of 1.15 billion rows take?

  4. +
  5. Calculate the longest trip distance for every month in 2019

  6. +
  7. How long did this query take to run?

-

Total number of rides for every month in 2019:

+

Longest trip distance for every month in 2019:

nyc_taxi |> 
   filter(year == 2019) |>
-  count(month) |>
-  collect()
+ group_by(month) |> + summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |> + arrange(month) |> + collect()
# A tibble: 12 × 2
-   month       n
-   <int>   <int>
- 1    11 6877463
- 2    10 7213588
- 3    12 6895933
- 4     1 7667255
- 5     2 7018750
- 6     3 7832035
- 7     4 7432826
- 8     5 7564884
- 9     6 6940489
-10     8 6072851
-11     7 6310134
-12     9 6567396
+ month longest_trip + <int> <dbl> + 1 1 832. + 2 2 702. + 3 3 237. + 4 4 831. + 5 5 401. + 6 6 45977. + 7 7 312. + 8 8 602. + 9 9 604. +10 10 308. +11 11 701. +12 12 19130.
-

Compute time for querying the 1.15 billion rows:

-
-
nyc_taxi |> 
-  filter(year == 2019) |>
-  group_by(month) |>
-  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
-  arrange(month) |> 
-  collect() |> 
-  system.time()
-
-
   user  system elapsed 
-  2.837   0.322   0.925 
-
-
-

or

+

Compute time:

-
library(tictoc)
-
-tic()
-nyc_taxi |> 
-  filter(year == 2019) |>
-  group_by(month) |>
-  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
-  arrange(month) |> 
-  collect()
+
library(tictoc)
+
+tic()
+nyc_taxi |> 
+  filter(year == 2019) |>
+  group_by(month) |>
+  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
+  arrange(month) |> 
+  collect()
# A tibble: 12 × 2
    month longest_trip
@@ -337,9 +325,23 @@ 

Hello Arrow Exercises

11 11 701. 12 12 19130.
-
toc()
+
toc()
-
0.441 sec elapsed
+
0.461 sec elapsed
+
+
+

or

+
+
nyc_taxi |> 
+  filter(year == 2019) |>
+  group_by(month) |>
+  summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>
+  arrange(month) |> 
+  collect() |> 
+  system.time()
+
+
   user  system elapsed 
+  3.887   0.225   0.435 
diff --git a/materials/1_hello_arrow.html b/materials/1_hello_arrow.html index 6b9ad30..464049c 100644 --- a/materials/1_hello_arrow.html +++ b/materials/1_hello_arrow.html @@ -453,9 +453,8 @@

Larger-Than-Memory Data


arrow::open_dataset()


-

sources: point to a string path or directory of data files (on disk or in a GCS/S3 bucket) and return an Arrow Dataset, then use dplyr methods to query it.