From 249bed289e295f5c59bc0f2b6327958f14e10406 Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Wed, 6 Sep 2023 11:10:58 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- materials/2_data_manipulation_1.html | 267 +++++++----- .../4_data_manipulation_2-exercises.html | 161 ++++--- materials/4_data_manipulation_2.html | 394 ++++++++++++------ materials/images/segfault.png | Bin 0 -> 16866 bytes search.json | 174 ++++++-- sitemap.xml | 32 +- 7 files changed, 696 insertions(+), 334 deletions(-) create mode 100644 materials/images/segfault.png diff --git a/.nojekyll b/.nojekyll index 5596367..f2c122c 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -4fa3a878 \ No newline at end of file +0204bc42 \ No newline at end of file diff --git a/materials/2_data_manipulation_1.html b/materials/2_data_manipulation_1.html index 5f64fde..6de7025 100644 --- a/materials/2_data_manipulation_1.html +++ b/materials/2_data_manipulation_1.html @@ -394,6 +394,11 @@

Data Manipulation—Part 1

+
+

Goals

+

Avoiding these! But…don’t worry!

+ +

Arrow Datasets

@@ -524,6 +529,11 @@

calling nrow() to see how much data

filter(year %in% 2017:2021) |> nrow()
+
Called from: dim.arrow_dplyr_query(x)
+debug: rows <- Scanner$create(x)$CountRows()
+debug: c(rows, cols)
+
+
[1] 356236190
@@ -531,15 +541,15 @@

calling nrow() to see how much data

calling nrow() doesn’t work with intermediate step

-
nyc_taxi |>
-  filter(year %in% 2017:2021) |>
-  group_by(year) |>
-  summarize(
-    all_trips = n(),
-    shared_trips = sum(passenger_count > 1, na.rm = TRUE)
-  ) |>
-  mutate(pct_shared = shared_trips / all_trips * 100) |>
-  nrow()
+
nyc_taxi |>
+  filter(year %in% 2017:2021) |>
+  group_by(year) |>
+  summarize(
+    all_trips = n(),
+    shared_trips = sum(passenger_count > 1, na.rm = TRUE)
+  ) |>
+  mutate(pct_shared = shared_trips / all_trips * 100) |>
+  nrow()
[1] NA
@@ -548,16 +558,16 @@

calling nrow() doesn’t work with intermediate step

use compute() to execute intermediate steps

-
nyc_taxi |>
-  filter(year %in% 2017:2021) |>
-  group_by(year) |>
-  summarize(
-    all_trips = n(),
-    shared_trips = sum(passenger_count > 1, na.rm = TRUE)
-  ) |>
-  mutate(pct_shared = shared_trips / all_trips * 100) |>
-  compute() |>
-  nrow()
+
nyc_taxi |>
+  filter(year %in% 2017:2021) |>
+  group_by(year) |>
+  summarize(
+    all_trips = n(),
+    shared_trips = sum(passenger_count > 1, na.rm = TRUE)
+  ) |>
+  mutate(pct_shared = shared_trips / all_trips * 100) |>
+  compute() |>
+  nrow()
[1] 5
@@ -575,48 +585,49 @@

Your Turn

use head() then collect() to preview output for large queries

How much were fares in GBP (£)?

-
fares_pounds <- nyc_taxi |>
-  filter(year %in% 2012:2015) |>
-  mutate(
-    fare_amount_pounds = fare_amount * 0.79
-  ) |>
-  select(fare_amount, fare_amount_pounds)
+
fares_pounds <- nyc_taxi |>
+  mutate(
+    fare_amount_pounds = fare_amount * 0.79
+  )

How many rows?

-
fares_pounds |>
-  nrow()
+
fares_pounds |>
+  nrow()
-
[1] 662951433
+
[1] 1150352666
-
-

use head() then collect() to preview output

+
+

Use head(), select(), filter(), and collect() to preview results

-
fares_pounds |>
-  head() |>
-  collect()
+
nyc_taxi |>
+  filter(year == 2020) |>
+  mutate(fare_pounds = fare_amount * 0.79) |>
+  select(fare_amount, fare_pounds) |>
+  head() |>
+  collect()
# A tibble: 6 × 2
-  fare_amount fare_amount_pounds
-        <dbl>              <dbl>
-1        29.7              23.5 
-2         9.3               7.35
-3         4.1               3.24
-4         4.5               3.56
-5         4.5               3.56
-6         4.1               3.24
+ fare_amount fare_pounds + <dbl> <dbl> +1 8 6.32 +2 17 13.4 +3 6.5 5.14 +4 7 5.53 +5 6.5 5.14 +6 42 33.2

use across() to transform data in multiple columns

-
taxis_gbp <- nyc_taxi |>
-  mutate(across(ends_with("amount"), list(pounds = ~.x * 0.79)))
-
-taxis_gbp
+
taxis_gbp <- nyc_taxi |>
+  mutate(across(ends_with("amount"), list(pounds = ~.x * 0.79)))
+
+taxis_gbp
FileSystemDataset (query)
 vendor_name: string
@@ -655,10 +666,10 @@ 

use across() to transform data in multiple columns

use across() to transform data in multiple columns

-
taxis_gbp |>
-  select(contains("amount")) |>
-  head() |>
-  collect()
+
taxis_gbp |>
+  select(contains("amount")) |>
+  head() |>
+  collect()
# A tibble: 6 × 8
   fare_amount tip_amount tolls_amount total_amount fare_amount_pounds
@@ -673,6 +684,16 @@ 

use across() to transform data in multiple columns

# total_amount_pounds <dbl>
+
+
+

Summary

+
    +
  • Use nrow() to work out how many rows of data your analyses will return
  • +
  • Use compute() when you need to execute intermediate steps
  • +
  • Use collect() to pull all of the data into your R session
  • +
  • Use head(), select(), filter(), and collect() to preview results
  • +
  • Use across() to manipulate data in multiple columns at once
  • +
@@ -683,12 +704,12 @@

dplyr API in arrow - what is and isn’t implemented?

example - slice()

First three trips in the dataset in 2021 where distance > 100 miles

-
long_rides_2021 <- nyc_taxi |>
-  filter(year == 2021 & trip_distance > 100) |>
-  select(pickup_datetime, year, trip_distance)
-
-long_rides_2021 |>
-  slice(1:3)
+
long_rides_2021 <- nyc_taxi |>
+  filter(year == 2021 & trip_distance > 100) |>
+  select(pickup_datetime, year, trip_distance)
+
+long_rides_2021 |>
+  slice(1:3)
Error in UseMethod("slice"): no applicable method for 'slice' applied to an object of class "arrow_dplyr_query"
@@ -697,16 +718,16 @@

example - slice()

head to the docs!

-
?`arrow-dplyr`
+
?`arrow-dplyr`
-

or view them at https://arrow.apache.org/docs/r/reference/acero.html

+

or view them at https://arrow.apache.org/docs/r/reference/acero.html

A different function

-
long_rides_2021 |>
-  slice_max(n = 3, order_by = trip_distance, with_ties = FALSE) |>
-  collect()
+
long_rides_2021 |>
+  slice_max(n = 3, order_by = trip_distance, with_ties = FALSE) |>
+  collect()
# A tibble: 3 × 3
   pickup_datetime      year trip_distance
@@ -720,29 +741,29 @@ 

A different function

Or call collect() first

-
long_rides_2021 |>
-  collect() |>
-  slice(1:3)
+
long_rides_2021 |>
+  collect() |>
+  slice(1:3)
# A tibble: 3 × 3
   pickup_datetime      year trip_distance
   <dttm>              <int>         <dbl>
-1 2021-01-03 09:01:26  2021          216.
-2 2021-01-03 11:36:52  2021          268.
-3 2021-10-02 15:04:53  2021          188.
+1 2021-01-06 07:27:55 2021 271. +2 2021-01-03 09:01:26 2021 216. +3 2021-01-03 11:36:52 2021 268.

tidyr functions - pivot

-
library(tidyr)
-
-nyc_taxi |> 
-  group_by(vendor_name) |>
-  summarise(max_fare = max(fare_amount), min_fare = min(fare_amount)) |>
-  pivot_longer(!vendor_name, names_to = "metric") |> 
-  collect()
+
library(tidyr)
+
+nyc_taxi |> 
+  group_by(vendor_name) |>
+  summarise(max_fare = max(fare_amount)) |>
+  pivot_longer(!vendor_name, names_to = "metric") |> 
+  collect()
Error in UseMethod("pivot_longer"): no applicable method for 'pivot_longer' applied to an object of class "arrow_dplyr_query"
@@ -755,25 +776,22 @@

duckdb

tidyr functions - pivot with duckdb!

-
library(duckdb)
-
-nyc_taxi |> 
-  group_by(vendor_name) |>
-  summarise(max_fare = max(fare_amount), min_fare = min(fare_amount)) |>
-  to_duckdb() |> # send data to duckdb
-  pivot_longer(!vendor_name, names_to = "metric") |> 
-  to_arrow() |> # return data back to arrow
-  collect()
+
library(duckdb)
+
+nyc_taxi |> 
+  group_by(vendor_name) |>
+  summarise(max_fare = max(fare_amount)) |>
+  to_duckdb() |> # send data to duckdb
+  pivot_longer(!vendor_name, names_to = "metric") |> 
+  to_arrow() |> # return data back to arrow
+  collect()
-
# A tibble: 6 × 3
+
# A tibble: 3 × 3
   vendor_name metric     value
   <chr>       <chr>      <dbl>
 1 CMT         max_fare 998310.
 2 VTS         max_fare  10000.
-3 <NA>        max_fare   3555.
-4 CMT         min_fare   -652.
-5 VTS         min_fare  -1856 
-6 <NA>        min_fare   -150.
+3 <NA> max_fare 3555.
@@ -785,7 +803,7 @@

tidyr functions - pivot with duckdb!

Requires arrow 13.0.0

-

This code requires arrow 13.0.0 or above to run, due to a bug which was fixed in this version

+

This code requires arrow 13.0.0 or above to run, due to a bugfix in this version

@@ -805,10 +823,10 @@

Using functions inside verbs

Morning vs afternoon with namespacing

-
nyc_taxi |>
-  group_by(time_of_day = ifelse(lubridate::am(pickup_datetime), "morning", "afternoon")) |>
-  count() |>
-  collect()
+
nyc_taxi |>
+  group_by(time_of_day = ifelse(lubridate::am(pickup_datetime), "morning", "afternoon")) |>
+  count() |>
+  collect()
# A tibble: 2 × 2
 # Groups:   time_of_day [2]
@@ -822,12 +840,12 @@ 

Morning vs afternoon with namespacing

Morning vs afternoon - without namespacing

-
library(lubridate)
-
-nyc_taxi |>
-  group_by(time_of_day = ifelse(am(pickup_datetime), "morning", "afternoon")) |>
-  count() |>
-  collect()
+
library(lubridate)
+
+nyc_taxi |>
+  group_by(time_of_day = ifelse(am(pickup_datetime), "morning", "afternoon")) |>
+  count() |>
+  collect()
# A tibble: 2 × 2
 # Groups:   time_of_day [2]
@@ -838,12 +856,58 @@ 

Morning vs afternoon - without namespacing

+
+

What if a function isn’t implemented?

+
+
nyc_taxi |>
+  mutate(vendor_name = na_if(vendor_name, "CMT")) |>
+  head() |>
+  collect()
+
+
Error: Expression na_if(vendor_name, "CMT") not supported in Arrow
+Call collect() first to pull data into R.
+
+
+

Head to the docs again to see what’s implemented!

-
?`arrow-dplyr`
+
?`arrow-dplyr`
-

or view them at https://arrow.apache.org/docs/r/reference/acero.html

+

or view them at https://arrow.apache.org/docs/r/reference/acero.html

+
+
+

Option 1 - find a workaround!

+
+
nyc_taxi |>
+  mutate(vendor_name = ifelse(vendor_name == "CMT", NA, vendor_name)) |>
+  head() |>
+  collect()
+
+
# A tibble: 6 × 24
+  vendor_name pickup_datetime     dropoff_datetime    passenger_count
+  <chr>       <dttm>              <dttm>                        <int>
+1 <NA>        2012-01-20 14:09:36 2012-01-20 14:42:25               1
+2 <NA>        2012-01-20 14:54:10 2012-01-20 15:06:55               1
+3 <NA>        2012-01-20 08:08:01 2012-01-20 08:11:02               1
+4 <NA>        2012-01-20 08:36:22 2012-01-20 08:39:44               1
+5 <NA>        2012-01-20 20:58:32 2012-01-20 21:03:04               1
+6 <NA>        2012-01-20 19:40:20 2012-01-20 19:43:43               2
+# ℹ 20 more variables: trip_distance <dbl>, pickup_longitude <dbl>,
+#   pickup_latitude <dbl>, rate_code <chr>, store_and_fwd <chr>,
+#   dropoff_longitude <dbl>, dropoff_latitude <dbl>, payment_type <chr>,
+#   fare_amount <dbl>, extra <dbl>, mta_tax <dbl>, tip_amount <dbl>,
+#   tolls_amount <dbl>, total_amount <dbl>, improvement_surcharge <dbl>,
+#   congestion_surcharge <dbl>, pickup_location_id <int>,
+#   dropoff_location_id <int>, year <int>, month <int>
+
+
+
+
+

Option 2

+
    +
  • In data manipulation part 2!
  • +

Your Turn

@@ -854,12 +918,13 @@

Your Turn

➡️ Data Manipulation Part I Exercises Page

-
+

Summary

  • Working with Arrow datasets allows you to manipulate data which is larger-than-memory
  • You can use many dplyr functions with arrow - run ?`arrow-dplyr` to view the docs
  • You can pass data to duckdb to use functions implemented in dbplyr and duckdb but not arrow
  • +
  • Sometimes the easiest solution is an alternative path
diff --git a/materials/4_data_manipulation_2-exercises.html b/materials/4_data_manipulation_2-exercises.html index 5944437..52ff237 100644 --- a/materials/4_data_manipulation_2-exercises.html +++ b/materials/4_data_manipulation_2-exercises.html @@ -266,13 +266,13 @@

Data Manipulation Part 2 - Exercises

month: int32
-
+
-Joins +User-defined functions
@@ -281,37 +281,104 @@

Data Manipulation Part 2 - Exercises

    -
  1. How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)?
  2. +
  3. Write a user-defined function which wraps the stringr function str_replace_na(), and use it to replace any NA values in the vendor_name column with the string “No vendor” instead.
-
pickup_location <- read_csv_arrow(here::here("data/taxi_zone_lookup.csv"))
-
-pickup_location <- pickup_location |>
-  select(
-    pickup_location_id = LocationID,
-    borough = Borough,
-    pickup_zone = Zone
-  ) |>
-  arrow_table(schema = schema(
-    pickup_location_id = int64(),
-    borough = utf8(),
-    pickup_zone = utf8()
-  ))
-
-nyc_taxi |>
-  filter(year == 2019) |>
-  left_join(pickup_location) |>
-  filter(str_detect(pickup_zone, "Airport")) |>
-  count(pickup_zone) |>
-  collect()
+
# Preview the distinct vendor names before we start
+nyc_taxi |>
+  filter(year == 2019) |> # smaller subset of the data
+  distinct(vendor_name) |>
+  collect()
+
+
# A tibble: 3 × 1
+  vendor_name
+  <chr>      
+1 CMT        
+2 VTS        
+3 <NA>       
+
+
+
+
register_scalar_function(
+  name = "replace_vendor_na",
+  function(context, string) {
+    stringr::str_replace_na(string, "No vendor")
+  },
+  in_type = schema(string = string()),
+  out_type = string(),
+  auto_convert = TRUE
+)
+
+vendor_names_fixed <- nyc_taxi |>
+  mutate(vendor_name = replace_vendor_na(vendor_name)) 
+
+# Preview the distinct vendor names to check it's worked
+vendor_names_fixed |>
+  filter(year == 2019) |> # smaller subset of the data
+  distinct(vendor_name) |>
+  collect()
+
+
# A tibble: 3 × 1
+  vendor_name
+  <chr>      
+1 CMT        
+2 VTS        
+3 No vendor  
+
+
+
+
+
+
+
+
+
+
+ +
+
+Joins +
+
+
+
+ +
+
+
    +
  1. How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)?
  2. +
+
+
+
+
pickup_location <- read_csv_arrow(here::here("data/taxi_zone_lookup.csv"))
+
+pickup_location <- pickup_location |>
+  select(
+    pickup_location_id = LocationID,
+    borough = Borough,
+    pickup_zone = Zone
+  ) |>
+  arrow_table(schema = schema(
+    pickup_location_id = int64(),
+    borough = utf8(),
+    pickup_zone = utf8()
+  ))
+
+nyc_taxi |>
+  filter(year == 2019) |>
+  left_join(pickup_location) |>
+  filter(str_detect(pickup_zone, "Airport")) |>
+  count(pickup_zone) |>
+  collect()
# A tibble: 3 × 2
   pickup_zone             n
   <chr>               <int>
-1 LaGuardia Airport 2159224
-2 JFK Airport       2729336
+1 JFK Airport       2729336
+2 LaGuardia Airport 2159224
 3 Newark Airport       8643
@@ -331,25 +398,25 @@

Data Manipulation Part 2 - Exercises

- +
-
+
  1. How many trips in September 2019 had a longer than average distance for that month?
-
+

Option 1 - via DuckDB

-
nyc_taxi |>
-  filter(year == 2019, month == 9) |>
-  to_duckdb() |>
-  mutate(mean_distance = mean(trip_distance)) |>
-  to_arrow() |>
-  filter(trip_distance < mean_distance) |>
-  count() |>
-  collect()
+
nyc_taxi |>
+  filter(year == 2019, month == 9) |>
+  to_duckdb() |>
+  mutate(mean_distance = mean(trip_distance)) |>
+  to_arrow() |>
+  filter(trip_distance < mean_distance) |>
+  count() |>
+  collect()
# A tibble: 1 × 1
         n
@@ -361,17 +428,17 @@ 

Option 1 - via DuckD

Option 2 - via a join

-
nyc_taxi |>
-  filter(year == 2019, month == 9) |>
-  left_join(
-    nyc_taxi |>
-      filter(year == 2019, month == 9) |>
-      group_by(year) |>
-      summarise(mean_distance = mean(trip_distance))
-    ) |>
-  filter(trip_distance < mean_distance) |>
-  count() |>
-  collect()
+
nyc_taxi |>
+  filter(year == 2019, month == 9) |>
+  left_join(
+    nyc_taxi |>
+      filter(year == 2019, month == 9) |>
+      group_by(year) |>
+      summarise(mean_distance = mean(trip_distance))
+    ) |>
+  filter(trip_distance < mean_distance) |>
+  count() |>
+  collect()
# A tibble: 1 × 1
         n
diff --git a/materials/4_data_manipulation_2.html b/materials/4_data_manipulation_2.html
index 980cd59..3452fe3 100644
--- a/materials/4_data_manipulation_2.html
+++ b/materials/4_data_manipulation_2.html
@@ -389,11 +389,174 @@
     
+

Data Manipulation—Part 2

- +
+

What if a function binding doesn’t exist - revisited!

+
    +
  • Option 1 - find a workaround
  • +
  • Option 2 - user-defined functions (UDFs)
  • +
+
+
+

Why use a UDF?

+

Sometimes it’s hard to find a workaround

+
+
nyc_taxi |>
+  mutate(duration_minutes = difftime(pickup_datetime, dropoff_datetime, units = "minutes")) |>
+  select(pickup_datetime, dropoff_datetime, duration_minutes) |>
+  head() |>
+  collect()
+
+
Error: In difftime(pickup_datetime, dropoff_datetime, units = "minutes"), `difftime()` with units other than `secs` not supported in Arrow
+Call collect() first to pull data into R.
+
+
+
+
+
+

User-defined functions (aka UDFs)

+
    +
  • Define your own functions
  • +
  • Scalar functions only - 1 row input and 1 row output
  • +
+
+
+

User-defined functions - definition

+
+
register_scalar_function(
+  name = "time_diff_minutes",
+  function(context, pickup, dropoff) {
+    difftime(dropoff, pickup, units = "mins") |>
+      round() |>
+      as.integer()
+  },
+  in_type = schema(
+    pickup = timestamp(unit = "ms"),
+    dropoff = timestamp(unit = "ms")
+  ),
+  out_type = int32(),
+  auto_convert = TRUE
+)
+
+
+
+

User-defined functions - definition

+

Give the function a name

+
+
register_scalar_function(
+  name = "time_diff_minutes",
+  function(context, pickup, dropoff) {
+    difftime(dropoff, pickup, units = "mins") |>
+      round() |>
+      as.integer()
+  },
+  in_type = schema(
+    pickup = timestamp(unit = "ms"),
+    dropoff = timestamp(unit = "ms")
+  ),
+  out_type = int32(),
+  auto_convert = TRUE
+)
+
+
+
+

User-defined functions - definition

+

Define the body of the function - first argument must be context

+
+
register_scalar_function(
+  name = "time_diff_minutes",
+  function(context, pickup, dropoff) {
+    difftime(dropoff, pickup, units = "mins") |>
+      round() |>
+      as.integer()
+  },
+  in_type = schema(
+    pickup = timestamp(unit = "ms"),
+    dropoff = timestamp(unit = "ms")
+  ),
+  out_type = int32(),
+  auto_convert = TRUE
+)
+
+
+
+

User-defined functions - definition

+

Define the schema of the input arguments

+
+
register_scalar_function(
+  name = "time_diff_minutes",
+  function(context, pickup, dropoff) {
+    difftime(dropoff, pickup, units = "mins") |>
+      round() |>
+      as.integer()
+  },
+  in_type = schema(
+    pickup = timestamp(unit = "ms"),
+    dropoff = timestamp(unit = "ms")
+  ),
+  out_type = int32(),
+  auto_convert = TRUE
+)
+
+
+
+

User-defined functions - definition

+

Define the data type of the output

+
+
register_scalar_function(
+  name = "time_diff_minutes",
+  function(context, pickup, dropoff) {
+    difftime(dropoff, pickup, units = "mins") |>
+      round() |>
+      as.integer()
+  },
+  in_type = schema(
+    pickup = timestamp(unit = "ms"),
+    dropoff = timestamp(unit = "ms")
+  ),
+  out_type = int32(),
+  auto_convert = TRUE
+)
+
+
+
+

User-defined functions - usage

+
+
nyc_taxi |>
+  mutate(duration_minutes = time_diff_minutes(pickup_datetime, dropoff_datetime)) |>
+  select(pickup_datetime, dropoff_datetime, duration_minutes) |>
+  head() |>
+  collect()
+
+
# A tibble: 6 × 3
+  pickup_datetime     dropoff_datetime    duration_minutes
+  <dttm>              <dttm>                         <int>
+1 2012-11-02 23:40:32 2012-11-02 23:58:16               18
+2 2012-11-02 23:40:41 2012-11-02 23:45:56                5
+3 2012-11-02 23:40:50 2012-11-02 23:49:20                8
+4 2012-11-02 23:40:52 2012-11-02 23:46:15                5
+5 2012-11-02 23:41:00 2012-11-02 23:44:00                3
+6 2012-11-02 23:41:00 2012-11-02 23:45:00                4
+
+
+
+
+

Your Turn

+
    +
  1. Write a user-defined function which wraps the stringr function str_replace_na(), and use it to replace any NA values in the vendor_name column with the string “No vendor” instead.
  2. +
+

➡️ Data Manipulation Part 2 Exercises Page

+
+
+

Summary

+
    +
  • You can use UDFs to create your own bindings when they don’t exist!
  • +
+

Joins

@@ -402,39 +565,39 @@

Joins

Joining a reference table

-
vendors <- tibble::tibble(
-  code = c("VTS", "CMT", "DDS"),
-  full_name = c(
-    "Verifone Transportation Systems",
-    "Creative Mobile Technologies",
-    "Digital Dispatch Systems"
-  )
-)
-
-nyc_taxi |>
-  left_join(vendors, by = c("vendor_name" = "code")) |>
-  select(vendor_name, full_name, pickup_datetime) |>
-  head(3) |>
-  collect()
+
vendors <- tibble::tibble(
+  code = c("VTS", "CMT", "DDS"),
+  full_name = c(
+    "Verifone Transportation Systems",
+    "Creative Mobile Technologies",
+    "Digital Dispatch Systems"
+  )
+)
+
+nyc_taxi |>
+  left_join(vendors, by = c("vendor_name" = "code")) |>
+  select(vendor_name, full_name, pickup_datetime) |>
+  head(3) |>
+  collect()
# A tibble: 3 × 3
   vendor_name full_name                    pickup_datetime    
   <chr>       <chr>                        <dttm>             
-1 CMT         Creative Mobile Technologies 2012-11-03 10:08:31
-2 CMT         Creative Mobile Technologies 2012-11-03 10:08:35
-3 CMT         Creative Mobile Technologies 2012-11-03 10:08:35
+1 CMT Creative Mobile Technologies 2012-01-27 23:35:26 +2 CMT Creative Mobile Technologies 2012-01-27 14:56:04 +3 CMT Creative Mobile Technologies 2012-01-27 16:12:50

Traps for the unwary

-
nyc_taxi_zones <-
-  read_csv_arrow(here::here("data/taxi_zone_lookup.csv")) |>
-  select(location_id = LocationID,
-         borough = Borough)
-
-nyc_taxi_zones
+
nyc_taxi_zones <-
+  read_csv_arrow(here::here("data/taxi_zone_lookup.csv")) |>
+  select(location_id = LocationID,
+         borough = Borough)
+
+nyc_taxi_zones
# A tibble: 265 × 2
    location_id borough      
@@ -456,19 +619,21 @@ 

Traps for the unwary

Why didn’t this work?

-
nyc_taxi |>
-  left_join(nyc_taxi_zones, by = c("pickup_location_id" = "location_id")) |>
-  collect()
+
nyc_taxi |>
+  left_join(nyc_taxi_zones, by = c("pickup_location_id" = "location_id")) |>
+  collect()
Error in `compute.arrow_dplyr_query()`:
-! Invalid: Incompatible data types for corresponding join field keys: FieldRef.Name(pickup_location_id) of type int64 and FieldRef.Name(location_id) of type int32
+! Invalid: Incompatible data types for corresponding join field keys: FieldRef.Name(pickup_location_id) of type int64 and FieldRef.Name(location_id) of type int32 +/home/nic/arrow/cpp/src/arrow/acero/hash_join_node.cc:131 ValidateSchemas(join_type, left_schema, left_keys, left_output, right_schema, right_keys, right_output, left_field_name_suffix, right_field_name_suffix) +/home/nic/arrow/cpp/src/arrow/acero/hash_join_node.cc:724 schema_mgr->Init( join_options.join_type, left_schema, join_options.left_keys, join_options.left_output, right_schema, join_options.right_keys, join_options.right_output, join_options.filter, join_options.output_suffix_for_left, join_options.output_suffix_for_right)

Schema for the nyc_taxi table

-
nyc_taxi$schema
+
nyc_taxi$schema
Schema
 vendor_name: string
@@ -501,7 +666,7 @@ 

Schema for the nyc_taxi table

Schema for the nyc_taxi_zones table

-
arrow_table(nyc_taxi_zones)$schema
+
arrow_table(nyc_taxi_zones)$schema
Schema
 location_id: int32
@@ -516,10 +681,10 @@ 

Schema for the nyc_taxi_zones table

Take control of the schema

-
nyc_taxi_zones_arrow <- arrow_table(
-  nyc_taxi_zones, 
-  schema = schema(location_id = int64(), borough = utf8())
-)
+
nyc_taxi_zones_arrow <- arrow_table(
+  nyc_taxi_zones, 
+  schema = schema(location_id = int64(), borough = utf8())
+)
  • schema() takes variable name / types as input
  • @@ -529,11 +694,11 @@

    Take control of the schema

    Take control of the schema

    -
    nyc_taxi_zones_arrow <- arrow_table(
    -  nyc_taxi_zones, 
    -  schema = schema(location_id = int64(), borough = utf8())
    -)
    -nyc_taxi_zones_arrow$schema
    +
    nyc_taxi_zones_arrow <- arrow_table(
    +  nyc_taxi_zones, 
    +  schema = schema(location_id = int64(), borough = utf8())
    +)
    +nyc_taxi_zones_arrow$schema
    Schema
     location_id: int64
    @@ -544,13 +709,13 @@ 

    Take control of the schema

    Prepare the auxiliary tables

    -
    pickup <- nyc_taxi_zones_arrow |>
    -  select(pickup_location_id = location_id,
    -         pickup_borough = borough)
    -
    -dropoff <- nyc_taxi_zones_arrow |>
    -  select(dropoff_location_id = location_id,
    -         dropoff_borough = borough)
    +
    pickup <- nyc_taxi_zones_arrow |>
    +  select(pickup_location_id = location_id,
    +         pickup_borough = borough)
    +
    +dropoff <- nyc_taxi_zones_arrow |>
    +  select(dropoff_location_id = location_id,
    +         dropoff_borough = borough)
    • Join separately for the pickup and dropoff zones
    • @@ -564,18 +729,18 @@

      Prepare the auxiliary tables

      Join and cross-tabulate

      -
      library(tictoc)
      -
      -tic()
      -borough_counts <- nyc_taxi |> 
      -  left_join(pickup) |>
      -  left_join(dropoff) |>
      -  count(pickup_borough, dropoff_borough) |>
      -  arrange(desc(n)) |>
      -  collect()
      -toc()
      +
      library(tictoc)
      +
      +tic()
      +borough_counts <- nyc_taxi |> 
      +  left_join(pickup) |>
      +  left_join(dropoff) |>
      +  count(pickup_borough, dropoff_borough) |>
      +  arrange(desc(n)) |>
      +  collect()
      +toc()
      -
      133.705 sec elapsed
      +
      1171.556 sec elapsed


      @@ -584,7 +749,7 @@

      Join and cross-tabulate

      The results

      -
      borough_counts
      +
      borough_counts
      # A tibble: 50 × 3
          pickup_borough dropoff_borough         n
      @@ -603,12 +768,19 @@ 

      The results

      -
      +

      Your Turn

      1. How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)?

      ➡️ Data Manipulation Part 2 Exercises Page

      +
      +
      +

      Summary

      +
        +
      • You can join arrow tables and datasets to R data frames and arrow tables
      • +
      • The arrow data type of join keys must always match
      • +
      @@ -624,14 +796,14 @@

      What are window functions?

      Grouped summaries

      -
      fare_by_year <- nyc_taxi |>
      -  filter(year %in% 2021:2022) |>
      -  select(year, fare_amount)
      -
      -fare_by_year |>
      -  group_by(year) |>
      -  summarise(mean_fare = mean(fare_amount)) |> 
      -  collect()
      +
      fare_by_year <- nyc_taxi |>
      +  filter(year %in% 2021:2022) |>
      +  select(year, fare_amount)
      +
      +fare_by_year |>
      +  group_by(year) |>
      +  summarise(mean_fare = mean(fare_amount)) |> 
      +  collect()
      # A tibble: 1 × 2
          year mean_fare
      @@ -643,10 +815,10 @@ 

      Grouped summaries

      Window functions

      -
      fare_by_year |>
      -  group_by(year) |>
      -  mutate(mean_fare = mean(fare_amount)) |> 
      -  collect()
      +
      fare_by_year |>
      +  group_by(year) |>
      +  mutate(mean_fare = mean(fare_amount)) |> 
      +  collect()
      Error: window functions not currently supported in Arrow
       Call collect() first to pull data into R.
      @@ -656,15 +828,15 @@

      Window functions

      Window functions - via joins

      -
      fare_by_year |>
      -  left_join(
      -    nyc_taxi |>
      -      filter(year %in% 2021:2022) |>
      -      group_by(year) |>
      -      summarise(mean_fare = mean(fare_amount))
      -  ) |> 
      -  arrange(desc(fare_amount)) |>
      -  collect()
      +
      fare_by_year |>
      +  left_join(
      +    nyc_taxi |>
      +      filter(year %in% 2021:2022) |>
      +      group_by(year) |>
      +      summarise(mean_fare = mean(fare_amount))
      +  ) |> 
      +  arrange(desc(fare_amount)) |>
      +  collect()
      # A tibble: 30,902,618 × 3
           year fare_amount mean_fare
      @@ -686,13 +858,13 @@ 

      Window functions - via joins

      Window functions - via duckdb

      -
      fare_by_year |>
      -  group_by(year) |>
      -  to_duckdb() |>
      -  mutate(mean_fare = mean(fare_amount)) |> 
      -  to_arrow() |>
      -  arrange(desc(fare_amount)) |>
      -  collect()
      +
      fare_by_year |>
      +  group_by(year) |>
      +  to_duckdb() |>
      +  mutate(mean_fare = mean(fare_amount)) |> 
      +  to_arrow() |>
      +  arrange(desc(fare_amount)) |>
      +  collect()
      # A tibble: 30,902,618 × 3
           year fare_amount mean_fare
      @@ -711,57 +883,17 @@ 

      Window functions - via duckdb

      -
      +

      Your Turn

      1. How many trips in September 2019 had a longer than average distance for that month?

      ➡️ Data Manipulation Part 2 Exercises Page

      -
      -

      Custom functions

      -
        -
      • Not officially supported
      • -
      • Works for simple operations but not with bindings
      • -
      -
      -
      -

      Custom functions - supported

      -
      -
      millions <- function(x) x / 10^6
      -
      -nyc_taxi |>
      -  group_by(vendor_name) |>
      -  summarise(trips = n()) |>
      -  mutate(
      -    trips_mil = millions(trips)
      -  ) |>
      -  collect()
      -
      -
      # A tibble: 3 × 3
      -  vendor_name     trips trips_mil
      -  <chr>           <int>     <dbl>
      -1 CMT         530173884    530.  
      -2 VTS         617481207    617.  
      -3 <NA>          2697575      2.70
      -
      -
      -
      -
      -

      Custom functions - not supported

      -
      -
      morning <- function(x) ifelse(lubridate::am(x), "morning", "afternoon")
      -nyc_taxi |>
      -  group_by(morning(pickup_datetime)) |>
      -  count() |>
      -  collect()
      -
      -
      Error: Expression morning(pickup_datetime) not supported in Arrow
      -Call collect() first to pull data into R.
      -
      -
      +
      +

      Summary

        -
      • recommendation: write code as dplyr expressions instead of functions, or look up docs on user-defined functions for datasets (see ?register_scalar_function)
      • +
      • Window functions in arrow can be achieved via joins or passing data to and from duckdb
      diff --git a/materials/images/segfault.png b/materials/images/segfault.png new file mode 100644 index 0000000000000000000000000000000000000000..f6a1fb09b9aa4cc14d5a3b389304595f89f3da43 GIT binary patch literal 16866 zcmeIabx<7Nzb#5a6bTFxJcL1lyK52#w-DSRcyM<}2r>ixN@?P_ZS%W z=`k?wxjeZKoU!LT90dMgy1bQBfAZwX!m7#&2F6PaImjDz&$NRjKW%k$w8Wp$G$GE= zS9pD$5u6>Q+dqCJ{Y;8G)o2ULH?dgFXR~iI+5BjiXJ_HB=WI7o+uAYzb?Jqfk2Vua zC9iGwvUc{jst@0z0`J`){)3Mr==kUv4@|=1-J93Ef;opbZeCvPLw+Mj%10wUhc}B` z5hZ`DXh3rZ?#)Ze5dPF*`+5;Hi3sFA#?_H=(T$LHUJL5@V~!jdr~0+GJ%l@1QJVX4 zZ#_;YC(im+Lf%M)Q`b;%OXZy1f!zUn85PEB=7WO+J3Biu@u)8*68-GPly_WFubpzA z`Ow%33ky?vO_=OeEbLQJQ3bwiX!69s2-7ard{c38BKR!i);UXZeS4yB)0vDmmd9CU zpdv9uDYP`FGOIAZEUR{E9zM3ZJ8%ypcfu7uJ~=4~=jG?)Txp`jL>pU45z^ewQ3X>|0!=KU`CZ+HN!+hT7=R$(uBF8>#BxE@);<&$N8&~f z`?rZl2|XA=E7|#VmWWt>Gi9h5wve30!1$$6MHsU*(r3mG7V1qLub;CkEuGH#ENK^|GnPKH27INQY5r6z?kNU_0OMRY2Zx!w zfScQ_DRJ45t5c`oCvPN{X>DFg~Sf zPIxI`PcYPkr%inY#`jbEnuytxE&J1pi;J^AU7q9^7_#^N&b;@p;XreuZvsHFc({T4 z@27wHT47+2vPLQ?u}(}gYYX9F|Bcn7@Wjy2j^W|K!G(!V%ysL3>GZFkg;STXC?Ns; z{}SIvsap{eEz$q0UG{c?+_#c%Zf#eqyC7^k$?)~>#3E%Cl?ZaF_ww=b@$~*63OrTr zC-g9GNo#9sR#v+$e|Yxx<|Y|F{>thqQ*6I~zp#|5nGQ-5=3XsWAgvnF8UapMq@l9GZz;#piZnNx#@hJniyrPVZjH#)qhDQ9Dm94EUkyK@U0+6?Qdqo%- z83hIDUcOv;djZY%ygu6@e5EXyU}wuqE$sTUrNwuDwi1m-1NXMr=+fQ&ikxEj&6_us zm6h^&ReJWdb#pHVx)-O(Svj)6{w!x zOIN?&;aVY9J-rMJVl|?h{N{I>ba6++vH<}F=1FXygYjU zXc^4CW>U!HIc~(f4|1B|e}Qz?NE&3OF&Q1-meR=|S+1&(X%nr|nYq1{I{XTKxS$9I z6)VoEUSH!BhEqCjmeVWx@*BMbvmdHFzAPo*+ztG(jI}!gc6jvK#%6D>+H!EfpEou2 zdZH~SC#Q03an?%Q?>eHOVC&E7E{N9t`0R|A6z7#nM$MdUB7a#~nZ0#o1f|dbS4jRl zMh2x6qUX=2*WaavhRS$)0%@9dB)#}*HI(|~^i)AXp*NC(1RwwRC=ItNQKYn~qhnv~ zhEg=E#l%i_+Jux4wZovd)8#AJgkj;R`H0s=nRz*JGKc8PC!6Q_ClgW+8|RX#<5?_W z8BilVf(!oy)20nJCFEf`l4F-0n^KEWF*T{8|0Mw^@l1L1@ud+S-{)}icoMUjv|rSR z9OLuz-X|OV)s~}mwY3HdZ%9zk znHbY2lcWmH;R=&>%aJr87sL?y_C``tGABDb1c%JId+tG$(3ADv1xlZ-)m10E8!5`s zQ_3(47eC7Y8Qz_{7XCdn6_cO4b5N^IruXUhid8{)c!|ZE?b_xmqy83;EklIpHFEOD5p0eDjHpZ_ry*tB=XCWy(Otd 
z7gVo|l%J%bxmiX^YR9$7q%$O4#KZmw-WnVn47?xXrD+lY=*dF0tTA9(aopj-Sm0!=5Do8)cFxRqEAxr_L}kP-DtNZZdw*WZ0M#b5tYg5 zX^n)bscFsh-$$N)OW(Wnh-NfLF~&*;ib_gKfM4On&4G5_Q~0RME|rmcMlPX}Q1hAJ zo#Q?IwY|)*R7Niy%2I$8RGW3#|HjVW-`~cj*lgiodz+Diqr9-Nu_zCPLgg!^@|(-2 zB_?j|OcY%1&$*m!$K>ZT$o%s3@d?jv@Hm1?hCZ8`nOU&qm!aO<9;03MJ&n{YMF&?G zhe4a^tu;=9xp<4`Y!9j%8(Ze*KBcCiu5cn<-X!3qeoHzT+RNN%8~5W>;wABGv6gb~ z=z@Ox^$WudV%w4ZYT`1%P2>cNSZJr1guW}dSmU0!|Gm#C+hI*Y;aX)%%X;pWG2P!< z&Aa`0KG04A|Ac?u{j=Q=bdw;Se1X^6KbOrarc^Q*?BFbhO7x+L^P0&PYO@i)kk&Cgvp9)4{rpr0a!Ho~``> zrwe;~IUS+pITPN4{%JjLBUz%MWpmNmm*Q`!@PCFWYsV#=cbXr&;`-3UnLB95;S&;U zlYFZ{Mw)+gpiVdCjD!a5=Zz$Y7fI=meB4`BkcQ-A?yw-brGMVPPn%7ckZ+zTl>z_8 z%zA!l!{CR6sThkM?9Lus%+JqvhT>&P1Q=eI&P+|Ei}_4XOic9l7ThWHn{oI4nU9u0 zTXB+(q%i>pm9VSTyLZ2TXZZt(cWP>in3#BTa}!8^d3o_re{|!Uy!Ypps8XX@>@T8N zc>&8*L{WRbhKzmQZliI%N0TRhsP8GK0vrZ$bu+H3_ug<4?Q;|D&t;r1c(~gtyS@lx zsbfp61mApMA=n+nIX4e^duuotH3@^J3}xdOkga@?L^ieo1_~40}R!a;u|7L^*PMb#480HC) zgh(Cbigfl!Fzleaj_>K)m^Q9nnfPU_cF?@<8Gy4R{Gt zkjQ_WKNL@bH=%Ep#y(J=rET<%20v!!SH3w*=QIC?+yyz7RZmR@&{p}=(~q3$f*q2a zEvXsjL1jm$dJr@CX(PPyL9tvKRc|kr@SU9wL+`hF!_y@>v7u$cVIc>5T0vOLC+QV)Sveaf2Q&87pJY5pbwf>i!*PFSyg0(r5$K*oicl0^e zCAGyIxwHk8goI=-(}asgJ@2q2C>x&PZiV>kUo7=b-MxXICDPhHdm-vMT)_J%Xvto} z1rbqSuce)YAb)PDqnSgNH5O5!TMFO^-~bQ?#67m(j9+fW>rB z_b{O=o}#V!%ZqNi9E1*Cm+o9Iw`8veonTp- zb%|b&JRY^;urGoCkSKP1@S`z%h?M|X{xguAwC8J|H8_chNj3aXSwwB|hrVA_P!W+g zb>-vEkU$#0)rc+UVTjp$ELwB*`#E6~M}gm;PAa|F4()369GQy|4?ale*%k>FQ}s=h z*~G~HNa6i&H#t{ueiO0_sQVT{+&mbc6MQHc$`1mB9A^IUvzC992fN7bkzgbr1~ zw!X(#K8h%F;()C6(A`;|UI)i&s2jK{LMJ2TvO8+R9`h$lf!Eev_!SJIYS3+F>geOT z5i}#o=~Wbmz_uqi%W=X_&icCSly2o(K$2MJy&KdKou9ey?xUonWN4US)7g`i_xGsH z$yrYe=sa`aeBh%Z{6GbT`Pb9*wM3i8@zP5%KPW6~EaA8nP`tiQ7>^chbJS0#O z`Ec1SL`iln(|xPW4~7`PK506iyMdz(vM;K}C{}Gp5P8@c18&kWCNQ{A^(WkL^@P34 zjg9^O{yCS~-E!9#t3~J6)KI7zlSb6zuW0mam0Kz|d~^gkVqArfv~i0~qd%6;4p|MTKlOM zi|QgMt>#xlw`bcdWAj^VQUoZZ=dTJy!RVC*5LibH*SFu3Pft(MMRxKOZd*qx4edLM zCY$v=j*0b)^=|pX-lzs0*PtsF4uIi(jQ0W+9D4Pc00eqLPA>eZK26AdUmJjG#>U3# 
z>XXOoQUIX7ckiBwi3z3`ZEaB+N#<1!|4n??;OIU5sAxHVYnhX~WZZ|>sH10lr_N`O zaTt16WjM{+8vSk~=tW~Ht4?vMt@`vE8Bb5np_`i;5JIaHOs3dnC6)^OkTeSkP}Y=D_MNHzwXN{#XB^^?M-?l16>}<<4-GGE zfcD8I0MPd7X1p}mF0s4D$%(|DsHL1ECI_40$$eF$Q1ywj5xILM+{f%&{V)7`SgA7O z?HIt-{#sE;^v0lO*6)a>TnxKh%{Nii*FEI1j>A4q;r7!F#Yvx+juJ~&*ZSkvi+P9FxcIvA;tALr$Rj=m4lWY=uWjn+w~H06vJjPGEoH^Aw8V- ze2NScEX0siJeltIO!=~4?R0oJ@vhj%r>**_gmr(WxTt4#Rax%#3W?4UYA0p{X+fN6 z>Gn)T70P~IF4D$USS;%1x|~Se)@BlRIwBNkV(ISh@X&}=l9eZ|slnrIf1%n4)oRCN zQQKt`jM?2{%V0Lhm_J!jVIerpAq1wt`3K911Cq-@wYiWe# zA{XTmj_%#=v>UYOk z&%>X?m^bG{ZnzTYwyR-Htd2o}9^=);lx_*;tqJDaqI!fR)I7g@ocirPL^7=Bjh@}m z0~n!4NbAFxrR}hX@`^}Co7>E$DwY6F8f+(g*n)+XwdDok3BIa}OLg}Lve}X0&Bet< zFc|E8Lm6a~lLq^^svty041weuIM|7#FJ9U`561FC3|Ey9#5NHLXg7HPnC1Wb(VjPQMb1&!ywGkVyVh#I--o z#5K-2(XzRkr`@_H#c(h^ zu4#9~j^#oyKUB=iXVKjt&O!3nWGl@Sj2_V_j4eBOHKQ~KEN~6SD(D|q>ACDMY($Ba zX~F@p#yP$eDflQSFFTVrYOhCGkV)y&?`s9NpZrcg(RdFYj<1vy_!R;sz(The+h=I@ zI}z1xoENz2!;GpbkO~%PsZ2XCILm4hF;&@e=x;D*Irp&9JdqVtkErE~ZTt+! z7I!lF&V55m2#T5ev26YleAseGJnnQG&>z=dZi_Br~~@9McRC@peu(r9c>?G zun)o*LJUNrJ$fST>kd!kYzKDa(rM^6j^^jixi#4L+dM7~Jn79tH2YmB%flLdrWSlS z8XJGTK75#2gPfS5mWz10**LiT`{Ib67QBf9TF)kNOBSE42?h*rG;6iCUQV3HmVXeY z)ORp@{EX7d!A1ftx${+qdUt!fV$K%WH}GQWnKD@Qs0O$+^jbJa*@bZL8~WJ(o@i=n zGBTlA14@ifHZ-*0??5G&9AQG*sE-~;%xLEi4^D)iZm%9Z7SvEz*Vi)nhr7A4vkSO3eVv0iatxnz!_*w;(VX$>lR?Y92N=I_e`)r5 zOZXYR?3wW9|0PgD`Odi!<;{+V@q>i{+%br|<=~z3j#1y>ZTaHTlFoC0@fE;Hpm_^F z>jA3ZDf+X4&Mz(=F#mOYj2|+6sTsh){CzqH2kOOgLJWJsB#z%a60j^tURhbTrMS5G zjLRi9#tOj@)n(+L3-H^!d;x&?C}YpkVjbkhF%-mTXkV$Ass@U>s{Qj>q^XW();nW= za*Rbs;HFJU8R+P`rZmo87pfbM`f>b2mUnz4Xf4mTQQfP|F{!D)my>0@B{tsrU8Xi3cbC`?SJyBcvOqOaTAR$5xCL@!>5 z`(8Qz6VS+S0)8?5WVz#U0L%E1MTi%;C$~= zr3$?W2*`kb6VOt~dZ(DoZ4Pj}Uj$5*7-9i3O(}(!iILIj8~;6wmyx%xDEj8djcGf! 
zN)0d*%mLVU9}^QBJDQ(A<+1xF91f3(iCJA;ZIA=P`GDZ4B^KbdaGj z=%aMg-YBY{6!3<|#z<1G5A!wFInt4DXBRz(bS|x*o9mSMw^!@1xUPOwy7IeYY`!SX z1h``T7+S*{EMQ|n>1Uy}yKh)aqx<3E;n>)i$CBI3_E=6pzuhnqmXx;k?Agx5-=Q!v znpqxP(Iw;GyYK7U;^*6f$v5_tDfNs&OKUPbO{R0JO%t)poQ8$Y&~)t$r8m#ZL}^Z# zVxHb$Pli`LYM{n_8K2$^6VkSKY9Z0~6r zGBOgUqcc0(9;*k~#3heWB}GL_9*d!20)7ajiy2DK4oURA^mM7rLY+Jcq}iK$NJ#tk zKJeht7DPHwsNxaZyH^Ey<@(<^Y59fSm37M+^-ODpeb>@zh0Y<6&E{!12}zk|C}dPS zzWdRKs1D`!`Ene@9p+ob(bSV>XvlI(}J#MyN{B+Z6&jrESJ)(lw=`8X$9iq zQi-r({F8}92Eo0|dj96@g$cyI%zEac%`^<3nyUHbI&b`uu8)zgYmhL6D!yXB)|UsA zKB8Us#$m*dUz|bM+d}vAtXe8O8843`6;!Gj4C!K%U0C0DuqX%_^5efPeE77!cVrHg ziQ4+Gu4Sv!>}i35Ea|pdjXUVL`k2Z75Jp{Ki>X)YrHTtY{Vj6l@F*!Ybpd$y(1RQ) zAVs8(Pt-AJkfT{!?)ROOs_xEryc3Tu$B>d`4&95ui-lofaMg?7dL99d4)-nL&0zFg zq9xU$U^4O9D2J|(`)cT84@FI^{B^1OGOogYJMy=29w*|-?G$jmrWFAj4X^T7Ugx@D zwbGfRM`*M11Ris?>C)i_pXH**@Hyw#FuvVz4*vogFad6Az^!(BgvT?B-K*WStryxC zX4~!cTc=Uq$t7-Fmo+mWorq+y8m`v6*{`?rGp8`c4cz2twmza>dXHuCf<~Y9M7{Mm zXSI>j%Q8=IJ^g8d&2CF%uD~;_gg+(8k0rdmy~<;>Wv;Zdq$1KZ!(gI%vs7JLE zP_y~7IRQ;<=Lc>=s?8)XP@Hqeg=L$DiS;|?TL4?>W92i8;E+F@PQuG&^Es=G_(@fm z1u>gCV$*B%S$<9*lXRD6)yE{iRPSR-s*rz4(iQO2+uXtW{WmV3^FVW!_{UTajm96X zq+|tPV)mu9^8Nz0p*X>-Q`7Y2cSi3o@!FoB=}^1$?`d{l5qeq2W@`5x4fm6TL5Yr; z3JC?lYX$T@$v6c=kU;T09`?;fzSww37gKL{uvd)9jB;m$<@C@y0;l=McMfvDwnXrr z_z~rQ1b^8m&9X_bSeuX3|-EOE@E1GIca2GW`>(*MQ=0yX4PL zGYF-n_GVC(>#|nMkUdKyXAO-*Lw5HS|+?3 ze(cZB3&sT`czK?z?;TA$LI$jD-mga=47<5%6LIwq5^^9iKEfRmgDN3r1IT^HjhwyM zw?b>Yask>ek`vQScc&k%%&!VOdGrmulRkiV_-ev~jO9hYkRbhgI!J=*c_<-FB0CFKp%#>jsktP zo3YOD8LF$awf?@l4UHHI9iKsaHMeysvOYbRo}v}o*A18#Op0LG`&<*$keI%ldHF8V zhVDH%M)G$wQwzo^CIg3Cm?4ti@-F^R- zvnTT>v#I^!-Ci+~YUV{Bzqd(m43a!b_U>kTtE^P%m0{k7r8wG7`aSFw?G-botdY61 z8A0;E;KOCXI~KMFzh6~1_IK1gE!7<93_Ru~B{G2-#%q7i$XIZz2%pFsa#*~0EB!eP zD$7Z{xk2tt^=#d>|1@|=BA+A)I7cO6jp8ByiP;!=YFqP2dZyCIz9t ze84XVL_H=Rc}~oLcZ0kp9~?|yNUw3Efr;6L!|Cv2jKxbwnOba*GZq*Lk22n$mD@e~ z3>|uSWG~@Ar=4tUO?M|7DFJ#ab>uL5bfehJ_~f}!q-G~bRYMbDn6qvYEHSDPA0Osd zyA$J%?l^vI=@bMKA>|pHZNuQtpXTfNAUsx(~qP4+volQt(_wD5Fw=g-I 
zYGr|os+OGNPzr~BFII6#X;{js?Fdth)liBDn%o0*sWw&g=i$-2U7rA@wJ_P(laBQ> z>5eY4r!F(;_gCG@%T!RI_s$~UNt%L9IkbBViro?LoJ$WK1ir!b$rt4-qb zacShra$Ur*PQF5th{F!Qk-?(*mxm^JWk&w!=%>d|UAEXp%uLxGe!F=PYPI4cr^-zU z5krkfNHr6uk^bk4HJgK&bsW$qs=cLb#y<8F)xRg}mknmG5Q|oBP+@NWW25t)sJ0Z5 zi|qrS$W)7v$I^7R+-$x+d*2V8XcIPDjuYDUvG(q5$L#<5j0s4RdIJ~@j<+QNxzRt; zriBA{Fu1w67!%_Y1vCOG%MdHXejm`-l*5k8LQ?^~#9-$H=I#Ganipe7b6`!pl^LZb zWy?#8Uv)8J0g&+*l_8bza3vN>6mD|$;RNeI6%IONg(0xz^Jn9J-I9}wzy{}zgbGIG?3WuDP=G*81#EK8Zn&_x*-V(0|H*AX!Q_2M+~2Uz_d1+` z5*7=5vO;O(lCDAqlh?y92A1_vU+(7Y)ApjY8mVJY3Wl2*YEA2uBf!!CeH;tyBu)Xd z;^$TO5g8OJ%joUUy*TSw%S`7z601}25Da$l_m=k!Vt|xBjm~J)bf5vNjbLLlNC*Ao zqXAXA@RJa@k*VPhr!!N3BAab5}{*+sW1s!7;{t-tO(dmh5XK*Nz-Asi>?XYJPeM+Y;(z zys>FEjHQ|duJ^;RhUxV3>DVrY8)`gaJwIC@5&nXVfD{Bqo|e$& z{H7yfc6(OHx-lwy=sFSD(+w)tpaC8KWlh9LCf18-Q=F%V!={5ET|-{G>uPitms*&= z0FBR=;gq{d!_&3)fq{-l-P>4>%_k5#a{jyP%0Mm(yoAa4eS`7=-HQ!N145a;xij0C z!=vtaa}9bg>XPIlF!;2r_ql_e657(qd=rm)h+V~pu2q(<)5nHqwiho!T_b_`=w|S3 zWZHJKR^|d>9X+h5s${pzk|OqPQ_1WZ+H_F-Os~4Wrp|1c-2bL3ej$zPCHYy%arr(# ztD#ByHLR^OT}xl_+JC*QL{US#z6V{5Ojx4;eWfjm^R@4ichKFxTLm_sfO_>y|C972925t$VMk1JTjp((^VvAQGM0Z8{4fJNJzk0AB7wi*yd2VbErOAJ;C4G(rdo_Xvxg5Y8|4{ zzr1#mxt!)Qw~}}M34Z2D33hN zSL;=KE+vFgc7-Zo%=3IP-&bxK?Y&IC5ZXT z7A`3#p8%brQ&uDUs^Z4RU@`x|^o=1|GC)bSOwPUa{~X4Af|p5MP9X^);-}RS`1%U$ zufIDWQW}aq0EL?NYXke8PrFX$oyeoSUu{>5Bfig+em#D5GR`1T`E5BhmV*Gq)Iy*} zJ#MJ2392`nvc;7_ZTPs6SG%tsxymP8k)67psP(cmm1}@9zYBk+1^erMNb7eTP=ug1 zkO18u1)YH3c~%9eqVW=mCr*n^2j<)aPqHit;vs4Ln_*^l)XkW`3DaPo-)!w2?V$#~ ztP9KH%W;wq?wzx^dT*JiS2%M8#&55Ik`)%O?7ilVG8DD75EMQtyG@aWu@MZh=d;rf z376gYL9F^!!5#iL11xSF4%g>N=FcM!+s`TPzLD~;9@l63c}-xu2&B^=48@&ul#Llg zq)Dq$F-Z!;v_gWNq}9Y?BtU~m!|hKMX)oQwCG6!wSb#lYwTLqd>u&Mt^2h5UDaHhpFF^dFqt_eemWbIBWa1mIG}NP&6XX%9o-` z!tY`9Gf}JH-|!uJ`|UsYPT~I9Q!h;RUk`tFynT*7Sn26+y}N%>pJzW8(Hp$86EdGm0+;jt95kV$B38l6R&;iN^d0&_A>rvrJ4IrS@<09x z>q?(3*|Xg*#(C)%A0{gYF=OT+@m)vVXz)k~vG3F0^520ECXanffHYwAD$N{*<4Cq ze@%=iq~7x94tZ6{s8oM-!eiOmBFtd^$*}~k*hL7MN~L(z_`U&YEQ+gdGTsr4x~O6E 
zj(rUYMAnEGRO~s8zK^Re=%d4TY8lVmf>m93t1~VkrM=ZnxCDvc&PULQAnQZin@u3C@2}vc2In2UXlyQ zECXH51w^-lT(KSce-1C!M-JoOo%!o}m+f&>2JTUxJktLhsdGCD+rhDZ8bk9@NxkC; zE)1Wo{XCm=7u`!aI?}%}jdE*DfM(v+<7r-7h>5-_eHNDFdlcS0_#h_e>*&N`<_|#a zv!2PTc(->wN14tsk3L(pQi2MDO=9Uc&tI*v8&saO!&?2yohF9<{DKB4tiL z(C{;`M-2@K8gPQF^A$C9=9ngWfPL4*hTY)4a(}8+PRkL$(M+sx7sG8+(F$uP-v(VO zg4=I~H{(4UWqND*Y(_6C{N6yI!uEEb`*sW$D~}V>Ub)eHbnENNt<@=LzJ)yM%k7;R z)^FM5P80}a3m12ENl56s`ILG&ogI*7Ub%K5+90`POk zYA@jM^!^d>UkGIXRqOg+%W40cTXSB?kJ11-YU4lF_y0 zv+6cgW7%kUGqja8FG&!6bAuwDtULW8S=KNY76O2j7Y4AWf3ZAebkvsKW zOW&g`or>%Qr%6X&y=WKNLXpj53+xZ@`jD2-I>es_Upo9EMq%b(6Y{B-4{&aTLAn%M zc|Mv~D6K47(yMse1d4!$^+f%q`(sV(HSov^=e`O0dRH{RwPg`Qo}|cene1Qo zzu}X?S@4xEdEw-LBX<;+#IB&zY+q5RLz#_ZDAsuLxn+U9i7{6q$$-xsRQheCECQ*8{_<+ZaP9li$ zXs_e($vhN_3KUqP=n~+0!#eeq*Vpwb+~bjoMx&a|-dJ|^=n;Pc6!HSq=@G4;eNJ}R ztvx(^gDpREP+U&`{=wt&R6_4j$8wsI(z?3U(*O#7-(~rTnStJXQ7T*HO=_rsIzIBM zG>xK-v%|9CF&`iAk%W5F;X(+*hx}!VJGj|J@o04B?T>Cl^Vk=Rm7F&Hu|+fr zk`KqupL)RDU}5rPZ)DCxw$3BJ#-!ALmkgz#z+&U2>w6VQ6FkqMX;S@FvBu(OAS2@} z%eSr^a|3?Wg+*n^*m?(|aETZ)`K}dlU^*yK%KAJkkk3KvNAlrRT8y9bsT`O4?)KS( z3}|u+X{~*r)T9*4t*`h0ItjK==U4<8CbRqV=^G#3>`GrzkIvR>ZvTgC9d zrfUK@mX)-GutevtG z->i$PK`eH=Rv0~) z7XvJ8Oj!IlqNbwK1(f;RGU+>v-k7lonGUditVI|2M}I9W1ce>rZidYVf9ak|o?5$z z zd0fhk9ee{R7L#3<;^Vu!j!DnkG`;nWV@~)uEYoa(!ACmNqFZlQ^HExmx)@>Fseu=* z&PJ(qT^ykP>{1JVEU1gAfKY@Y+z^E2ba#ncOMAN|#_2XKR!UEd+ZY^AmdBsnvlKGz zMTj!e&(pKC7uTKtud&7m#|1^pYEJ*kMw8%3=pf(2ywi+fB-0l zagvu1$%Q2l^2Eb0WDjun==aS3t~gx?Nna)aiO@KqVELz)UB&zXMqbvbt7VCabVlD^IDv;(=tD`a^ZJZZ-3LEZOAXbMhVaa*e(#Dafb zem=#f-_p$LCG}9+jY{VjcfCt))XX&#>gUDmXm=v@FS`#;PUcQd^dKgb#ttkpC6BC) zuH}Rg1R{PRSce=6Ht*umEMgNBhlzWxmfkz=^vkYkSJl-JHR8Pcv{|eYHN^2WL&Dq2 zmQlhjaCFiMO?#(Wv05a|NHv1{RFoxig!+6b0><-RFI z!DEuUj6W^*AqdY%4T*Xj?9rMFvoYeSG*pvu7g|?0t(hdiRYgKLArP#{QQ456WqF?X zh^2cttY;3t9?FlPeA(LrJ}Ro0F$#s688t}!@Le{+#>v*Ep)rw=vT5#GFn!F+Jzsh z6qnqbGP?xS9cTICOKyE+)6D!qLOrQWOu-E+6r!!7?G%H~%SUK}yStY})sW@#Q}S+B z*O)_Fd_$9A?Vl$vGz{IsA_E_t@!cnN++O~~lEj<<^WmQOZF_;RjMNy$x?I~OuM2Qu 
zb%O=>C1}0VnYG2H1ff_;XXaYQOY=%|u?+n}mm3$eHM|fE)0V4~7;cx#i|h(a-iyEl z=48k z+$Bw>!%_6;iC@p#YEfoUEBNdW?a_;lqe0Ev&OaI~s^&W<(E}F(j224NNLM1g_g1pE zliD^mBeVY8r!snHaWNe590Q^6Lb{I5NG;sykO98TxC4iaCd>Sb?nc_fC)KWo6i?X+wMekH<$Z+nE*<(~WoIIp9!dML z(?lx~1M`RH5)trMfSU{yqsRbYZ=6nGSAAyLbEb0`0SN9_cRr*4hZF5T2Z#UeTbs7x z1$>C9K8?4JUrRXINBoWwfxu@41zPR}zv+5V9M>85rcE-Wp^2cOQi*DHZ@;!5;BDW_^@OFcuhGay26Oz+TB{TD-H#gw#~kB^TjVHphY-@WO2c6{;C#%7Di z&-XsZUk7FIG4H?h_}^uLTmW3Ia5%emX<69=z&Olb;1Q3QZwZ;vRoD~Lj+dXChHI+L zZ0r^R&N>D&#^(ref$JEP7o;ixx&4OM_x*}z#*?kT#m74@@4>%5n1Ate{%UFhL^ci9 zkxe3^J;R7A^8Ys8n-s!^v`j3RssHDiy;q;3A|_eGt1Y+?IU zJTSt;IRCE(McL18Zh-mJk>@m`d9#$v@Wu8r;D+S=4{pf+g*o^?c_;s4>&(BEhf z`Ueu?$m2d0)_=^DDl+XP3@-XZ5(uQpUz6ga=%L6g>72YOc*x)RkMqLf#IG~;ViI<( z$o5Z%SP2OSuRrCY#Q(&ieZuKNTkYJ70`%qInm5nAJ4N^MSrp&@>+Jjz+UjP~b)e8H zDk=sblKs;|pum7w%Z$7Dx1qJwlql&fd|76B&f)`=b*Iyz#0-MhB6MU~LgXgI%lc0}Ld z%I^3*jFm@Vm!3$)9?sBb0hjBQ?^Ovm5s01voksl;ZXCc3Z62YRha;%xJbk4$e08iU zZ~m>|ap+%LEjkvYlcIY9E1va1^g2#=eZ}c)YHn<iGzxjjuZLFvBMUhOoVu5n zM*&=je8jI$EH!|ujFew->ePjQ^;ffWlLC)j((PGUwTxuPi=>UcYX;M5&YgF}z4$PWmI5a2rl0{?z5fRcNHi<lsu0x zRzlo0ZYiXS<4?C~w1}`qzQhVU*uVR$E;O@`XkJWLV>EMa)bHs70-G#J!u}_2a5H@Z zi!^n(r_ZUF0;8dFZePoZE9-66il7unv7wKo https://github.com/posit-conf-2023/arrow/setup.html - 2023-09-05T17:13:28.973Z + 2023-09-06T11:10:57.739Z https://github.com/posit-conf-2023/arrow/materials/7_continue_learning.html - 2023-09-05T17:13:27.789Z + 2023-09-06T11:10:56.359Z https://github.com/posit-conf-2023/arrow/materials/5_arrow_single_file.html - 2023-09-05T17:13:26.937Z + 2023-09-06T11:10:55.411Z https://github.com/posit-conf-2023/arrow/materials/4_data_manipulation_2.html - 2023-09-05T17:13:22.689Z + 2023-09-06T11:10:51.062Z https://github.com/posit-conf-2023/arrow/materials/3_data_engineering.html - 2023-09-05T17:13:21.401Z + 2023-09-06T11:10:49.478Z https://github.com/posit-conf-2023/arrow/materials/2_data_manipulation_1.html - 2023-09-05T17:13:19.729Z + 2023-09-06T11:10:47.634Z 
https://github.com/posit-conf-2023/arrow/materials/1_hello_arrow.html - 2023-09-05T17:13:18.289Z + 2023-09-06T11:10:45.930Z https://github.com/posit-conf-2023/arrow/materials/0_housekeeping.html - 2023-09-05T17:13:16.593Z + 2023-09-06T11:10:44.026Z https://github.com/posit-conf-2023/arrow/index.html - 2023-09-05T17:13:15.101Z + 2023-09-06T11:10:42.438Z https://github.com/posit-conf-2023/arrow/materials/1_hello_arrow-exercises.html - 2023-09-05T17:13:17.605Z + 2023-09-06T11:10:45.170Z https://github.com/posit-conf-2023/arrow/materials/2_data_manipulation_1-exercises.html - 2023-09-05T17:13:18.981Z + 2023-09-06T11:10:46.718Z https://github.com/posit-conf-2023/arrow/materials/3_data_engineering-exercises.html - 2023-09-05T17:13:20.525Z + 2023-09-06T11:10:48.534Z https://github.com/posit-conf-2023/arrow/materials/4_data_manipulation_2-exercises.html - 2023-09-05T17:13:22.065Z + 2023-09-06T11:10:50.262Z https://github.com/posit-conf-2023/arrow/materials/5_arrow_single_file-exercises.html - 2023-09-05T17:13:26.281Z + 2023-09-06T11:10:54.699Z https://github.com/posit-conf-2023/arrow/materials/6_wrapping_up.html - 2023-09-05T17:13:27.493Z + 2023-09-06T11:10:56.035Z https://github.com/posit-conf-2023/arrow/materials/8_closing.html - 2023-09-05T17:13:28.109Z + 2023-09-06T11:10:56.707Z