diff --git a/_freeze/materials/1_hello_arrow-exercises/execute-results/html.json b/_freeze/materials/1_hello_arrow-exercises/execute-results/html.json index f7b778c..37d1448 100644 --- a/_freeze/materials/1_hello_arrow-exercises/execute-results/html.json +++ b/_freeze/materials/1_hello_arrow-exercises/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "53f610ff8cc8524ff8fdda04614a7b6f", + "hash": "46f80887ae90f6136973650347ae57b7", "result": { - "markdown": "---\ntitle: \"Hello Arrow Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n```\n:::\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 122 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1155795912\n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year %in% 2014:2017) |> \n group_by(year) |>\n summarize(\n all_trips = n(),\n shared_trips = sum(passenger_count > 1, na.rm = TRUE)\n ) |>\n mutate(pct_shared = shared_trips / all_trips * 100) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 4 × 4\n year all_trips shared_trips pct_shared\n \n1 2014 165114361 48816505 29.6\n2 2015 146112989 
43081091 29.5\n3 2016 131165043 38163870 29.1\n4 2017 113495512 32296166 28.5\n```\n:::\n:::\n\n\n::: {#exercise-hello-nyc-taxi .callout-tip}\n## Exercises: First {dplyr} pipeline with Arrow\n\n::: panel-tabset\n## Problems\n\n1. Calculate the total number of rides for every month in 2019\n2. About how long did this query of 1.15 billion rows take?\n\n## Solution 1\n\nTotal number of rides for every month in 2019:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n filter(year == 2019) |>\n count(month) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 12 × 2\n month n\n \n 1 1 7667255\n 2 11 6877463\n 3 12 6895933\n 4 10 7213588\n 5 2 7018750\n 6 3 7832035\n 7 4 7432826\n 8 5 7564884\n 9 6 6940489\n10 7 6310134\n11 8 6072851\n12 9 6567396\n```\n:::\n:::\n\n\n## Solution 2\n\nCompute time for querying the 1.15 billion rows:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n filter(year == 2019) |>\n group_by(month) |>\n summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>\n arrange(month) |> \n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.844 0.175 0.331 \n```\n:::\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tictoc)\n\ntic()\nnyc_taxi |> \n filter(year == 2019) |>\n group_by(month) |>\n summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>\n arrange(month) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 12 × 2\n month longest_trip\n \n 1 1 832.\n 2 2 702.\n 3 3 237.\n 4 4 831.\n 5 5 401.\n 6 6 45977.\n 7 7 312.\n 8 8 602.\n 9 9 604.\n10 10 308.\n11 11 701.\n12 12 19130.\n```\n:::\n\n```{.r .cell-code}\ntoc()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n0.379 sec elapsed\n```\n:::\n:::\n\n:::\n:::\n", + "markdown": "---\ntitle: \"Hello Arrow Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r 
.cell-code}\nlibrary(arrow)\nlibrary(dplyr)\n```\n:::\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\n```\n:::\n\n\n\n\n\n\n::: {#exercise-hello-nyc-taxi .callout-tip}\n## Exercises: First {dplyr} pipeline with Arrow\n\n::: panel-tabset\n## Problems\n\n1. Calculate the total number of rides for every month in 2019\n2. About how long did this query of 1.15 billion rows take?\n\n## Solution 1\n\nTotal number of rides for every month in 2019:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n filter(year == 2019) |>\n count(month) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 12 × 2\n month n\n \n 1 1 7667255\n 2 11 6877463\n 3 10 7213588\n 4 12 6895933\n 5 3 7832035\n 6 5 7564884\n 7 2 7018750\n 8 4 7432826\n 9 6 6940489\n10 7 6310134\n11 9 6567396\n12 8 6072851\n```\n:::\n:::\n\n\n## Solution 2\n\nCompute time for querying the 1.15 billion rows:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |> \n filter(year == 2019) |>\n group_by(month) |>\n summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>\n arrange(month) |> \n collect() |> \n system.time()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n user system elapsed \n 2.552 0.307 0.705 \n```\n:::\n:::\n\n\nor\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tictoc)\n\ntic()\nnyc_taxi |> \n filter(year == 2019) |>\n group_by(month) |>\n summarize(longest_trip = max(trip_distance, na.rm = TRUE)) |>\n arrange(month) |> \n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 12 × 2\n month longest_trip\n \n 1 1 832.\n 2 2 702.\n 3 3 237.\n 4 4 831.\n 5 5 401.\n 6 6 45977.\n 7 7 312.\n 8 8 602.\n 9 9 604.\n10 10 308.\n11 11 701.\n12 12 19130.\n```\n:::\n\n```{.r .cell-code}\ntoc()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n0.492 sec elapsed\n```\n:::\n:::\n\n:::\n:::\n", "supporting": [ "1_hello_arrow-exercises_files" ], diff --git 
a/_freeze/materials/2_data_manipulation_1-exercises/execute-results/html.json b/_freeze/materials/2_data_manipulation_1-exercises/execute-results/html.json index e6fb1a4..99321e6 100644 --- a/_freeze/materials/2_data_manipulation_1-exercises/execute-results/html.json +++ b/_freeze/materials/2_data_manipulation_1-exercises/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "00263065051d39e9d1c386cd2685b77b", + "hash": "847d8f98a2ba0c8cb036b54821c7fd5c", "result": { - "markdown": "---\ntitle: \"Data Manipulation Part 1 - Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(stringr)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 120 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n::: {#exercise-compute-collect .callout-tip}\n# Using `compute()` and `collect()`\n\n::: panel-tabset\n## Problem\n\n1. How many taxi fares in the dataset had a total amount greater than \\$100?\n\n2. How many distinct pickup locations (distinct combinations of the `pickup_latitude` and `pickup_longitude` columns) are in the dataset since 2016? 
Use `nrow()` to work this out.\n\n## Solution 1\n\n\n::: {.cell hash='2_data_manipulation_1-exercises_cache/html/compute-collect-1_22f5a7e3ca42f31be95226ca75ff8140'}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(total_amount > 100) |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1518869\n```\n:::\n:::\n\n\n## Solution 2\n\n\n::: {.cell hash='2_data_manipulation_1-exercises_cache/html/compute-collect-2_d78c6443e0ec2328bb8e454462d97d56'}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year >= 2016) |>\n distinct(pickup_longitude, pickup_latitude) |>\n compute() |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 29105801\n```\n:::\n:::\n\n:::\n:::\n\n::: {#exercise-dplyr-api .callout-tip}\n# Using the dplyr API in arrow\n\n::: panel-tabset\n## Problem\n\n1. Use the `dplyr::filter()` and `stringr::str_ends()` functions to return a subset of the data which is a) from September 2020, and b) the value in `vendor_name` ends with the letter \"S\".\n\n2. Try to use the `stringr` function `str_replace_na()` to replace any `NA` values in the `vendor_name` column with the string \"No vendor\" instead. What happens, and why?\n\n3. Bonus question: see if you can find a different way of completing the task in question 2.\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(str_ends(vendor_name, \"S\"), year == 2020, month == 9) |>\n collect()\n```\n:::\n\n\n## Solution 2 and 3\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n mutate(vendor_name = stringr::str_replace_na(vendor_name, \"No vendor\")) |>\n head() |>\n collect()\n```\n:::\n\n\nThis won't work as `stringr::str_replace_na()` hasn't been implemented in Arrow. 
You could try using `mutate()` and `ifelse()` here instead.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n mutate(vendor_name = ifelse(is.na(vendor_name), \"No vendor\", vendor_name)) |>\n head() |>\n collect()\n```\n:::\n\n\nOr, if you only needed a subset of the data, you could apply the function after collecting it into R memory.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year == 2019, month == 10) |> # smaller subset of the data\n collect() |>\n mutate(vendor_name = stringr::str_replace_na(vendor_name, \"No vendor\"))\n```\n:::\n\n\n\n:::\n:::\n", + "markdown": "---\ntitle: \"Data Manipulation Part 1 - Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(stringr)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\n```\n:::\n\n\n::: {#exercise-compute-collect .callout-tip}\n# Using `compute()` and `collect()`\n\n::: panel-tabset\n## Problem\n\n1. How many taxi fares in the dataset had a total amount greater than \\$100?\n\n2. How many distinct pickup locations (distinct combinations of the `pickup_latitude` and `pickup_longitude` columns) are in the dataset since 2016? 
Use `nrow()` to work this out.\n\n## Solution 1\n\n\n::: {.cell hash='2_data_manipulation_1-exercises_cache/html/compute-collect-1_22f5a7e3ca42f31be95226ca75ff8140'}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(total_amount > 100) |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 1518869\n```\n:::\n:::\n\n\n## Solution 2\n\n\n::: {.cell hash='2_data_manipulation_1-exercises_cache/html/compute-collect-2_d78c6443e0ec2328bb8e454462d97d56'}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year >= 2016) |>\n distinct(pickup_longitude, pickup_latitude) |>\n compute() |>\n nrow()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 29105801\n```\n:::\n:::\n\n:::\n:::\n\n::: {#exercise-dplyr-api .callout-tip}\n# Using the dplyr API in arrow\n\n::: panel-tabset\n## Problem\n\n1. Use the `dplyr::filter()` and `stringr::str_ends()` functions to return a subset of the data which is a) from September 2020, and b) the value in `vendor_name` ends with the letter \"S\".\n\n2. Try to use the `stringr` function `str_replace_na()` to replace any `NA` values in the `vendor_name` column with the string \"No vendor\" instead. What happens, and why?\n\n3. Bonus question: see if you can find a different way of completing the task in question 2.\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(str_ends(vendor_name, \"S\"), year == 2020, month == 9) |>\n collect()\n```\n:::\n\n\n## Solution 2 and 3\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n mutate(vendor_name = stringr::str_replace_na(vendor_name, \"No vendor\")) |>\n head() |>\n collect()\n```\n:::\n\n\nThis won't work as `stringr::str_replace_na()` hasn't been implemented in Arrow. 
You could try using `mutate()` and `ifelse()` here instead.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n mutate(vendor_name = ifelse(is.na(vendor_name), \"No vendor\", vendor_name)) |>\n head() |>\n collect()\n```\n:::\n\n\nOr, if you only needed a subset of the data, you could apply the function after collecting it into R memory.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year == 2019, month == 10) |> # smaller subset of the data\n collect() |>\n mutate(vendor_name = stringr::str_replace_na(vendor_name, \"No vendor\"))\n```\n:::\n\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/_freeze/materials/4_data_manipulation_2-exercises/execute-results/html.json b/_freeze/materials/4_data_manipulation_2-exercises/execute-results/html.json index 1dd8dd0..793c8dc 100644 --- a/_freeze/materials/4_data_manipulation_2-exercises/execute-results/html.json +++ b/_freeze/materials/4_data_manipulation_2-exercises/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "ad926165c89e1214f7fc11d06bf82064", + "hash": "19de5df6492df9b4b240e8609640bde4", "result": { - "markdown": "---\ntitle: \"Data Manipulation Part 2 - Exercises\"\nexecute:\n echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(duckdb)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\nnyc_taxi\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nFileSystemDataset with 120 Parquet files\nvendor_name: string\npickup_datetime: timestamp[ms]\ndropoff_datetime: timestamp[ms]\npassenger_count: int64\ntrip_distance: double\npickup_longitude: double\npickup_latitude: double\nrate_code: string\nstore_and_fwd: string\ndropoff_longitude: double\ndropoff_latitude: double\npayment_type: string\nfare_amount: double\nextra: double\nmta_tax: double\ntip_amount: double\ntolls_amount: double\ntotal_amount: double\nimprovement_surcharge: 
double\ncongestion_surcharge: double\npickup_location_id: int64\ndropoff_location_id: int64\nyear: int32\nmonth: int32\n```\n:::\n:::\n\n\n::: {#exercise-udfs .callout-tip}\n# User-defined functions\n\n::: panel-tabset\n## Problem\n\n1. Write a user-defined function which wraps the `stringr` function `str_replace_na()`, and use it to replace any `NA` values in the `vendor_name` column with the string \"No vendor\" instead. (Test it on the data from 2019 so you're not pulling everything into memory)\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Preview the distinct vendor names before we start\nnyc_taxi |>\n filter(year == 2019) |> # smaller subset of the data\n distinct(vendor_name) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 1\n vendor_name\n \n1 CMT \n2 VTS \n3 \n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nregister_scalar_function(\n name = \"replace_vendor_na\",\n function(context, string) {\n stringr::str_replace_na(string, \"No vendor\")\n },\n in_type = schema(string = string()),\n out_type = string(),\n auto_convert = TRUE\n)\n\nvendor_names_fixed <- nyc_taxi |>\n mutate(vendor_name = replace_vendor_na(vendor_name)) \n\n# Preview the distinct vendor names to check it's worked\nvendor_names_fixed |>\n filter(year == 2019) |> # smaller subset of the data\n distinct(vendor_name) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 1\n vendor_name\n \n1 CMT \n2 VTS \n3 No vendor \n```\n:::\n:::\n\n:::\n:::\n\n::: {#exercise-joins .callout-tip}\n# Joins\n\n::: panel-tabset\n## Problem\n\n1. How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)? 
(Hint: you can use `stringr::str_detect()` to help you find pickup zones with the word \"Airport\" in them)\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\npickup_location <- read_csv_arrow(here::here(\"data/taxi_zone_lookup.csv\"))\n\npickup_location <- pickup_location |>\n select(\n pickup_location_id = LocationID,\n borough = Borough,\n pickup_zone = Zone\n ) \n\n\npickup_location_arrow <- arrow_table(\n pickup_location, \n schema = schema(\n pickup_location_id = int64(),\n borough = utf8(),\n pickup_zone = utf8()\n ))\n\nnyc_taxi |>\n filter(year == 2019) |>\n left_join(pickup_location_arrow) |>\n filter(str_detect(pickup_zone, \"Airport\")) |>\n count(pickup_zone) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 2\n pickup_zone n\n \n1 JFK Airport 2729336\n2 LaGuardia Airport 2159224\n3 Newark Airport 8643\n```\n:::\n:::\n\n:::\n:::\n\n::: {#exercise-window .callout-tip}\n# Window functions\n\n::: panel-tabset\n## Problem\n\n1. How many trips in September 2019 had a longer than average distance for that month?\n\n## Solution 1\n\n### Option 1 - via DuckDB\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year == 2019, month == 9) |>\n to_duckdb() |>\n mutate(mean_distance = mean(trip_distance)) |>\n to_arrow() |>\n filter(trip_distance < mean_distance) |>\n count() |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 1\n n\n \n1 4881580\n```\n:::\n:::\n\n\n### Option 2 - via a join\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year == 2019, month == 9) |>\n left_join(\n nyc_taxi |>\n filter(year == 2019, month == 9) |>\n group_by(year) |>\n summarise(mean_distance = mean(trip_distance))\n ) |>\n filter(trip_distance < mean_distance) |>\n count() |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 1\n n\n \n1 4881580\n```\n:::\n:::\n\n:::\n:::\n", + "markdown": "---\ntitle: \"Data Manipulation Part 2 - Exercises\"\nexecute:\n 
echo: true\n messages: false\n warning: false\n---\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(arrow)\nlibrary(dplyr)\nlibrary(duckdb)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi <- open_dataset(here::here(\"data/nyc-taxi\"))\n```\n:::\n\n\n::: {#exercise-udfs .callout-tip}\n# User-defined functions\n\n::: panel-tabset\n## Problem\n\n1. Write a user-defined function which wraps the `stringr` function `str_replace_na()`, and use it to replace any `NA` values in the `vendor_name` column with the string \"No vendor\" instead. (Test it on the data from 2019 so you're not pulling everything into memory)\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Preview the distinct vendor names before we start\nnyc_taxi |>\n filter(year == 2019) |> # smaller subset of the data\n distinct(vendor_name) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 1\n vendor_name\n \n1 CMT \n2 VTS \n3 \n```\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nregister_scalar_function(\n name = \"replace_vendor_na\",\n function(context, string) {\n stringr::str_replace_na(string, \"No vendor\")\n },\n in_type = schema(string = string()),\n out_type = string(),\n auto_convert = TRUE\n)\n\nvendor_names_fixed <- nyc_taxi |>\n mutate(vendor_name = replace_vendor_na(vendor_name)) \n\n# Preview the distinct vendor names to check it's worked\nvendor_names_fixed |>\n filter(year == 2019) |> # smaller subset of the data\n distinct(vendor_name) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 1\n vendor_name\n \n1 CMT \n2 VTS \n3 No vendor \n```\n:::\n:::\n\n:::\n:::\n\n::: {#exercise-joins .callout-tip}\n# Joins\n\n::: panel-tabset\n## Problem\n\n1. How many taxi pickups were recorded in 2019 from the three major airports covered by the NYC Taxis data set (JFK, LaGuardia, Newark)? 
(Hint: you can use `stringr::str_detect()` to help you find pickup zones with the word \"Airport\" in them)\n\n## Solution 1\n\n\n::: {.cell}\n\n```{.r .cell-code}\npickup_location <- read_csv_arrow(here::here(\"data/taxi_zone_lookup.csv\"))\n\npickup_location <- pickup_location |>\n select(\n pickup_location_id = LocationID,\n borough = Borough,\n pickup_zone = Zone\n ) \n\n\npickup_location_arrow <- arrow_table(\n pickup_location, \n schema = schema(\n pickup_location_id = int64(),\n borough = utf8(),\n pickup_zone = utf8()\n ))\n\nnyc_taxi |>\n filter(year == 2019) |>\n left_join(pickup_location_arrow) |>\n filter(str_detect(pickup_zone, \"Airport\")) |>\n count(pickup_zone) |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 3 × 2\n pickup_zone n\n \n1 LaGuardia Airport 2159224\n2 JFK Airport 2729336\n3 Newark Airport 8643\n```\n:::\n:::\n\n:::\n:::\n\n::: {#exercise-window .callout-tip}\n# Window functions\n\n::: panel-tabset\n## Problem\n\n1. How many trips in September 2019 had a longer than average distance for that month?\n\n## Solution 1\n\n### Option 1 - via DuckDB\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year == 2019, month == 9) |>\n to_duckdb() |>\n mutate(mean_distance = mean(trip_distance)) |>\n to_arrow() |>\n filter(trip_distance < mean_distance) |>\n count() |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 1\n n\n \n1 4881580\n```\n:::\n:::\n\n\n### Option 2 - via a join\n\n\n::: {.cell}\n\n```{.r .cell-code}\nnyc_taxi |>\n filter(year == 2019, month == 9) |>\n left_join(\n nyc_taxi |>\n filter(year == 2019, month == 9) |>\n group_by(year) |>\n summarise(mean_distance = mean(trip_distance))\n ) |>\n filter(trip_distance < mean_distance) |>\n count() |>\n collect()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 1\n n\n \n1 4881580\n```\n:::\n:::\n\n:::\n:::\n", "supporting": [ "4_data_manipulation_2-exercises_files" ], diff --git a/_quarto.yaml 
b/_quarto.yaml
index 95ffd7b..945989a 100644
--- a/_quarto.yaml
+++ b/_quarto.yaml
@@ -17,22 +17,31 @@ website:
       menu:
         - text: "Welcome 👋"
          href: materials/0_housekeeping.qmd
+          target: "_blank"
         - text: "Hello Arrow"
           href: materials/1_hello_arrow.qmd
+          target: "_blank"
         - text: "Manipulating Data with Arrow (Part I)"
           href: materials/2_data_manipulation_1.qmd
+          target: "_blank"
         - text: "Data Engineering with Arrow"
           href: materials/3_data_engineering.qmd
+          target: "_blank"
         - text: "Manipulating Data with Arrow (Part II)"
           href: materials/4_data_manipulation_2.qmd
+          target: "_blank"
         - text: "Arrow In-Memory Workflows"
           href: materials/5_arrow_single_file.qmd
+          target: "_blank"
         - text: "Wrapping Up: Arrow & R Together"
           href: materials/6_wrapping_up.qmd
+          target: "_blank"
         - text: "Getting Help & More Resources"
           href: materials/7_continue_learning.qmd
+          target: "_blank"
         - text: "Thank you 👋"
           href: materials/8_closing.qmd
+          target: "_blank"
         - text: Exercises
           menu:
             - text: "Hello Arrow Exercises"
diff --git a/materials/1_hello_arrow-exercises.qmd b/materials/1_hello_arrow-exercises.qmd
index 88e6e9c..49a32c7 100644
--- a/materials/1_hello_arrow-exercises.qmd
+++ b/materials/1_hello_arrow-exercises.qmd
@@ -28,17 +28,20 @@ taxi_size |>
   summarise(total_GB = sum(size_GB))
 ```
 ```{r}
 #| label: open-dataset
 nyc_taxi <- open_dataset(here::here("data/nyc-taxi"))
-nyc_taxi
 ```
 ```{r}
 #| label: nrow
+#| include: false
+#| eval: false
 nyc_taxi |>
   nrow()
 ```
 ```{r}
 #| label: first-dplyr-pipeline
+#| include: false
+#| eval: false
 nyc_taxi |>
   filter(year %in% 2014:2017) |>
   group_by(year) |>
diff --git a/materials/2_data_manipulation_1-exercises.qmd b/materials/2_data_manipulation_1-exercises.qmd
index 6379adb..a992114 100644
--- a/materials/2_data_manipulation_1-exercises.qmd
+++ b/materials/2_data_manipulation_1-exercises.qmd
@@ -16,7 +16,6 @@ library(stringr)
 ```{r}
 #| label: open-dataset
 nyc_taxi <- open_dataset(here::here("data/nyc-taxi"))
-nyc_taxi
 ```
 
 ::: {#exercise-compute-collect .callout-tip}
@@ -108,7 +107,5 @@ nyc_taxi |>
   collect() |>
   mutate(vendor_name = stringr::str_replace_na(vendor_name, "No vendor"))
 ```
-
-
 :::
 :::
diff --git a/materials/4_data_manipulation_2-exercises.qmd b/materials/4_data_manipulation_2-exercises.qmd
index 6ed7fd5..dcab53e 100644
--- a/materials/4_data_manipulation_2-exercises.qmd
+++ b/materials/4_data_manipulation_2-exercises.qmd
@@ -16,7 +16,6 @@ library(duckdb)
 ```{r}
 #| label: open-dataset
 nyc_taxi <- open_dataset(here::here("data/nyc-taxi"))
-nyc_taxi
 ```
 
 ::: {#exercise-udfs .callout-tip}
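
The `.qmd` hunks above all apply one pattern: the bare `nyc_taxi` print is dropped so the dataset schema is no longer echoed, and the long-running pipelines are marked as non-executing via Quarto's hashpipe chunk options. A minimal sketch of the resulting chunk shape (the label matches the diff; `include: false` suppresses both code and output in the rendered page, and `eval: false` skips execution during render entirely):

````markdown
```{r}
#| label: first-dplyr-pipeline
#| include: false
#| eval: false
nyc_taxi |>
  filter(year %in% 2014:2017) |>
  group_by(year) |>
  summarize(
    all_trips = n(),
    shared_trips = sum(passenger_count > 1, na.rm = TRUE)
  ) |>
  mutate(pct_shared = shared_trips / all_trips * 100) |>
  collect()
```
````

With both options set, re-rendering the site neither runs the 1.15-billion-row query nor leaves any trace of it in the HTML, which is why the corresponding `_freeze` JSON payloads in this diff shrink to just the dataset-opening chunk.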