-
Notifications
You must be signed in to change notification settings - Fork 0
/
511-exercises.Rmd
541 lines (365 loc) · 13.7 KB
/
511-exercises.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
---
title: "Forecasting: Principles and Practice Chapter 5 Exercises"
author: "Neil Martin"
date: "2024-09-11"
output: html_document
---
```{r setup, include=FALSE}
# Restore objects cached from a previous session.
# NOTE(review): assumes a .RData file exists in the knit directory — confirm.
load(".RData")
# fpp3 attaches the tidy forecasting stack (tsibble, fable, feasts, dplyr,
# ggplot2, ...); several of the libraries below are therefore redundant but
# harmless, so they are kept as-is.
library(fpp3)
library(knitr)
library(ggplot2)
library(readxl)
library(dplyr)
library(readr)
library(seasonal)
library(latex2exp)
library(tsibble)
library(feasts)
library(tidyr)
```
#### Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
#### Australian Population (global_economy)
#### Bricks (aus_production)
#### NSW Lambs (aus_livestock)
#### Household wealth (hh_budget).
#### Australian takeaway food turnover (aus_retail).
```{r model_benchmark}
# Australian population: smooth trend, no seasonality -> random walk with drift.
global_economy |>
  filter(Country == "Australia") |>
  model(RW(Population ~ drift())) |>
  forecast(h = 5) |>
  autoplot(global_economy) +
  labs(title = "Australia Population",
       subtitle = "5 Year Population Forecast")
# Bricks: restrict to complete quarters (the series has trailing NAs after 2005 Q2).
bricks <- aus_production |>
  filter_index("1970 Q1" ~ "2004 Q4") |>
  select(Bricks)
bricks
# Quarterly with strong seasonality -> seasonal naive.
bricks |>
  model(SNAIVE(Bricks)) |>
  forecast(h = 5) |>
  autoplot(bricks) +
  labs(title = "Bricks",
       subtitle = "Future 5 Quarter Forecast")
# NSW lambs: monthly, seasonal -> seasonal naive.
aus_livestock |>
  filter(State == "New South Wales", Animal == "Lambs") |>
  model(SNAIVE(Count)) |>
  forecast(h = 24) |>
  autoplot(aus_livestock) +
  labs(title = "Lambs in New South Wales",
       subtitle = "Dec 2018 - Dec 2020 Forecast")
# Household wealth: annual, trending -> drift.
hh_budget |>
  model(RW(Wealth ~ drift())) |>
  forecast(h = 5) |>
  autoplot(hh_budget) +
  labs(title = "Household Wealth",
       subtitle = "5 Year Household Wealth Forecast")
# Takeaway food turnover per state; aus_retail ends Dec 2018, so 24 monthly
# steps forecast to Dec 2020 (the original subtitle incorrectly said Dec 2021).
aus_retail |>
  filter(Industry == "Cafes, restaurants and takeaway food services") |>
  model(RW(Turnover ~ drift())) |>
  forecast(h = 24) |>
  autoplot(aus_retail) +
  labs(title = "Australian Takeaway Food Turnover",
       subtitle = "Apr 1982 - Dec 2018, Forecasted until Dec 2020") +
  facet_wrap(~State, scales = "free")
```
#### Use the Facebook stock price (data set gafa_stock) to do the following:
#### Produce a time plot of the series.
#### Produce forecasts using the drift method and plot them.
#### Show that the forecasts are identical to extending the line drawn between the first and last observations.
#### Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
```{r fb_stock_again}
# Re-index to trading days so the irregular trading calendar (no weekends or
# holidays) becomes a regular integer index the models can handle.
fb_stock <- gafa_stock |>
  filter(Symbol == "FB") |>
  mutate(trading_day = row_number()) |>
  update_tsibble(index = trading_day, regular = TRUE)
# Name the plotted variable explicitly; a bare autoplot() silently falls back
# to the first measured variable and prints a message.
fb_stock |> autoplot(Open) +
  labs(title = "FB Stock Open Price")
# Fit the drift model once and reuse its forecasts for both plots
# (the original refit the identical model twice).
fb_drift_fc <- fb_stock |>
  model(RW(Open ~ drift())) |>
  forecast(h = 63)
fb_drift_fc |>
  autoplot(fb_stock) +
  labs(title = "FB Stock Open Price",
       y = "USD($)")
# The drift forecast extends the straight line joining the first and last
# observations: (day 1, 54.83) and (day 1258, 134.45) — the dashed red line
# overlaps the forecast, confirming the equivalence.
fb_drift_fc |>
  autoplot(fb_stock) +
  labs(title = "FB Stock Open Price") +
  geom_segment(aes(x = 1, y = 54.83, xend = 1258, yend = 134.45),
               colour = "red", linetype = "dashed")
# Other benchmarks for comparison (drift, mean, naive).
fb_stock |>
  model(Drift = NAIVE(Open ~ drift()),
        Mean = MEAN(Open),
        Naive = NAIVE(Open)) |>
  forecast(h = 63) |>
  autoplot(fb_stock, level = NULL) +
  labs(title = "FB Stock Open Price",
       y = "USD($)")
```
<p> None of the additional benchmarks are particularly useful. However, if I had to pick one it would be the drift.</p>
#### Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
```{r whitenoise_beer}
# Code supplied by the exercise (fpp3, section 5.11); kept verbatim.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a seasonal naive model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals (time plot, ACF, histogram)
fit |> gg_tsresiduals()
# Look at some forecasts (default horizon: 2 years for quarterly data)
fit |> forecast() |> autoplot(recent_production)
```
<p> The residuals are not white noise: the ACF plot shows a significant spike at lag 4, indicating leftover quarterly (seasonal) correlation that the model has not captured. </p>
#### Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case
```{r exports_bricks}
# Annual exports have no seasonal period, so NAIVE is the natural benchmark.
aus_exports <- global_economy |>
  filter(Country == "Australia")
aus_exports
fit <- aus_exports |> model(NAIVE(Exports))
# Residual diagnostics, then the default-horizon forecast.
fit |>
  gg_tsresiduals() +
  ggtitle("Residual Plot for Australian Exports")
fit |>
  forecast() |>
  autoplot(aus_exports) +
  ggtitle("Australian Exports")
# Quarterly bricks are strongly seasonal, so SNAIVE is used instead.
fit <- bricks |> model(SNAIVE(Bricks))
fit |>
  gg_tsresiduals() +
  ggtitle("Residual Plot for Brick Production")
fit |>
  forecast() |>
  autoplot(bricks) +
  ggtitle("Australian Brick Production")
```
<p> The exports series is annual, so it has no seasonality and NAIVE is the better choice. The Bricks series is quarterly with clear seasonality, so SNAIVE is more appropriate: production rises towards Q4 and then falls until the corresponding quarter of the following year.</p>
#### Produce forecasts for the 7 Victorian series in aus_livestock using SNAIVE(). Plot the resulting forecasts including the historical data. Is this a reasonable benchmark for these series?
```{r 7_victorian_series}
# SNAIVE's default lag is one seasonal period (1 year for monthly data);
# the original lag("2 years") repeated values from 24 months back, which is
# not the seasonal naive method the exercise asks for. Also uses |>
# consistently instead of mixing in %>%.
aus_livestock |>
  filter(State == "Victoria") |>
  model(SNAIVE(Count)) |>
  forecast(h = "2 years") |>
  autoplot(aus_livestock) +
  labs(title = "Animals in Victoria") +
  facet_wrap(~Animal, scales = "free")
```
<p> Lambs and Calves may benefit from a model with trend, as over time the number of calves has increased steadily and the number of lambs has also risen. For the rest, there are clear seasonal patterns in the data, so a SNAIVE model is more suitable.</p>
#### Are the following statements true or false? Explain your answer.
#### Good forecast methods should have normally distributed residuals.
<p> False. Normally distributed residuals are convenient for computing prediction intervals, but they are not essential for a good forecast method; what matters is that the residuals are uncorrelated with zero mean (i.e. resemble white noise).</p>
#### A model with small residuals will give good forecasts.
<p> No, even though there is little difference between the forecast and the actual values, the pattern of the data might change.</p>
#### The best measure of forecast accuracy is MAPE.
<p> False. MAPE is widely used, but it is undefined when the data contain zeros, only makes sense for data with a meaningful zero, and penalises errors asymmetrically; measures such as MASE or RMSE are often preferable.</p>
#### If your model doesn’t forecast well, you should make it more complicated.
<p> Nope, sometimes choosing a more simplistic model can provide better results.</p>
#### Always choose the model with the best forecast accuracy as measured on the test set.
<p> No, this can result in overfitting. Consider the metric the model has against the validation/testing set.</p>
#### For your retail time series (from Exercise 7 in Section 2.10):
#### Create a training dataset consisting of observations before 2011 using
```{r retail_split}
# Fix the RNG so the randomly chosen retail series is reproducible.
set.seed(12345678)
# Draw one Series ID, then keep only that series.
chosen_id <- sample(aus_retail$`Series ID`, 1)
myseries <- aus_retail |>
  filter(`Series ID` == chosen_id)
# Training set: every observation before 2011.
myseries_train <- myseries |>
  filter(year(Month) < 2011)
```
#### Check that your data have been split appropriately by producing the following plot.
```{r retail_split2}
# Full series in the default colour with the training portion overlaid in red;
# the red layer should stop at Dec 2010 if the split is correct.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
```
#### Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
```{r retail_split3}
# Seasonal naive benchmark: each forecast equals the observed value from the
# same month one year earlier.
fit <- myseries_train |>
model(SNAIVE(Turnover))
```
#### Check the residuals.
```{r retail_split4}
# Residual diagnostics: time plot, ACF and histogram of innovation residuals.
fit |> gg_tsresiduals()
```
#### Produce forecasts for the test data
```{r retail_split5}
# Forecast exactly the withheld periods (2011 onward). Filtering by year is
# equivalent to the original anti_join(myseries, myseries_train) but states
# the intent directly and avoids the "Joining with `by = ...`" message that
# an unspecified anti_join emits.
fc <- fit |>
  forecast(new_data = myseries |> filter(year(Month) >= 2011))
fc |> autoplot(myseries)
```
#### Compare the accuracy of your forecasts against the actual values.
```{r retail_split6}
# Training (fitted-values) accuracy vs accuracy of the forecasts on the
# withheld test observations.
fit |> accuracy()
fc |> accuracy(myseries)
```
#### Consider the number of pigs slaughtered in New South Wales (data set aus_livestock).
#### Produce some plots of the data in order to become familiar with it.
```{r q8}
pigs <- aus_livestock |>
  filter(Animal == "Pigs" & State == "New South Wales")
pigs
# Name the plotted variable explicitly; bare calls fall back to the
# "first measured variable" default and print a message.
pigs |> autoplot(Count)
pigs |> gg_season(Count)
pigs |> gg_subseries(Count)
```
#### Create a training set of 486 observations, withholding a test set of 72 observations (6 years).
```{r q81}
# The NSW pigs series runs Jul 1972 - Dec 2018 (558 months). Withholding the
# last 72 observations (Jan 2013 - Dec 2018) leaves the required 486 for
# training, so the cutoff is 2013 — the original < 2014 withheld only 60.
pigs_train <- pigs |>
  filter(year(Month) < 2013)
autoplot(pigs, Count) +
  autolayer(pigs_train, Count, colour = "red")
```
#### Try using various benchmark methods to forecast the training set and compare the results on the test set. Which method did best?
```{r q82}
# The exercise asks for VARIOUS benchmark methods; fit four and score each on
# the withheld test observations (the original fit only SNAIVE and forecast
# just 12 months of the multi-year test period).
pigs_bench <- pigs_train |>
  model(Mean = MEAN(Count),
        Naive = NAIVE(Count),
        Drift = RW(Count ~ drift()),
        Snaive = SNAIVE(Count))
# Forecast exactly the withheld months, then compare accuracy per method.
pigs_test <- pigs |> filter(!Month %in% pigs_train$Month)
pigs_bench |>
  forecast(new_data = pigs_test) |>
  accuracy(pigs)
# Keep the single preferred model under the original name so later
# residual diagnostics operate on one model.
pigs_fit <- pigs_train |>
  model(SNAIVE(Count))
accuracy(pigs_fit)
pigs_fc <- pigs_fit |>
  forecast(new_data = pigs_test) |>
  autoplot(pigs_train, level = NULL) +
  labs(title = "Piggy Forecast")
pigs_fc
```
<p> Due to the structure of the time series (strong monthly seasonality), a seasonal method such as SNAIVE will likely always perform better here. </p>
#### Check the residuals of your preferred method. Do they resemble white noise?
```{r q83}
# Residual diagnostics for the preferred (seasonal naive) model.
pigs_fit |> gg_tsresiduals() +
ggtitle("Residuals")
```
<p> No, the residuals do not resemble white noise: the ACF shows significant autocorrelation at several lags.</p>
#### Create a training set for household wealth (hh_budget) by withholding the last four years as a test set.
```{r 91}
# hh_budget spans 1995-2016, so withholding the LAST FOUR years means
# training on Year <= 2012 (the original <= 2010 withheld six years) and
# forecasting h = 4 to cover exactly the 2013-2016 test period.
hh_budget_train <- hh_budget |>
  filter(Year <= 2012)
fit <- hh_budget_train |>
  model(RW(Wealth ~ drift()))
accuracy(fit)
# Plot against the full series so the test years are visible behind the
# forecast.
fc <- fit |>
  forecast(h = 4) |>
  autoplot(hh_budget) +
  labs(title = "4 Year Household Wealth Forecast")
fc
```
#### Create a training set for Australian takeaway food turnover (aus_retail) by withholding the last four years as a test set.
```{r 101}
# Monthly takeaway food turnover for every state.
takeaway <- aus_retail |>
filter(Industry == "Cafes, restaurants and takeaway food services")
# aus_retail ends Dec 2018, so keeping Month <= Dec 2014 withholds the last
# four years (2015-2018) as the test set.
tw_train <- takeaway |>
filter(year(Month) <= 2014)
```
#### Fit all the appropriate benchmark methods to the training set and forecast the periods covered by the test set.
```{r q102}
# Fit benchmark methods to the TRAINING set — the original fit on the full
# `takeaway` data, so nothing was actually withheld for testing. Several
# benchmarks are fitted, as the exercise asks for "all the appropriate
# benchmark methods".
fit <- tw_train |>
  model(Snaive = SNAIVE(Turnover),
        Naive = NAIVE(Turnover),
        Drift = RW(Turnover ~ drift()))
fit
# 48 monthly steps cover exactly the withheld Jan 2015 - Dec 2018 period.
fit |>
  forecast(h = 48) |>
  autoplot(takeaway, level = NULL) +
  labs(title = "Australian Takeaway Food Turnover",
       subtitle = "Trained to Dec 2014, forecast Jan 2015 - Dec 2018") +
  facet_wrap(~State, scales = "free")
```
#### We will use the Bricks data from aus_production (Australian quarterly clay brick production 1956–2005) for this exercise.
#### Use an STL decomposition to calculate the trend-cycle and seasonal indices. (Experiment with having fixed or changing seasonality.)
```{r q111}
# Classical additive decomposition, for comparison with the STL fits below.
# Note: the assigned object is the ggplot, not the component table.
bricks_additive <- bricks |>
model(classical_decomposition(Bricks, type = "additive")) |>
components() |>
autoplot() +
labs(title = "Classical additive decomposition of Australian brick production")
bricks_additive
```
```{r q112}
# Classical multiplicative decomposition — appropriate when the seasonal
# swing scales with the level of the series.
bricks_multi <- bricks |>
model(classical_decomposition(Bricks, type = "multiplicative")) |>
components() |>
autoplot() +
labs(title = "Classical multiplicative decomposition of Australian brick production")
bricks_multi
```
```{r q113}
# STL with a CHANGING seasonal pattern. The seasonal window must be odd;
# the original window = 4 would be adjusted/warned about by STL, so the
# nearest odd value (5) is used explicitly.
bricks_stl <- bricks |>
  model(STL(Bricks ~ season(window = 5), robust = TRUE)) |>
  components() |>
  autoplot() +
  labs(title = "STL decomposition of Australian brick production")
bricks_stl
# STL with a FIXED seasonal pattern (window = "periodic").
bricks_stl_per <- bricks |>
  model(STL(Bricks ~ season(window = "periodic"), robust = TRUE)) |>
  components() |>
  autoplot() +
  labs(title = "Periodic STL decomposition of Australian brick production")
bricks_stl_per
```
#### Compute and plot the seasonally adjusted data.
```{r q114}
bricks
# Decompose with STL and keep the component table for reuse below.
bricks_season <- bricks |>
  model(stl = STL(Bricks))
brick_comp <- components(bricks_season)
# Overlay the seasonally adjusted series (red) on the raw data (green),
# reusing brick_comp rather than recomputing the components.
brick_season_adjust <- bricks |>
  autoplot(Bricks, color = "green") +
  autolayer(brick_comp, season_adjust, color = "red") +
  labs(title = "Seasonally adjusted Australian bricks data")
brick_season_adjust
```
#### Use a naïve method to produce forecasts of the seasonally adjusted data.
```{r q115}
# Keep only the seasonally adjusted column (the remaining variable after
# dropping the others is season_adjust, so the local name reflects that).
bricks_sa <- brick_comp |>
  select(-c(.model, Bricks, trend, season_year, remainder))
# Naive forecast of the seasonally adjusted series, five years ahead.
bricks_sa |>
  model(NAIVE(season_adjust)) |>
  forecast(h = "5 years") |>
  autoplot(bricks_sa) +
  labs(title = "Seasonally adjusted Naive forecast", y = "Bricks")
```
#### tourism contains quarterly visitor nights (in thousands) from 1998 to 2017 for 76 regions of Australia.
#### Extract data from the Gold Coast region using filter() and aggregate total overnight trips (sum over Purpose) using summarise(). Call this new dataset gc_tourism.
```{r q120}
# "Sum over Purpose" means Purpose must NOT be retained as a grouping key:
# summarise() on a tsibble aggregates over all remaining keys per Quarter.
# (The original group_by(Purpose) produced four separate series instead of
# one total.)
gc_tourism <- tourism |>
  filter(Region == "Gold Coast") |>
  summarise(Total_Overnight_Trips = sum(Trips, na.rm = TRUE))
range(gc_tourism$Quarter)
# Training sets withholding the last 1, 2 and 3 years (4, 8, 12 quarters).
gc_train_1 <- gc_tourism |>
  slice(1:(n() - 4))
gc_train_2 <- gc_tourism |>
  slice(1:(n() - 8))
gc_train_3 <- gc_tourism |>
  slice(1:(n() - 12))
gc_train_1
gc_train_2
gc_train_3
```
#### Compute one year of forecasts for each training set using the seasonal naïve (SNAIVE()) method. Call these gc_fc_1, gc_fc_2 and gc_fc_3, respectively.
```{r q121}
# Rebuild the aggregated series and training sets so this chunk stands on
# its own. As above, Purpose is summed over, not kept as a grouping key.
gc_tourism <- tourism |>
  filter(Region == "Gold Coast") |>
  summarise(Total_Overnight_Trips = sum(Trips, na.rm = TRUE))
range(gc_tourism$Quarter)
gc_train_1 <- gc_tourism |> slice(1:(n() - 4))
gc_train_2 <- gc_tourism |> slice(1:(n() - 8))
gc_train_3 <- gc_tourism |> slice(1:(n() - 12))
gc_train_1
gc_train_2
gc_train_3
# One year (4 quarters) of seasonal naive forecasts from each training set.
gc_fc_1 <- gc_train_1 |>
  model(SNAIVE(Total_Overnight_Trips)) |>
  forecast(h = 4)
gc_fc_1 |>
  autoplot(gc_train_1) +
  labs(title = "GC Train 1 SNaive Forecast")
gc_fc_2 <- gc_train_2 |>
  model(SNAIVE(Total_Overnight_Trips)) |>
  forecast(h = 4)
gc_fc_2 |>
  autoplot(gc_train_2) +
  labs(title = "GC Train 2 SNaive Forecast")
gc_fc_3 <- gc_train_3 |>
  model(SNAIVE(Total_Overnight_Trips)) |>
  forecast(h = 4)
gc_fc_3 |>
  autoplot(gc_train_3) +
  labs(title = "GC Train 3 SNaive Forecast")
# Score each set of forecasts against the actual observations.
gc_fc_1 |> accuracy(gc_tourism)
gc_fc_2 |> accuracy(gc_tourism)
gc_fc_3 |> accuracy(gc_tourism)
```
<p> It's clear that gc_fc_3 has the lowest MAPE. This is due to its training set having the last three years of observations removed, so its one-year forecast is evaluated on an earlier portion of the series. </p>