diff --git a/docs/figures/silverkite_multistage.png b/docs/figures/silverkite_multistage.png deleted file mode 100644 index de36292..0000000 Binary files a/docs/figures/silverkite_multistage.png and /dev/null differ diff --git a/docs/nbpages/quickstart/0200_changepoint_detection.py b/docs/nbpages/quickstart/0200_changepoint_detection.py deleted file mode 100644 index 2ce0c53..0000000 --- a/docs/nbpages/quickstart/0200_changepoint_detection.py +++ /dev/null @@ -1,356 +0,0 @@ -""" -Changepoint Detection -===================== - -You can detect trend and seasonality changepoints with just a few lines of code. - -Provide your timeseries as a pandas dataframe with timestamp and value. - -For example, to work with daily sessions data, your dataframe could look like this: - -.. code-block:: python - - import pandas as pd - df = pd.DataFrame({ - "datepartition": ["2020-01-08-00", "2020-01-09-00", "2020-01-10-00"], - "macrosessions": [10231.0, 12309.0, 12104.0] - }) - -The time column can be any format recognized by ``pd.to_datetime``. - -In this example, we'll load a dataset representing ``log(daily page views)`` -on the Wikipedia page for Peyton Manning. -It contains values from 2007-12-10 to 2016-01-20. More dataset info -`here `_. -""" - -import warnings - -warnings.filterwarnings("ignore") - -import pandas as pd -import plotly - -from greykite.algo.changepoint.adalasso.changepoint_detector import ChangepointDetector -from greykite.framework.benchmark.data_loader_ts import DataLoaderTS -from greykite.framework.templates.autogen.forecast_config import ForecastConfig -from greykite.framework.templates.forecaster import Forecaster -from greykite.framework.templates.model_templates import ModelTemplateEnum - -# Loads dataset into UnivariateTimeSeries -dl = DataLoaderTS() -ts = dl.load_peyton_manning_ts() -df = ts.df # cleaned pandas.DataFrame - -# %% -# Detect trend change points -# -------------------------- -# Let's plot the original timeseries. -# There are actually trend changes within this data set. -# The `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries` -# class is used to store a timeseries and to provide basic description and plotting functions. -# The ``load_peyton_manning`` function automatically returns a ``UnivariateTimeSeries`` instance, -# however, for any ``df``, you can always initialize a ``UnivariateTimeSeries`` instance and -# do further explorations. -# (The interactive plot is generated by ``plotly``: **click to zoom!**) -fig = ts.plot() -plotly.io.show(fig) - -# %% -# `~greykite.algo.changepoint.adalasso.changepoint_detector.ChangepointDetector` -# utilizes pre-filters, regularization with regression based models, and -# post-filters to find time points where trend changes. -# -# To create a simple trend changepoint detection model, we first initialize the -# `~greykite.algo.changepoint.adalasso.changepoint_detector.ChangepointDetector` class, -# then run its attribute function ``find_trend_changepoints``. -model = ChangepointDetector() -res = model.find_trend_changepoints( - df=df, # data df - time_col="ts", # time column name - value_col="y") # value column name -pd.DataFrame({"trend_changepoints": res["trend_changepoints"]}) # prints a dataframe showing the result - -# %% -# The code above runs trend changepoint detection with the default parameters. -# We may visualize the detection results by plotting it with the attribute -# function ``plot``. - -fig = model.plot(plot=False) # plot = False returns a plotly figure object. 
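-# The figure can also be saved rather than displayed, e.g. via ``fig.write_html("changepoints.html")``
-# (a standard plotly figure method; the file name here is only an example).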
-plotly.io.show(fig) - -# %% -# There might be too many changepoints with the default parameters. -# We could customize the parameters to meet individual requirements. -# -# To understand the parameters, we introduce a little bit of the background -# knowledge. The algorithm first does a mean aggregation to eliminate small -# fluctuations/seasonality effects (``resample_freq``). This avoids the trend -# picking up small fluctuations/seasonality effects. -# -# Then a great number of potential changepoints are placed uniformly over the -# whole time span (specified by time between changepoints ``potential_changepoint_distance`` -# or number of potential changepoints ``potential_changepoint_n`` -# , the former overrides the latter). -# -# The adaptive lasso (more info -# at `adalasso `_) -# is used to shrink insignificant changepoints' coefficients to zero. -# The initial estimator for adaptive lasso could be one of "ols", "ridge" -# and "lasso" (``adaptive_lasso_initial_estimator``). The regularization -# strength of adaptive lasso is also controllable by users -# (``regularization_strength``, between 0.0 and 1.0, greater values imply -# fewer changepoints. ``None`` triggers cross-validation to select the best -# tuning parameter based on prediction performance). -# -# Yearly seasonality effect is too long to be eliminated by aggregation, so -# fitting it with trend is recommended (``yearly_seasonality_order``). -# This allows changepoints to distinguish trend from yearly seasonality. -# -# Putting changepoints too close to the end of data is not recommended, -# because we may not have enough data to fit the final trend, -# especially in forecasting tasks. Therefore, one could specify how far -# from the end changepoints are not allowed (specified by the time from the end -# of data ``no_changepoint_distance_from_end`` or proportion of data from the end -# ``no_changepoint_proportion_from_end``, the former overrides the latter). -# -# Finally, a post-filter is applied to eliminate changepoints that are too close -# (``actual_changepoint_min_distance``). -# -# The following parameter combination uses longer aggregation with less potential -# changepoints placed and higher yearly seasonality order. Changepoints are not -# allowed in the last 20% of the data - -model = ChangepointDetector() # it's also okay to omit this and re-use the old instance -res = model.find_trend_changepoints( - df=df, # data df - time_col="ts", # time column name - value_col="y", # value column name - yearly_seasonality_order=15, # yearly seasonality order, fit along with trend - regularization_strength=0.5, # between 0.0 and 1.0, greater values imply fewer changepoints, and 1.0 implies no changepoints - resample_freq="7D", # data aggregation frequency, eliminate small fluctuation/seasonality - potential_changepoint_n=25, # the number of potential changepoints - no_changepoint_proportion_from_end=0.2) # the proportion of data from end where changepoints are not allowed -pd.DataFrame({"trend_changepoints": res["trend_changepoints"]}) - -# %% -# We may also plot the detection result. - -fig = model.plot(plot=False) -plotly.io.show(fig) - -# %% -# Now the detected trend changepoints look better! Similarly, we could also -# specify ``potential_changepoint_distance`` and ``no_changepoint_distance_from_end`` -# instead of ``potential_changepoint_n`` and ``no_changepoint_proportion_from_end``. -# For example ``potential_changepoint_distance="60D" and -# ``no_changepoint_distance_from_end="730D"``. 
Remember that these will override
-# ``potential_changepoint_n`` and ``no_changepoint_proportion_from_end``.
-#
-# Moreover, one can also control which components are plotted. For example
-
-fig = model.plot(
-    observation=True,  # whether to plot the observations
-    observation_original=True,  # whether to plot the unaggregated values
-    trend_estimate=True,  # whether to plot the trend estimation
-    trend_change=True,  # whether to plot detected trend changepoints
-    yearly_seasonality_estimate=True,  # whether to plot estimated yearly seasonality
-    adaptive_lasso_estimate=True,  # whether to plot the adaptive lasso estimated trend
-    seasonality_change=False,  # detected seasonality change points, discussed in next section
-    seasonality_change_by_component=True,  # plot seasonality by component (daily, weekly, etc.), discussed in next section
-    seasonality_estimate=False,  # plot estimated trend+seasonality, discussed in next section
-    plot=False)  # set to True to display the plot (need to import plotly interactive tool) or False to return the figure object
-plotly.io.show(fig)
-
-# %%
-# Detect seasonality change points
-# --------------------------------
-# By seasonality change points, we mean the time points where the shape
-# of seasonality effects change, i.e., the seasonal shape may become "fatter"
-# or "thinner". Similar to trend changepoint detection, we also have
-# pre-filtering, regularization with regression-based models, and post-filtering
-# in seasonality change point detection.
-#
-# To create a simple seasonality changepoint detection model, we could either use
-# the previous ``ChangepointDetector`` object, which already has the trend changepoint
-# information, or initialize a new ``ChangepointDetector`` object. Then one could run
-# the ``find_seasonality_changepoints`` function.
-#
-# Note that because we first remove the trend effect from the timeseries before detecting
-# seasonality changepoints, using the old ``ChangepointDetector`` object with trend changepoint
-# detection results on the same df will pass the existing trend information and save time.
-# If a new class object is initialized and one runs ``find_seasonality_changepoints`` directly,
-# the model will first run ``find_trend_changepoints`` to get trend changepoint information.
-# In this case, it will run with the default trend changepoint detection parameters.
-# However, it is recommended that the user run ``find_trend_changepoints`` and check the result
-# before running ``find_seasonality_changepoints``.
-#
-# Here we use the old object which already contains trend changepoint information.
-
-res = model.find_seasonality_changepoints(
-    df=df,  # data df
-    time_col="ts",  # time column name
-    value_col="y")  # value column name
-pd.DataFrame(dict([(k, pd.Series(v)) for k, v in res["seasonality_changepoints"].items()]))  # view result
-# one could also print res["seasonality_changepoints"] directly to view the result
-
-# %%
-# We can also plot the detection results; simply set ``seasonality_change`` and
-# ``seasonality_estimate`` to ``True``.
- -fig = model.plot( - seasonality_change=True, # detected seasonality change points, discussed in next section - seasonality_change_by_component=True, # plot seasonality by component (daily, weekly, etc.), discussed in next section - seasonality_estimate=True, # plot estimated trend+seasonality, discussed in next section - plot=False) # set to True to display the plot (need to import plotly interactive tool) or False to return the figure object -plotly.io.show(fig) - -# %% -# In this example, there is not too much seasonality change, thus we only see one -# yearly seasonality change point, however, we could also customize parameters to -# increase the seasonality changepoint detection sensitivity. -# -# The only parameter that differs from trend changepoint detection is ``seasonality_components_df``, -# which configures the seasonality components. Supplying daily, weekly and yearly seasonality -# works well for most cases. Users can also include monthly and quarterly seasonality. -# The full df is: - -seasonality_components_df = pd.DataFrame({ - "name": ["tod", "tow", "conti_year"], # component value column name used to create seasonality component - "period": [24.0, 7.0, 1.0], # period for seasonality component - "order": [3, 3, 5], # Fourier series order - "seas_names": ["daily", "weekly", "yearly"]}) # seasonality component name - -# %% -# However, if the inferred data frequency is at least one day, the daily component will be removed. -# -# Another optional parameter is ``trend_changepoints`` that allows users to provide -# a list of trend changepoints to skip calling ``find_trend_changepoints``. -# -# Now we run ``find_seasonality_changepoints`` with a smaller ``regularization_strength``, -# and restrict changepoints to the first 80% data. As recommended, we use our previous -# detected trend change points (use the same object after running ``find_trend_changepoints``). - -res = model.find_seasonality_changepoints( - df=df, # data df - time_col="ts", # time column name - value_col="y", # value column name - seasonality_components_df=pd.DataFrame({ # seasonality config df - "name": ["tow", "conti_year"], # component value column name used to create seasonality component - "period": [7.0, 1.0], # period for seasonality component - "order": [3, 5], # Fourier series order - "seas_names": ["weekly", "yearly"]}), # seasonality component name - regularization_strength=0.4, # between 0.0 and 1.0, greater values imply fewer changepoints, and 1.0 implies no changepoints - no_changepoint_proportion_from_end=0.2, # no changepoint in the last 20% data - trend_changepoints=None) # optionally specify trend changepoints to avoid calling find_trend_changepoints -pd.DataFrame(dict([(k, pd.Series(v)) for k, v in res["seasonality_changepoints"].items()])) # view result -# one could also print res["seasonality_changepoints"] directly to view the result - -# %% -# We can also plot the detection results. 
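-# The call below reuses the same ``plot`` method shown in the trend section; only the
-# seasonality-related flags are switched on.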
- -fig = model.plot( - seasonality_change=True, # detected seasonality change points, discussed in next section - seasonality_change_by_component=True, # plot seasonality by component (daily, weekly, etc.), discussed in next section - seasonality_estimate=True, # plot estimated trend+seasonality, discussed in next section - plot=False) # set to True to display the plot (need to import plotly interactive tool) or False to return the figure object -plotly.io.show(fig) - -# %% -# Create a forecast with changepoints -# ----------------------------------- -# Both trend changepoint detection and seasonality changepoint detection algorithms -# have been integrated with ``SILVERKITE``, so one is able to invoke the algorithm by -# passing corresponding parameters. -# It will first detect changepoints with the given parameters, -# then feed the detected changepoints to the forecasting model. - -# specify dataset information -metadata = dict( - time_col="ts", # name of the time column ("datepartition" in example above) - value_col="y", # name of the value column ("macrosessions" in example above) - freq="D" # "H" for hourly, "D" for daily, "W" for weekly, etc. - # Any format accepted by ``pd.date_range`` -) -# specify changepoint parameters in model_components -model_components = dict( - changepoints={ - # it's ok to provide one of ``changepoints_dict`` or ``seasonality_changepoints_dict`` by itself - "changepoints_dict": { - "method": "auto", - "yearly_seasonality_order": 15, - "regularization_strength": 0.5, - "resample_freq": "7D", - "potential_changepoint_n": 25, - "no_changepoint_proportion_from_end": 0.2 - }, - "seasonality_changepoints_dict": { - "potential_changepoint_distance": "60D", - "regularization_strength": 0.5, - "no_changepoint_proportion_from_end": 0.2 - } - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge"}}) # use ridge to prevent overfitting when there many changepoints - -# Generates model config -config = ForecastConfig.from_dict( - dict( - model_template=ModelTemplateEnum.SILVERKITE.name, - forecast_horizon=365, # forecast 1 year - coverage=0.95, # 95% prediction intervals - metadata_param=metadata, - model_components_param=model_components)) - -# Then run with changepoint parameters -forecaster = Forecaster() -result = forecaster.run_forecast_config( - df=df, - config=config) - -# %% -# -# .. note:: -# The automatic trend changepoint detection algorithm also supports adding additional custom trend -# changepoints in forecasts. In the ``changepoints_dict`` parameter above, you may add the following -# parameters to include additional trend changepoints besides the detected ones: -# -# - ``dates``: a list of custom trend changepoint dates, parsable by `pandas.to_datetime`. For example, ["2020-01-01", "2020-02-15"]. -# - ``combine_changepoint_min_distance``: the minimum distance allowed between a detected changepoint and a custom changepoint, default is None. -# For example, "5D". If violated, one of them will be dropped according to the next parameter ``keep_detected``. -# - ``keep_detected``: True or False, default False. Decides whether to keep the detected changepoint or the custom changepoint when they are too close. -# If set to True, keeps the detected changepoint, otherwise keeps the custom changepoint. - -# %% -# Check results -# ------------- -# Details of the results are given in the -# `Simple forecast <./0100_simple_forecast.html>`_ -# example. We just show a few specific results here. 
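-
-# %%
-# As a quick sketch (it reuses the same attributes demonstrated in the Model Summary
-# quickstart), the fitted estimator is the last step of the pipeline, so its summary
-# can also be printed here:
-
-summary = result.model[-1].summary(max_colwidth=30)  # -1 gets the estimator from the pipeline
-print(summary)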
- -# %% -# The original trend changepoint detection plot is accessible. -# One could pass the same parameters in a dictionary as they are using -# the ``plot`` function in ``ChangepointDetector``. - -fig = result.model[-1].plot_trend_changepoint_detection(dict(plot=False)) # -1 gets the estimator from the pipeline -plotly.io.show(fig) - -# %% -# Let's plot the historical forecast on the holdout test set. -backtest = result.backtest -fig = backtest.plot() -plotly.io.show(fig) - -# %% -# Let's plot the forecast (trained on all data): -forecast = result.forecast -fig = forecast.plot() -plotly.io.show(fig) - -# %% -# Check out the component plot, trend changepoints are marked in the trend -# component plot. -fig = backtest.plot_components() -plotly.io.show(fig) # fig.show() if you are using "PROPHET" template diff --git a/docs/nbpages/quickstart/0300_seasonality.py b/docs/nbpages/quickstart/0300_seasonality.py deleted file mode 100644 index 978c62b..0000000 --- a/docs/nbpages/quickstart/0300_seasonality.py +++ /dev/null @@ -1,835 +0,0 @@ -""" -Seasonality -=========== -Forecast models learn seasonal (cyclical) patterns and project them into the -future. Understanding the seasonal patterns in your dataset -can help you create a better forecast. Your goal is to identify which -seasonality patterns are most important to capture, and which should -be excluded from the model. - -This tutorial explains how to identify the dominant seasonal patterns and -check for interactions (e.g. daily seasonality that depends on day of week, -or weekly seasonality increases in magnitude over time). Such interactions -are important to model if they affect a large number of data points. - -We use the Peyton Manning dataset as a running example. -""" - -# %% -# Quick reference -# --------------- -# You will learn how to use the function -# `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.plot_quantiles_and_overlays` -# in `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries` to assess seasonal patterns. -# -# Steps to detect seasonality: -# -# #. Start with the longest seasonal cycles to see the big picture, then proceed to shorter cycles. -# (yearly -> quarterly -> monthly -> weekly -> daily). -# #. For a given seasonality period: -# -# a. First, check for seasonal effect over the entire timeseries (main effect). -# Look for large variation that depends on the location in the cycle. -# Pick the time feature for your seasonality cycle. See available time features at -# `~greykite.common.features.timeseries_features.build_time_features_df`. -# -# - for yearly: ``"doy"``, ``"month_dom"``, ``"woy_dow"`` -# - for quarterly: ``"doq"`` -# - for monthly ``"dom"`` -# - for weekly: ``"str_dow"``, ``"dow_grouped"``, ``"is_weekend"``, ``"woy_dow"`` -# - for daily: ``"hour"``, ``"tod"`` -# - ("do" = "day of", "to" = "time of") -# -# .. code-block:: python -# -# fig = ts.plot_quantiles_and_overlays( -# groupby_time_feature="doy", # day of year (yearly seasonality) -# show_mean=True, # shows mean on the plot -# show_quantiles=[0.1, 0.9], # shows quantiles [0.1, 0.9] on the plot -# xlabel="day of year", -# ylabel=ts.original_value_col, -# title="yearly seasonality", -# ) -# -# b. Then, check for interactions by adding overlays and centering the values. 
-# These may be present even when there is no main effect:: -# -# # random sample of individual overlays (check for clusters with similar patterns) -# fig = ts.plot_quantiles_and_overlays( -# groupby_time_feature="str_dow", # day of week (weekly seasonality) -# show_mean=True, -# show_quantiles=False, -# # shows every 5th overlay. (accepts a list of indices/names, a number to sample, or `True` to show all) -# show_overlays=np.arange(0, ts.df.shape[0], 5), -# center_values=True, -# # each overlay contains 28 observations (4 weeks) -# overlay_label_sliding_window_size=28, -# xlabel="day of week", -# ylabel=ts.original_value_col, -# title="weekly seasonality with selected 28d averages", -# ) -# # interactions with periodic time feature -# fig = ts.plot_quantiles_and_overlays( -# groupby_time_feature="str_dow", -# show_mean=True, -# show_quantiles=False, -# show_overlays=True, -# center_values=True, -# # splits overlays by month (try other features too) -# overlay_label_time_feature="month", -# # optional overlay styling, passed to `plotly.graph_objects.Scatter` -# overlay_style={"line": {"width": 1}, "opacity": 0.5}, -# xlabel="day of week", -# ylabel=ts.original_value_col, -# title="weekly seasonality by month", -# ) -# # interactions with an event (holiday, etc.) -# fig = ts.plot_quantiles_and_overlays( -# groupby_time_feature="str_dow", -# show_mean=True, -# show_quantiles=False, -# show_overlays=True, -# center_values=True, -# # splits overlays by custom pd.Series value -# overlay_label_custom_column=is_football_season, -# overlay_style={"line": {"width": 1}, "opacity": 0.5}, -# # optional, how to aggregate values for each overlay (default=mean) -# aggfunc=np.nanmean, -# xlabel="day of week", -# ylabel=ts.original_value_col, -# title="weekly seasonality:is_football_season interaction", -# ) -# # seasonality changepoints (option a): overlay against time (good for yearly/quarterly/monthly) -# fig = ts.plot_quantiles_and_overlays( -# groupby_time_feature="woy_dow", # yearly(+weekly) seasonality -# show_mean=True, -# show_quantiles=True, -# show_overlays=True, -# overlay_label_time_feature="year", # splits by time -# overlay_style={"line": {"width": 1}, "opacity": 0.5}, -# center_values=True, -# xlabel="weekofyear_dayofweek", -# ylabel=ts.original_value_col, -# title="yearly and weekly seasonality for each year", -# ) -# # seasonality changepoints (option b): overlay by seasonality value (good for daily/weekly/monthly) -# # see advanced version below, where the mean is removed. -# fig = ts.plot_quantiles_and_overlays( -# # The number of observations in each sliding window. -# # Should contain a whole number of complete seasonality cycles, e.g. 24*7*k for k weekly seasonality cycles on hourly data. -# groupby_sliding_window_size=7*13, # x-axis, sliding windows with 13 weeks of daily observations. -# show_mean=True, -# show_quantiles=False, -# show_overlays=True, -# center_values=True, -# # overlays by the seasonality of interest (e.g. "hour", "str_dow", "dom") -# overlay_label_time_feature="str_dow", -# overlay_style={"line": {"width": 1}, "opacity": 0.5}, -# ylabel=ts.original_value_col, -# title="daily averages over time (centered)", -# ) -# -# #. For additional customization, fetch the dataframe for plotting via -# `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.get_quantiles_and_overlays`, -# compute additional stats as needed, and plot with -# `~greykite.common.viz.timeseries_plotting.plot_multivariate`. 
-# For example, to remove the mean effect in seasonality changepoints (option b):: -# -# grouped_df = ts.get_quantiles_and_overlays( -# groupby_sliding_window_size=7*13, # accepts the same parameters as `plot_quantiles_and_overlays` -# show_mean=True, -# show_quantiles=False, -# show_overlays=True, -# center_values=False, # note! does not center, to compute raw differences from the mean below -# overlay_label_time_feature="str_dow", -# ) -# overlay_minus_mean = grouped_df[OVERLAY_COL_GROUP] - grouped_df[MEAN_COL_GROUP].values # subtracts the mean -# x_col = overlay_minus_mean.index.name -# overlay_minus_mean.reset_index(inplace=True) # `plot_multivariate` expects the x-value to be a column -# fig = plot_multivariate( # plots the deviation from the mean -# df=overlay_minus_mean, -# x_col=x_col, -# ylabel=ts.original_value_col, -# title="day of week effect over time", -# ) -# -# -# #. The yearly seasonality plot can also be used to check for holiday effects. Click -# and drag to zoom in on the dates of interest:: -# -# fig = ts.plot_quantiles_and_overlays( -# groupby_time_feature="month_dom", # date on x-axis -# show_mean=True, -# show_quantiles=False, -# show_overlays=True, -# overlay_label_time_feature="year", # see the value for each year -# overlay_style={"line": {"width": 1}, "opacity": 0.5}, -# center_values=True, -# xlabel="day of year", -# ylabel=ts.original_value_col, -# title="yearly seasonality for each year (centered)", -# ) -# -# .. tip:: -# #. `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.plot_quantiles_and_overlays` -# allows grouping or overlays by (1) a time feature, (2) a sliding window, or (3) a custom column. -# See available time features at `~greykite.common.features.timeseries_features.build_time_features_df`. -# #. You can customize the plot style. See -# `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.plot_quantiles_and_overlays` -# for details. -# -# Load data -# --------- -# To start, let's plot the dataset. It contains daily observations between -# ``2007-12-10`` and ``2016-01-20``. - -# necessary imports -import numpy as np -import plotly - -from greykite.framework.input.univariate_time_series import UnivariateTimeSeries -from greykite.framework.constants import MEAN_COL_GROUP, OVERLAY_COL_GROUP -from greykite.common.constants import TIME_COL -from greykite.common.data_loader import DataLoader -from greykite.common.viz.timeseries_plotting import add_groupby_column -from greykite.common.viz.timeseries_plotting import plot_multivariate - -# Loads dataset into pandas DataFrame -dl = DataLoader() -df = dl.load_peyton_manning() -df.rename(columns={"y": "log(pageviews)"}, inplace=True) # uses a more informative name - -# plots dataset -ts = UnivariateTimeSeries() -ts.load_data( - df=df, - time_col="ts", - value_col="log(pageviews)", - freq="D") -fig = ts.plot() -plotly.io.show(fig) - -# %% -# Yearly seasonality -# ------------------ -# Because the observations are at daily frequency, -# it is possible to see yearly, quarterly, monthly, and weekly seasonality. -# The name of the seasonality refers to the length of one cycle. For example, -# yearly seasonality is a pattern that repeats once a year. -# -# .. tip:: -# It's helpful to start with the longest cycle to see the big picture. -# -# To examine yearly seasonality, plot the average value by day of year. 
-#
-# Use `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.plot_quantiles_and_overlays`
-# with ``show_mean=True`` and ``groupby_time_feature="doy"`` (day of year).
-# ``groupby_time_feature`` accepts any time feature generated by
-# `~greykite.common.features.timeseries_features.build_time_features_df`.
-fig = ts.plot_quantiles_and_overlays(
-    groupby_time_feature="doy",  # day of year
-    show_mean=True,  # shows the mean
-    xlabel="day of year",
-    ylabel=f"mean of {ts.original_value_col}",
-    title="yearly seasonality",
-)
-plotly.io.show(fig)
-
-# %%
-# There is a varying, non-constant pattern over the year, which indicates
-# the presence of yearly seasonality. But the mean often does not reveal
-# the entire story.
-#
-# Use `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.plot_quantiles_and_overlays`
-# to see the volatility. Set ``show_mean=True`` and ``show_quantiles=True`` to plot the mean with the 0.1 and
-# 0.9 quantiles.
-fig = ts.plot_quantiles_and_overlays(
-    groupby_time_feature="doy",
-    show_mean=True,  # shows mean on the plot
-    show_quantiles=True,  # shows quantiles [0.1, 0.9] on the plot
-    xlabel="day of year",
-    ylabel=ts.original_value_col,
-    title="yearly seasonality",
-)
-plotly.io.show(fig)
-
-# %%
-# The day of year does explain a lot of the variation in ``log(pageviews)``. However, the wide quantiles
-# indicate that a lot of variation is not explained by this variable alone. This includes variation from
-# trend, events, and other factors.
-#
-# You can easily request additional quantiles for a better sense of the distribution.
-# Pass a list of the desired quantiles via ``show_quantiles``.
-fig = ts.plot_quantiles_and_overlays(
-    groupby_time_feature="doy",
-    show_mean=True,
-    show_quantiles=[0.1, 0.25, 0.75, 0.9],  # specifies quantiles to include
-    xlabel="day of year",
-    ylabel=ts.original_value_col,
-    title="yearly seasonality",
-)
-plotly.io.show(fig)
-
-# %%
-# Surprisingly, the 75th percentile is below the mean between days 67 and 81.
-#
-# .. tip::
-#   Click and drag to zoom in on the plot.
-#   Reset the view by double clicking inside the plot.
-#
-# To better understand what causes the volatility, we can use overlays to see the
-# seasonality pattern split by a dimension of interest. Let's plot one line
-# for each year to see if the pattern is consistent over time. Specify
-# ``show_overlays=True`` and
-# ``overlay_label_time_feature="year"`` to request overlays, where one line is shown for
-# each year.
-#
-# We also provide plotly styling options for the overlay lines via ``overlay_style`` (optional).
-# Finally, we group by "month_dom" instead of "doy" on the x-axis to make it easier
-# to read the dates in "MM/DD" format.
-fig = ts.plot_quantiles_and_overlays(
-    groupby_time_feature="month_dom",  # groups by "MM/DD", e.g. 03/20 for March 20th.
-    show_mean=True,
-    show_quantiles=False,
-    show_overlays=True,  # shows overlays, as configured by `overlay_label_time_feature`
-    overlay_label_time_feature="year",  # splits by "year"
-    # optional overlay styling, passed to `plotly.graph_objects.Scatter`
-    overlay_style={"line": {"width": 1}, "opacity": 0.5},
-    xlabel="day of year",
-    ylabel=ts.original_value_col,
-    title="yearly seasonality for each year",
-)
-plotly.io.show(fig)
-
-# %%
-# Before we look too carefully, to isolate the effect against the selected groupby
-# feature, it can be helpful to center the overlays. This removes the effect of trend and
-# longer seasonal cycles from the overlays. 
Each line is shifted so that the average effect
-# over a cycle is zero. Quantiles and mean are shifted together, centering the mean at 0,
-# to maintain their relative positions; note that quantiles are still computed on the original
-# uncentered distribution.
-#
-# .. tip::
-#   Always start with an uncentered plot with mean and quantiles to check the magnitude
-#   of the seasonal effect relative to the timeseries' values. Then, center the plot and
-#   use overlays to better understand the effect.
-#
-# The plot below is the same plot after centering with ``center_values=True``.
-fig = ts.plot_quantiles_and_overlays(
-    groupby_time_feature="month_dom",
-    show_mean=True,
-    show_quantiles=False,
-    show_overlays=True,
-    overlay_label_time_feature="year",
-    overlay_style={"line": {"width": 1}, "opacity": 0.5},
-    center_values=True,
-    xlabel="day of year",
-    ylabel=ts.original_value_col,
-    title="yearly seasonality for each year (centered)",
-)
-plotly.io.show(fig)
-
-# %%
-# This plot reveals some new insights:
-#
-# 1. Yearly seasonality is actually weak in this dataset; the line is mostly constant
-#    above/below 0 depending on whether the date is during the football season, which
-#    runs between September and early February.
-# 2. The volatility is larger during the football season, and smaller otherwise.
-# 3. The volatility in early March can be explained by a single spike in 2012.
-#    Similarly, there is an anomaly in June and December.
-#
-# .. note::
-#   The above plot can also be used to assess the effect of yearly holidays. Use
-#   `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.plot_quantiles_and_overlays`
-#   with ``overlay_label_time_feature="year"``, and ``groupby_time_feature`` set to:
-#
-#   - ``"doy"`` (day of year),
-#   - ``"month_dom"`` (month + day of month),
-#   - or ``"woy_dow"`` (week of year + day of week),
-#
-#   to align the holiday to the groupby value across years.
-#
-#   Click and drag to zoom in on a particular date range, to see the holiday's
-#   effect in each year.
-#
-# These insights provide hints for forecasting:
-#
-# - A feature indicating whether a particular date is in the football season
-#   or off-season (potentially split by regular season vs playoffs) is a simple
-#   way to capture most of the yearly variation.
-# - Because the season starts on a different calendar day each year, consider adding
-#   a feature for "days till start of season" and "days since end of season" to capture
-#   the on-ramp and down-ramp.
-# - Check the anomalies to see if they should be considered outliers; if so,
-#   remove them from the training data to avoid affecting future predictions.
-#
-# With the insight that the values closely depend on the football season,
-# and knowing that football games are played on particular days of the week,
-# starting on a particular week of the year, we may expect yearly seasonal patterns
-# to depend more on "week of year" + "day of week" than on the calendar date. (The
-# same calendar date can fall on a different day of the week, depending on the year.)
-# To check this, simply group by ``woy_dow``. This variable is encoded as
-# {week of year}_{day of week}, e.g. 04_01 for Monday of 4th week.
-#
-# This is a different way to label each day of the year that captures both
-# yearly and weekly seasonality at the same time. 
-fig = ts.plot_quantiles_and_overlays(
-    groupby_time_feature="woy_dow",  # week of year and day of week
-    show_mean=True,
-    show_quantiles=True,
-    show_overlays=True,
-    overlay_label_time_feature="year",
-    overlay_style={"line": {"width": 1}, "opacity": 0.5},
-    center_values=True,
-    xlabel="weekofyear_dayofweek",
-    ylabel=ts.original_value_col,
-    title="yearly and weekly seasonality for each year",
-)
-plotly.io.show(fig)
-
-# %%
-# Notice a much stronger relationship than before: the mean varies more
-# with the x-axis value, with tighter quantiles, so ``woy_dow`` explains more
-# variability in the time series. There is a different weekly pattern during and outside
-# the football season, with increasing volatility toward the playoffs (end of season).
-# Next, let's explore the weekly patterns in more detail.
-#
-# Weekly seasonality
-# ------------------
-# So far, we learned that the main seasonal effects depend on day of week and whether the day
-# is during the football season.
-#
-# To check overall weekly seasonality, group by day of week (``str_dow``). We
-# also set ``overlay_label_sliding_window_size=7`` and ``show_overlays=20`` to
-# plot the values for 20 randomly selected weeks from the dataset. The "size" parameter indicates
-# the number of sequential observations contained in each overlay (7=1 week). In the legend, each
-# overlay is labeled by the first date in the overlay's sliding window.
-fig = ts.plot_quantiles_and_overlays(
-    groupby_time_feature="str_dow",
-    show_mean=True,
-    show_quantiles=True,
-    show_overlays=20,  # randomly selects up to 20 overlays
-    overlay_label_sliding_window_size=7,  # each overlay is a single cycle (week)
-    center_values=False,
-    xlabel="day of week",
-    ylabel=ts.original_value_col,
-    title="weekly seasonality with overlays"
-)
-plotly.io.show(fig)
-
-# %%
-# In the above plot, the effect doesn't vary much by day of week,
-# but quantiles are large. Such a plot indicates one of two possibilities:
-#
-# 1) there is no seasonal pattern for this period (cycle length)
-# 2) there is a seasonal pattern for this period, but it
-#    is not consistent across the entire timeseries.
-#
-# (2) is possible when seasonality depends on an interaction
-# term. It may vary by a time dimension, change during an event,
-# or evolve over time. In this case, it could be useful to model
-# the seasonality conditional on that factor when forecasting
-# (interaction terms).
-#
-# For the Peyton Manning dataset, we know there is weekly seasonality
-# during the football season. We suspect the effect is washed out in the
-# above plot, because it averages weekly seasonality during the season
-# and off-season.
-#
-# Suppose we did not already have this insight. How could we detect the presence
-# of weekly seasonality conditional on interactions?
-#
-# - Overlays of individual cycles can suggest the presence of an interaction effect.
-# - Look for clusters of overlay lines with similar (and not flat) patterns.
-#   Try to identify what they have in common.
-#
-# The previous plot showed a random sample of 20 overlays. The plot below
-# selects every 5th overlay, evenly spaced through time. Each overlay is
-# the average of a 28-day sliding window (four cycles) to smooth out volatility
-# (``overlay_label_sliding_window_size=28``).
-# There is a trade-off when setting the sliding window size:
-#
-# - Smaller window = see unique effects, but adds noise
-# - Larger window = smooths out noise, but values regress toward the mean and may hide effects.
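-#
-# For example, on this daily data a window of 28 observations covers exactly four weekly
-# cycles (4 * 7); the analogous window on hourly data would be 4 * 7 * 24 = 672 observations.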
-# -# Given this tradeoff, try a few window sizes to see if any patterns emerge. - -# Selects every 5th overlay. ``which_overlays`` is a list of -# allowed overlays. Each overlay spans 28 days, so every 5th overlay -# allows selection of different months across years. -which_overlays = np.arange(0, ts.df.shape[0], 5) # ``ts.df.shape[0]`` is an upper bound on the number of overlays -overlay_style = { # this is the default style - "opacity": 0.5, - "line": dict( - width=1, - color="#B3B3B3", # light gray - dash="solid"), - "legendgroup": OVERLAY_COL_GROUP} -fig = ts.plot_quantiles_and_overlays( - groupby_time_feature="str_dow", - show_mean=True, - show_quantiles=False, - show_overlays=which_overlays, # indices to show. Also accepts a list of strings (overlay names). - center_values=True, - overlay_label_sliding_window_size=28, # each overlay contains 28 observations (4 weeks) - overlay_style=overlay_style, - xlabel="day of week", - ylabel=ts.original_value_col, - title="weekly seasonality with 28d overlays", -) -plotly.io.show(fig) - -# %% -# In the above plot, some lines are close together above/below the mean -# on Monday, Saturday, and Sunday, suggesting the presence of -# an interaction pattern. In the next section, we explain how to -# detect such interactions. -# -# Checking for interactions -# ------------------------- -# The same function, `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.plot_quantiles_and_overlays`, -# can be used to check the three possible interaction factors: -# -# 1) interaction with time dimension, -# 2) interaction with events, -# 3) seasonality changepoints -# -# 1) Time dimension interaction -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# It is common for a seasonality pattern to depend on a time dimension. -# For example, daily seasonality may differ by day of week, or -# weekly seasonality may change around year end. The seasonality -# changes periodically with a time feature. -# -# To check this, use ``overlay_label_time_feature``. We check whether -# weekly seasonality interacts with month, by setting -# ``overlay_label_time_feature="month"``. -fig = ts.plot_quantiles_and_overlays( - groupby_time_feature="str_dow", - show_mean=True, - show_quantiles=False, - show_overlays=True, - center_values=True, - overlay_label_time_feature="month", # splits overlays by month - overlay_style={"line": {"width": 1}, "opacity": 0.5}, - xlabel="day of week", - ylabel=ts.original_value_col, - title="weekly seasonality by month", -) -plotly.io.show(fig) - -# %% -# There is a clear interaction -- notice two clusters of lines with -# different weekly seasonality patterns. -# (When forecasting, we do need to pay special attention to February, -# whose line is in between the two clusters. This is because it has -# one weekend in the football season and one weekend outside it. -# The month interaction alone is too coarse to reflect this.) -# -# 2) Event/holiday interaction -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# It is also common to have yearly seasonality that interacts with -# an event. For example, for hourly traffic data, a holiday can affect -# the daily seasonality as rush hour traffic is reduced. -# In our dataset, the football season may affect the weekly -# seasonality. -# -# .. note:: -# Both ``events`` and ``time dimensions`` occur at known times in the future; -# the difference is that events require external knowledge about when they -# occur, whereas time dimensions can be derived directly from -# the date itself, without any external knowledge. 
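-#
-# For instance (a quick illustration), a time dimension such as "day of week" can be
-# computed from the timestamp alone::
-#
-#     import pandas as pd
-#     pd.to_datetime(["2016-01-18", "2016-01-19"]).day_name()  # Index(['Monday', 'Tuesday'], dtype='object')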
-# -# Our library contains information about the dates of common holidays, -# but you will need to supply information about other events if desired. -# -# You can pass a custom `pandas.Series` to the plotting function -# to define overlays. The series assigns a label to each row, and -# must have the same length as your input data. -# -# In the code below, we create two overlays using -# a (rough) indicator for ``is_football_season``. -# We used -# `~greykite.common.viz.timeseries_plotting.add_groupby_column` -# to get the derived time feature used to define this indicator. -# See the function's documentation for details. - -# Defines `is_football_season` by "week of year", -# using `add_groupby_column` to get the "week of year" time feature. -df_week_of_year = add_groupby_column( - df=ts.df, - time_col=TIME_COL, # The time column in ts.df is always TIME_COL - groupby_time_feature="woy") # Computes "week of year" based on the time column -added_column = df_week_of_year["groupby_col"] -week_of_year = df_week_of_year["df"][added_column] -is_football_season = (week_of_year <= 6) | (week_of_year >= 36) # rough approximation -fig = ts.plot_quantiles_and_overlays( - groupby_time_feature="str_dow", - show_mean=True, - show_quantiles=False, - show_overlays=True, - center_values=True, - overlay_label_custom_column=is_football_season, # splits overlays by `is_football_season` value - overlay_style={"line": {"width": 1}, "opacity": 0.5}, - aggfunc=np.nanmean, # how to aggregate values for each overlay (default=mean) - xlabel="week of year", - ylabel=ts.original_value_col, - title="weekly seasonality:is_football_season interaction", -) -plotly.io.show(fig) - -# %% -# ``is_football_season`` is able to distinguish the two weekly seasonality -# patterns identified by previous plots. There is strong weekly seasonality -# during the football season, but not outside it. Forecasts that use weekly -# seasonality should account for this important interaction. -# -# 3) Seasonality changepoint -# ^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Lastly, we check if seasonality changes over time. -# For example, the seasonality may increase or decrease -# in magnitude, or its shape may change. -# -# For this, use seasonality changepoint detection. -# See `Changepoint detection <./0200_changepoint_detection.html>`_ -# for details. -# -# Plots can provide additional understanding to tune -# the parameters for changepoint detection. -# -# Let's plot the mean value by "day of week" over time. There will be one -# line for Mondays, one for Tuesdays, etc. We are looking for a change -# in the distribution of the values around the mean; this could indicate, -# for example, that the value on Mondays becomes a smaller % of the weekly -# total over time. -# -# Unlike before, notice that day of week is now the `overlay` feature, -# and we group by sliding windows of 91 observations each. The x-axis is -# indexed by the start of each window. You can adjust the window size -# as you'd like; as before, larger windows smooth out noise, but if the -# window is too large, it may mask meaningful changes. 
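-# As before, each window should hold a whole number of complete cycles; below, ``7*13``
-# gives 91-day windows, i.e. 13 complete weekly cycles per window.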
-fig = ts.plot_quantiles_and_overlays( - groupby_sliding_window_size=7*13, # x-axis, sliding windows with 91 days (13 weeks) each - show_mean=True, - show_quantiles=False, - show_overlays=True, - center_values=True, - overlay_label_time_feature="str_dow", # overlays by the seasonality of interest - overlay_style={"line": {"width": 1}, "opacity": 0.5}, - ylabel=ts.original_value_col, - title="daily averages over time (centered)", -) -plotly.io.show(fig) - -# %% -# This plot is hard to assess because of mean changes -# over time. It would be more clear to see if relative offset -# from the mean changes over time. -# -# To do this, get the raw daily averages using -# `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries.get_quantiles_and_overlays`, -# subtract the mean, and plot the result with -# `~greykite.common.viz.timeseries_plotting.plot_multivariate`. -grouped_df = ts.get_quantiles_and_overlays( - groupby_sliding_window_size=7*13, # accepts the same parameters as `plot_quantiles_and_overlays` - show_mean=True, - show_quantiles=False, - show_overlays=True, - center_values=False, # note! does not center, to compute raw differences from the mean below - overlay_label_time_feature="str_dow", -) -overlay_minus_mean = grouped_df[OVERLAY_COL_GROUP] - grouped_df[MEAN_COL_GROUP].values # subtracts the mean -x_col = overlay_minus_mean.index.name -overlay_minus_mean.reset_index(inplace=True) # `plot_multivariate` expects the x-value to be a column -fig = plot_multivariate( # plots the deviation from the mean - df=overlay_minus_mean, - x_col=x_col, - ylabel=ts.original_value_col, - title="day of week effect over time") -plotly.io.show(fig) - -# %% -# The pattern looks fairly stable until Nov 2013, when Monday -# far surpasses Sunday as the weekly peak. The relative values on Monday -# and Tuesday increase, and the relative values on Saturday and Sunday decline. -# Thus, it may be useful to include a seasonality changepoint around -# that time. -# -# .. tip:: -# You can interact with the plot to focus on a particular day -# by double clicking its name in the legend. Double click again -# to unselect, or single click to show/hide a single series. -# -# Quarterly and monthly seasonality -# --------------------------------- -# Finally, let's check quarterly and monthly seasonality. -# -# Quarterly seasonality is weak relative to the size of the quantiles. -# The overlays do not suggest any clear interaction effects. -# It is likely not useful for a forecast model. -# (Remember to check the plot with ``center_values=False`` -# as well, to better assess the magnitude of the effect.) -fig = ts.plot_quantiles_and_overlays( - groupby_time_feature="doq", # day of quarter - show_mean=True, - show_quantiles=True, - show_overlays=20, # randomly selects up to 20 overlays - # No explicit overlay feature. Each overlay is a single cycle (quarter) - center_values=True, - xlabel="day of quarter", - ylabel=ts.original_value_col, - title="quarterly seasonality", -) -plotly.io.show(fig) - -# %% -# Monthly seasonality is weak relative to the size of the quantiles. -# The overlays do not suggest any clear interaction effects. -# It is likely not useful for a forecast model. -# (Remember to check the plot with ``center_values=False`` -# as well, to better assess the magnitude of the effect.) -fig = ts.plot_quantiles_and_overlays( - groupby_time_feature="dom", - show_mean=True, - show_quantiles=True, - show_overlays=20, # randomly selects up to 20 overlays - # No explicit overlay feature. 
Each overlay is a single cycle (month) - center_values=True, - xlabel="day of month", - ylabel=ts.original_value_col, - title="monthly seasonality", -) -plotly.io.show(fig) - -# %% -# How to forecast with this information -# ------------------------------------- -# Our goal was to identify seasonal patterns in the dataset to create -# a better forecast. -# -# We learned that a good forecast model must model dates during the -# football season and off-season differently. At a minimum, both the -# mean value and weekly seasonality should be allowed to vary depending -# on this ``is_football_season`` variable. -# -# To accomplish this, the following approaches could be considered, from least -# to most complex: -# -# 1. ``is_football_season*weekly_seasonality`` interaction -# 2. ``month*weekly_seasonality + february_week_num*weekly_seasonality`` interaction -# 3. ``woy_dow`` effect -# -# The first option is the most basic. The second allows capturing month-specific, -# weekly seasonality patterns, with special attention given to February, which -# falls both inside and outside the football season. Each week is allowed to -# have a different weekly seasonality. february_week_num is a categorical variable -# indicating the week of February (1, 2, 3, 4). The last option model every day -# of the year as a separate variable. This is unlikely to work well because it -# has too many parameters for the amount of data. -# -# .. note:: -# Appropriately increasing model complexity can improve the model's ability -# to capture meaningful variation. However, unnecessary complexity adds variance to -# the forecast due to estimation noise. A sparser model can better predict the -# future by making more efficient use of the data, as long as it captures the underlying -# dynamics. Proper cross validation and backtesting can be used to pick the best model. -# -# For example, while ``woy_dow`` enables modeling each day of year separately, doing so -# is likely to overfit the training data. Typically, weekly patterns should be modeled with -# weekly seasonality, rather than using yearly seasonality to model shorter -# cyclical patterns. -# -# To capture other seasonal effects, the following model components can be added: -# -# a) ``yearly_seasonality`` to capture weak yearly seasonality -# b) ``season_start`` and ``season_end`` events to capture start and end of season effect -# c) ``weekly seasonality changepoint`` (around Nov 2013) to capture shift in weekly seasonality shape -# -# In the "Silverkite" forecast model, the above components could be specified via -# -# .. code-block:: none -# -# - weekly seasonality: seasonality->weekly_seasonality, custom->extra_pred_cols->"str_dow" -# - yearly seasonality: seasonality->yearly_seasonality, custom->extra_pred_cols->"woy" or "woy_dow" -# - is_football_season: regressors->regressor_cols->"is_football_season" (define custom regressor) -# - start/end of season: holidays->daily_event_df_dict->"season_start","season_end" (define custom event) -# - interactions: custom->feature_sets_enabled, custom->extra_pred_cols (define interactions yourself) -# - changepoint: changepoints->seasonality_changepoints_dict -# -# See :doc:`/pages/model_components/0100_introduction` for details. -# -# Daily seasonality -# ----------------- -# The Peyton Manning dataset cannot have daily seasonality -# (variation within one day), because there is only one observation -# each day. -# -# For completeness, we show how to test for daily seasonality -# using an hourly bike sharing dataset. 
-# -# First, prepare and load your dataset. -df = dl.load_bikesharing() -bikesharing_ts = UnivariateTimeSeries() -bikesharing_ts.load_data( - df=df, - time_col="ts", - value_col="count", - freq="H", - regressor_cols=["tmax", "tmin", "pn"] -) -plotly.io.show(bikesharing_ts.plot()) - -# %% -# We proceed with further exploration for now. Group by -# ``"hour"`` to see the daily seasonality effect. -# There is more bikesharing activity during the day than at night. -fig = bikesharing_ts.plot_quantiles_and_overlays( - groupby_time_feature="hour", - show_mean=True, - show_quantiles=True, - show_overlays=25, - overlay_label_sliding_window_size=24, # each overlay contains 24 observations (1 day) - center_values=False, - xlabel="hour of day", - ylabel="number of shared bikes", - title="bike sharing activity by hour of day" -) -plotly.io.show(fig) - -# %% -# Check for interactions with day of week as follows. In this plot, weekdays -# follow a similar pattern, but Saturday and Sunday are different. -fig = bikesharing_ts.plot_quantiles_and_overlays( - groupby_time_feature="hour", - show_mean=True, - show_quantiles=False, - show_overlays=True, - center_values=True, - overlay_label_time_feature="str_dow", # splits overlays by day of week - overlay_style={"line": {"width": 1}, "opacity": 0.5}, - xlabel="hour of day", - ylabel="number of shared bikes", - title="bike sharing daily seasonality, by day of week" -) -plotly.io.show(fig) - -# %% -# As an aside, for multivariate datasets, you may set -# ``value_col`` to check the seasonality pattern -# for a different metric in the dataset. -# The bike sharing dataset is a multivariate dataset -# with columns "tmax", "tmin", "pn" for max/min daily -# temperature and precipitation. Let's plot max daily -# temperature by week of year. -print(f"Columns: {bikesharing_ts.df.columns}") -fig = bikesharing_ts.plot_quantiles_and_overlays( - value_col="tmax", - groupby_time_feature="woy", - show_mean=True, - show_quantiles=True, - show_overlays=False, - center_values=False, - xlabel="week of year", - title="max daily temperature by week of year" -) -plotly.io.show(fig) diff --git a/docs/nbpages/quickstart/0400_model_summary.py b/docs/nbpages/quickstart/0400_model_summary.py deleted file mode 100644 index d8487e7..0000000 --- a/docs/nbpages/quickstart/0400_model_summary.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Model Summary -============= -For every forecast model trained with the ``SILVERKITE`` algorithm, -you can print the model summary with only a few lines of code. -The model summary gives you insight into model performance, -parameter significance and etc. - -In this example, we will discuss how to utilize the -`~greykite.algo.common.model_summary.ModelSummary` -module to output model summary. - -First we'll load a dataset representing ``log(daily page views)`` -on the Wikipedia page for Peyton Manning. -It contains values from 2007-12-10 to 2016-01-20. More dataset info -`here `_. 
-""" - -import warnings - -warnings.filterwarnings("ignore") - -from greykite.common.data_loader import DataLoader -from greykite.framework.templates.autogen.forecast_config import ForecastConfig -from greykite.framework.templates.autogen.forecast_config import MetadataParam -from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam -from greykite.framework.templates.model_templates import ModelTemplateEnum -from greykite.framework.templates.forecaster import Forecaster - -# Loads dataset into pandas DataFrame -dl = DataLoader() -df = dl.load_peyton_manning() - -# %% -# Then we create a forecast model with ``SILVERKITE`` template. -# For a simple example of creating a forecast model, see -# `Simple Forecast <./0100_simple_forecast.html>`_. -# For a detailed tuning tutorial, see -# `Forecast Model Tuning <../tutorials/0100_forecast_tutorial.html>`_. - -# Specifies dataset information -metadata = MetadataParam( - time_col="ts", # name of the time column - value_col="y", # name of the value column - freq="D" # "H" for hourly, "D" for daily, "W" for weekly, etc. -) - -# Specifies model parameters -model_components = ModelComponentsParam( - changepoints={ - "changepoints_dict": { - "method": "auto", - "potential_changepoint_n": 25, - "regularization_strength": 0.5, - "resample_freq": "7D", - "no_changepoint_distance_from_end": "365D"} - }, - uncertainty={ - "uncertainty_dict": "auto", - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "linear", - }, - } -) - -# Runs the forecast -forecaster = Forecaster() -result = forecaster.run_forecast_config( - df=df, - config=ForecastConfig( - model_template=ModelTemplateEnum.SILVERKITE.name, - forecast_horizon=365, # forecasts 365 steps ahead - coverage=0.95, # 95% prediction intervals - metadata_param=metadata, - model_components_param=model_components - ) -) - -# %% -# Creating model summary -# ^^^^^^^^^^^^^^^^^^^^^^ -# Now that we have the output from :py:meth:`~greykite.framework.templates.forecaster.Forecaster.run_forecast_config`, -# we are able to access the model summary. - -# Initializes the model summary class. -# ``max_colwidth`` is the maximum length of predictor names that can be displayed. -summary = result.model[-1].summary(max_colwidth=30) - -# %% -# The above command creates a model summary class and derives extra information -# that summarizes the model. Generally the summarized information includes -# the following sections: -# -# #. **Model parameter section:** includes basic model parameter information such -# as number of observations, number of features, model name and etc. -# #. **Model residual section:** includes the five number summary of training residuals. -# #. **Model coefficients section (for regression model):** the estimated coefficients -# and their p-values/confidence intervals. For linear regression, these are the -# conventional results; for ridge regression, these are calculated from bootstrap [1]_; -# for lasso regression, these are calculated by multi-sample-splitting [2]_. -# #. **Model coefficients section (for tree model):** the feature significance. -# #. **Model significance section (for regression model only):** the overall significance -# of the regression model, including the coefficient of determination, the -# F-ratio and its p-value, and model AIC/BIC. The results are based on classical -# statistical inference and may not be reliable for regularized methods (ridge, lasso, etc.). -# #. 
**Warning section:** any warnings for the model summary, such as high multicollinearity,
-#    are displayed in this section.
-#
-# To see the summary, you can either type ``summary`` or ``print(summary)``.
-
-# Prints the summary
-print(summary)
-
-# %%
-# The model summary provides useful insights:
-#
-# #. We can check the ``sig. code`` column to see which features are not significant.
-#    For example, the "Independence Day" events are not significant,
-#    therefore we could consider removing them from the model.
-# #. We can check the effect of each feature by examining the confidence interval.
-#    For example, Christmas Day has a negative effect of -0.57, with a confidence interval
-#    of -0.93 to -0.22. The changepoint at 2010-02-15 changes the slope by -2.52, with a
-#    confidence interval of -3.60 to -1.44.
-#
-# For linear regression, the results are the
-# same as the regular regression summary in R (the ``lm`` function).
-# The usual considerations apply when interpreting the results:
-#
-# #. High feature correlation can increase the coefficient variance.
-#    This is common in forecasting problems, so we recommend regularized models.
-# #. There is no standard way to calculate confidence intervals and p-values for regularized
-#    linear models (ridge, lasso, elastic_net). We follow the approach in [1]_ for ridge
-#    inference and [2]_ for lasso inference.
-#    The ideas are to use bootstrap and sample-splitting, respectively.
-#
-#    - For ridge regression, the confidence intervals and p-values are based on biased estimators.
-#      This is a remedy for multicollinearity to produce better forecasts, but it could understate the true
-#      effect of the features.
-#    - For lasso regression, the confidence intervals and p-values are based on a multi-sample-split
-#      procedure. While this approach of generating CIs is optimized for accuracy, they are calculated
-#      independently of the coefficient estimates and are not guaranteed to overlap with the estimates.
-#      Note that the probability of a coefficient being nonzero is also reported in the column ``Prob_nonzero``.
-#      This probability can be used to interpret the significance of the corresponding feature.
-#
-# Moreover, if you would like to explore the numbers behind the printed summary,
-# they are stored in the ``info_dict`` attribute, which is a Python dictionary.
-
-# Prints the keys of the ``info_dict`` dictionary.
-print(summary.info_dict.keys())
-
-# %%
-
-# The above coefficient summary can be accessed as a pandas DataFrame.
-print(summary.info_dict["coef_summary_df"])
-
-# %%
-# Selected features in a category
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-# You may have noticed that there are too many features in the forecast model.
-# It's not easy to read all of them in the coefficient summary table.
-# The model summary class is able to filter the categories of these features.
-# This is done by the
-# `~greykite.algo.common.model_summary.ModelSummary.get_coef_summary`
-# function.
-#
-# A few filters are available, including:
-#
-# - ``is_intercept``: intercept term.
-# - ``is_time_feature``: features defined in `~greykite.common.features.timeseries_features.build_time_features_df`.
-# - ``is_event``: holidays and events.
-# - ``is_trend``: trend features.
-# - ``is_seasonality``: seasonality features.
-# - ``is_lag``: autoregressive features.
-# - ``is_regressor``: extra regressors provided by the user.
-# - ``is_interaction``: interaction terms.
-# -# All filters set to ``True`` will be joined with the logical operator ``or``, -# while all filters set to ``False`` will be joined with the logical operator ``and``. -# Simply speaking, set what you want to see to ``True`` and what you don't want to see -# to ``False``. -# -# By default, ``is_interaction`` is set to ``True``, this means as long as one feature in -# an interaction term belongs to a category set to ``True``, the interaction term is included -# in the output. However, if one feature in an interaction term belongs to a category set to -# ``False``, the interaction is excluded from the output. -# To hide interaction terms, set ``is_interaction`` to ``False``. - -# Displays intercept, trend features but not seasonality features. -summary.get_coef_summary( - is_intercept=True, - is_trend=True, - is_seasonality=False -) - -# %% -# There might be too many featuers for the trend (including interaction terms). -# Let's hide the interaction terms. - -# Displays intercept, trend features but not seasonality features. -# Hides interaction terms. -summary.get_coef_summary( - is_intercept=True, - is_trend=True, - is_seasonality=False, - is_interaction=False -) - -# %% -# Now we can see the pure trend features, including the continuous growth term and trend changepoints. -# Each changepoint's name starts with "cp" followed by the time point it happens. -# The estimated coefficients are the changes in slope at the corresponding changepoints. -# We can also see the significance of the changepoints by examining their p-values. -# -# We can also retrieve the filtered dataframe by setting ``return_df`` to ``True``. -# This way you could further explore the coefficients. - -output = summary.get_coef_summary( - is_intercept=True, - is_trend=True, - is_seasonality=False, - is_interaction=False, - return_df=True # returns the filtered df -) - -# %% -# .. [1] Reference: "An Introduction to Bootstrap", Efron 1993. -# .. [2] Reference: "High-Dimensional Inference: Confidence Intervals, p-Values and R-Software hdi", Dezeure, Buhlmann, Meier and Meinshausen. diff --git a/docs/nbpages/quickstart/0500_grid_search.py b/docs/nbpages/quickstart/0500_grid_search.py deleted file mode 100644 index 19179f7..0000000 --- a/docs/nbpages/quickstart/0500_grid_search.py +++ /dev/null @@ -1,308 +0,0 @@ -""" -Grid Search -=========== - -Forecast models have many hyperparameters that could significantly affect -the accuracy. These hyperparameters control different components -in the model including trend, seasonality, events, etc. -You can learn more about how to configure the components or hyperparameters in -`model tuning tutorial <../tutorials/0100_forecast_tutorial.html>`_. Here we -will see a step-by-step example of how to utilize the "grid search" functionality -to choose the best set of hyperparameters. - -All model templates support grid search. -Here we continue the `model tuning tutorial <../tutorials/0100_forecast_tutorial.html>`_ -example to use the ``SILVERKITE`` model on the Peyton Manning data set. -The mechanism of using grid search in ``PROPHET`` is similar. 
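-
-The basic pattern looks like this (a sketch only; the full, runnable configuration is built
-step by step below, and list-valued entries are the ones explored by grid search):
-
-.. code-block:: python
-
-    from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
-
-    model_components = ModelComponentsParam(
-        seasonality={"yearly_seasonality": [10, 20]},  # two candidate yearly seasonality orders
-        custom={"fit_algorithm_dict": [
-            {"fit_algorithm": "linear"},               # candidate fit algorithm 1
-            {"fit_algorithm": "ridge"},                # candidate fit algorithm 2
-        ]},
-    )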
-""" - -import warnings - -warnings.filterwarnings("ignore") - -from greykite.common.data_loader import DataLoader -from greykite.common.evaluation import EvaluationMetricEnum -from greykite.framework.templates.autogen.forecast_config import ComputationParam -from greykite.framework.templates.autogen.forecast_config import EvaluationMetricParam -from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam -from greykite.framework.templates.autogen.forecast_config import ForecastConfig -from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam -from greykite.framework.templates.forecaster import Forecaster -from greykite.framework.utils.result_summary import summarize_grid_search_results - -# Loads dataset into pandas DataFrame -dl = DataLoader() -df = dl.load_peyton_manning() - -# %% -# Grid search hyperparameters -# --------------------------- -# -# In `model tuning tutorial <../tutorials/0100_forecast_tutorial.html>`_ -# we learned how the components affect the prediction and how to choose the potential -# candidate components. We also learned how to interpret the cross-validation results -# for one set of hyperparameters. In this section, we will go over the ``grid_search`` -# functionality that allows us to compare different sets of hyperparameters by running -# cross-validation on them automatically. -# -# In the `~greykite.framework.templates.autogen.forecast_config.ModelComponentsParam` class, -# each attribute contains a dictionary mapping parameter names to parameter values. You may -# specify either a specific parameter value to use, or a list of values to explore via grid search. -# Grid search is done over every possible combination of hyperparameters across the lists. -# -# .. note:: -# You may only provide lists for these attributes' parameter values, not for the parameter values -# of these attributes' parameter values if they are dictionaries. -# For example, ``seasonality`` is an attribute in ``ModelComponentsParam``, -# which has parameter names ``yearly_seasonality``, ``quarterly_seasonality``, etc. -# We can provide lists for the parameter values of these names. -# On the other hand, ``changepoints`` is an attribute, too, -# which has parameter names ``changepoints_dict`` and ``seasonality_changepoints_dict``. -# Both names take dictionaries as their parameter values. -# We can provide lists of dictionaries as the values, however, within each dictionary, -# we are not allowed to further wrap parameters in lists. -# -# Cross-validation will be performed over these sets of hyperparameters, and the best set of hyperparameters -# will be selected based on the metric you pick, specified by ``cv_selection_metric`` in -# `~greykite.framework.templates.autogen.forecast_config.EvaluationMetricParam`. -# -# Now consider that we want to compare different yearly seasonalities (10 or 20), trend changepoints (None or "auto") -# and fit algorithms (linear or ridge), while keeping all other model components the same. We could specify: - -seasonality = { - "yearly_seasonality": [10, 20], # yearly seasonality could be 10 or 20 - "quarterly_seasonality": False, - "monthly_seasonality": False, - "weekly_seasonality": False, - "daily_seasonality": False -} - -changepoints = { - # Changepoints could be None or auto. 
- "changepoints_dict": [ - None, - {"method": "auto"} - ] -} - -# Specifies custom parameters -custom = { - "fit_algorithm_dict": [ - {"fit_algorithm": "ridge"}, - {"fit_algorithm": "linear", "fit_algorithm_params": dict(missing="drop")} - ] -} - -# Specifies the model components -# Could leave the other components as default, -# or specify them in the normal way. -model_components = ModelComponentsParam( - seasonality=seasonality, - changepoints=changepoints, - custom=custom -) - -# Specifies the metrics -evaluation_metric = EvaluationMetricParam( - # The metrics in ``cv_report_metrics`` will be calculated and reported. - cv_report_metrics=[EvaluationMetricEnum.MeanAbsolutePercentError.name, - EvaluationMetricEnum.MeanSquaredError.name], - # The ``cv_selection_metric`` will be used to select the best set of hyperparameters. - # It will be added to ``cv_report_metrics`` if it's not there. - cv_selection_metric=EvaluationMetricEnum.MeanAbsolutePercentError.name -) - -# Specifies the forecast configuration. -# You could also specify ``forecast_horizon``, ``metadata_param``, etc. -config = ForecastConfig( - model_components_param=model_components, - evaluation_metric_param=evaluation_metric -) - -# %% -# For the configuration above, all other model components parameters are the same but yearly seasonality, -# changepoints and fit algorithm have 2 options each. The model will automatically run -# cross-validation over the 8 cases: -# -# - yearly seasonality = 10, no changepoints, fit algorithm = "linear". -# - yearly seasonality = 20, no changepoints, fit algorithm = "linear". -# - yearly seasonality = 10, automatic changepoints, fit algorithm = "linear". -# - yearly seasonality = 20, automatic changepoints, fit algorithm = "linear". -# - yearly seasonality = 10, no changepoints, fit algorithm = "ridge". -# - yearly seasonality = 20, no changepoints, fit algorithm = "ridge". -# - yearly seasonality = 10, automatic changepoints, fit algorithm = "ridge". -# - yearly seasonality = 20, automatic changepoints, fit algorithm = "ridge". -# -# The CV test scores will be reported for all 8 cases using the metrics in ``cv_report_metrics``, -# and the final model will be trained on the best set of hyperparameters according to the -# ``cv_selection_metric``. -# -# Selective grid search -# --------------------- -# Consider the case when you have 6 model components to tune, each with 3 different candidates. -# In this case, there will be 3^6=729 different sets of hyperparameters to grid search from. -# The results might be convincing because of the exhaustive grid search, however, the running -# time is going to pile up. -# -# It's very common that not all of the 729 sets of hyperparameters makes sense to us, so it -# would be good not to run all of them. There are two ways to do selective grid search: -# -# - Setting ``hyperparameter_budget``. -# - Utilizing ``hyperparameter_override``. -# -# Setting ``hyperparameter_budget`` -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# The ``hyperparameter_budget`` parameter directly controls how many sets of hyperparameters -# will be used in grid search. If this number is less than the number of all possible sets -# of hyperparameters, the algorithm will randomly pick ``hyperparameter_budget`` number of -# hyperparameter sets. Set ``hyperparameter_budget`` to ``-1`` to search all possible sets. -# You may set the budget in the ``ComputationParam`` class. This is a simple way to search a -# large space of hyperparameters if you are not sure which are likely to succeed. 
After you -# identify parameter values with better performance, you may run a more precise grid search -# to fine tune around these values. -# -# .. note:: -# If you have a small number of timeseries to forecast, we recommend using the -# `model tuning tutorial <../tutorials/0100_forecast_tutorial.html>`_ -# to help identify good parameters candidates. This is likely more effective than -# random grid search over a large grid. - -# Specifies the hyperparameter_budget. -# Randomly picks 3 sets of hyperparameters. -computation = ComputationParam( - hyperparameter_budget=3 -) -# Specifies the forecast configuration. -# You could also specify ``forecast_horizon``, ``metadata_param``, etc. -config = ForecastConfig( - model_components_param=model_components, - evaluation_metric_param=evaluation_metric, - computation_param=computation -) - -# %% -# Utilizing ``hyperparameter_override`` -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# The ``hyperparameter_override`` functionality allows us to customize the sets of hyperparameters -# to search within. The way is to specify the ``hyperparameter_override`` parameter in the -# ``ModelComponentsParam`` class. -# First, model components are translated to the parameters in the corresponding sklearn Estimator -# for the template (`~greykite.sklearn.estimator.simple_silverkite_estimator.SimpleSilverkiteEstimator` -# and `~greykite.sklearn.estimator.prophet_estimator.ProphetEstimator`). The name is usually the same as the -# key, for example, "estimator__yearly_seasonality" and "estimator__fit_algorithm_dict" (the ``ModelComponentsParam`` -# attribute is ignored). This creates a default hyperparameter_grid dictionary. Then for each dict in -# ``hyperparameter_override``, the default grid's values are replaced by the override values, producing a -# list of customized grids to search over. Grid search done across all the grids in the list. -# For more details, see -# `hyperparameter override <../../pages/model_components/1000_override.html#selective-grid-search>`_. -# Now assume we have the following parameter options, as above: -# -# - yearly seasonality orders: 10 and 20. -# - trend changepoints: None and "auto". -# - fit algorithm: linear and ridge. -# -# We do not want to run all 8 sets of hyperparameters. For example, we think that -# ridge is not needed for the model without changepoints because the model is simple, while linear should -# not be used when there are changepoints because the model is complex. So we want: -# -# - for no changepoints we use linear regression only. -# - for automatic changepoints we use ridge regression only. -# -# Then we can specify: - -seasonality = { - "yearly_seasonality": [10, 20], - "quarterly_seasonality": False, - "monthly_seasonality": False, - "weekly_seasonality": False, - "daily_seasonality": False -} - -changepoints = { - "changepoints_dict": None -} - -# Specifies custom parameters -custom = { - "fit_algorithm_dict": {"fit_algorithm": "linear"} -} - -# Hyperparameter override can be a list of dictionaries. -# Each dictionary will be one set of hyperparameters. -override = [ - {}, - { - "estimator__changepoints_dict": {"method": "auto"}, - "estimator__fit_algorithm_dict": {"fit_algorithm": "ridge"} - } -] - -# Specifies the model components -# Could leave the other components as default, -# or specify them in the normal way. 
-model_components = ModelComponentsParam( - seasonality=seasonality, - changepoints=changepoints, - custom=custom, - hyperparameter_override=override -) - -# Specifies the evaluation period -evaluation_period = EvaluationPeriodParam( - test_horizon=365, # leaves 365 days as testing data - cv_horizon=365, # each CV test size is 365 days (same as forecast horizon) - cv_max_splits=3, # 3 folds CV - cv_min_train_periods=365 * 4 # uses at least 4 years for training because we have 8 years data -) - -config = ForecastConfig( - forecast_horizon=365, - model_components_param=model_components, - evaluation_metric_param=evaluation_metric, - evaluation_period_param=evaluation_period -) - -# %% -# The forecast configuration above specifies the yearly seasonality orders in -# a list, therefore, both 10 and 20 will be searched. For the hyperparameter override -# list, there are two elements. The first one is an empty dictionary, which corresponds -# to the original changepoint and fit algorithm in the configuration. The second dictionary -# overrides changepoint method with automatic changepoint detection and fit algorithm with ridge. -# In total, the model will run 4 different configurations: -# -# - yearly seasonality 10, no changepoint, fit algorithm linear. -# - yearly seasonality 20, no changepoint, fit algorithm linear. -# - yearly seasonality 10, automatic changepoints, fit algorithm ridge. -# - yearly seasonality 20, automatic changepoints, fit algorithm ridge. -# -# In this way, we could only search the sets of hyperparameters we need and save a lot of time. -# Also note that the above configuration also configures the CV splits using -# `~greykite.framework.templates.autogen.forecast_config.EvaluationPeriodParam`. -# We can see the configs and evaluations with ``summarize_grid_search_results``. - -# Runs the forecast -forecaster = Forecaster() -result = forecaster.run_forecast_config( - df=df, - config=config -) - -# Summarizes the CV results -cv_results = summarize_grid_search_results( - grid_search=result.grid_search, - decimals=1, - # The below saves space in the printed output. Remove to show all available metrics and columns. - cv_report_metrics=None, - column_order=["rank", "mean_test", "split_test", "mean_train", "split_train", "mean_fit_time", "mean_score_time", "params"]) -cv_results["params"] = cv_results["params"].astype(str) -cv_results.set_index("params", drop=True, inplace=True) -cv_results - - -# %% -# .. tip:: -# The simple silverkite templates that use -# `~greykite.sklearn.estimator.simple_silverkite_estimator.SimpleSilverkiteEstimator` -# are the easiest templates to do grid search, because they support a list of model templates -# and a list of ``ModelComponentsParam``. For more information, see -# :doc:`/gallery/tutorials/0200_templates`. \ No newline at end of file diff --git a/docs/nbpages/quickstart/0600_forecast_one_by_one.py b/docs/nbpages/quickstart/0600_forecast_one_by_one.py deleted file mode 100644 index 3f393ac..0000000 --- a/docs/nbpages/quickstart/0600_forecast_one_by_one.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Forecast One By One -=================== - -A useful feature for short-term forecast in Silverkite model family is autoregression. -Silverkite has an "auto" option for autoregression, -which automatically selects the autoregression lag orders based on the data frequency and forecast horizons. -One important rule of this "auto" option is that the minimum order of autoregression terms -is at least the forecast horizon. 
-For example, if the forecast horizon is 3 on a daily model,
-the minimum order of autoregression is set to 3.
-The "auto" option won't use an order of 2 in this case,
-because the 3rd day's forecast would need the 1st day's observation,
-which isn't available at the current time.
-Although the model can make predictions with an autoregression lag order less than the forecast horizon
-via simulations, it takes longer to run, so the "auto" option avoids it.
-
-However, in many cases, smaller autoregression lag orders give more accurate forecasts.
-The only barrier to using an autoregression term of order 2 in the 3-day forecast model
-is the 3rd day; it can be used freely for the first 2 days.
-Similarly, an autoregression term of order 1 can be used for the 1st day.
-In a 3-day forecast, if the accuracy of all 3 days is important, then replacing the first 2 days' models
-with models that use shorter autoregression lag orders can improve the accuracy.
-The forecast-one-by-one algorithm is designed in this context.
-
-Building on these observations, the algorithm fits multiple models with the "auto" option in autoregression
-when the forecast horizon is longer than 1.
-For each model, the "auto" option for autoregression selects the smallest
-available autoregression lag order and predicts the corresponding forecast steps,
-thus improving the forecast accuracy for the early steps.
-
-In this example, we will cover how to activate the forecast-one-by-one approach
-via the ``ForecastConfig`` and the ``Forecaster`` classes.
-For a detailed API reference, please see the
-`~greykite.framework.templates.autogen.forecast_config.ForecastConfig` and
-`~greykite.sklearn.estimator.one_by_one_estimator.OneByOneEstimator` classes.
-"""
-
-import warnings
-
-warnings.filterwarnings("ignore")
-
-import plotly
-from greykite.common.data_loader import DataLoader
-from greykite.framework.templates.autogen.forecast_config import ForecastConfig
-from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
-from greykite.framework.templates.forecaster import Forecaster
-from greykite.framework.templates.model_templates import ModelTemplateEnum
-from greykite.framework.utils.result_summary import summarize_grid_search_results
-
-# Loads dataset into pandas DataFrame
-dl = DataLoader()
-df = dl.load_peyton_manning()
-
-# %%
-# The forecast-one-by-one option
-# ------------------------------
-#
-# The forecast-one-by-one option is specified through the ``forecast_one_by_one`` parameter
-# in ``ForecastConfig``.
-
-config = ForecastConfig(
-    model_template=ModelTemplateEnum.SILVERKITE.name,
-    forecast_horizon=3,
-    model_components_param=ModelComponentsParam(
-        autoregression=dict(autoreg_dict="auto")
-    ),
-    forecast_one_by_one=True
-)
-
-# %%
-# The ``forecast_one_by_one`` parameter can be specified in the following ways:
-#
-# - **``True``**: every forecast step will be a separate model.
-#   The number of models equals the forecast horizon.
-#   In this example, 3 models will be fit for the 3 forecast steps.
-# - **``False``**: the forecast-one-by-one method is turned off.
-#   This is the default behavior; a single model is used for all forecast steps.
-# - **A list of integers**: each integer corresponds to a model,
-#   and it is the number of steps the model covers.
For example, in a 7 day forecast, -# specifying ``forecast_one_by_one=[1, 2, 4]`` will result in 3 models. -# The first model forecasts the 1st day with forecast horizon 1; -# The second model forecasts the 2nd - 3rd days with forecast horizon 3; -# The third model forecasts the 4th - 7th days with forecast horizon 7. -# In this case, the sum of the list entries must equal the forecast horizon. -# - **an integer ``n``**: every model will account for n steps. The last model -# will account for the rest `_. - #. We have pre-defined `~greykite.framework.templates.autogen.forecast_config.ModelComponentsParam` classes - that serve as default estimator parameters for different use cases. These pre-defined ``ModelComponentsParam`` classes have names. - #. You can specify in the ``model_template`` parameter a valid model template name. - The function will automatically map the ``model_template`` input to the corresponding estimator and its default parameters. - #. To override the default values, you can create a - `~greykite.framework.templates.autogen.forecast_config.ModelComponentsParam` class - with only the parameters you want to override, and pass it to the ``model_components_param`` parameter. - -Note that you don't have to specify all values in the -`~greykite.framework.templates.autogen.forecast_config.ModelComponentsParam` -to override the defaults. If a parameter is not specified, the default value for the parameter -specified by the model template name will be used. -In the later sections we will go over the valid ``model_template`` and ``ModelComponentsParam`` for each of the -three estimators. -For details about how to configure the other parameters and how to use the ``run_forecast_config`` function, see -`Tune your first forecast model <./0100_forecast_tutorial.html>`_. - -The three estimators accept different input for ``model_template`` and ``ModelComponentsParam``. -Below are the valid input types for the ``model_template`` parameter. - - - High-level ``Silverkite`` template: for the high-level ``SimpleSilverkiteEstimator``, we have model templates named - ``"SILVERKITE"``, ``"SILVERKITE_EMPTY"``, ``"SILVERKITE_DAILY_90"``, ``"SILVERKITE_WEEKLY"`` and a set of - generic naming following some rules. This type of model templates support list input for both - ``model_template`` and ``model_components_param`` parameters. - This type of model templates are most recommended for ease of use. - - Low-level ``Silverkite`` template: for the low-level ``SilverkiteEstimator``, we have a model template - named ``"SK"``. This template allows you to configure lower-level parameters in the ``Silverkite`` model. - This template does not support list input. - - Prophet template: for the ``ProphetEstimator``, we have a model template named ``"PROPHET"``. - This template does not support list input. - -To customize the default parameters in the templates, the -`~greykite.framework.templates.autogen.forecast_config.ModelComponentsParam` dataclass -takes the following parameters - -* ``growth``: defines how the trend of the time series grows. -* ``seasonality``: defines the seasonality components and orders. -* ``changepoints``: defines when trend and/or seasonality should change, including automatic options. -* ``events``: defines short term events and holidays. -* ``autoregression``: defines the lags and aggregations for the past values. -* ``regressors``: defines extra regressors. -* ``uncertainty``: defines the forecast interval parameters. 
-* ``custom``: defines parameters that do not belong to the other sections. -* ``hyperparameter_override``: used to create overrides for the parameters specified above; useful in grid search. - -The model's tuning parameters are set according to the categories above. -However, different estimators take different types of values for these categories. -We will go over each of the three types of templates, their default values, and how to customize the -``ModelComponentsParam`` for them. -For more general details, see :doc:`/pages/model_components/0100_introduction`. -""" -# Imports related libraries. -import pandas as pd - -from greykite.framework.templates.autogen.forecast_config import ForecastConfig -from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam -from greykite.framework.templates.model_templates import ModelTemplateEnum -from greykite.framework.templates.simple_silverkite_template import SimpleSilverkiteTemplate - -# %% -# The High-level Templates in ``SILVERKITE`` -# ------------------------------------------ -# The high-level templates in ``SILVERKITE`` provides many good defaults that work under different scenarios. -# All templates in this section use `~greykite.sklearn.estimator.simple_silverkite_estimator.SimpleSilverkiteEstimator`. -# The two most basic templates are ``"SILVERKITE"`` and ``"SILVERKITE_EMPTY"``. -# -# ``"SILVERKITE"`` is a template with automatic growth, seasonality, holidays, and interactions. -# It works best for hourly and daily frequencies. -# If you specify ``"SILVERKITE"`` as ``model_template``, the following -# `~greykite.framework.templates.autogen.forecast_config.ModelComponentsParam` class -# is used as default template values. - -model_components_param_silverkite = ModelComponentsParam( - growth={ - "growth_term": "linear" - }, - seasonality={ - "yearly_seasonality": "auto", - "quarterly_seasonality": "auto", - "monthly_seasonality": "auto", - "weekly_seasonality": "auto", - "daily_seasonality": "auto", - }, - changepoints={ - "changepoints_dict": None, - "seasonality_changepoints_dict": None - }, - events={ - "holidays_to_model_separately": "auto", - "holiday_lookup_countries": "auto", - "holiday_pre_num_days": 2, - "holiday_post_num_days": 2, - "holiday_pre_post_num_dict": None, - "daily_event_df_dict": None, - }, - autoregression={ - "autoreg_dict": None - }, - regressors={ - "regressor_cols": [] - }, - uncertainty={ - "uncertainty_dict": None - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - "feature_sets_enabled": "auto", # "auto" based on data freq and size - "max_daily_seas_interaction_order": 5, - "max_weekly_seas_interaction_order": 2, - "extra_pred_cols": [], - "min_admissible_value": None, - "max_admissible_value": None, - } -) - -# %% -# To customize this template, create a ``ModelComponentsParam`` class like above with the parameters you would like to use -# to override the defaults, and feed it to the ``model_components_param`` parameter in ``ForecastConfig``. For example - -custom_model_components = ModelComponentsParam( - seasonality={ - "yearly_seasonality": 15 - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge", - "fit_algorithm_params": None - } - } -) - -# %% -# These two parameters can be put in the -# `~greykite.framework.templates.autogen.forecast_config.ForecastConfig` class. 
-# The parameters used by the model will be those in the ``model_components_param_silverkite`` -# with ``"yearly_seasonality"`` and ``"fit_algorithm_dict"`` overridden by the custom parameters. - -forecast_config = ForecastConfig( - model_template=ModelTemplateEnum.SILVERKITE.name, - model_components_param=custom_model_components -) - -# %% -# Detailed explanations for these parameters are in :doc:`/pages/model_components/0100_introduction`. The following paragraphs -# briefly summarized what each parameter does. -# -# The ``growth`` parameter recognizes the key ``"growth_term"``, which describes the growth rate of the time series model. -# For ``"SILVERKITE"`` template, the value is ``"linear"`` and indicates linear growth. -# -# The ``seasonality`` parameter recognizes the keys ``"yearly_seasonality"``, ``"quarterly_seasonality"``, ``"monthly_seasonality"``, -# ``"weekly_seasonality"`` and ``"daily_seasonality"``. Their values are the corresponding Fourier series values. -# For ``"SILVERKITE"`` template, the values are ``"auto"`` and all orders will be 5. -# -# The ``changepoints`` parameter recognizes the keys ``"changepoints_dict"`` and ``"seasonality_changepoints_dict"``, -# which correspond to trend changepoints and seasonality changepoints. -# For more details of configuring these two parameters, see `Changepoints <../quickstart/0200_changepoint_detection.html>`_. -# For ``"SILVERKITE"`` template, both parameters are ``None``, indicating that neither trend changepoints nor seasonality changepoints -# is included. -# -# The ``events`` parameter recognizes the keys ``"holidays_to_model_separately"``, ``"holiday_lookup_countries"``, -# ``"holiday_pre_num_days"``, ``"holiday_post_num_days"``, ``"holiday_pre_post_num_dict"`` and ``"daily_event_df_dict"``. -# More details can be found at `Holidays and Events <../../pages/model_components/0400_events.html#>`_. -# For ``"SILVERKITE"`` template, it automatically looks up holidays in a holiday dictionary and model major holidays -# plus minus 2 days with separate indicators. -# -# The ``autoregression`` parameter recognizes the key ``"autoreg_dict"``. You can specify lags and aggregated lags through the -# dictionary to trigger autoregressive terms. Specify the value as ``"auto"`` to automatically include recommended -# autoregressive terms for the data frequency and forecast horizon. -# More details can be found at `Autoregression <../../pages/model_components/0800_autoregression.html#>`_. -# For ``"SILVERKITE"`` template, autoregression is not included. -# -# The ``regressors`` parameter recognizes the key ``"regressor_cols"``, which takes a list of regressor column names. These regressor columns -# have to be included in the training df for both training and forecast periods. For more details about regressors, see -# `Regressors <../../pages/model_components/0700_regressors.html#silverkite>`_. -# For ``"SILVERKITE"`` template, no regressors are included. -# -# The ``uncertainty`` parameter recognizes the key ``"uncertainty_dict"``, which takes a dictionary to specify how forecast intervals -# are calculated. For more details about uncertainty, see `Uncertainty <../../pages/model_components/0900_uncertainty.html#silverkite>`_. -# For ``"SILVERKITE"`` template, the default value is ``None``. If ``coverage`` in ``ForecastConfig`` is not None, -# the template uses a default setting based on data frequency. We will see how to set ``coverage`` later. 
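-
-# %%
-# As a quick illustration of overriding the components described above (the values here are
-# only illustrative; ``autoreg_dict="auto"`` and ``uncertainty_dict="auto"`` are the automatic
-# settings mentioned in the previous paragraphs, and ``coverage`` is discussed later),
-# such a config could look like this:
-
-config_with_overrides = ForecastConfig(  # illustrative variable name
-    model_template=ModelTemplateEnum.SILVERKITE.name,
-    coverage=0.95,  # requests 95% prediction intervals
-    model_components_param=ModelComponentsParam(
-        autoregression={"autoreg_dict": "auto"},   # adds recommended autoregressive terms
-        uncertainty={"uncertainty_dict": "auto"},  # interval settings based on data frequency
-    )
-)
-
-# %%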
-#
-# The ``custom`` parameter recognizes specific keys for ``SILVERKITE`` type of templates that correspond to
-# `~greykite.sklearn.estimator.simple_silverkite_estimator.SimpleSilverkiteEstimator`. These keys include
-#
-# - ``"fit_algorithm_dict"`` takes a dictionary to specify what regression method is used to fit the time series.
-#   The default is the ridge regression in `sklearn`. For a detailed list of algorithms, see
-#   `Algorithms <../../pages/model_components/0600_custom.html#fit-algorithm>`_.
-# - ``"feature_sets_enabled"`` defines the interaction terms to be included in the model. A list of pre-defined
-#   interaction terms can be found at `Feature sets <../../pages/model_components/0600_custom.html#interactions>`_.
-#   The ``"SILVERKITE"`` template sets it to ``"auto"``, which automatically chooses the interaction terms
-#   appropriate for the data frequency.
-# - ``"max_daily_seas_interaction_order"`` is the maximum order of Fourier series components in daily seasonality to
-#   be used in interactions. The default is 5.
-# - ``"max_weekly_seas_interaction_order"`` is the maximum order of Fourier series components in weekly seasonality to
-#   be used in interactions. The default is 2.
-# - ``"extra_pred_cols"`` defines extra predictor column names. For details, see
-#   `Extra predictors <../../pages/model_components/0600_custom.html#extra-predictors>`_.
-#   The default is no extra predictors.
-# - ``"min_admissible_value"`` is the minimum admissible value in the forecast. All values below it are clipped to this value.
-#   The default is None.
-# - ``"max_admissible_value"`` is the maximum admissible value in the forecast. All values above it are clipped to this value.
-#   The default is None.
-#
-# All default high-level ``SILVERKITE`` templates are defined through this framework.
-# The ``"SILVERKITE_EMPTY"`` template is an empty template that does not include any component.
-# If you provide ``ModelComponentsParam`` via ``model_components_param`` with ``"SILVERKITE_EMPTY"``,
-# the final model parameters will be exactly what you provided through ``ModelComponentsParam``.
-# This differs from ``"SILVERKITE"``, where any values you do not provide within ``model_components_param``
-# are filled with the ``"SILVERKITE"`` defaults.
-# If you choose the ``"SILVERKITE_EMPTY"`` template but do not provide any ``ModelComponentsParam``
-# via ``model_components_param``, the model will only fit the intercept term.
-#
-# Pre-defined Generic High-level ``SILVERKITE`` Templates
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-# You may want to customize the ``ModelComponentsParam`` but be unsure
-# which values to set for each parameter.
-# The high-level ``SILVERKITE`` template pre-defines sets of values for different components,
-# indexed by human-readable keywords.
-# This allows you to try sensible options for the components using a directive language.
-# For example, "setting seasonality to normal and changepoints to light" is specified by
-# ``sk.SEAS.value.NM`` and ``sk.CP.value.LT``.
-# This option provides rough tuning knobs before fine tuning the exact parameter values.
-# This type of template name must be initialized through the
-# `~greykite.framework.templates.simple_silverkite_template_config.SimpleSilverkiteTemplateOptions`
-# dataclass.
-# You can choose a value for each component and assemble them as a template.
- - -from greykite.framework.templates.simple_silverkite_template_config \ - import SimpleSilverkiteTemplateOptions as st -from greykite.framework.templates.simple_silverkite_template_config \ - import SILVERKITE_COMPONENT_KEYWORDS as sk -# The model template specifies -# hourly frequency, normal seasonality (no quarterly or monthly), linear growth, light trend changepoints, -# separate holidays with plus/minus 2 days, automatic feature sets, ridge regression, automatic autoregression, -# automatic max daily seasonality interaction order and automatic max weekly seasonality interaction order. -model_template = st( - freq=sk.FREQ.value.HOURLY, - seas=sk.SEAS.value.NM, - gr=sk.GR.value.LINEAR, - cp=sk.CP.value.LT, - hol=sk.HOL.value.SP2, - feaset=sk.FEASET.value.AUTO, - algo=sk.ALGO.value.RIDGE, - ar=sk.AR.value.AUTO, - dsi=sk.DSI.value.AUTO, - wsi=sk.WSI.value.AUTO -) - -# %% -# This option provides rough tuning knobs to intuitively try out different model component parameters. -# You can then fine tune the model using ``ModelComponentsParams`` directly. -# A complete list of the key-values are -# -# - ``FREQ``: the data frequency, can be "HOURLY", "DAILY" or "WEEKLY", default "DAILY". -# - ``SEAS``: the seasonality, can be "LT", "NM", "HV", "NONE", "LTQM", "NMQM" or "HVQM", default "LT". -# The "QM" versions include quarterly and monthly seasonality while the others do not. -# - ``GR``: the growth term, can be "LINEAR" or "NONE", default "LINEAR", corresponding to linear growth or constant growth. -# - ``CP``: the automatically detected trend change points, can be "NONE", "LT", "NM", "HV", default "NONE". -# - ``HOL``: the holidays, can be "NONE", "SP1", "SP2", "SP4" or "TG", default "NONE". The default configuration looks up -# popular holidays in a list of popular countries. The "SP{n}" values models major holidays -# with plus/minus n days around them separately, while "TG" models all holidays along with -# plus/minus 2 days together as one indicator. -# - ``FEASET``: the feature sets that defines the interaction terms, can be "AUTO", "ON" or "OFF", default "OFF". -# "AUTO" choose the pre-defined interaction terms automatically, while "ON" and "OFF" includes -# or excludes all pre-defined interaction terms, respectively. -# - ``ALGO``: the algorithm used to fit the model, can be "LINEAR", "RIDGE", "SGD" or "LASSO", default "LINEAR". -# Ridge and Lasso use cross-validation to identify the tuning parameter, while "SGD" -# (stochastic gradient descent) implements L2 norm regularization with tuning parameter 0.001. -# - ``AR``: the autoregressive terms, can be "AUTO" or "OFF", default "OFF". -# - ``DSI``: the maximum daily seasonality order used for interaction in feature sets, can be "AUTO" or "OFF", default "AUTO". -# - ``WSI``: the maximum weekly seasonality order used for interaction in feature sets, can be "AUTO" or "OFF", default "AUTO". -# -# Note that if you do not specify any parameter, the default value will be used: -# ``FREQ=DAILY``, ``SEAS=LT``, ``GR=LINEAR``, ``CP=NONE``, ``HOL=NONE``, ``FEASET=OFF``, ``ALGO=LINEAR``, -# ``AR=OFF``, ``DSI=AUTO``, ``WSI=AUTO``. -# To see how these keywords are converted to these model component params, see -# `~greykite.framework.templates.simple_silverkite_template_config.COMMON_MODELCOMPONENTPARAM_PARAMETERS`. -# However, you can print the ``ModelComponentsParam`` class for a model template with the util function -# `~greykite.framework.templates.simple_silverkite_template.SimpleSilverkiteTemplate.get_model_components_from_model_template`. 
- -sst = SimpleSilverkiteTemplate() -model_components = sst.get_model_components_from_model_template("SILVERKITE_EMPTY") -print(model_components[0]) # `model_components` is a list of length 1. - -# %% -# You can also pass a dataclass. - -model_components = sst.get_model_components_from_model_template(model_template) -print(model_components[0]) # `model_components` is a list of length 1. - -# %% -# Provide a List of Templates -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# For the high-level ``"SILVERKITE"`` templates through the -# `~greykite.sklearn.estimator.simple_silverkite_estimator.SimpleSilverkiteEstimator` estimator, -# you are allowed to provide a list of ``model_template`` or/and a list of ``model_components_param``. -# This option allows you to do grid search and compare over different templates/model component overrides -# at the same time. -# -# For ``model_template``, you can provide a list of any templates defined above. For example, you can do - -model_templates_list = ["SILVERKITE", "SILVERKITE_EMPTY", model_template] - -# %% -# The `~greykite.framework.templates.simple_silverkite_template.SimpleSilverkiteTemplate.get_model_components_from_model_template` -# also takes a list as input. -model_components = sst.get_model_components_from_model_template(model_templates_list) -print(model_components) # There are 3 elements. - -# %% -# For ``model_components_param``, you can also create a list of ``ModelComponentsParam`` classes to override -# the base templates. Each single ``ModelComponentsParam`` is used to override each single base template. -# Therefore, if you provide a list of 4 ``ModelComponentsParam`` via ``model_components_param`` and the list -# of 3 base templates above via ``model_template``, a total of 12 different sets of model parameters is expected. -# However, only unique sets of parameters will be kept. -# -# There are also pre-defined model templates that are defined through lists. -# The ``"SILVERKITE_DAILY_90"`` is a pre-tuned model template on daily data with 90 day's forecast horizon. -# It is defined through the data class with 4 sets of parameters. -# The ``"SILVERKITE_WEEKLY"`` is a pre-tuned model template on weekly data. -# It is defined through the data class with 4 sets of parameters. -# The ``"SILVERKITE_HOURLY_1"``, ``"SILVERKITE_HOURLY_24"``, ``"SILVERKITE_HOURLY_168"``, ``"SILVERKITE_HOURLY_336"`` -# are pre-tuned model templates on hourly data with horizons 1 hour, 1 day, 1 week and 2 weeks, respectively. -# They are defined through the data class with 4 sets of parameters each. -# -# You are also allowed to put these names in the ``model_template`` list, for example - -model_templates_list2 = ["SILVERKITE_DAILY_90", model_template] - -# %% -# This corresponds to 5 single base templates. Whenever you specify multiple sets of parameters -# (list of templates, list of model components, etc.), it's best to have a sufficient number -# of cross validation folds so that the model does not pick a biased set of parameters. -# -# The Low-level Templates in ``SILVERKITE`` -# ----------------------------------------- -# -# There is a pre-defined low-level template named ``"SK"`` that takes low-level parameters and uses -# `~greykite.sklearn.estimator.silverkite_estimator.SilverkiteEstimator`. -# -# The attributes in ``ModelComponentsParam`` are the same as in ``"SILVERKITE"`` but they take different -# types of inputs. 
- -model_components_param_sk = ModelComponentsParam( - growth={ - }, # growth does not accept any parameters, pass growth term via `extra_pred_cols` instead. - seasonality={ - "fs_components_df": [pd.DataFrame({ - "name": ["tod", "tow", "tom", "toq", "toy"], - "period": [24.0, 7.0, 1.0, 1.0, 1.0], - "order": [3, 3, 1, 1, 5], - "seas_names": ["daily", "weekly", "monthly", "quarterly", "yearly"]})], - }, - changepoints={ - "changepoints_dict": [None], - "seasonality_changepoints_dict": [None] - }, - events={ - "daily_event_df_dict": [None] - }, - autoregression={ - "autoreg_dict": [None] - }, - regressors={ - "regressor_cols": [None] - }, - uncertainty={ - "uncertainty_dict": [None] - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - "extra_pred_cols": ["ct1"], # linear growth - "min_admissible_value": [None], - "max_admissible_value": [None], - } -) - -# %% -# The ``growth`` parameter, the dictionary should be empty. The growth term's name is specified -# via ``extra_pred_cols`` in ``custom``. The default growth term is ``"ct1"``, which corresponds to linear growth. -# -# The ``seasonality`` parameter, it recognizes the key ``"fs_components_df"``, which is a pandas dataframe -# that specifies the fourier series generation information. For more information, see -# `~greykite.sklearn.estimator.silverkite_estimator.SilverkiteEstimator`. -# For ``"SK"`` template, the default includes daily, weekly, monthly, quarterly and yearly seasonality -# with orders 3, 3, 1, 1, 5, respectively. -# -# The ``changepoints`` parameter recognizes the keys ``"changepoints_dict"`` and ``"seasonality_changepoints_dict"``. -# Each of the two keys takes a parameter dictionary that corresponds to trend changepoints and seasonality changepoints. -# For more details of configuring these two parameters, see `Changepoints <../quickstart/0200_changepoint_detection.html>`_. -# For ``"SK"`` template, both parameters are ``None``, indicating that neither trend changepoints nor seasonality changepoints -# is included. -# -# The ``events`` parameter recognizes the key ``"daily_event_df_dict"``. -# Specify any events or holidays through the "daily_event_df_dict". The usage is the same as this parameter in ``SILVERKITE``. -# For ``"SK"`` template, the default is no daily events (holidays). -# -# The ``autoregression`` parameter recognizes the key ``"autoreg_dict"``. You can specify lags and aggregated lags through the -# dictionary to trigger autoregressive terms. Specify the value as ``"auto"`` to automatically include the proper order of lags. -# For ``"SK"`` template, autoregression is not included. -# -# The ``regressors`` parameter recognizes the key ``"regressor_cols"``, which takes a list of regressor column names. These regressor columns -# have to be included in the training df for both training and forecast periods. For more details about regressors, see -# `Regressors <../../pages/model_components/0700_regressors.html#silverkite>`_. -# For ``"SK"`` template, no regressors are included. -# -# The ``uncertainty`` parameter recognizes the key ``"uncertainty_dict"``, which takes a dictionary to specify how forecast intervals -# are calculated. For more details about uncertainty, see `Uncertainty <../../pages/model_components/0900_uncertainty.html#silverkite>`_. -# For ``"SK"`` template, the default value is ``None``. If ``coverage`` in ``ForecastConfig`` is not None, it will automatically finds the -# most proper conditional residual to compute forecast intervals. 
We will see how to set ``coverage`` later. -# -# The ``custom`` parameter recognizes specific keys for ``"SK"`` type of template that correspond to -# `~greykite.sklearn.estimator.silverkite_estimator.SilverkiteEstimator`. These keys include -# -# - ``"fit_algorithm_dict"`` takes a dictionary to specify what regression method is used to fit the time series. -# The default is the linear regression in `sklearn`. For a detailed list of algorithms, see -# `Algorithms <../../pages/model_components/0600_custom.html#fit-algorithm>`_. -# - ``"extra_pred_cols"`` defines extra predictor column names. It accepts any valid patsy model formula term. Every column -# name needs to be either generated by `~greykite.common.features.timeseries_features.build_silverkite_features` -# or included in the data df. For details, see -# `Extra predictors <../../pages/model_components/0600_custom.html#extra-predictors>`_. -# The default is ``["ct1"]``, which is the linear growth term. -# - ``"min_admissible_value"`` is the minimum admissible value in forecast. All values below this will be clipped at this value. -# The default is None. -# - ``"max_admissible_value"`` is the maximum admissible value in forecast. All values above this will be clipped at this value. -# The default is None. - -# %% -# A major difference between the high-level and low-level interfaces is that -# the lower-level interface does not have pre-defined holidays or feature sets (interaction terms), -# and takes more customizable seasonality information. Note that ``"SK"`` is the only low-level -# template in ``SILVERKITE`` estimators, and does not support a list of ``model_template`` or -# ``model_components_param``. -# -# The ``"PROPHET"`` Template -# -------------------------- -# -# The ``"PROPHET"`` template uses -# `~greykite.sklearn.estimator.prophet_estimator.ProphetEstimator`, -# which is a wrapper for the `Prophet model `_. -# -# The attributes in ``ModelComponentsParam`` are the same as in ``"SILVERKITE"`` but they take different -# types of inputs. - -model_components_param_prophet = ModelComponentsParam( - growth={ - "growth_term": ["linear"] - }, - seasonality={ - "seasonality_mode": ["additive"], - "seasonality_prior_scale": [10.0], - "yearly_seasonality": ['auto'], - "weekly_seasonality": ['auto'], - "daily_seasonality": ['auto'], - "add_seasonality_dict": [None] - }, - changepoints={ - "changepoint_prior_scale": [0.05], - "changepoints": [None], - "n_changepoints": [25], - "changepoint_range": [0.8] - }, - events={ - "holiday_lookup_countries": "auto", - "holiday_pre_num_days": [2], - "holiday_post_num_days": [2], - "start_year": 2015, - "end_year": 2030, - "holidays_prior_scale": [10.0] - }, - regressors={ - "add_regressor_dict": [None] - }, - uncertainty={ - "mcmc_samples": [0], - "uncertainty_samples": [1000] - } -) - -# %% -# The ``growth`` parameter recognizes the key ``"growth_term"``, which describes the growth rate of the time series model. -# For ``"PROPHET"`` template, the value indicates linear growth. -# -# The ``seasonality`` parameter recognizes the keys ``"seasonality_mode"``, ``"seasonality_prior_scale"``, -# ``"yearly_seasonality"``, ``"weekly_seasonality"``, ``"daily_seasonality"`` and ``"add_seasonality_dict"``. -# For ``"PROPHET"`` template, the seasonality model is "additive" with prior scale 10 and automatic components. -# -# The ``changepoints`` parameter recognizes the keys ``"changepoint_prior_scale"``, ``"changepoints"``, ``"n_changepoints"`` -# and ``"changepoint_range"``. 
-# The Prophet model supports trend changepoints only. -# For ``"PROPHET"`` template, it puts 25 potential trend changepoints uniformly over the first 80% -# data and use regularization with prior scale 0.05. -# -# The ``events`` parameter recognizes the keys ``"holiday_lookup_countries"``, -# ``"holiday_pre_num_days"``, ``"holiday_post_num_days"``, ``"start_year"``, ``"end_year"`` and ``"holidays_prior_scale"``. -# The algorithm automatically looks up holidays in ``"holiday_lookup_countries"``. -# For ``"PROPHET"`` template, it automatically looks up holidays between 2015 and 2030 with their -# plus/minus 2 days. The holiday prior scale is 10. -# -# The Prophet model does not support autoregression, so the ``autoregression`` value should be empty. -# -# The ``regressors`` parameter recognizes the key ``"add_regressor_dict"``. -# For more details about regressors, see -# `Regressors <../../pages/model_components/0700_regressors.html#prophet>`_. -# For ``"PROPHET"`` template, no regressors are included. -# -# The ``uncertainty`` parameter recognizes the key ``"mcmc_samples"`` and ``"uncertainty_samples"``. -# For more details about uncertainty, see `Uncertainty <../../pages/model_components/0900_uncertainty.html#prophet>`_. -# For ``"PROPHET"`` template, the default value is to sample 1000 uncertainty samples. -# -# The Prophet model does not have any specific value in the ``custom`` parameter. - -# %% -# Extra Notes -# ----------- -# - All templates take the ``hyperparameter_override`` key in their -# ``ModelComponentsParam`` class, which is used to define extra grid search options. -# For details, see `Grid search <../quickstart/0500_grid_search.html>`_. -# -# - To specify a string as a template name, it is recommended to use the -# `~greykite.framework.templates.model_templates.ModelTemplateEnum` -# to avoid typos. For example, - -silverkite_template = ModelTemplateEnum.SILVERKITE.name -silverkite_templates = [ - ModelTemplateEnum.SILVERKITE_EMPTY.name, - ModelTemplateEnum.SILVERKITE_DAILY_90.name -] -prophet_template = ModelTemplateEnum.PROPHET.name diff --git a/docs/nbpages/tutorials/0300_benchmark.py b/docs/nbpages/tutorials/0300_benchmark.py deleted file mode 100644 index 083b764..0000000 --- a/docs/nbpages/tutorials/0300_benchmark.py +++ /dev/null @@ -1,458 +0,0 @@ -""" -Benchmarking -============ - -You can easily compare predictive performance of multiple algorithms such as -``Silverkite`` and ``Prophet`` using the -`~greykite.framework.benchmark.benchmark_class.BenchmarkForecastConfig` class. -In this tutorial we describe the step by step process of defining, running and monitoring a benchmark. -We also demonstrate how to use the class functions to compute and plot errors for multiple models. 
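-
-At a high level, a benchmark run has three steps, each covered in a section below
-(a sketch only; the ``configs`` dictionary and the CV parameters used in this tutorial
-are defined later):
-
-.. code-block:: python
-
-    from greykite.framework.benchmark.benchmark_class import BenchmarkForecastConfig
-    from greykite.sklearn.cross_validation import RollingTimeSeriesSplit
-
-    tscv = RollingTimeSeriesSplit(forecast_horizon=7, max_splits=4)  # rolling window CV
-    bm = BenchmarkForecastConfig(df=df, configs=configs, tscv=tscv)  # configs: dict of ForecastConfig
-    bm.run()                # fits every config on every CV split
-    bm.extract_forecasts()  # gathers forecasts for comparison across configs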
-""" - -from dataclasses import replace - -import plotly -import plotly.graph_objects as go - -from greykite.common.evaluation import EvaluationMetricEnum -from greykite.framework.benchmark.benchmark_class import BenchmarkForecastConfig -from greykite.framework.benchmark.data_loader_ts import DataLoaderTS -from greykite.framework.templates.autogen.forecast_config import ComputationParam -from greykite.framework.templates.autogen.forecast_config import EvaluationMetricParam -from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam -from greykite.framework.templates.autogen.forecast_config import MetadataParam -from greykite.framework.templates.autogen.forecast_config import ForecastConfig -from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam -from greykite.sklearn.cross_validation import RollingTimeSeriesSplit - -# %% -# Load the data -# ------------- -# First load your dataset into a pandas dataframe. -# We will use the peyton-manning dataset as a running example. - -# Loads dataset into UnivariateTimeSeries -dl = DataLoaderTS() -ts = dl.load_peyton_manning_ts() -df = ts.df # cleaned pandas.DataFrame - -# %% -# Define the Configs -# ------------------ -# We specify the models we want to benchmark via the ``configs`` parameter. -# In this example we will benchmark 1 ``Prophet`` and 2 different ``Silverkite`` models. -# We first define the common components of the models -# such as ``MetadataParam`` and ``EvaluationMetricParam``, and then update the configuration to specify -# individual models. - -## Define common components of the configs -# Specifies dataset information -metadata = MetadataParam( - time_col="ts", # name of the time column - value_col="y", # name of the value column - freq="D" # "H" for hourly, "D" for daily, "W" for weekly, etc. -) - -# Defines number of periods to forecast into the future -forecast_horizon = 7 - -# Specifies intended coverage of the prediction interval -coverage = 0.95 - -# Defines the metrics to evaluate the forecasts -# We use Mean Absolute Percent Error (MAPE) in this tutorial -evaluation_metric = EvaluationMetricParam( - cv_selection_metric=EvaluationMetricEnum.MeanAbsolutePercentError.name, - cv_report_metrics=None -) - -# Defines the cross-validation config within pipeline -evaluation_period = EvaluationPeriodParam( - cv_max_splits=1, # Benchmarking n_splits is defined in tscv, here we don't need split to choose parameter sets - periods_between_train_test=0, -) - -# Defines parameters related to grid-search computation -computation = ComputationParam( - hyperparameter_budget=None, - n_jobs=-1, # to debug, change to 1 for more informative error messages - verbose=3) - -# Defines common components across all the configs -# ``model_template`` and ``model_components_param`` changes between configs -common_config = ForecastConfig( - metadata_param=metadata, - forecast_horizon=forecast_horizon, - coverage=coverage, - evaluation_metric_param=evaluation_metric, - evaluation_period_param=evaluation_period, - computation_param=computation, -) - -# %% -# Now we update ``common_config`` to specify the individual models. 
- -# Defines ``Prophet`` model template with custom seasonality -model_components = ModelComponentsParam( - seasonality={ - "seasonality_mode": ["additive"], - "yearly_seasonality": ["auto"], - "weekly_seasonality": [True], - }, - growth={ - "growth_term": ["linear"] - } -) -param_update = dict( - model_template="PROPHET", - model_components_param=model_components -) -Prophet = replace(common_config, **param_update) - -# Defines ``Silverkite`` model template with automatic autoregression -# and changepoint detection -model_components = ModelComponentsParam( - changepoints={ - "changepoints_dict": { - "method": "auto", - } - }, - autoregression={ - "autoreg_dict": "auto" - } -) -param_update = dict( - model_template="SILVERKITE", - model_components_param=model_components -) -Silverkite_1 = replace(common_config, **param_update) - -# Defines ``Silverkite`` model template via string encoding -param_update = dict( - model_template="DAILY_SEAS_NMQM_GR_LINEAR_CP_NM_HOL_SP2_FEASET_AUTO_ALGO_RIDGE_AR_AUTO_DSI_AUTO_WSI_AUTO", - model_components_param=None -) -Silverkite_2 = replace(common_config, **param_update) - -# Define the list of configs to benchmark -# The dictionary keys will be used to store the benchmark results -configs = { - "Prophet": Prophet, - "SK_1": Silverkite_1, - "SK_2": Silverkite_2, -} - -# %% -# Define the Cross-Validation (CV) -# -------------------------------- -# In time-series forecasting we use a Rolling Window CV. -# You can easily define it by using -# `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit` class. -# The CV parameters depend on the data frequency, -# forecast horizon as well as the speed of the models. -# See ``Benchmarking documentation`` for guidance on how -# to choose CV parameters for your use case. - -# Define the benchmark folds -# CV parameters are changed for illustration purpose -tscv = RollingTimeSeriesSplit( - forecast_horizon=forecast_horizon, - min_train_periods=2 * 365, - expanding_window=True, - use_most_recent_splits=True, - periods_between_splits=5, - periods_between_train_test=0, - max_splits=4) # reduced to 4 from 16 for faster runtime - -# Print the train, test split for BM folds -for split_num, (train, test) in enumerate(tscv.split(X=df)): - print(split_num, train, test) - -# %% -# Run the Benchmark -# ----------------- -# To start the benchmarking procedure execute its ``run`` method. -# -# If you get an error message at this point, then there is a compatibility issue between your -# benchmark inputs. Check :ref:`Debugging the Benchmark` section for instructions on how to derive valid inputs. - -bm = BenchmarkForecastConfig(df=df, configs=configs, tscv=tscv) -bm.run() - -# %% -# Monitor the Benchmark -# --------------------- -# During benchmarking a couple of color coded progress bars are displayed to inform the user of the -# advancement of the entire process. The first bar displays ``config`` level information, while -# the second bar displays split level information for the current ``config``. -# See example in `Benchmarking documentation`. -# -# On the left side of the progress bar, it shows which ``config``/ split is currently being -# benchmarked and progress within that level as a percentage. -# -# On the right side, the user can see how many ``configs``/ splits have been benchmarked -# and how many are remaining. Additionally, this bar also displays elapsed time and remaining runtime -# for the corresponding level. 
-
-# %%
-# Benchmark Output
-# ----------------
-# The output of a successful benchmark procedure is stored as a nested dictionary under the class attribute
-# ``result``. For details on the structure of this tree, check the
-# ``Benchmarking documentation``.
-#
-# You can extract any specific information by navigating this tree. For example, you can
-# check the summary and component plot of any ``config``.
-
-# Check summary of SK_2 model on first fold
-model = bm.result["SK_2"]["rolling_evaluation"]["split_0"]["pipeline_result"].model
-model[-1].summary(max_colwidth=30)
-
-# %%
-
-# Check component plot of SK_2 on second fold
-model = bm.result["SK_2"]["rolling_evaluation"]["split_1"]["pipeline_result"].model
-fig = model[-1].plot_components()
-plotly.io.show(fig)
-
-
-# %%
-# Compare forecasts
-# ^^^^^^^^^^^^^^^^^
-# To obtain forecasts, run the ``extract_forecasts`` method. You only need to run this once.
-
-bm.extract_forecasts()
-
-# %%
-# This method does two things.
-#
-# * For every ``config``, it gathers forecast results across rolling windows and stores them
-#   as a dataframe in ``rolling_forecast_df`` under the ``config`` key. This helps in comparing forecasts
-#   and prediction accuracy across splits for the ``config``.
-
-# Forecast across rolling windows for SK_1
-forecast_sk_1 = bm.result["SK_1"]["rolling_forecast_df"]
-forecast_sk_1.head()
-
-# %%
-# * Concatenates ``rolling_forecast_df`` for all the ``configs`` and stores it as a dataframe in the
-#   class attribute ``forecasts``. This helps in comparing forecasts and prediction accuracies across ``configs``.
-
-# Forecasts across configs
-bm.forecasts.head()
-
-# %%
-# For any ``config`` you can plot forecasts across splits. This allows you to quickly check if there is
-# any particular time window where the test performance drops. The forecasts for adjacent folds will
-# overlap if the time windows of the corresponding folds overlap.
-
-fig = bm.plot_forecasts_by_config(config_name="SK_1")
-plotly.io.show(fig)
-
-# %%
-# This function becomes more important when assessing a model's performance over a
-# longer period, e.g. a year or multiple years. You can quickly catch if a model's test performance drops
-# during weekends, specific months or holiday seasons.
-#
-# You can also compare forecasts from multiple ``configs`` by ``forecast_step``, which is
-# defined as any number between 1 and ``forecast_horizon``. This is useful in forecasts with longer
-# forecast horizons to check if the forecast volatility changes over time.
-
-fig = bm.plot_forecasts_by_step(forecast_step=3)
-plotly.io.show(fig)
-
-# %%
-# Compare Errors
-# ^^^^^^^^^^^^^^
-# You can compare the predictive performance of your models via multiple evaluation metrics.
-# In this example we will use MAPE and RMSE, but you can use any metric from ``EvaluationMetricEnum``.
-
-metric_dict = {
-    "MAPE": EvaluationMetricEnum.MeanAbsolutePercentError,
-    "RMSE": EvaluationMetricEnum.RootMeanSquaredError
-}
-
-# %%
-# Non Grouping Errors
-# ^^^^^^^^^^^^^^^^^^^
-# To compare evaluation metrics without any grouping, use ``get_evaluation_metrics``.
-# The output shows metric values by ``config`` and ``split``. We can group by ``config_name`` to get
-# metric values aggregated across all folds.
- -# Compute evaluation metrics -evaluation_metrics_df = bm.get_evaluation_metrics(metric_dict=metric_dict) -# Aggregate by model across splits -error_df = evaluation_metrics_df.drop(columns=["split_num"]).groupby("config_name").mean() -error_df - -# %% - -# Visualize -fig = bm.plot_evaluation_metrics(metric_dict) -plotly.io.show(fig) - -# %% -# Train MAPE is high because some values in training dataset are close to 0. -# -# You can also compare the predictive accuracy across splits for any model from ``configs``. -# This allows you to check if the model performance varies significantly across time periods. - -# Compute evaluation metrics for a single config -evaluation_metrics_df = bm.get_evaluation_metrics(metric_dict=metric_dict, config_names=["SK_1"]) -# Aggregate by split number -error_df = evaluation_metrics_df.groupby("split_num").mean() -error_df.head() - -# %% - -# Visualize -title = "Average evaluation metric across rolling windows" -data = [] -# Each row (index) is a config. Adds each row to the bar plot. -for index in error_df.index: - data.append( - go.Bar( - name=index, - x=error_df.columns, - y=error_df.loc[index].values - ) - ) -layout = go.Layout( - xaxis=dict(title=None), - yaxis=dict(title="Metric Value"), - title=title, - title_x=0.5, - showlegend=True, - barmode="group", -) -fig = go.Figure(data=data, layout=layout) -plotly.io.show(fig) - -# %% -# Grouping Errors -# ^^^^^^^^^^^^^^^ -# To compare evaluation metrics with grouping use ``get_grouping_evaluation_metrics``. -# This allows you to group the error values by time features such as day of week, month etc. - -# Compute grouped evaluation metrics -grouped_evaluation_df = bm.get_grouping_evaluation_metrics( - metric_dict=metric_dict, - which="test", - groupby_time_feature="str_dow") -# Aggregate by split number -error_df = grouped_evaluation_df.groupby(["str_dow", "config_name"]).mean() -error_df - -# %% - -# Visualize -fig = bm.plot_grouping_evaluation_metrics( - metric_dict=metric_dict, - which="test", - groupby_time_feature="str_dow") -plotly.io.show(fig) - -# %% -# As you can see all the models have higher MAPE and RMSE during weekends. That means adding -# ``is_weekend`` indicator to the models will help. -# -# Compare runtimes -# ^^^^^^^^^^^^^^^^ -# You can compare and visualize runtimes of the models using the following codes. - -# Compute runtimes -runtime_df = bm.get_runtimes() -# Aggregate across splits -runtimes_df = runtime_df.drop(columns=["split_num"]).groupby("config_name").mean() -runtimes_df - -# %% - -# Visualize -fig = bm.plot_runtimes() -plotly.io.show(fig) - -# %% -# You can see ``Silverkite`` models run almost 3 times faster compared to ``Prophet``. -# -# Debugging the Benchmark -# ----------------------- -# When the `run` method is called, the input ``configs`` are first assessed of -# their suitability for a cohesive benchmarking procedure via the ``validate`` method. -# This is done prior to passing the ``configs`` to the forecasting pipeline to save wasted -# computing time for the user. -# Though not necessary, the user is encouraged to use ``validate`` for debugging. -# -# The ``validate`` method runs a series of checks to ensure that -# -# * The ``configs`` are compatible among themselves. For example, it checks if all the ``configs`` -# have the same ``forecast horizon``. -# * The ``configs`` are compatible with the CV schema. For example, ``forecast_horizon`` and -# ``periods_between_train_test`` parameters of ``configs`` are -# matched against that of the ``tscv``. 
-# -# Note that the ``validate`` method does not guarantee that the models will execute properly -# while in the pipeline. It is a good idea to do a test run on a smaller data and/ or smaller -# number of splits before running the full procedure. -# -# In the event of a mismatch a ``ValueError`` is raised with informative error messages -# to help the user in debugging. Some examples are provided below. -# -# Error due to incompatible model components in config -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -# regressor_cols is not part of Prophet's model components -model_components=ModelComponentsParam( - regressors={ - "regressor_cols": ["regressor1", "regressor2", "regressor_categ"] - } -) -invalid_prophet = replace(Prophet, model_components_param=model_components) -invalid_configs = {"invalid_prophet": invalid_prophet} -bm = BenchmarkForecastConfig(df=df, configs=invalid_configs, tscv=tscv) -try: - bm.validate() -except ValueError as err: - print(err) - -# %% -# Error due to wrong template name -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -# model template name is not part of TemplateEnum, thus invalid -unknown_template = replace(Prophet, model_template="SOME_TEMPLATE") -invalid_configs = {"unknown_template": unknown_template} -bm = BenchmarkForecastConfig(df=df, configs=invalid_configs, tscv=tscv) -try: - bm.validate() -except ValueError as err: - print(err) - -# %% -# Error due to different forecast horizons in configs -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -# the configs are valid by themselves, however incompatible for -# benchmarking as these have different forecast horizons -Prophet_forecast_horizon_30 = replace(Prophet, forecast_horizon=30) -invalid_configs = { - "Prophet": Prophet, - "Prophet_30": Prophet_forecast_horizon_30 -} -bm = BenchmarkForecastConfig(df=df, configs=invalid_configs, tscv=tscv) -try: - bm.validate() -except ValueError as err: - print(err) - -# %% -# Error due to different forecast horizons in config and tscv -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -## Error due to different forecast horizons in config and tscv -tscv = RollingTimeSeriesSplit(forecast_horizon=15) -bm = BenchmarkForecastConfig(df=df, configs=configs, tscv=tscv) -try: - bm.validate() -except ValueError as err: - print(err) diff --git a/docs/nbpages/tutorials/0400_monthly_data.py b/docs/nbpages/tutorials/0400_monthly_data.py deleted file mode 100644 index 404c8a5..0000000 --- a/docs/nbpages/tutorials/0400_monthly_data.py +++ /dev/null @@ -1,385 +0,0 @@ -""" -Example for monthly data -============================== - -This is a basic example for monthly data using Silverkite. -Note that here we are fitting a few simple models and the goal is not to optimize -the results as much as possible. 
-""" - -import warnings -from collections import defaultdict - -import plotly -import pandas as pd - -from greykite.framework.benchmark.data_loader_ts import DataLoaderTS -from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam -from greykite.framework.templates.autogen.forecast_config import ForecastConfig -from greykite.framework.templates.autogen.forecast_config import MetadataParam -from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam -from greykite.framework.templates.forecaster import Forecaster -from greykite.framework.utils.result_summary import summarize_grid_search_results -from greykite.framework.input.univariate_time_series import UnivariateTimeSeries - -warnings.filterwarnings("ignore") - -# %% -# Loads dataset into ``UnivariateTimeSeries``. -dl = DataLoaderTS() -agg_func = {"count": "sum"} -df = dl.load_bikesharing(agg_freq="monthly", agg_func=agg_func) -# In this monthly data the last month data is incomplete, therefore we drop it -df.drop(df.tail(1).index,inplace=True) -df.reset_index(drop=True) -ts = UnivariateTimeSeries() -ts.load_data( - df=df, - time_col="ts", - value_col="count", - freq="MS") - -# %% -# Exploratory data analysis (EDA) -# -------------------------------- -# After reading in a time series, we could first do some exploratory data analysis. -# The `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries` class is -# used to store a timeseries and perform EDA. - -# %% -# A quick description of the data can be obtained as follows. -print(ts.describe_time_col()) -print(ts.describe_value_col()) -print(df.head()) - -# %% -# Let's plot the original timeseries. -# (The interactive plot is generated by ``plotly``: **click to zoom!**) -fig = ts.plot() -plotly.io.show(fig) - -# %% -# Exploratory plots can be plotted to reveal the time series's properties. -# Monthly overlay plot can be used to inspect the annual patterns. -# This plot overlays various years on top of each other. -fig = ts.plot_quantiles_and_overlays( - groupby_time_feature="month", - show_mean=False, - show_quantiles=False, - show_overlays=True, - overlay_label_time_feature="year", - overlay_style={"line": {"width": 1}, "opacity": 0.5}, - center_values=False, - xlabel="month of year", - ylabel=ts.original_value_col, - title="yearly seasonality for each year (centered)",) -plotly.io.show(fig) - -# %% -# Specify common metadata. -forecast_horizon = 4 -time_col = "ts" -value_col = "count" -meta_data_params = MetadataParam( - time_col=time_col, - value_col=value_col, - freq="MS", -) - -# %% -# Specify common evaluation parameters. -# Set minimum input data for training. -cv_min_train_periods = 24 -# Let CV use most recent splits for cross-validation. -cv_use_most_recent_splits = True -# Determine the maximum number of validations. -cv_max_splits = 5 -evaluation_period_param = EvaluationPeriodParam( - test_horizon=forecast_horizon, - cv_horizon=forecast_horizon, - periods_between_train_test=0, - cv_min_train_periods=cv_min_train_periods, - cv_expanding_window=True, - cv_use_most_recent_splits=cv_use_most_recent_splits, - cv_periods_between_splits=None, - cv_periods_between_train_test=0, - cv_max_splits=cv_max_splits, -) - -# %% -# Fit a simple model without autoregression. -# The important modeling parameters for monthly data are as follows. -# These are plugged into ``ModelComponentsParam``. 
-# The ``extra_pred_cols`` is used to specify growth and annual seasonality -# Growth is modelled with both "ct_sqrt", "ct1" for extra flexibility as we have -# longterm data and ridge regularization will avoid over-fitting the trend. -# The annual seasonality is modelled categorically with "C(month)" instead of -# Fourier series. This is because in monthly data, the number of data points in -# year is rather small (12) as opposed to daily data where there are many points in -# the year, which makes categorical representation non-feasible. -# The categorical representation of monthly also is more explainable/interpretable in the model -# summary. -extra_pred_cols = ["ct_sqrt", "ct1", "C(month, levels=list(range(1, 13)))"] -autoregression = None - -# Specify the model parameters -model_components = ModelComponentsParam( - growth=dict(growth_term=None), - seasonality=dict( - yearly_seasonality=[False], - quarterly_seasonality=[False], - monthly_seasonality=[False], - weekly_seasonality=[False], - daily_seasonality=[False] - ), - custom=dict( - fit_algorithm_dict=dict(fit_algorithm="ridge"), - extra_pred_cols=extra_pred_cols - ), - regressors=dict(regressor_cols=None), - autoregression=autoregression, - uncertainty=dict(uncertainty_dict=None), - events=dict(holiday_lookup_countries=None), -) - -# Run the forecast model -forecaster = Forecaster() -result = forecaster.run_forecast_config( - df=df, - config=ForecastConfig( - model_template="SILVERKITE", - coverage=0.95, - forecast_horizon=forecast_horizon, - metadata_param=meta_data_params, - evaluation_period_param=evaluation_period_param, - model_components_param=model_components - ) -) - -# Get the useful fields from the forecast result -model = result.model[-1] -backtest = result.backtest -forecast = result.forecast -grid_search = result.grid_search - -# Check model coefficients / variables -# Get model summary with p-values -print(model.summary()) - -# Get cross-validation results -cv_results = summarize_grid_search_results( - grid_search=grid_search, - decimals=2, - cv_report_metrics=None, - column_order=[ - "rank", "mean_test", "split_test", "mean_train", "split_train", - "mean_fit_time", "mean_score_time", "params"]) -# Transposes to save space in the printed output -print(cv_results.transpose()) - -# Check historical evaluation metrics (on the historical training/test set). -backtest_eval = defaultdict(list) -for metric, value in backtest.train_evaluation.items(): - backtest_eval[metric].append(value) - backtest_eval[metric].append(backtest.test_evaluation[metric]) -metrics = pd.DataFrame(backtest_eval, index=["train", "test"]).T -print(metrics) - -# %% -# Fit/backtest plot: -fig = backtest.plot() -plotly.io.show(fig) - -# %% -# Forecast plot: -fig = forecast.plot() -plotly.io.show(fig) - -# %% -# The components plot: -fig = forecast.plot_components() -plotly.io.show(fig) - -# %% -# Fit a simple model with autoregression. -# This is done by specifying the ``autoregression`` parameter in ``ModelComponentsParam``. -# Note that the auto-regressive structure can be customized further depending on your data. 
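-# For instance, a sketch of a richer specification (hypothetical lag orders, to be chosen
-# based on your data) could also include aggregated lags:
-#
-# .. code-block:: python
-#
-#     autoregression = {
-#         "autoreg_dict": {
-#             "lag_dict": {"orders": [1, 2, 3]},             # individual monthly lags
-#             "agg_lag_dict": {"orders_list": [[1, 2, 3]]}   # average of the last 3 months
-#         }
-#     }
-#
-# Below we keep it simple and only use the lag of order 1.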
-extra_pred_cols = ["ct_sqrt", "ct1", "C(month, levels=list(range(1, 13)))"] -autoregression = { - "autoreg_dict": { - "lag_dict": {"orders": [1]}, - "agg_lag_dict": None - } -} - -# Specify the model parameters -model_components = ModelComponentsParam( - growth=dict(growth_term=None), - seasonality=dict( - yearly_seasonality=[False], - quarterly_seasonality=[False], - monthly_seasonality=[False], - weekly_seasonality=[False], - daily_seasonality=[False] - ), - custom=dict( - fit_algorithm_dict=dict(fit_algorithm="ridge"), - extra_pred_cols=extra_pred_cols - ), - regressors=dict(regressor_cols=None), - autoregression=autoregression, - uncertainty=dict(uncertainty_dict=None), - events=dict(holiday_lookup_countries=None), -) - -# Run the forecast model -forecaster = Forecaster() -result = forecaster.run_forecast_config( - df=df, - config=ForecastConfig( - model_template="SILVERKITE", - coverage=0.95, - forecast_horizon=forecast_horizon, - metadata_param=meta_data_params, - evaluation_period_param=evaluation_period_param, - model_components_param=model_components - ) -) - -# Get the useful fields from the forecast result -model = result.model[-1] -backtest = result.backtest -forecast = result.forecast -grid_search = result.grid_search - -# Check model coefficients / variables -# Get model summary with p-values -print(model.summary()) - -# Get cross-validation results -cv_results = summarize_grid_search_results( - grid_search=grid_search, - decimals=2, - cv_report_metrics=None, - column_order=[ - "rank", "mean_test", "split_test", "mean_train", "split_train", - "mean_fit_time", "mean_score_time", "params"]) -# Transposes to save space in the printed output -print(cv_results.transpose()) - -# Check historical evaluation metrics (on the historical training/test set). -backtest_eval = defaultdict(list) -for metric, value in backtest.train_evaluation.items(): - backtest_eval[metric].append(value) - backtest_eval[metric].append(backtest.test_evaluation[metric]) -metrics = pd.DataFrame(backtest_eval, index=["train", "test"]).T -print(metrics) - -# %% -# Fit/backtest plot: -fig = backtest.plot() -plotly.io.show(fig) - -# %% -# Forecast plot: -fig = forecast.plot() -plotly.io.show(fig) - -# %% -# The components plot: -fig = forecast.plot_components() -plotly.io.show(fig) - -# %% -# Fit a model with time-varying seasonality (month effect). -# This is achieved by adding ``"ct1*C(month)"`` to ``ModelComponentsParam``. -# Note that this feature may or may not be useful in your use case. -# We have included this for demonstration purposes only. -# In this example, while the fit has improved the backtest is inferior to the previous setting. 
-extra_pred_cols = ["ct_sqrt", "ct1", "C(month, levels=list(range(1, 13)))", - "ct1*C(month, levels=list(range(1, 13)))"] -autoregression = { - "autoreg_dict": { - "lag_dict": {"orders": [1]}, - "agg_lag_dict": None - } -} - -# Specify the model parameters -model_components = ModelComponentsParam( - growth=dict(growth_term=None), - seasonality=dict( - yearly_seasonality=[False], - quarterly_seasonality=[False], - monthly_seasonality=[False], - weekly_seasonality=[False], - daily_seasonality=[False] - ), - custom=dict( - fit_algorithm_dict=dict(fit_algorithm="ridge"), - extra_pred_cols=extra_pred_cols - ), - regressors=dict(regressor_cols=None), - autoregression=autoregression, - uncertainty=dict(uncertainty_dict=None), - events=dict(holiday_lookup_countries=None), -) - -# Run the forecast model -forecaster = Forecaster() -result = forecaster.run_forecast_config( - df=df, - config=ForecastConfig( - model_template="SILVERKITE", - coverage=0.95, - forecast_horizon=forecast_horizon, - metadata_param=meta_data_params, - evaluation_period_param=evaluation_period_param, - model_components_param=model_components - ) -) - -# Get the useful fields from the forecast result -model = result.model[-1] -backtest = result.backtest -forecast = result.forecast -grid_search = result.grid_search - -# Check model coefficients / variables -# Get model summary with p-values -print(model.summary()) - -# Get cross-validation results -cv_results = summarize_grid_search_results( - grid_search=grid_search, - decimals=2, - cv_report_metrics=None, - column_order=[ - "rank", "mean_test", "split_test", "mean_train", "split_train", - "mean_fit_time", "mean_score_time", "params"]) -# Transposes to save space in the printed output -print(cv_results.transpose()) - -# Check historical evaluation metrics (on the historical training/test set). -backtest_eval = defaultdict(list) -for metric, value in backtest.train_evaluation.items(): - backtest_eval[metric].append(value) - backtest_eval[metric].append(backtest.test_evaluation[metric]) -metrics = pd.DataFrame(backtest_eval, index=["train", "test"]).T -print(metrics) - -# %% -# Fit/backtest plot: -fig = backtest.plot() -plotly.io.show(fig) - -# %% -# Forecast plot: -fig = forecast.plot() -plotly.io.show(fig) - -# %% -# The components plot: -fig = forecast.plot_components() -plotly.io.show(fig) - diff --git a/docs/nbpages/tutorials/0500_weekly_data.py b/docs/nbpages/tutorials/0500_weekly_data.py deleted file mode 100644 index 32f8fc9..0000000 --- a/docs/nbpages/tutorials/0500_weekly_data.py +++ /dev/null @@ -1,426 +0,0 @@ -""" -Example for weekly data -======================= - -This is a basic example for weekly data using Silverkite. -Note that here we are fitting a few simple models and the goal is not to optimize -the results as much as possible. 
-""" - -import warnings -from collections import defaultdict - -import plotly -import pandas as pd - -from greykite.common.constants import TIME_COL -from greykite.common.constants import VALUE_COL -from greykite.framework.benchmark.data_loader_ts import DataLoader -from greykite.framework.input.univariate_time_series import UnivariateTimeSeries -from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam -from greykite.framework.templates.autogen.forecast_config import ForecastConfig -from greykite.framework.templates.autogen.forecast_config import MetadataParam -from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam -from greykite.framework.templates.forecaster import Forecaster -from greykite.framework.utils.result_summary import summarize_grid_search_results - -warnings.filterwarnings("ignore") - -# %% -# Loads weekly dataset into ``UnivariateTimeSeries``. -dl = DataLoader() -agg_func = {"count": "sum"} -df = dl.load_bikesharing(agg_freq="weekly", agg_func=agg_func) -# In this dataset the first week and last week's data are incomplete, therefore we drop it -df.drop(df.head(1).index,inplace=True) -df.drop(df.tail(1).index,inplace=True) -df.reset_index(drop=True) -ts = UnivariateTimeSeries() -ts.load_data( - df=df, - time_col="ts", - value_col="count", - freq="W-MON") -print(ts.df.head()) - -# %% -# Exploratory Data Analysis (EDA) -# ------------------------------- -# After reading in a time series, we could first do some exploratory data analysis. -# The `~greykite.framework.input.univariate_time_series.UnivariateTimeSeries` class is -# used to store a timeseries and perform EDA. - -# %% -# A quick description of the data can be obtained as follows. -print(ts.describe_time_col()) -print(ts.describe_value_col()) - -# %% -# Let's plot the original timeseries. -# (The interactive plot is generated by ``plotly``: **click to zoom!**) -fig = ts.plot() -plotly.io.show(fig) - -# %% -# Exploratory plots can be plotted to reveal the time series's properties. -# Monthly overlay plot can be used to inspect the annual patterns. -# This plot overlays various years on top of each other. -fig = ts.plot_quantiles_and_overlays( - groupby_time_feature="month", - show_mean=True, - show_quantiles=False, - show_overlays=True, - center_values=True, - overlay_label_time_feature="year", # splits overlays by year - overlay_style={"line": {"width": 1}, "opacity": 0.5}, - xlabel="Month", - ylabel=ts.original_value_col, - title="Yearly seasonality by year (centered)", -) -plotly.io.show(fig) - -# %% -# Weekly overlay plot. -fig = ts.plot_quantiles_and_overlays( - groupby_time_feature="woy", - show_mean=True, - show_quantiles=False, - show_overlays=True, - center_values=True, - overlay_label_time_feature="year", # splits overlays by year - overlay_style={"line": {"width": 1}, "opacity": 0.5}, - xlabel="Week of year", - ylabel=ts.original_value_col, - title="Yearly seasonality by year (centered)", -) -plotly.io.show(fig) - -# %% -# Fit Greykite Models -# ------------------- -# After some exploratory data analysis, let's specify the model parameters and fit a Greykite model. - -# %% -# Specify common metadata. -forecast_horizon = 4 # Forecast 4 weeks -time_col = TIME_COL # "ts" -value_col = VALUE_COL # "y" -metadata = MetadataParam( - time_col=time_col, - value_col=value_col, - freq="W-MON", # Optional, the model will infer the data frequency -) - -# %% -# Specify common evaluation parameters. -# Set minimum input data for training. 
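-# (For weekly data, the 52 * 2 periods below correspond to two years of training history.)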
-cv_min_train_periods = 52 * 2
-# Let CV use most recent splits for cross-validation.
-cv_use_most_recent_splits = True
-# Determine the maximum number of validations.
-cv_max_splits = 6
-evaluation_period = EvaluationPeriodParam(
-    test_horizon=forecast_horizon,
-    cv_horizon=forecast_horizon,
-    periods_between_train_test=0,
-    cv_min_train_periods=cv_min_train_periods,
-    cv_expanding_window=True,
-    cv_use_most_recent_splits=cv_use_most_recent_splits,
-    cv_periods_between_splits=None,
-    cv_periods_between_train_test=0,
-    cv_max_splits=cv_max_splits,
-)
-
-# %%
-# Let's also define a helper function that generates the model results summary and plots.
-def get_model_results_summary(result):
-    """Generates model results summary.
-
-    Parameters
-    ----------
-    result : `ForecastResult`
-        See :class:`~greykite.framework.pipeline.pipeline.ForecastResult` for documentation.
-
-    Returns
-    -------
-    Prints out model coefficients, cross-validation results, overall train/test evaluations.
-    """
-    # Get the useful fields from the forecast result
-    model = result.model[-1]
-    backtest = result.backtest
-    grid_search = result.grid_search
-
-    # Check model coefficients / variables
-    # Get model summary with p-values
-    print(model.summary())
-
-    # Get cross-validation results
-    cv_results = summarize_grid_search_results(
-        grid_search=grid_search,
-        decimals=2,
-        cv_report_metrics=None,
-        column_order=[
-            "rank", "mean_test", "split_test", "mean_train", "split_train",
-            "mean_fit_time", "mean_score_time", "params"])
-    # Transposes to save space in the printed output
-    print("================================= CV Results ==================================")
-    print(cv_results.transpose())
-
-    # Check historical evaluation metrics (on the historical training/test set).
-    backtest_eval = defaultdict(list)
-    for metric, value in backtest.train_evaluation.items():
-        backtest_eval[metric].append(value)
-        backtest_eval[metric].append(backtest.test_evaluation[metric])
-    metrics = pd.DataFrame(backtest_eval, index=["train", "test"]).T
-    print("=========================== Train/Test Evaluation =============================")
-    print(metrics)
-
-# %%
-# Fit a simple model without autoregression.
-# The most important model parameters are specified through ``ModelComponentsParam``.
-# The ``extra_pred_cols`` parameter is used to specify growth and annual seasonality.
-# Growth is modelled with both "ct_sqrt" and "ct1" for extra flexibility, since we have
-# long-term data, and ridge regularization avoids over-fitting the trend.
-# The yearly seasonality is modelled using Fourier series. In ``ModelComponentsParam``,
-# we can specify the order of the Fourier series: the higher the order, the more flexible
-# the patterns the model can capture. Usually one can try integers between 10 and 50.
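-# For example (a sketch): since list values in ``ModelComponentsParam`` define grid search
-# options, several candidate orders can be compared via cross-validation by passing a list:
-#
-# .. code-block:: python
-#
-#     seasonality = {
-#         "yearly_seasonality": [10, 25, 40],  # candidate Fourier orders for grid search
-#         "quarterly_seasonality": 0,
-#         "monthly_seasonality": 0,
-#         "weekly_seasonality": 0,
-#         "daily_seasonality": 0
-#     }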
- -autoregression = None -extra_pred_cols = ["ct1", "ct_sqrt", "ct1:C(month, levels=list(range(1, 13)))"] - -# Specify the model parameters -model_components = ModelComponentsParam( - autoregression=autoregression, - seasonality={ - "yearly_seasonality": 25, - "quarterly_seasonality": 0, - "monthly_seasonality": 0, - "weekly_seasonality": 0, - "daily_seasonality": 0 - }, - changepoints={ - 'changepoints_dict': { - "method": "auto", - "resample_freq": "7D", - "regularization_strength": 0.5, - "potential_changepoint_distance": "14D", - "no_changepoint_distance_from_end": "60D", - "yearly_seasonality_order": 25, - "yearly_seasonality_change_freq": None, - }, - "seasonality_changepoints_dict": None - }, - events={ - "holiday_lookup_countries": [] - }, - growth={ - "growth_term": None - }, - custom={ - 'feature_sets_enabled': False, - 'fit_algorithm_dict': dict(fit_algorithm='ridge'), - 'extra_pred_cols': extra_pred_cols, - } -) - -forecast_config = ForecastConfig( - metadata_param=metadata, - forecast_horizon=forecast_horizon, - coverage=0.95, - evaluation_period_param=evaluation_period, - model_components_param=model_components -) - -# Run the forecast model -forecaster = Forecaster() -result = forecaster.run_forecast_config( - df=ts.df, - config=forecast_config -) - -# %% -# Let's check the model results summary and plots. -get_model_results_summary(result) - -# %% -# Fit/backtest plot: -fig = result.backtest.plot() -plotly.io.show(fig) - -# %% -# Forecast plot: -fig = result.forecast.plot() -plotly.io.show(fig) - -# %% -# The components plot: -fig = result.forecast.plot_components() -plotly.io.show(fig) - -# %% -# Fit a simple model with autoregression. -# This is done by specifying the ``autoregression`` parameter in ``ModelComponentsParam``. -# Note that the auto-regressive structure can be customized further depending on your data. -autoregression = { - "autoreg_dict": { - "lag_dict": {"orders": [1]}, # Only use lag-1 - "agg_lag_dict": None - } -} -extra_pred_cols = ["ct1", "ct_sqrt", "ct1:C(month, levels=list(range(1, 13)))"] - -# Specify the model parameters -model_components = ModelComponentsParam( - autoregression=autoregression, - seasonality={ - "yearly_seasonality": 25, - "quarterly_seasonality": 0, - "monthly_seasonality": 0, - "weekly_seasonality": 0, - "daily_seasonality": 0 - }, - changepoints={ - 'changepoints_dict': { - "method": "auto", - "resample_freq": "7D", - "regularization_strength": 0.5, - "potential_changepoint_distance": "14D", - "no_changepoint_distance_from_end": "60D", - "yearly_seasonality_order": 25, - "yearly_seasonality_change_freq": None, - }, - "seasonality_changepoints_dict": None - }, - events={ - "holiday_lookup_countries": [] - }, - growth={ - "growth_term": None - }, - custom={ - 'feature_sets_enabled': False, - 'fit_algorithm_dict': dict(fit_algorithm='ridge'), - 'extra_pred_cols': extra_pred_cols, - } -) - -forecast_config = ForecastConfig( - metadata_param=metadata, - forecast_horizon=forecast_horizon, - coverage=0.95, - evaluation_period_param=evaluation_period, - model_components_param=model_components -) - -# Run the forecast model -forecaster = Forecaster() -result = forecaster.run_forecast_config( - df=ts.df, - config=forecast_config -) - -# %% -# Let's check the model results summary and plots. 
-get_model_results_summary(result)
-
-# %%
-# Fit/backtest plot:
-fig = result.backtest.plot()
-plotly.io.show(fig)
-
-# %%
-# Forecast plot:
-fig = result.forecast.plot()
-plotly.io.show(fig)
-
-# %%
-# The components plot:
-fig = result.forecast.plot_components()
-plotly.io.show(fig)
-
-# %%
-# Fit a Greykite model with autoregression and forecast one-by-one. Forecast one-by-one is only
-# used when autoregression is set to "auto", and it can be enabled by setting ``forecast_one_by_one=True``
-# in ``ForecastConfig``.
-# Without forecast one-by-one, the lag order in autoregression has to be greater
-# than the forecast horizon in order to avoid simulation (which leads to less accuracy).
-# The advantage of turning on forecast one-by-one is to improve the forecast accuracy by breaking
-# the forecast horizon into smaller steps, fitting multiple models using immediate lags.
-# Note that the forecast one-by-one option may slow down the training.
-autoregression = {
-    "autoreg_dict": "auto"
-}
-extra_pred_cols = ["ct1", "ct_sqrt", "ct1:C(month, levels=list(range(1, 13)))"]
-forecast_one_by_one = True
-
-# Specify the model parameters
-model_components = ModelComponentsParam(
-    autoregression=autoregression,
-    seasonality={
-        "yearly_seasonality": 25,
-        "quarterly_seasonality": 0,
-        "monthly_seasonality": 0,
-        "weekly_seasonality": 0,
-        "daily_seasonality": 0
-    },
-    changepoints={
-        'changepoints_dict': {
-            "method": "auto",
-            "resample_freq": "7D",
-            "regularization_strength": 0.5,
-            "potential_changepoint_distance": "14D",
-            "no_changepoint_distance_from_end": "60D",
-            "yearly_seasonality_order": 25,
-            "yearly_seasonality_change_freq": None,
-        },
-        "seasonality_changepoints_dict": None
-    },
-    events={
-        "holiday_lookup_countries": []
-    },
-    growth={
-        "growth_term": None
-    },
-    custom={
-        'feature_sets_enabled': False,
-        'fit_algorithm_dict': dict(fit_algorithm='ridge'),
-        'extra_pred_cols': extra_pred_cols,
-    }
-)
-
-forecast_config = ForecastConfig(
-    metadata_param=metadata,
-    forecast_horizon=forecast_horizon,
-    coverage=0.95,
-    evaluation_period_param=evaluation_period,
-    model_components_param=model_components,
-    forecast_one_by_one=forecast_one_by_one
-)
-
-# Run the forecast model
-forecaster = Forecaster()
-result = forecaster.run_forecast_config(
-    df=ts.df,
-    config=forecast_config
-)
-
-# %%
-# Let's check the model results summary and plots. Here the forecast one-by-one option fits 4 models,
-# one for each step, hence 4 model summaries are printed and 4 components plots are generated.
-get_model_results_summary(result)
-
-# %%
-# Fit/backtest plot:
-fig = result.backtest.plot()
-plotly.io.show(fig)
-
-# %%
-# Forecast plot:
-fig = result.forecast.plot()
-plotly.io.show(fig)
-
-# %%
-# The components plot:
-figs = result.forecast.plot_components()
-for fig in figs:
-    plotly.io.show(fig)
diff --git a/docs/nbpages/tutorials/0600_reconcile_forecasts.py b/docs/nbpages/tutorials/0600_reconcile_forecasts.py
deleted file mode 100644
index 7ff2c75..0000000
--- a/docs/nbpages/tutorials/0600_reconcile_forecasts.py
+++ /dev/null
@@ -1,664 +0,0 @@
-"""
-Reconcile Forecasts
-===================
-
-This tutorial explains how to use the
-`~greykite.algo.reconcile.convex.reconcile_forecasts.ReconcileAdditiveForecasts`
-class to create forecasts that satisfy inter-forecast additivity constraints.
-
-The inputs are:
-
-1. additive constraints to be satisfied
-2. original (base) forecasts (timeseries)
-3. actuals (timeseries)
-
-The output is adjusted forecasts that satisfy the constraints.
-""" -# %% -# Optimization Approach -# --------------------- -# The adjusted forecasts are computed as a linear transformation of the base forecasts. -# The linear transform is the solution to an optimization problem -# (`details <../../pages/miscellaneous/reconcile_forecasts>`_). -# -# In brief, the objective is to minimize the weighted sum of these error terms: -# -# 1. ``Training MSE``: empirical MSE of the adjusted forecasts on the training set -# 2. ``Bias penalty``: estimated squared bias of adjusted forecast errors -# 3. ``Variance penalty``: estimated variance of adjusted forecast errors for an unbiased -# transformation, assuming base forecasts are unbiased (this underestimates the variance -# if the transformation is biased). -# 4. ``Adjustment penalty``: regularization term that penalizes large adjustments -# -# Subject to these constraints: -# -# 1. Adjusted forecasts satisfy inter-forecast additivity constraints (required) -# 2. Transform is unbiased (optional) -# 3. Transform matrix entries are between [lower, upper] bound (optional) -# -# `~greykite.algo.reconcile.convex.reconcile_forecasts.ReconcileAdditiveForecasts` -# allows you to tune the optimization objective and constraints. -# It also exposes common methods as special cases of this optimization problem. -# The available methods are: -# -# * ``"bottom_up"`` (bottom up) -# * ``"ols"`` (`OLS `_) -# * ``"mint_sample"`` (`MinT `_ with sample covariance) -# * ``"custom"`` (custom objective and constraints) -# -# .. note:: -# -# ``"bottom_up"`` is applicable when the constraints can be represented as a tree. -# It produces reconciled forecasts by summing the leaf nodes. This is equivalent to the -# solution to the optimization that only penalizes adjustment to the leaf nodes' forecasts. -# -# ``"ols"`` and ``"mint_sample"`` include only the variance penalty and require -# that the transform be unbiased. The variance penalty depends on forecast error covariances. -# ``"ols"`` assumes base forecast errors are uncorrelated with equal variance. -# ``"mint_sample"`` uses sample covariance of the forecast errors. - -# %% -# Prepare Input Data -# ------------------ -# In this tutorial, we consider a 3-level tree with the parent-child relationships below. -# -# .. code-block:: none -# -# 00 # level 0 -# / \ -# 10 11 # level 1 -# / | \ /\ -# 20 21 22 23 24 # level 2 -# -# We want the forecasts of parent nodes to equal the sum of the forecasts of their children. - -# %% -# First, we need to generate forecasts for each of the nodes. -# One approach is to generate the forecasts independently, using rolling window -# forecasting to get h-step ahead forecasts over time, for some constant ``h``. -# This can be done with the :doc:`benchmark class `. -# (The variance penalty assumes the residuals have fixed covariance, -# and using constant ``h`` helps with that assumption.) -# -# For this tutorial, we assume that forecasts have already been computed. -# Below, ``forecasts`` and ``actuals`` are pandas DataFrames in long format, where each column -# is a time series, and each row is a time step. The rows are sorted in ascending order. 
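-#
-# For example, with the tree above, the first few rows of ``forecasts`` might look like this
-# (illustrative column names and values only; the real data is loaded below):
-#
-# .. code-block:: python
-#
-#     import pandas as pd
-#     forecasts = pd.DataFrame({
-#         "00": [100.0, 110.0],                    # root
-#         "10": [61.0, 66.0], "11": [40.0, 45.0],  # level 1
-#         "20": [20.0, 22.0], "21": [20.0, 22.0],  # level 2 (leaves)
-#         "22": [21.0, 22.0], "23": [20.0, 23.0], "24": [20.0, 22.0],
-#     })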
-import logging -import plotly -import warnings - -import pandas as pd -import numpy as np - -from greykite.algo.reconcile.convex.reconcile_forecasts import ReconcileAdditiveForecasts -from greykite.common.constants import TIME_COL -from greykite.common.data_loader import DataLoader -from greykite.common.viz.timeseries_plotting import plot_multivariate - -logger = logging.getLogger() -logger.setLevel(logging.ERROR) # reduces logging -warnings.simplefilter("ignore", category=UserWarning) # ignores matplotlib warnings when rendering documentation - -dl = DataLoader() -actuals = dl.load_data(data_name="daily_hierarchical_actuals") -forecasts = dl.load_data(data_name="daily_hierarchical_forecasts") -actuals.set_index(TIME_COL, inplace=True) -forecasts.set_index(TIME_COL, inplace=True) -forecasts.head().round(1) - -# %% -# .. note:: -# -# To use the reconcile method, dataframe columns should contain -# only the forecasts or actuals timeseries. Time should -# not be its own column. -# -# Above, we set time as the index using ``.set_index()``. -# Index values are ignored by the reconcile method -# so you could also choose to drop the column. - -# %% -# The rows and columns in forecasts and actuals correspond to each other. -assert forecasts.index.equals(actuals.index) -assert forecasts.columns.equals(actuals.columns) - -# %% -# Next, we need to encode the constraints. -# In general, these can be defined by ``constraint_matrix``. -# This is a ``c x m`` array encoding ``c`` constraints in ``m`` variables, -# where ``m`` is the number of timeseries. The columns in this matrix -# correspond to the columns in the forecasts/actuals dataframes below. -# The rows encode additive expressions that must equal 0. -constraint_matrix = np.array([ - # 00 10 11 20 21 22 23 24 - [-1, 1, 1, 0, 0, 0, 0, 0], # 0 = -1*x_00 + 1*x_10 + 1*x_11 - [ 0, -1, 0, 1, 1, 1, 0, 0], # 0 = -1*x_10 + 1*x_20 + 1*x_21 + 1*x_22 - [ 0, 0, -1, 0, 0, 0, 1, 1] # 0 = -1*x_11 + 1*x_23 + 1*x_24 -]) - -# %% -# Alternatively, if the graph is a tree, you can use the ``levels`` parameter. This -# is a more concise way to specify additive tree # constraints, where forecasts of -# parent nodes must equal the sum of the forecasts of their children. It assumes -# the columns in ``forecasts`` and ``actuals`` are in the tree's breadth first -# traversal order: i.e., starting from the root, scan left to right, -# top to bottom, as shown below for our example: -# -# .. code-block:: none -# -# 0 -# / \ -# 1 2 -# / | \ / \ -# 3 4 5 6 7 -# -# Here is an equivalent specification using the ``levels`` parameter. - -# The root has two children. -# Its children have 3 and 2 children, respectively. -levels = [[2], [3, 2]] -# Summarize non-leaf nodes by the number of children -# they have, and iterate in breadth first traversal order. -# Each level in the tree becomes a sublist of `levels`. -# -# (2) --> [2] -# / \ -# (3) (2) --> [3, 2] - -# %% -# .. note:: -# -# More formally, ``levels`` specifies the number of children of each -# internal node in the tree. The ith inner list provides the number -# of children of each node in level i. Thus, the first sublist has one -# integer, the length of a sublist is the sum of the previous sublist, -# and all entries in ``levels`` are positive integers. -# All leaf nodes must have the same depth. - -# %% -# For illustration, we plot the inconsistency between forecasts -# of the root node, ``"00"``, and its children. -# Notice that the blue and orange lines do not perfectly overlap. 
-parent = "00" -children = ["10", "11"] -cols = { - f"parent-{parent}": forecasts[parent], - "sum(children)": sum(forecasts[child] for child in children) -} -cols.update({f"child-{child}": forecasts[child] for child in children}) -cols[TIME_COL] = forecasts.index -parent_child_df = pd.DataFrame(cols) -fig = plot_multivariate( - df=parent_child_df, - x_col=TIME_COL, - title=f"Forecasts of node '{parent}' and its children violate the constraint", -) -plotly.io.show(fig) - -# %% -# Forecast reconciliation -# ----------------------- -# -# Training Evaluation -# ^^^^^^^^^^^^^^^^^^^ -# To reconcile these forecasts, we use the -# `~greykite.algo.reconcile.convex.reconcile_forecasts.ReconcileAdditiveForecasts` class. -raf = ReconcileAdditiveForecasts() - -# %% -# Fit -# ~~~ -# Call ``fit()`` to learn the linear transform. -# Available methods are ``"bottom_up"``, ``"ols"``, -# ``"mint_sample"``, ``"custom"``. -# Let's start with the bottom up method. -_ = raf.fit( - forecasts=forecasts, - actuals=actuals, - levels=levels, - method="bottom_up", -) - -# %% -# Each row in the transform matrix shows how to compute -# the adjusted forecast as a linear combination of the base forecasts. -# For the "bottom up" transform, the matrix simply reflects the tree structure. -raf.transform_matrix - -# %% -# We can visualize this matrix to more easily see how forecasts are combined. -# The top row in this plot shows that the adjusted forecast for -# node "00" (tree root) is the sum of all the base forecasts of the leaf nodes. -# "10" and "11" are the sum of their children, and each leaf node keeps its original value. -fig = raf.plot_transform_matrix() -plotly.io.show(fig) - -# %% -# Transform -# ~~~~~~~~~ -# The ``transform()`` method applies the transform and returns the adjusted (consistent) forecasts. -# If we call it without arguments, it applies the transform to the training set. -adjusted_forecasts = raf.transform() -adjusted_forecasts.head().round(1) - -# %% -# The adjusted forecasts on the training set are stored in the ``adjusted_forecasts`` attribute. -assert adjusted_forecasts.equals(raf.adjusted_forecasts) - -# %% -# Evaluate -# ~~~~~~~~ -# Now that we have the actuals, forecasts, and adjusted forecasts, -# we can check how the adjustment affects forecast quality. -# Here, we do evaluation on the training set. -_ = raf.evaluate( - is_train=True, # evaluates on training set - ipython_display=True, # displays evaluation table - plot=True, # displays plots - plot_num_cols=2, # formats plots into two columns -) - -# %% -# For better formatting in this documentation, let's display the -# table again. ``evaluation_df`` contains the -# evaluation table for the training set. The errors for -# the leaf nodes are the same, as expected, because their -# forecasts have not changed. -# The error for nodes "00" and "11" have increased. -raf.evaluation_df.round(1) - -# %% -# .. note:: -# -# The ``ipython_display`` parameter controls whether to display the evaluation table. -# -# - The "\*change" columns show the change in error after adjustment. -# - The "Base\*" columns show evaluation metrics for the original base forecasts. -# - The "Adjusted\*" columns show evaluation metrics for the adjusted forecasts. -# - MAPE/MedAPE = mean/median absolute percentage error, -# RMSE = root mean squared error, pp = percentage point. - -# %% -# We can check the diagnostic plots for more information. -# The "Base vs Adjusted" and "Adjustment Size" plots show that -# the forecasts for "00" and "11" are higher after adjustment. 
-# The "Forecast Error" plot shows that this increased the forecast error. -# (Plots are automatically shown when ``plot=True``. -# To make plots appear inline in this tutorial, we need -# to explicitly show the figures.) -plotly.io.show(raf.figures["base_adj"]) - -# %% -plotly.io.show(raf.figures["adj_size"]) - -# %% -plotly.io.show(raf.figures["error"]) - -# %% -# .. note:: -# -# The ``plot`` parameter controls whether to display -# diagnostic plots to adjusted to base forecasts. -# -# - "Base vs Adjusted Forecast" shows base forecast (blue) vs adjusted forecast (orange) -# - "Adjustment Size (%)" shows the size of the adjustment. -# - "Forecast Error (%)" shows the % error before (blue) and after (orange) adjustment. -# Closer to 0 is better. -# - Note that the y-axes are independent. - - -# %% -# For completeness, we can verify that the actuals -# and adjusted forecasts satisfy the constraints. -# ``constraint_violation`` shows constraint violation on the training set, -# defined as root mean squared violation -# (averaged across time points and constraints), -# divided by root mean squared actual value. -# It should be close to 0 for "adjusted" and "actual". -# (This is not necessary to check, because -# a warning is printed during fitting if actuals do not satisfy the constraints -# or if there is no solution to the optimization problem.) -raf.constraint_violation - -# %% -# Test Set Evaluation -# ^^^^^^^^^^^^^^^^^^^ -# Evaluation on the training set is sufficient for the ``"bottom_up"`` -# and ``"ols"`` methods, because they do not use the forecasts or actuals -# to learn the transform matrix. The transform depends only on the constraints. -# -# The ``"mint_sample"`` and ``"custom"`` methods use forecasts and actuals -# in addition to the constraints, so we should evaluate accuracy -# on an out-of-sample test set. -# -# .. csv-table:: Information used by each method -# :header: "", "constraints", "forecasts", "actuals" -# -# "``bottom_up``", "X", "", "" -# "``ols``", "X", "", "" -# "``mint_sample``", "X", "X", "X" -# "``custom``", "X", "X", "X" -# -# ``"custom"`` always uses the constraints. Whether it uses forecasts -# and actuals depends on the optimization terms: -# -# - ``forecasts``: used for adjustment penalty, train penalty, variance penalty -# with "sample" covariance, preset weight options ("MedAPE", "InverseMedAPE"). -# - ``actuals``: used for bias penalty, train penalty, variance penalty -# with "sample" covariance, preset weight options ("MedAPE", "InverseMedAPE"). - -# %% -# Train -# ~~~~~ -# We'll fit to the first half of the data and evaluate accuracy -# on the second half. -train_size = forecasts.shape[0]//2 -forecasts_train = forecasts.iloc[:train_size,:] -actuals_train = actuals.iloc[:train_size,:] -forecasts_test = forecasts.iloc[train_size:,:] -actuals_test = actuals.iloc[train_size:,:] - -# %% -# Let's try the ``"mint_sample"`` method. -# First, fit the transform and apply it on the training set. -# The transform matrix is more complex than before. -raf = ReconcileAdditiveForecasts() -raf.fit_transform( # fits and transforms the training data - forecasts=forecasts_train, - actuals=actuals_train, - levels=levels, - method="mint_sample" -) -assert raf.transform_matrix is not None # train fit result, set by fit -assert raf.adjusted_forecasts is not None # train transform result, set by transform -fig = raf.plot_transform_matrix() -plotly.io.show(fig) - -# %% -# Now, evaluate accuracy on the training set. 
-# In our example, all the reconciled forecasts have lower error -# than the base forecasts on the training set. -raf.evaluate(is_train=True) -assert raf.evaluation_df is not None # train evaluation result, set by evaluate -assert raf.figures is not None # train evaluation figures, set by evaluate -assert raf.constraint_violation is not None # train constraint violation, set by evaluate -raf.evaluation_df.round(1) - -# %% -# Test -# ~~~~ -# Next, apply the transform to the test set and evaluate accuracy. -# Not all forecasts have improved on the test set. -# This demonstrates the importance of test set evaluation. -raf.transform_evaluate( # transform and evaluates on test data - forecasts_test=forecasts_test, - actuals_test=actuals_test, - ipython_display=False, - plot=False, -) -assert raf.adjusted_forecasts_test is not None # test transform result, set by transform -assert raf.evaluation_df_test is not None # test evaluation result, set by evaluate -assert raf.figures_test is not None # test evaluation figures, set by evaluate -assert raf.constraint_violation_test is not None # test constraint violation, set by evaluate -raf.evaluation_df_test.round(1) - -# %% -# .. note:: -# -# The results for the test set are in the -# corresponding attributes ending with ``"_test"``. -# -# As a summary, here are some key attributes containing the results: -# -# .. code-block:: none -# -# transform_matrix : transform learned from train set -# adjusted_forecasts : adjusted forecasts on train set -# adjusted_forecasts_test : adjusted forecasts on test set -# evaluation_df : evaluation result on train set -# evaluation_df_test : evaluation result on test set -# constraint_violation : normalized constraint violations on train set -# constraint_violation_test : normalized constraint violations on test set -# figures : evaluation plots on train set -# figures_test : evaluation plots on test set -# -# For full attribute details, see -# `~greykite.algo.reconcile.convex.reconcile_forecasts.ReconcileAdditiveForecasts`. - -# %% -# Model Tuning -# ^^^^^^^^^^^^ -# Now that you understand the basic usage, we'll introduce some tuning parameters. -# If you have enough holdout data, you can use the out of sample evaluation to tune the model. -# -# First, try the presets for the ``method`` parameter: -# ``"bottom_up"``, ``"ols"``, ``"mint_sample"``, ``"custom"``. -# -# If you'd like to tune further, use the ``"custom"`` method to tune -# the optimization objective and constraints. -# The tuning parameters and their default values are shown below. -# See `~greykite.algo.reconcile.convex.reconcile_forecasts.ReconcileAdditiveForecasts` -# for details. -raf = ReconcileAdditiveForecasts() -_ = raf.fit_transform_evaluate( # fits, transforms, and evaluates on training data - forecasts=forecasts_train, - actuals=actuals_train, - fit_kwargs=dict( # additional parameters passed to fit() - levels=levels, - method="custom", - # tuning parameters, with their default values for the custom method - lower_bound=None, # Lower bound on each entry of ``transform_matrix``. - upper_bound=None, # Upper bound on each entry of ``transform_matrix``. - unbiased=True, # Whether the resulting transformation must be unbiased. - lam_adj=1.0, # Weight for the adjustment penalty (adj forecast - forecast) - lam_bias=1.0, # Weight for the bias penalty (adj actual - actual). 
- lam_train=1.0, # Weight for the training MSE penalty (adj forecast - actual) - lam_var=1.0, # Weight for the variance penalty (variance of adjusted forecast errors for an unbiased transformation, assuming base forecasts are unbiased) - covariance="sample", # Variance-covariance matrix of base forecast errors, used to compute the variance penalty ("sample", "identity" or numpy array) - weight_adj=None, # Weight for the adjustment penalty to put a different weight per-timeseries. - weight_bias=None, # Weight for the bias penalty to put a different weight per-timeseries. - weight_train=None, # Weight for the train MSE penalty to put a different weight per-timeseries. - weight_var=None, # Weight for the variance penalty to put a different weight per-timeseries. - ), - evaluate_kwargs=dict() # additional parameters passed to evaluate() -) - -# %% -# Using ``"custom"`` with default settings, -# we find good training set performance overall. -raf.evaluation_df.round(1) - -# %% -# Test set performance is also good, except for node "24". -raf.transform_evaluate( - forecasts_test=forecasts_test, - actuals_test=actuals_test, - ipython_display=False, - plot=False -) -raf.evaluation_df_test.round(1) - -# %% -# Notice from the tables that node "24" had the most accurate -# base forecast of all nodes. Therefore, we don't want its adjusted -# forecast to change much. It's possible that the above -# transform was overfitting this node. -# -# We can increase the adjustment penalty for node "24" -# so that its adjusted forecast will be closer to the original one. -# This should allow us to get good forecasts overall and -# for node "24" specifically. - -# the order of `weights` corresponds to `forecasts.columns` -weight = np.array([1, 1, 1, 1, 1, 1, 1, 5]) # weight is 5x higher for node "24" -raf = ReconcileAdditiveForecasts() -_ = raf.fit_transform_evaluate( - forecasts=forecasts_train, - actuals=actuals_train, - fit_kwargs=dict( - levels=levels, - method="custom", - lower_bound=None, - upper_bound=None, - unbiased=True, - lam_adj=1.0, - lam_bias=1.0, - lam_train=1.0, - lam_var=1.0, - covariance="sample", - weight_adj=weight, # apply the weights to adjustment penalty - weight_bias=None, - weight_train=None, - weight_var=None, - ) -) - -# %% -# .. note:: -# -# The default ``weight=None`` puts equal weight on all nodes. -# Weight can also be ``"MedAPE"`` (proportional to MedAPE -# of base forecasts), ``"InverseMedAPE"`` (proportional to 1/MedAPE -# of base forecasts), or a numpy array that specifies the weight -# for each node. -# -# .. note:: -# -# When the transform is unbiased (``unbiased=True``), -# the bias penalty is zero, so ``lam_bias`` and -# ``weight_bias`` have no effect. - -# %% -# The training error looks good. -raf.evaluation_df.round(1) - -# %% -# Plots of the transform matrix and adjustment size -# show that node "24"'s adjusted forecast is almost the -# same as its base forecast. -fig = raf.plot_transform_matrix() -plotly.io.show(fig) - -# %% -plotly.io.show(raf.figures["adj_size"]) - -# %% -# The test error looks better than before. - -# Transform and evaluate on the test set. 
-raf.transform_evaluate( - forecasts_test=forecasts_test, - actuals_test=actuals_test, - ipython_display=False, - plot=True, - plot_num_cols=2, -) -raf.evaluation_df_test.round(1) - -# %% -plotly.io.show(raf.figures_test["base_adj"]) - -# %% -plotly.io.show(raf.figures_test["adj_size"]) - -# %% -plotly.io.show(raf.figures_test["error"]) - - -# %% -# Tuning Tips -# ----------- -# -# If you have enough data, you can use cross validation with multiple test sets -# for a better estimate of test error. You can use test error to select the parameters. -# -# To tune the parameters, -# -# 1. Try all four methods. -# 2. Tune the lambdas and the weights for the custom method. -# -# For example, start with these lambda settings to see -# which penalties are useful: - -lambdas = [ - # lam_adj, lam_bias, lam_train, lam_var - (0, 0, 0, 1), # the same as "mint_sample" if other params are set to default values. - (0, 0, 1, 1), - (1, 0, 0, 1), - (1, 0, 1, 1), # the same as "custom" if other params are set to default values. - (1, 1, 1, 1), # try this one with unbiased=False -] - -# %% -# Tips: -# -# * ``var`` penalty is usually helpful -# * ``train``, ``adj``, ``bias`` penalties are sometimes helpful -# * You can increase the lambda for penalties that are more helpful. -# -# To try a biased transform, set ``(unbiased=False, lam_bias>0)``. -# Avoid ``(unbiased=False, lam_bias=0)``, because that can result in high bias. -# -# Choose weights that fit your needs. For example, you may care about -# the accuracy of some forecasts more than others. -# -# Setting ``weight_adj`` to ``"InverseMedAPE"`` is a convenient way to -# penalize adjustment to base forecasts that are already accurate. -# -# Setting ``weight_bias``, ``weight_train``, or ``weight_var`` -# to ``"MedAPE"`` is a convenient way to improve the error -# on base forecasts that start with high error. - -# %% -# Debugging -# --------- -# Some tips if you need to debug: - -# %% -# 1. Make sure the constraints are properly encoded -# (for the bottom up method, another way is to check -# the transform matrix). -raf.constraint_matrix - -# %% -# 2. The constraint violation should be 0 for the actuals. -raf.constraint_violation -raf.constraint_violation_test - -# %% -# 3. Check the transform matrix to understand predictions. -# -# .. code-block:: -# -# fig = raf.plot_transform_matrix() -# plotly.io.show(fig) - -# %% -# 4. For all methods besides "bottom_up", check if a solution was found to the optimization problem. -# If False, then the ``transform_matrix`` may be set to a fallback option (bottom up transform, if available). -# A warning is printed when this happens ("Failed to find a solution. Falling back to bottom-up method."). -raf.is_optimization_solution - -# %% -# 5. Check ``prob.status`` for details about cvxpy solver status -# and look for printed warnings for any issues. You can pass solver options -# to the ``fit`` method. See -# `~greykite.algo.reconcile.convex.reconcile_forecasts.ReconcileAdditiveForecasts` -# for details. -raf.prob.status - -# %% -# 6. Inspect the objective function value at the identified -# solution and its breakdown into components. This shows the terms -# in the objective after multiplication by the lambdas/weights. -raf.objective_fn_val - -# %% -# 7. Check objective function weights, to make sure -# covariance, etc., match expectations. -raf.objective_weights - -# %% -# 8. Check the convex optimization problem. 
-print(type(raf.prob)) -raf.prob diff --git a/docs/nbpages/tutorials/0700_silverkite_multistage.py b/docs/nbpages/tutorials/0700_silverkite_multistage.py deleted file mode 100644 index 8f2c0e9..0000000 --- a/docs/nbpages/tutorials/0700_silverkite_multistage.py +++ /dev/null @@ -1,480 +0,0 @@ -""" -The Silverkite Multistage Model -=============================== - -This is a tutorial for the Silverkite Multistage model. -Silverkite Multistage is a fast solution designed for more granular time series -(for example, minute-level), where a long history is needed to train a good model. - -For example, suppose we want to train a model on 2 years of 5-minute frequency data. -That's 210,240 observations. -If we directly fit a model to large input data, -training time and resource demand can be high (15+ minutes on i9 CPU). -If we use a shorter period to train the model, -the model will not be able to capture long term effects -such as holidays, monthly/quarterly seasonalities, year-end drops, etc. -There is a trade-off between speed and accuracy. - -On the other hand, if due to data retention policy, -we only have data in the original frequency for a short history, -but we have aggregated data for a longer history, -could we utilize both datasets to make the prediction more accurate? - -Silverkite Multistage is designed to close this gap. -It's easy to observe the following facts: - - - Trend can be learned with data at a weekly/daily granularity. - - Yearly seasonality, weekly seasonality and holiday effects can be learned with daily data. - - Daily seasonality and autoregression effects can be learned with most recent data if the forecast horizon - is small (which is usually the case in minute-level data). - -Then it's natural to think of the idea: not all components in the forecast model need -to be learned from minute-level granularity. Training each component with the least granularity data needed -can greatly save time while keeping the desired accuracy. - -Here we introduce the Silverkite Multistage algorithm, which is built upon the idea above: - - - Silverkite Multistage trains multiple models to fit a time series. - - Each stage of the model trains on the residuals of the previous stages, - takes an appropriate length of data, does an optional aggregation, - and learns the appropriate components for the granularity. - - The final predictions will be the sum of the predictions from all stages of models. - -In practice, we’ve found Silverkite Multistage to reduce training time by up to 10X while maintaining accuracy, -compared to a Silverkite model trained on the full dataset. - -A diagram of the Silverkite Multistage model flow is shown below. - -.. image:: /figures/silverkite_multistage.png - :width: 600 - :alt: Silverkite Multistage training flow - -Next, we will see examples of how to configure Silverkite Multistage models. 
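-
-Before we dive into the details, here is a minimal sketch of running the
-built-in ``SILVERKITE_TWO_STAGE`` template (the dataset, horizon, and template
-choice here are illustrative only; the rest of this tutorial builds the
-configuration explicitly):
-
-.. code-block:: python
-
-    from greykite.framework.benchmark.data_loader_ts import DataLoaderTS
-    from greykite.framework.templates.autogen.forecast_config import ForecastConfig
-    from greykite.framework.templates.forecaster import Forecaster
-
-    ts = DataLoaderTS().load_bikesharing_ts()  # hourly bikesharing example data
-    config = ForecastConfig(
-        model_template="SILVERKITE_TWO_STAGE",  # built-in two-stage template
-        forecast_horizon=24)  # forecast one day ahead for hourly data
-    result = Forecaster().run_forecast_config(df=ts.df, config=config)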
-""" - -# import libraries -import plotly -from greykite.framework.templates.forecaster import Forecaster -from greykite.framework.templates.autogen.forecast_config import ForecastConfig,\ - MetadataParam, ModelComponentsParam, EvaluationPeriodParam -from greykite.framework.templates.model_templates import ModelTemplateEnum -from greykite.framework.benchmark.data_loader_ts import DataLoaderTS -from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import cols_interact -from greykite.framework.templates.silverkite_multistage_template_config import SilverkiteMultistageTemplateConfig - -# %% -# Configuring the Silverkite Multistage model -# ------------------------------------------- -# -# We take an hourly dataset as an example. -# We will use the hourly Washington D.C. bikesharing dataset -# (`source `_). - -# loads the dataset -ts = DataLoaderTS().load_bikesharing_ts() -print(ts.df.head()) - -# plot the data -plotly.io.show(ts.plot()) - -# %% -# The data contains a few years of hourly data. -# Directly training on the entire dataset may take a couple of minutes. -# Now let's consider a two-stage model with the following configuration: -# -# - **Daily model**: a model trained on 2 years of data with daily aggregation. -# The model will learn the trend, yearly seasonality, weekly seasonality and holidays. -# For an explanation of the configuration below, see the `paper `_. -# - **Hourly model**: a model trained on the residuals to learn short term patterns. -# The model will learn daily seasonality, its interaction with the ``is_weekend`` indicator, -# and some autoregression effects. -# -# From `Tune your first forecast model <./0100_forecast_tutorial.html>`_ we know how to specify -# each single model above. The core configuration is specified via -# `~greykite.framework.templates.autogen.forecast_config.ModelComponentsParam`. -# We can specify the two models as follows. 
- -# the daily model -daily_model_components = ModelComponentsParam( - growth=dict( - growth_term="linear" - ), - seasonality=dict( - yearly_seasonality=12, - quarterly_seasonality=0, - monthly_seasonality=0, - weekly_seasonality=5, - daily_seasonality=0 # daily model does not have daily seasonality - ), - changepoints=dict( - changepoints_dict=dict( - method="auto", - regularization_strength=0.5, - yearly_seasonality_order=12, - resample_freq="3D", - potential_changepoint_distance="30D", - no_changepoint_distance_from_end="30D" - ), - seasonality_changepoints_dict=None - ), - autoregression=dict( - autoreg_dict="auto" - ), - events=dict( - holidays_to_model_separately=["Christmas Day", "New Year's Day", "Independence Day", "Thanksgiving"], - holiday_lookup_countries=["UnitedStates"], - holiday_pre_num_days=1, - holiday_post_num_days=1 - ), - custom=dict( - fit_algorithm_dict=dict( - fit_algorithm="ridge" - ), - feature_sets_enabled="auto", - min_admissible_value=0 - ) -) - -# creates daily seasonality interaction with is_weekend -daily_interaction = cols_interact( - static_col="is_weekend", - fs_name="tod_daily", - fs_order=5 -) - -# the hourly model -hourly_model_components = ModelComponentsParam( - growth=dict( - growth_term=None # growth is already modeled in daily model - ), - seasonality=dict( - yearly_seasonality=0, - quarterly_seasonality=0, - monthly_seasonality=0, - weekly_seasonality=0, - daily_seasonality=12 # hourly model has daily seasonality - ), - changepoints=dict( - changepoints_dict=None, - seasonality_changepoints_dict=None - ), - events=dict( - holidays_to_model_separately=None, - holiday_lookup_countries=[], - holiday_pre_num_days=0, - holiday_post_num_days=0 - ), - autoregression=dict( - autoreg_dict="auto" - ), - custom=dict( - fit_algorithm_dict=dict( - fit_algorithm="ridge" - ), - feature_sets_enabled="auto", - extra_pred_cols=daily_interaction - ) -) - -# %% -# Now to use Silverkite Multistage, -# just like specifying the model components of the Simple Silverkite model, -# we need to specify the model components for Silverkite Multistage. -# The Silverkite Multistage configuration is specified via -# ``ModelComponentsParam.custom["silverkite_multistage_configs"]``, -# which takes a list of -# `~greykite.framework.templates.silverkite_multistage_template_config.SilverkiteMultistageTemplateConfig` -# objects, each of which represents a stage of the model. -# -# The ``SilverkiteMultistageTemplateConfig`` object for a single stage takes the following parameters: -# -# - ``train_length``: the length of training data, for example ``"365D"``. -# Looks back from the end of the training data and takes observations up to this limit. -# - ``fit_length``: the length of data where fitted values are calculated. -# Even if the training data is not the entire period, the fitted values can still be calculated -# on the entire period. The default will be the same as the training length. -# - ``agg_freq``: the aggregation frequency in string representation. -# For example, "D", "H", etc. If not specified, the original frequency will be kept. -# - ``agg_func``: the aggregation function name, default is ``"nanmean"``. -# - ``model_template``: the model template name. This together with the ``model_components`` below -# specify the full model, just as when using the Simple Silverkite model. -# - ``model_components``: the model components. This together with the ``model_template`` above -# specify the full model for a stage, just as when using the Simple Silverkite model. 
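-#
-# To make the aggregation parameters concrete: conceptually, the resampling step
-# for a stage behaves roughly like the following pandas operation (a simplified
-# sketch for illustration only, not the estimator's internal implementation):
-#
-# .. code-block:: python
-#
-#     import numpy as np
-#     import pandas as pd
-#
-#     # hypothetical hourly series
-#     hourly = pd.DataFrame(
-#         {"y": np.arange(48.0)},
-#         index=pd.date_range("2020-01-01", periods=48, freq="H"))
-#     # agg_freq="D" with agg_func="nanmean" roughly corresponds to
-#     daily = hourly.resample("D").apply(np.nanmean)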
-#
-# ``SilverkiteMultistageTemplateConfig`` represents the flow of each stage of the model:
-# it takes the time series (or the residuals from the previous stages),
-# takes the appropriate length of training data, does an optional aggregation,
-# and then trains the model with the given parameters.
-# Now let's define the ``SilverkiteMultistageTemplateConfig`` objects one by one.
-
-# the daily model
-daily_config = SilverkiteMultistageTemplateConfig(
-    train_length="730D",  # use 2 years of data to train
-    fit_length=None,  # fit on the same period as training
-    agg_func="nanmean",  # aggregation function is nanmean
-    agg_freq="D",  # aggregation frequency is daily
-    model_template=ModelTemplateEnum.SILVERKITE.name,  # the model template
-    model_components=daily_model_components  # the daily model components specified above
-)
-
-# the hourly model
-hourly_config = SilverkiteMultistageTemplateConfig(
-    train_length="30D",  # use 30 days of data to train
-    fit_length=None,  # fit on the same period as training
-    agg_func="nanmean",  # aggregation function is nanmean
-    agg_freq=None,  # None means no aggregation
-    model_template=ModelTemplateEnum.SILVERKITE.name,  # the model template
-    model_components=hourly_model_components  # the hourly model components specified above
-)
-
-# %%
-# The configurations simply go to ``ModelComponentsParam.custom["silverkite_multistage_configs"]``
-# as a list. We can specify the model components for Silverkite Multistage as below.
-# Note that all keys other than ``"custom"`` and ``"uncertainty"`` will be ignored.
-
-model_components = ModelComponentsParam(
-    custom=dict(
-        silverkite_multistage_configs=[daily_config, hourly_config]
-    ),
-    uncertainty=dict()
-)
-
-# %%
-# Now we can fill in other parameters needed by
-# `~greykite.framework.templates.autogen.forecast_config.ForecastConfig`.
-
-# metadata
-metadata = MetadataParam(
-    time_col="ts",
-    value_col="y",
-    freq="H"  # the frequency should match the original data frequency
-)
-
-# evaluation period
-evaluation_period = EvaluationPeriodParam(
-    cv_max_splits=0,  # turn off cv to speed up the run
-    test_horizon=0,  # turn off test to speed up the run
-)
-
-# forecast config
-config = ForecastConfig(
-    model_template=ModelTemplateEnum.SILVERKITE_MULTISTAGE_EMPTY.name,
-    forecast_horizon=24,  # forecast 1 day ahead
-    coverage=0.95,  # prediction interval is supported
-    metadata_param=metadata,
-    model_components_param=model_components,
-    evaluation_period_param=evaluation_period
-)
-forecaster = Forecaster()
-forecast_result = forecaster.run_forecast_config(
-    df=ts.df,
-    config=config
-)
-
-print(forecast_result.forecast.df_test.head())
-
-# plot the predictions
-fig = forecast_result.forecast.plot()
-# interactive plot, click to zoom in
-plotly.io.show(fig)
-
-# %%
-# This model is about 3X faster than training Silverkite on the entire hourly data
-# (23.5 seconds vs 79.4 seconds).
-# If speed is a concern due to high-frequency data with a long history,
-# Silverkite Multistage is worth trying.
-#
-# .. note::
-#   The order of specifying the ``SilverkiteMultistageTemplateConfig`` objects
-#   does not matter. The models will be automatically sorted with respect to
-#   ``train_length`` from long to short. This is to ensure that we have enough
-#   residuals from the previous model when we fit the next model.
-#
-# .. note::
-#   The estimator expects different stage models to have different aggregation
-#   frequencies. If two stages have the same aggregation frequency, an error will
-#   be raised.
-#
-# .. note::
-#   Since the models in each stage may not be fit on the entire training data,
-#   there could be periods at the beginning of the training period where
-#   fitted values are not calculated.
-#   These NA fitted values are ignored when computing evaluation metrics on the training set.
-#
-# The uncertainty configuration
-# -----------------------------
-#
-# If you would like to include uncertainty intervals,
-# you can specify the ``"uncertainty"`` parameter in model components.
-#
-# The ``"uncertainty"`` key in ``ModelComponentsParam`` takes one key:
-# ``"uncertainty_dict"``, which is a dictionary taking the following keys:
-#
-# - ``"uncertainty_method"``: a string representing the uncertainty method,
-#   for example, ``"simple_conditional_residuals"``.
-# - ``"params"``: a dictionary of additional parameters needed by the uncertainty method.
-#
-# Now let's specify an uncertainty configuration via the
-# ``uncertainty_dict`` parameter, using the ``"simple_conditional_residuals"`` method.
-
-# specifies the ``uncertainty`` parameter
-uncertainty = dict(
-    uncertainty_dict=dict(
-        uncertainty_method="simple_conditional_residuals",
-        params=dict(
-            conditional_cols=["dow"]  # conditioning on day of week
-        )
-    )
-)
-
-# adds to the ``ModelComponentsParam``
-model_components = ModelComponentsParam(
-    custom=dict(
-        silverkite_multistage_configs=[daily_config, hourly_config]
-    ),
-    uncertainty=uncertainty
-)
-
-# %%
-# The Silverkite Multistage model templates
-# -----------------------------------------
-#
-# In the example above we have seen a model template named
-# ``SILVERKITE_MULTISTAGE_EMPTY``. This is an empty template
-# that must be used with specified model components.
-# Any model components (``silverkite_multistage_configs``) specified
-# will be exactly the model parameters to be used.
-# `Model templates <./0200_templates.html>`_ explains how model templates
-# work and how they are overridden by model components.
-#
-# The Silverkite Multistage model also comes with the following model template:
-#
-# - ``SILVERKITE_TWO_STAGE``: a two-stage model similar to the model we present above.
-#   The first stage is a daily model trained on 56 * 7 days of data learning the long-term effects,
-#   including yearly/quarterly/monthly/weekly seasonality, holidays, etc. The second stage is
-#   a short-term model in the original data frequency learning the daily seasonality
-#   and autoregression effects. Both stages' ``model_templates`` are ``SILVERKITE``.
-#   Note that this template assumes the data to be sub-daily.
-#
-# When you choose to use the Silverkite Multistage model templates,
-# you can override default values by specifying the model components.
-# The overriding in Silverkite Multistage works as follows:
-#
-# - For each ``SilverkiteMultistageTemplateConfig`` that is overridden, there are two situations.
-#
-#   If the customized ``model_template`` is the same as the ``model_template`` in the default model,
-#   for example, both are ``SILVERKITE``, then the customized ``model_components``
-#   in the ``SilverkiteMultistageTemplateConfig`` will be used to override the
-#   ``model_components`` in the default ``SilverkiteMultistageTemplateConfig``,
-#   as overriding is done in the Silverkite template.
-#
-#   If the model templates are different, say ``SILVERKITE`` in the default and ``SILVERKITE_EMPTY``
-#   in the customized config, then both the new ``model_template`` and the entire new ``model_components``
-#   will be used to replace the original ``model_template`` and ``model_components`` in the default model.
-#
-#   In both cases, the ``train_length``, ``fit_length``, ``agg_func`` and ``agg_freq`` will be overridden.
-#
-# For example, in ``SILVERKITE_TWO_STAGE``, the default templates for both stages are ``SILVERKITE``.
-# Consider the following example.
-
-model_template = "SILVERKITE_TWO_STAGE"
-model_components_override = ModelComponentsParam(
-    custom=dict(
-        silverkite_multistage_configs=[
-            SilverkiteMultistageTemplateConfig(
-                train_length="730D",
-                fit_length=None,
-                agg_func="nanmean",
-                agg_freq="D",
-                model_template=ModelTemplateEnum.SILVERKITE.name,
-                model_components=ModelComponentsParam(
-                    seasonality=dict(
-                        weekly_seasonality=7
-                    )
-                )
-            ),
-            SilverkiteMultistageTemplateConfig(
-                train_length="30D",
-                fit_length=None,
-                agg_func="nanmean",
-                agg_freq=None,
-                model_template=ModelTemplateEnum.SILVERKITE_EMPTY.name,
-                model_components=ModelComponentsParam(
-                    seasonality=dict(
-                        daily_seasonality=10
-                    )
-                )
-            )
-        ]
-    )
-)
-
-
-# %%
-# The first model has the same model template ``SILVERKITE`` as the default model template,
-# so in its ``model_components``, only the weekly seasonality parameter will be used to override
-# the default weekly seasonality in the ``SILVERKITE`` model template.
-# The second model has a different model template ``SILVERKITE_EMPTY``,
-# so it will use exactly the model template and model components specified in
-# the customized parameters.
-#
-# This design maximizes the flexibility to override an existing Silverkite Multistage model template.
-# However, if you know exactly what your configuration will be for each stage of the model,
-# we suggest using ``SILVERKITE_MULTISTAGE_EMPTY`` and specifying your own configurations.
-#
-# .. note::
-#   If the customized model components contain fewer models than the model template provides
-#   (for example, only one stage is customized when using ``SILVERKITE_TWO_STAGE``),
-#   the customized ``SilverkiteMultistageTemplateConfig`` will
-#   be used to override the first model in ``SILVERKITE_TWO_STAGE``,
-#   and the second model in ``SILVERKITE_TWO_STAGE`` will be appended after the overridden first model.
-#   Conversely, if the number of customized models is 3, the extra customized model will be appended to the
-#   end of the 2 models in ``SILVERKITE_TWO_STAGE``.
-#
-# Grid search
-# -----------
-#
-# See `Grid search <../quickstart/0500_grid_search.html>`_ for an introduction to the grid search functionality
-# of Greykite. Grid search is also supported in Silverkite Multistage.
-# Specifying grid search is similar to the Simple Silverkite model:
-# specify the grids in each stage of the model's ``model_components``.
-# The Silverkite Multistage model will automatically recognize the grids
-# and formulate the full grids across all models.
-# This design keeps the behavior the same as using grid search in Silverkite models.
-#
-# For example, the following model components specify two stages of models.
-# The first model has a grid on weekly seasonality with candidates 3 and 5.
-# The second model has a grid on daily seasonality with candidates 10 and 12.
-# The Silverkite Multistage model will automatically combine the grids from the two models, -# and generate a grid of size 4. - -model_components_grid = ModelComponentsParam( - custom=dict( - silverkite_multistage_configs=[ - SilverkiteMultistageTemplateConfig( - train_length="730D", - fit_length=None, - agg_func="nanmean", - agg_freq="D", - model_template=ModelTemplateEnum.SILVERKITE.name, - model_components=ModelComponentsParam( - seasonality=dict( - weekly_seasonality=[3, 5] - ) - ) - ), - SilverkiteMultistageTemplateConfig( - train_length="30D", - fit_length=None, - agg_func="nanmean", - agg_freq=None, - model_template=ModelTemplateEnum.SILVERKITE.name, - model_components=ModelComponentsParam( - seasonality=dict( - daily_seasonality=[10, 12] - ) - ) - ) - ] - ) -) diff --git a/greykite/algo/forecast/silverkite/silverkite_diagnostics.py b/greykite/algo/forecast/silverkite/silverkite_diagnostics.py deleted file mode 100644 index c00cfe1..0000000 --- a/greykite/algo/forecast/silverkite/silverkite_diagnostics.py +++ /dev/null @@ -1,406 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# original author: Sayan Patra -"""Silverkite plotting functions.""" -import warnings -from typing import Type - -import numpy as np -import pandas as pd -from plotly import graph_objects as go -from plotly.subplots import make_subplots - -from greykite.algo.changepoint.adalasso.changepoints_utils import get_trend_changepoint_dates_from_cols -from greykite.algo.common.model_summary import ModelSummary -from greykite.algo.forecast.silverkite.constants.silverkite_component import SilverkiteComponentsEnum -from greykite.algo.forecast.silverkite.constants.silverkite_component import SilverkiteComponentsEnumMixin -from greykite.algo.forecast.silverkite.constants.silverkite_constant import default_silverkite_constant -from greykite.common import constants as cst -from greykite.common.python_utils import get_pattern_cols -from greykite.common.viz.timeseries_plotting import add_groupby_column -from greykite.common.viz.timeseries_plotting import grouping_evaluation - - -class SilverkiteDiagnostics: - """Provides various plotting functions for the model generated by the Silverkite forecast algorithms. - - Attributes - ---------- - _silverkite_components_enum : Type[SilverkiteComponentsEnum] - The constants for plotting the silverkite components. - model_dict : `dict` or None - A dict with fitted model and its attributes. - The output of `~greykite.algo.forecast.silverkite.forecast_silverkite.SilverkiteForecast.forecast`. - pred_category : `dict` or None - A dictionary with keys being the predictor category and - values being the predictors belonging to the category. - For details, see - `~greykite.sklearn.estimator.base_silverkite_estimator.BaseSilverkiteEstimator.pred_category`. 
- time_col : str - Name of input data time column - value_col : str - Name of input data value column - components : `dict` or None - Components of the ``SilverkiteEstimator`` model. Set by ``self.plot_components``. - For details about the possible key values see - `~greykite.algo.forecast.silverkite.silverkite_diagnostics.SilverkiteDiagnostics.get_silverkite_components`. - Not available for ``random forest`` and ``gradient boosting`` methods and - set to the default value `None`. - model_summary : `class` or `None` - The `~greykite.algo.common.model_summary.ModelSummary` class. - """ - def __init__( - self, - constants: SilverkiteComponentsEnumMixin = default_silverkite_constant): - self._silverkite_components_enum: Type[SilverkiteComponentsEnum] = constants.get_silverkite_components_enum() - self.pred_category = None - self.time_col = None - self.value_col = None - self.components = None - self.model_summary = None - - def set_params(self, pred_category, time_col, value_col): - """ - Set the various params after the model has been created. - - Parameters - ---------- - pred_category : `dict` or None - A dictionary with keys being the predictor category and - values being the predictors belonging to the category. - For details, see `~greykite.sklearn.estimator.base_silverkite_estimator.BaseSilverkiteEstimator.pred_category`. - time_col: `str` - Time column name in the data frame. - value_col: `str` - Value column name in the data frame. - """ - self.pred_category = pred_category - self.time_col = time_col - self.value_col = value_col - - def summary(self, model_dict, max_colwidth=20) -> ModelSummary: - """Creates the model summary for the given model - - Parameters - ---------- - model_dict : `dict` or None - A dict with fitted model and its attributes. - max_colwidth : `int` - The maximum length for predictors to be shown in their original name. - If the maximum length of predictors exceeds this parameter, all - predictors name will be suppressed and only indices are shown. - - Returns - ------- - model_summary: `ModelSummary` - The model summary for this model. See `~greykite.algo.common.model_summary.ModelSummary` - """ - - if model_dict is not None: - # tree models do not have beta - self.model_summary = ModelSummary( - x=model_dict["x_mat"].values, - y=model_dict["y"].values, - pred_cols=list(model_dict["x_mat"].columns), - pred_category=self.pred_category, - fit_algorithm=model_dict["fit_algorithm"], - ml_model=model_dict["ml_model"], - max_colwidth=max_colwidth) - else: - self.model_summary = None - return self.model_summary - - def plot_components(self, model_dict, names=None, title=None): - """Class method to plot the components of a ``Silverkite`` model on the dataset passed to ``fit``. - - Parameters - ---------- - model_dict : `dict` or None - A dict with fitted model and its attributes. - names: `list` [`str`], default `None` - Names of the components to be plotted e.g. names = ["trend", "DAILY_SEASONALITY"]. - See `~greykite.algo.forecast.silverkite.silverkite_diagnostics.get_silverkite_components` - for the full list of valid names. - If `None`, all the available components are plotted. - title: `str`, optional, default `None` - Title of the plot. If `None`, default title is "Component plot". - - Returns - ------- - fig: `plotly.graph_objects.Figure` - Figure plotting components against appropriate time scale. 
- """ - if model_dict is None: - raise NotImplementedError("Call `self.set_params` before calling `plot_components`.") - - # recomputes `self.components` every time in case model was refit - if not hasattr(model_dict["ml_model"], "coef_"): - raise NotImplementedError("Component plot has only been implemented for additive linear models.") - else: - # Computes components for the training observations used to fit the model. - # Observations with NAs that are dropped when fitting are not included. - feature_df = ( - model_dict["ml_model"].coef_ - * model_dict["x_mat"]) - self.components = self.get_silverkite_components( - model_dict["df_dropna"], - self.time_col, - self.value_col, - feature_df) - - return self.plot_silverkite_components( - components=self.components, - names=names, - title=title) - - def get_silverkite_components( - self, - df, - time_col, - value_col, - feature_df): - """Compute the components of a ``Silverkite`` model. - - Notes - ----- - This function signature is chosen this way so that an user using `forecast_silverkite` can also use - this function, without any changes to the `forecast_silverkite` function. User can compute `feature_df` - as follows. Here `model_dict` is the output of `forecast_silverkite`. - feature_df = model_dict["mod"].coef_ * model_dict["design_mat"] - - The function aggregates components based on the column names of `feature_df`. - `feature_df` is defined as the patsy design matrix built by `design_mat_from_formula` - multiplied by the corresponding coefficients, estimated by the silverkite model. - - - ``cst.TREND_REGEX``: Used to identify `feature_df` columns corresponding to trend. - See `greykite.common.features.timeseries_features.get_changepoint_features` for details - about changepoint column names. - - ``cst.SEASONALITY_REGEX``: Used to identify `feature_df` columns corresponding to seasonality. - This means to get correct seasonalities, the user needs to provide seas_names. - See `greykite.common.features.timeseries_features.get_fourier_col_name` for details - about seasonality column names. - - ``cst.EVENT_REGEX``: Used to identify `feature_df` columns corresponding to events such as holidays. - See `~greykite.common.features.timeseries_features.add_daily_events` for details - about event column names. - - Parameters - ---------- - df : `pandas.DataFrame` - A dataframe containing `time_col`, `value_col` and `regressors`. - time_col : `str` - The name of the time column in ``df``. - value_col : `str` - The name of the value column in ``df``. - feature_df : `pandas.DataFrame` - A dataframe containing feature columns and values. - - Returns - ------- - components : `pandas.DataFrame` - Contains the components of the model. Same number of rows as `df`. Possible columns are - - - `"time_col"`: same as input ``time_col``. - - `"value_col"`: same as input ``value_col``. - - `"trend"`: column containing the trend. - - `"DAILY_SEASONALITY"`: column containing daily seasonality. - - `"WEEKLY_SEASONALITY"`: column containing weekly seasonality. - - `"MONTHLY_SEASONALITY"`: column containing monthly seasonality. - - `"QUARTERLY_SEASONALITY"`: column containing quarterly seasonality. - - `"YEARLY_SEASONALITY"`: column containing yearly seasonality. - - `"events"`: column containing events e.g. holidays effect. 
- - """ - if feature_df is None or feature_df.empty: - raise ValueError("feature_df must be non-empty") - - if df.shape[0] != feature_df.shape[0]: - raise ValueError("df and feature_df must have same number of rows.") - - feature_cols = feature_df.columns - components = df[[time_col, value_col]] - - # get trend (this includes interaction terms) - trend_cols = get_pattern_cols(feature_cols, cst.TREND_REGEX, cst.SEASONALITY_REGEX) - if trend_cols: - components["trend"] = feature_df[trend_cols].sum(axis=1) - - # get seasonalities - seas_cols = get_pattern_cols(feature_cols, cst.SEASONALITY_REGEX) - seas_components_dict = self._silverkite_components_enum.__dict__["_member_names_"].copy() - for seas in seas_components_dict: - seas_pattern = self._silverkite_components_enum[seas].value.ylabel - seas_pattern_cols = get_pattern_cols(seas_cols, seas_pattern) - if seas_pattern_cols: - components[seas] = feature_df[seas_pattern_cols].sum(axis=1) - - # get events (holidays for now) - event_cols = get_pattern_cols(feature_cols, cst.EVENT_REGEX) - if event_cols: - components["events"] = feature_df[event_cols].sum(axis=1) - - # get trend changepoints - # keep this column as the last column of the df - if trend_cols: - changepoint_dates = get_trend_changepoint_dates_from_cols(trend_cols=trend_cols) - if changepoint_dates: - ts = pd.to_datetime(components[time_col]) - changepoints = [1 if t in changepoint_dates else 0 for t in ts] - components["trend_changepoints"] = changepoints - - return components - - def group_silverkite_seas_components(self, df): - """Groups and renames``Silverkite`` seasonalities. - - Parameters - ---------- - df: `pandas.DataFrame` - DataFrame containing two columns: - - - ``time_col``: Timestamps of the original timeseries. - - ``seas``: A seasonality component. It must match a component name from the - `~greykite.algo.forecast.silverkite.constants.silverkite_component.SilverkiteComponentsEnum`. - - Returns - ------- - `pandas.DataFrame` - DataFrame grouped by the time feature corresponding to the seasonality - and renamed as defined in - `~greykite.algo.forecast.silverkite.constants.silverkite_component.SilverkiteComponentsEnum`. - """ - time_col, seas = df.columns - groupby_time_feature = self._silverkite_components_enum[seas].value.groupby_time_feature - xlabel = self._silverkite_components_enum[seas].value.xlabel - ylabel = self._silverkite_components_enum[seas].value.ylabel - - def grouping_func(grp): - return np.nanmean(grp[seas]) - - result = add_groupby_column( - df=df, - time_col=time_col, - groupby_time_feature=groupby_time_feature) - grouped_df = grouping_evaluation( - df=result["df"], - groupby_col=result["groupby_col"], - grouping_func=grouping_func, - grouping_func_name=ylabel) - grouped_df.rename({result["groupby_col"]: xlabel}, axis=1, inplace=True) - return grouped_df - - def plot_silverkite_components( - self, - components, - names=None, - title=None): - """Plot the components of a ``Silverkite`` model. - - Parameters - ---------- - components : `pandas.DataFrame` - A dataframe containing the components of a silverkite model, similar to the output - of `~greykite.algo.forecast.silverkite.silverkite_diagnostics.get_silverkite_components`. - names: `list` [`str`], optional, default `None` - Names of the components to be plotted e.g. names = ["trend", "DAILY_SEASONALITY"]. - See `~greykite.algo.forecast.silverkite.silverkite_diagnostics.get_silverkite_components` - for the full list of valid names. - If `None`, all the available components are plotted. 
- title: `str`, optional, default `None` - Title of the plot. If `None`, default title is "Component plot". - - Returns - ------- - fig: `plotly.graph_objects.Figure` - Figure plotting components against appropriate time scale. - - Notes - ----- - If names in `None`, all the available components are plotted. - ``value_col`` is always plotted in the first panel, as long as there is a match between - given ``names`` list and ``components.columns``. - - See Also - -------- - `~greykite.algo.forecast.silverkite.silverkite_diagnostics.get_silverkite_components` - """ - - time_col, value_col = components.columns[:2] - if "trend_changepoints" in components.columns: - trend_changepoints = components[time_col].loc[components["trend_changepoints"] == 1].tolist() - components = components.drop("trend_changepoints", axis=1) - else: - trend_changepoints = None - if names is None: - names_kept = list(components.columns)[1:] # do not include time_col - else: - # loops over components.columns to maintain the order of the components - names_kept = [component for component in list(components.columns) if component in names] - names_removed = set(names) - set(components.columns) - - if not names_kept: - raise ValueError("None of the provided components have been specified in the model.") - elif names_removed: - warnings.warn(f"The following components have not been specified in the model: " - f"{names_removed}, plotting the rest.") - if names_kept[0] != value_col: - names_kept.insert(0, value_col) - - num_rows = len(names_kept) - fig = make_subplots(rows=num_rows, cols=1, vertical_spacing=0.35 / num_rows) - if title is None: - title = "Component plots" - fig.update_layout(dict(showlegend=True, title=title, title_x=0.5, height=350 * num_rows)) - - for ind, name in enumerate(names_kept): - df = components[[time_col, name]] - if "SEASONALITY" in name: - df = self.group_silverkite_seas_components(df) - - xlabel, ylabel = df.columns - row = ind + 1 - fig.append_trace(go.Scatter( - x=df[xlabel], - y=df[ylabel], - name=name, - mode="lines", - opacity=0.8, - showlegend=False - ), row=row, col=1) - - # `showline = True` shows a line only along the axes. i.e. for xaxis it will line the bottom - # of the image, but not top. Adding `mirror = True` also adds the line to the top. - fig.update_xaxes(title_text=xlabel, showline=True, mirror=True, row=row, col=1) - fig.update_yaxes(title_text=ylabel, showline=True, mirror=True, row=row, col=1) - - # plot trend change points - if trend_changepoints is not None and "trend" in names_kept: - for i, cp in enumerate(trend_changepoints): - show_legend = (i == 0) - fig.append_trace( - go.Scatter( - name="trend change point", - mode="lines", - x=[cp, cp], - y=[components["trend"].min(), components["trend"].max()], - line=go.scatter.Line( - color="#F44336", # red 500 - width=1.5, - dash="dash"), - showlegend=show_legend), - row=names_kept.index("trend") + 1, - col=1) - - return fig diff --git a/greykite/framework/templates/silverkite_multistage_template.py b/greykite/framework/templates/silverkite_multistage_template.py deleted file mode 100644 index 1abcacb..0000000 --- a/greykite/framework/templates/silverkite_multistage_template.py +++ /dev/null @@ -1,571 +0,0 @@ -# BSD 2-CLAUSE LICENSE - -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: - -# Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. 
-# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# original author: Kaixu Yang - - -from itertools import product -from typing import Dict -from typing import List -from typing import Type - -import pandas as pd - -from greykite.common import constants as cst -from greykite.common.logging import LoggingLevelEnum -from greykite.common.logging import log_message -from greykite.common.python_utils import unique_in_list -from greykite.common.python_utils import update_dictionaries -from greykite.framework.templates.autogen.forecast_config import ForecastConfig -from greykite.framework.templates.autogen.forecast_config import MetadataParam -from greykite.framework.templates.base_template import BaseTemplate -from greykite.framework.templates.silverkite_multistage_template_config import SilverkiteMultistageTemplateConfig -from greykite.framework.templates.silverkite_multistage_template_config import SilverkiteMultistageTemplateConstants -from greykite.sklearn.estimator.base_forecast_estimator import BaseForecastEstimator -from greykite.sklearn.estimator.silverkite_multistage_estimator import SilverkiteMultistageEstimator -from greykite.sklearn.estimator.silverkite_multistage_estimator import SilverkiteMultistageModelConfig - - -class SilverkiteMultistageTemplate(BaseTemplate): - """The model template for Silverkite Multistage Estimator.""" - - DEFAULT_MODEL_TEMPLATE = "SILVERKITE_TWO_STAGE" - - def __init__( - self, - constants: SilverkiteMultistageTemplateConstants = SilverkiteMultistageTemplateConstants, - # The parameters here don't matter. They are set for compatibility. - estimator: BaseForecastEstimator = SilverkiteMultistageEstimator( - forecast_horizon=1, - model_configs=[] - )): - """The init function. - - The estimator parameters in init is just for compatibility. - It does not affect the results. - """ - super().__init__(estimator=estimator) - self._constants = constants() - - @property - def constants(self) -> SilverkiteMultistageTemplateConstants: - """Constants used by the template class. Includes the model templates and their default values. - """ - return self._constants - - def __get_regressor_templates(self): - """Gets the model templates for each sub-model. - - These templates are for ``self.get_regressor_cols`` and ``self.get_lagged_regressor_info`` - to use to extract those information from each single model. - - Returns - ------- - templates : `list` [`~greykite.framework.templates.base_template.BaseTemplate`] - A list of model template class instances. 
- """ - if self.config.model_components_param.custom is None: - return None - silverkite_multistage_configs = self.config.model_components_param.custom.get( - "silverkite_multistage_configs", None) - if silverkite_multistage_configs is None or silverkite_multistage_configs == []: - return [] - if isinstance(silverkite_multistage_configs, SilverkiteMultistageTemplateConfig): - silverkite_multistage_configs = [silverkite_multistage_configs] - templates = [] - for config in silverkite_multistage_configs: - template = self.__get_template_class(ForecastConfig(model_template=config.model_template))() - template.df = self.df - template.config = ForecastConfig( - model_template=config.model_template, - model_components_param=config.model_components) - templates.append(template) - return templates - - def get_regressor_cols(self): - """Gets the regressor columns in the model. - - Iterates over each submodel to extract the regressor columns. - - Returns - ------- - regressor_cols : `list` [`str`] - A list of the regressor column names used in any of the submodels. - """ - templates = self.__get_regressor_templates() - regressor_cols = [] - if templates is None or templates == []: - return [] - for template in templates: - try: - regressors = template.get_regressor_cols() - except AttributeError: - continue - regressor_cols += regressors if regressors is not None else [] - return unique_in_list( - array=regressor_cols, - ignored_elements=(None,)) - - def get_lagged_regressor_info(self): - """Gets the lagged regressor info for the model - - Iterates over each submodel to extract the lagged regressor info. - - Returns - ------- - lagged_regressor_info : `dict` - The combined lagged regressor info from all submodels. - """ - templates = self.__get_regressor_templates() - lagged_regressor_info = { - "lagged_regressor_cols": None, - "overall_min_lag_order": None, - "overall_max_lag_order": None - } - if templates is None or templates == []: - return lagged_regressor_info - for template in templates: - try: - info = template.get_lagged_regressor_info() - except AttributeError: - continue - # Combines the ``lagged_regressor_info`` from each model. - cols = info["lagged_regressor_cols"] - min_order = info["overall_min_lag_order"] - max_order = info["overall_max_lag_order"] - if lagged_regressor_info["lagged_regressor_cols"] is None: - lagged_regressor_info["lagged_regressor_cols"] = cols - elif cols is not None: - lagged_regressor_info["lagged_regressor_cols"] += cols - if lagged_regressor_info["overall_min_lag_order"] is None: - lagged_regressor_info["overall_min_lag_order"] = min_order - elif min_order is not None: - lagged_regressor_info["overall_min_lag_order"] = min( - lagged_regressor_info["overall_min_lag_order"], min_order) - if lagged_regressor_info["overall_max_lag_order"] is None: - lagged_regressor_info["overall_max_lag_order"] = min_order - elif min_order is not None: - lagged_regressor_info["overall_max_lag_order"] = max( - lagged_regressor_info["overall_max_lag_order"], max_order) - - return lagged_regressor_info - - def get_hyperparameter_grid(self): - """Gets the hyperparameter grid for the Silverkite Multistage Model. - - Returns - ------- - hyperparameter_grid : `dict` [`str`, `list` [`any`]] or `list` [ `dict` [`str`, `list` [`any`]] ] - hyperparameter_grid for grid search in - :func:`~greykite.framework.pipeline.pipeline.forecast_pipeline`. - The output dictionary values are lists, combined in grid search. 
- """ - if self.config is None: - raise ValueError(f"Forecast config must be provided, but `self.config` is `None`.") - model_template = self.config.model_template - model_components = self.config.model_components_param - - # Gets the model components from model template. - default_model_components = self.__get_default_model_components(model_template) - default_silverkite_multistage_configs = default_model_components.custom.get("silverkite_multistage_configs") - - # Checks if any parameter is specified in fields other than "custom". - not_none_parameters = [] - for key, value in model_components.__dict__.items(): - if value is not None and value != {} and key not in ["custom", "uncertainty"]: - not_none_parameters.append(key) - if not_none_parameters: - log_message( - message=f"Silverkite Multistage template only takes configuration through ``custom`` " - f"and ``uncertainty`` in ``model_components_param``. " - f"The following inputs are ignored \n{not_none_parameters}.", - level=LoggingLevelEnum.WARNING - ) - - # When ``custom`` is not None, we look for the ``silverkite_multistage_configs`` key. - custom = model_components.custom - # Gets the ``silverkite_multistage_configs`` from ``default_silverkite_multistage_configs`` and - # overriden by ``custom["silverkite_multistage_configs"]``. - # If no customized configs, the default configs will be ``new_configs``. - new_configs = self.__get_silverkite_multistage_configs_override( - custom=custom, - model_template=model_template, - default_silverkite_multistage_configs=default_silverkite_multistage_configs) - - # Converts template configs into estimator parameters. - estimator_list, estimator_params_list = self.__get_estimators_and_params_from_template_configs( - new_configs=new_configs - ) - - # Now the estimator parameters may contain grids, i.e., list of parameters from template classes. - # We need to flatten them and wrap them into list of parameters for `SilverkiteMultistageEstimator`. - # The following function call gets the flattened estimator parameters, - # in the format of a list of lists (different sets of parameters) of dictionaries (different stage models). - flattened_dictionaries = self.__flatten_estimator_params_list( - estimator_params_list=estimator_params_list - ) - - # Then we construct the `SilverkiteMultistageEstimator` parameters. - silverkite_multistage_model_configs = [] - for grid in flattened_dictionaries: - list_of_model_configs = [] - for i, config in enumerate(new_configs): - # This is a single ``SilverkiteMultistageModelConfig`` that corresponds to - # a single stage model in a set of configuration. - model_config = SilverkiteMultistageModelConfig( - train_length=config.train_length, - fit_length=config.fit_length, - agg_func=config.agg_func, - agg_freq=config.agg_freq, - estimator=estimator_list[i], - estimator_params=grid[i] - ) - # Appends this single ``SilverkiteMultistageModelConfig`` to get all stage of models. - list_of_model_configs.append(model_config) - # The hyperparameter grid consists of a list of all stage of models for grid search. - # This corresponds to different sets of configurations. - silverkite_multistage_model_configs.append(list_of_model_configs) - # ``freq`` is the data frequency, which is from ``metadata_param``. - freq = self.config.metadata_param.freq if self.config.metadata_param is not None else None - - # Gets the uncertainty parameter. 
- uncertainty = model_components.uncertainty - if uncertainty is not None: - uncertainty_dict = uncertainty.get("uncertainty_dict", None) - else: - uncertainty_dict = None - - # Gets the hyperparameter grid. - silverkite_multistage_hyperparameter_grid = dict( - estimator__forecast_horizon=[self.config.forecast_horizon], - estimator__freq=[freq], - estimator__model_configs=silverkite_multistage_model_configs, - estimator__uncertainty_dict=[uncertainty_dict] - ) - return silverkite_multistage_hyperparameter_grid - - @staticmethod - def __get_silverkite_multistage_configs_override( - custom: Dict[str, any], - model_template: str, - default_silverkite_multistage_configs: List[SilverkiteMultistageTemplateConfig]): - """Gets the overriden Silverkite Multistage configs by ``custom``. - - Parameters - ---------- - custom : `dict` [`str`, any] - The custom dictionary in `ModelComponentsParam`. - The only recognizable key is ``silverkite_multistage_configs``, - which takes a list of - `~greykite.framework.templates.silverkite_multistage_template_config.SilverkiteMultistageTemplateConfig`. - model_template : `str` - The model template used in Silverkite Multistage template. - default_silverkite_multistage_configs : `list` [ - `~greykite.framework.templates.silverkite_multistage_template_config.SilverkiteMultistageTemplateConfig`] - The default silverkite multistage configs from ``model_template``. - - Returns - ------- - new_configs : `list` [ - `~greykite.framework.templates.silverkite_multistage_template_config.SilverkiteMultistageTemplateConfig`] - The silverkite multistage configs overriden by the ``silverkite_multistage_configs`` in ``custom``. - """ - if custom is not None: - # Checks if any parameter is specified in "custom" other than "silverkite_multistage_config". - not_none_parameters = [] - for key, value in custom.items(): - if value is not None and value != [] and key != "silverkite_multistage_configs": - not_none_parameters.append(key) - if not_none_parameters: - log_message( - message=f"Silverkite Multistage template only takes configurations through " - f"``custom.silverkite_multistage_configs``. The following inputs are " - f"ignored \n{not_none_parameters}.", - level=LoggingLevelEnum.WARNING - ) - # Uses ``silverkite_multistage_configs`` to override the default components if it's not None. - silverkite_multistage_configs = custom.get("silverkite_multistage_configs", None) - if ((silverkite_multistage_configs is None or silverkite_multistage_configs == []) - and model_template == "SILVERKITE_MULTISTAGE_EMPTY"): - raise ValueError(f"``SILVERKITE_MULTISTAGE_EMPTY`` can not be used without overriding. " - f"You must provide parameters in " - f"``ModelComponentsParam.custom.silverkite_multistage_configs``.") - if silverkite_multistage_configs is not None: - # Wraps in a list if it's a single ``SilverkiteMultistageTemplateConfig``. - if isinstance(silverkite_multistage_configs, SilverkiteMultistageTemplateConfig): - silverkite_multistage_configs = [silverkite_multistage_configs] - # Must be a list. - if not isinstance(silverkite_multistage_configs, list): - raise ValueError(f"The ``silverkite_multistage_configs`` parameter must be a list of " - f"``SilverkiteMultistageTemplateConfig`` objects, found " - f"\n{silverkite_multistage_configs}.") - # Checks the lengths of default configs and overriding configs. 
- num_configs_in_default = len(default_silverkite_multistage_configs) - num_configs_in_override = len(silverkite_multistage_configs) - extra_configs = [] - if num_configs_in_default != num_configs_in_override: - if num_configs_in_default != 0: - log_message( - message=f"The number of configs in ``ModelComponentsParam`` ({num_configs_in_override}) " - f"does not match the number of configs in the default template " - f"({num_configs_in_default}). Appending extra configs to the end.", - level=LoggingLevelEnum.WARNING - ) - # These configs are extra configs, either from the default or from overriding. - # No matter where they come from, they will be appended to the end. - extra_configs = ( - silverkite_multistage_configs[-(num_configs_in_override - num_configs_in_default):] - if num_configs_in_override >= num_configs_in_default - else default_silverkite_multistage_configs[-(num_configs_in_default - num_configs_in_override):] - ) - - # Overrides the default ``SilverkiteMultistageTemplateConfig`` objects. - num_to_override = min(num_configs_in_default, num_configs_in_override) - new_configs = [] - for i in range(num_to_override): - default_config = default_silverkite_multistage_configs[i] - new_config = silverkite_multistage_configs[i] - # Overrides the original parameters. - keys = ["train_length", "fit_length", "agg_freq", "agg_func"] - for key in keys: - if getattr(new_config, key, None) is not None: - setattr(default_config, key, getattr(new_config, key)) - # For ``model_template`` and ``model_components``, - # both will be overriden if the new ``model_template`` is different - # from the default ``model_template``. However, if both ``model_templates`` - # are the same, the keys/values in the new ``model_components`` will be - # used to override the keys/values in the default ``model_components``, - # instead of replacing the entire default ``model_components`` with the new ``model_components``. - # The consideration here is that if one only specifies partial ``model_components`` and hope - # the rest can be kept as the default, this is the right way to do. If one hopes to - # use only the parameters specified in the new ``model_components`` and do not apply defaults, - # they should have used the ``SILVERKITE_MULTISTAGE_EMPTY`` template, - # and this is also the correct behavior. - if (new_config.model_template != default_config.model_template - or default_config.model_components is None): - for key in ["model_template", "model_components"]: - if getattr(new_config, key, None) is not None: - setattr(default_config, key, getattr(new_config, key)) - else: - for key in new_config.model_components.__dict__.keys(): - allow_unknown_keys = (key == "hyperparameter_override") - updated_value = update_dictionaries( - default_dict=getattr(default_config.model_components, key, {}) or {}, - overwrite_dicts=getattr(new_config.model_components, key), - allow_unknown_keys=allow_unknown_keys) - setattr(default_config.model_components, key, updated_value) - new_configs.append(default_config) - new_configs += extra_configs - else: - # If `ModelComponentsParam.custom["silverkite_multistage_configs]"` is None, - # use the default from template. - new_configs = default_silverkite_multistage_configs - else: - # If `ModelComponentsParam.custom` is None, - # use the default from template. 
- new_configs = default_silverkite_multistage_configs - return new_configs - - def __get_estimators_and_params_from_template_configs( - self, - new_configs: List[SilverkiteMultistageTemplateConfig]): - """Gets the estimators and estimator parameters from ``SilverkiteMultistageTemplateConfig`` objects. - - Parameters - ---------- - new_configs : `list` [ - `~greykite.framework.templates.silverkite_multistage_template_config.SilverkiteMultistageTemplateConfig`] - The silverkite multistage configs overriden by the ``silverkite_multistage_configs`` in ``custom``. - - Returns - ------- - estimators : `list` [`~greykite.sklearn.estimator.base_forecast_estimator.BaseForecastEstimator`] - The estimator classes in each stage. - estimator_params : `list` [`dict` [`str`, any]] - The estimator parameters in each stage. - These parameters are in ``hyperparameter_grid`` format and may contain nested grids. - """ - estimator_list = [] - estimator_params_list = [] - for config in new_configs: - template = self.__get_template_class(ForecastConfig(model_template=config.model_template))() - estimator = template._estimator.__class__ - # It's not common that `self.config.metadata_param` is None, - # but since `get_hyperparameter_grid` is a public method, - # in case people call it directly, we set the value defaults. - if self.config.metadata_param is not None and self.config.metadata_param.time_col is not None: - time_col = self.config.metadata_param.time_col - else: - time_col = cst.TIME_COL - if self.config.metadata_param is not None and self.config.metadata_param.value_col is not None: - value_col = self.config.metadata_param.value_col - else: - value_col = cst.VALUE_COL - date_format = self.config.metadata_param.date_format if self.config.metadata_param is not None else None - # Creates a sample df for the template class to generate hyperparameter grid. - # The ``apply_template_for_pipeline_params`` function does not use any information from ``df`` - # when generating the hyperparameter grid. - sample_df = pd.DataFrame({ - time_col: pd.date_range( - end=pd.to_datetime(self.df[time_col]).max().date(), - periods=100, - freq=config.agg_freq - ), - value_col: 0 - }) - estimator_params_grid = template.apply_template_for_pipeline_params( - df=sample_df, - # Here we ignore the ``forecast_horizon`` parameter. - # Even the wrong ``forecast_horizon`` is inferred for this model, - # the correct ``forecast_horizon`` will be used to override in the estimator's ``fit`` - # method. - config=ForecastConfig( - metadata_param=MetadataParam( - time_col=time_col, - value_col=value_col, - freq=config.agg_freq, - date_format=date_format, - ), - model_template=config.model_template, - model_components_param=config.model_components - ) - )["hyperparameter_grid"] - estimator_list.append(estimator) - estimator_params_list.append(estimator_params_grid) - return estimator_list, estimator_params_list - - @staticmethod - def __flatten_estimator_params_list( - estimator_params_list: List[Dict[str, any]]): - """Flattens the ``estimator_params_list``. - - The ``estimator_params_list`` is from ``self.__get_estimators_and_params_from_template_configs``, - and may contain nested grids within each parameter. - This function flattens it into the format of list of lists of ``estimator_params``. - - For example, the original ``estimator_params_list`` is - - [{"a": [1], "b": [2, 3]}, {"c": [4, 5]}] - - It consists of 2 stages of models. Each stage of model's parameters are in a dictionary. 
- The parameter values are in lists and could have multiple possible values. - - After flattening the ``estimator_params_list``, it becomes - - [[{"a": 1, "b": 2}, {"c": 4}], [{"a": 1, "b": 3}, {"c": 4}], - [{"a": 1, "b": 2}, {"c": 5}], [{"a": 1, "b": 3}, {"c": 5}]] - - There are 2 x 2 = 4 sets of parameters, i.e., 4 sets of ``estimator_params``, - each of which includes two dictionaries which correspond to the two stages of models. - - Parameters - ---------- - estimator_params_list : `list` [`dict` [`str`, any]] - The estimator parameter list in hyperparameter grids. - - Returns - ------- - flattened_estimator_params : `list` [`list` [`dict` [`str`, any]]] - The flattened list of lists of estimator parameter dictionaries. - """ - # Although Python 3.7 keeps the order in dictionary from insertion, - # to be more compatible, we use lists to ensure the keys and values are matched. - # For example, we have - # [{"a": [1], "b": [2, 3]}, {"c": [4, 5]}] - keys = [] - params = [] - for index, dictionary in enumerate(estimator_params_list): - keys.append([]) - params.append([]) - for key, value in dictionary.items(): - # ``time_properties`` are automatically inferred from the other parameters. - if "estimator__" in key and key != "estimator__time_properties": - keys[index].append(key.split("__")[1]) - params[index].append(value) - # Here we get a list of flattened values. - # [((1, 2), (4)), ((1, 3), (4)), ((1, 2), (5)), ((1, 3), (5))] - # The inner product gets all cross products for the value combinations within a stage. - # The outer product gets all cross products for the value combinations across stages. - flattened_params = list(product(*[list(product(*param)) for param in params])) - # Then we map the flattened parameters with their keys and flatten them. - # [[{"a": 1, "b": 2}, {"c": 4}], [{"a": 1, "b": 3}, {"c": 4}], - # [{"a": 1, "b": 2}, {"c": 5}], [{"a": 1, "b": 3}, {"c": 5}]] - flattened_dictionaries = [ - [ - {key: value for (key, value) in zip(subkeys, subvalues)} - for subkeys, subvalues in zip(keys, single_value) - ] - for single_value in flattened_params - ] - return flattened_dictionaries - - def __get_default_model_components( - self, - template: str): - """Gets the default model components from a model template name. - - Parameters - ---------- - template : `str` - The model template name. - - Returns - ------- - template : `~greykite.framework.templates.base_template.BaseTemplate` - The model template class. - """ - try: - template = getattr(self._constants, template) - except (AttributeError, TypeError): - raise ValueError(f"The template name {template} is not recognized!") - return template - - @property - def allow_model_template_list(self) -> bool: - return False - - @property - def allow_model_components_param_list(self) -> bool: - return False - - def __get_template_class(self, config: ForecastConfig = None) -> Type[BaseTemplate]: - """Extracts template class (e.g. `SimpleSilverkiteTemplate`) from the config. - Currently only supports single templates in - `~greykite.framework.templates.model_templates.ModelTemplateEnum`. - - Parameters - ---------- - config : :class:`~greykite.framework.templates.model_templates.ForecastConfig` or None - Config object for template class to use. - See :class:`~greykite.framework.templates.model_templates.ForecastConfig`. - - Returns - ------- - template_class : Type[`~greykite.framework.templates.base_template.BaseTemplate`] - An implementation of `~greykite.framework.templates.template_interface.TemplateInterface`. 
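As an aside, the cross-product flattening that ``__flatten_estimator_params_list`` performs can be reproduced with a small standalone helper. The function name below is hypothetical, and the ``estimator__`` prefix filtering from the original code is omitted for brevity; this is a sketch of the same idea, not the library's implementation.

.. code-block:: python

    from itertools import product

    def flatten_estimator_params(estimator_params_list):
        """Expands per-stage grids into all cross-stage parameter combinations.

        Each element of ``estimator_params_list`` is a dict for one stage,
        mapping a parameter name to a list of candidate values.
        """
        keys = [list(d.keys()) for d in estimator_params_list]
        values = [list(d.values()) for d in estimator_params_list]
        # Inner ``product``: all value combinations within one stage.
        # Outer ``product``: all combinations across stages.
        flattened = list(product(*[list(product(*v)) for v in values]))
        return [
            [dict(zip(stage_keys, stage_values))
             for stage_keys, stage_values in zip(keys, combo)]
            for combo in flattened
        ]

    grids = [{"a": [1], "b": [2, 3]}, {"c": [4, 5]}]
    print(flatten_estimator_params(grids))
    # 2 x 2 = 4 parameter sets (ordering follows ``itertools.product``):
    # [[{'a': 1, 'b': 2}, {'c': 4}], [{'a': 1, 'b': 2}, {'c': 5}],
    #  [{'a': 1, 'b': 3}, {'c': 4}], [{'a': 1, 'b': 3}, {'c': 5}]]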
- """ - model_template_enum = self._constants.SilverkiteMultistageModelTemplateEnum - valid_names = list(model_template_enum.__members__.keys()) - if config.model_template not in valid_names: - raise ValueError( - f"Currently Silverkite Multistage only supports a known string of single model template. " - f"Model Template '{config.model_template}' is not recognized! Must be one of: {valid_names}.") - template_class = model_template_enum[config.model_template].value - return template_class diff --git a/greykite/framework/templates/silverkite_multistage_template_config.py b/greykite/framework/templates/silverkite_multistage_template_config.py deleted file mode 100644 index b3c7e48..0000000 --- a/greykite/framework/templates/silverkite_multistage_template_config.py +++ /dev/null @@ -1,275 +0,0 @@ -# BSD 2-CLAUSE LICENSE - -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: - -# Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# original author: Kaixu Yang -"""Provides templates for SilverkiteMultistageEstimator that are pre-tuned to fit -specific use cases. - -These templates are recognized by -`~greykite.framework.templates.silverkite_multistage_model_templates.SilverkiteMultistageModelTemplateEnum`. -""" - -from dataclasses import dataclass -from enum import Enum -from typing import Callable -from typing import Optional -from typing import Type -from typing import Union - -from greykite.common.python_utils import mutable_field -from greykite.framework.templates.auto_arima_template import AutoArimaTemplate -from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam -from greykite.framework.templates.prophet_template import ProphetTemplate -from greykite.framework.templates.silverkite_template import SilverkiteTemplate -from greykite.framework.templates.simple_silverkite_template import SimpleSilverkiteTemplate -from greykite.sklearn.estimator.silverkite_multistage_estimator import AggregationFunctionEnum - - -@dataclass -class SilverkiteMultistageTemplateConfig: - """The dataclass to store SilverkiteMultistage model config for a single model. - - Attributes - ---------- - train_length : `str`, default "392D" - The length of data used for training. For example, "56D". - fit_length : `str` or None, default None - The length of data where fitted values to be calculated. 
- Specify if ``fit_length`` is to be longer than ``train_length``. - agg_func : str or Callable, default - `~greykite.sklearn.estimator.silverkite_multistage_estimator.AggregationFunctionEnum.nanmean.name` - The aggregation function. - agg_freq : `str` or None, default None - The aggregation period. If None, no aggregation will be used. - model_template : `str`, default "SILVERKITE" - The mode template to be used. - model_commponents : `~greykite.framework.templates.autogen.forecast_config.ModelComponentsParam` or None, - default None - The parameters used to override the defaults in ``model_template``. - """ - train_length: str = "392D" # 56 weeks - fit_length: Optional[str] = None - agg_func: Union[str, Callable] = AggregationFunctionEnum.nanmean.name - agg_freq: Optional[str] = None - model_template: str = "SILVERKITE" - model_components: Optional[ModelComponentsParam] = None - - -# Defines the SILVERKITE_TWO_STAGE template here. -SILVERKITE_TWO_STAGE = ModelComponentsParam( - custom=dict( - silverkite_multistage_configs=[ - # Defines the long model. - # A daily model with 56 weeks training length. - # Learns the long-term trend, seasonality, events, etc. - SilverkiteMultistageTemplateConfig( - train_length="392D", # 56 weeks - fit_length=None, - agg_func="nanmean", - agg_freq="D", - model_template="SILVERKITE", - model_components=ModelComponentsParam( - seasonality={ - "yearly_seasonality": 12, - "quarterly_seasonality": 5, - "monthly_seasonality": 5, - "weekly_seasonality": 4, - "daily_seasonality": 0, - }, - growth={ - "growth_term": "linear" - }, - events={ - "holidays_to_model_separately": "auto", - "holiday_lookup_countries": "auto", - "holiday_pre_num_days": 1, - "holiday_post_num_days": 1, - "holiday_pre_post_num_dict": None, - "daily_event_df_dict": None, - }, - changepoints={ - "changepoints_dict": { - "method": "auto", - "resample_freq": "D", - "regularization_strength": 0.5, - "potential_changepoint_distance": "15D", - "no_changepoint_distance_from_end": "30D", - "yearly_seasonality_order": 15, - "yearly_seasonality_change_freq": "365D" - }, - "seasonality_changepoints_dict": None - }, - autoregression={ - "autoreg_dict": "auto" - }, - regressors={ - "regressor_cols": [] - }, - lagged_regressors={ - "lagged_regressor_dict": None - }, - uncertainty={ - "uncertainty_dict": None - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - "feature_sets_enabled": "auto", # "auto" based on data freq and size - "max_daily_seas_interaction_order": 0, - "max_weekly_seas_interaction_order": 2, - "extra_pred_cols": [], - "min_admissible_value": None, - "max_admissible_value": None, - } - ) - ), - # Defines the short model. - # Uses the original frequency with 4 weeks training length. - # Learns daily seasonality with autoregression. 
- SilverkiteMultistageTemplateConfig( - train_length="28D", # 4 weeks - fit_length=None, - agg_func="nanmean", - agg_freq=None, - model_template="SILVERKITE", - model_components=ModelComponentsParam( - seasonality={ - "yearly_seasonality": 0, - "quarterly_seasonality": 0, - "monthly_seasonality": 0, - "weekly_seasonality": 0, - "daily_seasonality": 12, - }, - growth={ - "growth_term": None - }, - events={ - "holidays_to_model_separately": [], - "holiday_lookup_countries": [], - "holiday_pre_num_days": 0, - "holiday_post_num_days": 0, - "holiday_pre_post_num_dict": None, - "daily_event_df_dict": None, - }, - changepoints={ - "changepoints_dict": None, - "seasonality_changepoints_dict": None - }, - autoregression={ - "autoreg_dict": "auto" - }, - regressors={ - "regressor_cols": [] - }, - lagged_regressors={ - "lagged_regressor_dict": None - }, - uncertainty={ - "uncertainty_dict": None - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - "feature_sets_enabled": "auto", # "auto" based on data freq and size - "max_daily_seas_interaction_order": 5, - "max_weekly_seas_interaction_order": 2, - "extra_pred_cols": [], - "min_admissible_value": None, - "max_admissible_value": None, - } - ) - ) - ] - ) -) -"""Two stage model for small frequency data. -The first stage uses 56 weeks data with daily frequency and trains the yearly/monthly/weekly seasonality, -trend, holidays effects. -The second stage uses the last 28 days data to train weekly/daily and autoregression effects. -This template is intended to be used with small granularity data (sub-daily) with long history to capture -long-term effects. The template is usually a few times faster than the full Silverkite model, -but still maintains a high level of accuracy. -The template was originally experimented on 5-minute granularity data and worked well. -""" - - -SILVERKITE_MULTISTAGE_EMPTY = ModelComponentsParam( - custom=dict( - silverkite_multistage_configs=[] - ) -) -"""Empty configuration for Silverkite Multistage. -All parameters will be exactly what user inputs. -Not to be used without overriding. -""" - - -class SilverkiteMultistageModelTemplateEnum(Enum): - """Templates that can be used with the Silverkite Multistage algorithm. - - The Silverkite Multistage algorithm is defined through - `~greykite.framework.templates.silverkite_multistage_template.SilverkiteMultistageTemplate`. - The algorithm includes multiple stages, where each stage can be one of the existing model templates - such as `SimpleSilverkiteTemplate` via "SILVERKITE". - - This Enum enumerates the model templates that are allowed to use in the Silverkite - Multistage algorithm, which include common single model templates defined in - `~greykite.framework.templates.model_templates.ModelTemplateEnum`. 
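For illustration, a caller could override parts of the ``SILVERKITE_TWO_STAGE`` defaults above by supplying a config list of the same length through ``ModelComponentsParam.custom``. Per the override logic earlier in this diff, a stage whose ``model_template`` matches the default has its partial ``model_components`` merged into the default rather than replacing it. The specific values below are illustrative assumptions, not recommended settings.

.. code-block:: python

    from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
    from greykite.framework.templates.silverkite_multistage_template_config import (
        SilverkiteMultistageTemplateConfig)

    # Hypothetical override of the SILVERKITE_TWO_STAGE defaults.
    model_components = ModelComponentsParam(
        custom=dict(
            silverkite_multistage_configs=[
                # Stage 1: keep the daily long-term model but lower the yearly seasonality order.
                # Since the model template matches the default ("SILVERKITE"), this partial
                # ``model_components`` is merged into the default rather than replacing it.
                SilverkiteMultistageTemplateConfig(
                    train_length="392D",
                    agg_freq="D",
                    model_template="SILVERKITE",
                    model_components=ModelComponentsParam(
                        seasonality={"yearly_seasonality": 8})),
                # Stage 2: short model on the original data frequency with a 14-day window.
                SilverkiteMultistageTemplateConfig(
                    train_length="14D",
                    agg_freq=None,
                    model_template="SILVERKITE"),
            ]
        )
    )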
- """ - SILVERKITE = SimpleSilverkiteTemplate - """Default model template for `SimpleSilverkiteTemplate`.""" - SILVERKITE_WITH_AR = SimpleSilverkiteTemplate - """Default model template for `SimpleSilverkiteTemplate` with autoregression.""" - SILVERKITE_EMPTY = SimpleSilverkiteTemplate - """Null model template for `SimpleSilverkiteTemplate`.""" - SK = SilverkiteTemplate - """Default model template for `SilverkiteTemplate`.""" - PROPHET = ProphetTemplate - """Default model template for `ProphetTemplate`.""" - AUTO_ARIMA = AutoArimaTemplate - """Default model template for `AutoArimaTemplate`.""" - - -@dataclass -class SilverkiteMultistageTemplateConstants: - """Constants used by - `~greykite.framework.templates.silverkite_multistage_template.SilverkiteMultistageTemplate`. - Include the model templates and their default values. - """ - SILVERKITE_TWO_STAGE: ModelComponentsParam = mutable_field(SILVERKITE_TWO_STAGE) - """Defines the ``"SILVERKITE_TWO_STAGE"`` template. - Includes a 2-stage model. The first stage uses daily aggregation to learn long term effects. - The second stage uses the original frequency to learn short term effects from the residuals. - """ - SILVERKITE_MULTISTAGE_EMPTY: ModelComponentsParam = mutable_field(SILVERKITE_MULTISTAGE_EMPTY) - """Defines the ``"SILVERKITE_EMPTY"`` template. - The model config is empty. Uses exactly what user chooses to override. - Can not be used without overriding. - """ - SilverkiteMultistageModelTemplateEnum: Type[Enum] = SilverkiteMultistageModelTemplateEnum - """Defines the model templates that are supported by the Silverkite Multistage algorithm. - These are common single model templates defined in - `~greykite.framework.templates.model_templates.ModelTemplateEnum`, - and can be recognized in each stage of models in Silverkite Multistage. - """ diff --git a/greykite/sklearn/estimator/silverkite_multistage_estimator.py b/greykite/sklearn/estimator/silverkite_multistage_estimator.py deleted file mode 100644 index 76813c0..0000000 --- a/greykite/sklearn/estimator/silverkite_multistage_estimator.py +++ /dev/null @@ -1,1065 +0,0 @@ -# BSD 2-CLAUSE LICENSE - -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: - -# Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# original author: Kaixu Yang -"""Silverkite Multistage estimator.""" - -from dataclasses import dataclass -from datetime import timedelta -from enum import Enum -from functools import partial -from typing import Callable -from typing import List -from typing import Optional -from typing import Type -from typing import Union - -import numpy as np -import pandas as pd -from dateutil.relativedelta import relativedelta -from pandas.tseries.frequencies import to_offset -from sklearn.metrics import mean_squared_error - -from greykite.common import constants as cst -from greykite.common.logging import LoggingLevelEnum -from greykite.common.logging import log_message -from greykite.sklearn.estimator.base_forecast_estimator import BaseForecastEstimator -from greykite.sklearn.estimator.simple_silverkite_estimator import SimpleSilverkiteEstimator - - -@dataclass -class SilverkiteMultistageModelConfig: - """The dataclass to store SilverkiteMultistage model config for a single model. - - Attributes - ---------- - train_length : `str`, default "392D" - The length of data used for training. For example, "56D". - fit_length : `str` or None, default None - The length of data where fitted values to be calculated. - Specify if ``fit_length`` is to be longer than ``train_length``. - agg_func : str or Callable, default "nanmean" - The aggregation function. - agg_freq : `str` or None, default None - The aggregation period. If None, no aggregation will be used. - estimator : `~greykite.sklearn.estimator.base_forecast_estimator.BaseForecastEstimator`, - default `~greykite.sklearn.estimator.simple_silverkite_estimator.SimpleSilverkiteEstimator` - The estimator to fit the time series. - estimator_params : `dict` or None, default None - The init parameters for ``estimator``. - When the estimator is in the Silverkite family, the parameters shouldn't include - ``forecast_horizon`` or ``past_df``, as they will be passed automatically. - """ - train_length: str = "392D" # 56 weeks - fit_length: Optional[str] = None - agg_func: Union[str, Callable] = "nanmean" - agg_freq: Optional[str] = None - estimator: Type[BaseForecastEstimator] = SimpleSilverkiteEstimator - estimator_params: Optional[dict] = None - - -class AggregationFunctionEnum(Enum): - """Defines some common aggregation functions that can be retrieved by their names. - - Every function is wrapped with ``partial`` because Enum handles functions differently from values. - Wrapping with ``partial`` allows us to extract the function with variable keys. - """ - mean = partial(np.mean) - median = partial(np.median) - nanmean = partial(np.nanmean) - maximum = partial(np.max) - minimum = partial(np.min) - - -class SilverkiteMultistageEstimator(BaseForecastEstimator): - """The Silverkite Multistage Estimator class. - Implements the Silverkite Multistage algorithm. - - The Silverkite Multistage allows users to fit multiple stages of - models with each stage in the following fashions: - - (1) subseting: take a subset of data from the end of training data; - (2) aggregation: aggregate the subset of data into desired frequency; - (3) training: train a model with the desired estimator and parameters. - - Users can just use one stage model to train on a subset/aggregation of the original data, - or can specify multiple stages, where the later stages will be trained on the fitted - residuals of the previous stages. - - This can significantly speed up the training process if the original data is long - and in fine granularity. 
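The staged fit on residuals described above can be sketched schematically. This is a simplified illustration with placeholder "models" (plain means), random toy data, and assumed column names, not the estimator's actual implementation.

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Schematic only: two stages, where the second stage is fit on the residuals of the first.
    df = pd.DataFrame({
        "ts": pd.date_range("2020-01-01", periods=24 * 56, freq="H"),
        "y": np.random.randn(24 * 56).cumsum(),
    })

    # Stage 1: aggregate to daily frequency and "fit" a coarse model (here just the daily mean).
    daily_fit = df.resample("D", on="ts")["y"].mean()
    stage1 = daily_fit.reindex(df["ts"].dt.floor("D")).to_numpy()  # broadcast back to hourly

    # Stage 2: model the hourly residual (here just the mean residual of the last 28 days).
    residual = df["y"].to_numpy() - stage1
    stage2 = np.full_like(residual, residual[-24 * 28:].mean())

    # The overall fitted values are the sum of the stages' fitted values.
    fitted = stage1 + stage2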
- - Notes - ----- - The following assumptions or special implementations are made in this class: - - - The actual ``fit_length``, the length of data where the fitted values are calculated, - is the longer of ``train_length`` and ``fit_length``. The reason is that there is no - benefit of calculating a shorter period of fitted values. The fitted values are already - available during training (in Silverkite) so there is no loss to calculate fitted values - on a super set of the training data. - - The estimator sorts the ``model_configs`` according to the ``train_length`` in descending order. - The corresponding aggregation frequency, aggregation function, fit length, - estimator and parameters will be sorted accordingly. - This is to ensure that we have enough data to use from the previous model - when we fit the next model. - - When calculating the length of training data, the length of past df, etc, - the actual length used may include 1 more period to avoid missing timestamps. - For example, for an AR order of 5, you may see the length of ``past_df`` to be 6; - or for a train length of "365D", you may see the actual length to be 366. - This is expected, just to avoid potential missing timestamps after dropping - incomplete aggregation periods. - - The estimator expects different stages of models to have different aggregation - frequencies. If two stages have the same aggregation frequency, an error will - be raised. - - Since the models in each stage may not fit on the entire training data, - there could be periods where fitted values are not calculated. - Leading fitted values in the training period may be NA. - These values are ignored when computing evaluation metrics. - - Attributes - ---------- - model_configs : `list` [`SilverkiteMultistageModelConfig`] - A list of model configs for Silverkite Multistage estimator, - representing the stages in the model. - forecast_horizon : `int` - The forecast horizon on the original data frequency. - freq : `str` or None - The frequency of the original data. - train_lengths : `list` [`str`] or None - A list of training data lengths for the models. - fit_lengths : `list` [`str`] or None - A list of fitting data lengths for the models. - agg_funcs : `list` [`str` or Callable] or None - A list of aggregation functions for the models. - agg_freqs : `list` [`str`] or None - A list of aggregation frequencies for the models. - estimators : `list` [`BaseForecastEstimator`] or None - A list of estimators used in the models. - estimator_params : `list` [`dict` or None] or None - A list of estimator parameters for the estimators. - train_lengths_in_seconds : `list` [`int`] or None - The list of training lengths in seconds. - fit_lengths_in_seconds: : `list` [`int`] or None - The list of fitting lengths in seconds. - If the original ``fit_length`` is None or is shorter than the corresponding - ``train_length``, it will be replaced by the corresponding ``train_length``. - max_ar_orders : `list` [`int`] or None - A list of maximum AR orders in the models. - data_freq_in_seconds : `int` or None - The data frequency in seconds. - num_points_per_agg_freqs : `list` [`int`] or None - Number of data points in each aggregation frequency. - models : `list` [`BaseForecastEstimator`] - The list of model instances. - fit_df : `pandas.DataFrame` or None - The prediction df. - train_end : `pandas.Timestamp` or None - The train end timestamp. - forecast_horizons : `list` [`int`] - The list of forecast horizons for all models in terms of the aggregated frequencies. 
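To make the attributes above concrete, a hedged instantiation sketch follows. The two stage configs, the hourly toy data, and the ``estimator_params`` values are illustrative assumptions (they must be valid init parameters of the chosen per-stage estimator), not recommended settings.

.. code-block:: python

    import numpy as np
    import pandas as pd

    from greykite.sklearn.estimator.silverkite_multistage_estimator import (
        SilverkiteMultistageEstimator,
        SilverkiteMultistageModelConfig)
    from greykite.sklearn.estimator.simple_silverkite_estimator import SimpleSilverkiteEstimator

    # Toy hourly series (values are random; only the shape matters here).
    df = pd.DataFrame({
        "ts": pd.date_range("2019-01-01", periods=24 * 400, freq="H"),
        "y": np.random.rand(24 * 400),
    })

    model_configs = [
        # Stage 1: daily aggregation over 56 weeks for long-term effects.
        SilverkiteMultistageModelConfig(
            train_length="392D",
            agg_freq="D",
            agg_func="nanmean",
            estimator=SimpleSilverkiteEstimator,
            estimator_params={"yearly_seasonality": 12}),  # illustrative estimator init parameter
        # Stage 2: original (hourly) frequency over the last 28 days for short-term effects.
        SilverkiteMultistageModelConfig(
            train_length="28D",
            agg_freq=None,  # None keeps the original data frequency
            estimator=SimpleSilverkiteEstimator,
            estimator_params={"autoreg_dict": "auto"}),  # illustrative estimator init parameter
    ]

    model = SilverkiteMultistageEstimator(
        model_configs=model_configs,
        forecast_horizon=24,  # in the original (hourly) frequency
        freq="H")
    model.fit(df, time_col="ts", value_col="y")
    forecast = model.predict(pd.DataFrame({
        "ts": pd.date_range(df["ts"].max(), periods=25, freq="H")[1:],
        "y": 0.0,  # the value column is ignored at prediction time
    }))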
- """ - - def __init__( - self, - model_configs: List[SilverkiteMultistageModelConfig], - forecast_horizon: int, - freq: Optional[str] = None, - uncertainty_dict: Optional[dict] = None, - score_func: Callable = mean_squared_error, - coverage: Optional[float] = None, - null_model_params: Optional[dict] = None): - """Instantiates the class. - - Parameters - ---------- - model_configs : `list` [SilverkiteMultistageModelConfig] - A list of - `~greykite.sklearn.estimator.silverkite_multistage_estimator.SilverkiteMultistageModelConfig` - objects. Defines the stages in the Silverkite Multistage model. - forecast_horizon : `int` - The forecast horizon in the original data frequency. - freq : `str` or None, default None - The training data frequency. - This parameter is important in Silverkite Multistage model, - since calculation of aggregation and dropping incomplete aggregated periods depends on this. - If None, the model will try to infer it from data. - If inferring from data failed, the model fit will raise an error. - """ - # every subclass of BaseForecastEstimator must call super().__init__ - super().__init__( - score_func=score_func, - coverage=coverage, - null_model_params=null_model_params) - - self.model_configs: List[SilverkiteMultistageModelConfig] = model_configs - self.forecast_horizon: int = forecast_horizon - self.freq: Optional[str] = freq - self.uncertainty_dict = uncertainty_dict - - # Derived from ``self.model_configs``. - self.train_lengths: Optional[List[str]] = None - self.fit_lengths: Optional[List[Optional[str]]] = None - self.agg_funcs: Optional[List[Union[str, Callable]]] = None - self.agg_freqs: Optional[List[str]] = None - self.estimators: Optional[List[BaseForecastEstimator]] = None - self.estimator_params: Optional[List[Optional[dict]]] = None - self.train_lengths_in_seconds: Optional[List[int]] = None - self.fit_lengths_in_seconds: Optional[List[int]] = None - - # Set by ``fit`` method. - self.max_ar_orders: Optional[List[int]] = None - self.data_freq_in_seconds: Optional[int] = None - self.num_points_per_agg_freqs: Optional[List[int]] = None - self.models: Optional[List[BaseForecastEstimator]] = None - self.fit_df: Optional[pd.DataFrame] = None - self.train_end: Optional[pd.Timestamp] = None - self.forecast_horizons: Optional[List[int]] = None - - def fit( - self, - X, - y=None, - time_col=cst.TIME_COL, - value_col=cst.VALUE_COL, - **fit_params): - """Fits ``SilverkiteMultistage`` forecast model. - - Parameters - ---------- - X: `pandas.DataFrame` - Input timeseries, with timestamp column, - value column, and any additional regressors. - The value column is the response, included in - ``X`` to allow transformation by `sklearn.pipeline`. - y: ignored - The original timeseries values, ignored. - (The ``y`` for fitting is included in ``X``). - time_col: `str` - Time column name in ``X``. - value_col: `str` - Value column name in ``X``. - fit_params: `dict` - additional parameters for null model. - - Returns - ------- - self : self - Fitted model is stored in ``self.model_dict``. - """ - # Fits null model - super().fit( - X=X, - y=y, - time_col=time_col, - value_col=value_col, - **fit_params) - if self.freq is None: - self.freq = pd.infer_freq(X[time_col]) - if self.freq is None: - raise ValueError("Failed to infer frequency from data, please provide during " - "instantiation. Data frequency is required for aggregation.") - - self._initialize() - - # Gets the forecast horizons for all models. 
- # For each model, the forecast horizon is the length of the aggregated test df. - self.forecast_horizons = [] - for agg_freq in self.agg_freqs: - # Constructs a sample prediction df with the current freq and forecast horizon. - sample_df = pd.DataFrame({ - time_col: pd.date_range(X[time_col].max(), freq=self.freq, periods=self.forecast_horizon + 1)[1:], - value_col: 0 - }) - sample_df_agg = sample_df.resample(agg_freq, on=time_col).mean() # The aggregation function is not needed. - # The forecast horizon for the current model is the length of the aggregated df. - # The forecast horizon differ when the aggregation covers various periods of the aggregation frequency. - # For example, if the prediction period is 2020-01-01 23:00, 2020-01-02 00:00, 2020-01-02 01:00, - # and the aggregation frequency is "D", although the length of prediction is less than a day, - # but after aggregation, it will become 2020-01-01 and 2020-01-02. - # On the other hand, if the prediction period is 2020-01-01 21:00, 2020-01-01 22:00, 2020-01-01 23:00, - # and the aggregation frequency is "D", then after aggregation, it will be 2020-01-01 only. - # In each stage of model, the model will get the appropriate forecast horizon. - self.forecast_horizons.append(sample_df_agg.shape[0]) - - min_agg_freq = min([to_offset(freq) for freq in self.agg_freqs]) - if min_agg_freq < to_offset(self.freq): - raise ValueError(f"The minimum aggregation frequency {min_agg_freq} " - f"is less than the data frequency {self.freq}. Please make sure " - f"the aggregation frequencies are at least the data frequency.") - - self.train_end = X[time_col].max() - - # Trains the model. - fit_df = self._train(df=X) - self.fit_df = fit_df - - # Fits the uncertainty model - self._fit_uncertainty() - - return self - - def predict(self, X, y=None): - """Creates forecast for the dates specified in ``X``. - - Parameters - ---------- - X: `pandas.DataFrame` - Input timeseries with timestamp column and any additional regressors. - Timestamps are the dates for prediction. - Value column, if provided in ``X``, is ignored. - y: ignored. - - Returns - ------- - predictions: `pandas.DataFrame` - Forecasted values for the dates in ``X``. Columns: - - - ``TIME_COL``: dates - - ``PREDICTED_COL``: predictions - - ``PREDICTED_LOWER_COL``: lower bound of predictions, optional - - ``PREDICTED_UPPER_COL``: upper bound of predictions, optional - - ``PREDICTED_LOWER_COL`` and ``PREDICTED_UPPER_COL`` are present - if ``self.coverage`` is not None. 
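The per-stage forecast horizon computation in ``fit`` above can be checked in isolation. The toy values below are assumptions; they reproduce the case described in the comments, where a 3-step hourly horizon spans two daily aggregation periods.

.. code-block:: python

    import pandas as pd

    freq = "H"            # original data frequency
    agg_freq = "D"        # this stage's aggregation frequency
    forecast_horizon = 3  # horizon in the original frequency
    train_end = pd.Timestamp("2020-01-01 22:00")

    # Same construction as in ``fit``: the horizon starts one step after the train end.
    sample_df = pd.DataFrame({
        "ts": pd.date_range(train_end, freq=freq, periods=forecast_horizon + 1)[1:],
        "y": 0,
    })
    agg_horizon = sample_df.resample(agg_freq, on="ts").mean().shape[0]
    print(agg_horizon)  # 2, because 23:00, 00:00 and 01:00 span two calendar days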
- """ - pred = self._predict(X) - if self.uncertainty_model is not None: - pred_with_uncertainty = self.predict_uncertainty( - df=pred - ) - if pred_with_uncertainty is not None: - pred = pred_with_uncertainty - return pred - - def _initialize(self): - """Sets the derived attributes from model init parameters.""" - self.train_lengths: List[str] = [config.train_length for config in self.model_configs] - self.fit_lengths: List[Optional[str]] = [config.fit_length for config in self.model_configs] - self.agg_funcs: List[Union[str, Callable]] = [ - self._get_agg_func(config.agg_func) for config in self.model_configs] - self.agg_freqs: List[str] = [ - config.agg_freq if config.agg_freq is not None else self.freq for config in self.model_configs] - if len(set(self.agg_freqs)) != len(self.agg_freqs): - raise ValueError(f"Models from different stages should have different aggregation " - f"frequencies, found {self.agg_freqs}.") - self.estimators: List[BaseForecastEstimator] = [config.estimator for config in self.model_configs] - self.estimator_params: List[Optional[dict]] = [config.estimator_params for config in self.model_configs] - # Assumes train length is integer multiples of 1 second, which is most of the cases. - self.train_lengths_in_seconds: List[int] = [ - to_offset(length).delta // timedelta(seconds=1) for length in self.train_lengths] - self.fit_lengths_in_seconds: List[int] = [ - to_offset(length).delta // timedelta(seconds=1) - if length is not None else None for length in self.fit_lengths] - # If ``fit_length`` is None or is shorter than ``train_length``, it will be replaced by ``train_length``. - fit_lengths_in_seconds = [ - fit_length if fit_length is not None and fit_length >= train_length - else train_length - for fit_length, train_length in zip(self.fit_lengths_in_seconds, self.train_lengths_in_seconds) - ] - if fit_lengths_in_seconds != self.fit_lengths_in_seconds: - self.fit_lengths_in_seconds = fit_lengths_in_seconds - log_message( - message="Some `fit_length` is None or is shorter than `train_length`. " - "These `fit_length` have been replaced with `train_length`.", - level=LoggingLevelEnum.INFO - ) - self.models: List[BaseForecastEstimator] = [ - config.estimator(**config.estimator_params) for config in self.model_configs] - self.data_freq_in_seconds = to_offset(self.freq).delta // timedelta(seconds=1) - - @staticmethod - def _get_agg_func(agg_func: Optional[Union[str, Callable]]): - """Gets the aggregation function. - - Returns the input if it's None or a callable. - Finds the corresponding callable from - `~greykite.sklearn.estimator.silverkite_multistage_estimator.AggregationFunctionEnum` - and raises an error if no corresponding aggregation function is found. - - Parameters - ---------- - agg_func : `str`, Callable or None - The input of aggregation function. - - Returns - ------- - agg_func : Callable - The corresponding aggregation function if input is a string otherwise the input itself. - """ - if not isinstance(agg_func, str): - return agg_func - try: - agg_func = AggregationFunctionEnum[agg_func].value - return agg_func - except KeyError: - raise ValueError(f"The aggregation function {agg_func} is not recognized as a string. " - f"Please either pass a known string or a function.") - - @staticmethod - def _get_num_points_per_agg_freq( - data_freq: str, - agg_freqs: List[str]): - """Gets the number of data points in a aggregation period. - - Parameters - ---------- - data_freq : `str` - The data frequency. For example, "5T". 
- agg_freqs : `list` [`str`] - A list of aggregation frequencies. - - Returns - ------- - num_points : `list` [`int`] - The number of points in each aggregation period. - """ - return [to_offset(freq).delta // to_offset(data_freq).delta for freq in agg_freqs] - - def _get_freq_col(self, freq: str): - """Gets the column name for a specific frequency. - The name will be f"{self.time_col_}__{freq}". - - Parameters - ---------- - freq : `str` - The aggregation frequency. - - Returns - ------- - freq_col_name : `str` - The time column name for the frequency, f"{self.time_col_}__{freq}". - """ - return f"{self.time_col_}__{freq}" - - def _get_non_time_cols(self, columns: List[str]): - """Gets the non time columns in a df. - Non time columns do not have f"{self.time_col}__" in it or do not equal to self.time_col. - - Parameters - ---------- - columns : `list` [`str`] - The columns in a df. - - Returns - ------- - non_time_columns : `list` [`str`] - The non time columns. - Non time columns do not have f"{self.time_col}__" in it or do not equal to self.time_col. - """ - return [col for col in columns if f"{self.time_col_}__" not in col and col != self.time_col_] - - def _add_agg_freq_cols( - self, - df: pd.DataFrame): - """Appends the resample time columns to ``df``. - - For example, the original df has hourly data with columns "ts" and "y". - The original time column looks like - - "2020-01-01 00:00:00, 2020-01-01 01:00:00, 2020-01-01 02:00:00, - 2020-01-01 03:00:00, 2020-01-01 04:00:00, 2020-01-01 05:00:00, - 2020-01-01 06:00:00, 2020-01-01 07:00:00, 2020-01-01 08:00:00..." - - The resample frequencies are ["3H", "D"]. - The function adds two extra columns to ``df`` with names "ts__3H" and "ts__D". - The "ts__3H" will have the same value for every 3 hours, such as - - "2020-01-01 00:00:00, 2020-01-01 00:00:00, 2020-01-01 00:00:00, - 2020-01-01 03:00:00, 2020-01-01 03:00:00, 2020-01-01 03:00:00, - 2020-01-01 06:00:00, 2020-01-01 06:00:00, 2020-01-01 06:00:00..." - - and "ts__D" will have the same value for every day, such as - - "2020-01-01 00:00:00, 2020-01-01 00:00:00, ... - ... - 2020-01-01 00:00:00, 2020-01-01 00:00:00, 2020-01-01 00:00:00, - 2020-01-02 00:00:00, 2020-01-02 00:00:00, 2020-01-02 00:00:00..." - - Parameters - ---------- - df : `pandas.DataFrame` - The original data frame. - - Returns - ------- - df : `pandas.DataFrame` - The augmented df with resampled time columns. - """ - # Original df has ``self.time_col_`` as the original time column. - df = df.copy() - df[self.time_col_] = pd.to_datetime(df[self.time_col_]) - - for freq in self.agg_freqs: - col = self._get_freq_col(freq) # New column name for resampled time column. - # Borrows the value column for aggregation. - df_time = df[[self.time_col_, self.value_col_]].set_index(self.time_col_, drop=False) - - if len(df_time) == 0: - raise ValueError(f"The df size is zero. Does your input have NANs that are dropped?") - - # Gets the resampled frequency column. - # This solution is fast and no need to further improve. 
- df_time = (df_time - .resample(freq, on=self.time_col_) # resample, the index is resampled time column - .mean() # this function doesn't matter - .reset_index(drop=False) # adds the resampled time column to columns - .set_index(self.time_col_, drop=False) # copies the resampled time column to index - .reindex(df_time.index, method='ffill') # sets the original freq as index, fill resampled column - .rename(columns={self.time_col_: col}) # renames the filled resampled column - .reset_index(drop=False)) # copies the original freq time column to columns - - # Merges new resampled frequency column into original df. - df = df.merge(df_time[[self.time_col_, col]], on=self.time_col_) - - return df - - def _drop_incomplete_agg( - self, - df: pd.DataFrame, - agg_freq: str, - location: int, - num_points_per_agg_freq: int): - """Drops aggregations with incomplete periods. - - For example, a daily aggregation of hourly data will have a biased aggregation if the data starts from - 07:00:00, because the first day will be the aggregation of 07:00:00 to 23:00:00. This is not - representative and should be dropped. - - The returned df's indices are reset. - - Parameters - ---------- - df : `pandas.DataFrame` - The input dataframe with augmented aggregation frequency time columns. - agg_freq : `str` - The aggregation frequency. - location : `int` - Where to drop the incomplete aggregation periods. - Usually the incomplete aggregation periods happen at the begin and end of the df. - Specify location = 0 indicates the start, and specify location = -1 indicates the end. - num_points_per_agg_freq : `int` - The number of rows expected in a full period. - - Returns - ------- - df : `pandas.DataFrame` - The dataframe after dropping incomplete aggregation periods. - The df's indices are reset. - """ - if df.shape[0] == 0: - return df - freq_col = self._get_freq_col(agg_freq) - if (len(df[df[freq_col] == df[freq_col].iloc[location]]) - < num_points_per_agg_freq): - df = df[df[freq_col] != df[freq_col].iloc[location]] - return df.reset_index(drop=True) - - def _aggregate_values( - self, - df: pd.DataFrame, - agg_freq: str, - agg_func: Optional[callable]): - """Aggregates the ``df`` with the given ``agg_freq`` and applies the ``agg_func``. - - All columns whose names do not start with f"{time_col}__" will be kept and aggregated. - - Parameters - ---------- - df : `pandas.DataFrame` - The input dataframe. - agg_freq : `str` - The aggregation frequency. - agg_func : `str`, `callable` or None - The function used for aggregation. If None, no aggregation will be performed. - - Returns - ------- - df : `pandas.DataFrame` - The aggregated dataframe with f"{time_col}" being the timestamps and all aggregated columns. - """ - columns = [col for col in df.columns if f"{self.time_col_}__" not in col] - freq_col = self._get_freq_col(agg_freq) - if agg_func is not None: - df = (df - .groupby(freq_col) - .agg({col: agg_func for col in columns}) - .reset_index(drop=False) - .rename(columns={freq_col: self.time_col_}) - ) - else: - df = df.rename(columns={freq_col: self.time_col_}) - return df - - def _drop_incomplete_agg_and_aggregate_values( - self, - df: pd.DataFrame, - agg_freq: str, - agg_func: Optional[callable], - num_points_per_agg_freq: int, - drop_incomplete: bool): - """Drops incomplete periods from the begin and end, and gets aggregated values. - - Calls ``self._drop_incomplete_agg`` with locations 0 and -1, then calls - ``self._aggregate_values``. 
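The rationale for dropping incomplete aggregation periods can be seen with a small standalone check (toy data assumed below): a daily bucket built from a partial day contains fewer points than a full day, so its aggregate would be biased.

.. code-block:: python

    import pandas as pd

    # Hourly data that starts at 07:00: the first daily bucket covers only 17 of 24 hours.
    df = pd.DataFrame({
        "ts": pd.date_range("2020-01-01 07:00", periods=48, freq="H"),
        "y": 1.0,
    })
    counts = df.resample("D", on="ts")["y"].count()
    print(counts)
    # 2020-01-01    17
    # 2020-01-02    24
    # 2020-01-03     7
    # The first and last buckets are incomplete; averaging them would be biased,
    # so they are dropped from the training data (but kept for fit/predict periods).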
- - Parameters - ---------- - df : `pandas.DataFrame` - The input dataframe with augmented aggregation frequency time columns. - agg_freq : `str` - The aggregation frequency. - agg_func : `str`, `callable` or None - The function used for aggregation. If None, no aggregation will be performed. - num_points_per_agg_freq : `int` - The number of rows expected in a full period. - drop_incomplete : `bool` - Whether to drop incomplete periods from the begin and end. - This shouldn't be done when calculating fitted or prediction values, - because dropping may result in missing time points to predict. - - Returns - ------- - agg_df : `pandas.DataFrame` - The aggregated dataframe with f"{time_col}" being the timestamps and all aggregated columns. - """ - df = df.copy() - if df.shape[0] == 0: - return df - # Drops incomplete periods. - if drop_incomplete: - df = self._drop_incomplete_agg( - df=df, - agg_freq=agg_freq, - location=0, - num_points_per_agg_freq=num_points_per_agg_freq - ) - df = self._drop_incomplete_agg( - df=df, - agg_freq=agg_freq, - location=-1, - num_points_per_agg_freq=num_points_per_agg_freq - ) - # Checks if there are any missing timestamps in ``df``. - # This may result in incomplete periods. - df_check_incomplete_period = df[[self.time_col_, self.value_col_]].resample( - agg_freq, on=self.time_col_).count() - df_with_incomplete_periods = df_check_incomplete_period[ - df_check_incomplete_period[self.value_col_] < num_points_per_agg_freq] - if df_with_incomplete_periods.shape[0] > 0: - log_message( - message=f"There are missing timestamps in `df` when performing aggregation with " - f"frequency {agg_freq}. These points are {df_with_incomplete_periods}. " - f"This may cause the aggregated values to be biased.", - level=LoggingLevelEnum.WARNING - ) - - # Aggregates values - df = self._aggregate_values( - df=df[[self._get_freq_col(agg_freq)] + self._get_non_time_cols(list(df.columns))], - agg_freq=agg_freq, - agg_func=agg_func - ) - return df - - def _get_agg_dfs( - self, - df: pd.DataFrame, - agg_freq: str, - agg_func: Optional[callable], - train_length_in_seconds: int, - fit_length_in_seconds: Optional[int], - num_points_per_agg_freq: int, - max_ar_order: int): - """Given a dataframe, training/fitting configuration, gets the training data and fit data. - - If training data include incomplete periods during aggregation, the periods will be dropped. - If fit length is shorter than train length, fit length will be replaced by train length. - Training data is the data that the model is to be trained on. - Fit data is the data that the fitted values are to be calculated on. - - Parameters - ---------- - df : `pandas.DataFrame` - The input dataframe. - agg_freq : `str` - The aggregation frequency. For example, "D". - agg_func : Callable - The aggregation function. For example, `numpy.nanmean`. - train_length_in_seconds : `int` - The training data length in seconds. - fit_length_in_seconds : `int` or None - The fit data length in seconds. - If None, will use ``train_length_in_seconds``. - num_points_per_agg_freq : `int` - For ``agg_freq``, how many data points in data frequency should be in an entire period. - max_ar_order : `int` - The maximum order of AR. Used to generate ``past_df`` to be fed into the Silverkite models - to generate AR terms. - - Returns - ------- - result : `dict` - A dictionary with the following keys: - - train_df : `pandas.DataFrame` - The training df with aggregated frequency. - fit_df : `pandas.DataFrame` - The fit df with aggregated frequency. 
- df_past : `pandas.DataFrame` - The past df used to generate AR terms. - fit_df_has_incomplete_period : `bool` - Whether ``fit_df`` has incomplete period at the end. - """ - # Selects the columns in ``df`` excluding irrelevant aggregated time columns. - freq_col = self._get_freq_col(agg_freq) - non_time_cols = self._get_non_time_cols(list(df.columns)) - df = df[[self.time_col_, freq_col] + non_time_cols] - - train_end = df[self.time_col_].max() - # Subtracts 1 extra full aggregation period for completion. - # Because we drop incomplete periods in a later step before aggregation. - # If there are incomplete periods and we don't take the extra period, - # the actual length will be the desired length minus 1. - # The only case that this will add an extra period is when there is no incomplete period, - # and having an extra period does not lose anything from there. - train_start = train_end - relativedelta(seconds=train_length_in_seconds) - to_offset(agg_freq) - fit_start = train_end - relativedelta(seconds=fit_length_in_seconds) - to_offset(agg_freq) - - train_df = df[(df[self.time_col_] >= train_start) & (df[self.time_col_] <= train_end)] - fit_df = df[(df[self.time_col_] >= fit_start) & (df[self.time_col_] <= train_end)] - - # Checks if there are incomplete periods in ``fit_df`` at the end. - # They won't be dropped for not missing any prediction periods, - # but if there are regressors, we record this. - # Because the aggregated regressor value could be biased. - # A warning will be raised if such regressor exists in the model. - fit_df_has_incomplete_period = False - if fit_df[fit_df[freq_col] == fit_df[freq_col].iloc[-1]].shape[0] < num_points_per_agg_freq: - fit_df_has_incomplete_period = True - - # Removes incomplete periods aggregations since the aggregated values may be biased. - # We only drop incomplete aggregations for the training periods in case they affect - # training by including incorrect aggregated values. - # For fit/predict we don't drop incomplete aggregations because we want to make prediction - # for all data points in the original frequency. - train_df = self._drop_incomplete_agg_and_aggregate_values( - df=train_df, - agg_freq=agg_freq, - agg_func=agg_func, - num_points_per_agg_freq=num_points_per_agg_freq, - drop_incomplete=True - ) - fit_df = self._drop_incomplete_agg_and_aggregate_values( - df=fit_df, - agg_freq=agg_freq, - agg_func=agg_func, - num_points_per_agg_freq=num_points_per_agg_freq, - drop_incomplete=False - ) - - # Generates past dataframe for AR terms, if needed. - past_df = None - if max_ar_order > 0: - # Adds 2 complete periods to ensure we don't miss any data in ``past_df``. - past_df_end = train_start + 2 * to_offset(agg_freq) - # By +1, we ensure that the ``past_df`` still has enough length after dropping incomplete periods. 
- past_df_start = (fit_start - - relativedelta(seconds=to_offset(agg_freq).delta.total_seconds() * (max_ar_order + 1))) - past_df = df[(df[self.time_col_] >= past_df_start) & (df[self.time_col_] <= past_df_end)] - past_df = self._drop_incomplete_agg_and_aggregate_values( - df=past_df, - agg_freq=agg_freq, - agg_func=agg_func, - num_points_per_agg_freq=num_points_per_agg_freq, - drop_incomplete=True - ) - past_df = past_df[past_df[self.time_col_] < train_df[self.time_col_].min()] - - return { - "train_df": train_df, - "fit_df": fit_df, - "past_df": past_df, - "fit_df_has_incomplete_period": fit_df_has_incomplete_period - } - - def _get_silverkite_ar_max_order(self): - """Gets the AR order so that the model can use ``past_df`` to generate AR terms instead of imputation. - - This function only applies to the Silverkite family. - This function is called after ``freq`` and ``forecast_horizon`` parameters have been added to model instances. - - Returns - ------- - max_ar_orders : `list` [`int` or None] - The maximum AR orders needed in each model. - The value is 0 if the model does not belong to the Silverkite family or the autoregression - parameter is not configured. - """ - max_ar_orders = [] - for freq, model in zip(self.agg_freqs, self.models): - try: - # All Silverkite family estimators have the method ``get_max_ar_order``. - max_ar_order = model.get_max_ar_order() - except (AttributeError, TypeError): - max_ar_order = 0 - max_ar_orders.append(max_ar_order) - return max_ar_orders - - def _train( - self, - df: pd.DataFrame): - """Trains the Silverkite Multistage model with the given configurations. - - Parameters - ---------- - df : `pandas.DataFrame` - The input dataframe. - - Returns - ------- - fit_df : `pandas.DataFrame` - The dataframe with aggregated time columns and predictions. - """ - # Sorts the models by training data length from long to short. - (self.agg_freqs, self.agg_funcs, self.train_lengths_in_seconds, - self.fit_lengths_in_seconds, self.models, self.forecast_horizons) = zip(*sorted( - zip(self.agg_freqs, self.agg_funcs, self.train_lengths_in_seconds, - self.fit_lengths_in_seconds, self.models, self.forecast_horizons), - key=lambda x: x[2], # key is ``train_lengths_in_seconds`` - reverse=True)) - - # Here we add the ``forecast_horizon`` and ``freq`` attribute regardless of what model it is. - # This doesn't affect the model if it does not expect the ``forecast_horizon`` - # or ``freq`` attribute before fitting. - # If the forecast horizon parameter varies due to different periods in the fit - # and predict input, users can leave them as None and let the estimator automatically fill them. - # If the entry point is template, it's possible that "forecast_horizon" is a missing parameter. - # We add it here when it's missing. - # If either of these parameter is already set, we won't modify it. - for model, forecast_horizon, agg_freq in zip(self.models, self.forecast_horizons, self.agg_freqs): - if getattr(model, "forecast_horizon", None) is None: - model.forecast_horizon = forecast_horizon - if getattr(model, "freq", None) is None: - model.freq = agg_freq - - self.num_points_per_agg_freqs = self._get_num_points_per_agg_freq( - data_freq=self.freq, - agg_freqs=self.agg_freqs - ) - - self.max_ar_orders = self._get_silverkite_ar_max_order() - - # Adds resampled timestamps and aggregated columns to df. - df_with_freq = self._add_agg_freq_cols(df=df) - - # Makes a copy. This is used to store the results. 
- fit_result_df = df_with_freq.copy() - # Adds a column to track cumulative fitted values. - # At each stage this column will be subtracted from the original time series - # to obtain the current residual to be fitted on the next model. - fit_result_df["cum_fitted_values"] = 0 - - for (freq, func, train_length_in_seconds, fit_length_in_seconds, - model, num_points_per_agg_freq, max_order) in zip( - self.agg_freqs, self.agg_funcs, self.train_lengths_in_seconds, - self.fit_lengths_in_seconds, self.models, self.num_points_per_agg_freqs, - self.max_ar_orders): - - df_with_freq_copy = df_with_freq.copy() # makes a copy since we want to calculate the residuals. - df_with_freq_copy[self.value_col_] -= fit_result_df["cum_fitted_values"] - - # Gets the train_df and fit_df. - # The dfs will be subset and aggregated. - # The fitted values are to be calculated on fit_df. - # fit_df has length as the maximum of train_length and fit_length. - agg_dfs = self._get_agg_dfs( - df=df_with_freq_copy, - agg_freq=freq, - agg_func=func, - train_length_in_seconds=train_length_in_seconds, - fit_length_in_seconds=fit_length_in_seconds, - num_points_per_agg_freq=num_points_per_agg_freq, - max_ar_order=max_order - ) - train_df = agg_dfs["train_df"] - fit_df = agg_dfs["fit_df"] - # Adds ``past_df`` in case the model expects extra data to calculate autoregression terms. - if agg_dfs["past_df"] is not None: - model.past_df = agg_dfs["past_df"] - if agg_dfs["fit_df_has_incomplete_period"]: - regressor_cols = getattr(model, "regressor_cols", None) - if regressor_cols is not None and regressor_cols != []: - log_message( - message="There are incomplete periods in `fit_df`, thus the regressor " - "values are biased after aggregation.", - level=LoggingLevelEnum.WARNING - ) - # Adds the actual values to the result df. - fit_result_df = fit_result_df.merge( - train_df.rename(columns={ - self.time_col_: self._get_freq_col(freq), - self.value_col_: f"{self.value_col_}__{freq}"}), - on=self._get_freq_col(freq), - how="left") - - # Fits the model. - model.fit( - train_df, - time_col=self.time_col_, - value_col=self.value_col_) - - # Calculates fitted values. - y_fitted = model.predict(fit_df)[[cst.TIME_COL, cst.PREDICTED_COL]].rename( - columns={ - cst.TIME_COL: f"{cst.TIME_COL}__{freq}", - cst.PREDICTED_COL: f"{cst.PREDICTED_COL}__{freq}"}) - - # Joins the fitted values with the original ``fit_result_df``. - # This is a left join since the fitted values is a subset of the entire period. - fit_result_df = fit_result_df.merge( - y_fitted, - how="left", - left_on=self._get_freq_col(freq), - right_on=f"{cst.TIME_COL}__{freq}") - # Adds the current fitted values to the previous fitted values to get cumulated fitted values. - fit_result_df["cum_fitted_values"] += fit_result_df[f"{cst.PREDICTED_COL}__{freq}"] - - fit_result_df = fit_result_df.rename(columns={ - "cum_fitted_values": cst.PREDICTED_COL - }) - - return fit_result_df - - def _predict( - self, - df: pd.DataFrame): - """The prediction function. - - Parameters - ---------- - df : `pandas.DataFrame` - The input dataframe, covering the prediction phase. - - Returns - ------- - pred : `pandas.DataFrame` - The predicted dataframe. - """ - # Since Silverkite Multistage partitions the data and does not use the - # entire period to train the models, we do not allow predictions - # going beyond the earliest fit period. - # The predictions before the allowed periods will be marked as 0 for compatibility - # in calculating error metrics. 
- # This only affects the training evaluation in pipeline, not the validation/test/forecast. - fit_starts = [self.train_end - relativedelta(seconds=fit_length) for fit_length in self.fit_lengths_in_seconds] - - # Adds resampled timestamps and aggregated columns to df. - df_with_freq = self._add_agg_freq_cols(df=df) - - # Generates predictions for each freq/model. - for freq, func, model, start in zip(self.agg_freqs, self.agg_funcs, self.models, fit_starts): - freq_col = self._get_freq_col(freq) - current_df = df_with_freq[df_with_freq[self.time_col_] >= start] - current_df = current_df[ - [freq_col] - + [col for col in self._get_non_time_cols(current_df.columns) if cst.PREDICTED_COL not in col]] - # We do this aggregation for the concern that there are regressors in ``df``. - # Uses ``set_index`` instead of resample "on" because the aggregation functions are - # defined with ``partial`` and there's some incompatibility between partial - # and ``np.nanmean`` with the "on" column. - current_df = current_df.set_index(freq_col).resample(freq).apply(func) - current_df[self.time_col_] = current_df.index - current_df = current_df.reset_index(drop=True) - predicted_df = model.predict(current_df) # ``past_df`` has been added to model instance. - df_with_freq = df_with_freq.merge( - predicted_df[[cst.TIME_COL, cst.PREDICTED_COL]].rename(columns={ - cst.TIME_COL: freq_col, - cst.PREDICTED_COL: f"{cst.PREDICTED_COL}__{freq}" - }), - how="left", - on=freq_col - ) - - df_with_freq[cst.PREDICTED_COL] = 0 - for freq in self.agg_freqs: - df_with_freq[cst.PREDICTED_COL] += df_with_freq[f"{cst.PREDICTED_COL}__{freq}"] - - return df_with_freq - - def _fit_uncertainty(self): - fit_df_dropna = self.fit_df.dropna( - subset=[cst.PREDICTED_COL]).rename( - columns={cst.VALUE_COL: self.value_col_}) # Estimator predictions have standard value column. - - self.fit_uncertainty( - df=fit_df_dropna, - uncertainty_dict=self.uncertainty_dict, - ) - if self.uncertainty_model is not None: - fit_df_with_uncertainty = self.predict_uncertainty( - df=fit_df_dropna - ) - if fit_df_with_uncertainty is not None: - fit_df_with_uncertainty = self.fit_df.merge( - fit_df_with_uncertainty[[self.time_col_] + [ - col for col in fit_df_with_uncertainty if col not in self.fit_df.columns]], - on=self.time_col_, - how="left" - ) - self.fit_df = fit_df_with_uncertainty - - def plot_components(self): - """Makes component plots. - - Returns - ------- - figs : `list` [`plotly.graph_objects.Figure` or None] - A list of figures from each model. - """ - if self.fit_df is None: - raise ValueError("Please call `fit` before calling `plot_components`.") - figs = [] - for model in self.models: - try: - fig = model.plot_components() - except AttributeError: - fig = None - figs.append(fig) - return figs - - def summary(self): - """Gets model summaries. - - Returns - ------- - summaries : `list` [`~greykite.algo.common.model_summary.ModelSummary` or None] - A list of model summaries from each model. 
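After fitting, the per-stage models can be inspected through the methods above. A brief usage sketch, continuing the hypothetical ``model`` from the instantiation example shown after the class attributes earlier in this diff:

.. code-block:: python

    # Assumes ``model`` is a fitted SilverkiteMultistageEstimator, e.g. from the
    # instantiation sketch earlier in this diff.
    summaries = model.summary()        # one model summary (or None) per stage
    figures = model.plot_components()  # one plotly figure (or None) per stage
    for stage, summary in enumerate(summaries):
        if summary is not None:
            print(f"===== Stage {stage} =====")
            print(summary)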
- """ - if self.fit_df is None: - raise ValueError("Please call `fit` before calling `summary`.") - summaries = [] - for model in self.models: - try: - summary = model.summary() - except AttributeError: - summary = None - summaries.append(summary) - return summaries diff --git a/greykite/tests/algo/forecast/silverkite/test_silverkite_diagnostics.py b/greykite/tests/algo/forecast/silverkite/test_silverkite_diagnostics.py deleted file mode 100644 index c7fea83..0000000 --- a/greykite/tests/algo/forecast/silverkite/test_silverkite_diagnostics.py +++ /dev/null @@ -1,261 +0,0 @@ -import datetime - -import numpy as np -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal - -from greykite.algo.forecast.silverkite.silverkite_diagnostics import SilverkiteDiagnostics -from greykite.common import constants as cst -from greykite.common.features.timeseries_features import build_time_features_df - - -def test_get_silverkite_components(): - """Tests get_silverkite_components function""" - silverkite_diagnostics: SilverkiteDiagnostics = SilverkiteDiagnostics() - - # Dataframe with trend, seasonality and events - time_col = "ts" - # value_col name is chosen such that it contains keywords "ct" and "sin" - # so that we can test patterns specified for each component work correctly - value_col = "basin_impact" - df = pd.DataFrame({ - time_col: [ - datetime.datetime(2018, 1, 1), - datetime.datetime(2018, 1, 2), - datetime.datetime(2018, 1, 3), - datetime.datetime(2018, 1, 4), - datetime.datetime(2018, 1, 5)], - value_col: [1, 2, 3, 4, 5], - "dummy_col": [0, 0, 0, 0, 0], - }) - feature_df = pd.DataFrame({ - # Trend columns: growth, changepoints and interactions (total 5 columns) - "ct1": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - "ct1:tod": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - "ct_sqrt": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - "changepoint0_2018_01_02_00": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - "changepoint1_2018_01_04_00": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - # Daily seasonality with interaction (total 4 columns) - "sin1_tow_weekly": np.array([2.0, 2.0, 2.0, 2.0, 2.0]), - "cos1_tow_weekly": np.array([2.0, 2.0, 2.0, 2.0, 2.0]), - "is_weekend[T.True]:sin1_tow_weekly": np.array([2.0, 2.0, 2.0, 2.0, 2.0]), - "is_weekend[T.True]:cos1_tow_weekly": np.array([2.0, 2.0, 2.0, 2.0, 2.0]), - # Yearly seasonality (total 6 columns) - "sin1_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "cos1_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "sin2_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "cos2_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "sin3_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "cos3_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - # Holiday with pre and post effect (1 at the where the date and event match) - # e.g. 
New Years Day is 1 at 1st January, 0 rest of the days - "Q('events_New Years Day')[T.event]": np.array([1.0, 0.0, 0.0, 0.0, 0.0]), - "Q('events_New Years Day_minus_1')[T.event]": np.array([0.0, 0.0, 0.0, 0.0, 0.0]), - "Q('events_New Years Day_minus_2')[T.event]": np.array([0.0, 0.0, 0.0, 0.0, 0.0]), - "Q('events_New Years Day_plus_1')[T.event]": np.array([0.0, 1.0, 0.0, 0.0, 0.0]), - "Q('events_New Years Day_plus_2')[T.event]": np.array([0.0, 0.0, 1.0, 0.0, 0.0]), - }) - components = silverkite_diagnostics.get_silverkite_components(df, time_col, value_col, feature_df) - expected_df = pd.DataFrame({ - time_col: df[time_col], - value_col: df[value_col], - "trend": 5 * np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - "WEEKLY_SEASONALITY": 4 * np.array([2.0, 2.0, 2.0, 2.0, 2.0]), - "YEARLY_SEASONALITY": 6 * np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - cst.EVENT_PREFIX: np.array([1.0, 1.0, 1.0, 0.0, 0.0]), - "trend_changepoints": np.array([0, 1, 0, 1, 0])}) - assert_frame_equal(components, expected_df) - - # Test error messages - with pytest.raises(ValueError, match="feature_df must be non-empty"): - silverkite_diagnostics.get_silverkite_components(df, time_col, value_col, feature_df=pd.DataFrame()) - - with pytest.raises(ValueError, match="df and feature_df must have same number of rows."): - silverkite_diagnostics.get_silverkite_components(df, time_col, value_col, feature_df=pd.DataFrame({"ts": [1, 2, 3]})) - - -def test_group_silverkite_seas_components(): - """Tests group_silverkite_seas_components""" - silverkite_diagnostics: SilverkiteDiagnostics = SilverkiteDiagnostics() - time_col = "ts" - # Daily - date_list = pd.date_range(start="2018-01-01", end="2018-01-07", freq="H").tolist() - time_df = build_time_features_df(date_list, conti_year_origin=2018) - df = pd.DataFrame({ - time_col: time_df["datetime"], - "DAILY_SEASONALITY": time_df["hour"] - }) - res = silverkite_diagnostics.group_silverkite_seas_components(df) - expected_df = pd.DataFrame({ - "Hour of day": np.arange(24.0), - "daily": np.arange(24.0), - }) - assert_frame_equal(res, expected_df) - - # Weekly - date_list = pd.date_range(start="2018-01-01", end="2018-01-20", freq="D").tolist() - time_df = build_time_features_df(date_list, conti_year_origin=2018) - df = pd.DataFrame({ - time_col: time_df["datetime"], - "WEEKLY_SEASONALITY": time_df["tow"] - }) - res = silverkite_diagnostics.group_silverkite_seas_components(df) - expected_df = pd.DataFrame({ - "Day of week": np.arange(7.0), - "weekly": np.arange(7.0), - }) - assert_frame_equal(res, expected_df) - - # Monthly - date_list = pd.date_range(start="2018-01-01", end="2018-01-31", freq="D").tolist() - time_df = build_time_features_df(date_list, conti_year_origin=2018) - df = pd.DataFrame({ - time_col: time_df["datetime"], - "MONTHLY_SEASONALITY": time_df["dom"] - }) - res = silverkite_diagnostics.group_silverkite_seas_components(df) - expected_df = pd.DataFrame({ - "Time of month": np.arange(31.0)/31, - "monthly": np.arange(1.0, 32.0), - }) - assert_frame_equal(res, expected_df) - - # Quarterly (92 day quarters) - date_list = pd.date_range(start="2018-07-01", end="2018-12-31", freq="D").tolist() - time_df = build_time_features_df(date_list, conti_year_origin=2018) - df = pd.DataFrame({ - time_col: time_df["datetime"], - "QUARTERLY_SEASONALITY": time_df["toq"] - }) - res = silverkite_diagnostics.group_silverkite_seas_components(df) - expected_df = pd.DataFrame({ - "Time of quarter": np.arange(92.0)/92, - "quarterly": np.arange(92.0)/92, - }) - assert_frame_equal(res, expected_df) - - # Quarterly 
(90 day quarter) - date_list = pd.date_range(start="2018-01-01", end="2018-03-31", freq="D").tolist() - time_df = build_time_features_df(date_list, conti_year_origin=2018) - df = pd.DataFrame({ - time_col: time_df["datetime"], - "QUARTERLY_SEASONALITY": time_df["toq"] - }) - res = silverkite_diagnostics.group_silverkite_seas_components(df) - expected_df = pd.DataFrame({ - "Time of quarter": np.arange(90.0)/90, - "quarterly": np.arange(90.0)/90, - }) - assert_frame_equal(res, expected_df) - - # Yearly (non-leap years) - date_list = pd.date_range(start="2018-01-01", end="2019-12-31", freq="D").tolist() - time_df = build_time_features_df(date_list, conti_year_origin=2018) - df = pd.DataFrame({ - time_col: time_df["datetime"], - "YEARLY_SEASONALITY": time_df["toy"] - }) - res = silverkite_diagnostics.group_silverkite_seas_components(df) - expected_df = pd.DataFrame({ - "Time of year": np.arange(365.0)/365, - "yearly": np.arange(365.0)/365, - }) - assert_frame_equal(res, expected_df) - - -def test_plot_silverkite_components(): - """Tests plot_silverkite_components function""" - silverkite_diagnostics: SilverkiteDiagnostics = SilverkiteDiagnostics() - # Dataframe with trend, seasonality and events - time_col = "ts" - # value_col name is chosen such that it contains keywords "ct" and "sin" - # so that we can test patterns specified for each component work correctly - value_col = "basin_impact" - df = pd.DataFrame({ - time_col: [ - datetime.datetime(2018, 1, 1), - datetime.datetime(2018, 1, 2), - datetime.datetime(2018, 1, 3), - datetime.datetime(2018, 1, 4), - datetime.datetime(2018, 1, 5)], - value_col: [1, 2, 3, 4, 5], - }) - feature_df = pd.DataFrame({ - # Trend columns: growth, changepoints and interactions (total 5 columns) - "ct1": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - "ct1:tod": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - "ct_sqrt": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - "changepoint0_2018_01_02_00": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - "changepoint1_2018_01_04_00": np.array([1.0, 1.0, 1.0, 1.0, 1.0]), - # Weekly seasonality with interaction (total 4 columns) - "sin1_tow_weekly": np.array([2.0, 2.0, 2.0, 2.0, 2.0]), - "cos1_tow_weekly": np.array([2.0, 2.0, 2.0, 2.0, 2.0]), - "is_weekend[T.True]:sin1_tow_weekly": np.array([2.0, 2.0, 2.0, 2.0, 2.0]), - "is_weekend[T.True]:cos1_tow_weekly": np.array([2.0, 2.0, 2.0, 2.0, 2.0]), - # Yearly seasonality (total 6 columns) - "sin1_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "cos1_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "sin2_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "cos2_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "sin3_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - "cos3_ct1_yearly": np.array([3.0, 3.0, 3.0, 3.0, 3.0]), - # Holiday with pre and post effect (1 where the date and event match) - # e.g.
New Years Day is 1 at 1st January, 0 rest of the days - "Q('events_New Years Day')[T.event]": np.array([1.0, 0.0, 0.0, 0.0, 0.0]), - "Q('events_New Years Day_minus_1')[T.event]": np.array([0.0, 0.0, 0.0, 0.0, 0.0]), - "Q('events_New Years Day_minus_2')[T.event]": np.array([0.0, 0.0, 0.0, 0.0, 0.0]), - "Q('events_New Years Day_plus_1')[T.event]": np.array([0.0, 1.0, 0.0, 0.0, 0.0]), - "Q('events_New Years Day_plus_2')[T.event]": np.array([0.0, 0.0, 1.0, 0.0, 0.0]), - }) - components = silverkite_diagnostics.get_silverkite_components(df, time_col, value_col, feature_df) - - # Check plot_silverkite_components with defaults - fig = silverkite_diagnostics.plot_silverkite_components(components) - assert len(fig.data) == 5 + 2 # 2 changepoints - assert [fig.data[i].name for i in range(len(fig.data))] == list(components.columns)[1: -1] + ["trend change point"] * 2 - - assert fig.layout.height == (len(fig.data) - 2) * 350 # changepoints do not create separate subplots - assert fig.layout.showlegend is True # legend for changepoints - assert fig.layout.title["text"] == "Component plots" - assert fig.layout.title["x"] == 0.5 - - assert fig.layout.xaxis.title["text"] == time_col - assert fig.layout.xaxis2.title["text"] == time_col - assert fig.layout.xaxis3.title["text"] == "Day of week" - assert fig.layout.xaxis4.title["text"] == "Time of year" - assert fig.layout.xaxis5.title["text"] == time_col - - assert fig.layout.yaxis.title["text"] == value_col - assert fig.layout.yaxis2.title["text"] == "trend" - assert fig.layout.yaxis3.title["text"] == "weekly" - assert fig.layout.yaxis4.title["text"] == "yearly" - assert fig.layout.yaxis5.title["text"] == "events" - - # Check plot_silverkite_components with provided component list and warnings - with pytest.warns(Warning) as record: - names = ["YEARLY_SEASONALITY", value_col, "DUMMY"] - title = "Component plot without trend and weekly seasonality" - fig = silverkite_diagnostics.plot_silverkite_components(components, names=names, title=title) - - expected_length = 2 - assert len(fig.data) == expected_length - assert [fig.data[i].name for i in range(len(fig.data))] == [value_col, "YEARLY_SEASONALITY"] - - assert fig.layout.height == expected_length*350 - assert fig.layout.showlegend is True - assert fig.layout.title["text"] == title - assert fig.layout.title["x"] == 0.5 - - assert fig.layout.xaxis.title["text"] == time_col - assert fig.layout.xaxis2.title["text"] == "Time of year" - - assert fig.layout.yaxis.title["text"] == value_col - assert fig.layout.yaxis2.title["text"] == "yearly" - assert f"The following components have not been specified in the model: " \ - f"{{'DUMMY'}}, plotting the rest." 
in record[0].message.args[0] - - # Check plot_silverkite_components with exception - with pytest.raises(ValueError, match="None of the provided components have been specified in the model."): - names = ["DUMMY"] - silverkite_diagnostics.plot_silverkite_components(components, names=names) diff --git a/greykite/tests/framework/templates/test_silverkite_multistage_template.py b/greykite/tests/framework/templates/test_silverkite_multistage_template.py deleted file mode 100644 index 48ef137..0000000 --- a/greykite/tests/framework/templates/test_silverkite_multistage_template.py +++ /dev/null @@ -1,865 +0,0 @@ -import datetime - -import numpy as np -import pytest -from testfixtures import LogCapture - -from greykite.common.constants import LOGGER_NAME -from greykite.common.constants import PREDICTED_COL -from greykite.common.constants import PREDICTED_LOWER_COL -from greykite.common.constants import PREDICTED_UPPER_COL -from greykite.common.constants import TIME_COL -from greykite.common.constants import VALUE_COL -from greykite.common.testing_utils import generate_df_for_tests -from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam -from greykite.framework.templates.autogen.forecast_config import ForecastConfig -from greykite.framework.templates.autogen.forecast_config import MetadataParam -from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam -from greykite.framework.templates.forecaster import Forecaster -from greykite.framework.templates.silverkite_multistage_template import SilverkiteMultistageTemplate -from greykite.framework.templates.silverkite_multistage_template_config import SILVERKITE_TWO_STAGE -from greykite.framework.templates.silverkite_multistage_template_config import SilverkiteMultistageTemplateConfig -from greykite.framework.templates.simple_silverkite_template import SimpleSilverkiteTemplate -from greykite.sklearn.estimator.simple_silverkite_estimator import SimpleSilverkiteEstimator -from greykite.sklearn.uncertainty.uncertainty_methods import UncertaintyMethodEnum - - -@pytest.fixture -def df(): - df = generate_df_for_tests( - freq="H", - periods=24 * 7 * 8, - train_start_date=datetime.datetime(2018, 1, 1), - conti_year_origin=2018)["df"] - df["regressor"] = np.arange(len(df)) - return df - - -@pytest.fixture -def silverkite_multistage_configs(): - configs = [ - SilverkiteMultistageTemplateConfig( - train_length="30D", - fit_length=None, - agg_func="nanmean", - agg_freq="D", - model_template="SILVERKITE", - model_components=ModelComponentsParam( - seasonality={ - "yearly_seasonality": 12, - "quarterly_seasonality": 5, - "monthly_seasonality": 5, - "weekly_seasonality": 4, - "daily_seasonality": 0, - }, - growth={ - "growth_term": "linear" - }, - events={ - "holidays_to_model_separately": "auto", - "holiday_lookup_countries": "auto", - "holiday_pre_num_days": 1, - "holiday_post_num_days": 1, - "holiday_pre_post_num_dict": None, - "daily_event_df_dict": None, - }, - changepoints={ - "changepoints_dict": None, - "seasonality_changepoints_dict": None - }, - autoregression={ - "autoreg_dict": "auto" - }, - regressors={ - "regressor_cols": [] - }, - lagged_regressors={ - "lagged_regressor_dict": None - }, - uncertainty={ - "uncertainty_dict": None - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - "feature_sets_enabled": "auto", # "auto" based on data freq and size - "max_daily_seas_interaction_order": 0, - "max_weekly_seas_interaction_order": 2, - 
"extra_pred_cols": [], - "min_admissible_value": None, - "max_admissible_value": None, - } - ) - ), - SilverkiteMultistageTemplateConfig( - train_length="7D", - fit_length=None, - agg_func="nanmean", - agg_freq=None, - model_template="SILVERKITE", - model_components=ModelComponentsParam( - seasonality={ - "yearly_seasonality": 0, - "quarterly_seasonality": 0, - "monthly_seasonality": 0, - "weekly_seasonality": 0, - "daily_seasonality": 12, - }, - growth={ - "growth_term": None - }, - events={ - "holidays_to_model_separately": [], - "holiday_lookup_countries": [], - "holiday_pre_num_days": 0, - "holiday_post_num_days": 0, - "holiday_pre_post_num_dict": None, - "daily_event_df_dict": None, - }, - changepoints={ - "changepoints_dict": None, - "seasonality_changepoints_dict": None - }, - autoregression={ - "autoreg_dict": "auto" - }, - regressors={ - "regressor_cols": [] - }, - lagged_regressors={ - "lagged_regressor_dict": None - }, - uncertainty={ - "uncertainty_dict": None - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - "feature_sets_enabled": "auto", # "auto" based on data freq and size - "max_daily_seas_interaction_order": 5, - "max_weekly_seas_interaction_order": 2, - "extra_pred_cols": [], - "min_admissible_value": None, - "max_admissible_value": None, - } - ) - ) - ] - return configs - - -@pytest.fixture -def forecast_config(silverkite_multistage_configs): - forecast_config = ForecastConfig( - model_template="SILVERKITE_TWO_STAGE", - forecast_horizon=12, - metadata_param=MetadataParam( - time_col=TIME_COL, - value_col=VALUE_COL, - freq="H" - ), - model_components_param=ModelComponentsParam( - custom=dict( - silverkite_multistage_configs=silverkite_multistage_configs - ) - ), - evaluation_period_param=EvaluationPeriodParam( - cv_max_splits=1, - cv_horizon=12, - test_horizon=12 - ) - ) - return forecast_config - - -def test_get_regressor_cols(df, forecast_config): - """Tests the `self.get_regressor_cols` method.""" - template = SilverkiteMultistageTemplate() - df["reg1"] = 1 - df["reg2"] = 2 - template.df = df - template.config = forecast_config - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][0].model_components.regressors["regressor_cols"] = ["reg1"] - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_components.regressors["regressor_cols"] = ["reg2"] - regressor_cols = template.get_regressor_cols() - assert set(regressor_cols) == {"reg1", "reg2"} - - -def test_get_lagged_regressor_info(df, forecast_config): - template = SilverkiteMultistageTemplate() - df["reg1"] = 1 - df["reg2"] = 2 - template.df = df - template.config = forecast_config - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][0].model_components.lagged_regressors["lagged_regressor_dict"] = [{ - "reg1": { - "lag_dict": {"orders": [12]}, - "series_na_fill_func": lambda s: s.bfill().ffill()} - }] - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_components.lagged_regressors["lagged_regressor_dict"] = [{ - "reg2": { - "lag_dict": {"orders": [12]}, - "series_na_fill_func": lambda s: s.bfill().ffill()} - }] - lagged_regressor_info = template.get_lagged_regressor_info() - assert lagged_regressor_info == dict( - lagged_regressor_cols=["reg1", "reg2"], - overall_min_lag_order=12.0, - overall_max_lag_order=12.0 - ) - - -def test_get_hyperparameter_grid(df, forecast_config): - template = SilverkiteMultistageTemplate() - - # 
Error when `self.config` is not available. - with pytest.raises( - ValueError, - match="Forecast config must be provided"): - template.get_hyperparameter_grid() - - template.df = df - # Adds a list of length 2 to each submodel. - # The result hyperparameter grid should have 2 * 2 = 4 grids. - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][0].model_components.seasonality["weekly_seasonality"] = [1, 2] - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_components.seasonality["daily_seasonality"] = [10, 12] - template.config = forecast_config - hyperparameter_grid = template.get_hyperparameter_grid() - assert hyperparameter_grid["estimator__forecast_horizon"] == [12] - assert hyperparameter_grid["estimator__freq"] == ["H"] - assert len(hyperparameter_grid["estimator__model_configs"]) == 4 - assert hyperparameter_grid["estimator__model_configs"][0][0].estimator_params["weekly_seasonality"] == 1 - assert hyperparameter_grid["estimator__model_configs"][0][1].estimator_params["daily_seasonality"] == 10 - assert hyperparameter_grid["estimator__model_configs"][1][0].estimator_params["weekly_seasonality"] == 1 - assert hyperparameter_grid["estimator__model_configs"][1][1].estimator_params["daily_seasonality"] == 12 - assert hyperparameter_grid["estimator__model_configs"][2][0].estimator_params["weekly_seasonality"] == 2 - assert hyperparameter_grid["estimator__model_configs"][2][1].estimator_params["daily_seasonality"] == 10 - assert hyperparameter_grid["estimator__model_configs"][3][0].estimator_params["weekly_seasonality"] == 2 - assert hyperparameter_grid["estimator__model_configs"][3][1].estimator_params["daily_seasonality"] == 12 - - -def test_get_hyperparameter_grid_same_template(df, forecast_config): - # Tests the behavior of using the same ``model_template`` to override. - template = SilverkiteMultistageTemplate() - template.df = df - # Sets weekly seasonality to 5. - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_components.seasonality["weekly_seasonality"] = 5 - # Removes the daily seasonality specification. - del forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_components.seasonality["daily_seasonality"] - template.config = forecast_config - hyperparameter_grid = template.get_hyperparameter_grid() - # The original template has daily seasonality 12 and no weekly seasonality. - # The second model was overriden with the same ``model_template``, which is ``SILVERKITE``, - # so the hyperparameter_grid should have both daily seasonality 12 and weekly seasonality 5. - assert hyperparameter_grid["estimator__model_configs"][0][1].estimator_params["daily_seasonality"] == 12 - assert hyperparameter_grid["estimator__model_configs"][0][1].estimator_params["weekly_seasonality"] == 5 - - -def test_get_hyperparameter_grid_different_template(df, forecast_config): - # Tests the behavior of using the different ``model_template`` to override. - template = SilverkiteMultistageTemplate() - template.df = df - # Sets the model template to be ``SILVERKITE_EMPTY``. - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_template = "SILVERKITE_EMPTY" - # Sets weekly seasonality to 5. - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_components.seasonality["weekly_seasonality"] = 5 - # Removes the daily seasonality specification. 
- del forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_components.seasonality["daily_seasonality"] - template.config = forecast_config - hyperparameter_grid = template.get_hyperparameter_grid() - # The original template has daily seasonality 12 and no weekly seasonality. - # The second model was overriden with a different ``model_template``, which is ``SILVERKITE_EMPTY``, - # so the hyperparameter_grid should have only weekly seasonality 5 and daily seasonality 0. - assert hyperparameter_grid["estimator__model_configs"][0][1].estimator_params["daily_seasonality"] == 0 - assert hyperparameter_grid["estimator__model_configs"][0][1].estimator_params["weekly_seasonality"] == 5 - - -def test_get_hyperparameter_grid_extra_configs(df, forecast_config): - """Tests gets hyperparameter grid when the default and override have different lengths.""" - # The empty template has no configs. - # The override components has two configs. - forecast_config.model_template = "SILVERKITE_MULTISTAGE_EMPTY" - template = SilverkiteMultistageTemplate() - template.df = df - template.config = forecast_config - # The grid should have exactly two configs which are the same as the override configs. - hyperparameter_grid = template.get_hyperparameter_grid() - assert hyperparameter_grid["estimator__model_configs"][0][0].estimator_params == { - 'yearly_seasonality': 12, - 'quarterly_seasonality': 5, - 'monthly_seasonality': 5, - 'weekly_seasonality': 4, - 'daily_seasonality': 0, - 'growth_term': 'linear', - 'changepoints_dict': None, - 'seasonality_changepoints_dict': None, - 'holidays_to_model_separately': 'auto', - 'holiday_lookup_countries': 'auto', - 'holiday_pre_num_days': 1, - 'holiday_post_num_days': 1, - 'holiday_pre_post_num_dict': None, - 'daily_event_df_dict': None, - 'feature_sets_enabled': 'auto', - 'fit_algorithm_dict': { - 'fit_algorithm': 'ridge', - 'fit_algorithm_params': None}, - 'max_daily_seas_interaction_order': 0, - 'max_weekly_seas_interaction_order': 2, - 'extra_pred_cols': [], - 'drop_pred_cols': None, - 'explicit_pred_cols': None, - 'min_admissible_value': None, - 'max_admissible_value': None, - 'autoreg_dict': 'auto', - 'simulation_num': 10, - 'normalize_method': None, - 'regressor_cols': [], - 'lagged_regressor_dict': None, - 'regression_weight_col': None, - 'uncertainty_dict': None, - 'origin_for_time_vars': None, - 'train_test_thresh': None, - 'training_fraction': None} - assert hyperparameter_grid["estimator__model_configs"][0][1].estimator_params == { - 'yearly_seasonality': 0, - 'quarterly_seasonality': 0, - 'monthly_seasonality': 0, - 'weekly_seasonality': 0, - 'daily_seasonality': 12, - 'growth_term': None, - 'changepoints_dict': None, - 'seasonality_changepoints_dict': None, - 'holidays_to_model_separately': [], - 'holiday_lookup_countries': [], - 'holiday_pre_num_days': 0, - 'holiday_post_num_days': 0, - 'holiday_pre_post_num_dict': None, - 'daily_event_df_dict': None, - 'feature_sets_enabled': 'auto', - 'fit_algorithm_dict': { - 'fit_algorithm': 'ridge', - 'fit_algorithm_params': None}, - 'max_daily_seas_interaction_order': 5, - 'max_weekly_seas_interaction_order': 2, - 'extra_pred_cols': [], - 'drop_pred_cols': None, - 'explicit_pred_cols': None, - 'min_admissible_value': None, - 'max_admissible_value': None, - 'normalize_method': None, - 'autoreg_dict': 'auto', - 'simulation_num': 10, - 'regressor_cols': [], - 'lagged_regressor_dict': None, - 'regression_weight_col': None, - 'uncertainty_dict': None, - 'origin_for_time_vars': None, - 
'train_test_thresh': None, - 'training_fraction': None} - - -def test_get_silverkite_multistage_configs_override(df, forecast_config): - template = SilverkiteMultistageTemplate() - template.df = df - # Adds a list of length 2 to each submodel. - # The result hyperparameter grid should have 2 * 2 = 4 grids. - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][0].model_components.seasonality["weekly_seasonality"] = [1, 2] - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_components.seasonality["daily_seasonality"] = [10, 12] - template.config = forecast_config - - default_model_components = template._SilverkiteMultistageTemplate__get_default_model_components( - forecast_config.model_template) - default_silverkite_multistage_configs = default_model_components.custom.get("silverkite_multistage_configs") - - new_configs = template._SilverkiteMultistageTemplate__get_silverkite_multistage_configs_override( - custom=forecast_config.model_components_param.custom, - model_template="SILVERKITE_TWO_STAGE", - default_silverkite_multistage_configs=default_silverkite_multistage_configs - ) - - assert new_configs == [ - SilverkiteMultistageTemplateConfig( - train_length='30D', - fit_length=None, - agg_func='nanmean', - agg_freq='D', - model_template='SILVERKITE', - model_components=ModelComponentsParam( - autoregression={ - 'autoreg_dict': 'auto' - }, - changepoints={ - 'changepoints_dict': None, - 'seasonality_changepoints_dict': None - }, - custom={ - 'fit_algorithm_dict': { - 'fit_algorithm': 'ridge', - 'fit_algorithm_params': None - }, - 'feature_sets_enabled': 'auto', - 'max_daily_seas_interaction_order': 0, - 'max_weekly_seas_interaction_order': 2, - 'extra_pred_cols': [], - 'min_admissible_value': None, - 'max_admissible_value': None - }, - events={ - 'holidays_to_model_separately': 'auto', - 'holiday_lookup_countries': 'auto', - 'holiday_pre_num_days': 1, - 'holiday_post_num_days': 1, - 'holiday_pre_post_num_dict': None, - 'daily_event_df_dict': None - }, - growth={ - 'growth_term': 'linear' - }, - hyperparameter_override={}, - regressors={ - 'regressor_cols': [] - }, - lagged_regressors={ - 'lagged_regressor_dict': None - }, - seasonality={ - 'yearly_seasonality': 12, - 'quarterly_seasonality': 5, - 'monthly_seasonality': 5, - 'weekly_seasonality': [1, 2], - 'daily_seasonality': 0}, - uncertainty={ - 'uncertainty_dict': None - })), - SilverkiteMultistageTemplateConfig( - train_length='7D', - fit_length=None, - agg_func='nanmean', - agg_freq=None, - model_template='SILVERKITE', - model_components=ModelComponentsParam( - autoregression={ - 'autoreg_dict': 'auto' - }, - changepoints={ - 'changepoints_dict': None, - 'seasonality_changepoints_dict': None - }, - custom={ - 'fit_algorithm_dict': { - 'fit_algorithm': 'ridge', - 'fit_algorithm_params': None - }, - 'feature_sets_enabled': 'auto', - 'max_daily_seas_interaction_order': 5, - 'max_weekly_seas_interaction_order': 2, - 'extra_pred_cols': [], - 'min_admissible_value': None, - 'max_admissible_value': None - }, - events={ - 'holidays_to_model_separately': [], - 'holiday_lookup_countries': [], - 'holiday_pre_num_days': 0, - 'holiday_post_num_days': 0, - 'holiday_pre_post_num_dict': None, - 'daily_event_df_dict': None - }, - growth={ - 'growth_term': None - }, - hyperparameter_override={}, - regressors={ - 'regressor_cols': [] - }, - lagged_regressors={ - 'lagged_regressor_dict': None - }, - seasonality={ - 'yearly_seasonality': 0, - 'quarterly_seasonality': 0, - 
'monthly_seasonality': 0, - 'weekly_seasonality': 0, - 'daily_seasonality': [10, 12] - }, - uncertainty={ - 'uncertainty_dict': None - }))] - - -def test_get_estimators_and_params_from_template_configs(df, forecast_config): - template = SilverkiteMultistageTemplate() - template.df = df - # Adds a list of length 2 to each submodel. - # The result hyperparameter grid should have 2 * 2 = 4 grids. - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][0].model_components.seasonality["weekly_seasonality"] = [1, 2] - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][1].model_components.seasonality["daily_seasonality"] = [10, 12] - template.config = forecast_config - - default_model_components = template._SilverkiteMultistageTemplate__get_default_model_components( - forecast_config.model_template) - default_silverkite_multistage_configs = default_model_components.custom.get("silverkite_multistage_configs") - - new_configs = template._SilverkiteMultistageTemplate__get_silverkite_multistage_configs_override( - custom=forecast_config.model_components_param.custom, - model_template="SILVERKITE_TWO_STAGE", - default_silverkite_multistage_configs=default_silverkite_multistage_configs - ) - - estimator_list, estimator_params_list = template._SilverkiteMultistageTemplate__get_estimators_and_params_from_template_configs( - new_configs=new_configs - ) - - # We can't test ``time_properties`` - for d in estimator_params_list: - del d["estimator__time_properties"] - - assert estimator_list == [SimpleSilverkiteEstimator, SimpleSilverkiteEstimator] - assert estimator_params_list == [ - { - 'estimator__yearly_seasonality': [12], - 'estimator__quarterly_seasonality': [5], - 'estimator__monthly_seasonality': [5], - 'estimator__weekly_seasonality': [1, 2], - 'estimator__daily_seasonality': [0], - 'estimator__growth_term': ['linear'], - 'estimator__changepoints_dict': [None], - 'estimator__seasonality_changepoints_dict': [None], - 'estimator__holidays_to_model_separately': ['auto'], - 'estimator__holiday_lookup_countries': ['auto'], - 'estimator__holiday_pre_num_days': [1], - 'estimator__holiday_post_num_days': [1], - 'estimator__holiday_pre_post_num_dict': [None], - 'estimator__daily_event_df_dict': [None], - 'estimator__feature_sets_enabled': ['auto'], - 'estimator__fit_algorithm_dict': [{ - 'fit_algorithm': 'ridge', - 'fit_algorithm_params': None}], - 'estimator__max_daily_seas_interaction_order': [0], - 'estimator__max_weekly_seas_interaction_order': [2], - 'estimator__extra_pred_cols': [[]], - 'estimator__drop_pred_cols': [None], - 'estimator__explicit_pred_cols': [None], - 'estimator__min_admissible_value': [None], - 'estimator__max_admissible_value': [None], - 'estimator__normalize_method': [None], - 'estimator__autoreg_dict': ['auto'], - 'estimator__simulation_num': [10], - 'estimator__regressor_cols': [[]], - 'estimator__lagged_regressor_dict': [None], - 'estimator__regression_weight_col': [None], - 'estimator__uncertainty_dict': [None], - 'estimator__origin_for_time_vars': [None], - 'estimator__train_test_thresh': [None], - 'estimator__training_fraction': [None] - }, - { - 'estimator__yearly_seasonality': [0], - 'estimator__quarterly_seasonality': [0], - 'estimator__monthly_seasonality': [0], - 'estimator__weekly_seasonality': [0], - 'estimator__daily_seasonality': [10, 12], - 'estimator__growth_term': [None], - 'estimator__changepoints_dict': [None], - 'estimator__seasonality_changepoints_dict': [None], - 
'estimator__holidays_to_model_separately': [[]], - 'estimator__holiday_lookup_countries': [[]], - 'estimator__holiday_pre_num_days': [0], - 'estimator__holiday_post_num_days': [0], - 'estimator__holiday_pre_post_num_dict': [None], - 'estimator__daily_event_df_dict': [None], - 'estimator__feature_sets_enabled': ['auto'], - 'estimator__fit_algorithm_dict': [{ - 'fit_algorithm': 'ridge', - 'fit_algorithm_params': None}], - 'estimator__max_daily_seas_interaction_order': [5], - 'estimator__max_weekly_seas_interaction_order': [2], - 'estimator__extra_pred_cols': [[]], - 'estimator__drop_pred_cols': [None], - 'estimator__explicit_pred_cols': [None], - 'estimator__min_admissible_value': [None], - 'estimator__max_admissible_value': [None], - 'estimator__normalize_method': [None], - 'estimator__autoreg_dict': ['auto'], - 'estimator__simulation_num': [10], - 'estimator__regressor_cols': [[]], - 'estimator__lagged_regressor_dict': [None], - 'estimator__regression_weight_col': [None], - 'estimator__uncertainty_dict': [None], - 'estimator__origin_for_time_vars': [None], - 'estimator__train_test_thresh': [None], - 'estimator__training_fraction': [None] - }] - - -def test_flatten_estimator_params_list(): - template = SilverkiteMultistageTemplate() - x = [{ - "estimator__a": [1], - "estimator__b": [2, 3] - }, { - "estimator__c": [4, 5] - }] - flattened_params = template._SilverkiteMultistageTemplate__flatten_estimator_params_list( - estimator_params_list=x - ) - assert flattened_params == [ - [{'a': 1, 'b': 2}, {'c': 4}], - [{'a': 1, 'b': 2}, {'c': 5}], - [{'a': 1, 'b': 3}, {'c': 4}], - [{'a': 1, 'b': 3}, {'c': 5}] - ] - - -def test_silverkite_multistage_model_template(df, forecast_config): - forecaster = Forecaster() - forecast_result = forecaster.run_forecast_config( - df=df, - config=forecast_config - ) - assert forecast_result.backtest is not None - assert forecast_result.grid_search is not None - assert forecast_result.forecast is not None - - assert len(forecast_result.model[-1].models) == 2 - # Checks the forecast horizons in each model. - assert forecast_result.model[-1].models[0].forecast_horizon == 1 # daily model - assert forecast_result.model[-1].models[1].forecast_horizon == 12 # hourly model - - # Checks the autoregression orders are as expected. - assert "y_lag1" in forecast_result.model[-1].models[0].model_dict["x_mat"].columns - assert "y_lag12" in forecast_result.model[-1].models[1].model_dict["x_mat"].columns - - # Checks the forecast is not NAN - assert len(forecast_result.forecast.df_test[PREDICTED_COL].dropna()) == len(forecast_result.forecast.df_test) - assert len(forecast_result.backtest.df_test[PREDICTED_COL].dropna()) == len(forecast_result.backtest.df_test) - - -def test_silverkite_multistage_model_template_with_regressor(df, forecast_config): - forecaster = Forecaster() - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][0].model_components.regressors["regressor_cols"] = ["regressor"] - df.iloc[-12:, 1] = np.nan - forecast_result = forecaster.run_forecast_config( - df=df, - config=forecast_config - ) - assert forecast_result.backtest is not None - assert forecast_result.grid_search is not None - assert forecast_result.forecast is not None - - assert len(forecast_result.model[-1].models) == 2 - # Checks the forecast horizons in each model. 
- assert forecast_result.model[-1].models[0].forecast_horizon == 1 # daily model - assert forecast_result.model[-1].models[1].forecast_horizon == 12 # hourly model - - # Checks the autoregression orders are as expected. - assert "y_lag1" in forecast_result.model[-1].models[0].model_dict["x_mat"].columns - assert "y_lag12" in forecast_result.model[-1].models[1].model_dict["x_mat"].columns - - # Checks that the regressor column is included. - assert "regressor" in forecast_result.model[-1].models[0].model_dict["x_mat"].columns - - # Checks the forecast is not NAN - assert len(forecast_result.forecast.df_test[PREDICTED_COL].dropna()) == len(forecast_result.forecast.df_test) - assert len(forecast_result.backtest.df_test[PREDICTED_COL].dropna()) == len(forecast_result.backtest.df_test) - - -def test_silverkite_multistage_model_template_with_lagged_regressor(df, forecast_config): - forecaster = Forecaster() - forecast_config.model_components_param.custom[ - "silverkite_multistage_configs"][0].model_components.lagged_regressors["lagged_regressor_dict"] = [{ - "regressor": { - "lag_dict": {"orders": [12]}, - "series_na_fill_func": lambda s: s.bfill().ffill()} - }] - forecast_result = forecaster.run_forecast_config( - df=df, - config=forecast_config - ) - assert forecast_result.backtest is not None - assert forecast_result.grid_search is not None - assert forecast_result.forecast is not None - - assert len(forecast_result.model[-1].models) == 2 - # Checks the forecast horizons in each model. - assert forecast_result.model[-1].models[0].forecast_horizon == 1 # daily model - assert forecast_result.model[-1].models[1].forecast_horizon == 12 # hourly model - - # Checks the autoregression orders are as expected. - assert "y_lag1" in forecast_result.model[-1].models[0].model_dict["x_mat"].columns - assert "y_lag12" in forecast_result.model[-1].models[1].model_dict["x_mat"].columns - - # Checks that the regressor column is included. - assert "regressor_lag12" in forecast_result.model[-1].models[0].model_dict["x_mat"].columns - - # Checks the forecast is not NAN - assert len(forecast_result.forecast.df_test[PREDICTED_COL].dropna()) == len(forecast_result.forecast.df_test) - assert len(forecast_result.backtest.df_test[PREDICTED_COL].dropna()) == len(forecast_result.backtest.df_test) - - -def test_errors(df, forecast_config): - # No configs with SILVERKITE_MULTISTAGE_EMPTY. - template = SilverkiteMultistageTemplate() - template.df = df - forecast_config.model_components_param.custom["silverkite_multistage_configs"] = None - forecast_config.model_template = "SILVERKITE_MULTISTAGE_EMPTY" - template.config = forecast_config - with pytest.raises( - ValueError, - match="``SILVERKITE_MULTISTAGE_EMPTY`` can not be used without over"): - template.get_hyperparameter_grid() - - # The config has wrong type. 
- template = SilverkiteMultistageTemplate() - template.df = df - forecast_config.model_components_param.custom["silverkite_multistage_configs"] = 5 - forecast_config.model_template = "SILVERKITE_TWO_STAGE" - template.config = forecast_config - with pytest.raises( - ValueError, - match="The ``silverkite_multistage_configs`` parameter must be a list of"): - template.get_hyperparameter_grid() - - -def test_get_default_model_components(): - template = SilverkiteMultistageTemplate() - assert template._SilverkiteMultistageTemplate__get_default_model_components( - "SILVERKITE_TWO_STAGE") == SILVERKITE_TWO_STAGE - with pytest.raises( - ValueError, - match="The template name "): - template._SilverkiteMultistageTemplate__get_default_model_components("some_template") - - -def test_get_template_class(): - template = SilverkiteMultistageTemplate() - assert template._SilverkiteMultistageTemplate__get_template_class( - ForecastConfig(model_template="SILVERKITE") - ) == SimpleSilverkiteTemplate - with pytest.raises( - ValueError, - match="Currently Silverkite Multistage only supports"): - template._SilverkiteMultistageTemplate__get_template_class( - ForecastConfig(model_template="DAILY_CP_NONE") - ) - - -def test_uncertainty(df, forecast_config): - """Tests the uncertainty methods.""" - - # Tests no coverage and no uncertainty, there is no uncertainty. - - forecaster = Forecaster() - forecast_result = forecaster.run_forecast_config( - df=df, - config=forecast_config - ) - assert PREDICTED_LOWER_COL not in forecast_result.backtest.df_test - assert PREDICTED_LOWER_COL not in forecast_result.forecast.df_test - - # Tests coverage and no uncertainty, there is uncertainty. - forecast_config.coverage = 0.99 - forecaster = Forecaster() - forecast_result = forecaster.run_forecast_config( - df=df, - config=forecast_config - ) - assert PREDICTED_LOWER_COL in forecast_result.backtest.df_test - assert PREDICTED_LOWER_COL in forecast_result.forecast.df_test - assert forecast_result.model[-1].coverage == 0.99 - # Default method is used when coverage is given but ``uncertainty_dict`` is not given. - assert (forecast_result.model[-1].uncertainty_model.UNCERTAINTY_METHOD - == UncertaintyMethodEnum.simple_conditional_residuals.name) - last_interval_width_99 = (forecast_result.forecast.df[PREDICTED_UPPER_COL].iloc[-1] - - forecast_result.forecast.df[PREDICTED_LOWER_COL].iloc[-1]) - - # Tests coverage and uncertainty, there is uncertainty. - forecast_config.model_components_param.uncertainty = dict( - uncertainty_dict=dict( - uncertainty_method=UncertaintyMethodEnum.simple_conditional_residuals.name, - params=dict( - conditional_cols=["dow"] - ) - ) - ) - forecaster = Forecaster() - forecast_result = forecaster.run_forecast_config( - df=df, - config=forecast_config - ) - assert PREDICTED_LOWER_COL in forecast_result.backtest.df_test - assert PREDICTED_LOWER_COL in forecast_result.forecast.df_test - assert forecast_result.model[-1].coverage == 0.99 - # The last 2 days intervals should have different lengths due to conditioning on "dow". - last_day_interval_width_99 = (forecast_result.forecast.df[PREDICTED_UPPER_COL].iloc[-1] - - forecast_result.forecast.df[PREDICTED_LOWER_COL].iloc[-1]) - second_last_day_interval_width_99 = (forecast_result.forecast.df[PREDICTED_UPPER_COL].iloc[-25] - - forecast_result.forecast.df[PREDICTED_LOWER_COL].iloc[-25]) - assert last_day_interval_width_99 != second_last_day_interval_width_99 - - # Tests 95% coverage has narrower interval. 
- forecast_config.coverage = 0.95 - forecast_config.model_components_param.uncertainty = dict( - uncertainty_dict=dict( - uncertainty_method=UncertaintyMethodEnum.simple_conditional_residuals.name, - params=dict() - ) - ) - forecaster = Forecaster() - forecast_result = forecaster.run_forecast_config( - df=df, - config=forecast_config - ) - assert PREDICTED_LOWER_COL in forecast_result.backtest.df_test - assert PREDICTED_LOWER_COL in forecast_result.forecast.df_test - assert forecast_result.model[-1].coverage == 0.95 - # 95 interval is narrower than 99 interval. - last_interval_width_95 = (forecast_result.forecast.df[PREDICTED_UPPER_COL].iloc[-1] - - forecast_result.forecast.df[PREDICTED_LOWER_COL].iloc[-1]) - assert last_interval_width_99 > last_interval_width_95 - - -def test_uncertainty_fail(df, forecast_config): - """Tests the pipeline won't fail when uncertainty fails.""" - with LogCapture(LOGGER_NAME) as log_capture: - forecast_config.coverage = 0.95 - forecast_config.model_components_param.uncertainty = dict( - uncertainty_dict=dict( - uncertainty_method=UncertaintyMethodEnum.simple_conditional_residuals.name, - params=dict( - conditional_cols=["dowww"] - ) - ) - ) - forecaster = Forecaster() - forecast_result = forecaster.run_forecast_config( - df=df, - config=forecast_config - ) - # The forecast is still generated. - assert forecast_result.forecast is not None - assert (LOGGER_NAME, - "WARNING", - "The following errors occurred during fitting the uncertainty model, " - "the uncertainty model is skipped. " - "The following conditional columns are not found in `train_df`: ['dowww'].") in log_capture.actual() diff --git a/greykite/tests/framework/templates/test_silverkite_multistage_template_config.py b/greykite/tests/framework/templates/test_silverkite_multistage_template_config.py deleted file mode 100644 index 5e1d691..0000000 --- a/greykite/tests/framework/templates/test_silverkite_multistage_template_config.py +++ /dev/null @@ -1,160 +0,0 @@ -from greykite.framework.templates.auto_arima_template import AutoArimaTemplate -from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam -from greykite.framework.templates.prophet_template import ProphetTemplate -from greykite.framework.templates.silverkite_multistage_template_config import SILVERKITE_MULTISTAGE_EMPTY -from greykite.framework.templates.silverkite_multistage_template_config import SILVERKITE_TWO_STAGE -from greykite.framework.templates.silverkite_multistage_template_config import SilverkiteMultistageModelTemplateEnum -from greykite.framework.templates.silverkite_multistage_template_config import SilverkiteMultistageTemplateConfig -from greykite.framework.templates.silverkite_multistage_template_config import SilverkiteMultistageTemplateConstants -from greykite.framework.templates.silverkite_template import SilverkiteTemplate -from greykite.framework.templates.simple_silverkite_template import SimpleSilverkiteTemplate - - -def test_silvekite_multistage_template_constants(): - """Tests `silverkite_muiltistage_template_constants`""" - constants = SilverkiteMultistageTemplateConstants() - - assert constants.SILVERKITE_TWO_STAGE == SILVERKITE_TWO_STAGE - assert constants.SILVERKITE_MULTISTAGE_EMPTY == SILVERKITE_MULTISTAGE_EMPTY - assert constants.SilverkiteMultistageModelTemplateEnum == SilverkiteMultistageModelTemplateEnum - - -def test_silverkite_multistage_template_config(): - """Tests the `SilverkiteMultistageTemplateConfig` data class.""" - assert SilverkiteMultistageTemplateConfig.train_length 
== f"{7 * 56}D" - assert SilverkiteMultistageTemplateConfig.fit_length is None - assert SilverkiteMultistageTemplateConfig.agg_freq is None - assert SilverkiteMultistageTemplateConfig.agg_func == "nanmean" - assert SilverkiteMultistageTemplateConfig.model_template == "SILVERKITE" - assert SilverkiteMultistageTemplateConfig.model_components is None - - -def test_silverkite_multistage(): - """Tests the SILVERKITE_TWO_STAGE template. To alert any changes to the template.""" - assert len(SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"]) == 2 - - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][0].train_length == f"{7 * 56}D" - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][0].fit_length is None - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][0].agg_func == "nanmean" - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][0].agg_freq == "D" - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][0].model_template == "SILVERKITE" - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][0].model_components == ModelComponentsParam( - seasonality={ - "yearly_seasonality": 12, - "quarterly_seasonality": 5, - "monthly_seasonality": 5, - "weekly_seasonality": 4, - "daily_seasonality": 0, - }, - growth={ - "growth_term": "linear" - }, - events={ - "holidays_to_model_separately": "auto", - "holiday_lookup_countries": "auto", - "holiday_pre_num_days": 1, - "holiday_post_num_days": 1, - "holiday_pre_post_num_dict": None, - "daily_event_df_dict": None, - }, - changepoints={ - "changepoints_dict": { - "method": "auto", - "resample_freq": "D", - "regularization_strength": 0.5, - "potential_changepoint_distance": "15D", - "no_changepoint_distance_from_end": "30D", - "yearly_seasonality_order": 15, - "yearly_seasonality_change_freq": "365D" - }, - "seasonality_changepoints_dict": None - }, - autoregression={ - "autoreg_dict": "auto" - }, - regressors={ - "regressor_cols": [] - }, - lagged_regressors={ - "lagged_regressor_dict": None - }, - uncertainty={ - "uncertainty_dict": None - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - "feature_sets_enabled": "auto", # "auto" based on data freq and size - "max_daily_seas_interaction_order": 0, - "max_weekly_seas_interaction_order": 2, - "extra_pred_cols": [], - "min_admissible_value": None, - "max_admissible_value": None, - } - ) - - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][1].train_length == f"{7 * 4}D" - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][1].fit_length is None - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][1].agg_func == "nanmean" - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][1].agg_freq is None - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][1].model_template == "SILVERKITE" - assert SILVERKITE_TWO_STAGE.custom["silverkite_multistage_configs"][1].model_components == ModelComponentsParam( - seasonality={ - "yearly_seasonality": 0, - "quarterly_seasonality": 0, - "monthly_seasonality": 0, - "weekly_seasonality": 0, - "daily_seasonality": 12, - }, - growth={ - "growth_term": None - }, - events={ - "holidays_to_model_separately": [], - "holiday_lookup_countries": [], - "holiday_pre_num_days": 0, - "holiday_post_num_days": 0, - "holiday_pre_post_num_dict": None, - "daily_event_df_dict": None, - }, - changepoints={ - "changepoints_dict": None, - 
"seasonality_changepoints_dict": None - }, - autoregression={ - "autoreg_dict": "auto" - }, - regressors={ - "regressor_cols": [] - }, - lagged_regressors={ - "lagged_regressor_dict": None - }, - uncertainty={ - "uncertainty_dict": None - }, - custom={ - "fit_algorithm_dict": { - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - "feature_sets_enabled": "auto", # "auto" based on data freq and size - "max_daily_seas_interaction_order": 5, - "max_weekly_seas_interaction_order": 2, - "extra_pred_cols": [], - "min_admissible_value": None, - "max_admissible_value": None, - } - ) - - -def test_silverkite_multistage_model_template_enum(): - """Tests the members of `SilverkiteMultistageModelTemplateEnum`.""" - assert SilverkiteMultistageModelTemplateEnum.SILVERKITE.value == SimpleSilverkiteTemplate - assert SilverkiteMultistageModelTemplateEnum.SILVERKITE_EMPTY.value == SimpleSilverkiteTemplate - assert SilverkiteMultistageModelTemplateEnum.SILVERKITE_WITH_AR.value == SimpleSilverkiteTemplate - assert SilverkiteMultistageModelTemplateEnum.PROPHET.value == ProphetTemplate - assert SilverkiteMultistageModelTemplateEnum.SK.value == SilverkiteTemplate - assert SilverkiteMultistageModelTemplateEnum.AUTO_ARIMA.value == AutoArimaTemplate diff --git a/greykite/tests/sklearn/estimator/test_silverkite_multistage_estimator.py b/greykite/tests/sklearn/estimator/test_silverkite_multistage_estimator.py deleted file mode 100644 index 92736ec..0000000 --- a/greykite/tests/sklearn/estimator/test_silverkite_multistage_estimator.py +++ /dev/null @@ -1,518 +0,0 @@ -import datetime - -import numpy as np -import pandas as pd -import pytest -from testfixtures import LogCapture - -import greykite.common.constants as cst -from greykite.common.testing_utils import generate_df_for_tests -from greykite.sklearn.estimator.silverkite_multistage_estimator import AggregationFunctionEnum -from greykite.sklearn.estimator.silverkite_multistage_estimator import SilverkiteMultistageEstimator -from greykite.sklearn.estimator.silverkite_multistage_estimator import SilverkiteMultistageModelConfig -from greykite.sklearn.estimator.simple_silverkite_estimator import SimpleSilverkiteEstimator -from greykite.sklearn.uncertainty.uncertainty_methods import UncertaintyMethodEnum - - -@pytest.fixture -def params(): - params = dict( - forecast_horizon=12, - freq="H", - model_configs=[ - SilverkiteMultistageModelConfig( - train_length="30D", - fit_length="30D", - agg_func="mean", - agg_freq="D", - estimator=SimpleSilverkiteEstimator, - estimator_params=dict( - coverage=None, - forecast_horizon=1, - freq="D", - daily_seasonality=0, - weekly_seasonality=3, - quarterly_seasonality=5, - monthly_seasonality=0, - yearly_seasonality=0, - changepoints_dict=None, - autoreg_dict="auto", - holidays_to_model_separately="auto", - holiday_lookup_countries="auto", - holiday_pre_num_days=1, - holiday_post_num_days=1, - holiday_pre_post_num_dict=None, - daily_event_df_dict=None, - fit_algorithm_dict={ - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - feature_sets_enabled="auto", - max_daily_seas_interaction_order=5, - max_weekly_seas_interaction_order=2, - extra_pred_cols=[], - min_admissible_value=None, - normalize_method="min_max" - ) - ), - SilverkiteMultistageModelConfig( - train_length="7D", - fit_length="7D", - agg_func="mean", - agg_freq=None, - estimator=SimpleSilverkiteEstimator, - estimator_params=dict( - coverage=None, - forecast_horizon=12, - freq="H", - growth_term=None, - daily_seasonality=12, - weekly_seasonality=0, - 
quarterly_seasonality=0, - monthly_seasonality=0, - yearly_seasonality=0, - autoreg_dict="auto", - holidays_to_model_separately=[], - holiday_lookup_countries=[], - holiday_pre_num_days=0, - holiday_post_num_days=0, - holiday_pre_post_num_dict=None, - daily_event_df_dict=None, - fit_algorithm_dict={ - "fit_algorithm": "ridge", - "fit_algorithm_params": None, - }, - feature_sets_enabled="auto", - max_daily_seas_interaction_order=5, - max_weekly_seas_interaction_order=2, - regressor_cols=None, - extra_pred_cols=None, - min_admissible_value=None, - normalize_method="min_max" - ) - ) - ] - ) - return params - - -@pytest.fixture -def hourly_data_with_reg(): - df = generate_df_for_tests( - freq="H", - periods=24 * 7 * 8, - train_start_date=datetime.datetime(2018, 1, 1), - conti_year_origin=2018)["df"] - df["regressor"] = np.arange(len(df)) - return df - - -def test_silverkite_multistage_model_config(): - """Tests the default parameters in ``SilverkiteMultistageModelConfig``.""" - config = SilverkiteMultistageModelConfig() - assert config.train_length == f"{7 * 56}D" - assert config.fit_length is None - assert config.agg_func == "nanmean" - assert config.agg_freq is None - assert config.estimator == SimpleSilverkiteEstimator - assert config.estimator_params is None - - -def test_aggregate_function_enum(): - """Tests the functions in ``AggregationFunctionEnum``.""" - array = np.array([1, 2, 6]) - assert AggregationFunctionEnum.mean.value(array) == 3 - assert AggregationFunctionEnum.median.value(array) == 2 - assert AggregationFunctionEnum.nanmean.value(array) == 3 - assert AggregationFunctionEnum.maximum.value(array) == 6 - assert AggregationFunctionEnum.minimum.value(array) == 1 - - -def test_set_up(params): - """Tests the setup of ``SilverkiteMultistageEstimator``.""" - # Instantiation. - model = SilverkiteMultistageEstimator(**params) - assert model.model_configs == params["model_configs"] - assert model.forecast_horizon == params["forecast_horizon"] - assert model.freq == params["freq"] - assert model.train_lengths is None - assert model.fit_lengths is None - assert model.agg_funcs is None - assert model.agg_freqs is None - assert model.estimators is None - assert model.estimator_params is None - assert model.train_lengths_in_seconds is None - assert model.fit_lengths_in_seconds is None - assert model.fit_lengths_in_seconds is None - assert model.max_ar_orders is None - assert model.data_freq_in_seconds is None - assert model.num_points_per_agg_freqs is None - assert model.models is None - assert model.fit_df is None - assert model.train_end is None - - # Initialization for some derived parameters.
- model._initialize() - assert model.train_lengths == ["30D", "7D"] - assert model.fit_lengths == ["30D", "7D"] - assert len(model.agg_funcs) == 2 - assert model.agg_freqs == ["D", "H"] - assert model.estimators == [SimpleSilverkiteEstimator, SimpleSilverkiteEstimator] - assert model.estimator_params == [config.estimator_params for config in params["model_configs"]] - assert model.train_lengths_in_seconds == [60 * 60 * 24 * 30, 60 * 60 * 24 * 7] - assert model.fit_lengths_in_seconds == [60 * 60 * 24 * 30, 60 * 60 * 24 * 7] - assert len(model.models) == 2 - assert model.data_freq_in_seconds == 60 * 60 - - -def test_get_agg_func(params): - model = SilverkiteMultistageEstimator(**params) - with pytest.raises( - ValueError, - match="The aggregation function "): - model._get_agg_func("some_function") - - -def test_get_freq_col(params): - model = SilverkiteMultistageEstimator(**params) - model.time_col_ = "ttt" - freq_col = model._get_freq_col(freq="D") - assert freq_col == "ttt__D" - - -def test_get_non_time_cols(params): - model = SilverkiteMultistageEstimator(**params) - model.time_col_ = "ttt" - columns = ["ttt", "ttt__D", "ttt__5T", "ttt_H", "value", "reg"] - non_time_cols = model._get_non_time_cols(columns=columns) - assert non_time_cols == ["ttt_H", "value", "reg"] - - -def test_get_num_points_per_agg_freq(params): - model = SilverkiteMultistageEstimator(**params) - assert model._get_num_points_per_agg_freq( - data_freq="H", - agg_freqs=["D", "2H"]) == [24, 2] - - -def test_add_agg_freq_cols(params, hourly_data_with_reg): - model = SilverkiteMultistageEstimator(**params) - model.time_col_ = cst.TIME_COL - model.value_col_ = cst.VALUE_COL - model._initialize() - df = model._add_agg_freq_cols(df=hourly_data_with_reg) - assert df.shape[1] == 5 # includes the original 3 and 2 extra columns from the two aggregations. - assert list(df.columns) == [cst.TIME_COL, cst.VALUE_COL, "regressor", - f"{cst.TIME_COL}__D", f"{cst.TIME_COL}__H"] - assert df[f"{cst.TIME_COL}__D"].unique().shape[0] == 7 * 8 # data has 8 weeks (7 * 8 days). - assert df[f"{cst.TIME_COL}__H"].unique().shape[0] == df.shape[0] # hourly is the original freq. - - # Tests error. - with pytest.raises( - ValueError, - match="The df size is zero. Does your"): - model._add_agg_freq_cols(df.iloc[:0]) - - -def test_drop_incomplete_agg(params, hourly_data_with_reg): - model = SilverkiteMultistageEstimator(**params) - model.time_col_ = cst.TIME_COL - model.value_col_ = cst.VALUE_COL - model._initialize() - df = model._add_agg_freq_cols(df=hourly_data_with_reg) - df = df.iloc[:-1] # removes the last row so the last period becomes incomplete. - df_new = model._drop_incomplete_agg( - df=df, - agg_freq="D", - location=-1, - num_points_per_agg_freq=24 - ) - assert len(df_new) == len(hourly_data_with_reg) - 24 # minus 1 day. - assert df_new.reset_index(drop=True).equals(df.iloc[:-23]) - - -def test_aggregate_values(params, hourly_data_with_reg): - model = SilverkiteMultistageEstimator(**params) - model.time_col_ = cst.TIME_COL - model.value_col_ = cst.VALUE_COL - model._initialize() - df = model._add_agg_freq_cols(df=hourly_data_with_reg) - df_agg = model._aggregate_values( - df=df[[f"{cst.TIME_COL}__D", cst.VALUE_COL, "regressor"]], - agg_freq="D", - agg_func=np.nanmean - ) - assert len(df_agg) == 7 * 8 # data is 8 weeks.
- assert round(df_agg[cst.VALUE_COL].iloc[0], 3) == round(df[cst.VALUE_COL].iloc[:24].mean(), 3) - assert round(df_agg["regressor"].iloc[0], 3) == round(df["regressor"].iloc[:24].mean(), 3) - - -def test_get_agg_dfs(params, hourly_data_with_reg): - model = SilverkiteMultistageEstimator(**params) - model.time_col_ = cst.TIME_COL - model.value_col_ = cst.VALUE_COL - model._initialize() - df = model._add_agg_freq_cols(df=hourly_data_with_reg) - df = df.iloc[1:-1] # both the beginning period and the end period are incomplete. - result = model._get_agg_dfs( - df=df, - agg_freq="D", - agg_func=np.mean, - train_length_in_seconds=60 * 60 * 24 * 30, - fit_length_in_seconds=60 * 60 * 24 * 30, - num_points_per_agg_freq=24, - max_ar_order=5 - ) - assert len(result["train_df"]) == 30 - assert len(result["fit_df"]) == 32 # fit includes incomplete periods on purpose - # ``past_df`` includes 1 more period to avoid errors - # This is to ensure there is no gap between ``past_df`` and ``train_df``, - # as well as to ensure we have at least the length of ``past_df`` needed for AR. - # Extra terms of ``past_df`` will be handled in ``SilverkiteForecast``. - assert len(result["past_df"]) == 6 - assert result["fit_df_has_incomplete_period"] is True - - -def test_get_silverkite_ar_max_order(params): - model = SilverkiteMultistageEstimator(**params) - model.time_col_ = cst.TIME_COL - model.value_col_ = cst.VALUE_COL - model._initialize() - assert model._get_silverkite_ar_max_order() == [21, 24 * 21] - - -def test_train_and_predict(params, hourly_data_with_reg): - """Tests train and prediction functionality.""" - params["model_configs"][0].estimator_params["regressor_cols"] = ["regressor"] - model = SilverkiteMultistageEstimator(**params) - # fit - model.fit(hourly_data_with_reg) - # predict training period - pred = model.predict(hourly_data_with_reg) - assert pred.shape[0] == hourly_data_with_reg.shape[0] - # predict future period - pred = model.predict(pd.DataFrame({ - cst.TIME_COL: pd.date_range(start=hourly_data_with_reg[cst.TIME_COL].max(), freq="H", periods=13)[1:], - cst.VALUE_COL: np.nan, - "regressor": 1 - })) - assert pred.shape[0] == 12 - assert pred[cst.PREDICTED_COL].dropna().shape[0] == 12 - # checks values- - assert model.fit_df[f"{cst.VALUE_COL}__D"].dropna().shape[0] >= 30 # daily training size - assert model.fit_df[f"{cst.VALUE_COL}__H"].dropna().shape[0] >= 24 * 7 # hourly training size - assert model.fit_df[f"{cst.PREDICTED_COL}__D"].dropna().shape[0] >= 30 # daily fit size - assert model.fit_df[f"{cst.PREDICTED_COL}__H"].dropna().shape[0] >= 30 # daily fit size - - # makes sure the AR orders are correct - assert "y_lag1" in model.models[0].model_dict["x_mat"].columns - assert "y_lag12" in model.models[1].model_dict["x_mat"].columns - # components plot - plots = model.plot_components() - assert len(plots) == 2 - # summary - summaries = model.summary() - assert len(summaries) == 2 - - -def test_error(params, hourly_data_with_reg): - model = SilverkiteMultistageEstimator(**params) - - # Calling plot components or summary before fitting. - with pytest.raises( - ValueError, - match="Please call `fit` before"): - model.plot_components() - - with pytest.raises( - ValueError, - match="Please call `fit` before"): - model.summary() - - # Minimum aggregation frequency is less than data frequency. 
- params["model_configs"][0].agg_freq = "5T"
- with pytest.raises(
- ValueError,
- match="The minimum aggregation frequency"):
- model.fit(hourly_data_with_reg)
-
-
-def test_incomplete_fit_df_warning(params, hourly_data_with_reg):
- model = SilverkiteMultistageEstimator(**params)
- model.model_configs[0].estimator_params["regressor_cols"] = ["regressor"]
- with LogCapture(cst.LOGGER_NAME) as log_capture:
- model.fit(X=hourly_data_with_reg.iloc[:-1]) # The last period is incomplete.
- log_capture.check(
- (cst.LOGGER_NAME,
- "WARNING",
- "There are incomplete periods in `fit_df`, thus the regressor values are "
- "biased after aggregation.")
- )
-
-
-def test_missing_timestamps_during_aggregation(params, hourly_data_with_reg):
- model = SilverkiteMultistageEstimator(**params)
- model.time_col_ = cst.TIME_COL
- model.value_col_ = cst.VALUE_COL
- model._initialize()
- df = model._add_agg_freq_cols(df=hourly_data_with_reg)
- df = df.iloc[1:-1] # both the beginning period and the end period are incomplete.
- # Removes one timestamp in the middle.
- df = pd.concat([df.iloc[:50], df.iloc[51:]], axis=0).reset_index(drop=True)
- with LogCapture(cst.LOGGER_NAME) as log_capture:
- model._drop_incomplete_agg_and_aggregate_values(
- df=df,
- agg_freq="D",
- agg_func=np.mean,
- num_points_per_agg_freq=24,
- drop_incomplete=True
- )
- log_capture.check(
- (cst.LOGGER_NAME,
- "WARNING",
- "There are missing timestamps in `df` when performing aggregation with "
- "frequency D. These points are ts y\nts "
- "\n2018-01-03 23 23. "
- "This may cause the aggregated values to be biased.")
- )
-
-
-def test_infer_forecast_horizons(hourly_data_with_reg, params):
- """Tests that the estimator infers the correct forecast horizon for
- each stage of the model under different situations.
- """
- model = SilverkiteMultistageEstimator(**params)
- # The default forecast horizon is 12.
- # We truncate df so that the future period overlaps 2 daily periods.
- df = hourly_data_with_reg.iloc[:-3]
- model.fit(df)
- assert model.forecast_horizons == (2, 12)
- # Without truncation, the future period overlaps only 1 daily period.
- model.fit(hourly_data_with_reg)
- assert model.forecast_horizons == (1, 12)
-
-
-def test_short_fit_length(params):
- params["model_configs"][0].fit_length = "29D"
- model = SilverkiteMultistageEstimator(**params)
- with LogCapture(cst.LOGGER_NAME) as log_capture:
- model._initialize()
- log_capture.check(
- (cst.LOGGER_NAME,
- "INFO",
- "Some `fit_length` is None or is shorter than `train_length`. "
- "These `fit_length` have been replaced with `train_length`.")
- )
-
-
-def test_same_agg_freq(params):
- params["model_configs"][1].agg_freq = "D"
- model = SilverkiteMultistageEstimator(**params)
- with pytest.raises(
- ValueError,
- match="Models from different stages should have different aggregation"):
- model._initialize()
-
-
-def test_uncertainty(hourly_data_with_reg, params):
- # ``uncertainty_dict`` is given.
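- # With the uncertainty method configured, `predict` should return lower/upper bound columns;
- # the assertions below also check that the interval is symmetric around the point forecast
- # (lower + upper == 2 * predicted, after rounding).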
- params["uncertainty_dict"] = dict( - uncertainty_method=UncertaintyMethodEnum.simple_conditional_residuals.name, - params={} - ) - model = SilverkiteMultistageEstimator(**params) - # fit - model.fit(hourly_data_with_reg) - # predict - pred = model.predict(pd.DataFrame({ - cst.TIME_COL: pd.date_range(start=hourly_data_with_reg[cst.TIME_COL].max(), freq="H", periods=13)[1:], - cst.VALUE_COL: np.nan, - })) - assert cst.PREDICTED_LOWER_COL in pred - assert cst.PREDICTED_UPPER_COL in pred - assert (pred[cst.PREDICTED_LOWER_COL] + pred[cst.PREDICTED_UPPER_COL]).round(2).equals( - (pred[cst.PREDICTED_COL] * 2).round(2)) - - # ``uncertainty_dict`` is not given but coverage is given. - del params["uncertainty_dict"] - params["coverage"] = 0.95 - model = SilverkiteMultistageEstimator(**params) - # fit - model.fit(hourly_data_with_reg) - # predict - pred = model.predict(pd.DataFrame({ - cst.TIME_COL: pd.date_range(start=hourly_data_with_reg[cst.TIME_COL].max(), freq="H", periods=13)[1:], - cst.VALUE_COL: np.nan, - })) - assert cst.PREDICTED_LOWER_COL in pred - assert cst.PREDICTED_UPPER_COL in pred - assert (pred[cst.PREDICTED_LOWER_COL] + pred[cst.PREDICTED_UPPER_COL]).round(2).equals( - (pred[cst.PREDICTED_COL] * 2).round(2)) - - # ``uncertainty_dict`` and ``coverage`` are not given. - del params["coverage"] - model = SilverkiteMultistageEstimator(**params) - # fit - model.fit(hourly_data_with_reg) - # predict - pred = model.predict(pd.DataFrame({ - cst.TIME_COL: pd.date_range(start=hourly_data_with_reg[cst.TIME_COL].max(), freq="H", periods=13)[1:], - cst.VALUE_COL: np.nan, - })) - assert cst.PREDICTED_LOWER_COL not in pred - assert cst.PREDICTED_UPPER_COL not in pred - - -def test_uncertainty_nonstandard_cols(hourly_data_with_reg, params): - params["uncertainty_dict"] = dict( - uncertainty_method=UncertaintyMethodEnum.simple_conditional_residuals.name, - params={} - ) - model = SilverkiteMultistageEstimator(**params) - # fit - model.fit( - hourly_data_with_reg.rename(columns={ - cst.TIME_COL: "t", - cst.VALUE_COL: "z" - }), - time_col="t", - value_col="z" - ) - # predict - pred = model.predict(pd.DataFrame({ - "t": pd.date_range(start=hourly_data_with_reg[cst.TIME_COL].max(), freq="H", periods=13)[1:], - "z": np.nan, - })) - assert cst.PREDICTED_LOWER_COL in pred - assert cst.PREDICTED_UPPER_COL in pred - assert (pred[cst.PREDICTED_LOWER_COL] + pred[cst.PREDICTED_UPPER_COL]).round(2).equals( - (pred[cst.PREDICTED_COL] * 2).round(2)) - - -def test_uncertainty_with_error(hourly_data_with_reg, params): - """Tests model still produces results when uncertainty model fails.""" - with LogCapture(cst.LOGGER_NAME) as log_capture: - params["coverage"] = 0.95 - params["uncertainty_dict"] = dict( - uncertainty_method=UncertaintyMethodEnum.simple_conditional_residuals.name, - params={ - "value_col": "non_exist" - } - ) - model = SilverkiteMultistageEstimator(**params) - # fit - model.fit(hourly_data_with_reg) - # predict - pred = model.predict(pd.DataFrame({ - cst.TIME_COL: pd.date_range(start=hourly_data_with_reg[cst.TIME_COL].max(), freq="H", periods=13)[1:], - cst.VALUE_COL: np.nan, - })) - assert pred is not None - assert cst.PREDICTED_LOWER_COL not in pred - assert cst.PREDICTED_UPPER_COL not in pred - assert ( - cst.LOGGER_NAME, - "WARNING", - "The following errors occurred during fitting the uncertainty model, " - "the uncertainty model is skipped. `value_col` non_exist not found in `train_df`." 
- ) in log_capture.actual()
diff --git a/greykite/tests/sklearn/uncertainty/__init__.py b/greykite/tests/sklearn/uncertainty/__init__.py deleted file mode 100644 index e69de29..0000000