Notebook example of Diviner’s GroupedPmdarima API

This notebook shows a comparison of 4 primary means of utilizing the GroupedPmdarima API in Diviner.

Examples shown

Standard ARIMA
A user-supplied manual configuration of ARIMA models to be built based on the configured order terms. This approach is useful for hierarchical multi-series optimization techniques (where ordering terms are determined by evaluating the optimal parameters from a higher level aggregation of disparate series that share a similar seasonality with one another).
Note that this is the fastest execution mode available and is recommended when there is homogeneity amongst a large collection of individual series (e.g., forecasting SKU sales at 500 different stores would use a global SKU forecasting model through AutoARIMA to determine the optimal ordering terms, then apply those to each individual store’s ARIMA models through this mode).
AutoARIMA
An automated approach that will perform a best-effort optimization of ordering terms.
Seasonal AutoARIMA
A much slower, but, depending on the nature of the series data, potentially much more accurate model for each series.
Pipeline preprocessing + AutoARIMA
This mode applies a data transformer (exogenous transformers are currently not supported) such as a LogEndogTransformer or BoxCoxEndogTransformer. Depending on the nature of the data, this may dramatically improve the forecasting quality.

[2]:

import itertools
import pandas as pd
import numpy as np
import string
import random
from datetime import timedelta, datetime
from collections import namedtuple
import matplotlib.pyplot as plt
import matplotlib.cm as cmx

from pmdarima.arima.arima import ARIMA
from pmdarima.arima.auto import AutoARIMA
from pmdarima.pipeline import Pipeline
from pmdarima.preprocessing import LogEndogTransformer
from pmdarima.model_selection import SlidingWindowForecastCV
from diviner import GroupedPmdarima, PmdarimaAnalyzer

[3]:

def _build_trend(size):
    """Create a random linear trend with ``size`` points.

    The slope is drawn uniformly from [-0.05, 0.2) and the integer offset
    from [200, 500), in that order, using NumPy's global random state.

    Args:
        size: number of points in the returned trend.

    Returns:
        A 1-D array equal to ``arange(size) * slope + offset``.
    """
    steps = np.arange(size)
    # The slope is drawn before the offset so seeded runs stay reproducible.
    slope = np.random.uniform(-0.05, 0.2)
    offset = np.random.randint(200, 500)
    return steps * slope + offset


def _build_seasonality(size, period):
    """Create a repeating seasonal pattern that is exactly ``size`` points long.

    One cycle is derived from the integers ``3 .. period + 2``: values below 5
    are raised to the 4th power, values below 7 are cubed, and all others are
    squared. The cycle is repeated to fill ``size`` points (the last repetition
    is truncated when ``size`` is not a multiple of ``period``) and the whole
    pattern is scaled by a random integer drawn from [1, 4).

    Args:
        size: number of points in the returned pattern.
        period: length of one seasonal cycle; must be positive.

    Returns:
        A 1-D integer array of length ``size``.

    Raises:
        ValueError: if ``period`` is not positive.
    """
    if period <= 0:
        raise ValueError(f"period must be a positive integer, got {period}")

    cycle_x = np.arange(period) + 3
    cycle = np.where(
        cycle_x < 5,
        cycle_x**4,
        np.where(cycle_x < 7, cycle_x**3, cycle_x**2),
    )

    # np.resize repeats the cycle as often as needed and cuts it off at `size`,
    # so the result always lines up with the trend and residual arrays even
    # when `size` is not an exact multiple of `period`.
    seasonality = np.resize(cycle, size)
    return seasonality * np.random.randint(1, 4)


def _build_residuals(size):
    """Create ``size`` points of Gaussian noise with a random integer scale.

    Standard-normal samples are drawn first, then a single integer scale from
    [4, 10); both use NumPy's global random state.

    Args:
        size: number of noise points to generate.

    Returns:
        A 1-D float array of length ``size``.
    """
    noise = np.random.randn(size)
    amplitude = np.random.randint(4, 10)
    return noise * amplitude


def _generate_time_series(size, seasonal_period):
    """Create one synthetic series as trend + seasonality + residual noise.

    The three components are built in that order, which fixes the order in
    which random numbers are consumed.

    Args:
        size: number of points in the series.
        seasonal_period: length of one seasonal cycle.

    Returns:
        The element-wise sum of the three component arrays.
    """
    trend = _build_trend(size)
    seasonal = _build_seasonality(size, seasonal_period)
    noise = _build_residuals(size)
    return trend + seasonal + noise


def _generate_grouping_columns(column_count: int, series_count: int):
    """Pick random, distinct grouping-key combinations for the synthetic series.

    Every ordered arrangement of ``column_count`` different uppercase letters
    is a candidate key; ``series_count`` of them are sampled without
    replacement.

    Args:
        column_count: number of key columns (letters per key).
        series_count: number of distinct keys to sample.

    Returns:
        A list of dicts mapping column name (``"key<N>"``, highest N first)
        to one letter each.
    """
    letters = list(string.ascii_uppercase)
    all_keys = list(itertools.permutations(letters, column_count))
    chosen = random.sample(all_keys, series_count)

    names = [f"key{x}" for x in range(column_count)]
    names.sort(reverse=True)

    rows = []
    for combo in chosen:
        rows.append(dict(zip(names, combo)))
    return rows


def _generate_raw_df(
    column_count: int,
    series_count: int,
    series_size: int,
    series_seasonal_period: int,
    start_dt: str,
    days_period: int,
):
    """Build one long-format DataFrame holding every synthetic series.

    Args:
        column_count: number of grouping-key columns per series.
        series_count: number of series (groups) to generate.
        series_size: number of observations in each series.
        series_seasonal_period: length of one seasonal cycle.
        start_dt: first timestamp, formatted as ``YYYY-MM-DD``.
        days_period: spacing between consecutive observations, in days.

    Returns:
        The per-series frames concatenated row-wise. Each row has ``ds``
        (timestamp), ``y`` (value) and one column per grouping key. The row
        index restarts for every series because the frames are concatenated
        with their own indexes.

    Raises:
        ValueError: if ``start_dt`` does not match ``YYYY-MM-DD``.
    """
    candidates = _generate_grouping_columns(column_count, series_count)
    # "%m" is the month directive. The previous "%M" parsed the middle field
    # as minutes, so the month of `start_dt` was ignored.
    start_date = datetime.strptime(start_dt, "%Y-%m-%d")
    dates = np.arange(
        start_date,
        start_date + timedelta(days=series_size * days_period),
        timedelta(days=days_period),
    )
    df_collection = []
    for entry in candidates:
        generated_series = _generate_time_series(series_size, series_seasonal_period)
        series_df = pd.DataFrame.from_dict({"ds": dates, "y": generated_series})
        # Broadcast each key value down the frame so rows can be grouped later.
        for column, value in entry.items():
            series_df[column] = value
        df_collection.append(series_df)
    return pd.concat(df_collection)


def generate_example_data(
    column_count: int,
    series_count: int,
    series_size: int,
    series_seasonal_period: int,
    start_dt: str,
    days_period: int = 1,
):
    """Generate synthetic grouped time series and report their key columns.

    Args:
        column_count: number of grouping-key columns per series.
        series_count: number of series (groups) to generate.
        series_size: number of observations in each series.
        series_seasonal_period: length of one seasonal cycle.
        start_dt: first timestamp, formatted as ``YYYY-MM-DD``.
        days_period: spacing between observations in days (default 1).

    Returns:
        A ``Structure`` namedtuple with ``df`` (the generated DataFrame) and
        ``key_columns`` (every column name except ``"ds"`` and ``"y"``).
    """
    Structure = namedtuple("Structure", "df key_columns")

    frame = _generate_raw_df(
        column_count,
        series_count,
        series_size,
        series_seasonal_period,
        start_dt,
        days_period,
    )
    # Everything that is not the timestamp or the value is a grouping key.
    key_columns = [name for name in frame.columns if name not in ("ds", "y")]

    return Structure(frame, key_columns)

def plot_grouped_series(df, key_columns, time_col, y_col):
    """Plot each group's series on its own subplot, stacked in one column.

    Args:
        df: DataFrame containing every group's rows.
        key_columns: column name(s) that identify a group.
        time_col: name of the column plotted on the x-axis.
        y_col: name of the column plotted on the y-axis.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    grouped = df.groupby(key_columns)
    ncols = 1
    nrows = int(np.ceil(grouped.ngroups / ncols))
    # squeeze=False always yields a 2-D array of Axes. Without it a single
    # group returns a bare Axes object and `axes.flatten()` raises.
    _fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(16, 6 * grouped.ngroups),
        sharey=False,
        sharex=False,
        squeeze=False,
    )
    # One colour per group, spread evenly over the Dark2 colormap.
    palette = [cmx.Dark2(x) for x in np.linspace(0.0, 1.0, grouped.ngroups)]
    for key, ax, rgb in zip(grouped.groups.keys(), axes.flatten(), palette):
        ser = grouped.get_group(key)
        ax.plot(ser[time_col], ser[y_col], label="y value", c=rgb)
        ax.legend()
        ax.title.set_text(f"Group: {key}")
    plt.show()

def plot_grouped_series_forecast(df, key_columns, time_col, y_col, yhat_lower_col, yhat_upper_col):
    """Plot each group's series with a shaded band between two bound columns.

    Args:
        df: DataFrame containing every group's rows.
        key_columns: column name(s) that identify a group.
        time_col: name of the column plotted on the x-axis.
        y_col: name of the column drawn as a line.
        yhat_lower_col: name of the column holding the band's lower edge.
        yhat_upper_col: name of the column holding the band's upper edge.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    grouped = df.groupby(key_columns)
    ncols = 1
    nrows = int(np.ceil(grouped.ngroups / ncols))
    # squeeze=False always yields a 2-D array of Axes. Without it a single
    # group returns a bare Axes object and `axes.flatten()` raises.
    _fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(16, 8 * grouped.ngroups),
        sharey=False,
        sharex=False,
        squeeze=False,
    )
    # One colour per group, spread evenly over the Dark2 colormap.
    palette = [cmx.Dark2(x) for x in np.linspace(0.0, 1.0, grouped.ngroups)]
    for key, ax, rgb in zip(grouped.groups.keys(), axes.flatten(), palette):
        ser = grouped.get_group(key)
        ax.plot(ser[time_col], ser[y_col], label="y value", c=rgb)
        ax.fill_between(
            ser[time_col],
            ser[yhat_lower_col],
            ser[yhat_upper_col],
            color=rgb,
            alpha=0.3,
            label="error",
        )
        ax.legend(loc="upper left")
        ax.title.set_text(f"Group: {key}")
        ax.grid(color=rgb, linewidth=0.5, alpha=0.5)
    plt.show()

[4]:

# 5 synthetic series, each identified by 3 grouping-key columns, with 1050
# observations spaced 1 day apart from start_dt and a seasonal cycle of 7 points.
data = generate_example_data(3, 5, 1050, 7, "2017-01-01", 1)

View the synthetic generated data

Note that this is a randomly generated data set.

[5]:

# One subplot per group: the "y" values plotted against the "ds" timestamps.
plot_grouped_series(data.df, data.key_columns, "ds", "y")

../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_6_0.png

ARIMA manually defined ordering terms

[6]:

# "y" and "ds" are the value and timestamp columns created by generate_example_data.
# NOTE(review): positional order is presumably (df, group_key_columns, y_col,
# datetime_col) — confirm against the Diviner PmdarimaAnalyzer API.
data_utils = PmdarimaAnalyzer(data.df, data.key_columns, "y", "ds")

# Per-group differencing estimate; the result is a dict keyed by group-key tuple.
ndiffs = data_utils.calculate_ndiffs(alpha=0.05, test="kpss", max_d=7)
ndiffs

[6]:

{('F', 'E', 'K'): 1,
 ('J', 'X', 'V'): 0,
 ('N', 'X', 'C'): 1,
 ('O', 'T', 'Y'): 1,
 ('Y', 'V', 'G'): 1}

The results above for the ndiffs method show that the differencing value should be set to ‘1’ for nearly every group (four of the five groups here; the remaining group was estimated at ‘0’). This isn’t surprising based on how the generated data was created. Real world data sets may have different optimal ‘d’ values, though. Performing a manual validation can dramatically reduce optimization time for the AutoARIMA methods (which will be covered further down in this example notebook).

Let’s check the seasonal differencing as well.

[7]:

# Per-group seasonal differencing estimate; the result is a dict keyed by group-key tuple.
# NOTE(review): the synthetic data above was generated with a seasonal period
# of 7, but m=20 is passed here — confirm that the mismatch is intentional.
nsdiffs = data_utils.calculate_nsdiffs(m=20, test="ocsb", max_D=30)
nsdiffs

[7]:

{('F', 'E', 'K'): 0,
 ('J', 'X', 'V'): 0,
 ('N', 'X', 'C'): 0,
 ('O', 'T', 'Y'): 0,
 ('Y', 'V', 'G'): 0}

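Calculate the autocorrelation function (ACF) values for each group to aid in setting the ARIMA ‘q’ (moving-average) parameter; the partial autocorrelation values calculated afterwards are the usual guide for the ‘p’ (autoregressive) parameter.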

[8]:

# Per-group autocorrelation values. With qstat=True each group's entry holds
# 'acf', 'confidence_intervals', 'qstat' and 'pvalues' arrays (see the output).
acf = data_utils.calculate_acf(alpha=0.05, qstat=True)
acf

[8]:

{('F',
  'E',
  'K'): {'acf': array([ 1.        , -0.08070348,  0.17113533, -0.57432053, -0.57269233,
          0.16892029, -0.08007331,  0.9866866 , -0.08029439,  0.16979978,
         -0.57130432, -0.56881352,  0.16684712, -0.07890206,  0.98009748,
         -0.07907823,  0.16909447, -0.56757814, -0.56480278,  0.16576373,
         -0.07812178,  0.97376666, -0.07840144,  0.16723731, -0.56373238,
         -0.56227819,  0.1649875 , -0.07771043,  0.96709519, -0.07748238,
          0.16583409]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.41189283e-01, -2.02176793e-02],
         [ 1.10256857e-01,  2.32013805e-01],
         [-6.36934311e-01, -5.11706747e-01],
         [-6.52278609e-01, -4.93106048e-01],
         [ 7.54654815e-02,  2.62375098e-01],
         [-1.74638559e-01,  1.44919437e-02],
         [ 8.91873617e-01,  1.08149958e+00],
         [-2.07231513e-01,  4.66427328e-02],
         [ 4.26769784e-02,  2.96922590e-01],
         [-6.99254204e-01, -4.43354434e-01],
         [-7.05778407e-01, -4.31848638e-01],
         [ 2.14964890e-02,  3.12197760e-01],
         [-2.24951705e-01,  6.71475893e-02],
         [ 8.33891965e-01,  1.12630299e+00],
         [-2.47615343e-01,  8.94588851e-02],
         [ 4.21665597e-04,  3.37767275e-01],
         [-7.36869989e-01, -3.98286283e-01],
         [-7.40918901e-01, -3.88686666e-01],
         [-1.68589604e-02,  3.48386416e-01],
         [-2.61294110e-01,  1.05050546e-01],
         [ 7.90472475e-01,  1.15706084e+00],
         [-2.79734390e-01,  1.22931504e-01],
         [-3.42072984e-02,  3.68681928e-01],
         [-7.65684304e-01, -3.61780464e-01],
         [-7.69907420e-01, -3.54648952e-01],
         [-4.81397856e-02,  3.78114778e-01],
         [-2.91304472e-01,  1.35883613e-01],
         [ 7.53397735e-01,  1.18079264e+00],
         [-3.06633108e-01,  1.51668346e-01],
         [-6.34124686e-02,  3.95080645e-01]]), 'qstat': array([6.85826223e+00, 3.77273016e+01, 3.85717521e+02, 7.32068234e+02,
         7.62229696e+02, 7.69013606e+02, 1.80006234e+03, 1.80689685e+03,
         1.83749031e+03, 2.18415269e+03, 2.52812962e+03, 2.55775372e+03,
         2.56438508e+03, 3.58858286e+03, 3.59525674e+03, 3.62580196e+03,
         3.97027563e+03, 4.31171925e+03, 4.34115841e+03, 4.34770344e+03,
         5.36559020e+03, 5.37219501e+03, 5.40227661e+03, 5.74441645e+03,
         6.08512548e+03, 6.11448893e+03, 6.12100954e+03, 7.13187321e+03,
         7.13836830e+03, 7.16815021e+03]), 'pvalues': array([8.82323175e-003, 6.42126449e-009, 2.74600191e-083, 3.96377730e-157,
         1.71230216e-162, 7.61823371e-163, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])},
 ('J',
  'X',
  'V'): {'acf': array([ 1.        , -0.08971388,  0.17178181, -0.57976408, -0.57751633,
          0.16928219, -0.08856344,  0.99113133, -0.08908178,  0.17049929,
         -0.57609728, -0.57368404,  0.16817819, -0.08774527,  0.9848123 ,
         -0.08839067,  0.16942107, -0.57210998, -0.56969456,  0.16699015,
         -0.08711197,  0.97796277, -0.08810664,  0.16820989, -0.5684689 ,
         -0.56560049,  0.16574469, -0.08629873,  0.97111987, -0.0872756 ,
          0.16688735]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.50199679e-01, -2.92280757e-02],
         [ 1.10811128e-01,  2.32752494e-01],
         [-6.42480452e-01, -5.17047707e-01],
         [-6.57471324e-01, -4.97561327e-01],
         [ 7.52969431e-02,  2.63267435e-01],
         [-1.83657644e-01,  6.53076441e-03],
         [ 8.95735843e-01,  1.08652682e+00],
         [-2.16706827e-01,  3.85432747e-02],
         [ 4.26469632e-02,  2.98351626e-01],
         [-7.04778770e-01, -4.47415791e-01],
         [-7.11478716e-01, -4.35889368e-01],
         [ 2.19061278e-02,  3.14450251e-01],
         [-2.34723063e-01,  5.92325199e-02],
         [ 8.37642982e-01,  1.13198161e+00],
         [-2.57964550e-01,  8.11832075e-02],
         [-3.21283670e-04,  3.39163431e-01],
         [-7.42469873e-01, -4.01750086e-01],
         [-7.46944218e-01, -3.92444907e-01],
         [-1.68364179e-02,  3.50816720e-01],
         [-2.71492690e-01,  9.72687454e-02],
         [ 7.93431540e-01,  1.16249400e+00],
         [-2.90714385e-01,  1.14501097e-01],
         [-3.45379763e-02,  3.70957757e-01],
         [-7.71726697e-01, -3.65211110e-01],
         [-7.74594003e-01, -3.56606968e-01],
         [-4.87758122e-02,  3.80265202e-01],
         [-3.01287231e-01,  1.28689779e-01],
         [ 7.56004669e-01,  1.18623508e+00],
         [-3.17872831e-01,  1.43321630e-01],
         [-6.38306951e-02,  3.97605398e-01]]), 'qstat': array([   8.47517754,   39.5778789 ,  394.19603408,  746.40619894,
          776.69703653,  784.99580201, 1825.3545896 , 1833.76689546,
         1864.61294006, 2217.11637211, 2567.00919151, 2597.10784245,
         2605.30897302, 3639.38437728, 3647.72267797, 3678.386011  ,
         4028.38256167, 4375.76630468, 4405.64269257, 4413.78080166,
         5440.45892422, 5448.80013774, 5479.23263921, 5827.14599696,
         6171.8931755 , 6201.52677151, 6209.56829208, 7228.86313254,
         7237.10384927, 7267.26526426]), 'pvalues': array([3.60025244e-003, 2.54549820e-009, 4.00232684e-085, 3.11215093e-160,
         1.27131779e-165, 2.68648911e-166, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])},
 ('N',
  'X',
  'C'): {'acf': array([ 1.        , -0.01968557,  0.22007368, -0.48359781, -0.48169266,
          0.217541  , -0.01947545,  0.98802832, -0.02016042,  0.21825466,
         -0.48114571, -0.47876755,  0.21525037, -0.02022865,  0.98066016,
         -0.02062928,  0.21593687, -0.47865674, -0.47685675,  0.21294954,
         -0.02105458,  0.97360499, -0.0210813 ,  0.21410622, -0.47613525,
         -0.47476485,  0.21066211, -0.02169228,  0.96607903, -0.02189333,
          0.21159558]), 'confidence_intervals': array([[ 1.        ,  1.        ],
         [-0.08017137,  0.04080023],
         [ 0.15956444,  0.28058291],
         [-0.54696776, -0.42022786],
         [-0.5573694 , -0.40601591],
         [ 0.13137411,  0.30370788],
         [-0.10762875,  0.06867786],
         [ 0.89985927,  1.07619737],
         [-0.14229436,  0.10197352],
         [ 0.09610855,  0.34040077],
         [-0.60471035, -0.35758106],
         [-0.6090063 , -0.34852881],
         [ 0.07872441,  0.35177632],
         [-0.1579906 ,  0.11753331],
         [ 0.84288734,  1.11843299],
         [-0.18193064,  0.14067209],
         [ 0.05462586,  0.37724789],
         [-0.64102185, -0.31629163],
         [-0.64430483, -0.30940867],
         [ 0.04060482,  0.38529426],
         [-0.19435926,  0.1522501 ],
         [ 0.80029094,  1.14691903],
         [-0.21336663,  0.17120404],
         [ 0.02181242,  0.40640001],
         [-0.66929924, -0.28297126],
         [-0.67217595, -0.27735376],
         [ 0.00911703,  0.4122072 ],
         [-0.22404133,  0.18065678],
         [ 0.76372146,  1.16843659],
         [-0.24047436,  0.19668769],
         [-0.00699346,  0.43018463]]), 'qstat': array([4.08061309e-01, 5.14562056e+01, 2.98189072e+02, 5.43215771e+02,
         5.93238914e+02, 5.93640224e+02, 1.62749495e+03, 1.62792581e+03,
         1.67847117e+03, 1.92435215e+03, 2.16804283e+03, 2.21734834e+03,
         2.21778421e+03, 3.24315833e+03, 3.24361252e+03, 3.29342499e+03,
         3.53841766e+03, 3.78180680e+03, 3.83039153e+03, 3.83086693e+03,
         4.84841572e+03, 4.84889326e+03, 4.89819851e+03, 5.14227073e+03,
         5.38517676e+03, 5.43304836e+03, 5.43355645e+03, 6.44229694e+03,
         6.44281550e+03, 6.49130169e+03]), 'pvalues': array([5.22955152e-001, 6.70543443e-012, 2.45301824e-064, 3.00422546e-116,
         5.84337991e-126, 5.48966949e-125, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])},
 ('O',
  'T',
  'Y'): {'acf': array([ 1.        , -0.08259995,  0.17734596, -0.57067234, -0.5684687 ,
          0.17509455, -0.08130592,  0.99248242, -0.08216322,  0.17599856,
         -0.56701055, -0.56461355,  0.17391106, -0.08060416,  0.98566707,
         -0.08166764,  0.17457252, -0.56337294, -0.56083338,  0.17276644,
         -0.08014475,  0.97890769, -0.08129729,  0.17329419, -0.55960215,
         -0.55701671,  0.17141666, -0.07962553,  0.97213105, -0.08084958,
          0.17208343]), 'confidence_intervals': array([[ 1.        ,  1.        ],
         [-0.14308575, -0.02211415],
         [ 0.11644888,  0.23824304],
         [-0.63343051, -0.50791417],
         [-0.64797665, -0.48896075],
         [ 0.08189545,  0.26829364],
         [-0.17570083,  0.01308899],
         [ 0.89783164,  1.0871332 ],
         [-0.20930973,  0.04498329],
         [ 0.04865795,  0.30333917],
         [-0.695238  , -0.43878309],
         [-0.70170739, -0.4275197 ],
         [ 0.02855867,  0.31926346],
         [-0.22671584,  0.06550753],
         [ 0.8393928 ,  1.13194135],
         [-0.25050182,  0.08716655],
         [ 0.00559388,  0.34355117],
         [-0.73301013, -0.39373576],
         [-0.73718284, -0.38448393],
         [-0.00999187,  0.35552474],
         [-0.2634996 ,  0.10321009],
         [ 0.79542472,  1.16239065],
         [-0.2829843 ,  0.12038972],
         [-0.02851267,  0.37510105],
         [-0.76195271, -0.3572516 ],
         [-0.76495209, -0.34908134],
         [-0.04190791,  0.38474122],
         [-0.29345343,  0.13420237],
         [ 0.7581947 ,  1.18606741],
         [-0.31037882,  0.14867966],
         [-0.05754997,  0.40171684]]), 'qstat': array([7.18437719e+00, 4.03345925e+01, 3.83917858e+02, 7.25178711e+02,
         7.57585350e+02, 7.64579724e+02, 1.80777684e+03, 1.81493320e+03,
         1.84780115e+03, 2.18927226e+03, 2.52818827e+03, 2.56037391e+03,
         2.56729446e+03, 3.60316572e+03, 3.61028383e+03, 3.64284022e+03,
         3.98222838e+03, 4.31888957e+03, 4.35086859e+03, 4.35775699e+03,
         5.38642004e+03, 5.39352177e+03, 5.42582177e+03, 5.76296654e+03,
         6.09732909e+03, 6.12902558e+03, 6.13587154e+03, 7.15729017e+03,
         7.16436205e+03, 7.19643087e+03]), 'pvalues': array([7.35410753e-003, 1.74363074e-009, 6.73724508e-083, 1.23042405e-155,
         1.73027963e-161, 6.91273826e-162, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])},
 ('Y',
  'V',
  'G'): {'acf': array([ 1.        , -0.06089902,  0.19406527, -0.53915391, -0.53707731,
          0.19151146, -0.06008225,  0.99253338, -0.0607289 ,  0.19239331,
         -0.53581884, -0.53377634,  0.18991977, -0.06011947,  0.98545395,
         -0.0607944 ,  0.19067928, -0.53253409, -0.53052952,  0.18844171,
         -0.06005286,  0.97843607, -0.06083826,  0.18909121, -0.52922907,
         -0.52732228,  0.18674487, -0.06008383,  0.9713386 , -0.06075151,
          0.18737217]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.21384817e-01, -4.13213869e-04],
         [ 1.33355563e-01,  2.54774984e-01],
         [-6.02092283e-01, -4.76215529e-01],
         [-6.15104319e-01, -4.59050299e-01],
         [ 1.00964032e-01,  2.82058885e-01],
         [-1.52099649e-01,  3.19351498e-02],
         [ 9.00372563e-01,  1.08469419e+00],
         [-1.86035803e-01,  6.45780064e-02],
         [ 6.69787750e-02,  3.17807845e-01],
         [-6.62308552e-01, -4.09329121e-01],
         [-6.68314035e-01, -3.99238650e-01],
         [ 4.78453231e-02,  3.31994214e-01],
         [-2.03119714e-01,  8.28807826e-02],
         [ 8.42361262e-01,  1.12854664e+00],
         [-2.26870469e-01,  1.05281673e-01],
         [ 2.45218148e-02,  3.56836755e-01],
         [-6.99490201e-01, -3.65577979e-01],
         [-7.03588498e-01, -3.57470552e-01],
         [ 9.53145029e-03,  3.67351967e-01],
         [-2.39687804e-01,  1.19582075e-01],
         [ 7.98727701e-01,  1.15814445e+00],
         [-2.59080530e-01,  1.37404007e-01],
         [-9.21935613e-03,  3.87401771e-01],
         [-7.28198173e-01, -3.30259963e-01],
         [-7.31376425e-01, -3.23268139e-01],
         [-2.22353699e-02,  3.95725109e-01],
         [-2.69673698e-01,  1.49506040e-01],
         [ 7.61685720e-01,  1.18099147e+00],
         [-2.86268629e-01,  1.64765599e-01],
         [-3.82048147e-02,  4.12949146e-01]]), 'qstat': array([3.90526128e+00, 4.36005909e+01, 3.50279471e+02, 6.54891439e+02,
         6.93659875e+02, 6.97479290e+02, 1.74078352e+03, 1.74469309e+03,
         1.78396972e+03, 2.08890498e+03, 2.39181116e+03, 2.43019497e+03,
         2.43404493e+03, 3.46946828e+03, 3.47341278e+03, 3.51225388e+03,
         3.81550300e+03, 4.11676507e+03, 4.15481032e+03, 4.15867786e+03,
         5.18634999e+03, 5.19032708e+03, 5.22878424e+03, 5.53032433e+03,
         5.82998754e+03, 5.86760612e+03, 5.87150414e+03, 6.89125818e+03,
         6.89525113e+03, 6.93327138e+03]), 'pvalues': array([4.81351422e-002, 3.40605757e-010, 1.29766125e-075, 2.03514101e-140,
         1.15368607e-147, 2.14170577e-147, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])}}

And the partial autocorrelation function values…

[9]:

# Per-group partial autocorrelation values. Each group's entry holds 'pacf' and
# 'confidence_intervals' arrays; nlags=40 gives 41 values (lag 0 through 40).
pacf = data_utils.calculate_pacf(nlags=40, alpha=0.05)
pacf

[9]:

{('F',
  'E',
  'K'): {'pacf': array([ 1.        , -0.08078041,  0.16601981, -0.57034304, -0.9620566 ,
          0.61323735,  0.04746202,  0.77862671,  0.00637416, -0.08100356,
         -0.05871874, -0.13700759,  0.1174423 ,  0.0605576 ,  0.42934315,
          0.09243144, -0.07328271,  0.00398806, -0.100308  ,  0.1303113 ,
          0.05355023,  0.31839881,  0.08861446, -0.22613491,  0.01178529,
         -0.22631973,  0.18613883,  0.15154231,  0.24202115,  0.14823025,
         -0.36025751, -0.00747563, -0.09967852,  0.27377749,  0.2899224 ,
          0.03164718, -0.01778687, -0.72500717,  0.08315487,  0.44580926,
          1.87065584]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.41266216e-01, -2.02946130e-02],
         [ 1.05534006e-01,  2.26505610e-01],
         [-6.30828837e-01, -5.09857234e-01],
         [-1.02254240e+00, -9.01570796e-01],
         [ 5.52751546e-01,  6.73723149e-01],
         [-1.30237844e-02,  1.07947819e-01],
         [ 7.18140908e-01,  8.39112511e-01],
         [-5.41116405e-02,  6.68599629e-02],
         [-1.41489364e-01, -2.05177602e-02],
         [-1.19204540e-01,  1.76706309e-03],
         [-1.97493394e-01, -7.65217907e-02],
         [ 5.69564994e-02,  1.77928103e-01],
         [ 7.18002697e-05,  1.21043404e-01],
         [ 3.68857350e-01,  4.89828953e-01],
         [ 3.19456401e-02,  1.52917244e-01],
         [-1.33768515e-01, -1.27969120e-02],
         [-5.64977374e-02,  6.44738661e-02],
         [-1.60793800e-01, -3.98221961e-02],
         [ 6.98254990e-02,  1.90797102e-01],
         [-6.93556812e-03,  1.14036035e-01],
         [ 2.57913009e-01,  3.78884612e-01],
         [ 2.81286603e-02,  1.49100264e-01],
         [-2.86620716e-01, -1.65649112e-01],
         [-4.87005161e-02,  7.22710873e-02],
         [-2.86805530e-01, -1.65833926e-01],
         [ 1.25653024e-01,  2.46624627e-01],
         [ 9.10565039e-02,  2.12028107e-01],
         [ 1.81535344e-01,  3.02506947e-01],
         [ 8.77444507e-02,  2.08716054e-01],
         [-4.20743308e-01, -2.99771705e-01],
         [-6.79614321e-02,  5.30101714e-02],
         [-1.60164320e-01, -3.91927162e-02],
         [ 2.13291689e-01,  3.34263293e-01],
         [ 2.29436600e-01,  3.50408204e-01],
         [-2.88386224e-02,  9.21329811e-02],
         [-7.82726713e-02,  4.26989322e-02],
         [-7.85492971e-01, -6.64521368e-01],
         [ 2.26690715e-02,  1.43640675e-01],
         [ 3.85323463e-01,  5.06295066e-01],
         [ 1.81017004e+00,  1.93114164e+00]])},
 ('J',
  'X',
  'V'): {'pacf': array([ 1.00000000e+00, -8.97994006e-02,  1.65379314e-01, -5.73871649e-01,
         -9.90181691e-01,  2.28389649e-02, -4.93185201e-01,  7.69240477e-01,
          1.03879793e-01, -5.28893124e-01,  1.34794508e-01, -3.51589756e-01,
          1.85686790e-01,  1.55883714e-01,  1.98571228e-01,  3.29974040e-01,
         -7.80295225e-01,  1.35211200e+00,  2.41677843e+00, -9.85720454e-01,
          3.04881070e+01,  1.02424994e+00, -2.40557539e-01, -5.62704983e-01,
          2.76483779e-01, -6.86759801e-02,  6.54149325e-01, -8.11543320e-02,
         -6.08749521e-01, -7.23722874e-01, -2.39649064e+00,  4.45281360e-01,
         -2.59062504e+00,  5.16719035e-01,  7.25642747e-01,  2.43975398e-01,
          3.81543627e-01, -1.01150937e+00, -2.65585463e+01,  9.87381089e-01,
         -2.23305532e+00]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.50285202e-01, -2.93135989e-02],
         [ 1.04893512e-01,  2.25865116e-01],
         [-6.34357451e-01, -5.13385847e-01],
         [-1.05066749e+00, -9.29695889e-01],
         [-3.76468369e-02,  8.33247666e-02],
         [-5.53671003e-01, -4.32699399e-01],
         [ 7.08754675e-01,  8.29726278e-01],
         [ 4.33939913e-02,  1.64365595e-01],
         [-5.89378926e-01, -4.68407322e-01],
         [ 7.43087068e-02,  1.95280310e-01],
         [-4.12075557e-01, -2.91103954e-01],
         [ 1.25200988e-01,  2.46172591e-01],
         [ 9.53979124e-02,  2.16369516e-01],
         [ 1.38085427e-01,  2.59057030e-01],
         [ 2.69488238e-01,  3.90459842e-01],
         [-8.40781027e-01, -7.19809423e-01],
         [ 1.29162620e+00,  1.41259780e+00],
         [ 2.35629263e+00,  2.47726423e+00],
         [-1.04620626e+00, -9.25234652e-01],
         [ 3.04276212e+01,  3.05485928e+01],
         [ 9.63764140e-01,  1.08473574e+00],
         [-3.01043341e-01, -1.80071738e-01],
         [-6.23190784e-01, -5.02219181e-01],
         [ 2.15997977e-01,  3.36969581e-01],
         [-1.29161782e-01, -8.19017841e-03],
         [ 5.93663523e-01,  7.14635127e-01],
         [-1.41640134e-01, -2.06685303e-02],
         [-6.69235322e-01, -5.48263719e-01],
         [-7.84208676e-01, -6.63237072e-01],
         [-2.45697644e+00, -2.33600484e+00],
         [ 3.84795558e-01,  5.05767162e-01],
         [-2.65111084e+00, -2.53013923e+00],
         [ 4.56233234e-01,  5.77204837e-01],
         [ 6.65156945e-01,  7.86128549e-01],
         [ 1.83489596e-01,  3.04461199e-01],
         [ 3.21057825e-01,  4.42029428e-01],
         [-1.07199518e+00, -9.51023573e-01],
         [-2.66190321e+01, -2.64980605e+01],
         [ 9.26895288e-01,  1.04786689e+00],
         [-2.29354112e+00, -2.17256952e+00]])},
 ('N',
  'X',
  'C'): {'pacf': array([ 1.        , -0.01970433,  0.2201909 , -0.50176994, -0.69433673,
          0.95803865,  0.20401309,  0.8161953 , -0.01727898, -0.14639157,
          0.02840954, -0.17105713,  0.16707132, -0.03883989,  0.35872681,
          0.0038246 , -0.25951443,  0.10018443, -0.25434774,  0.14468707,
          0.00653858,  0.20690802,  0.10056146, -0.30360206,  0.22331149,
         -0.39152792,  0.26718222,  0.05444163, -0.14915054,  0.51674794,
         -1.46615051, -2.28765331,  0.89992292, -2.75190377, -1.41158622,
          0.72795062, -0.36145038,  0.05923125, -0.03008689, -0.12517299,
          0.28304207]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-8.01901350e-02,  4.07814684e-02],
         [ 1.59705095e-01,  2.80676698e-01],
         [-5.62255742e-01, -4.41284138e-01],
         [-7.54822534e-01, -6.33850930e-01],
         [ 8.97552847e-01,  1.01852445e+00],
         [ 1.43527288e-01,  2.64498891e-01],
         [ 7.55709496e-01,  8.76681100e-01],
         [-7.77647795e-02,  4.32068239e-02],
         [-2.06877375e-01, -8.59057717e-02],
         [-3.20762615e-02,  8.88953419e-02],
         [-2.31542936e-01, -1.10571333e-01],
         [ 1.06585516e-01,  2.27557120e-01],
         [-9.93256891e-02,  2.16459144e-02],
         [ 2.98241006e-01,  4.19212609e-01],
         [-5.66612031e-02,  6.43104003e-02],
         [-3.20000229e-01, -1.99028626e-01],
         [ 3.96986243e-02,  1.60670228e-01],
         [-3.14833537e-01, -1.93861933e-01],
         [ 8.42012654e-02,  2.05172869e-01],
         [-5.39472258e-02,  6.70243777e-02],
         [ 1.46422222e-01,  2.67393825e-01],
         [ 4.00756561e-02,  1.61047260e-01],
         [-3.64087860e-01, -2.43116257e-01],
         [ 1.62825693e-01,  2.83797296e-01],
         [-4.52013719e-01, -3.31042116e-01],
         [ 2.06696419e-01,  3.27668022e-01],
         [-6.04417343e-03,  1.14927430e-01],
         [-2.09636341e-01, -8.86647377e-02],
         [ 4.56262137e-01,  5.77233740e-01],
         [-1.52663631e+00, -1.40566471e+00],
         [-2.34813911e+00, -2.22716750e+00],
         [ 8.39437119e-01,  9.60408723e-01],
         [-2.81238957e+00, -2.69141797e+00],
         [-1.47207203e+00, -1.35110042e+00],
         [ 6.67464820e-01,  7.88436423e-01],
         [-4.21936183e-01, -3.00964579e-01],
         [-1.25455353e-03,  1.19717050e-01],
         [-9.05726907e-02,  3.03989128e-02],
         [-1.85658788e-01, -6.46871844e-02],
         [ 2.22556271e-01,  3.43527874e-01]])},
 ('O',
  'T',
  'Y'): {'pacf': array([  1.        ,  -0.08267869,   0.17202456,  -0.56644104,
          -0.9541979 ,   0.85189822,   0.1845516 ,   0.94596428,
           0.43918741,  -1.15664315,  -1.64226152,   2.42653871,
           0.96331553,  -0.31849573,   5.24481561,   0.49895362,
          -0.98231553,   0.75347103,   4.98908903,  -0.80730486,
           2.87884606,   0.7683438 ,   0.94560433, -17.85764882,
          -0.97487154,  -1.64867233,   0.88285951,  -0.51857155,
          -0.28254564,   1.1636752 ,   2.94570193,  -1.51600933,
          -1.10973108,   1.65826343,   0.62080675,   2.96339001,
          -0.37477296,  -0.70521211,   0.6992378 ,  -0.3032168 ,
           0.13773625]), 'confidence_intervals': array([[  1.        ,   1.        ],
         [ -0.14316449,  -0.02219289],
         [  0.11153876,   0.23251036],
         [ -0.62692684,  -0.50595524],
         [ -1.0146837 ,  -0.8937121 ],
         [  0.79141242,   0.91238402],
         [  0.1240658 ,   0.2450374 ],
         [  0.88547848,   1.00645008],
         [  0.37870161,   0.49967321],
         [ -1.21712895,  -1.09615734],
         [ -1.70274732,  -1.58177572],
         [  2.36605291,   2.48702451],
         [  0.90282972,   1.02380133],
         [ -0.37898153,  -0.25800993],
         [  5.1843298 ,   5.30530141],
         [  0.43846782,   0.55943942],
         [ -1.04280133,  -0.92182972],
         [  0.69298523,   0.81395683],
         [  4.92860323,   5.04957483],
         [ -0.86779066,  -0.74681906],
         [  2.81836026,   2.93933186],
         [  0.707858  ,   0.8288296 ],
         [  0.88511853,   1.00609014],
         [-17.91813462, -17.79716302],
         [ -1.03535734,  -0.91438574],
         [ -1.70915813,  -1.58818653],
         [  0.82237371,   0.94334532],
         [ -0.57905735,  -0.45808574],
         [ -0.34303144,  -0.22205984],
         [  1.1031894 ,   1.224161  ],
         [  2.88521613,   3.00618773],
         [ -1.57649513,  -1.45552353],
         [ -1.17021688,  -1.04924528],
         [  1.59777763,   1.71874923],
         [  0.56032094,   0.68129255],
         [  2.90290421,   3.02387581],
         [ -0.43525876,  -0.31428716],
         [ -0.76569792,  -0.64472631],
         [  0.638752  ,   0.7597236 ],
         [ -0.3637026 ,  -0.242731  ],
         [  0.07725045,   0.19822205]])},
 ('Y',
  'V',
  'G'): {'pacf': array([ 1.        , -0.06095707,  0.19143118, -0.54121706, -0.84696723,
          0.95805607,  0.21854914,  0.94681651,  0.16707407, -1.51569922,
         -0.54723384, -1.73908393,  1.37645162,  0.39243366,  1.49812903,
         -0.31122597, -1.20178542, -0.84078145, -8.7457162 ,  1.02860706,
          0.3577604 ,  1.95108875, -0.46472826, -1.10813513, -1.61419444,
          3.30494158,  0.98217571,  1.47580919, -5.93462491, -1.14301777,
         -0.92691166,  2.31809935,  0.57009033,  1.03284862, -5.90888654,
         -0.63850567, -1.82253279, -0.85711637,  1.18090889,  0.52745047,
          2.32443609]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.21442872e-01, -4.71268221e-04],
         [ 1.30945374e-01,  2.51916977e-01],
         [-6.01702859e-01, -4.80731256e-01],
         [-9.07453036e-01, -7.86481432e-01],
         [ 8.97570269e-01,  1.01854187e+00],
         [ 1.58063336e-01,  2.79034939e-01],
         [ 8.86330709e-01,  1.00730231e+00],
         [ 1.06588264e-01,  2.27559867e-01],
         [-1.57618503e+00, -1.45521342e+00],
         [-6.07719647e-01, -4.86748043e-01],
         [-1.79956973e+00, -1.67859812e+00],
         [ 1.31596581e+00,  1.43693742e+00],
         [ 3.31947858e-01,  4.52919462e-01],
         [ 1.43764323e+00,  1.55861483e+00],
         [-3.71711773e-01, -2.50740170e-01],
         [-1.26227123e+00, -1.14129962e+00],
         [-9.01267253e-01, -7.80295649e-01],
         [-8.80620200e+00, -8.68523040e+00],
         [ 9.68121254e-01,  1.08909286e+00],
         [ 2.97274601e-01,  4.18246204e-01],
         [ 1.89060295e+00,  2.01157455e+00],
         [-5.25214057e-01, -4.04242454e-01],
         [-1.16862093e+00, -1.04764933e+00],
         [-1.67468024e+00, -1.55370863e+00],
         [ 3.24445578e+00,  3.36542739e+00],
         [ 9.21689904e-01,  1.04266151e+00],
         [ 1.41532339e+00,  1.53629500e+00],
         [-5.99511071e+00, -5.87413911e+00],
         [-1.20350357e+00, -1.08253197e+00],
         [-9.87397461e-01, -8.66425858e-01],
         [ 2.25761354e+00,  2.37858515e+00],
         [ 5.09604531e-01,  6.30576134e-01],
         [ 9.72362822e-01,  1.09333443e+00],
         [-5.96937234e+00, -5.84840074e+00],
         [-6.98991475e-01, -5.78019871e-01],
         [-1.88301860e+00, -1.76204699e+00],
         [-9.17602175e-01, -7.96630572e-01],
         [ 1.12042309e+00,  1.24139470e+00],
         [ 4.66964672e-01,  5.87936275e-01],
         [ 2.26395029e+00,  2.38492189e+00]])}}

Start with a simple ARIMA model with explicitly defined (p, d, q) values.

These values were determined by aggregating all of the series into a single representative series, running that series through AutoARIMA to determine the p, d, q order terms, and then applying those terms to each series individually.

Note that this approach will not work if the series have different seasonality attributes (e.g., series ‘a’ has a weekly seasonality while series ‘b’ has a monthly seasonality), if a series has a complex seasonality (compounding weekly, day of month, and month of year effects), or if any of the ordering terms (autoregressive order, differencing, or moving average order) are significantly different for the different independent series.

[11]:

# Template ARIMA with a fixed (p, d, q) order of (4, 1, 5); out_of_sample_size
# reserves the final 14 observations for the out-of-sample score.
arima_base = ARIMA(
    order=(4, 1, 5),
    out_of_sample_size=14,
)
# The grouped wrapper uses this single template for every group it fits.
group_arima = GroupedPmdarima(model_template=arima_base)

Fit the grouped Pmdarima model

[12]:

# Fit the template against each group found in the key columns, using "y" as
# the series values and "ds" as the datetime index column.
group_arima_model = group_arima.fit(
    df=data.df,
    group_key_columns=data.key_columns,
    y_col="y",
    datetime_col="ds",
    silence_warnings=True,
)

Let’s see what the parameters were from the run.

[13]:

# Display the fitted model parameters, one row per group.
group_arima_model.get_model_params()

[13]:

	grouping_key_columns	key2	key1	key0	maxiter	method	out_of_sample_size	scoring	scoring_args	start_params	suppress_warnings	trend	with_intercept	p	d	q
0	(key2, key1, key0)	F	E	K	50	lbfgs	14	mse	None	None	False	None	True	4	1	5
1	(key2, key1, key0)	J	X	V	50	lbfgs	14	mse	None	None	False	None	True	4	1	5
2	(key2, key1, key0)	N	X	C	50	lbfgs	14	mse	None	None	False	None	True	4	1	5
3	(key2, key1, key0)	O	T	Y	50	lbfgs	14	mse	None	None	False	None	True	4	1	5
4	(key2, key1, key0)	Y	V	G	50	lbfgs	14	mse	None	None	False	None	True	4	1	5

Let’s take a look at the training metrics

[14]:

# Display the per-group training metrics (information criteria plus the
# out-of-sample `oob` score).
group_arima_model.get_metrics()

[14]:

	grouping_key_columns	key2	key1	key0	hqic	aicc	oob	bic	aic
0	(key2, key1, key0)	F	E	K	7308.479882	7288.064205	26.639746	7342.321388	7287.809869
1	(key2, key1, key0)	J	X	V	8529.693560	8509.277883	73.661043	8563.535066	8509.023548
2	(key2, key1, key0)	N	X	C	7094.464510	7074.048832	33.927776	7128.306016	7073.794497
3	(key2, key1, key0)	O	T	Y	7636.027821	7615.612144	118.098407	7669.869327	7615.357809
4	(key2, key1, key0)	Y	V	G	8408.200891	8387.785213	177.272914	8442.042397	8387.530878

The out-of-sample (oob) error on the 14-day validation period doesn’t look too bad. Let’s save this model.

Save it to a local directory

[15]:

# Persist the fitted grouped model to a path in the current working directory.
group_arima_model.save("./group_arima.gpmd")

Load the saved model and perform a forecast for each group.

[16]:

# Reload the grouped model from the path it was saved to.
loaded_arima = GroupedPmdarima.load("./group_arima.gpmd")

[17]:

# Forecast 60 periods for every group, writing predictions to a "forecast"
# column and including the interval bounds.
# NOTE(review): alpha=0.5 differs from the alpha=0.05 used by the later
# forecast cells — confirm this interval width is intended.
forecast = loaded_arima.predict(
    n_periods=60,
    alpha=0.5,
    predict_col="forecast",
    return_conf_int=True,
)
forecast

[17]:

	grouping_key_columns	key2	key1	key0	forecast	yhat_lower	yhat_upper	ds
0	(key2, key1, key0)	F	E	K	439.455456	433.443671	445.467241	2019-11-17 00:01:00
1	(key2, key1, key0)	F	E	K	611.627138	605.596691	617.657586	2019-11-18 00:01:00
2	(key2, key1, key0)	F	E	K	485.765766	479.734559	491.796972	2019-11-19 00:01:00
3	(key2, key1, key0)	F	E	K	581.559916	575.493820	587.626011	2019-11-20 00:01:00
4	(key2, key1, key0)	F	E	K	410.152148	403.979627	416.324670	2019-11-21 00:01:00
...	...	...	...	...	...	...	...	...
295	(key2, key1, key0)	Y	V	G	868.854019	849.593089	888.114949	2020-01-11 00:01:00
296	(key2, key1, key0)	Y	V	G	844.56244	825.194837	863.930042	2020-01-12 00:01:00
297	(key2, key1, key0)	Y	V	G	1336.358013	1316.951209	1355.764817	2020-01-13 00:01:00
298	(key2, key1, key0)	Y	V	G	961.353673	941.932352	980.774994	2020-01-14 00:01:00
299	(key2, key1, key0)	Y	V	G	1226.602318	1207.077806	1246.126830	2020-01-15 00:01:00

300 rows × 8 columns

[18]:

# Plot each group's forecast along with its lower and upper interval bounds.
plot_grouped_series_forecast(
    forecast,
    data.key_columns,
    "ds",
    "forecast",
    "yhat_lower",
    "yhat_upper",
)

../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_30_0.png

The ordering terms that were used above, (4, 1, 5), were determined by running AutoARIMA against the average of all generated series. This approach, a hierarchical optimization of multi-series data, is very effective and should be pursued first if at all possible.

AutoARIMA with seasonality components

Since we know that these data sets have a weekly periodicity and that they’re generated on a daily basis, let’s set m=7. This will apply a seasonality term to the ARIMA model (making it a SARIMA model) and open up the tuning of the seasonal components (P, D, Q).

Note: This will take much longer to run.

[19]:

# Seasonal AutoARIMA template: m=7 sets a weekly seasonal period for the daily
# data, and d=1 fixes the differencing term.
auto_arima = AutoARIMA(
    out_of_sample_size=14,
    maxiter=500,
    max_order=7,
    d=1,
    m=7,
)
# Wrap the template and fit it to each group in a single chained call.
auto_arima_model = GroupedPmdarima(model_template=auto_arima).fit(
    df=data.df,
    group_key_columns=data.key_columns,
    y_col="y",
    datetime_col="ds",
    silence_warnings=True,
)

Let’s see what the parameters are for this run.

[20]:

# Display the parameters selected for each group's seasonal model.
auto_arima_model.get_model_params()

[20]:

	grouping_key_columns	key2	key1	key0	maxiter	method	out_of_sample_size	scoring	scoring_args	start_params	suppress_warnings	trend	with_intercept	p	d	P	D	s
0	(key2, key1, key0)	F	E	K	500	lbfgs	14	mse	{}	None	True	None	False	5	1	2	1	7
1	(key2, key1, key0)	J	X	V	500	lbfgs	14	mse	{}	None	True	None	False	5	1	2	1	7
2	(key2, key1, key0)	N	X	C	500	lbfgs	14	mse	{}	None	True	None	False	5	1	2	1	7
3	(key2, key1, key0)	O	T	Y	500	lbfgs	14	mse	{}	None	True	None	False	5	1	2	1	7
4	(key2, key1, key0)	Y	V	G	500	lbfgs	14	mse	{}	None	True	None	False	5	1	2	1	7

And the training metrics…

[21]:

# Display the per-group training metrics for the seasonal models.
auto_arima_model.get_metrics()

[21]:

	grouping_key_columns	key2	key1	key0	hqic	aicc	oob	bic	aic
0	(key2, key1, key0)	F	E	K	7238.444457	7223.565458	36.179148	7263.018307	7223.427129
1	(key2, key1, key0)	J	X	V	7500.385256	7485.506257	104.947817	7524.959107	7485.367929
2	(key2, key1, key0)	N	X	C	6831.858389	6816.979390	18.105916	6856.432240	6816.841062
3	(key2, key1, key0)	O	T	Y	6355.091450	6340.212451	43.374184	6379.665300	6340.074122
4	(key2, key1, key0)	Y	V	G	6320.132335	6305.253336	23.108154	6344.706185	6305.115008

Next, generate a 60-period forecast for each group’s model and plot it. The cross validation of each group’s model, which shows our error metrics for prediction via backtesting, follows further below.

[22]:

# Forecast 60 periods per group with interval bounds at alpha=0.05. No
# predict_col is given, so the prediction column keeps its default name.
auto_arima_forecast = auto_arima_model.predict(
    n_periods=60,
    alpha=0.05,
    return_conf_int=True,
)

[23]:

# Plot the seasonal forecasts; "yhat" is the prediction column here.
plot_grouped_series_forecast(
    auto_arima_forecast,
    data.key_columns,
    "ds",
    "yhat",
    "yhat_lower",
    "yhat_upper",
)

../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_40_0.png

The seasonal results for this example aren’t quite as good as those of the first example. However, this is due to the nature of this generated synthetic example data. For many real-world complex series data, using a seasonal approach with each model getting fit with its own optimal ARIMA order terms (p, d, q) and seasonal terms (P, D, Q) can provide better results than manually specifying them.

Cross validation backtesting on the models to get the error metrics

[24]:

# Sliding-window splitter configured with h=28, step=180 and window_size=365.
auto_arima_cv_window = SlidingWindowForecastCV(
    h=28,
    step=180,
    window_size=365,
)

# Backtest every group's model over the folds, scoring three metrics; a failed
# fold is recorded as NaN via error_score.
auto_arima_cv = auto_arima_model.cross_validate(
    df=data.df,
    metrics=["mean_squared_error", "smape", "mean_absolute_error"],
    cross_validator=auto_arima_cv_window,
    error_score=np.nan,
    verbosity=4,
)

[CV] fold=0 ..........................................................
fold=0, score=79.808 [time=27.476 sec]
[CV] fold=1 ..........................................................
fold=1, score=59.472 [time=41.604 sec]
[CV] fold=2 ..........................................................
fold=2, score=232.314 [time=25.329 sec]
[CV] fold=3 ..........................................................
fold=3, score=49.415 [time=21.327 sec]
[CV] fold=0 ..........................................................
fold=0, score=1.542 [time=26.895 sec]
[CV] fold=1 ..........................................................
fold=1, score=1.240 [time=40.395 sec]
[CV] fold=2 ..........................................................
fold=2, score=2.711 [time=25.616 sec]
[CV] fold=3 ..........................................................
fold=3, score=1.138 [time=21.324 sec]
[CV] fold=0 ..........................................................
fold=0, score=7.313 [time=26.776 sec]
[CV] fold=1 ..........................................................
fold=1, score=6.078 [time=40.482 sec]
[CV] fold=2 ..........................................................
fold=2, score=13.143 [time=25.371 sec]
[CV] fold=3 ..........................................................
fold=3, score=5.498 [time=21.812 sec]
[CV] fold=0 ..........................................................
fold=0, score=108.753 [time=21.509 sec]
[CV] fold=1 ..........................................................
fold=1, score=107.304 [time=24.203 sec]
[CV] fold=2 ..........................................................
fold=2, score=142.982 [time=22.971 sec]
[CV] fold=3 ..........................................................
fold=3, score=67.764 [time=20.946 sec]
[CV] fold=0 ..........................................................
fold=0, score=1.218 [time=21.455 sec]
[CV] fold=1 ..........................................................
fold=1, score=1.190 [time=23.443 sec]
[CV] fold=2 ..........................................................
fold=2, score=1.365 [time=22.825 sec]
[CV] fold=3 ..........................................................
fold=3, score=0.952 [time=21.210 sec]
[CV] fold=0 ..........................................................
fold=0, score=8.226 [time=21.380 sec]
[CV] fold=1 ..........................................................
fold=1, score=8.362 [time=23.636 sec]
[CV] fold=2 ..........................................................
fold=2, score=9.412 [time=22.880 sec]
[CV] fold=3 ..........................................................
fold=3, score=6.478 [time=21.000 sec]
[CV] fold=0 ..........................................................
fold=0, score=33.936 [time=26.360 sec]
[CV] fold=1 ..........................................................
fold=1, score=31.374 [time=26.617 sec]
[CV] fold=2 ..........................................................
fold=2, score=31.117 [time=26.668 sec]
[CV] fold=3 ..........................................................
fold=3, score=32.456 [time=32.644 sec]
[CV] fold=0 ..........................................................
fold=0, score=1.034 [time=2784.458 sec]
[CV] fold=1 ..........................................................
fold=1, score=0.923 [time=7289.441 sec]
[CV] fold=2 ..........................................................
fold=2, score=0.896 [time=3687.129 sec]
[CV] fold=3 ..........................................................
fold=3, score=0.918 [time=7270.761 sec]
[CV] fold=0 ..........................................................
fold=0, score=4.857 [time=7284.930 sec]
[CV] fold=1 ..........................................................
fold=1, score=4.560 [time=7287.635 sec]
[CV] fold=2 ..........................................................
fold=2, score=4.415 [time=3681.372 sec]
[CV] fold=3 ..........................................................
fold=3, score=4.672 [time=5103.397 sec]
[CV] fold=0 ..........................................................
fold=0, score=22.142 [time=3003.464 sec]
[CV] fold=1 ..........................................................
fold=1, score=37.220 [time=907.578 sec]
[CV] fold=2 ..........................................................
fold=2, score=36.748 [time=128.197 sec]
[CV] fold=3 ..........................................................
fold=3, score=30.669 [time=59.834 sec]
[CV] fold=0 ..........................................................
fold=0, score=0.652 [time=22.607 sec]
[CV] fold=1 ..........................................................
fold=1, score=0.836 [time=25.936 sec]
[CV] fold=2 ..........................................................
fold=2, score=0.827 [time=25.435 sec]
[CV] fold=3 ..........................................................
fold=3, score=0.726 [time=21.935 sec]
[CV] fold=0 ..........................................................
fold=0, score=4.110 [time=21.966 sec]
[CV] fold=1 ..........................................................
fold=1, score=4.930 [time=25.702 sec]
[CV] fold=2 ..........................................................
fold=2, score=5.035 [time=27.211 sec]
[CV] fold=3 ..........................................................
fold=3, score=4.385 [time=22.635 sec]
[CV] fold=0 ..........................................................
fold=0, score=17.119 [time=34.697 sec]
[CV] fold=1 ..........................................................
fold=1, score=107.078 [time=17.232 sec]
[CV] fold=2 ..........................................................
fold=2, score=19.922 [time=26.995 sec]
[CV] fold=3 ..........................................................
fold=3, score=16.925 [time=24.339 sec]
[CV] fold=0 ..........................................................
fold=0, score=0.396 [time=36.418 sec]
[CV] fold=1 ..........................................................
fold=1, score=1.083 [time=18.280 sec]
[CV] fold=2 ..........................................................
fold=2, score=0.419 [time=28.167 sec]
[CV] fold=3 ..........................................................
fold=3, score=0.383 [time=23.993 sec]
[CV] fold=0 ..........................................................
fold=0, score=3.361 [time=36.009 sec]
[CV] fold=1 ..........................................................
fold=1, score=9.168 [time=18.480 sec]
[CV] fold=2 ..........................................................
fold=2, score=3.774 [time=27.421 sec]
[CV] fold=3 ..........................................................
fold=3, score=3.246 [time=23.926 sec]

[25]:

# Display the per-group cross validation summary (mean and stddev per metric).
auto_arima_cv

[25]:

	grouping_key_columns	key2	key1	key0	mean_squared_error_mean	mean_squared_error_stddev	smape_mean	smape_stddev	mean_absolute_error_mean	mean_absolute_error_stddev
0	(key2, key1, key0)	F	E	K	105.252090	74.171681	1.657663	0.626170	8.007901	3.036208
1	(key2, key1, key0)	J	X	V	106.700806	26.631834	1.181143	0.148013	8.119500	1.052836
2	(key2, key1, key0)	N	X	C	32.220902	1.110630	0.942818	0.053464	4.625816	0.161250
3	(key2, key1, key0)	O	T	Y	31.694749	6.090290	0.760451	0.076121	4.615029	0.381769
4	(key2, key1, key0)	Y	V	G	40.261072	38.594749	0.570198	0.296271	4.887450	2.479422

AutoARIMA without seasonality components

[26]:

# Non-seasonal AutoARIMA template: the 'm' arg is deliberately left out.
auto_arima_no_seasonal = AutoARIMA(
    out_of_sample_size=14,
    maxiter=500,
    d=1,
    max_order=14,
)

auto_arima_no_seasonal_obj = GroupedPmdarima(model_template=auto_arima_no_seasonal)

# Fit the non-seasonal template to each group.
auto_arima_model_no_seasonal = auto_arima_no_seasonal_obj.fit(
    df=data.df,
    group_key_columns=data.key_columns,
    y_col="y",
    datetime_col="ds",
    silence_warnings=True,
)

[27]:

# Display the order terms AutoARIMA selected for each group.
auto_arima_model_no_seasonal.get_model_params()

[27]:

	grouping_key_columns	key2	key1	key0	maxiter	method	out_of_sample_size	scoring	scoring_args	start_params	suppress_warnings	trend	with_intercept	p	d	q
0	(key2, key1, key0)	F	E	K	500	lbfgs	14	mse	{}	None	True	None	False	2	1	2
1	(key2, key1, key0)	J	X	V	500	lbfgs	14	mse	{}	None	True	None	False	2	1	0
2	(key2, key1, key0)	N	X	C	500	lbfgs	14	mse	{}	None	True	None	True	4	1	3
3	(key2, key1, key0)	O	T	Y	500	lbfgs	14	mse	{}	None	True	None	False	4	1	5
4	(key2, key1, key0)	Y	V	G	500	lbfgs	14	mse	{}	None	True	None	False	5	1	4

[28]:

# Display the per-group training metrics for the non-seasonal models.
auto_arima_model_no_seasonal.get_metrics()

[28]:

	grouping_key_columns	key2	key1	key0	hqic	aicc	oob	bic	aic
0	(key2, key1, key0)	F	E	K	11803.021777	11793.683788	4803.128966	11818.404280	11793.626317
1	(key2, key1, key0)	J	X	V	13775.437281	13769.822949	32134.899126	13784.666782	13769.800005
2	(key2, key1, key0)	N	X	C	7591.362347	7574.623595	85.834726	7619.050852	7574.450519
3	(key2, key1, key0)	O	T	Y	7181.798445	7163.219266	51.496175	7212.563450	7163.007524
4	(key2, key1, key0)	Y	V	G	8291.866077	8273.286898	108.699561	8322.631082	8273.075156

[29]:

# Forecast 60 periods per group from the non-seasonal models, then plot the
# predictions ("yhat") with their interval bounds.
auto_arima_forecast_no_seasonal = auto_arima_model_no_seasonal.predict(
    n_periods=60,
    alpha=0.05,
    return_conf_int=True,
)
plot_grouped_series_forecast(
    auto_arima_forecast_no_seasonal,
    data.key_columns,
    "ds",
    "yhat",
    "yhat_lower",
    "yhat_upper",
)

../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_49_0.png

These are the results when allowing AutoARIMA to optimize without specifying the seasonal ‘m’ value. They’re definitely not as great since the generated data has a clear weekly seasonality component and the optimizer will struggle to find appropriate ordering terms for this type of data.

Pipeline orchestration with data preprocessing

[78]:

# Pipeline template: a log transform of the endogenous series followed by an
# AutoARIMA stage that searches for the order terms.
pipeline_obj = Pipeline(
    steps=[
        (
            "log",
            LogEndogTransformer(lmbda=0.2, neg_action="raise", floor=1e-12),
        ),
        ("arima", AutoARIMA(out_of_sample_size=14, max_order=14, d=1, suppress_warnings=True)),
    ]
)

# As in the earlier cells of this notebook, the constructor receives only the
# model template; the target and datetime column names are passed to fit().
pipeline_arima = GroupedPmdarima(model_template=pipeline_obj).fit(
    df=data.df,
    group_key_columns=data.key_columns,
    y_col="y",
    datetime_col="ds",
    silence_warnings=True,
)

[79]:

# Forecast 60 periods per group from the pipeline models. No predict_col is
# given, so the prediction column keeps its default name.
pipeline_forecast = pipeline_arima.predict(
    n_periods=60,
    alpha=0.05,
    return_conf_int=True,
)

[89]:

# Plot the pipeline forecasts. The predict() call for pipeline_forecast sets
# no predict_col, so the prediction column is "yhat" (as in the other
# AutoARIMA plots in this notebook), not "forecast".
plot_grouped_series_forecast(
    pipeline_forecast,
    data.key_columns,
    "ds",
    "yhat",
    "yhat_lower",
    "yhat_upper",
)

../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_54_0.png

These series are definitely not in need of log transformation to build effective ARIMA models. That being said, your data may benefit from having an endogenous transformation applied to enforce stationarity.