Notebook example of Diviner’s GroupedPmdarima API

This notebook shows a comparison of 4 primary means of utilizing the GroupedPmdarima API in Diviner.

Examples shown

  • Standard ARIMA

  • A user-supplied manual configuration of ARIMA models to be built based on the configured order terms. This approach is useful for hierarchical multi-series optimization techniques (where ordering terms are determined by evaluating the optimal parameters from a higher level aggregation of disparate series that share a similar seasonality with one another).

  • Note that this is the fastest execution available and is recommended when there is homogeneity amongst a large collection of individual series (e.g., forecasting SKU sales at 500 different stores would use a global SKU forecasting model through AutoARIMA to determine the optimal ordering terms, then apply those terms to each individual store’s ARIMA model through this mode).

  • AutoARIMA

  • An automated approach that will perform a best-effort optimization of ordering terms.

  • Seasonal AutoARIMA

  • A much slower, but, depending on the nature of the series data, potentially much more accurate model for each series.

  • Pipeline preprocessing + AutoARIMA

  • This mode applies a data transformer (exogenous transformers are currently not supported) such as a LogEndogTransformer or BoxCoxEndogTransformer. Depending on the nature of the data, this may dramatically improve the forecasting quality.

[2]:
import itertools
import pandas as pd
import numpy as np
import string
import random
from datetime import timedelta, datetime
from collections import namedtuple
import matplotlib.pyplot as plt
import matplotlib.cm as cmx

from pmdarima.arima.arima import ARIMA
from pmdarima.arima.auto import AutoARIMA
from pmdarima.pipeline import Pipeline
from pmdarima.preprocessing import LogEndogTransformer
from pmdarima.model_selection import SlidingWindowForecastCV
from diviner import GroupedPmdarima, PmdarimaAnalyzer
[3]:
def _build_trend(size):
    """Create a linear trend of ``size`` points.

    The slope is drawn uniformly from [-0.05, 0.2) and the starting level is
    a random integer drawn from [200, 500).
    """
    slope = np.random.uniform(-0.05, 0.2)
    intercept = np.random.randint(200, 500)
    steps = np.arange(size)
    return steps * slope + intercept


def _build_seasonality(size, period):
    """Build a repeating seasonal pattern that is exactly ``size`` points long.

    One cycle of ``period`` values is generated from a piecewise power curve
    and then repeated cyclically. When ``size`` is not a multiple of
    ``period`` the final cycle is truncated, so the result can always be added
    element-wise to other components of length ``size``.

    Parameters
    ----------
    size : int
        Number of points to produce.
    period : int
        Length of one seasonal cycle; must be at least 1.

    Returns
    -------
    numpy.ndarray
        The seasonal component, scaled by a random integer drawn from [1, 4).

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError(f"period must be a positive integer, got {period}")

    # Shift the cycle positions so they start at 3 instead of 0.
    repeated_x = np.arange(period) + 3

    # Piecewise curve: steep (x^4) at the start of the cycle, then x^3, then x^2.
    raw_values = np.where(
        repeated_x < 5,
        repeated_x**4,
        np.where(repeated_x < 7, repeated_x**3, repeated_x**2),
    )

    # np.resize tiles the cycle and cuts it to exactly `size` elements, which
    # covers sizes that are shorter than, or not a multiple of, the period.
    seasonality = np.resize(raw_values, size)
    return seasonality * np.random.randint(1, 4)


def _build_residuals(size):
    """Generate ``size`` standard-normal noise values scaled by a random integer in [4, 10)."""
    noise = np.random.randn(size)
    amplitude = np.random.randint(4, 10)
    return noise * amplitude


def _generate_time_series(size, seasonal_period):
    """Compose a synthetic series of ``size`` points as trend + seasonality + noise."""
    trend = _build_trend(size)
    seasonal = _build_seasonality(size, seasonal_period)
    noise = _build_residuals(size)
    return trend + seasonal + noise


def _generate_grouping_columns(column_count: int, series_count: int):
    """Pick ``series_count`` random, distinct group keys built from uppercase letters.

    Every key is a permutation of ``column_count`` different letters. The
    result is a list of dicts mapping column names (``key<N-1>`` ... ``key0``,
    in reverse-sorted order) to the letters of one key.
    """
    all_keys = list(itertools.permutations(string.ascii_uppercase, column_count))
    chosen = random.sample(all_keys, series_count)

    names = [f"key{idx}" for idx in range(column_count)]
    names.sort(reverse=True)

    return [{name: letter for name, letter in zip(names, key)} for key in chosen]


def _generate_raw_df(
    column_count: int,
    series_count: int,
    series_size: int,
    series_seasonal_period: int,
    start_dt: str,
    days_period: int,
):
    """Build a long-format DataFrame holding several synthetic series.

    Parameters
    ----------
    column_count : int
        Number of grouping key columns identifying each series.
    series_count : int
        Number of distinct series (groups) to generate.
    series_size : int
        Number of rows per series.
    series_seasonal_period : int
        Seasonal cycle length passed to the series generator.
    start_dt : str
        First date of every series, formatted as ``YYYY-MM-DD``.
    days_period : int
        Number of days between consecutive rows.

    Returns
    -------
    pandas.DataFrame
        The per-series frames concatenated row-wise, with columns ``ds``
        (dates), ``y`` (values) and one column per grouping key.
    """
    candidates = _generate_grouping_columns(column_count, series_count)

    # "%m" is the month directive. The previous "%M" parsed the middle field as
    # minutes, which dropped the month and shifted the start time.
    start_date = datetime.strptime(start_dt, "%Y-%m-%d")
    step = timedelta(days=days_period)
    dates = np.arange(start_date, start_date + series_size * step, step)

    frames = []
    for entry in candidates:
        generated_series = _generate_time_series(series_size, series_seasonal_period)
        series_df = pd.DataFrame({"ds": dates, "y": generated_series})
        # Scalar assignment tags every row of this series with its group key.
        for column, value in entry.items():
            series_df[column] = value
        frames.append(series_df)
    return pd.concat(frames)


def generate_example_data(
    column_count: int,
    series_count: int,
    series_size: int,
    series_seasonal_period: int,
    start_dt: str,
    days_period: int = 1,
):
    """Generate synthetic grouped series and report which columns are group keys.

    Returns a ``Structure`` namedtuple with ``df`` (the generated DataFrame)
    and ``key_columns`` (every column name other than ``ds`` and ``y``).
    """
    raw = _generate_raw_df(
        column_count,
        series_count,
        series_size,
        series_seasonal_period,
        start_dt,
        days_period,
    )

    # Whatever is not the date or value column identifies the group.
    reserved = ("ds", "y")
    grouping = [name for name in raw.columns if name not in reserved]

    Structure = namedtuple("Structure", ["df", "key_columns"])
    return Structure(df=raw, key_columns=grouping)

def plot_grouped_series(df, key_columns, time_col, y_col):
    """Plot each group's series on its own subplot, stacked in a single column.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data containing the key, time and value columns.
    key_columns : list
        Column names used to split ``df`` into groups.
    time_col : str
        Name of the column plotted on the x axis.
    y_col : str
        Name of the column plotted on the y axis.
    """
    grouped = df.groupby(key_columns)
    ncols = 1
    nrows = int(np.ceil(grouped.ngroups / ncols))
    # squeeze=False makes subplots() always return a 2-D array of Axes. Without
    # it a single group yields a bare Axes object and .flatten() fails.
    _, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(16, 6 * grouped.ngroups),
        sharey=False,
        sharex=False,
        squeeze=False,
    )
    # One evenly spaced Dark2 colour per group.
    cmap = [cmx.Dark2(x) for x in np.linspace(0.0, 1.0, grouped.ngroups)]
    for key, ax, rgb in zip(grouped.groups.keys(), axes.flatten(), cmap):
        ser = grouped.get_group(key)
        ax.plot(ser[time_col], ser[y_col], label="y value", c=rgb)
        ax.legend()
        ax.title.set_text(f"Group: {key}")
    plt.show()

def plot_grouped_series_forecast(df, key_columns, time_col, y_col, yhat_lower_col, yhat_upper_col):
    """Plot each group's values with a shaded band between its lower and upper bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data containing the key, time, value and bound columns.
    key_columns : list
        Column names used to split ``df`` into groups.
    time_col : str
        Name of the column plotted on the x axis.
    y_col : str
        Name of the column drawn as a line.
    yhat_lower_col : str
        Name of the column holding the lower edge of the shaded band.
    yhat_upper_col : str
        Name of the column holding the upper edge of the shaded band.
    """
    grouped = df.groupby(key_columns)
    ncols = 1
    nrows = int(np.ceil(grouped.ngroups / ncols))
    # squeeze=False makes subplots() always return a 2-D array of Axes. Without
    # it a single group yields a bare Axes object and .flatten() fails.
    _, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(16, 8 * grouped.ngroups),
        sharey=False,
        sharex=False,
        squeeze=False,
    )
    # One evenly spaced Dark2 colour per group.
    cmap = [cmx.Dark2(x) for x in np.linspace(0.0, 1.0, grouped.ngroups)]
    for key, ax, rgb in zip(grouped.groups.keys(), axes.flatten(), cmap):
        ser = grouped.get_group(key)
        ax.plot(ser[time_col], ser[y_col], label="y value", c=rgb)
        ax.fill_between(
            ser[time_col],
            ser[yhat_lower_col],
            ser[yhat_upper_col],
            color=rgb,
            alpha=0.3,
            label="error",
        )
        ax.legend(loc="upper left")
        ax.title.set_text(f"Group: {key}")
        ax.grid(color=rgb, linewidth=0.5, alpha=0.5)
    plt.show()
[4]:
# Five synthetic series of 1050 points each, identified by three key columns,
# with a 7-step seasonal cycle and one day between rows, starting 2017-01-01.
data = generate_example_data(
    column_count=3,
    series_count=5,
    series_size=1050,
    series_seasonal_period=7,
    start_dt="2017-01-01",
    days_period=1,
)

View the synthetic generated data

Note that this is a randomly generated data set.

[5]:
# One subplot per generated group: "ds" on the x axis, "y" on the y axis.
plot_grouped_series(
    df=data.df,
    key_columns=data.key_columns,
    time_col="ds",
    y_col="y",
)
../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_6_0.png

ARIMA manually defined ordering terms

[6]:
# Analyzer used for the per-group diagnostics in the following cells.
# NOTE(review): the positional arguments look like (DataFrame, group key
# columns, value column "y", datetime column "ds") — confirm against the
# PmdarimaAnalyzer signature.
data_utils = PmdarimaAnalyzer(data.df, data.key_columns, "y", "ds")

# Per-group estimate of the differencing term, configured with the KPSS test,
# alpha=0.05 and max_d=7. The result is a dict keyed by the group key tuple.
ndiffs = data_utils.calculate_ndiffs(alpha=0.05, test="kpss", max_d=7)
ndiffs
[6]:
{('F', 'E', 'K'): 1,
 ('J', 'X', 'V'): 0,
 ('N', 'X', 'C'): 1,
 ('O', 'T', 'Y'): 1,
 ('Y', 'V', 'G'): 1}

The results above for the ndiffs method show that the differencing value for nearly every group should be set to ‘1’ (one group, ('J', 'X', 'V'), was estimated at ‘0’). This isn’t surprising based on how the generated data was created. Real world data sets may have different optimal ‘d’ values, though. Performing a manual validation can dramatically reduce optimization time for the AutoARIMA methods (which will be covered further down in this example notebook).

Let’s check the seasonal differencing as well.

[7]:
# Per-group estimate of the seasonal differencing term, configured with the
# OCSB test, m=20 and max_D=30.
# NOTE(review): the synthetic data was generated with series_seasonal_period=7,
# while m=20 is used here — confirm that this mismatch is intended.
nsdiffs = data_utils.calculate_nsdiffs(m=20, test="ocsb", max_D=30)
nsdiffs
[7]:
{('F', 'E', 'K'): 0,
 ('J', 'X', 'V'): 0,
 ('N', 'X', 'C'): 0,
 ('O', 'T', 'Y'): 0,
 ('Y', 'V', 'G'): 0}

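Calculate the ACF values for each group to aid in setting the ARIMA ordering terms. (The ACF is conventionally used to choose the moving-average order ‘q’, while the partial autocorrelation function informs the autoregressive order ‘p’.)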

[8]:
# Per-group autocorrelation results. With alpha=0.05 and qstat=True each
# group's entry carries 'acf', 'confidence_intervals', 'qstat' and 'pvalues'
# arrays, as shown in the output below.
acf = data_utils.calculate_acf(alpha=0.05, qstat=True)
acf
[8]:
{('F',
  'E',
  'K'): {'acf': array([ 1.        , -0.08070348,  0.17113533, -0.57432053, -0.57269233,
          0.16892029, -0.08007331,  0.9866866 , -0.08029439,  0.16979978,
         -0.57130432, -0.56881352,  0.16684712, -0.07890206,  0.98009748,
         -0.07907823,  0.16909447, -0.56757814, -0.56480278,  0.16576373,
         -0.07812178,  0.97376666, -0.07840144,  0.16723731, -0.56373238,
         -0.56227819,  0.1649875 , -0.07771043,  0.96709519, -0.07748238,
          0.16583409]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.41189283e-01, -2.02176793e-02],
         [ 1.10256857e-01,  2.32013805e-01],
         [-6.36934311e-01, -5.11706747e-01],
         [-6.52278609e-01, -4.93106048e-01],
         [ 7.54654815e-02,  2.62375098e-01],
         [-1.74638559e-01,  1.44919437e-02],
         [ 8.91873617e-01,  1.08149958e+00],
         [-2.07231513e-01,  4.66427328e-02],
         [ 4.26769784e-02,  2.96922590e-01],
         [-6.99254204e-01, -4.43354434e-01],
         [-7.05778407e-01, -4.31848638e-01],
         [ 2.14964890e-02,  3.12197760e-01],
         [-2.24951705e-01,  6.71475893e-02],
         [ 8.33891965e-01,  1.12630299e+00],
         [-2.47615343e-01,  8.94588851e-02],
         [ 4.21665597e-04,  3.37767275e-01],
         [-7.36869989e-01, -3.98286283e-01],
         [-7.40918901e-01, -3.88686666e-01],
         [-1.68589604e-02,  3.48386416e-01],
         [-2.61294110e-01,  1.05050546e-01],
         [ 7.90472475e-01,  1.15706084e+00],
         [-2.79734390e-01,  1.22931504e-01],
         [-3.42072984e-02,  3.68681928e-01],
         [-7.65684304e-01, -3.61780464e-01],
         [-7.69907420e-01, -3.54648952e-01],
         [-4.81397856e-02,  3.78114778e-01],
         [-2.91304472e-01,  1.35883613e-01],
         [ 7.53397735e-01,  1.18079264e+00],
         [-3.06633108e-01,  1.51668346e-01],
         [-6.34124686e-02,  3.95080645e-01]]), 'qstat': array([6.85826223e+00, 3.77273016e+01, 3.85717521e+02, 7.32068234e+02,
         7.62229696e+02, 7.69013606e+02, 1.80006234e+03, 1.80689685e+03,
         1.83749031e+03, 2.18415269e+03, 2.52812962e+03, 2.55775372e+03,
         2.56438508e+03, 3.58858286e+03, 3.59525674e+03, 3.62580196e+03,
         3.97027563e+03, 4.31171925e+03, 4.34115841e+03, 4.34770344e+03,
         5.36559020e+03, 5.37219501e+03, 5.40227661e+03, 5.74441645e+03,
         6.08512548e+03, 6.11448893e+03, 6.12100954e+03, 7.13187321e+03,
         7.13836830e+03, 7.16815021e+03]), 'pvalues': array([8.82323175e-003, 6.42126449e-009, 2.74600191e-083, 3.96377730e-157,
         1.71230216e-162, 7.61823371e-163, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])},
 ('J',
  'X',
  'V'): {'acf': array([ 1.        , -0.08971388,  0.17178181, -0.57976408, -0.57751633,
          0.16928219, -0.08856344,  0.99113133, -0.08908178,  0.17049929,
         -0.57609728, -0.57368404,  0.16817819, -0.08774527,  0.9848123 ,
         -0.08839067,  0.16942107, -0.57210998, -0.56969456,  0.16699015,
         -0.08711197,  0.97796277, -0.08810664,  0.16820989, -0.5684689 ,
         -0.56560049,  0.16574469, -0.08629873,  0.97111987, -0.0872756 ,
          0.16688735]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.50199679e-01, -2.92280757e-02],
         [ 1.10811128e-01,  2.32752494e-01],
         [-6.42480452e-01, -5.17047707e-01],
         [-6.57471324e-01, -4.97561327e-01],
         [ 7.52969431e-02,  2.63267435e-01],
         [-1.83657644e-01,  6.53076441e-03],
         [ 8.95735843e-01,  1.08652682e+00],
         [-2.16706827e-01,  3.85432747e-02],
         [ 4.26469632e-02,  2.98351626e-01],
         [-7.04778770e-01, -4.47415791e-01],
         [-7.11478716e-01, -4.35889368e-01],
         [ 2.19061278e-02,  3.14450251e-01],
         [-2.34723063e-01,  5.92325199e-02],
         [ 8.37642982e-01,  1.13198161e+00],
         [-2.57964550e-01,  8.11832075e-02],
         [-3.21283670e-04,  3.39163431e-01],
         [-7.42469873e-01, -4.01750086e-01],
         [-7.46944218e-01, -3.92444907e-01],
         [-1.68364179e-02,  3.50816720e-01],
         [-2.71492690e-01,  9.72687454e-02],
         [ 7.93431540e-01,  1.16249400e+00],
         [-2.90714385e-01,  1.14501097e-01],
         [-3.45379763e-02,  3.70957757e-01],
         [-7.71726697e-01, -3.65211110e-01],
         [-7.74594003e-01, -3.56606968e-01],
         [-4.87758122e-02,  3.80265202e-01],
         [-3.01287231e-01,  1.28689779e-01],
         [ 7.56004669e-01,  1.18623508e+00],
         [-3.17872831e-01,  1.43321630e-01],
         [-6.38306951e-02,  3.97605398e-01]]), 'qstat': array([   8.47517754,   39.5778789 ,  394.19603408,  746.40619894,
          776.69703653,  784.99580201, 1825.3545896 , 1833.76689546,
         1864.61294006, 2217.11637211, 2567.00919151, 2597.10784245,
         2605.30897302, 3639.38437728, 3647.72267797, 3678.386011  ,
         4028.38256167, 4375.76630468, 4405.64269257, 4413.78080166,
         5440.45892422, 5448.80013774, 5479.23263921, 5827.14599696,
         6171.8931755 , 6201.52677151, 6209.56829208, 7228.86313254,
         7237.10384927, 7267.26526426]), 'pvalues': array([3.60025244e-003, 2.54549820e-009, 4.00232684e-085, 3.11215093e-160,
         1.27131779e-165, 2.68648911e-166, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])},
 ('N',
  'X',
  'C'): {'acf': array([ 1.        , -0.01968557,  0.22007368, -0.48359781, -0.48169266,
          0.217541  , -0.01947545,  0.98802832, -0.02016042,  0.21825466,
         -0.48114571, -0.47876755,  0.21525037, -0.02022865,  0.98066016,
         -0.02062928,  0.21593687, -0.47865674, -0.47685675,  0.21294954,
         -0.02105458,  0.97360499, -0.0210813 ,  0.21410622, -0.47613525,
         -0.47476485,  0.21066211, -0.02169228,  0.96607903, -0.02189333,
          0.21159558]), 'confidence_intervals': array([[ 1.        ,  1.        ],
         [-0.08017137,  0.04080023],
         [ 0.15956444,  0.28058291],
         [-0.54696776, -0.42022786],
         [-0.5573694 , -0.40601591],
         [ 0.13137411,  0.30370788],
         [-0.10762875,  0.06867786],
         [ 0.89985927,  1.07619737],
         [-0.14229436,  0.10197352],
         [ 0.09610855,  0.34040077],
         [-0.60471035, -0.35758106],
         [-0.6090063 , -0.34852881],
         [ 0.07872441,  0.35177632],
         [-0.1579906 ,  0.11753331],
         [ 0.84288734,  1.11843299],
         [-0.18193064,  0.14067209],
         [ 0.05462586,  0.37724789],
         [-0.64102185, -0.31629163],
         [-0.64430483, -0.30940867],
         [ 0.04060482,  0.38529426],
         [-0.19435926,  0.1522501 ],
         [ 0.80029094,  1.14691903],
         [-0.21336663,  0.17120404],
         [ 0.02181242,  0.40640001],
         [-0.66929924, -0.28297126],
         [-0.67217595, -0.27735376],
         [ 0.00911703,  0.4122072 ],
         [-0.22404133,  0.18065678],
         [ 0.76372146,  1.16843659],
         [-0.24047436,  0.19668769],
         [-0.00699346,  0.43018463]]), 'qstat': array([4.08061309e-01, 5.14562056e+01, 2.98189072e+02, 5.43215771e+02,
         5.93238914e+02, 5.93640224e+02, 1.62749495e+03, 1.62792581e+03,
         1.67847117e+03, 1.92435215e+03, 2.16804283e+03, 2.21734834e+03,
         2.21778421e+03, 3.24315833e+03, 3.24361252e+03, 3.29342499e+03,
         3.53841766e+03, 3.78180680e+03, 3.83039153e+03, 3.83086693e+03,
         4.84841572e+03, 4.84889326e+03, 4.89819851e+03, 5.14227073e+03,
         5.38517676e+03, 5.43304836e+03, 5.43355645e+03, 6.44229694e+03,
         6.44281550e+03, 6.49130169e+03]), 'pvalues': array([5.22955152e-001, 6.70543443e-012, 2.45301824e-064, 3.00422546e-116,
         5.84337991e-126, 5.48966949e-125, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])},
 ('O',
  'T',
  'Y'): {'acf': array([ 1.        , -0.08259995,  0.17734596, -0.57067234, -0.5684687 ,
          0.17509455, -0.08130592,  0.99248242, -0.08216322,  0.17599856,
         -0.56701055, -0.56461355,  0.17391106, -0.08060416,  0.98566707,
         -0.08166764,  0.17457252, -0.56337294, -0.56083338,  0.17276644,
         -0.08014475,  0.97890769, -0.08129729,  0.17329419, -0.55960215,
         -0.55701671,  0.17141666, -0.07962553,  0.97213105, -0.08084958,
          0.17208343]), 'confidence_intervals': array([[ 1.        ,  1.        ],
         [-0.14308575, -0.02211415],
         [ 0.11644888,  0.23824304],
         [-0.63343051, -0.50791417],
         [-0.64797665, -0.48896075],
         [ 0.08189545,  0.26829364],
         [-0.17570083,  0.01308899],
         [ 0.89783164,  1.0871332 ],
         [-0.20930973,  0.04498329],
         [ 0.04865795,  0.30333917],
         [-0.695238  , -0.43878309],
         [-0.70170739, -0.4275197 ],
         [ 0.02855867,  0.31926346],
         [-0.22671584,  0.06550753],
         [ 0.8393928 ,  1.13194135],
         [-0.25050182,  0.08716655],
         [ 0.00559388,  0.34355117],
         [-0.73301013, -0.39373576],
         [-0.73718284, -0.38448393],
         [-0.00999187,  0.35552474],
         [-0.2634996 ,  0.10321009],
         [ 0.79542472,  1.16239065],
         [-0.2829843 ,  0.12038972],
         [-0.02851267,  0.37510105],
         [-0.76195271, -0.3572516 ],
         [-0.76495209, -0.34908134],
         [-0.04190791,  0.38474122],
         [-0.29345343,  0.13420237],
         [ 0.7581947 ,  1.18606741],
         [-0.31037882,  0.14867966],
         [-0.05754997,  0.40171684]]), 'qstat': array([7.18437719e+00, 4.03345925e+01, 3.83917858e+02, 7.25178711e+02,
         7.57585350e+02, 7.64579724e+02, 1.80777684e+03, 1.81493320e+03,
         1.84780115e+03, 2.18927226e+03, 2.52818827e+03, 2.56037391e+03,
         2.56729446e+03, 3.60316572e+03, 3.61028383e+03, 3.64284022e+03,
         3.98222838e+03, 4.31888957e+03, 4.35086859e+03, 4.35775699e+03,
         5.38642004e+03, 5.39352177e+03, 5.42582177e+03, 5.76296654e+03,
         6.09732909e+03, 6.12902558e+03, 6.13587154e+03, 7.15729017e+03,
         7.16436205e+03, 7.19643087e+03]), 'pvalues': array([7.35410753e-003, 1.74363074e-009, 6.73724508e-083, 1.23042405e-155,
         1.73027963e-161, 6.91273826e-162, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])},
 ('Y',
  'V',
  'G'): {'acf': array([ 1.        , -0.06089902,  0.19406527, -0.53915391, -0.53707731,
          0.19151146, -0.06008225,  0.99253338, -0.0607289 ,  0.19239331,
         -0.53581884, -0.53377634,  0.18991977, -0.06011947,  0.98545395,
         -0.0607944 ,  0.19067928, -0.53253409, -0.53052952,  0.18844171,
         -0.06005286,  0.97843607, -0.06083826,  0.18909121, -0.52922907,
         -0.52732228,  0.18674487, -0.06008383,  0.9713386 , -0.06075151,
          0.18737217]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.21384817e-01, -4.13213869e-04],
         [ 1.33355563e-01,  2.54774984e-01],
         [-6.02092283e-01, -4.76215529e-01],
         [-6.15104319e-01, -4.59050299e-01],
         [ 1.00964032e-01,  2.82058885e-01],
         [-1.52099649e-01,  3.19351498e-02],
         [ 9.00372563e-01,  1.08469419e+00],
         [-1.86035803e-01,  6.45780064e-02],
         [ 6.69787750e-02,  3.17807845e-01],
         [-6.62308552e-01, -4.09329121e-01],
         [-6.68314035e-01, -3.99238650e-01],
         [ 4.78453231e-02,  3.31994214e-01],
         [-2.03119714e-01,  8.28807826e-02],
         [ 8.42361262e-01,  1.12854664e+00],
         [-2.26870469e-01,  1.05281673e-01],
         [ 2.45218148e-02,  3.56836755e-01],
         [-6.99490201e-01, -3.65577979e-01],
         [-7.03588498e-01, -3.57470552e-01],
         [ 9.53145029e-03,  3.67351967e-01],
         [-2.39687804e-01,  1.19582075e-01],
         [ 7.98727701e-01,  1.15814445e+00],
         [-2.59080530e-01,  1.37404007e-01],
         [-9.21935613e-03,  3.87401771e-01],
         [-7.28198173e-01, -3.30259963e-01],
         [-7.31376425e-01, -3.23268139e-01],
         [-2.22353699e-02,  3.95725109e-01],
         [-2.69673698e-01,  1.49506040e-01],
         [ 7.61685720e-01,  1.18099147e+00],
         [-2.86268629e-01,  1.64765599e-01],
         [-3.82048147e-02,  4.12949146e-01]]), 'qstat': array([3.90526128e+00, 4.36005909e+01, 3.50279471e+02, 6.54891439e+02,
         6.93659875e+02, 6.97479290e+02, 1.74078352e+03, 1.74469309e+03,
         1.78396972e+03, 2.08890498e+03, 2.39181116e+03, 2.43019497e+03,
         2.43404493e+03, 3.46946828e+03, 3.47341278e+03, 3.51225388e+03,
         3.81550300e+03, 4.11676507e+03, 4.15481032e+03, 4.15867786e+03,
         5.18634999e+03, 5.19032708e+03, 5.22878424e+03, 5.53032433e+03,
         5.82998754e+03, 5.86760612e+03, 5.87150414e+03, 6.89125818e+03,
         6.89525113e+03, 6.93327138e+03]), 'pvalues': array([4.81351422e-002, 3.40605757e-010, 1.29766125e-075, 2.03514101e-140,
         1.15368607e-147, 2.14170577e-147, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000, 0.00000000e+000])}}

And the partial autocorrelation function values…

[9]:
# Per-group partial autocorrelation results, requested with nlags=40 and
# alpha=0.05. Each group's entry carries 'pacf' and 'confidence_intervals'
# arrays, as shown in the output below.
pacf = data_utils.calculate_pacf(nlags=40, alpha=0.05)
pacf
[9]:
{('F',
  'E',
  'K'): {'pacf': array([ 1.        , -0.08078041,  0.16601981, -0.57034304, -0.9620566 ,
          0.61323735,  0.04746202,  0.77862671,  0.00637416, -0.08100356,
         -0.05871874, -0.13700759,  0.1174423 ,  0.0605576 ,  0.42934315,
          0.09243144, -0.07328271,  0.00398806, -0.100308  ,  0.1303113 ,
          0.05355023,  0.31839881,  0.08861446, -0.22613491,  0.01178529,
         -0.22631973,  0.18613883,  0.15154231,  0.24202115,  0.14823025,
         -0.36025751, -0.00747563, -0.09967852,  0.27377749,  0.2899224 ,
          0.03164718, -0.01778687, -0.72500717,  0.08315487,  0.44580926,
          1.87065584]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.41266216e-01, -2.02946130e-02],
         [ 1.05534006e-01,  2.26505610e-01],
         [-6.30828837e-01, -5.09857234e-01],
         [-1.02254240e+00, -9.01570796e-01],
         [ 5.52751546e-01,  6.73723149e-01],
         [-1.30237844e-02,  1.07947819e-01],
         [ 7.18140908e-01,  8.39112511e-01],
         [-5.41116405e-02,  6.68599629e-02],
         [-1.41489364e-01, -2.05177602e-02],
         [-1.19204540e-01,  1.76706309e-03],
         [-1.97493394e-01, -7.65217907e-02],
         [ 5.69564994e-02,  1.77928103e-01],
         [ 7.18002697e-05,  1.21043404e-01],
         [ 3.68857350e-01,  4.89828953e-01],
         [ 3.19456401e-02,  1.52917244e-01],
         [-1.33768515e-01, -1.27969120e-02],
         [-5.64977374e-02,  6.44738661e-02],
         [-1.60793800e-01, -3.98221961e-02],
         [ 6.98254990e-02,  1.90797102e-01],
         [-6.93556812e-03,  1.14036035e-01],
         [ 2.57913009e-01,  3.78884612e-01],
         [ 2.81286603e-02,  1.49100264e-01],
         [-2.86620716e-01, -1.65649112e-01],
         [-4.87005161e-02,  7.22710873e-02],
         [-2.86805530e-01, -1.65833926e-01],
         [ 1.25653024e-01,  2.46624627e-01],
         [ 9.10565039e-02,  2.12028107e-01],
         [ 1.81535344e-01,  3.02506947e-01],
         [ 8.77444507e-02,  2.08716054e-01],
         [-4.20743308e-01, -2.99771705e-01],
         [-6.79614321e-02,  5.30101714e-02],
         [-1.60164320e-01, -3.91927162e-02],
         [ 2.13291689e-01,  3.34263293e-01],
         [ 2.29436600e-01,  3.50408204e-01],
         [-2.88386224e-02,  9.21329811e-02],
         [-7.82726713e-02,  4.26989322e-02],
         [-7.85492971e-01, -6.64521368e-01],
         [ 2.26690715e-02,  1.43640675e-01],
         [ 3.85323463e-01,  5.06295066e-01],
         [ 1.81017004e+00,  1.93114164e+00]])},
 ('J',
  'X',
  'V'): {'pacf': array([ 1.00000000e+00, -8.97994006e-02,  1.65379314e-01, -5.73871649e-01,
         -9.90181691e-01,  2.28389649e-02, -4.93185201e-01,  7.69240477e-01,
          1.03879793e-01, -5.28893124e-01,  1.34794508e-01, -3.51589756e-01,
          1.85686790e-01,  1.55883714e-01,  1.98571228e-01,  3.29974040e-01,
         -7.80295225e-01,  1.35211200e+00,  2.41677843e+00, -9.85720454e-01,
          3.04881070e+01,  1.02424994e+00, -2.40557539e-01, -5.62704983e-01,
          2.76483779e-01, -6.86759801e-02,  6.54149325e-01, -8.11543320e-02,
         -6.08749521e-01, -7.23722874e-01, -2.39649064e+00,  4.45281360e-01,
         -2.59062504e+00,  5.16719035e-01,  7.25642747e-01,  2.43975398e-01,
          3.81543627e-01, -1.01150937e+00, -2.65585463e+01,  9.87381089e-01,
         -2.23305532e+00]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.50285202e-01, -2.93135989e-02],
         [ 1.04893512e-01,  2.25865116e-01],
         [-6.34357451e-01, -5.13385847e-01],
         [-1.05066749e+00, -9.29695889e-01],
         [-3.76468369e-02,  8.33247666e-02],
         [-5.53671003e-01, -4.32699399e-01],
         [ 7.08754675e-01,  8.29726278e-01],
         [ 4.33939913e-02,  1.64365595e-01],
         [-5.89378926e-01, -4.68407322e-01],
         [ 7.43087068e-02,  1.95280310e-01],
         [-4.12075557e-01, -2.91103954e-01],
         [ 1.25200988e-01,  2.46172591e-01],
         [ 9.53979124e-02,  2.16369516e-01],
         [ 1.38085427e-01,  2.59057030e-01],
         [ 2.69488238e-01,  3.90459842e-01],
         [-8.40781027e-01, -7.19809423e-01],
         [ 1.29162620e+00,  1.41259780e+00],
         [ 2.35629263e+00,  2.47726423e+00],
         [-1.04620626e+00, -9.25234652e-01],
         [ 3.04276212e+01,  3.05485928e+01],
         [ 9.63764140e-01,  1.08473574e+00],
         [-3.01043341e-01, -1.80071738e-01],
         [-6.23190784e-01, -5.02219181e-01],
         [ 2.15997977e-01,  3.36969581e-01],
         [-1.29161782e-01, -8.19017841e-03],
         [ 5.93663523e-01,  7.14635127e-01],
         [-1.41640134e-01, -2.06685303e-02],
         [-6.69235322e-01, -5.48263719e-01],
         [-7.84208676e-01, -6.63237072e-01],
         [-2.45697644e+00, -2.33600484e+00],
         [ 3.84795558e-01,  5.05767162e-01],
         [-2.65111084e+00, -2.53013923e+00],
         [ 4.56233234e-01,  5.77204837e-01],
         [ 6.65156945e-01,  7.86128549e-01],
         [ 1.83489596e-01,  3.04461199e-01],
         [ 3.21057825e-01,  4.42029428e-01],
         [-1.07199518e+00, -9.51023573e-01],
         [-2.66190321e+01, -2.64980605e+01],
         [ 9.26895288e-01,  1.04786689e+00],
         [-2.29354112e+00, -2.17256952e+00]])},
 ('N',
  'X',
  'C'): {'pacf': array([ 1.        , -0.01970433,  0.2201909 , -0.50176994, -0.69433673,
          0.95803865,  0.20401309,  0.8161953 , -0.01727898, -0.14639157,
          0.02840954, -0.17105713,  0.16707132, -0.03883989,  0.35872681,
          0.0038246 , -0.25951443,  0.10018443, -0.25434774,  0.14468707,
          0.00653858,  0.20690802,  0.10056146, -0.30360206,  0.22331149,
         -0.39152792,  0.26718222,  0.05444163, -0.14915054,  0.51674794,
         -1.46615051, -2.28765331,  0.89992292, -2.75190377, -1.41158622,
          0.72795062, -0.36145038,  0.05923125, -0.03008689, -0.12517299,
          0.28304207]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-8.01901350e-02,  4.07814684e-02],
         [ 1.59705095e-01,  2.80676698e-01],
         [-5.62255742e-01, -4.41284138e-01],
         [-7.54822534e-01, -6.33850930e-01],
         [ 8.97552847e-01,  1.01852445e+00],
         [ 1.43527288e-01,  2.64498891e-01],
         [ 7.55709496e-01,  8.76681100e-01],
         [-7.77647795e-02,  4.32068239e-02],
         [-2.06877375e-01, -8.59057717e-02],
         [-3.20762615e-02,  8.88953419e-02],
         [-2.31542936e-01, -1.10571333e-01],
         [ 1.06585516e-01,  2.27557120e-01],
         [-9.93256891e-02,  2.16459144e-02],
         [ 2.98241006e-01,  4.19212609e-01],
         [-5.66612031e-02,  6.43104003e-02],
         [-3.20000229e-01, -1.99028626e-01],
         [ 3.96986243e-02,  1.60670228e-01],
         [-3.14833537e-01, -1.93861933e-01],
         [ 8.42012654e-02,  2.05172869e-01],
         [-5.39472258e-02,  6.70243777e-02],
         [ 1.46422222e-01,  2.67393825e-01],
         [ 4.00756561e-02,  1.61047260e-01],
         [-3.64087860e-01, -2.43116257e-01],
         [ 1.62825693e-01,  2.83797296e-01],
         [-4.52013719e-01, -3.31042116e-01],
         [ 2.06696419e-01,  3.27668022e-01],
         [-6.04417343e-03,  1.14927430e-01],
         [-2.09636341e-01, -8.86647377e-02],
         [ 4.56262137e-01,  5.77233740e-01],
         [-1.52663631e+00, -1.40566471e+00],
         [-2.34813911e+00, -2.22716750e+00],
         [ 8.39437119e-01,  9.60408723e-01],
         [-2.81238957e+00, -2.69141797e+00],
         [-1.47207203e+00, -1.35110042e+00],
         [ 6.67464820e-01,  7.88436423e-01],
         [-4.21936183e-01, -3.00964579e-01],
         [-1.25455353e-03,  1.19717050e-01],
         [-9.05726907e-02,  3.03989128e-02],
         [-1.85658788e-01, -6.46871844e-02],
         [ 2.22556271e-01,  3.43527874e-01]])},
 ('O',
  'T',
  'Y'): {'pacf': array([  1.        ,  -0.08267869,   0.17202456,  -0.56644104,
          -0.9541979 ,   0.85189822,   0.1845516 ,   0.94596428,
           0.43918741,  -1.15664315,  -1.64226152,   2.42653871,
           0.96331553,  -0.31849573,   5.24481561,   0.49895362,
          -0.98231553,   0.75347103,   4.98908903,  -0.80730486,
           2.87884606,   0.7683438 ,   0.94560433, -17.85764882,
          -0.97487154,  -1.64867233,   0.88285951,  -0.51857155,
          -0.28254564,   1.1636752 ,   2.94570193,  -1.51600933,
          -1.10973108,   1.65826343,   0.62080675,   2.96339001,
          -0.37477296,  -0.70521211,   0.6992378 ,  -0.3032168 ,
           0.13773625]), 'confidence_intervals': array([[  1.        ,   1.        ],
         [ -0.14316449,  -0.02219289],
         [  0.11153876,   0.23251036],
         [ -0.62692684,  -0.50595524],
         [ -1.0146837 ,  -0.8937121 ],
         [  0.79141242,   0.91238402],
         [  0.1240658 ,   0.2450374 ],
         [  0.88547848,   1.00645008],
         [  0.37870161,   0.49967321],
         [ -1.21712895,  -1.09615734],
         [ -1.70274732,  -1.58177572],
         [  2.36605291,   2.48702451],
         [  0.90282972,   1.02380133],
         [ -0.37898153,  -0.25800993],
         [  5.1843298 ,   5.30530141],
         [  0.43846782,   0.55943942],
         [ -1.04280133,  -0.92182972],
         [  0.69298523,   0.81395683],
         [  4.92860323,   5.04957483],
         [ -0.86779066,  -0.74681906],
         [  2.81836026,   2.93933186],
         [  0.707858  ,   0.8288296 ],
         [  0.88511853,   1.00609014],
         [-17.91813462, -17.79716302],
         [ -1.03535734,  -0.91438574],
         [ -1.70915813,  -1.58818653],
         [  0.82237371,   0.94334532],
         [ -0.57905735,  -0.45808574],
         [ -0.34303144,  -0.22205984],
         [  1.1031894 ,   1.224161  ],
         [  2.88521613,   3.00618773],
         [ -1.57649513,  -1.45552353],
         [ -1.17021688,  -1.04924528],
         [  1.59777763,   1.71874923],
         [  0.56032094,   0.68129255],
         [  2.90290421,   3.02387581],
         [ -0.43525876,  -0.31428716],
         [ -0.76569792,  -0.64472631],
         [  0.638752  ,   0.7597236 ],
         [ -0.3637026 ,  -0.242731  ],
         [  0.07725045,   0.19822205]])},
 ('Y',
  'V',
  'G'): {'pacf': array([ 1.        , -0.06095707,  0.19143118, -0.54121706, -0.84696723,
          0.95805607,  0.21854914,  0.94681651,  0.16707407, -1.51569922,
         -0.54723384, -1.73908393,  1.37645162,  0.39243366,  1.49812903,
         -0.31122597, -1.20178542, -0.84078145, -8.7457162 ,  1.02860706,
          0.3577604 ,  1.95108875, -0.46472826, -1.10813513, -1.61419444,
          3.30494158,  0.98217571,  1.47580919, -5.93462491, -1.14301777,
         -0.92691166,  2.31809935,  0.57009033,  1.03284862, -5.90888654,
         -0.63850567, -1.82253279, -0.85711637,  1.18090889,  0.52745047,
          2.32443609]), 'confidence_intervals': array([[ 1.00000000e+00,  1.00000000e+00],
         [-1.21442872e-01, -4.71268221e-04],
         [ 1.30945374e-01,  2.51916977e-01],
         [-6.01702859e-01, -4.80731256e-01],
         [-9.07453036e-01, -7.86481432e-01],
         [ 8.97570269e-01,  1.01854187e+00],
         [ 1.58063336e-01,  2.79034939e-01],
         [ 8.86330709e-01,  1.00730231e+00],
         [ 1.06588264e-01,  2.27559867e-01],
         [-1.57618503e+00, -1.45521342e+00],
         [-6.07719647e-01, -4.86748043e-01],
         [-1.79956973e+00, -1.67859812e+00],
         [ 1.31596581e+00,  1.43693742e+00],
         [ 3.31947858e-01,  4.52919462e-01],
         [ 1.43764323e+00,  1.55861483e+00],
         [-3.71711773e-01, -2.50740170e-01],
         [-1.26227123e+00, -1.14129962e+00],
         [-9.01267253e-01, -7.80295649e-01],
         [-8.80620200e+00, -8.68523040e+00],
         [ 9.68121254e-01,  1.08909286e+00],
         [ 2.97274601e-01,  4.18246204e-01],
         [ 1.89060295e+00,  2.01157455e+00],
         [-5.25214057e-01, -4.04242454e-01],
         [-1.16862093e+00, -1.04764933e+00],
         [-1.67468024e+00, -1.55370863e+00],
         [ 3.24445578e+00,  3.36542739e+00],
         [ 9.21689904e-01,  1.04266151e+00],
         [ 1.41532339e+00,  1.53629500e+00],
         [-5.99511071e+00, -5.87413911e+00],
         [-1.20350357e+00, -1.08253197e+00],
         [-9.87397461e-01, -8.66425858e-01],
         [ 2.25761354e+00,  2.37858515e+00],
         [ 5.09604531e-01,  6.30576134e-01],
         [ 9.72362822e-01,  1.09333443e+00],
         [-5.96937234e+00, -5.84840074e+00],
         [-6.98991475e-01, -5.78019871e-01],
         [-1.88301860e+00, -1.76204699e+00],
         [-9.17602175e-01, -7.96630572e-01],
         [ 1.12042309e+00,  1.24139470e+00],
         [ 4.66964672e-01,  5.87936275e-01],
         [ 2.26395029e+00,  2.38492189e+00]])}}

Start with a simple ARIMA model with explicitly defined (p, d, q) values.

These values were determined by aggregating all of the series into a single representative series and running it through AutoARIMA to determine the p, d, q order terms, which are then applied to each series individually.

Note that this approach will not work if the series have different seasonality attributes (e.g., series ‘a’ has a weekly seasonality while series ‘b’ has a monthly seasonality), if the series have a complex seasonality (compounding weekly, day of month, and month of year effects), or if any of the ordering terms (autoregressive order, differencing, or moving average order) are significantly different across the independent series.

[11]:
# Shared template with fixed (p, d, q) = (4, 1, 5); out_of_sample_size=14
# reserves a 14-observation validation period for scoring.
arima_base = ARIMA(out_of_sample_size=14, order=(4, 1, 5))
group_arima = GroupedPmdarima(model_template=arima_base)

Fit the grouped Pmdarima model

[12]:
# Fit the template against every group defined by the key columns.
group_arima_model = group_arima.fit(
    df=data.df,
    group_key_columns=data.key_columns,
    y_col="y",
    datetime_col="ds",
    silence_warnings=True,
)

Let’s see what the parameters were from the run.

[13]:
# One row per group: the fit settings plus the (p, d, q) and (P, D, Q, s) terms.
group_arima_model.get_model_params()
[13]:
grouping_key_columns key2 key1 key0 maxiter method out_of_sample_size scoring scoring_args start_params suppress_warnings trend with_intercept p d q P D Q s
0 (key2, key1, key0) F E K 50 lbfgs 14 mse None None False None True 4 1 5 0 0 0 0
1 (key2, key1, key0) J X V 50 lbfgs 14 mse None None False None True 4 1 5 0 0 0 0
2 (key2, key1, key0) N X C 50 lbfgs 14 mse None None False None True 4 1 5 0 0 0 0
3 (key2, key1, key0) O T Y 50 lbfgs 14 mse None None False None True 4 1 5 0 0 0 0
4 (key2, key1, key0) Y V G 50 lbfgs 14 mse None None False None True 4 1 5 0 0 0 0

Let’s take a look at the training metrics

[14]:
# One row per group: information criteria (hqic, aicc, bic, aic) and the oob validation score.
group_arima_model.get_metrics()
[14]:
grouping_key_columns key2 key1 key0 hqic aicc oob bic aic
0 (key2, key1, key0) F E K 7308.479882 7288.064205 26.639746 7342.321388 7287.809869
1 (key2, key1, key0) J X V 8529.693560 8509.277883 73.661043 8563.535066 8509.023548
2 (key2, key1, key0) N X C 7094.464510 7074.048832 33.927776 7128.306016 7073.794497
3 (key2, key1, key0) O T Y 7636.027821 7615.612144 118.098407 7669.869327 7615.357809
4 (key2, key1, key0) Y V G 8408.200891 8387.785213 177.272914 8442.042397 8387.530878

The out-of-sample validation score (the oob column) on the 14 day validation period doesn’t look too bad. Let’s save this model.

Save it to a local directory

[15]:
# Persist the fitted grouped model to a path relative to the working directory.
group_arima_model.save("./group_arima.gpmd")

Load the saved model and perform a forecast for each group.

[16]:
# Restore the grouped model from the file written by save() above.
loaded_arima = GroupedPmdarima.load("./group_arima.gpmd")
[17]:
# 60 periods ahead for each group, with the prediction column named "forecast".
# NOTE(review): alpha=0.5 presumably yields a 50% interval (pmdarima documents
# intervals as (1 - alpha)); the later cells use alpha=0.05 — confirm this is intended.
forecast = loaded_arima.predict(
    n_periods=60,
    alpha=0.5,
    predict_col="forecast",
    return_conf_int=True,
)
forecast
[17]:
grouping_key_columns key2 key1 key0 forecast yhat_lower yhat_upper ds
0 (key2, key1, key0) F E K 439.455456 433.443671 445.467241 2019-11-17 00:01:00
1 (key2, key1, key0) F E K 611.627138 605.596691 617.657586 2019-11-18 00:01:00
2 (key2, key1, key0) F E K 485.765766 479.734559 491.796972 2019-11-19 00:01:00
3 (key2, key1, key0) F E K 581.559916 575.493820 587.626011 2019-11-20 00:01:00
4 (key2, key1, key0) F E K 410.152148 403.979627 416.324670 2019-11-21 00:01:00
... ... ... ... ... ... ... ... ...
295 (key2, key1, key0) Y V G 868.854019 849.593089 888.114949 2020-01-11 00:01:00
296 (key2, key1, key0) Y V G 844.56244 825.194837 863.930042 2020-01-12 00:01:00
297 (key2, key1, key0) Y V G 1336.358013 1316.951209 1355.764817 2020-01-13 00:01:00
298 (key2, key1, key0) Y V G 961.353673 941.932352 980.774994 2020-01-14 00:01:00
299 (key2, key1, key0) Y V G 1226.602318 1207.077806 1246.126830 2020-01-15 00:01:00

300 rows × 8 columns

[18]:
# Plot each group's forecast; the arguments name the datetime, prediction, and lower/upper interval columns.
plot_grouped_series_forecast(forecast, data.key_columns, "ds", "forecast", "yhat_lower", "yhat_upper")
../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_30_0.png

The ordering terms used above, (4, 1, 5), were determined by running AutoARIMA against the average of all generated series. This approach, a hierarchical optimization of multi-series data, is very effective and should be pursued first if at all possible.

AutoARIMA with seasonality components

Since we know that these data sets have a weekly periodicity and that they’re generated on a daily basis, let’s set m=7. This applies a seasonality term to the ARIMA model (making it a SARIMA model) and opens up the tuning of the seasonal components (P, D, Q).

Note: This will take much longer to run.

[19]:
# m=7 sets a weekly seasonal period so the seasonal terms (P, D, Q) are tuned too;
# d=1 pins the differencing order.
auto_arima = AutoARIMA(
    out_of_sample_size=14,
    maxiter=500,
    max_order=7,
    d=1,
    m=7,
)
grouped_auto_arima = GroupedPmdarima(model_template=auto_arima)
auto_arima_model = grouped_auto_arima.fit(
    df=data.df,
    group_key_columns=data.key_columns,
    y_col="y",
    datetime_col="ds",
    silence_warnings=True,
)

Let’s see what the parameters are for this run.

[20]:
# Inspect the order terms AutoARIMA selected for each group.
auto_arima_model.get_model_params()
[20]:
grouping_key_columns key2 key1 key0 maxiter method out_of_sample_size scoring scoring_args start_params suppress_warnings trend with_intercept p d q P D Q s
0 (key2, key1, key0) F E K 500 lbfgs 14 mse {} None True None False 5 1 0 2 1 0 7
1 (key2, key1, key0) J X V 500 lbfgs 14 mse {} None True None False 5 1 0 2 1 0 7
2 (key2, key1, key0) N X C 500 lbfgs 14 mse {} None True None False 5 1 0 2 1 0 7
3 (key2, key1, key0) O T Y 500 lbfgs 14 mse {} None True None False 5 1 0 2 1 0 7
4 (key2, key1, key0) Y V G 500 lbfgs 14 mse {} None True None False 5 1 0 2 1 0 7

And the training metrics…

[21]:
# Per-group information criteria and oob validation score for the seasonal models.
auto_arima_model.get_metrics()
[21]:
grouping_key_columns key2 key1 key0 hqic aicc oob bic aic
0 (key2, key1, key0) F E K 7238.444457 7223.565458 36.179148 7263.018307 7223.427129
1 (key2, key1, key0) J X V 7500.385256 7485.506257 104.947817 7524.959107 7485.367929
2 (key2, key1, key0) N X C 6831.858389 6816.979390 18.105916 6856.432240 6816.841062
3 (key2, key1, key0) O T Y 6355.091450 6340.212451 43.374184 6379.665300 6340.074122
4 (key2, key1, key0) Y V G 6320.132335 6305.253336 23.108154 6344.706185 6305.115008

Next, generate a forecast from each group’s model. The cross validation of each group’s model, which shows what our error metrics are for prediction via backtesting, follows further below.

[22]:
# 60-period forecast per group, returned together with its confidence interval bounds.
auto_arima_forecast = auto_arima_model.predict(
    n_periods=60,
    alpha=0.05,
    return_conf_int=True,
)
[23]:
# Plot the seasonal forecasts; the prediction column here is "yhat" because predict() was not given a predict_col.
plot_grouped_series_forecast(auto_arima_forecast, data.key_columns, "ds", "yhat", "yhat_lower", "yhat_upper")
../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_40_0.png

The results of the seasonal approach for this example aren’t quite as good as those of the first example. However, this is due to the nature of this generated synthetic example data. For many real-world complex series data, using a seasonal approach with each model getting fit with its own optimal order terms (p, d, q) and seasonal terms (P, D, Q) can provide better results than manually specifying them.

Cross validation backtesting on the models to get the error metrics

[24]:
# Sliding-window backtest splitter. NOTE(review): per the pmdarima docs, window_size
# is the training window length, step the shift between folds, and h the forecast
# horizon scored in each fold — confirm against the installed version.
auto_arima_cv_window = SlidingWindowForecastCV(window_size=365, step=180, h=28)

# Every metric listed here is scored on every fold for every group.
cv_metrics = ["mean_squared_error", "smape", "mean_absolute_error"]
auto_arima_cv = auto_arima_model.cross_validate(
    df=data.df,
    metrics=cv_metrics,
    cross_validator=auto_arima_cv_window,
    error_score=np.nan,
    verbosity=4,
)
[CV] fold=0 ..........................................................
fold=0, score=79.808 [time=27.476 sec]
[CV] fold=1 ..........................................................
fold=1, score=59.472 [time=41.604 sec]
[CV] fold=2 ..........................................................
fold=2, score=232.314 [time=25.329 sec]
[CV] fold=3 ..........................................................
fold=3, score=49.415 [time=21.327 sec]
[CV] fold=0 ..........................................................
fold=0, score=1.542 [time=26.895 sec]
[CV] fold=1 ..........................................................
fold=1, score=1.240 [time=40.395 sec]
[CV] fold=2 ..........................................................
fold=2, score=2.711 [time=25.616 sec]
[CV] fold=3 ..........................................................
fold=3, score=1.138 [time=21.324 sec]
[CV] fold=0 ..........................................................
fold=0, score=7.313 [time=26.776 sec]
[CV] fold=1 ..........................................................
fold=1, score=6.078 [time=40.482 sec]
[CV] fold=2 ..........................................................
fold=2, score=13.143 [time=25.371 sec]
[CV] fold=3 ..........................................................
fold=3, score=5.498 [time=21.812 sec]
[CV] fold=0 ..........................................................
fold=0, score=108.753 [time=21.509 sec]
[CV] fold=1 ..........................................................
fold=1, score=107.304 [time=24.203 sec]
[CV] fold=2 ..........................................................
fold=2, score=142.982 [time=22.971 sec]
[CV] fold=3 ..........................................................
fold=3, score=67.764 [time=20.946 sec]
[CV] fold=0 ..........................................................
fold=0, score=1.218 [time=21.455 sec]
[CV] fold=1 ..........................................................
fold=1, score=1.190 [time=23.443 sec]
[CV] fold=2 ..........................................................
fold=2, score=1.365 [time=22.825 sec]
[CV] fold=3 ..........................................................
fold=3, score=0.952 [time=21.210 sec]
[CV] fold=0 ..........................................................
fold=0, score=8.226 [time=21.380 sec]
[CV] fold=1 ..........................................................
fold=1, score=8.362 [time=23.636 sec]
[CV] fold=2 ..........................................................
fold=2, score=9.412 [time=22.880 sec]
[CV] fold=3 ..........................................................
fold=3, score=6.478 [time=21.000 sec]
[CV] fold=0 ..........................................................
fold=0, score=33.936 [time=26.360 sec]
[CV] fold=1 ..........................................................
fold=1, score=31.374 [time=26.617 sec]
[CV] fold=2 ..........................................................
fold=2, score=31.117 [time=26.668 sec]
[CV] fold=3 ..........................................................
fold=3, score=32.456 [time=32.644 sec]
[CV] fold=0 ..........................................................
fold=0, score=1.034 [time=2784.458 sec]
[CV] fold=1 ..........................................................
fold=1, score=0.923 [time=7289.441 sec]
[CV] fold=2 ..........................................................
fold=2, score=0.896 [time=3687.129 sec]
[CV] fold=3 ..........................................................
fold=3, score=0.918 [time=7270.761 sec]
[CV] fold=0 ..........................................................
fold=0, score=4.857 [time=7284.930 sec]
[CV] fold=1 ..........................................................
fold=1, score=4.560 [time=7287.635 sec]
[CV] fold=2 ..........................................................
fold=2, score=4.415 [time=3681.372 sec]
[CV] fold=3 ..........................................................
fold=3, score=4.672 [time=5103.397 sec]
[CV] fold=0 ..........................................................
fold=0, score=22.142 [time=3003.464 sec]
[CV] fold=1 ..........................................................
fold=1, score=37.220 [time=907.578 sec]
[CV] fold=2 ..........................................................
fold=2, score=36.748 [time=128.197 sec]
[CV] fold=3 ..........................................................
fold=3, score=30.669 [time=59.834 sec]
[CV] fold=0 ..........................................................
fold=0, score=0.652 [time=22.607 sec]
[CV] fold=1 ..........................................................
fold=1, score=0.836 [time=25.936 sec]
[CV] fold=2 ..........................................................
fold=2, score=0.827 [time=25.435 sec]
[CV] fold=3 ..........................................................
fold=3, score=0.726 [time=21.935 sec]
[CV] fold=0 ..........................................................
fold=0, score=4.110 [time=21.966 sec]
[CV] fold=1 ..........................................................
fold=1, score=4.930 [time=25.702 sec]
[CV] fold=2 ..........................................................
fold=2, score=5.035 [time=27.211 sec]
[CV] fold=3 ..........................................................
fold=3, score=4.385 [time=22.635 sec]
[CV] fold=0 ..........................................................
fold=0, score=17.119 [time=34.697 sec]
[CV] fold=1 ..........................................................
fold=1, score=107.078 [time=17.232 sec]
[CV] fold=2 ..........................................................
fold=2, score=19.922 [time=26.995 sec]
[CV] fold=3 ..........................................................
fold=3, score=16.925 [time=24.339 sec]
[CV] fold=0 ..........................................................
fold=0, score=0.396 [time=36.418 sec]
[CV] fold=1 ..........................................................
fold=1, score=1.083 [time=18.280 sec]
[CV] fold=2 ..........................................................
fold=2, score=0.419 [time=28.167 sec]
[CV] fold=3 ..........................................................
fold=3, score=0.383 [time=23.993 sec]
[CV] fold=0 ..........................................................
fold=0, score=3.361 [time=36.009 sec]
[CV] fold=1 ..........................................................
fold=1, score=9.168 [time=18.480 sec]
[CV] fold=2 ..........................................................
fold=2, score=3.774 [time=27.421 sec]
[CV] fold=3 ..........................................................
fold=3, score=3.246 [time=23.926 sec]
[25]:
# Per-group mean and standard deviation of each cross-validation metric.
auto_arima_cv
[25]:
grouping_key_columns key2 key1 key0 mean_squared_error_mean mean_squared_error_stddev smape_mean smape_stddev mean_absolute_error_mean mean_absolute_error_stddev
0 (key2, key1, key0) F E K 105.252090 74.171681 1.657663 0.626170 8.007901 3.036208
1 (key2, key1, key0) J X V 106.700806 26.631834 1.181143 0.148013 8.119500 1.052836
2 (key2, key1, key0) N X C 32.220902 1.110630 0.942818 0.053464 4.625816 0.161250
3 (key2, key1, key0) O T Y 31.694749 6.090290 0.760451 0.076121 4.615029 0.381769
4 (key2, key1, key0) Y V G 40.261072 38.594749 0.570198 0.296271 4.887450 2.479422

AutoARIMA without seasonality components

[26]:
# The 'm' arg is deliberately left out here, so no seasonal period is supplied.
auto_arima_no_seasonal = AutoARIMA(
    out_of_sample_size=14,
    maxiter=500,
    d=1,
    max_order=14,
)

auto_arima_no_seasonal_obj = GroupedPmdarima(model_template=auto_arima_no_seasonal)

auto_arima_model_no_seasonal = auto_arima_no_seasonal_obj.fit(
    df=data.df,
    group_key_columns=data.key_columns,
    y_col="y",
    datetime_col="ds",
    silence_warnings=True,
)
[27]:
# Inspect the non-seasonal order terms chosen independently for each group.
auto_arima_model_no_seasonal.get_model_params()
[27]:
grouping_key_columns key2 key1 key0 maxiter method out_of_sample_size scoring scoring_args start_params suppress_warnings trend with_intercept p d q P D Q s
0 (key2, key1, key0) F E K 500 lbfgs 14 mse {} None True None False 2 1 2 0 0 0 0
1 (key2, key1, key0) J X V 500 lbfgs 14 mse {} None True None False 2 1 0 0 0 0 0
2 (key2, key1, key0) N X C 500 lbfgs 14 mse {} None True None True 4 1 3 0 0 0 0
3 (key2, key1, key0) O T Y 500 lbfgs 14 mse {} None True None False 4 1 5 0 0 0 0
4 (key2, key1, key0) Y V G 500 lbfgs 14 mse {} None True None False 5 1 4 0 0 0 0
[28]:
# Per-group information criteria and oob validation score for the non-seasonal models.
auto_arima_model_no_seasonal.get_metrics()
[28]:
grouping_key_columns key2 key1 key0 hqic aicc oob bic aic
0 (key2, key1, key0) F E K 11803.021777 11793.683788 4803.128966 11818.404280 11793.626317
1 (key2, key1, key0) J X V 13775.437281 13769.822949 32134.899126 13784.666782 13769.800005
2 (key2, key1, key0) N X C 7591.362347 7574.623595 85.834726 7619.050852 7574.450519
3 (key2, key1, key0) O T Y 7181.798445 7163.219266 51.496175 7212.563450 7163.007524
4 (key2, key1, key0) Y V G 8291.866077 8273.286898 108.699561 8322.631082 8273.075156
[29]:
# Forecast 60 periods per group with the non-seasonal models, then plot the
# prediction column ("yhat") with its lower/upper interval columns.
auto_arima_forecast_no_seasonal = auto_arima_model_no_seasonal.predict(
    n_periods=60,
    alpha=0.05,
    return_conf_int=True,
)
plot_grouped_series_forecast(
    auto_arima_forecast_no_seasonal,
    data.key_columns,
    "ds",
    "yhat",
    "yhat_lower",
    "yhat_upper",
)
../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_49_0.png

These are the results when allowing AutoARIMA to optimize without specifying the seasonal ‘m’ value. They’re definitely not as great since the generated data has a clear weekly seasonality component and the optimizer will struggle to find appropriate ordering terms for this type of data.

Pipeline orchestration with data preprocessing

[78]:
# Two-stage pipeline: a log transform of the endogenous series ("log") followed
# by an AutoARIMA search ("arima").
pipeline_obj = Pipeline(
    steps=[
        (
            "log",
            LogEndogTransformer(lmbda=0.2, neg_action="raise", floor=1e-12),
        ),
        ("arima", AutoARIMA(out_of_sample_size=14, max_order=14, d=1, suppress_warnings=True)),
    ]
)

# The constructor takes only the model template; y_col and datetime_col are
# passed to fit(), the same way every other GroupedPmdarima cell in this notebook does.
pipeline_arima = GroupedPmdarima(model_template=pipeline_obj).fit(
    df=data.df,
    group_key_columns=data.key_columns,
    y_col="y",
    datetime_col="ds",
    silence_warnings=True,
)
[79]:
# Name the prediction column "forecast" explicitly: the plotting call that
# follows reads a "forecast" column, while predictions made without a
# predict_col are plotted as "yhat" elsewhere in this notebook.
pipeline_forecast = pipeline_arima.predict(
    n_periods=60,
    predict_col="forecast",
    alpha=0.05,
    return_conf_int=True,
)
[89]:
# Plot each group's pipeline forecast; the string arguments name the datetime,
# prediction, and lower/upper interval columns of pipeline_forecast.
plot_grouped_series_forecast(pipeline_forecast,
                             data.key_columns,
                             "ds",
                             "forecast",
                             "yhat_lower",
                             "yhat_upper"
                            )
../../_images/tutorials-and-examples_notebooks_grouped_pmdarima_example_jupyter_54_0.png

These series are definitely not in need of log transformation to build effective ARIMA models. That being said, your data may benefit from having an endogenous transformation applied to enforce stationarity.