GroupedPmdarima Example Scripts

The scripts included below show the various options available for utilizing the GroupedPmdarima API. For an alternative view of these examples with data visualizations, see the accompanying example notebooks in the project repository.

GroupedPmdarima ARIMA

This example shows using a manually-configured (order values provided for a non-seasonal collection of series) ARIMA model that is applied to each group.

Using this approach (a static order configuration) can be useful for homogeneous collections of series. If each member of the grouped collection of series shares a common characteristic in the residuals (i.e., the differencing terms for both an auto-correlation and partial auto-correlation analysis show similar relationships for all groups), this approach will be faster and less computationally expensive than any other means of fitting a model.

GroupedPmdarima manually configured ARIMA model
 1import numpy as np
 2from pmdarima.arima.arima import ARIMA
 3from pmdarima.model_selection import SlidingWindowForecastCV
 4from diviner.utils.example_utils.example_data_generator import generate_example_data
 5from diviner import GroupedPmdarima
 6
 7
 8def get_and_print_model_metrics_params(grouped_model):
 9    fit_metrics = grouped_model.get_metrics()
10    fit_params = grouped_model.get_model_params()
11
12    print(f"\nModel Fit Metrics:\n{fit_metrics.to_string()}")
13    print(f"\nModel Fit Params:\n{fit_params.to_string()}")
14
15
if __name__ == "__main__":

    # Synthesize 4 years of daily observations for 4 series, each identified by
    # a composite key built from 3 grouping columns.
    example_dataset = generate_example_data(
        column_count=3,
        series_count=4,
        series_size=365 * 4,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_dataset.df
    keys = example_dataset.key_columns

    # Apply a single manually specified ARIMA order to every group.
    arima_template = ARIMA(order=(2, 1, 3), out_of_sample_size=60)
    grouped_arima = GroupedPmdarima(model_template=arima_template).fit(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Round-trip the fitted grouped model through local storage.
    model_location = "/tmp/group_pmdarima/arima.gpmd"
    grouped_arima.save(model_location)
    restored = GroupedPmdarima.load(model_location)

    print("\nARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(restored)

    forecast = restored.predict(
        n_periods=30, alpha=0.02, predict_col="forecast", return_conf_int=True
    )
    print("\nPredictions:\n", "-" * 40)
    print(forecast.to_string())

    print("\nCross validation metric results:\n", "-" * 40)
    sliding_cv = SlidingWindowForecastCV(h=90, step=365, window_size=730)
    cv_scores = restored.cross_validate(
        df=df,
        metrics=["mean_squared_error", "smape", "mean_absolute_error"],
        cross_validator=sliding_cv,
        error_score=np.nan,
        verbosity=4,
    )

    print(cv_scores.to_string())

GroupedPmdarima AutoARIMA

For projects that do not have homogeneous relationships amongst groups of series, using the AutoARIMA functionality of pmdarima is advised. This will allow for individualized optimization of the order terms (p, d, q) and, for seasonal series, the (P, D, Q) seasonal order terms as well.

Note

If using a seasonal approach, the parameter m must be set to an integer value that represents the seasonal periodicity. In this mode, with m set, the ARIMA terms (p, d, q) will be optimized along with (P, D, Q). Due to the complexity of optimizing these terms, this execution mode will take far longer than an optimization of a non-seasonal model.

GroupedPmdarima non-seasonal AutoARIMA model
 1import numpy as np
 2from pmdarima.arima.auto import AutoARIMA
 3from pmdarima.model_selection import SlidingWindowForecastCV
 4from diviner.utils.example_utils.example_data_generator import generate_example_data
 5from diviner import GroupedPmdarima
 6
 7
 8def get_and_print_model_metrics_params(grouped_model):
 9    fit_metrics = grouped_model.get_metrics()
10    fit_params = grouped_model.get_model_params()
11
12    print(f"\nModel Fit Metrics:\n{fit_metrics.to_string()}")
13    print(f"\nModel Fit Params:\n{fit_params.to_string()}")
14
15
if __name__ == "__main__":

    # Synthesize 3 years of daily observations for 4 series, each identified by
    # a composite key built from 3 grouping columns.
    example_dataset = generate_example_data(
        column_count=3,
        series_count=4,
        series_size=365 * 3,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_dataset.df
    keys = example_dataset.key_columns

    # Let pmdarima's AutoARIMA search for the best order terms per group.
    auto_template = AutoARIMA(out_of_sample_size=60, maxiter=100)
    grouped_auto = GroupedPmdarima(model_template=auto_template).fit(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Round-trip the fitted grouped model through local storage.
    model_location = "/tmp/group_pmdarima/autoarima.gpmd"
    grouped_auto.save(model_location)
    restored = GroupedPmdarima.load(model_location)

    print("\nAutoARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(restored)

    print("\nPredictions:\n", "-" * 40)
    forecast = restored.predict(n_periods=30, alpha=0.1, return_conf_int=True)
    print(forecast.to_string())

    print("\nCross validation metric results:\n", "-" * 40)
    sliding_cv = SlidingWindowForecastCV(h=30, step=180, window_size=365)
    cv_scores = restored.cross_validate(
        df=df,
        metrics=["mean_squared_error", "smape", "mean_absolute_error"],
        cross_validator=sliding_cv,
        error_score=np.nan,
        verbosity=3,
    )

    print(cv_scores.to_string())

GroupedPmdarima Pipeline Example

This example shows the utilization of a pmdarima.pipeline.Pipeline, incorporating preprocessing operations to each series. In the example below, a Box Cox transformation is applied to each series to force stationarity.

Note

The data set used for these examples is a randomly generated, non-deterministic group of series data. As such, the relevance of utilizing a normalcy transform on this data is somewhere between ‘unlikely’ and ‘zero’; the Box Cox transform appears here purely as an API usage example.

GroupedPmdarima with Pipeline model
 1import numpy as np
 2from pmdarima.arima.auto import AutoARIMA
 3from pmdarima.pipeline import Pipeline
 4from pmdarima.preprocessing import BoxCoxEndogTransformer
 5from pmdarima.model_selection import RollingForecastCV
 6from diviner.utils.example_utils.example_data_generator import generate_example_data
 7from diviner import GroupedPmdarima
 8
 9
def get_and_print_model_metrics_params(grouped_model):
    """Print the per-group fit metrics and fitted model parameters."""
    for label, frame in (
        ("Metrics", grouped_model.get_metrics()),
        ("Params", grouped_model.get_model_params()),
    ):
        print(f"\nModel Fit {label}:\n{frame.to_string()}")
16
17
if __name__ == "__main__":

    # Synthesize 3 years of daily observations for 2 series, each identified by
    # a composite key built from 3 grouping columns.
    example_dataset = generate_example_data(
        column_count=3,
        series_count=2,
        series_size=365 * 3,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_dataset.df
    keys = example_dataset.key_columns

    # Chain a Box Cox transform ahead of AutoARIMA so each series is
    # transformed before order optimization runs.
    pipeline_template = Pipeline(
        steps=[
            (
                "box",
                BoxCoxEndogTransformer(lmbda2=0.4, neg_action="ignore", floor=1e-12),
            ),
            ("arima", AutoARIMA(out_of_sample_size=60, max_p=4, max_q=4, max_d=4)),
        ]
    )
    grouped_pipeline = GroupedPmdarima(model_template=pipeline_template).fit(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Round-trip the fitted grouped model through local storage.
    model_location = "/tmp/group_pmdarima/pipeline.gpmd"
    grouped_pipeline.save(model_location)
    restored = GroupedPmdarima.load(model_location)

    print("\nPipeline AutoARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(restored)

    print("\nPredictions:\n", "-" * 40)
    forecast = restored.predict(
        n_periods=30, alpha=0.2, predict_col="predictions", return_conf_int=True
    )
    print(forecast.to_string())

    print("\nCross validation metric results:\n", "-" * 40)
    rolling_cv = RollingForecastCV(h=30, step=365, initial=730)
    cv_scores = restored.cross_validate(
        df=df,
        metrics=["mean_squared_error"],
        cross_validator=rolling_cv,
        error_score=np.nan,
        verbosity=3,
    )

    print(cv_scores.to_string())

GroupedPmdarima Group Subset Prediction Example

This example shows a subset prediction of groups by using the GroupedPmdarima.predict_groups method.

GroupedPmdarima Subset Groups Prediction
 1from pmdarima.arima.arima import ARIMA
 2from diviner import GroupedPmdarima
 3from diviner.utils.example_utils.example_data_generator import generate_example_data
 4
 5if __name__ == "__main__":
 6
 7    generated_data = generate_example_data(
 8        column_count=2,
 9        series_count=6,
10        series_size=365 * 4,
11        start_dt="2019-01-01",
12        days_period=1,
13    )
14
15    training_data = generated_data.df
16    group_key_columns = generated_data.key_columns
17
18    arima_obj = ARIMA(order=(2, 1, 3), out_of_sample_size=60)
19    base_arima = GroupedPmdarima(model_template=arima_obj).fit(
20        df=training_data,
21        group_key_columns=group_key_columns,
22        y_col="y",
23        datetime_col="ds",
24        silence_warnings=True,
25    )
26
27    # Get a subset of group keys to generate forecasts for
28    group_df = training_data.copy()
29    group_df["groups"] = list(zip(*[group_df[c] for c in group_key_columns]))
30    distinct_groups = group_df["groups"].unique()
31    groups_to_predict = list(distinct_groups[:3])
32
33    print("-" * 65)
34    print(f"Unique groups that have been modeled: {distinct_groups}")
35    print(f"Subset of groups to generate predictions for: {groups_to_predict}")
36    print("-" * 65)
37
38    forecasts = base_arima.predict_groups(
39        groups=groups_to_predict,
40        n_periods=60,
41        predict_col="forecast_values",
42        on_error="warn",
43    )
44
45    print(f"\nForecast values:\n{forecasts.to_string()}")

GroupedPmdarima Series Analysis Example

The below script illustrates how to perform analytics on a grouped series data set. Applying the results of these utilities can aid in determining appropriate order values (p, d, q) and seasonal order values (P, D, Q) for the example shown in the ARIMA example.

GroupedPmdarima series exploration and analysis
 1import pprint
 2from diviner import PmdarimaAnalyzer
 3
 4from diviner.utils.example_utils.example_data_generator import generate_example_data
 5
 6
 7def _print_dict(data, name):
 8    print("\n" + "-" * 100)
 9    print(f"{name} values for the groups")
10    print("-" * 100, "\n")
11    pprint.PrettyPrinter(indent=2).pprint(data)
12
13
if __name__ == "__main__":

    example_dataset = generate_example_data(
        column_count=4,
        series_count=3,
        series_size=365 * 12,
        start_dt="2010-01-01",
        days_period=1,
    )
    df = example_dataset.df
    keys = example_dataset.key_columns

    # A single PmdarimaAnalyzer instance is reused throughout: the grouped data
    # collection it builds is lazily evaluated and shared across the analyses.
    analyzer = PmdarimaAnalyzer(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
    )

    # Trend decomposition for each group.
    decomposed_trends = analyzer.decompose_groups(m=7, type_="additive")

    print("Decomposed trend data for the groups")
    print("-" * 100, "\n")
    print(decomposed_trends[:50].to_string())

    # Estimate the differencing term 'd' for each group.
    ndiffs = analyzer.calculate_ndiffs(alpha=0.1, test="kpss", max_d=5)
    _print_dict(ndiffs, "Differencing")

    # Estimate the seasonal differencing term 'D' for each group.
    nsdiffs = analyzer.calculate_nsdiffs(m=365, test="ocsb", max_D=5)
    _print_dict(nsdiffs, "Seasonal Differencing")

    # Autocorrelation function for each group.
    group_acf = analyzer.calculate_acf(
        unbiased=True, nlags=120, qstat=True, fft=True, alpha=0.05, adjusted=True
    )
    _print_dict(group_acf, "Autocorrelation function")

    # Partial autocorrelation function for each group.
    group_pacf = analyzer.calculate_pacf(nlags=120, method="yw", alpha=0.05)
    _print_dict(group_pacf, "Partial Autocorrelation function")

    # Apply a diff operation to each group's series...
    group_diff = analyzer.generate_diff(lag=7, differences=1)
    _print_dict(group_diff, "Differencing")

    # ...and invert it to recover the original scale.
    group_diff_inv = analyzer.generate_diff_inversion(
        group_diff, lag=7, differences=1, recenter=True
    )
    _print_dict(group_diff_inv, "Differencing Inversion")

GroupedPmdarima Differencing Term Manual Calculation Example

The script below shows a means of dramatically reducing the optimization time of AutoARIMA through the manual calculation of the differencing term 'd' for each series in the grouped series data set. By manually setting this argument (which can be either unique for each group or homogeneous across all groups), the optimization algorithm can reduce the total number of iterative validation tests.

GroupedPmdarima manual differencing term extraction and application to AutoARIMA
 1from diviner.utils.example_utils.example_data_generator import generate_example_data
 2from diviner import GroupedPmdarima, PmdarimaAnalyzer
 3from pmdarima.pipeline import Pipeline
 4from pmdarima import AutoARIMA
 5from pmdarima.model_selection import SlidingWindowForecastCV
 6
 7
 8def get_and_print_model_metrics_params(grouped_model):
 9    fit_metrics = grouped_model.get_metrics()
10    fit_params = grouped_model.get_model_params()
11
12    print(f"\nModel Fit Metrics:\n{fit_metrics.to_string()}")
13    print(f"\nModel Fit Params:\n{fit_params.to_string()}")
14
15
if __name__ == "__main__":

    # Synthesize 3 years of daily observations for 3 series, each identified by
    # a composite key built from 3 grouping columns.
    example_dataset = generate_example_data(
        column_count=3,
        series_count=3,
        series_size=365 * 3,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_dataset.df
    keys = example_dataset.key_columns

    pipeline_template = Pipeline(
        steps=[
            (
                "arima",
                AutoARIMA(
                    max_order=14,
                    out_of_sample_size=90,
                    suppress_warnings=True,
                    error_action="ignore",
                ),
            )
        ]
    )

    # Pre-compute each group's differencing term 'd' so AutoARIMA can skip that
    # portion of its search, reducing total optimization time.
    analyzer = PmdarimaAnalyzer(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
    )
    group_ndiffs = analyzer.calculate_ndiffs(
        alpha=0.05,
        test="kpss",
        max_d=4,
    )

    grouped_model = GroupedPmdarima(model_template=pipeline_template).fit(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
        ndiffs=group_ndiffs,
        silence_warnings=True,
    )

    # Round-trip the fitted grouped model through local storage.
    model_location = "/tmp/group_pmdarima/pipeline_override.gpmd"
    grouped_model.save(model_location)
    restored = GroupedPmdarima.load(model_location)

    print("\nAutoARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(restored)

    print("\nPredictions:\n", "-" * 40)
    forecast = restored.predict(
        n_periods=30, alpha=0.1, predict_col="forecasted_values", return_conf_int=True
    )
    print(forecast.to_string())

    sliding_cv = SlidingWindowForecastCV(h=90, step=120, window_size=180)
    cv_scores = restored.cross_validate(
        df=df,
        metrics=["smape", "mean_squared_error", "mean_absolute_error"],
        cross_validator=sliding_cv,
    )

    print("\nCross validation metrics:\n", "-" * 40)
    print(cv_scores.to_string())

Supplementary

Note

To run these examples for yourself with the data generator example, utilize the following code:

Synthetic Data Generator
 1import itertools
 2import pandas as pd
 3import numpy as np
 4import string
 5import random
 6from datetime import timedelta, datetime
 7from collections import namedtuple
 8
 9
10def _generate_time_series(series_size: int):
11    residuals = np.random.lognormal(
12        mean=np.random.uniform(low=0.5, high=3.0),
13        sigma=np.random.uniform(low=0.6, high=0.98),
14        size=series_size,
15    )
16    trend = [
17        np.polyval([23.0, 1.0, 5], x)
18        for x in np.linspace(start=0, stop=np.random.randint(low=0, high=4), num=series_size)
19    ]
20    seasonality = [
21        90 * np.sin(2 * np.pi * 1000 * (i / (series_size * 200))) + 40
22        for i in np.arange(0, series_size)
23    ]
24
25    return residuals + trend + seasonality + np.random.uniform(low=20.0, high=1000.0)
26
27
28def _generate_grouping_columns(column_count: int, series_count: int):
29    candidate_list = list(string.ascii_uppercase)
30    candidates = random.sample(
31        list(itertools.permutations(candidate_list, column_count)), series_count
32    )
33    column_names = sorted([f"key{x}" for x in range(column_count)], reverse=True)
34    return [dict(zip(column_names, entries)) for entries in candidates]
35
36
def _generate_raw_df(
    column_count: int,
    series_count: int,
    series_size: int,
    start_dt: str,
    days_period: int,
):
    """Assemble a long-format DataFrame of synthetic series, one per group-key set.

    Args:
        column_count: Number of grouping-key columns per series.
        series_count: Number of distinct series (groups) to generate.
        series_size: Number of rows (time steps) per series.
        start_dt: First datetime value, formatted as "YYYY-MM-DD".
        days_period: Day spacing between consecutive datetime entries.

    Returns:
        A pandas DataFrame with columns "ds" (datetime), "y" (series values),
        and one column per grouping key.
    """
    candidates = _generate_grouping_columns(column_count, series_count)
    # Bug fix: the format string previously used "%M" (minutes), which silently
    # left the month at its default of January for any start_dt and parsed the
    # month digits as a minute offset; "%m" parses the month field correctly.
    start_date = datetime.strptime(start_dt, "%Y-%m-%d")
    dates = np.arange(
        start_date,
        start_date + timedelta(days=series_size * days_period),
        timedelta(days=days_period),
    )
    df_collection = []
    for entry in candidates:
        generated_series = _generate_time_series(series_size)
        series_dict = {"ds": dates, "y": generated_series}
        series_df = pd.DataFrame.from_dict(series_dict)
        for column, value in entry.items():
            series_df[column] = value
        df_collection.append(series_df)
    return pd.concat(df_collection)
60
61
def generate_example_data(
    column_count: int,
    series_count: int,
    series_size: int,
    start_dt: str,
    days_period: int = 1,
):
    """Produce a synthetic grouped dataset plus the list of grouping-key columns.

    Returns a namedtuple with fields ``df`` (the long-format DataFrame) and
    ``key_columns`` (every column other than the datetime column "ds" and the
    target column "y").
    """
    Structure = namedtuple("Structure", "df key_columns")
    frame = _generate_raw_df(column_count, series_count, series_size, start_dt, days_period)

    grouping_columns = list(frame.columns)
    for reserved in ["ds", "y"]:
        grouping_columns.remove(reserved)

    return Structure(frame, grouping_columns)