GroupedPmdarima Example Scripts

The scripts included below show the various options available for utilizing the GroupedPmdarima API. For an alternative view of these examples with data visualizations, see the accompanying example notebooks in the project repository.

GroupedPmdarima ARIMA

This example shows using a manually-configured (order values provided for a non-seasonal collection of series) ARIMA model that is applied to each group.

Using this approach (a static order configuration) can be useful for homogeneous collections of series. If each member of the grouped collection of series shares a common characteristic in the residuals (i.e., the differencing terms for both an auto-correlation and partial auto-correlation analysis show similar relationships for all groups), this approach will be faster and less computationally expensive than any other means of fitting a model.

GroupedPmdarima manually configured ARIMA model
 1import numpy as np
 2from pmdarima.arima.arima import ARIMA
 3from pmdarima.model_selection import SlidingWindowForecastCV
 4from diviner.utils.example_utils.example_data_generator import generate_example_data
 5from diviner import GroupedPmdarima
 6
 7
 8def get_and_print_model_metrics_params(grouped_model):
 9    fit_metrics = grouped_model.get_metrics()
10    fit_params = grouped_model.get_model_params()
11
12    print(f"\nModel Fit Metrics:\n{fit_metrics.to_string()}")
13    print(f"\nModel Fit Params:\n{fit_params.to_string()}")
14
15
if __name__ == "__main__":

    # Synthesize 4 years of daily observations for 4 series, each identified by
    # a composite key built from 3 grouping columns.
    example_dataset = generate_example_data(
        column_count=3,
        series_count=4,
        series_size=365 * 4,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_dataset.df
    keys = example_dataset.key_columns

    # Apply a single manually specified ARIMA order to every group.
    arima_template = ARIMA(order=(2, 1, 3), out_of_sample_size=60)
    grouped_arima = GroupedPmdarima(model_template=arima_template).fit(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Round-trip the fitted grouped model through local storage.
    model_location = "/tmp/group_pmdarima/arima.gpmd"
    grouped_arima.save(model_location)
    restored = GroupedPmdarima.load(model_location)

    print("\nARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(restored)

    forecast = restored.predict(
        n_periods=30, alpha=0.02, predict_col="forecast", return_conf_int=True
    )
    print("\nPredictions:\n", "-" * 40)
    print(forecast.to_string())

    print("\nCross validation metric results:\n", "-" * 40)
    sliding_cv = SlidingWindowForecastCV(h=90, step=365, window_size=730)
    cv_scores = restored.cross_validate(
        df=df,
        metrics=["mean_squared_error", "smape", "mean_absolute_error"],
        cross_validator=sliding_cv,
        error_score=np.nan,
        verbosity=4,
    )

    print(cv_scores.to_string())

GroupedPmdarima AutoARIMA

For projects that do not have homogeneous relationships amongst groups of series, using the AutoARIMA functionality of pmdarima is advised. This will allow for individualized optimization of the order terms (p, d, q) and, for seasonal series, the (P, D, Q) seasonal order terms as well.

Note

If using a seasonal approach, the parameter m must be set to an integer value that represents the seasonal periodicity. In this mode, with m set, the ARIMA terms (p, d, q) will be optimized along with (P, D, Q). Due to the complexity of optimizing these terms, this execution mode will take far longer than an optimization of a non-seasonal model.

GroupedPmdarima non-seasonal AutoARIMA model
 1import numpy as np
 2from pmdarima.arima.auto import AutoARIMA
 3from pmdarima.model_selection import SlidingWindowForecastCV
 4from diviner.utils.example_utils.example_data_generator import generate_example_data
 5from diviner import GroupedPmdarima
 6
 7
 8def get_and_print_model_metrics_params(grouped_model):
 9    fit_metrics = grouped_model.get_metrics()
10    fit_params = grouped_model.get_model_params()
11
12    print(f"\nModel Fit Metrics:\n{fit_metrics.to_string()}")
13    print(f"\nModel Fit Params:\n{fit_params.to_string()}")
14
15
if __name__ == "__main__":

    # Synthesize 3 years of daily observations for 4 series, each identified by
    # a composite key built from 3 grouping columns.
    example_dataset = generate_example_data(
        column_count=3,
        series_count=4,
        series_size=365 * 3,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_dataset.df
    keys = example_dataset.key_columns

    # Let pmdarima's AutoARIMA search for the best order terms per group.
    auto_template = AutoARIMA(out_of_sample_size=60, maxiter=100)
    grouped_auto = GroupedPmdarima(model_template=auto_template).fit(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Round-trip the fitted grouped model through local storage.
    model_location = "/tmp/group_pmdarima/autoarima.gpmd"
    grouped_auto.save(model_location)
    restored = GroupedPmdarima.load(model_location)

    print("\nAutoARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(restored)

    print("\nPredictions:\n", "-" * 40)
    forecast = restored.predict(n_periods=30, alpha=0.1, return_conf_int=True)
    print(forecast.to_string())

    print("\nCross validation metric results:\n", "-" * 40)
    sliding_cv = SlidingWindowForecastCV(h=30, step=180, window_size=365)
    cv_scores = restored.cross_validate(
        df=df,
        metrics=["mean_squared_error", "smape", "mean_absolute_error"],
        cross_validator=sliding_cv,
        error_score=np.nan,
        verbosity=3,
    )

    print(cv_scores.to_string())

GroupedPmdarima Pipeline Example

This example shows the utilization of a pmdarima.pipeline.Pipeline, incorporating preprocessing operations to each series. In the example below, a Box Cox transformation is applied to each series to force stationarity.

Note

The data set used for these examples is a randomly generated, non-deterministic group of series data. As such, the relevance of utilizing a normalcy transform on this data is somewhere between ‘unlikely’ and ‘zero’; the Box Cox transform appears here purely as an API usage example.

GroupedPmdarima with Pipeline model
 1import numpy as np
 2from pmdarima.arima.auto import AutoARIMA
 3from pmdarima.pipeline import Pipeline
 4from pmdarima.preprocessing import BoxCoxEndogTransformer
 5from pmdarima.model_selection import RollingForecastCV
 6from diviner.utils.example_utils.example_data_generator import generate_example_data
 7from diviner import GroupedPmdarima
 8
 9
def get_and_print_model_metrics_params(grouped_model):
    """Print the per-group fit metrics and fitted model parameters."""
    for label, frame in (
        ("Metrics", grouped_model.get_metrics()),
        ("Params", grouped_model.get_model_params()),
    ):
        print(f"\nModel Fit {label}:\n{frame.to_string()}")
16
17
if __name__ == "__main__":

    # Synthesize 3 years of daily observations for 2 series, each identified by
    # a composite key built from 3 grouping columns.
    example_dataset = generate_example_data(
        column_count=3,
        series_count=2,
        series_size=365 * 3,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_dataset.df
    keys = example_dataset.key_columns

    # Chain a Box Cox transform ahead of AutoARIMA so each series is
    # transformed before order optimization runs.
    pipeline_template = Pipeline(
        steps=[
            (
                "box",
                BoxCoxEndogTransformer(lmbda2=0.4, neg_action="ignore", floor=1e-12),
            ),
            ("arima", AutoARIMA(out_of_sample_size=60, max_p=4, max_q=4, max_d=4)),
        ]
    )
    grouped_pipeline = GroupedPmdarima(model_template=pipeline_template).fit(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Round-trip the fitted grouped model through local storage.
    model_location = "/tmp/group_pmdarima/pipeline.gpmd"
    grouped_pipeline.save(model_location)
    restored = GroupedPmdarima.load(model_location)

    print("\nPipeline AutoARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(restored)

    print("\nPredictions:\n", "-" * 40)
    forecast = restored.predict(
        n_periods=30, alpha=0.2, predict_col="predictions", return_conf_int=True
    )
    print(forecast.to_string())

    print("\nCross validation metric results:\n", "-" * 40)
    rolling_cv = RollingForecastCV(h=30, step=365, initial=730)
    cv_scores = restored.cross_validate(
        df=df,
        metrics=["mean_squared_error"],
        cross_validator=rolling_cv,
        error_score=np.nan,
        verbosity=3,
    )

    print(cv_scores.to_string())

GroupedPmdarima Group Subset Prediction Example

This example shows a subset prediction of groups by using the GroupedPmdarima.predict_groups method.

GroupedPmdarima Subset Groups Prediction
 1from pmdarima.arima.arima import ARIMA
 2from diviner import GroupedPmdarima
 3from diviner.utils.example_utils.example_data_generator import generate_example_data
 4
 5if __name__ == "__main__":
 6
 7    generated_data = generate_example_data(
 8        column_count=2,
 9        series_count=6,
10        series_size=365 * 4,
11        start_dt="2019-01-01",
12        days_period=1,
13    )
14
15    training_data = generated_data.df
16    group_key_columns = generated_data.key_columns
17
18    arima_obj = ARIMA(order=(2, 1, 3), out_of_sample_size=60)
19    base_arima = GroupedPmdarima(model_template=arima_obj).fit(
20        df=training_data,
21        group_key_columns=group_key_columns,
22        y_col="y",
23        datetime_col="ds",
24        silence_warnings=True,
25    )
26
27    # Get a subset of group keys to generate forecasts for
28    group_df = training_data.copy()
29    group_df["groups"] = list(zip(*[group_df[c] for c in group_key_columns]))
30    distinct_groups = group_df["groups"].unique()
31    groups_to_predict = list(distinct_groups[:3])
32
33    print("-" * 65)
34    print(f"Unique groups that have been modeled: {distinct_groups}")
35    print(f"Subset of groups to generate predictions for: {groups_to_predict}")
36    print("-" * 65)
37
38    forecasts = base_arima.predict_groups(
39        groups=groups_to_predict,
40        n_periods=60,
41        predict_col="forecast_values",
42        on_error="warn",
43    )
44
45    print(f"\nForecast values:\n{forecasts.to_string()}")

GroupedPmdarima Series Analysis Example

The below script illustrates how to perform analytics on a grouped series data set. Applying the results of these utilities can aid in determining appropriate order values (p, d, q) and seasonal order values (P, D, Q) for the example shown in the ARIMA example.

GroupedPmdarima series exploration and analysis
 1import pprint
 2from diviner import PmdarimaAnalyzer
 3
 4from diviner.utils.example_utils.example_data_generator import generate_example_data
 5
 6
 7def _print_dict(data, name):
 8    print("\n" + "-" * 100)
 9    print(f"{name} values for the groups")
10    print("-" * 100, "\n")
11    pprint.PrettyPrinter(indent=2).pprint(data)
12
13
if __name__ == "__main__":

    example_dataset = generate_example_data(
        column_count=4,
        series_count=3,
        series_size=365 * 12,
        start_dt="2010-01-01",
        days_period=1,
    )
    df = example_dataset.df
    keys = example_dataset.key_columns

    # A single PmdarimaAnalyzer instance is reused throughout: the grouped data
    # collection it builds is lazily evaluated and shared across the analyses.
    analyzer = PmdarimaAnalyzer(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
    )

    # Trend decomposition for each group.
    decomposed_trends = analyzer.decompose_groups(m=7, type_="additive")

    print("Decomposed trend data for the groups")
    print("-" * 100, "\n")
    print(decomposed_trends[:50].to_string())

    # Estimate the differencing term 'd' for each group.
    ndiffs = analyzer.calculate_ndiffs(alpha=0.1, test="kpss", max_d=5)
    _print_dict(ndiffs, "Differencing")

    # Estimate the seasonal differencing term 'D' for each group.
    nsdiffs = analyzer.calculate_nsdiffs(m=365, test="ocsb", max_D=5)
    _print_dict(nsdiffs, "Seasonal Differencing")

    # Autocorrelation function for each group.
    group_acf = analyzer.calculate_acf(
        unbiased=True, nlags=120, qstat=True, fft=True, alpha=0.05, adjusted=True
    )
    _print_dict(group_acf, "Autocorrelation function")

    # Partial autocorrelation function for each group.
    group_pacf = analyzer.calculate_pacf(nlags=120, method="yw", alpha=0.05)
    _print_dict(group_pacf, "Partial Autocorrelation function")

    # Apply a diff operation to each group's series...
    group_diff = analyzer.generate_diff(lag=7, differences=1)
    _print_dict(group_diff, "Differencing")

    # ...and invert it to recover the original scale.
    group_diff_inv = analyzer.generate_diff_inversion(
        group_diff, lag=7, differences=1, recenter=True
    )
    _print_dict(group_diff_inv, "Differencing Inversion")

GroupedPmdarima Differencing Term Manual Calculation Example

The script below shows a means of dramatically reducing the optimization time of AutoARIMA through the manual calculation of the differencing term 'd' for each series in the grouped series data set. By manually setting this argument (which can be either unique for each group or homogeneous across all groups), the optimization algorithm can reduce the total number of iterative validation tests.

GroupedPmdarima manual differencing term extraction and application to AutoARIMA
 1from diviner.utils.example_utils.example_data_generator import generate_example_data
 2from diviner import GroupedPmdarima, PmdarimaAnalyzer
 3from pmdarima.pipeline import Pipeline
 4from pmdarima import AutoARIMA
 5from pmdarima.model_selection import SlidingWindowForecastCV
 6
 7
 8def get_and_print_model_metrics_params(grouped_model):
 9    fit_metrics = grouped_model.get_metrics()
10    fit_params = grouped_model.get_model_params()
11
12    print(f"\nModel Fit Metrics:\n{fit_metrics.to_string()}")
13    print(f"\nModel Fit Params:\n{fit_params.to_string()}")
14
15
if __name__ == "__main__":

    # Synthesize 3 years of daily observations for 3 series, each identified by
    # a composite key built from 3 grouping columns.
    example_dataset = generate_example_data(
        column_count=3,
        series_count=3,
        series_size=365 * 3,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_dataset.df
    keys = example_dataset.key_columns

    pipeline_template = Pipeline(
        steps=[
            (
                "arima",
                AutoARIMA(
                    max_order=14,
                    out_of_sample_size=90,
                    suppress_warnings=True,
                    error_action="ignore",
                ),
            )
        ]
    )

    # Pre-compute each group's differencing term 'd' so AutoARIMA can skip that
    # portion of its search, reducing total optimization time.
    analyzer = PmdarimaAnalyzer(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
    )
    group_ndiffs = analyzer.calculate_ndiffs(
        alpha=0.05,
        test="kpss",
        max_d=4,
    )

    grouped_model = GroupedPmdarima(model_template=pipeline_template).fit(
        df=df,
        group_key_columns=keys,
        y_col="y",
        datetime_col="ds",
        ndiffs=group_ndiffs,
        silence_warnings=True,
    )

    # Round-trip the fitted grouped model through local storage.
    model_location = "/tmp/group_pmdarima/pipeline_override.gpmd"
    grouped_model.save(model_location)
    restored = GroupedPmdarima.load(model_location)

    print("\nAutoARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(restored)

    print("\nPredictions:\n", "-" * 40)
    forecast = restored.predict(
        n_periods=30, alpha=0.1, predict_col="forecasted_values", return_conf_int=True
    )
    print(forecast.to_string())

    sliding_cv = SlidingWindowForecastCV(h=90, step=120, window_size=180)
    cv_scores = restored.cross_validate(
        df=df,
        metrics=["smape", "mean_squared_error", "mean_absolute_error"],
        cross_validator=sliding_cv,
    )

    print("\nCross validation metrics:\n", "-" * 40)
    print(cv_scores.to_string())

Supplementary

Note

To run these examples for yourself with the data generator example, utilize the following code:

Synthetic Data Generator
 1import itertools
 2import pandas as pd
 3import numpy as np
 4import string
 5import random
 6from datetime import timedelta, datetime
 7from collections import namedtuple
 8
 9
10def _generate_time_series(series_size: int):
11    residuals = np.random.lognormal(
12        mean=np.random.uniform(low=0.5, high=3.0),
13        sigma=np.random.uniform(low=0.6, high=0.98),
14        size=series_size,
15    )
16    trend = [
17        np.polyval([23.0, 1.0, 5], x)
18        for x in np.linspace(start=0, stop=np.random.randint(low=0, high=4), num=series_size)
19    ]
20    seasonality = [
21        90 * np.sin(2 * np.pi * 1000 * (i / (series_size * 200))) + 40
22        for i in np.arange(0, series_size)
23    ]
24
25    return residuals + trend + seasonality + np.random.uniform(low=20.0, high=1000.0)
26
27
28def _generate_grouping_columns(column_count: int, series_count: int):
29    candidate_list = list(string.ascii_uppercase)
30    candidates = random.sample(
31        list(itertools.permutations(candidate_list, column_count)), series_count
32    )
33    column_names = sorted([f"key{x}" for x in range(column_count)], reverse=True)
34    return [dict(zip(column_names, entries)) for entries in candidates]
35
36
def _generate_raw_df(
    column_count: int,
    series_count: int,
    series_size: int,
    start_dt: str,
    days_period: int,
):
    """Assemble a long-format DataFrame of synthetic series, one per group-key set.

    Args:
        column_count: Number of grouping-key columns per series.
        series_count: Number of distinct series (groups) to generate.
        series_size: Number of rows (time steps) per series.
        start_dt: First datetime value, formatted as "YYYY-MM-DD".
        days_period: Day spacing between consecutive datetime entries.

    Returns:
        A pandas DataFrame with columns "ds" (datetime), "y" (series values),
        and one column per grouping key.
    """
    candidates = _generate_grouping_columns(column_count, series_count)
    # Bug fix: the format string previously used "%M" (minutes), which silently
    # left the month at its default of January for any start_dt and parsed the
    # month digits as a minute offset; "%m" parses the month field correctly.
    start_date = datetime.strptime(start_dt, "%Y-%m-%d")
    dates = np.arange(
        start_date,
        start_date + timedelta(days=series_size * days_period),
        timedelta(days=days_period),
    )
    df_collection = []
    for entry in candidates:
        generated_series = _generate_time_series(series_size)
        series_dict = {"ds": dates, "y": generated_series}
        series_df = pd.DataFrame.from_dict(series_dict)
        for column, value in entry.items():
            series_df[column] = value
        df_collection.append(series_df)
    return pd.concat(df_collection)
60
61
def generate_example_data(
    column_count: int,
    series_count: int,
    series_size: int,
    start_dt: str,
    days_period: int = 1,
):
    """Produce a synthetic grouped dataset plus the list of grouping-key columns.

    Returns a namedtuple with fields ``df`` (the long-format DataFrame) and
    ``key_columns`` (every column other than the datetime column "ds" and the
    target column "y").
    """
    Structure = namedtuple("Structure", "df key_columns")
    frame = _generate_raw_df(column_count, series_count, series_size, start_dt, days_period)

    grouping_columns = list(frame.columns)
    for reserved in ["ds", "y"]:
        grouping_columns.remove(reserved)

    return Structure(frame, grouping_columns)