GroupedPmdarima Example Scripts
The scripts included below show the various options available for utilizing the GroupedPmdarima
API.
For an alternative view of these examples with data visualizations, see the notebooks here
Scripts
GroupedPmdarima ARIMA
This example shows using a manually-configured (order values provided for a non-seasonal collection of series) ARIMA model that is applied to each group.
Using this approach (a static order configuration) can be useful for homogeneous collections of series. If each member of the grouped collection of series shares a common characteristic in the residuals (i.e., the differencing terms for both an auto-correlation and a partial auto-correlation analysis show similar relationships for all groups), this approach will be faster and less expensive for fitting models than any other means.
import numpy as np
from pmdarima.arima.arima import ARIMA
from pmdarima.model_selection import SlidingWindowForecastCV
from diviner.utils.example_utils.example_data_generator import generate_example_data
from diviner import GroupedPmdarima


def get_and_print_model_metrics_params(grouped_model):
    """Fetch and display the per-group fit metrics and fitted model parameters."""
    metrics = grouped_model.get_metrics()
    params = grouped_model.get_model_params()

    print(f"\nModel Fit Metrics:\n{metrics.to_string()}")
    print(f"\nModel Fit Params:\n{params.to_string()}")


if __name__ == "__main__":

    # Synthesize 4 years of daily data for 4 series, each series identified by
    # a unique combination of 3 grouping columns.
    example_data = generate_example_data(
        column_count=3,
        series_count=4,
        series_size=365 * 4,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_data.df
    grouping_columns = example_data.key_columns

    # Apply one manually specified ARIMA(2, 1, 3) order to every group.
    grouped_model = GroupedPmdarima(
        model_template=ARIMA(order=(2, 1, 3), out_of_sample_size=60)
    ).fit(
        df=df,
        group_key_columns=grouping_columns,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Round-trip the fitted model through serialization.
    save_dir = "/tmp/group_pmdarima/arima.gpmd"
    grouped_model.save(save_dir)
    loaded_model = GroupedPmdarima.load(save_dir)

    print("\nARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(loaded_model)

    # Forecast 30 periods ahead with a 98% confidence interval.
    forecast = loaded_model.predict(
        n_periods=30, alpha=0.02, predict_col="forecast", return_conf_int=True
    )
    print("\nPredictions:\n", "-" * 40)
    print(forecast.to_string())

    print("\nCross validation metric results:\n", "-" * 40)
    cv = SlidingWindowForecastCV(h=90, step=365, window_size=730)
    cv_metrics = loaded_model.cross_validate(
        df=df,
        metrics=["mean_squared_error", "smape", "mean_absolute_error"],
        cross_validator=cv,
        error_score=np.nan,
        verbosity=4,
    )

    print(cv_metrics.to_string())
GroupedPmdarima AutoARIMA
For projects that do not have homogeneous relationships amongst groups of series, using the AutoARIMA functionality of pmdarima is advised. This will allow for individualized optimization of the order terms (p, d, q) and, for seasonal series, the (P, D, Q) seasonal order terms as well.
Note
If using a seasonal approach, the parameter m
must be set to an integer value that represents the seasonal
periodicity. In this mode, with m
set, the ARIMA terms (p, d, q) will be optimized along with (P, D, Q). Due to
the complexity of optimizing these terms, this execution mode will take far longer than an optimization of a
non-seasonal model.
import numpy as np
from pmdarima.arima.auto import AutoARIMA
from pmdarima.model_selection import SlidingWindowForecastCV
from diviner.utils.example_utils.example_data_generator import generate_example_data
from diviner import GroupedPmdarima


def get_and_print_model_metrics_params(grouped_model):
    """Fetch and display the per-group fit metrics and fitted model parameters."""
    metrics = grouped_model.get_metrics()
    params = grouped_model.get_model_params()

    print(f"\nModel Fit Metrics:\n{metrics.to_string()}")
    print(f"\nModel Fit Params:\n{params.to_string()}")


if __name__ == "__main__":

    # Synthesize 3 years of daily data for 4 series, each series identified by
    # a unique combination of 3 grouping columns.
    example_data = generate_example_data(
        column_count=3,
        series_count=4,
        series_size=365 * 3,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_data.df
    grouping_columns = example_data.key_columns

    # Let pmdarima's AutoARIMA search for the best (p, d, q) order per group.
    grouped_model = GroupedPmdarima(
        model_template=AutoARIMA(out_of_sample_size=60, maxiter=100)
    ).fit(
        df=df,
        group_key_columns=grouping_columns,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Round-trip the fitted model through serialization.
    save_dir = "/tmp/group_pmdarima/autoarima.gpmd"
    grouped_model.save(save_dir)
    loaded_model = GroupedPmdarima.load(save_dir)

    print("\nAutoARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(loaded_model)

    print("\nPredictions:\n", "-" * 40)
    forecast = loaded_model.predict(n_periods=30, alpha=0.1, return_conf_int=True)
    print(forecast.to_string())

    print("\nCross validation metric results:\n", "-" * 40)
    cv = SlidingWindowForecastCV(h=30, step=180, window_size=365)
    cv_metrics = loaded_model.cross_validate(
        df=df,
        metrics=["mean_squared_error", "smape", "mean_absolute_error"],
        cross_validator=cv,
        error_score=np.nan,
        verbosity=3,
    )

    print(cv_metrics.to_string())
GroupedPmdarima Pipeline Example
This example shows the utilization of a pmdarima.pipeline.Pipeline
, incorporating preprocessing operations
to each series. In the example below, a
Box Cox
transformation is applied to each series to force stationarity.
Note
The data set used for these examples is a randomly generated, non-deterministic group of series data. As such, the relevance of utilizing a normalcy transform on this data is somewhere between ‘unlikely’ and ‘zero’; the BoxCox transform here serves as an API example only.
import numpy as np
from pmdarima.arima.auto import AutoARIMA
from pmdarima.pipeline import Pipeline
from pmdarima.preprocessing import BoxCoxEndogTransformer
from pmdarima.model_selection import RollingForecastCV
from diviner.utils.example_utils.example_data_generator import generate_example_data
from diviner import GroupedPmdarima


def get_and_print_model_metrics_params(grouped_model):
    """Fetch and display the per-group fit metrics and fitted model parameters."""
    metrics = grouped_model.get_metrics()
    params = grouped_model.get_model_params()

    print(f"\nModel Fit Metrics:\n{metrics.to_string()}")
    print(f"\nModel Fit Params:\n{params.to_string()}")


if __name__ == "__main__":

    # Synthesize 3 years of daily data for 2 series, each series identified by
    # a unique combination of 3 grouping columns.
    example_data = generate_example_data(
        column_count=3,
        series_count=2,
        series_size=365 * 3,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_data.df
    grouping_columns = example_data.key_columns

    # Apply a Box-Cox transform to each series before handing it to AutoARIMA
    # (demonstrates pmdarima Pipeline preprocessing within GroupedPmdarima).
    pipeline = Pipeline(
        steps=[
            (
                "box",
                BoxCoxEndogTransformer(lmbda2=0.4, neg_action="ignore", floor=1e-12),
            ),
            ("arima", AutoARIMA(out_of_sample_size=60, max_p=4, max_q=4, max_d=4)),
        ]
    )
    grouped_model = GroupedPmdarima(model_template=pipeline).fit(
        df=df,
        group_key_columns=grouping_columns,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Round-trip the fitted model through serialization.
    save_dir = "/tmp/group_pmdarima/pipeline.gpmd"
    grouped_model.save(save_dir)
    loaded_model = GroupedPmdarima.load(save_dir)

    print("\nPipeline AutoARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(loaded_model)

    print("\nPredictions:\n", "-" * 40)
    forecast = loaded_model.predict(
        n_periods=30, alpha=0.2, predict_col="predictions", return_conf_int=True
    )
    print(forecast.to_string())

    print("\nCross validation metric results:\n", "-" * 40)
    cv = RollingForecastCV(h=30, step=365, initial=730)
    cv_metrics = loaded_model.cross_validate(
        df=df,
        metrics=["mean_squared_error"],
        cross_validator=cv,
        error_score=np.nan,
        verbosity=3,
    )

    print(cv_metrics.to_string())
GroupedPmdarima Group Subset Prediction Example
This example shows a subset prediction of groups by using the ``predict_groups`` method (``diviner.GroupedPmdarima.predict_groups``).
from pmdarima.arima.arima import ARIMA
from diviner import GroupedPmdarima
from diviner.utils.example_utils.example_data_generator import generate_example_data

if __name__ == "__main__":

    # Synthesize 4 years of daily data for 6 series keyed by 2 grouping columns.
    example_data = generate_example_data(
        column_count=2,
        series_count=6,
        series_size=365 * 4,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_data.df
    grouping_columns = example_data.key_columns

    # Fit a static ARIMA(2, 1, 3) order to every group.
    grouped_model = GroupedPmdarima(
        model_template=ARIMA(order=(2, 1, 3), out_of_sample_size=60)
    ).fit(
        df=df,
        group_key_columns=grouping_columns,
        y_col="y",
        datetime_col="ds",
        silence_warnings=True,
    )

    # Collect the distinct group-key tuples and keep the first 3 as the subset
    # of groups to forecast.
    group_df = df.copy()
    group_df["groups"] = list(
        group_df[grouping_columns].itertuples(index=False, name=None)
    )
    distinct_groups = group_df["groups"].unique()
    groups_to_predict = list(distinct_groups[:3])

    print("-" * 65)
    print(f"Unique groups that have been modeled: {distinct_groups}")
    print(f"Subset of groups to generate predictions for: {groups_to_predict}")
    print("-" * 65)

    # Generate forecasts only for the selected subset of groups.
    forecasts = grouped_model.predict_groups(
        groups=groups_to_predict,
        n_periods=60,
        predict_col="forecast_values",
        on_error="warn",
    )

    print(f"\nForecast values:\n{forecasts.to_string()}")
GroupedPmdarima Series Analysis Example
The below script illustrates how to perform analytics on a grouped series data set. Applying the results of these utilities can aid in determining appropriate order values (p, d, q) and seasonal order values (P, D, Q) for the example shown in the ARIMA example.
import pprint
from diviner import PmdarimaAnalyzer

from diviner.utils.example_utils.example_data_generator import generate_example_data


def _print_dict(data, name):
    """Pretty-print a per-group analysis result beneath a labeled banner."""
    print("\n" + "-" * 100)
    print(f"{name} values for the groups")
    print("-" * 100, "\n")
    pprint.PrettyPrinter(indent=2).pprint(data)


if __name__ == "__main__":

    # Synthesize 12 years of daily data for 3 series keyed by 4 grouping columns.
    example_data = generate_example_data(
        column_count=4,
        series_count=3,
        series_size=365 * 12,
        start_dt="2010-01-01",
        days_period=1,
    )
    df = example_data.df
    grouping_columns = example_data.key_columns

    # One PmdarimaAnalyzer is reused for every analysis below: the grouped
    # data set collection it builds is lazily evaluated and shared across
    # subsequent operations on the same data.
    analyzer = PmdarimaAnalyzer(
        df=df,
        group_key_columns=grouping_columns,
        y_col="y",
        datetime_col="ds",
    )

    # Decompose each group's series (weekly period, additive model).
    decomposed_trends = analyzer.decompose_groups(m=7, type_="additive")

    print("Decomposed trend data for the groups")
    print("-" * 100, "\n")
    print(decomposed_trends[:50].to_string())

    # Estimate the 'd' differencing term per group via the KPSS test.
    ndiffs = analyzer.calculate_ndiffs(alpha=0.1, test="kpss", max_d=5)
    _print_dict(ndiffs, "Differencing")

    # Estimate the seasonal 'D' differencing term per group (m=365, OCSB test).
    nsdiffs = analyzer.calculate_nsdiffs(m=365, test="ocsb", max_D=5)
    _print_dict(nsdiffs, "Seasonal Differencing")

    # Autocorrelation function per group.
    # NOTE(review): both `unbiased` and `adjusted` are passed here; `unbiased`
    # is a deprecated alias of `adjusted` in statsmodels' acf, so newer
    # statsmodels versions may warn or reject the duplicate — confirm against
    # the installed version.
    group_acf = analyzer.calculate_acf(
        unbiased=True, nlags=120, qstat=True, fft=True, alpha=0.05, adjusted=True
    )
    _print_dict(group_acf, "Autocorrelation function")

    # Partial autocorrelation function per group (Yule-Walker method).
    group_pacf = analyzer.calculate_pacf(nlags=120, method="yw", alpha=0.05)
    _print_dict(group_pacf, "Partial Autocorrelation function")

    # Apply a lag-7 single-order diff to each group's series...
    group_diff = analyzer.generate_diff(lag=7, differences=1)
    _print_dict(group_diff, "Differencing")

    # ...and invert that diff (with recenter enabled).
    group_diff_inv = analyzer.generate_diff_inversion(
        group_diff, lag=7, differences=1, recenter=True
    )
    _print_dict(group_diff_inv, "Differencing Inversion")
GroupedPmdarima Differencing Term Manual Calculation Example
This script below shows a means of dramatically reducing the optimization time of AutoARIMA through the manual
calculation of the differencing term 'd'
for each series in the grouped series data set. By manually setting
this argument (which can be either unique for each group or homogeneous across all groups), the optimization algorithm
can reduce the total number of iterative validation tests.
from diviner.utils.example_utils.example_data_generator import generate_example_data
from diviner import GroupedPmdarima, PmdarimaAnalyzer
from pmdarima.pipeline import Pipeline
from pmdarima import AutoARIMA
from pmdarima.model_selection import SlidingWindowForecastCV


def get_and_print_model_metrics_params(grouped_model):
    """Fetch and display the per-group fit metrics and fitted model parameters."""
    metrics = grouped_model.get_metrics()
    params = grouped_model.get_model_params()

    print(f"\nModel Fit Metrics:\n{metrics.to_string()}")
    print(f"\nModel Fit Params:\n{params.to_string()}")


if __name__ == "__main__":

    # Synthesize 3 years of daily data for 3 series, each series identified by
    # a unique combination of 3 grouping columns.
    example_data = generate_example_data(
        column_count=3,
        series_count=3,
        series_size=365 * 3,
        start_dt="2019-01-01",
        days_period=1,
    )

    df = example_data.df
    grouping_columns = example_data.key_columns

    pipeline = Pipeline(
        steps=[
            (
                "arima",
                AutoARIMA(
                    max_order=14,
                    out_of_sample_size=90,
                    suppress_warnings=True,
                    error_action="ignore",
                ),
            )
        ]
    )

    # Pre-compute the differencing term 'd' for each group so that AutoARIMA
    # can skip that part of its search space, shortening optimization time.
    diff_analyzer = PmdarimaAnalyzer(
        df=df,
        group_key_columns=grouping_columns,
        y_col="y",
        datetime_col="ds",
    )
    ndiff = diff_analyzer.calculate_ndiffs(
        alpha=0.05,
        test="kpss",
        max_d=4,
    )

    # The per-group 'd' values are supplied to fit() via the `ndiffs` argument.
    grouped_model = GroupedPmdarima(model_template=pipeline).fit(
        df=df,
        group_key_columns=grouping_columns,
        y_col="y",
        datetime_col="ds",
        ndiffs=ndiff,
        silence_warnings=True,
    )

    # Round-trip the fitted model through serialization.
    save_dir = "/tmp/group_pmdarima/pipeline_override.gpmd"
    grouped_model.save(save_dir)
    loaded_model = GroupedPmdarima.load(save_dir)

    print("\nAutoARIMA results:\n", "-" * 40)
    get_and_print_model_metrics_params(loaded_model)

    print("\nPredictions:\n", "-" * 40)
    forecast = loaded_model.predict(
        n_periods=30, alpha=0.1, predict_col="forecasted_values", return_conf_int=True
    )
    print(forecast.to_string())

    cv = SlidingWindowForecastCV(h=90, step=120, window_size=180)
    cv_metrics = loaded_model.cross_validate(
        df=df,
        metrics=["smape", "mean_squared_error", "mean_absolute_error"],
        cross_validator=cv,
    )

    print("\nCross validation metrics:\n", "-" * 40)
    print(cv_metrics.to_string())
Supplementary
Note
To run these examples for yourself, utilize the following data generator code:
1import itertools
2import pandas as pd
3import numpy as np
4import string
5import random
6from datetime import timedelta, datetime
7from collections import namedtuple
8
9
10def _generate_time_series(series_size: int):
11 residuals = np.random.lognormal(
12 mean=np.random.uniform(low=0.5, high=3.0),
13 sigma=np.random.uniform(low=0.6, high=0.98),
14 size=series_size,
15 )
16 trend = [
17 np.polyval([23.0, 1.0, 5], x)
18 for x in np.linspace(start=0, stop=np.random.randint(low=0, high=4), num=series_size)
19 ]
20 seasonality = [
21 90 * np.sin(2 * np.pi * 1000 * (i / (series_size * 200))) + 40
22 for i in np.arange(0, series_size)
23 ]
24
25 return residuals + trend + seasonality + np.random.uniform(low=20.0, high=1000.0)
26
27
28def _generate_grouping_columns(column_count: int, series_count: int):
29 candidate_list = list(string.ascii_uppercase)
30 candidates = random.sample(
31 list(itertools.permutations(candidate_list, column_count)), series_count
32 )
33 column_names = sorted([f"key{x}" for x in range(column_count)], reverse=True)
34 return [dict(zip(column_names, entries)) for entries in candidates]
35
36
def _generate_raw_df(
    column_count: int,
    series_count: int,
    series_size: int,
    start_dt: str,
    days_period: int,
):
    """Assemble the full example DataFrame.

    Generates one synthetic series per group, stamps each row with that
    group's key-column values, and concatenates all series into a single
    DataFrame with a ``ds`` datetime column, a ``y`` value column, and the
    grouping columns.

    :param start_dt: series start date in ``YYYY-MM-DD`` form.
    :param days_period: spacing, in days, between consecutive points.
    """
    candidates = _generate_grouping_columns(column_count, series_count)
    # BUGFIX: the format string previously used "%M" (minute) instead of
    # "%m" (month), so e.g. "2019-06-01" silently parsed as Jan 1, minute 6.
    start_date = datetime.strptime(start_dt, "%Y-%m-%d")
    dates = np.arange(
        start_date,
        start_date + timedelta(days=series_size * days_period),
        timedelta(days=days_period),
    )
    df_collection = []
    for entry in candidates:
        generated_series = _generate_time_series(series_size)
        series_df = pd.DataFrame.from_dict({"ds": dates, "y": generated_series})
        # Broadcast this group's key values onto every row of its series.
        for column, value in entry.items():
            series_df[column] = value
        df_collection.append(series_df)
    return pd.concat(df_collection)
60
61
def generate_example_data(
    column_count: int,
    series_count: int,
    series_size: int,
    start_dt: str,
    days_period: int = 1,
):
    """Generate a grouped example data set.

    Returns a namedtuple with ``df`` (the stacked series DataFrame) and
    ``key_columns`` (the grouping column names, i.e. every column except
    ``ds`` and ``y``).
    """
    frame = _generate_raw_df(
        column_count, series_count, series_size, start_dt, days_period
    )
    key_columns = [column for column in frame.columns if column not in ("ds", "y")]

    Structure = namedtuple("Structure", "df key_columns")
    return Structure(frame, key_columns)