GroupedProphet Example Scripts
The scripts included below show the various options available for utilizing the GroupedProphet
API.
For an alternative view of these examples with data visualizations, see the accompanying example notebooks.
GroupedProphet Example
This script shows a simple example of training a series of Prophet models, saving a GroupedProphet
instance, loading that instance, cross-validating through backtesting, and generating a forecast
for each group.
1from diviner.utils.example_utils.example_data_generator import generate_example_data
2from diviner import GroupedProphet
3
4
def execute_grouped_prophet():
    """
    Train, persist, reload, cross-validate, and forecast a ``GroupedProphet``
    model over synthetic grouped time series data.

    This function call will generate synthetic group time series data in a
    normalized format. The structure will be of:

    ============ ====== =========== =========== ===========
    ds           y      group_key_1 group_key_2 group_key_3
    ============ ====== =========== =========== ===========
    "2016-02-01" 1234.5 A           B           C
    ============ ====== =========== =========== ===========

    With the grouping key values that are generated per ``ds`` and ``y`` values
    assigned in a non-deterministic fashion.

    For utilization of this API, the normalized representation of the data is
    required, such that a particular target variable's data ``'y'`` and the
    associated indexed datetime values in ``'ds'`` are 'stacked' (unioned) from
    a more traditional denormalized data storage paradigm.

    For guidance on this data transposition from denormalized representations, see:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html
    """

    generated_data = generate_example_data(
        column_count=3,
        series_count=10,
        series_size=365 * 5,
        start_dt="2016-02-01",
        days_period=1,
    )

    # Extract the normalized grouped datetime series data
    training_data = generated_data.df

    # Extract the names of the grouping columns that define the unique series data
    grouping_key_columns = generated_data.key_columns

    # Create a GroupedProphet model instance and fit one Prophet model per group
    grouped_model = GroupedProphet(n_changepoints=20, uncertainty_samples=0).fit(
        training_data, grouping_key_columns
    )

    # Save the model to the local file system.
    # Fix: reuse ``save_path`` rather than duplicating the path literal in the
    # save() call, so the save and load locations cannot silently drift apart.
    save_path = "/tmp/grouped_prophet.gpm"
    grouped_model.save(path=save_path)

    # Load the model from the local storage location
    retrieved_model = GroupedProphet.load(save_path)

    # Score the model via backtesting cross validation and print the results
    model_scores = retrieved_model.cross_validate_and_score(
        horizon="30 days",
        period="180 days",
        initial="365 days",
        parallel="threads",
        rolling_window=0.05,
        monthly=False,
    )

    print(f"Model scores:\n{model_scores.to_string()}")

    # Run a forecast for each group
    forecasts = retrieved_model.forecast(horizon=20, frequency="D")

    print(f"Forecasted data:\n{forecasts[:50].to_string()}")

    # Extract the parameters from each model for logging
    params = retrieved_model.extract_model_params()

    print(f"Model parameters:\n{params.to_string()}")


if __name__ == "__main__":
    execute_grouped_prophet()
GroupedProphet Subset Group Prediction Example
This script shows a simple example of training a series of Prophet models and generating a group subset prediction.
1from diviner.utils.example_utils.example_data_generator import generate_example_data
2from diviner import GroupedProphet
3
if __name__ == "__main__":

    # Build a synthetic, normalized grouped-series dataset to train on.
    generated_data = generate_example_data(
        column_count=3,
        series_count=10,
        series_size=365 * 5,
        start_dt="2016-02-01",
        days_period=1,
    )

    # The normalized grouped datetime series data
    training_data = generated_data.df

    # Names of the grouping columns that define each unique series
    group_key_columns = generated_data.key_columns

    # Fit one Prophet model per distinct group in the training data
    grouped_model = GroupedProphet(n_changepoints=20, uncertainty_samples=0).fit(
        training_data, group_key_columns
    )

    # Collect the distinct group-key tuples, then pick a small subset of them
    # to demonstrate subset-only prediction.
    keyed_frame = training_data.copy()
    keyed_frame["groups"] = list(zip(*(keyed_frame[column] for column in group_key_columns)))
    distinct_groups = keyed_frame["groups"].unique()
    groups_to_predict = list(distinct_groups[:3])

    print("-" * 65)
    print(f"\nUnique groups that have been modeled: \n{distinct_groups}\n")
    print(f"Subset of groups to generate predictions for: \n{groups_to_predict}\n")
    print("-" * 65)

    # Generate forecasts only for the selected subset of groups
    forecasts = grouped_model.predict_groups(
        groups=groups_to_predict,
        horizon=60,
        frequency="D",
        predict_col="forecast_values",
        on_error="warn",
    )

    print(f"\nForecast values:\n{forecasts.to_string()}")
Supplementary
Note
To run these examples yourself, use the following data generator code:
1import itertools
2import pandas as pd
3import numpy as np
4import string
5import random
6from datetime import timedelta, datetime
7from collections import namedtuple
8
9
10def _generate_time_series(series_size: int):
11 residuals = np.random.lognormal(
12 mean=np.random.uniform(low=0.5, high=3.0),
13 sigma=np.random.uniform(low=0.6, high=0.98),
14 size=series_size,
15 )
16 trend = [
17 np.polyval([23.0, 1.0, 5], x)
18 for x in np.linspace(start=0, stop=np.random.randint(low=0, high=4), num=series_size)
19 ]
20 seasonality = [
21 90 * np.sin(2 * np.pi * 1000 * (i / (series_size * 200))) + 40
22 for i in np.arange(0, series_size)
23 ]
24
25 return residuals + trend + seasonality + np.random.uniform(low=20.0, high=1000.0)
26
27
28def _generate_grouping_columns(column_count: int, series_count: int):
29 candidate_list = list(string.ascii_uppercase)
30 candidates = random.sample(
31 list(itertools.permutations(candidate_list, column_count)), series_count
32 )
33 column_names = sorted([f"key{x}" for x in range(column_count)], reverse=True)
34 return [dict(zip(column_names, entries)) for entries in candidates]
35
36
def _generate_raw_df(
    column_count: int,
    series_count: int,
    series_size: int,
    start_dt: str,
    days_period: int,
):
    """
    Build one normalized DataFrame containing ``series_count`` synthetic
    series stacked vertically, each row tagged with its grouping-key values.

    :param column_count: Number of grouping-key columns to generate.
    :param series_count: Number of distinct series (groups) to generate.
    :param series_size: Number of rows (datetime periods) per series.
    :param start_dt: Series start date as a ``"YYYY-MM-DD"`` string.
    :param days_period: Day interval between consecutive datetime entries.
    :return: A ``pandas.DataFrame`` with ``ds``, ``y``, and key columns.
    """
    candidates = _generate_grouping_columns(column_count, series_count)
    # Bug fix: the format string previously used "%M" (minute) instead of
    # "%m" (month), so "2016-02-01" parsed as Jan 1 2016 00:02 rather than
    # the intended Feb 1 2016.
    start_date = datetime.strptime(start_dt, "%Y-%m-%d")
    dates = np.arange(
        start_date,
        start_date + timedelta(days=series_size * days_period),
        timedelta(days=days_period),
    )
    df_collection = []
    for entry in candidates:
        generated_series = _generate_time_series(series_size)
        series_dict = {"ds": dates, "y": generated_series}
        series_df = pd.DataFrame.from_dict(series_dict)
        # Tag every row of this series with its grouping-key values.
        for column, value in entry.items():
            series_df[column] = value
        df_collection.append(series_df)
    return pd.concat(df_collection)
60
61
def generate_example_data(
    column_count: int,
    series_count: int,
    series_size: int,
    start_dt: str,
    days_period: int = 1,
):
    """
    Generate normalized synthetic grouped time series data for the examples.

    :param column_count: Number of grouping-key columns to create.
    :param series_count: Number of distinct series (groups) to create.
    :param series_size: Number of rows per series.
    :param start_dt: Start date for every series, e.g. ``"2016-02-01"``.
    :param days_period: Day interval between consecutive rows (default ``1``).
    :return: A named tuple with fields ``df`` (the stacked DataFrame) and
        ``key_columns`` (the grouping column names, i.e. all columns other
        than ``ds`` and ``y``).
    """
    Structure = namedtuple("Structure", "df key_columns")
    data = _generate_raw_df(column_count, series_count, series_size, start_dt, days_period)

    # Everything that is not the datetime index or the target is a grouping
    # key; filtering avoids the mutate-in-place remove() of the original,
    # which would raise if either column were ever absent.
    key_columns = [column for column in data.columns if column not in ("ds", "y")]

    return Structure(data, key_columns)