# Source code for diviner.data.base_group_generator

"""
Abstract Base Class for defining the API contract for group generator operations.
This base class is a template for package-specific implementations that function to
convert a normalized representation of grouped time series into per-group collections
of discrete time series so that forecasting models can be trained on each group.
"""
import abc
from typing import Tuple
from diviner.exceptions import DivinerException


class BaseGroupGenerator(abc.ABC):
    """
    Abstract class for defining the basic elements of performing a group processing
    collection generation operation.

    Concrete subclasses must implement :meth:`generate_processing_groups` and
    :meth:`generate_prediction_groups`.
    """

    def __init__(self, group_key_columns: Tuple, datetime_col: str, y_col: str):
        """
        Grouping key columns must be defined to serve in the construction of a consolidated
        single unique key that is used to identify a particular unique time series. The
        unique combinations of these provided fields define and control the grouping of
        univariate series data in order to train (fit) a particular model upon each of the
        unique series (that are defined by the combination of the values within these
        supplied columns).

        The primary purpose of the children of this class is to generate a dictionary of:
        ``{<group_key> : <DataFrame with unique univariate series>}``.
        The ``group_key`` element is constructed as a tuple of the values within the columns
        specified by ``_group_key_columns`` in this class constructor.

        For example, with a normalized data set provided of:

        ========== ==== ====== ======
        ds         y    group1 group2
        ========== ==== ====== ======
        2021-09-02 11.1 "a"    "z"
        2021-09-03 7.33 "a"    "z"
        2021-09-02 31.1 "b"    "q"
        2021-09-03 44.1 "b"    "q"
        ========== ==== ====== ======

        There are two separate univariate series: ``("a", "z")`` and ``("b", "q")``.
        The group generator's function is to convert this unioned ``DataFrame`` into the
        following:

        ``{ ("a", "z"):``

        ========== ==== ====== ======
        ds         y    group1 group2
        ========== ==== ====== ======
        2021-09-02 11.1 "a"    "z"
        2021-09-03 7.33 "a"    "z"
        ========== ==== ====== ======

        ``,("b", "q"):``

        ========== ==== ====== ======
        ds         y    group1 group2
        ========== ==== ====== ======
        2021-09-02 31.1 "b"    "q"
        2021-09-03 44.1 "b"    "q"
        ========== ==== ====== ======

        ``}``

        This grouping allows for a model to be fit to each of these series in isolation.

        :param group_key_columns: ``Tuple[str]`` of column names that determine which
                                  elements of the submitted ``DataFrame`` determine
                                  uniqueness of a particular time series.
        :param datetime_col: The name of the column that contains the ``datetime`` values
                             for each series.
        :param y_col: The endogenous regressor element of the series. This is the value
                      that is used for training and is the element that is intended to be
                      forecast.
        :raises DivinerException: If ``group_key_columns`` is ``None`` or empty.
        """
        # Truthiness covers both ``None`` and an empty sequence, so no separate
        # length check is required.
        if not group_key_columns:
            # NOTE: the message text is kept verbatim (including the leading
            # underscore in the argument name) because callers may match on it.
            raise DivinerException(
                "Argument '_group_key_columns' tuple must contain at "
                "least one string entry."
            )

        self._group_key_columns = group_key_columns
        self._datetime_col = datetime_col
        self._y_col = y_col
        # Name used for the consolidated grouping key entry (see the
        # ``generate_processing_groups`` contract).
        self._master_group_key = "grouping_key"

    @abc.abstractmethod
    def generate_processing_groups(self, df):
        """
        Abstract method for the generation of processing execution groups for individual
        models. Implementations of this method should generate a processing collection that
        is a relation between the unique combinations of ``_group_key_columns`` values,
        generated as a ``_master_group_key`` entry that defines a specific datetime series
        for forecasting.

        For example, with a normalized dataframe input of

        ========== ======= ======= ==
        ds         region  country y
        ========== ======= ======= ==
        2020-01-01 SW      USA     42
        2020-01-02 SW      USA     11
        2020-01-01 NE      USA     31
        2020-01-01 Ontario CA      12
        ========== ======= ======= ==

        The output structure should be, with the group_keys value specified as
        ``("country", "region")``:

        ``[{ ("USA", "SW"):``

        ========== ====== ======= ==
        ds         region country y
        ========== ====== ======= ==
        2020-01-01 SW     USA     42
        2020-01-02 SW     USA     11
        ========== ====== ======= ==

        ``}, {("USA", "NE"):``

        ========== ====== ======= ==
        ds         region country y
        ========== ====== ======= ==
        2020-01-01 NE     USA     31
        ========== ====== ======= ==

        ``}, {("CA", "Ontario"):``

        ========== ======= ======= ==
        ds         region  country y
        ========== ======= ======= ==
        2020-01-01 Ontario CA      12
        ========== ======= ======= ==

        ``}]``

        The list wrapper around dictionaries is to allow for multiprocessing support
        without having to contend with encapsulating the entire dictionary for the
        processing of a single key and value pair.

        :param df: The user-input normalized DataFrame with ``_group_key_columns``
        :return: A list of dictionaries of
                 ``{group_key: <group's univariate series data>}`` structure for isolated
                 processing by the model APIs.
        """

    @abc.abstractmethod
    def generate_prediction_groups(self, df):
        """
        Abstract method for generating the data set collection required for manual
        prediction for arbitrary datetime and key groupings.

        :param df: Normalized ``DataFrame`` that contains the columns defined in instance
                   attribute ``_group_key_columns`` within its schema and the dates for
                   prediction within the ``datetime_col`` field.
        :return: ``List(tuple(master_group_key, df))`` the processing collection of
                 ``DataFrame`` coupled with their group identifier.
        """