# Source code for diviner.data.base_group_generator

"""
Abstract Base Class for defining the API contract for group generator operations.
This base class is a template for package-specific implementations that function to
convert a normalized representation of grouped time series into per-group collections
of discrete time series so that forecasting models can be trained on each group.
"""
import abc
from typing import Tuple
from diviner.exceptions import DivinerException


class BaseGroupGenerator(abc.ABC):
    """
    Abstract class for defining the basic elements of performing a group processing collection
    generation operation.

    Subclasses must implement ``generate_processing_groups`` and
    ``generate_prediction_groups``; this base class only validates and stores the column
    configuration that those implementations rely on.
    """

    def __init__(self, group_key_columns: Tuple[str, ...], datetime_col: str, y_col: str) -> None:
        """
        Grouping key columns must be defined to serve in the construction of a consolidated
        single unique key that is used to identify a particular unique time series. The
        unique combinations of these provided fields define and control the grouping of
        univariate series data in order to train (fit) a particular model upon each of the
        unique series (that are defined by the combination of the values within these supplied
        columns).

        The primary purpose of the children of this class is to generate a dictionary of:
        ``{<group_key> : <DataFrame with unique univariate series>}``.
        The ``group_key`` element is constructed as a tuple of the values within the columns
        specified by ``_group_key_columns`` in this class constructor.

        For example, with a normalized data set provided of:

        =========== ==== ====== ======
        ds          y    group1 group2
        =========== ==== ====== ======
        2021-09-02  11.1 "a"    "z"
        2021-09-03  7.33 "a"    "z"
        2021-09-02  31.1 "b"    "q"
        2021-09-03  44.1 "b"    "q"
        =========== ==== ====== ======

        There are two separate univariate series: ``("a", "z")`` and ``("b", "q")``.
        The group generator's function is to convert this unioned ``DataFrame`` into the following:

        ``{ ("a", "z"):``

        ========== ==== ====== ======
        ds         y    group1 group2
        ========== ==== ====== ======
        2021-09-02 11.1 "a"    "z"
        2021-09-03 7.33 "a"    "z"
        ========== ==== ====== ======

          ``,("b", "q"):``

        ========== ==== ====== ======
        ds         y    group1 group2
        ========== ==== ====== ======
        2021-09-02 31.1 "b"    "q"
        2021-09-03 44.1 "b"    "q"
        ========== ==== ====== ======

        ``}``

        This grouping allows for a model to be fit to each of these series in isolation.

        :param group_key_columns: ``Tuple[str]`` of column names that determine which elements of
                                  the submitted ``DataFrame`` determine uniqueness of a particular
                                  time series.
        :param datetime_col: The name of the column that contains the ``datetime`` values for
                             each series.
        :param y_col: The endogenous regressor element of the series. This is the value that is
                      used for training and is the element that is intended to be forecast.
        :raises DivinerException: If ``group_key_columns`` is ``None`` or empty.
        """
        # Truthiness covers both ``None`` and an empty collection, so no separate
        # length check is required.
        if not group_key_columns:
            raise DivinerException(
                "Argument '_group_key_columns' tuple must contain at least one string entry."
            )

        self._group_key_columns = group_key_columns
        self._datetime_col = datetime_col
        self._y_col = y_col
        # Fixed name under which the consolidated grouping key is referenced.
        self._master_group_key = "grouping_key"

    @abc.abstractmethod
    def generate_processing_groups(self, df):
        """
        Abstract method for the generation of processing execution groups for individual models.
        Implementations of this method should generate a processing collection that is a relation
        between the unique combinations of ``_group_key_columns`` values, generated as a
        ``_master_group_key`` entry that defines a specific datetime series for forecasting.

        For example, with a normalized dataframe input of

        ========== ======= ======= ==
        ds         region  country y
        ========== ======= ======= ==
        2020-01-01 SW      USA     42
        2020-01-02 SW      USA     11
        2020-01-01 NE      USA     31
        2020-01-01 Ontario CA      12
        ========== ======= ======= ==

        With the ``_group_key_columns`` value specified as ``("country", "region")``, the output
        structure should be:

        ``[{ ("USA", "SW"):``

        ========== ====== ======= ==
        ds         region country y
        ========== ====== ======= ==
        2020-01-01 SW     USA     42
        2020-01-02 SW     USA     11
        ========== ====== ======= ==

        ``}, {("USA", "NE"):``

        ========== ====== ======= ==
        ds         region country y
        ========== ====== ======= ==
        2020-01-01 NE     USA     31
        ========== ====== ======= ==

        ``}, {("CA", "Ontario"):``

        ========== ======= ======= ==
        ds         region  country y
        ========== ======= ======= ==
        2020-01-01 Ontario CA      12
        ========== ======= ======= ==

        ``}]``

        The list wrapper around dictionaries is to allow for multiprocessing support without having
        to contend with encapsulating the entire dictionary for the processing of a single key
        and value pair.

        :param df: The user-input normalized ``DataFrame`` with ``_group_key_columns``
        :return: A list of dictionaries of ``{group_key: <group's univariate series data>}``
                 structure for isolated processing by the model APIs.
        """

    @abc.abstractmethod
    def generate_prediction_groups(self, df):
        """
        Abstract method for generating the data set collection required for manual prediction for
        arbitrary datetime and key groupings.

        :param df: Normalized ``DataFrame`` that contains the columns defined in instance attribute
                   ``_group_key_columns`` within its schema and the dates for prediction within the
                   ``datetime_col`` field.
        :return: ``List(tuple(master_group_key, df))`` the processing collection of ``DataFrame``
                 coupled with their group identifier.
        """