# Source code for diviner.data.pandas_group_generator

from diviner.data.base_group_generator import BaseGroupGenerator
from diviner.utils.common import _validate_keys_in_df
import pandas as pd
from typing import Tuple


class PandasGroupGenerator(BaseGroupGenerator):
    """
    Split a normalized, multi-series ``DataFrame`` into one ``DataFrame`` per series.

    The input is a single ``DataFrame`` holding many time series, e.g.:

    =========== ==== ============ ======
    region      zone ds           y
    =========== ==== ============ ======
    'northeast' 1    "2021-10-01" 1234.5
    'northeast' 2    "2021-10-01" 3255.6
    'northeast' 1    "2021-10-02" 1255.9
    =========== ==== ============ ======

    Here the grouping keys ``['region', 'zone']`` define the unique series of the target
    ``y`` indexed by ``ds``.

    This class will

    #. Generate a `master group key` that is a tuple of the values of the grouping key
       columns specified by the user, preserving the order of declaration of these keys.
    #. Group the ``DataFrame`` by this master grouping key and generate a list of tuples
       of the form ``(master_grouping_key, <series DataFrame>)`` that can be iterated
       over to build an individual forecasting model for each master key group.
    """

    def __init__(self, group_key_columns: Tuple, datetime_col: str, y_col: str):
        """
        :param group_key_columns: Grouping columns whose combined values designate a
                                  distinct series of ``ds`` and ``y``.
        :param datetime_col: The name of the column that contains the ``datetime`` values
                             for each series.
        :param y_col: The name of the column holding the endogenous element of the series.
                      This is the value that is used for training and that is intended to
                      be forecast.
        """
        # NOTE(review): the base initializer presumably stores these as well (it is also
        # expected to provide ``_master_group_key``, which this class reads but never
        # sets). The assignments are kept, in their original order, until that is
        # confirmed against ``BaseGroupGenerator``.
        self._group_key_columns = group_key_columns
        self._datetime_col = datetime_col
        self._y_col = y_col
        super().__init__(group_key_columns, datetime_col, y_col)

    def _get_df_with_master_key_column(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of ``df`` with the master grouping key column added.

        The master grouping key is a tuple built, row by row, from the values of the
        ``_group_key_columns`` (in their declared order). It serves as a single
        aggregation key that identifies one unique series.

        For example, given:

        =========== ==== ============ ======
        region      zone ds           y
        =========== ==== ============ ======
        'northeast' 1    "2021-10-01" 1234.5
        'northeast' 2    "2021-10-01" 3255.6
        'northeast' 1    "2021-10-02" 1255.9
        =========== ==== ============ ======

        and ``group_key_columns`` of ``('region', 'zone')``, the returned ``DataFrame``
        is (the new column is named by ``_master_group_key``, shown here as
        ``grouping_key``):

        =========== ==== ============ ====== ================
        region      zone ds           y      grouping_key
        =========== ==== ============ ====== ================
        'northeast' 1    "2021-10-01" 1234.5 ('northeast', 1)
        'northeast' 2    "2021-10-01" 3255.6 ('northeast', 2)
        'northeast' 1    "2021-10-02" 1255.9 ('northeast', 1)
        =========== ==== ============ ====== ================

        The passed-in ``DataFrame`` is not modified.

        :param df: The normalized ``DataFrame``.
        :return: A copy of the passed-in ``DataFrame`` with a master grouping key column
                 added that contains the group definition for each row.
        """
        _validate_keys_in_df(df, self._group_key_columns)

        # Work on a copy so the caller's DataFrame is left untouched.
        keyed_df = df.copy()
        key_columns = list(self._group_key_columns)
        # ``axis=1`` hands each *row* to ``tuple``, producing one key tuple per row.
        keyed_df[self._master_group_key] = keyed_df[key_columns].apply(tuple, axis=1)
        return keyed_df

    def _split_by_master_key(self, keyed_df: pd.DataFrame) -> list:
        """
        Split ``keyed_df`` into a list of ``(master_group_key, group DataFrame)`` tuples.

        :param keyed_df: A ``DataFrame`` that contains the master grouping key column.
        :return: One ``(key, DataFrame)`` tuple per distinct master grouping key, in the
                 order in which ``groupby`` yields them.
        """
        # Iterating a GroupBy already yields unique (key, frame) pairs.
        return list(keyed_df.groupby(self._master_group_key))

    def generate_processing_groups(self, df: pd.DataFrame) -> list:
        """
        Generate the collection of ``[(master_grouping_key, <group DataFrame>)]``.

        This method calls ``_get_df_with_master_key_column()`` to add a column containing
        the tuple of the values within the ``_group_key_columns`` fields, sums ``y`` over
        every ``(master grouping key, datetime)`` pair so that duplicated rows are
        consolidated, and then splits the result into one ``DataFrame`` per key.

        For example, after adding the grouping key column, the ``DataFrame`` will look
        like this:

        =========== ==== ============ ====== ================
        region      zone ds           y      grouping_key
        =========== ==== ============ ====== ================
        'northeast' 1    "2021-10-01" 1234.5 ('northeast', 1)
        'northeast' 2    "2021-10-01" 3255.6 ('northeast', 2)
        'northeast' 1    "2021-10-02" 1255.9 ('northeast', 1)
        =========== ==== ============ ====== ================

        This method will translate this structure to

        ``[(('northeast', 1),``

        ================ ============ ======
        grouping_key     ds           y
        ================ ============ ======
        ('northeast', 1) "2021-10-01" 1234.5
        ('northeast', 1) "2021-10-02" 1255.9
        ================ ============ ======

        ``), (('northeast', 2),``

        ================ ============ ======
        grouping_key     ds           y
        ================ ============ ======
        ('northeast', 2) "2021-10-01" 3255.6
        ================ ============ ======

        ``)]``

        Only the master grouping key, datetime and ``y`` columns survive the
        consolidation; every other column of ``df`` is dropped.

        :param df: Normalized ``DataFrame`` that contains the columns defined in instance
                   attribute ``_group_key_columns`` within its schema.
        :return: ``List(tuple(master_group_key, df))`` the processing collection of
                 ``DataFrame`` coupled with their group identifier.
        """
        keyed_df = self._get_df_with_master_key_column(df)

        # Collapse duplicate (series, timestamp) rows into a single summed observation.
        consolidated_df = (
            keyed_df.groupby([self._master_group_key, self._datetime_col])[self._y_col]
            .agg("sum")
            .reset_index()
        )
        return self._split_by_master_key(consolidated_df)

    def generate_prediction_groups(self, df: pd.DataFrame) -> list:
        """
        Generate the data set collection required to run a manual per-``datetime``
        prediction for arbitrary datetime and key groupings.

        Unlike ``generate_processing_groups()``, no aggregation is applied: every row and
        every column of ``df`` is retained in the group it belongs to.

        :param df: Normalized ``DataFrame`` that contains the columns defined in instance
                   attribute ``_group_key_columns`` within its schema and the dates for
                   prediction within the ``datetime_col`` field.
        :return: ``List(tuple(master_group_key, df))`` the processing collection of
                 ``DataFrame`` coupled with their group identifier.
        """
        keyed_df = self._get_df_with_master_key_column(df)
        return self._split_by_master_key(keyed_df)