Source code for diviner.analysis.pmdarima_analyzer

import warnings

import pandas as pd
from packaging.version import Version
from pmdarima.arima import decompose, ndiffs, nsdiffs, is_constant
from pmdarima.utils import acf, pacf, diff, diff_inv
from diviner.data.pandas_group_generator import PandasGroupGenerator
from diviner.utils.common import _restructure_predictions
from diviner.exceptions import DivinerException
from diviner.utils.annotators import experimental


[docs]class PmdarimaAnalyzer: def __init__(self, df, group_key_columns, y_col, datetime_col): """ A class for performing analysis of a grouped timeseries data set. Included in this class are methods for: ``decompose_groups``: trend decomposition into `'trend'`, `'seasonal'` and `'random'` elements that collectively make up the underlying series data. ``calculate_ndiffs``: optimal selection of the ``d`` (differencing) term for ARIMA to convert a series to one that is stationary. Specifying this as a constant to ``AutoARIMA`` can drastically reduce training time. ``calculate_nsdiffs``: optimal selection of the ``D`` (seasonal differencing) term for ``SARIMAX`` to convert a seasonally-influenced series to a stationary one. Providing this as a constant to the ``AutoARIMA`` args ``D=<value>`` can reduce training time, eliminating recursive loops during optimization. ``calculate_is_constant``: a validation method to verify that each group's series contains more than a single value (constancy check). This can aid in filtering out submitted groups from the ``GroupedPmdarima.fit()`` method to prevent non-useful forecasts from being generated. (i.e., a repeated constant value throughout a series provides no value for forecasting and is a waste of runtime resources). ``calculate_acf``: grouped calculation of auto correlation factor values for each grouped series. ``calculate_pacf``: grouped calculation of partial auto correlation factor values for each grouped series. :param df: A DataFrame consisting of at least ``y_col``, ``group_key_columns``, and ``datetime_col`` columns to be analyzed. :param group_key_columns: The columns in the ``df`` argument that define, in aggregate, a unique time series entry. :param y_col: The name of the column within the DataFrame that contains the endogenous regressor term. :param datetime_col: The name of the column within the DataFrame input that defines the datetime or date values associated with each row of the endogenous regressor ``y_col`` data. """ self._df = df self._group_df = None self._group_key_columns = group_key_columns self._y_col = y_col self._datetime_col = datetime_col self._master_key = "grouping_key" def _create_group_df(self): if not self._group_df: self._group_df = PandasGroupGenerator( self._group_key_columns, self._datetime_col, self._y_col ).generate_processing_groups(self._df) def _decompose_group(self, group_df, group_key, m, type_, filter_): group_df.reset_index(inplace=True) group_decomposition = decompose(x=group_df[self._y_col], type_=type_, m=m, filter_=filter_) group_result = { key: getattr(group_decomposition, key) for key in group_decomposition._fields } output_df = pd.DataFrame.from_dict(group_result) output_df[self._datetime_col] = group_df[self._datetime_col] output_df[self._master_key] = output_df.apply(lambda x: group_key, 1) return output_df
[docs] @experimental def decompose_groups(self, m, type_, filter_=None): """ Utility method that wraps ``pmdarima.arima.decompose()`` for each group within the passed-in DataFrame. Note: decomposition works best if the total number of entries within the series being decomposed is a multiple of the `m` parameter value. :param m: The frequency of the endogenous series. (i.e., for daily data, an ``m`` value of ``'7'`` would be appropriate for estimating a weekly seasonality, while setting ``m`` to ``'365'`` would be effective for yearly seasonality effects.) :param type_: The type of decomposition to perform. One of: ``['additive', 'multiplicative']`` See: https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.\ decompose.html :param filter_: Optional Array for performing convolution. This is specified as a filter for coefficients (the Moving Average and/or Auto Regressor coefficients) in reverse time order in order to filter out a seasonal component. Default: None :return: Pandas DataFrame with the decomposed trends for each group. """ self._create_group_df() group_decomposition = { group_key: self._decompose_group(group_df, group_key, m, type_, filter_) for group_key, group_df in self._group_df } return _restructure_predictions( group_decomposition, self._group_key_columns, self._master_key )
[docs] @experimental def calculate_ndiffs(self, alpha=0.05, test="kpss", max_d=2): """ Utility method for determining the optimal ``d`` value for ARIMA ordering. Calculating this as a fixed value can dramatically increase the tuning time for ``pmdarima`` models. :param alpha: significance level for determining if a pvalue used for testing a value of ``'d'`` is significant or not. Default: ``0.05`` :param test: Type of unit test for stationarity determination to use. Supported values: ``['kpss', 'adf', 'pp']`` See: https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.KPSSTest.\ html#pmdarima.arima.KPSSTest https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.PPTest.\ html#pmdarima.arima.PPTest https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.ADFTest.\ html#pmdarima.arima.ADFTest Default: ``'kpss'`` :param max_d: The max value for ``d`` to test. :return: Dictionary of ``{<group_key>: <optimal 'd' value>}`` """ self._create_group_df() group_ndiffs = { group: ndiffs(x=group_df[self._y_col], alpha=alpha, test=test, max_d=max_d) for group, group_df in self._group_df } return group_ndiffs
[docs] @experimental def calculate_nsdiffs(self, m, test="ocsb", max_D=2): """ Utility method for determining the optimal ``D`` value for seasonal ``SARIMAX`` ordering of ``('P', 'D', 'Q')``. :param m: The number of seasonal periods in the series. :param test: Type of unit test for seasonality. Supported tests: ``['ocsb', 'ch']`` See: https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.OCSBTest.\ html#pmdarima.arima.OCSBTest https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.CHTest.\ html#pmdarima.arima.CHTest Default: ``'ocsb'`` :param max_D: Maximum number of seasonal differences to test for. Default: 2 :return: Dictionary of ``{<group_key>: <optimal 'D' value>}`` """ self._create_group_df() group_nsdiffs = { group: nsdiffs(x=group_df[self._y_col], m=m, max_D=max_D, test=test) for group, group_df in self._group_df } return group_nsdiffs
[docs] @experimental def calculate_is_constant(self): """ Utility method for determining whether or not a series is composed of all of the same elements or not. (e.g. a series of {1, 2, 3, 4, 5, 1, 2, 3} will return 'False', while a series of {1, 1, 1, 1, 1, 1, 1, 1, 1} will return 'True') :return: Dictionary of ``{<group_key>: <Boolean constancy check>}`` """ self._create_group_df() group_constant_check = { group: is_constant(group_df[self._y_col]) for group, group_df in self._group_df } return group_constant_check
[docs] @experimental def calculate_acf( self, unbiased=False, nlags=None, qstat=False, fft=None, alpha=None, missing="none", adjusted=False, ): """ Utility for calculating the autocorrelation function for each group. Combined with a partial autocorrelation function calculation, the return values can greatly assist in setting AR, MA, or ARMA terms for a given model. The general rule to determine whether to use an AR, MA, or ARMA configuration for ARIMA (or AutoARIMA) is as follows: * ACF gradually trend to significance, PACF significance achieved after 1 lag -> AR model * ACF significance after 1 lag, PACF gradually trend to significance -> MA model * ACF gradually trend to significance, PACF gradually trend to significance -> ARMA model These results can help to set the order terms of an ARIMA model (p and q) or, for AutoARIMA, set restrictions on maximum search space terms to assist in faster optimization of the model. :param unbiased: Boolean flag that sets the autocovariance denominator to ``'n-k'`` if ``True`` and ``n`` if ``False``. Note: This argument is deprecated and removed in versions of pmdarima > 2.0.0 Default: ``False`` :param nlags: The count of autocorrelation lags to calculate and return. Default: ``40`` :param qstat: Boolean flag to calculate and return the Ljung-Box statistic for each lag. Default: ``False`` :param fft: Boolean flag for whether to use fast fourier transformation (fft) for computing the autocorrelation function. FFT is recommended for large time series data sets. Default: ``None`` :param alpha: If specified, calculates and returns the confidence intervals for the acf values at the level set (i.e., for 90% confidence, an alpha of 0.1 would be set) Default: ``None`` :param missing: handling of NaN values in the series data. Available options: ``['none', 'raise', 'conservative', 'drop']``. ``none``: no checks are performed. ``raise``: an Exception is raised if NaN values are in the series. ``conservative``: the autocovariance is calculated by removing NaN values from the mean and cross-product calculations but are not eliminated from the series. ``drop``: ``NaN`` values are removed from the series and adjacent values to ``NaN``'s are treated as contiguous (which may invalidate the results in certain situations). Default: ``'none'`` :param adjusted: Deprecation handler for the underlying ``statsmodels`` arguments that have become the ``unbiased`` argument. This is a duplicated value for the denominator mode of calculation for the autocovariance of the series. :return: Dictionary of ``{<group_key>: {<acf terms>: <values as array>}}`` """ import pmdarima self._create_group_df() group_acf_data = {} for group, df in self._group_df: if Version(pmdarima.__version__) < Version("2.0.0"): acf_data = acf( # pylint: disable=unexpected-keyword-arg x=df[self._y_col], unbiased=unbiased, nlags=nlags, qstat=qstat, fft=fft, alpha=alpha, missing=missing, ) else: acf_data = acf( x=df[self._y_col], nlags=nlags, qstat=qstat, fft=fft, alpha=alpha, missing=missing, adjusted=adjusted, ) group_data = {"acf": acf_data[0]} if isinstance(acf_data, tuple) else {"acf": acf_data} if alpha: group_data["confidence_intervals"] = acf_data[1] if qstat: group_data["qstat"] = acf_data[2] group_data["pvalues"] = acf_data[3] else: if qstat: group_data["qstat"] = acf_data[1] group_data["pvalues"] = acf_data[2] group_acf_data[group] = group_data return group_acf_data
[docs] @experimental def calculate_pacf(self, nlags=None, method="ywadjusted", alpha=None): """ Utility for calculating the partial autocorrelation function for each group. In conjunction with the autocorrelation function ``calculate_acf``, the values returned from a pacf calculation can assist in setting values or bounds on AR, MA, and ARMA terms for an ARIMA model. The general rule to determine whether to use an AR, MA, or ARMA configuration for ``ARIMA`` (or ``AutoARIMA``) is as follows: * ACF gradually trend to significance, PACF significance achieved after 1 lag -> AR model * ACF significance after 1 lag, PACF gradually trend to significance -> MA model * ACF gradually trend to significance, PACF gradually trend to significance -> ARMA model These results can help to set the order terms of an ARIMA model (``p`` and ``q``) or, for ``AutoARIMA``, set restrictions on maximum search space terms to assist in faster optimization of the model. :param nlags: The count of partial autocorrelation lags to calculate and return. Default: ``40`` :param method: The method used for pacf calculation. See the ``pmdarima`` docs for full listing of methods: https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.utils.pacf.html Default: ``'ywadjusted'`` :param alpha: If specified, returns confidence intervals based on the alpha value supplied. Default: ``None`` :return: Dictionary of ``{<group_key>: {<pacf terms>: <values as array>}}`` """ self._create_group_df() group_pacf_data = {} for group, df in self._group_df: pacf_data = pacf(x=df[self._y_col], nlags=nlags, method=method, alpha=alpha) group_data = ( {"pacf": pacf_data[0]} if isinstance(pacf_data, tuple) else {"pacf": pacf_data} ) if alpha: group_data["confidence_intervals"] = pacf_data[1] group_pacf_data[group] = group_data return group_pacf_data
[docs] @experimental def generate_diff(self, lag=1, differences=1): """ A utility for generating the array diff (lag differences) for each group. To support invertability, this method will return the starting value of each array as well as the differenced values. :param lag: Determines the magnitude of the lag to calculate the differencing function for. Default: ``1`` :param differences: The order of the differencing to be performed. Note that values > 1 will generate n fewer results. Default: ``1`` :return: Dictionary of ``{<group_key>: {"series_start": <float>, "diff": <diff_array>}}`` """ self._create_group_df() group_diff_data = {} for group, df in self._group_df: df.reset_index(inplace=True) group_data = { "diff": diff(x=df[self._y_col], lag=lag, differences=differences), "series_start": df[self._y_col][0], } group_diff_data[group] = group_data return group_diff_data
[docs] @staticmethod @experimental def generate_diff_inversion(group_diff_data, lag=1, differences=1, recenter=False): """ A utility for inverting a previously differenced group of timeseries data. This utility supports returning each group's series data to the original range of the data if the recenter argument is set to `True` and the start conditions are contained within the ``group_diff_data`` argument's dictionary structure. :param group_diff_data: Differenced payload consisting of a dictionary of ``{<group_key>: {'diff': <differenced data>, [optional]'series_start': float}}`` :param lag: The lag to use to perform the differencing inversion. Default: ``1`` :param differences: The order of differencing to be used during the inversion. Default: ``1`` :param recenter: If ``True`` and ``'series_start'`` exists in ``group_diff_data`` dict, will restore the original series range for each group based on the series start value calculated through the ``generate_diff()`` method. If the ``group_diff_data`` does not contain the starting values, the data will not be re-centered. Default: ``False`` :return: Dictionary of ``{<group_key>: <series_inverted_data>}`` """ warn_check = False series_data = {} for group, payload in group_diff_data.items(): data = payload.get("diff", None) if data is None: raise DivinerException( f"group_diff_data does not contain the key `diff` for group" f"{group}" ) inverted = diff_inv(x=data, lag=lag, differences=differences) if recenter: start = payload.get("series_start", None) if not start: if not warn_check: warnings.warn( "Recentering is not possible due to `series_start` missing " "from `group_diff_data` argument." ) warn_check = True series_data[group] = inverted else: series_data[group] = inverted + start else: series_data[group] = inverted return series_data