Source code for etna.models.statsforecast

import warnings
from typing import Dict
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union

import numpy as np
import pandas as pd
from statsforecast.models import AutoARIMA
from statsforecast.models import AutoCES
from statsforecast.models import AutoETS
from statsforecast.models import AutoTheta

from etna.distributions import BaseDistribution
from etna.distributions import IntDistribution
from etna.libs.statsforecast import ARIMA
from etna.models.base import BaseAdapter
from etna.models.base import NonPredictionIntervalContextIgnorantAbstractModel
from etna.models.base import PredictionIntervalContextIgnorantAbstractModel
from etna.models.mixins import NonPredictionIntervalContextIgnorantModelMixin
from etna.models.mixins import PerSegmentModelMixin
from etna.models.mixins import PredictionIntervalContextIgnorantModelMixin
from etna.models.utils import determine_freq
from etna.models.utils import determine_num_steps

# Union of the statsforecast model classes that the adapters in this module wrap.
StatsForecastModel = Union[
    AutoCES,
    AutoARIMA,
    AutoTheta,
    AutoETS,
    ARIMA,
]


class _StatsForecastBaseAdapter(BaseAdapter):
    """Base class for adapters for models from statsforecast package.

    During :py:meth:`fit` the adapter stores the regressor columns, the frequency and
    the first/last timestamps of the training data. These are later used to validate the
    timestamps passed to :py:meth:`forecast` and :py:meth:`predict`.
    """

    def __init__(self, model: StatsForecastModel, support_prediction_intervals: bool):
        """
        Init model with given parameters.

        Parameters
        ----------
        model:
            Model from statsforecast.
        support_prediction_intervals:
            Should model support prediction intervals.
        """
        self.regressor_columns: Optional[List[str]] = None
        self._freq: Optional[str] = None
        self._first_train_timestamp: Optional[pd.Timestamp] = None
        self._last_train_timestamp: Optional[pd.Timestamp] = None
        self._model = model
        self._support_prediction_intervals = support_prediction_intervals

    def _check_not_used_columns(self, df: pd.DataFrame):
        """Warn about columns of ``df`` that are neither target, timestamp nor regressors.

        Raises
        ------
        ValueError:
            If ``regressor_columns`` hasn't been set yet.
        """
        if self.regressor_columns is None:
            raise ValueError("Something went wrong, regressor_columns is None!")

        # built once instead of on every iteration of the comprehension below
        used_columns = {"target", "timestamp", *self.regressor_columns}
        columns_not_used = [col for col in df.columns if col not in used_columns]
        if columns_not_used:
            warnings.warn(
                message=f"This model doesn't work with exogenous features unknown in future. "
                f"Columns {columns_not_used} won't be used."
            )

    def _select_regressors(self, df: pd.DataFrame) -> Optional[np.ndarray]:
        """Select data with regressors.

        During fit there can't be regressors with NaNs, they are removed at higher level.
        Look at the issue: https://github.com/tinkoff-ai/etna/issues/557

        During prediction without validation NaNs in regressors lead to NaNs in the answer.

        This model requires data to be in float dtype.

        Returns
        -------
        :
            Float array with regressors or ``None`` if there are no regressors.

        Raises
        ------
        ValueError:
            If ``regressor_columns`` hasn't been set yet, if some regressor contains NaNs
            or if regressors can't be converted to float.
        """
        if self.regressor_columns is None:
            raise ValueError("Something went wrong, regressor_columns is None!")

        regressors_with_nans = [regressor for regressor in self.regressor_columns if df[regressor].isna().any()]
        if regressors_with_nans:
            raise ValueError(
                f"Regressors {regressors_with_nans} contain NaN values. "
                "Try to lower horizon value, or drop these regressors."
            )

        if not self.regressor_columns:
            return None

        try:
            return df[self.regressor_columns].values.astype(float)
        except ValueError as e:
            # keep the original conversion error as the cause
            raise ValueError(f"Only convertible to float features are allowed! Error: {str(e)}") from e

    @staticmethod
    def _quantiles_to_levels(quantiles: Sequence[float]) -> List[int]:
        """Convert quantiles into confidence levels (in percent) of central prediction intervals.

        The result has the same length and order as ``quantiles``.
        """
        levels = []
        for quantile in quantiles:
            width = abs(1 / 2 - quantile) * 2
            # Round away floating-point noise before truncating: e.g. ``0.57 * 100`` evaluates
            # to ``56.99999999999999`` and plain ``int`` would turn it into 56 instead of 57.
            levels.append(int(round(width * 100, 6)))
        return levels

    def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_StatsForecastBaseAdapter":
        """Fit statsforecast adapter.

        Parameters
        ----------
        df:
            Features dataframe
        regressors:
            List of the columns with regressors

        Returns
        -------
        :
            Fitted adapter
        """
        self.regressor_columns = regressors
        self._check_not_used_columns(df)

        endog_data = df["target"].values
        exog_data = self._select_regressors(df)

        self._model.fit(y=endog_data, X=exog_data)

        # remember frequency and train boundaries to validate timestamps in forecast/predict
        self._freq = determine_freq(timestamps=df["timestamp"])
        self._first_train_timestamp = df["timestamp"].min()
        self._last_train_timestamp = df["timestamp"].max()

        return self

    def forecast(
        self, df: pd.DataFrame, prediction_interval: bool = False, quantiles: Sequence[float] = ()
    ) -> pd.DataFrame:
        """Compute predictions on future data from a statsforecast model.

        This method only works on data that goes right after the train.

        Parameters
        ----------
        df:
            Features dataframe
        prediction_interval:
             If True returns prediction interval for forecast
        quantiles:
            Levels of prediction distribution

        Returns
        -------
        :
            DataFrame with predictions

        Raises
        ------
        ValueError:
            If the model isn't fitted.
        NotImplementedError:
            If ``df`` contains timestamps from the training period or starts with a gap after it.
        """
        if self._freq is None:
            raise ValueError("Model is not fitted! Fit the model before calling predict method!")

        start_timestamp = df["timestamp"].min()
        end_timestamp = df["timestamp"].max()

        # The last train timestamp is history too: accepting it would make the horizon
        # one step shorter than the number of rows in ``df``.
        if start_timestamp <= self._last_train_timestamp:
            raise NotImplementedError(
                "This model can't make forecast on history data! Use method predict for in-sample prediction."
            )

        # determine index of start_timestamp if counting from last timestamp of train
        start_idx = determine_num_steps(
            start_timestamp=self._last_train_timestamp, end_timestamp=start_timestamp, freq=self._freq  # type: ignore
        )
        # determine index of end_timestamp if counting from last timestamp of train
        end_idx = determine_num_steps(
            start_timestamp=self._last_train_timestamp, end_timestamp=end_timestamp, freq=self._freq  # type: ignore
        )

        if start_idx > 1:
            raise NotImplementedError(
                "This model can't make forecast on out-of-sample data that goes after training data with a gap! "
                "You can only forecast from the next point after the last one in the training dataset."
            )

        h = end_idx
        exog_data = self._select_regressors(df)
        if prediction_interval and self._support_prediction_intervals:
            levels = self._quantiles_to_levels(quantiles)

            # get unique levels to prevent strange behavior with stacking interval predictions
            unique_levels = sorted(set(levels))
            forecast = self._model.predict(h=h, X=exog_data, level=unique_levels)
            y_pred = pd.DataFrame({"target": forecast["mean"]})

            for quantile, level in zip(quantiles, levels):
                # quantiles below the median map to the lower border, the others to the upper one
                border = "lo" if quantile < 1 / 2 else "hi"
                y_pred[f"target_{quantile:.4g}"] = forecast[f"{border}-{level}"]
        else:
            forecast = self._model.predict(h=h, X=exog_data)
            y_pred = pd.DataFrame({"target": forecast["mean"]})

        return y_pred

    def predict(
        self, df: pd.DataFrame, prediction_interval: bool = False, quantiles: Sequence[float] = ()
    ) -> pd.DataFrame:
        """Compute in-sample predictions from a statsforecast model.

        This method only works on train data.

        Parameters
        ----------
        df:
            Features dataframe
        prediction_interval:
            If True returns prediction interval for forecast
        quantiles:
            Levels of prediction distribution

        Returns
        -------
        :
            DataFrame with predictions

        Raises
        ------
        ValueError:
            If the model isn't fitted.
        NotImplementedError:
            If ``df`` contains timestamps outside of the training period.
        """
        if self._freq is None:
            raise ValueError("Model is not fitted! Fit the model before calling predict method!")

        start_timestamp = df["timestamp"].min()
        end_timestamp = df["timestamp"].max()

        if start_timestamp < self._first_train_timestamp:
            raise NotImplementedError(
                "This model can't make predict on past out-of-sample data! The data before training is given."
            )

        if end_timestamp > self._last_train_timestamp:
            raise NotImplementedError(
                "This model can't make predict on future out-of-sample data! "
                "Use forecast method for this type of prediction."
            )

        # determine index of start_timestamp if counting from first timestamp of train
        start_idx = determine_num_steps(
            start_timestamp=self._first_train_timestamp, end_timestamp=start_timestamp, freq=self._freq  # type: ignore
        )
        # determine index of end_timestamp if counting from first timestamp of train
        end_idx = determine_num_steps(
            start_timestamp=self._first_train_timestamp, end_timestamp=end_timestamp, freq=self._freq  # type: ignore
        )
        # fitted values cover the whole train, keep only the requested part (both ends inclusive)
        requested = slice(start_idx, end_idx + 1)

        if prediction_interval and self._support_prediction_intervals:
            levels = self._quantiles_to_levels(quantiles)

            # get unique levels to prevent strange behavior with stacking interval predictions
            unique_levels = sorted(set(levels))
            forecast = self._model.predict_in_sample(level=unique_levels)  # type: ignore
            y_pred = pd.DataFrame({"target": forecast["fitted"][requested]})

            for quantile, level in zip(quantiles, levels):
                # quantiles below the median map to the lower border, the others to the upper one
                border = "lo" if quantile < 1 / 2 else "hi"
                y_pred[f"target_{quantile:.4g}"] = forecast[f"fitted-{border}-{level}"][requested]
        else:
            forecast = self._model.predict_in_sample()
            y_pred = pd.DataFrame({"target": forecast["fitted"][requested]})

        return y_pred

    def forecast_components(self, df: pd.DataFrame) -> pd.DataFrame:
        """Estimate forecast components.

        Parameters
        ----------
        df:
            features dataframe

        Returns
        -------
        :
            dataframe with forecast components

        Raises
        ------
        NotImplementedError:
            Always, the decomposition isn't implemented.
        """
        raise NotImplementedError("This mode isn't currently implemented!")

    def predict_components(self, df: pd.DataFrame) -> pd.DataFrame:
        """Estimate prediction components.

        Parameters
        ----------
        df:
            features dataframe

        Returns
        -------
        :
            dataframe with prediction components

        Raises
        ------
        NotImplementedError:
            Always, the decomposition isn't implemented.
        """
        raise NotImplementedError("This mode isn't currently implemented!")

    def get_model(self) -> StatsForecastModel:
        """Get statsforecast model that is used inside etna class.

        Returns
        -------
        :
           Internal model
        """
        return self._model


class _AutoARIMAAdapter(_StatsForecastBaseAdapter):
    """Adapter for :py:class:`statsforecast.models.AutoARIMA`."""

    def __init__(
        self,
        d: Optional[int] = None,
        D: Optional[int] = None,  # noqa: N803
        max_p: int = 5,
        max_q: int = 5,
        max_P: int = 2,
        max_Q: int = 2,
        max_order: int = 5,
        max_d: int = 2,
        max_D: int = 1,
        start_p: int = 2,
        start_q: int = 2,
        start_P: int = 1,
        start_Q: int = 1,
        season_length: int = 1,
        **kwargs,
    ):
        """Init model with given params.

        Parameters
        ----------
        d:
            Order of first-differencing.
        D:
            Order of seasonal-differencing.
        max_p:
            Max autoregressive order p.
        max_q:
            Max moving average order q.
        max_P:
            Max seasonal autoregressive order P.
        max_Q:
            Max seasonal moving average order Q.
        max_order:
            Max p+q+P+Q value if not stepwise selection.
        max_d:
            Max non-seasonal differences.
        max_D:
            Max seasonal differences.
        start_p:
            Starting value of p in stepwise procedure.
        start_q:
            Starting value of q in stepwise procedure.
        start_P:
            Starting value of P in stepwise procedure.
        start_Q:
            Starting value of Q in stepwise procedure.
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        **kwargs:
            Additional parameters for :py:class:`statsforecast.models.AutoARIMA`.
        """
        self.d = d
        self.D = D
        self.max_p = max_p
        self.max_q = max_q
        self.max_P = max_P
        self.max_Q = max_Q
        self.max_order = max_order
        self.max_d = max_d
        self.max_D = max_D
        self.start_p = start_p
        self.start_q = start_q
        self.start_P = start_P
        self.start_Q = start_Q
        self.season_length = season_length
        self.kwargs = kwargs
        super().__init__(
            model=AutoARIMA(
                d=self.d,
                D=self.D,
                max_p=self.max_p,
                max_q=self.max_q,
                max_P=self.max_P,
                max_Q=self.max_Q,
                max_order=self.max_order,
                max_d=self.max_d,
                max_D=self.max_D,
                # non-seasonal start value (lower-case p), not the seasonal ``start_P``
                start_p=self.start_p,
                start_q=self.start_q,
                start_P=self.start_P,
                start_Q=self.start_Q,
                season_length=self.season_length,
                **self.kwargs,
            ),
            support_prediction_intervals=True,
        )


class _ARIMAAdapter(_StatsForecastBaseAdapter):
    """Adapter for :py:class:`statsforecast.models.ARIMA`."""

    def __init__(
        self,
        order: Tuple[int, int, int] = (0, 0, 0),
        season_length: int = 1,
        seasonal_order: Tuple[int, int, int] = (0, 0, 0),
        **kwargs,
    ):
        """Init model with given params.

        Parameters
        ----------
        order:
            Non-seasonal part of the ARIMA model: (p, d, q) are the AR order,
            the degree of differencing and the MA order.
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        seasonal_order:
            Seasonal part of the ARIMA model: (P, D, Q) are the AR order,
            the degree of differencing and the MA order.
        **kwargs:
            Additional parameters for :py:class:`statsforecast.models.ARIMA`.
        """
        # store the init parameters on the instance under their own names
        self.order = order
        self.season_length = season_length
        self.seasonal_order = seasonal_order
        self.kwargs = kwargs

        arima = ARIMA(
            order=order,
            season_length=season_length,
            seasonal_order=seasonal_order,
            **kwargs,
        )
        super().__init__(model=arima, support_prediction_intervals=True)


class _AutoThetaAdapter(_StatsForecastBaseAdapter):
    """Adapter for :py:class:`statsforecast.models.AutoTheta`."""

    def __init__(
        self,
        season_length: int = 1,
        decomposition_type: str = "multiplicative",
        model: Optional[str] = None,
    ):
        """Init model with given params.

        Parameters
        ----------
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        decomposition_type:
            Seasonal decomposition type, 'multiplicative' (default) or 'additive'.
        model:
            Controlling Theta Model. By default searches the best model.
        """
        # store the init parameters on the instance under their own names
        self.season_length = season_length
        self.decomposition_type = decomposition_type
        self.model = model

        auto_theta = AutoTheta(
            season_length=season_length,
            decomposition_type=decomposition_type,
            model=model,
        )
        super().__init__(model=auto_theta, support_prediction_intervals=True)


class _AutoCESAdapter(_StatsForecastBaseAdapter):
    """Adapter for :py:class:`statsforecast.models.AutoCES`."""

    def __init__(self, season_length: int = 1, model: str = "Z"):
        """Init model with given params.

        Parameters
        ----------
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        model:
            Controlling state-space-equations.
        """
        # store the init parameters on the instance under their own names
        self.season_length = season_length
        self.model = model

        auto_ces = AutoCES(season_length=season_length, model=model)
        # this adapter is registered without support of prediction intervals
        super().__init__(model=auto_ces, support_prediction_intervals=False)


class _AutoETSAdapter(_StatsForecastBaseAdapter):
    """Adapter for :py:class:`statsforecast.models.AutoETS`."""

    def __init__(self, season_length: int = 1, model: str = "ZZZ", damped: Optional[bool] = None):
        """Init model with given params.

        Parameters
        ----------
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        model:
            Controlling state-space-equations.
        damped:
            A parameter that 'dampens' the trend.
        """
        # store the init parameters on the instance under their own names
        self.season_length = season_length
        self.model = model
        self.damped = damped

        auto_ets = AutoETS(season_length=season_length, model=model, damped=damped)
        super().__init__(model=auto_ets, support_prediction_intervals=True)


class StatsForecastAutoARIMAModel(
    PerSegmentModelMixin, PredictionIntervalContextIgnorantModelMixin, PredictionIntervalContextIgnorantAbstractModel
):
    """
    Class for holding :py:class:`statsforecast.models.AutoARIMA`.

    `Documentation for the underlying model <https://nixtla.github.io/statsforecast/src/core/models.html#autoarima>`_.
    """

    def __init__(
        self,
        d: Optional[int] = None,
        D: Optional[int] = None,  # noqa: N803
        max_p: int = 5,
        max_q: int = 5,
        max_P: int = 2,
        max_Q: int = 2,
        max_order: int = 5,
        max_d: int = 2,
        max_D: int = 1,
        start_p: int = 2,
        start_q: int = 2,
        start_P: int = 1,
        start_Q: int = 1,
        season_length: int = 1,
        **kwargs,
    ):
        """Init model with given params.

        Parameters
        ----------
        d:
            Order of first-differencing.
        D:
            Order of seasonal-differencing.
        max_p:
            Max autoregressive order p.
        max_q:
            Max moving average order q.
        max_P:
            Max seasonal autoregressive order P.
        max_Q:
            Max seasonal moving average order Q.
        max_order:
            Max p+q+P+Q value if not stepwise selection.
        max_d:
            Max non-seasonal differences.
        max_D:
            Max seasonal differences.
        start_p:
            Starting value of p in stepwise procedure.
        start_q:
            Starting value of q in stepwise procedure.
        start_P:
            Starting value of P in stepwise procedure.
        start_Q:
            Starting value of Q in stepwise procedure.
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        **kwargs:
            Additional parameters for :py:class:`statsforecast.models.AutoARIMA`.
        """
        self.d = d
        self.D = D
        self.max_p = max_p
        self.max_q = max_q
        self.max_P = max_P
        self.max_Q = max_Q
        self.max_order = max_order
        self.max_d = max_d
        self.max_D = max_D
        self.start_p = start_p
        self.start_q = start_q
        self.start_P = start_P
        self.start_Q = start_Q
        self.season_length = season_length
        self.kwargs = kwargs
        super().__init__(
            base_model=_AutoARIMAAdapter(
                d=self.d,
                D=self.D,
                max_p=self.max_p,
                max_q=self.max_q,
                max_P=self.max_P,
                max_Q=self.max_Q,
                max_order=self.max_order,
                max_d=self.max_d,
                max_D=self.max_D,
                # non-seasonal start value (lower-case p), not the seasonal ``start_P``
                start_p=self.start_p,
                start_q=self.start_q,
                start_P=self.start_P,
                start_Q=self.start_Q,
                season_length=self.season_length,
                **self.kwargs,
            )
        )


class StatsForecastARIMAModel(
    PerSegmentModelMixin, PredictionIntervalContextIgnorantModelMixin, PredictionIntervalContextIgnorantAbstractModel
):
    """
    Class for holding :py:class:`statsforecast.models.ARIMA`.

    `Documentation for the underlying model <https://nixtla.github.io/statsforecast/src/core/models.html#arima>`_.
    """

    def __init__(
        self,
        order: Tuple[int, int, int] = (0, 0, 0),
        season_length: int = 1,
        seasonal_order: Tuple[int, int, int] = (0, 0, 0),
        **kwargs,
    ):
        """Init model with given params.

        Parameters
        ----------
        order:
            Non-seasonal part of the ARIMA model: (p, d, q) are the AR order,
            the degree of differencing and the MA order.
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        seasonal_order:
            Seasonal part of the ARIMA model: (P, D, Q) are the AR order,
            the degree of differencing and the MA order.
        **kwargs:
            Additional parameters for :py:class:`statsforecast.models.ARIMA`.
        """
        # store the init parameters on the instance under their own names
        self.order = order
        self.season_length = season_length
        self.seasonal_order = seasonal_order
        self.kwargs = kwargs

        adapter = _ARIMAAdapter(
            order=order,
            season_length=season_length,
            seasonal_order=seasonal_order,
            **kwargs,
        )
        super().__init__(base_model=adapter)

    def params_to_tune(self) -> Dict[str, BaseDistribution]:
        """Get default grid for tuning hyperparameters.

        The grid always contains ``order.0``, ``order.1`` and ``order.2``.
        When ``self.season_length`` is greater than one it additionally contains
        ``seasonal_order.0``, ``seasonal_order.1`` and ``seasonal_order.2``.
        Other parameters are expected to be set by the user.

        Returns
        -------
        :
            Grid to tune.
        """
        tuning_grid: Dict[str, BaseDistribution] = {}
        tuning_grid["order.0"] = IntDistribution(low=1, high=6)
        tuning_grid["order.1"] = IntDistribution(low=1, high=2)
        tuning_grid["order.2"] = IntDistribution(low=1, high=6)

        # seasonal components are added to the grid only for seasonal setups
        if self.season_length > 1:
            tuning_grid["seasonal_order.0"] = IntDistribution(low=0, high=2)
            tuning_grid["seasonal_order.1"] = IntDistribution(low=0, high=1)
            tuning_grid["seasonal_order.2"] = IntDistribution(low=0, high=1)

        return tuning_grid


class StatsForecastAutoThetaModel(
    PerSegmentModelMixin, PredictionIntervalContextIgnorantModelMixin, PredictionIntervalContextIgnorantAbstractModel
):
    """
    Class for holding :py:class:`statsforecast.models.AutoTheta`.

    `Documentation for the underlying model <https://nixtla.github.io/statsforecast/src/core/models.html#autotheta>`_.
    """

    def __init__(
        self,
        season_length: int = 1,
        decomposition_type: str = "multiplicative",
        model: Optional[str] = None,
    ):
        """Init model with given params.

        Parameters
        ----------
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        decomposition_type:
            Seasonal decomposition type, 'multiplicative' (default) or 'additive'.
        model:
            Controlling Theta Model. By default searches the best model.
        """
        # store the init parameters on the instance under their own names
        self.season_length = season_length
        self.decomposition_type = decomposition_type
        self.model = model

        adapter = _AutoThetaAdapter(
            season_length=season_length,
            decomposition_type=decomposition_type,
            model=model,
        )
        super().__init__(base_model=adapter)


class StatsForecastAutoCESModel(
    PerSegmentModelMixin,
    NonPredictionIntervalContextIgnorantModelMixin,
    NonPredictionIntervalContextIgnorantAbstractModel,
):
    """
    Class for holding :py:class:`statsforecast.models.AutoCES`.

    `Documentation for the underlying model <https://nixtla.github.io/statsforecast/src/core/models.html#autoces>`_.
    """

    def __init__(self, season_length: int = 1, model: str = "Z"):
        """Init model with given params.

        Parameters
        ----------
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        model:
            Controlling state-space-equations.
        """
        # store the init parameters on the instance under their own names
        self.season_length = season_length
        self.model = model

        adapter = _AutoCESAdapter(season_length=season_length, model=model)
        super().__init__(base_model=adapter)


class StatsForecastAutoETSModel(
    PerSegmentModelMixin, PredictionIntervalContextIgnorantModelMixin, PredictionIntervalContextIgnorantAbstractModel
):
    """
    Class for holding :py:class:`statsforecast.models.AutoETS`.

    `Documentation for the underlying model <https://nixtla.github.io/statsforecast/src/core/models.html#autoets>`_.
    """

    def __init__(self, season_length: int = 1, model: str = "ZZZ", damped: Optional[bool] = None):
        """Init model with given params.

        Parameters
        ----------
        season_length:
            Number of observations per unit of time. Ex: 24 Hourly data.
        model:
            Controlling state-space-equations.
        damped:
            A parameter that 'dampens' the trend.
        """
        # store the init parameters on the instance under their own names
        self.season_length = season_length
        self.model = model
        self.damped = damped

        adapter = _AutoETSAdapter(season_length=season_length, model=model, damped=damped)
        super().__init__(base_model=adapter)