Source code for etna.transforms.encoders.mean_segment_encoder
import reprlib
from typing import Dict
from typing import List
from typing import Optional
import numpy as np
import pandas as pd
from etna.transforms import IrreversibleTransform
from etna.transforms.base import FutureMixin
from etna.transforms.math.statistics import MeanTransform
class MeanSegmentEncoderTransform(IrreversibleTransform, FutureMixin):
    """Makes expanding mean target encoding of the segment. Creates column 'segment_mean'.

    The per-timestamp encoding is delegated to a ``MeanTransform`` configured with
    ``in_column="target"``, ``window=-1`` and ``out_column="segment_mean"``.
    In addition, the mean of the ``target`` column of every segment is stored during
    fitting and written into ``segment_mean`` for the timestamps where the target of
    the first segment is missing.
    """

    idx = pd.IndexSlice

    def __init__(self):
        super().__init__(required_features=["target"])
        self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="segment_mean")
        # Mapping "segment name -> mean of its target on the fit data"; ``None`` until ``_fit`` is called.
        self.global_means: Optional[Dict[str, float]] = None

    def _fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
        """
        Fit encoder.

        Fits the inner mean encoder and remembers the mean target value of each segment.

        Parameters
        ----------
        df:
            dataframe with data to fit expanding mean target encoder.

        Returns
        -------
        :
            Fitted transform
        """
        self.mean_encoder._fit(df)
        mean_values = df.loc[:, self.idx[:, "target"]].mean().to_dict()
        # Keys are (segment, "target") column tuples; keep only the segment part.
        self.global_means = {key[0]: value for key, value in mean_values.items()}
        return self

    def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Get encoded values for the segment.

        Parameters
        ----------
        df:
            dataframe with data to transform.

        Returns
        -------
        :
            result dataframe

        Raises
        ------
        ValueError:
            If transform isn't fitted.
        NotImplementedError:
            If there are segments that weren't present during training.
        """
        if self.global_means is None:
            raise ValueError("The transform isn't fitted!")

        segments = df.columns.get_level_values("segment").unique().tolist()
        new_segments = set(segments) - self.global_means.keys()
        if new_segments:
            raise NotImplementedError(
                f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}"
            )

        df = self.mean_encoder._transform(df)

        # Timestamps to overwrite are taken from the first segment only.
        # NOTE(review): this presumably relies on all segments having a missing target
        # on the same (future) timestamps -- confirm against callers.
        segment = segments[0]
        nan_timestamps = df[df.loc[:, self.idx[segment, "target"]].isna()].index

        # The array below is assigned positionally, so its order must follow the
        # ``segment_mean`` columns of the *transformed* frame rather than the column
        # order of the input frame: the inner encoder is free to reorder columns.
        encoded_columns = df.loc[:, self.idx[:, "segment_mean"]].columns
        values_to_set = np.array([self.global_means[x] for x in encoded_columns.get_level_values(0)])
        # repetition isn't necessary for pandas >= 1.2
        values_to_set = np.repeat(values_to_set[np.newaxis, :], len(nan_timestamps), axis=0)
        df.loc[nan_timestamps, self.idx[:, "segment_mean"]] = values_to_set
        return df

    def get_regressors_info(self) -> List[str]:
        """Return the list with regressors created by the transform."""
        return ["segment_mean"]