geronimo.features

Geronimo Features Module.

The features module provides the abstractions for defining feature transformations and pipelines. It is inspired by scikit-learn's fit/transform paradigm but optimized for production systems where consistency between training and serving is critical.

Key components:

  • FeatureSet: A logical grouping of features (e.g., user features, item features).
  • Feature: A descriptor for a single feature column and the transformation applied to it (e.g., one-hot encoding, normalization).

This module ensures that the exact same feature engineering logic is applied during batch training and real-time inference, preventing training-serving skew.

 1"""Geronimo Features Module.
 2
 3The features module provides the abstractions for defining feature transformations
 4and pipelines. It is inspired by scikit-learn's fit/transform paradigm but
 5optimized for production systems where consistency between training and serving is critical.
 6
 7Key components:
 8- FeatureSet: A logical grouping of features (e.g., user features, item features).
 9- Feature: A specific transformation logic (e.g., OneHotEncoding, Normalization).
10
11This module ensures that the exact same feature engineering logic is applied during
12batch training and real-time inference preventing training-serving skew.
13"""
14
15from geronimo.features.base import FeatureSet
16from geronimo.features.feature import Feature
17
18__all__ = ["FeatureSet", "Feature"]
19
20__docformat__ = "google"
class FeatureSet:
 15class FeatureSet:
 16    """Base class for feature engineering pipelines.
 17
 18    Provides fit/transform semantics for training vs production,
 19    with integrated artifact storage for encoders and transformers.
 20
 21    Example:
 22        ```python
 23        from geronimo.features import FeatureSet, Feature
 24        from sklearn.preprocessing import StandardScaler, OneHotEncoder
 25
 26        class CustomerFeatures(FeatureSet):
 27            data_source = DataSource(
 28                name="customers",
 29                source="snowflake",
 30                query=Query.from_file("queries/customers.sql"),
 31            )
 32
 33            age = Feature(dtype="numeric", transformer=StandardScaler())
 34            income = Feature(dtype="numeric", transformer=StandardScaler())
 35            segment = Feature(dtype="categorical", encoder=OneHotEncoder(sparse_output=False))
 36
 37        # Training: fit and transform
 38        features = CustomerFeatures()
 39        X = features.fit_transform(training_df)
 40
 41        # Production: transform only (uses fitted encoders)
 42        X = features.transform(production_df)
 43        ```
 44    """
 45
 46    # Override in subclass
 47    data_source: Optional["DataSource"] = None
 48
 49    def __init__(self):
 50        """Initialize feature set."""
 51        self._features: dict[str, Feature] = {}
 52        self._is_fitted: bool = False
 53
 54        # Collect Feature descriptors from class
 55        for name in dir(self.__class__):
 56            attr = getattr(self.__class__, name, None)
 57            if isinstance(attr, Feature):
 58                self._features[name] = attr
 59
 60    @property
 61    def feature_names(self) -> list[str]:
 62        """Get list of feature names (excluding dropped)."""
 63        return [f.name for f in self._features.values() if not f.drop]
 64
 65    @property
 66    def numeric_features(self) -> list[Feature]:
 67        """Get numeric features."""
 68        return [f for f in self._features.values() if f.dtype == "numeric" and not f.drop]
 69
 70    @property
 71    def categorical_features(self) -> list[Feature]:
 72        """Get categorical features."""
 73        return [
 74            f for f in self._features.values() if f.dtype == "categorical" and not f.drop
 75        ]
 76
 77    def fit(self, df: pd.DataFrame) -> "FeatureSet":
 78        """Fit all transformers and encoders.
 79
 80        Args:
 81            df: Training DataFrame.
 82
 83        Returns:
 84            Self for chaining.
 85        """
 86        for feature in self._features.values():
 87            if feature.drop:
 88                continue
 89            self._process_feature(feature, df, mode="fit")
 90
 91        self._is_fitted = True
 92        return self
 93
 94    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
 95        """Transform DataFrame using fitted transformers.
 96
 97        Args:
 98            df: Input DataFrame.
 99
100        Returns:
101            Transformed DataFrame.
102
103        Raises:
104            ValueError: If not fitted.
105        """
106        if not self._is_fitted:
107            raise ValueError("FeatureSet not fitted. Call fit() first.")
108
109        result = pd.DataFrame(index=df.index)
110
111        for feature in self._features.values():
112            if feature.drop:
113                continue
114            
115            transformed = self._process_feature(feature, df, mode="transform")
116            if transformed is not None:
117                if isinstance(transformed, dict):
118                    # Multi-column output from encoders
119                    for col_name, values in transformed.items():
120                        result[col_name] = values
121                else:
122                    result[feature.name] = transformed
123
124        return result
125
126    def _process_feature(
127        self,
128        feature: Feature,
129        df: pd.DataFrame,
130        mode: str,
131    ) -> any:
132        """Process a single feature for fit or transform.
133        
134        Unified processing logic to reduce code duplication between
135        fit() and transform() methods.
136        
137        Args:
138            feature: Feature descriptor to process.
139            df: Input DataFrame.
140            mode: Either "fit" or "transform".
141            
142        Returns:
143            For mode="fit": None (modifies transformers/encoders in place).
144            For mode="transform": Transformed values (Series, array, or dict for multi-column).
145        """
146        # Handle derived features with custom functions
147        if feature.has_derived_fn:
148            derived_values = feature.apply(df)
149            
150            if feature.has_transformer:
151                if mode == "fit":
152                    feature.transformer.fit(derived_values.values.reshape(-1, 1))
153                    return None
154                else:  # transform
155                    transformed = feature.transformer.transform(
156                        derived_values.values.reshape(-1, 1)
157                    )
158                    return transformed.flatten()
159            else:
160                if mode == "fit":
161                    return None
162                return derived_values.values
163
164        # Standard features
165        col_name = feature.source_column
166        if col_name not in df.columns:
167            return None
168
169        if feature.has_transformer:
170            if mode == "fit":
171                feature.transformer.fit(df[[col_name]])
172                return None
173            else:  # transform
174                transformed = feature.transformer.transform(df[[col_name]])
175                return transformed.flatten()
176        elif feature.has_encoder:
177            if mode == "fit":
178                feature.encoder.fit(df[[col_name]])
179                return None
180            else:  # transform
181                encoded = feature.encoder.transform(df[[col_name]])
182                # Handle multi-column output from encoders
183                if hasattr(feature.encoder, "get_feature_names_out"):
184                    enc_names = feature.encoder.get_feature_names_out([col_name])
185                    return {enc_name: encoded[:, i] for i, enc_name in enumerate(enc_names)}
186                else:
187                    return encoded.flatten()
188        else:
189            if mode == "fit":
190                return None
191            return df[col_name].values
192
193    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
194        """Fit and transform in one step.
195
196        Args:
197            df: Training DataFrame.
198
199        Returns:
200            Transformed DataFrame.
201        """
202        return self.fit(df).transform(df)
203
204    def save(self, store: "ArtifactStore") -> None:
205        """Save fitted transformers and encoders to artifact store.
206
207        Args:
208            store: ArtifactStore instance.
209        """
210        for name, feature in self._features.items():
211            if feature.has_transformer:
212                store.save(
213                    f"transformer_{name}",
214                    feature.transformer,
215                    artifact_type="transformer",
216                )
217            if feature.has_encoder:
218                store.save(
219                    f"encoder_{name}",
220                    feature.encoder,
221                    artifact_type="encoder",
222                )
223
224    def load(self, store: "ArtifactStore") -> None:
225        """Load fitted transformers and encoders from artifact store.
226
227        Args:
228            store: ArtifactStore instance.
229        """
230        for name, feature in self._features.items():
231            if feature.has_transformer:
232                feature.transformer = store.get(f"transformer_{name}")
233            if feature.has_encoder:
234                feature.encoder = store.get(f"encoder_{name}")
235
236        self._is_fitted = True
237
238    @property
239    def is_fitted(self) -> bool:
240        """Check if feature set has been fitted."""
241        return self._is_fitted
242
243    def __repr__(self) -> str:
244        status = "fitted" if self._is_fitted else "not fitted"
245        return f"{self.__class__.__name__}({len(self._features)} features, {status})"

Base class for feature engineering pipelines.

Provides fit/transform semantics for training vs production, with integrated artifact storage for encoders and transformers.

Example:
from geronimo.features import FeatureSet, Feature
from sklearn.preprocessing import StandardScaler, OneHotEncoder

class CustomerFeatures(FeatureSet):
    data_source = DataSource(
        name="customers",
        source="snowflake",
        query=Query.from_file("queries/customers.sql"),
    )

    age = Feature(dtype="numeric", transformer=StandardScaler())
    income = Feature(dtype="numeric", transformer=StandardScaler())
    segment = Feature(dtype="categorical", encoder=OneHotEncoder(sparse_output=False))

# Training: fit and transform
features = CustomerFeatures()
X = features.fit_transform(training_df)

# Production: transform only (uses fitted encoders)
X = features.transform(production_df)
FeatureSet()
49    def __init__(self):
50        """Initialize feature set."""
51        self._features: dict[str, Feature] = {}
52        self._is_fitted: bool = False
53
54        # Collect Feature descriptors from class
55        for name in dir(self.__class__):
56            attr = getattr(self.__class__, name, None)
57            if isinstance(attr, Feature):
58                self._features[name] = attr

Initialize feature set.

data_source: Optional[geronimo.data_sources.DataSource] = None
feature_names: list[str]
60    @property
61    def feature_names(self) -> list[str]:
62        """Get list of feature names (excluding dropped)."""
63        return [f.name for f in self._features.values() if not f.drop]

Get list of feature names (excluding dropped).

numeric_features: list[Feature]
65    @property
66    def numeric_features(self) -> list[Feature]:
67        """Get numeric features."""
68        return [f for f in self._features.values() if f.dtype == "numeric" and not f.drop]

Get numeric features.

categorical_features: list[Feature]
70    @property
71    def categorical_features(self) -> list[Feature]:
72        """Get categorical features."""
73        return [
74            f for f in self._features.values() if f.dtype == "categorical" and not f.drop
75        ]

Get categorical features.

def fit( self, df: pandas.core.frame.DataFrame) -> FeatureSet:
77    def fit(self, df: pd.DataFrame) -> "FeatureSet":
78        """Fit all transformers and encoders.
79
80        Args:
81            df: Training DataFrame.
82
83        Returns:
84            Self for chaining.
85        """
86        for feature in self._features.values():
87            if feature.drop:
88                continue
89            self._process_feature(feature, df, mode="fit")
90
91        self._is_fitted = True
92        return self

Fit all transformers and encoders.

Arguments:
  • df: Training DataFrame.
Returns:

Self for chaining.

def transform(self, df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame:
 94    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
 95        """Transform DataFrame using fitted transformers.
 96
 97        Args:
 98            df: Input DataFrame.
 99
100        Returns:
101            Transformed DataFrame.
102
103        Raises:
104            ValueError: If not fitted.
105        """
106        if not self._is_fitted:
107            raise ValueError("FeatureSet not fitted. Call fit() first.")
108
109        result = pd.DataFrame(index=df.index)
110
111        for feature in self._features.values():
112            if feature.drop:
113                continue
114            
115            transformed = self._process_feature(feature, df, mode="transform")
116            if transformed is not None:
117                if isinstance(transformed, dict):
118                    # Multi-column output from encoders
119                    for col_name, values in transformed.items():
120                        result[col_name] = values
121                else:
122                    result[feature.name] = transformed
123
124        return result

Transform DataFrame using fitted transformers.

Arguments:
  • df: Input DataFrame.
Returns:

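Transformed DataFrame sharing the input's index. Non-derived features whose source column is missing from the input are silently omitted from the result.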

Raises:
  • ValueError: If not fitted.
def fit_transform(self, df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame:
193    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
194        """Fit and transform in one step.
195
196        Args:
197            df: Training DataFrame.
198
199        Returns:
200            Transformed DataFrame.
201        """
202        return self.fit(df).transform(df)

Fit and transform in one step.

Arguments:
  • df: Training DataFrame.
Returns:

Transformed DataFrame.

def save(self, store: geronimo.artifacts.ArtifactStore) -> None:
204    def save(self, store: "ArtifactStore") -> None:
205        """Save fitted transformers and encoders to artifact store.
206
207        Args:
208            store: ArtifactStore instance.
209        """
210        for name, feature in self._features.items():
211            if feature.has_transformer:
212                store.save(
213                    f"transformer_{name}",
214                    feature.transformer,
215                    artifact_type="transformer",
216                )
217            if feature.has_encoder:
218                store.save(
219                    f"encoder_{name}",
220                    feature.encoder,
221                    artifact_type="encoder",
222                )

Save fitted transformers and encoders to artifact store.

Arguments:
  • store: ArtifactStore instance.
def load(self, store: geronimo.artifacts.ArtifactStore) -> None:
224    def load(self, store: "ArtifactStore") -> None:
225        """Load fitted transformers and encoders from artifact store.
226
227        Args:
228            store: ArtifactStore instance.
229        """
230        for name, feature in self._features.items():
231            if feature.has_transformer:
232                feature.transformer = store.get(f"transformer_{name}")
233            if feature.has_encoder:
234                feature.encoder = store.get(f"encoder_{name}")
235
236        self._is_fitted = True

Load fitted transformers and encoders from artifact store, then mark the feature set as fitted.

Arguments:
  • store: ArtifactStore instance.
is_fitted: bool
238    @property
239    def is_fitted(self) -> bool:
240        """Check if feature set has been fitted."""
241        return self._is_fitted

Check if feature set has been fitted.

class Feature:
  7class Feature:
  8    """Feature descriptor for defining individual features.
  9
 10    Used within FeatureSet classes to define feature columns
 11    with their types and transformations.
 12
 13    Order of Operations
 14    -------------------
 15    When processing features, the following order is applied:
 16
 17    1. **derived_feature_fn** (if provided):
 18       - Called first with the full DataFrame
 19       - Computes derived values from source_columns
 20       - Output becomes input for subsequent steps
 21
 22    2. **transformer** (if provided):
 23       - Applied after derived_feature_fn (or to source column if no derive fn)
 24       - Must implement sklearn fit/transform interface
 25       - Typically for numeric normalization (StandardScaler, MinMaxScaler)
 26
 27    3. **encoder** (if provided):
 28       - Applied to categorical values
 29       - Must implement sklearn fit/transform interface
 30       - Typically for categorical encoding (OneHotEncoder, LabelEncoder)
 31
 32    Note: transformer and encoder are mutually exclusive - use one or the other.
 33
 34    Example:
 35        ```python
 36        from geronimo.features import FeatureSet, Feature
 37        from sklearn.preprocessing import StandardScaler, OneHotEncoder
 38
 39        class CustomerFeatures(FeatureSet):
 40            # Simple numeric feature with transformer
 41            age = Feature(dtype="numeric", transformer=StandardScaler())
 42
 43            # Categorical feature with encoder
 44            segment = Feature(dtype="categorical", encoder=OneHotEncoder())
 45
 46            # Derived feature: single input → custom logic
 47            age_bucket = Feature(
 48                dtype="derived",
 49                source_columns=["age"],
 50                derived_feature_fn=lambda df: (df["age"] // 10) * 10,
 51            )
 52
 53            # Derived feature: multiple inputs → single output
 54            bmi = Feature(
 55                dtype="derived",
 56                source_columns=["weight_kg", "height_m"],
 57                derived_feature_fn=lambda df: df["weight_kg"] / (df["height_m"] ** 2),
 58            )
 59
 60            # Derived + transformed: compute then normalize
 61            bmi_normalized = Feature(
 62                dtype="derived",
 63                source_columns=["weight_kg", "height_m"],
 64                derived_feature_fn=lambda df: df["weight_kg"] / (df["height_m"] ** 2),
 65                transformer=StandardScaler(),  # Applied after derive
 66            )
 67
 68            # Drop from final output
 69            name = Feature(dtype="text", drop=True)
 70        ```
 71    """
 72
 73    def __init__(
 74        self,
 75        dtype: Literal["numeric", "categorical", "text", "derived"] = "numeric",
 76        transformer: Optional[Any] = None,
 77        encoder: Optional[Any] = None,
 78        source_column: Optional[str] = None,
 79        source_columns: Optional[list[str]] = None,
 80        derived_feature_fn: Optional[Callable] = None,
 81        drop: bool = False,
 82        description: Optional[str] = None,
 83    ):
 84        """Initialize feature.
 85
 86        Args:
 87            dtype: Feature data type.
 88                - "numeric": Numeric values (int, float)
 89                - "categorical": Categorical/discrete values
 90                - "text": Text data (typically dropped or embedded)
 91                - "derived": Computed from other columns via derived_feature_fn
 92
 93            transformer: Sklearn-compatible transformer for numeric features.
 94                Applied AFTER derived_feature_fn if both are provided.
 95                Must implement fit() and transform() methods.
 96                Example: StandardScaler(), MinMaxScaler()
 97
 98            encoder: Sklearn-compatible encoder for categorical features.
 99                Must implement fit() and transform() methods.
100                Example: OneHotEncoder(), LabelEncoder()
101
102            source_column: Single input column name (if different from attribute name).
103                Used when feature maps 1:1 from a differently-named source column.
104
105            source_columns: List of input column names for derived features.
106                Required when derived_feature_fn needs multiple input columns.
107
108            derived_feature_fn: Custom function for feature engineering.
109                Receives full DataFrame, returns Series or array.
110                Called BEFORE transformer (if both provided).
111                Example: lambda df: df["weight"] / (df["height"] ** 2)
112
113            drop: If True, exclude feature from final output.
114                Useful for passthrough columns needed only for derived features.
115
116            description: Optional human-readable feature description.
117        """
118        self.dtype = dtype
119        self.transformer = transformer
120        self.encoder = encoder
121        self.source_column = source_column
122        self.source_columns = source_columns
123        self.derived_feature_fn = derived_feature_fn
124        self.drop = drop
125        self.description = description
126        self._name: Optional[str] = None
127    
128    dtype: Literal["numeric", "categorical", "text", "derived"]
129    """Feature data type."""
130
131    transformer: Optional[Any]
132    """Sklearn-compatible transformer for numeric features."""
133
134    encoder: Optional[Any]
135    """Sklearn-compatible encoder for categorical features."""
136
137    source_column: Optional[str]
138    """Single input column name."""
139
140    source_columns: Optional[list[str]]
141    """List of input column names for derived features."""
142
143    derived_feature_fn: Optional[Callable]
144    """Custom function for feature engineering."""
145
146    drop: bool
147    """If True, exclude feature from final output."""
148
149    description: Optional[str]
150    """Optional human-readable feature description."""
151
152    def __set_name__(self, owner, name: str) -> None:
153        """Capture attribute name when defined in class."""
154        self._name = name
155        if self.source_column is None and self.source_columns is None:
156            self.source_column = name
157
158    @property
159    def name(self) -> str:
160        """Get feature name."""
161        return self._name or "unnamed"
162
163    @property
164    def input_columns(self) -> list[str]:
165        """Get list of input column names."""
166        if self.source_columns:
167            return self.source_columns
168        return [self.source_column or self.name]
169
170    @property
171    def has_transformer(self) -> bool:
172        """Check if feature has a transformer."""
173        return self.transformer is not None
174
175    @property
176    def has_encoder(self) -> bool:
177        """Check if feature has an encoder."""
178        return self.encoder is not None
179
180    @property
181    def has_derived_fn(self) -> bool:
182        """Check if feature has a derived feature function."""
183        return self.derived_feature_fn is not None
184
185    @property
186    def is_derived(self) -> bool:
187        """Check if feature is derived from custom function."""
188        return self.derived_feature_fn is not None or self.dtype == "derived"
189
190    def apply(self, df) -> Any:
191        """Apply derived feature function to DataFrame.
192
193        Args:
194            df: Input DataFrame with source columns.
195
196        Returns:
197            Transformed feature values (Series or array).
198        """
199        if self.derived_feature_fn is not None:
200            return self.derived_feature_fn(df)
201        elif self.source_column:
202            return df[self.source_column]
203        else:
204            return df[self.name]
205
206    def __repr__(self) -> str:
207        extras = []
208        if self.has_derived_fn:
209            extras.append("derived_feature_fn")
210        if self.source_columns:
211            extras.append(f"inputs={self.source_columns}")
212        if self.has_transformer:
213            extras.append("transformer")
214        if self.has_encoder:
215            extras.append("encoder")
216        extra_str = f", {', '.join(extras)}" if extras else ""
217        return f"Feature({self.name}, dtype={self.dtype}{extra_str})"

Feature descriptor for defining individual features.

Used within FeatureSet classes to define feature columns with their types and transformations.

Order of Operations

When processing features, the following order is applied:

  1. derived_feature_fn (if provided):

    • Called first with the full DataFrame
    • Computes derived values from source_columns
    • Output becomes input for subsequent steps
  2. transformer (if provided):

    • Applied after derived_feature_fn (or to source column if no derive fn)
    • Must implement sklearn fit/transform interface
    • Typically for numeric normalization (StandardScaler, MinMaxScaler)
  3. encoder (if provided):

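    • Applied to categorical values read directly from the source column (FeatureSet._process_feature applies only a transformer, not an encoder, to the output of derived_feature_fn)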
    • Must implement sklearn fit/transform interface
    • Typically for categorical encoding (OneHotEncoder, LabelEncoder)

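Note: transformer and encoder are mutually exclusive - use one or the other. This is not validated at construction time; if both are set, FeatureSet fits and applies only the transformer.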

Example:
from geronimo.features import FeatureSet, Feature
from sklearn.preprocessing import StandardScaler, OneHotEncoder

class CustomerFeatures(FeatureSet):
    # Simple numeric feature with transformer
    age = Feature(dtype="numeric", transformer=StandardScaler())

    # Categorical feature with encoder
    segment = Feature(dtype="categorical", encoder=OneHotEncoder())

    # Derived feature: single input → custom logic
    age_bucket = Feature(
        dtype="derived",
        source_columns=["age"],
        derived_feature_fn=lambda df: (df["age"] // 10) * 10,
    )

    # Derived feature: multiple inputs → single output
    bmi = Feature(
        dtype="derived",
        source_columns=["weight_kg", "height_m"],
        derived_feature_fn=lambda df: df["weight_kg"] / (df["height_m"] ** 2),
    )

    # Derived + transformed: compute then normalize
    bmi_normalized = Feature(
        dtype="derived",
        source_columns=["weight_kg", "height_m"],
        derived_feature_fn=lambda df: df["weight_kg"] / (df["height_m"] ** 2),
        transformer=StandardScaler(),  # Applied after derive
    )

    # Drop from final output
    name = Feature(dtype="text", drop=True)
Feature( dtype: Literal['numeric', 'categorical', 'text', 'derived'] = 'numeric', transformer: Optional[Any] = None, encoder: Optional[Any] = None, source_column: Optional[str] = None, source_columns: Optional[list[str]] = None, derived_feature_fn: Optional[Callable] = None, drop: bool = False, description: Optional[str] = None)
 73    def __init__(
 74        self,
 75        dtype: Literal["numeric", "categorical", "text", "derived"] = "numeric",
 76        transformer: Optional[Any] = None,
 77        encoder: Optional[Any] = None,
 78        source_column: Optional[str] = None,
 79        source_columns: Optional[list[str]] = None,
 80        derived_feature_fn: Optional[Callable] = None,
 81        drop: bool = False,
 82        description: Optional[str] = None,
 83    ):
 84        """Initialize feature.
 85
 86        Args:
 87            dtype: Feature data type.
 88                - "numeric": Numeric values (int, float)
 89                - "categorical": Categorical/discrete values
 90                - "text": Text data (typically dropped or embedded)
 91                - "derived": Computed from other columns via derived_feature_fn
 92
 93            transformer: Sklearn-compatible transformer for numeric features.
 94                Applied AFTER derived_feature_fn if both are provided.
 95                Must implement fit() and transform() methods.
 96                Example: StandardScaler(), MinMaxScaler()
 97
 98            encoder: Sklearn-compatible encoder for categorical features.
 99                Must implement fit() and transform() methods.
100                Example: OneHotEncoder(), LabelEncoder()
101
102            source_column: Single input column name (if different from attribute name).
103                Used when feature maps 1:1 from a differently-named source column.
104
105            source_columns: List of input column names for derived features.
106                Required when derived_feature_fn needs multiple input columns.
107
108            derived_feature_fn: Custom function for feature engineering.
109                Receives full DataFrame, returns Series or array.
110                Called BEFORE transformer (if both provided).
111                Example: lambda df: df["weight"] / (df["height"] ** 2)
112
113            drop: If True, exclude feature from final output.
114                Useful for passthrough columns needed only for derived features.
115
116            description: Optional human-readable feature description.
117        """
118        self.dtype = dtype
119        self.transformer = transformer
120        self.encoder = encoder
121        self.source_column = source_column
122        self.source_columns = source_columns
123        self.derived_feature_fn = derived_feature_fn
124        self.drop = drop
125        self.description = description
126        self._name: Optional[str] = None

Initialize feature.

Arguments:
  • dtype: Feature data type.
    • "numeric": Numeric values (int, float)
    • "categorical": Categorical/discrete values
    • "text": Text data (typically dropped or embedded)
    • "derived": Computed from other columns via derived_feature_fn
  • transformer: Sklearn-compatible transformer for numeric features. Applied AFTER derived_feature_fn if both are provided. Must implement fit() and transform() methods. Example: StandardScaler(), MinMaxScaler()
  • encoder: Sklearn-compatible encoder for categorical features. Must implement fit() and transform() methods. Example: OneHotEncoder(), LabelEncoder()
  • source_column: Single input column name (if different from attribute name). Used when feature maps 1:1 from a differently-named source column.
  • source_columns: List of input column names for derived features. Required when derived_feature_fn needs multiple input columns.
  • derived_feature_fn: Custom function for feature engineering. Receives full DataFrame, returns Series or array. Called BEFORE transformer (if both provided). Example: lambda df: df["weight"] / (df["height"] ** 2)
  • drop: If True, exclude feature from final output. Useful for passthrough columns needed only for derived features.
  • description: Optional human-readable feature description.
dtype: Literal['numeric', 'categorical', 'text', 'derived']

Feature data type.

transformer: Optional[Any]

Sklearn-compatible transformer for numeric features.

encoder: Optional[Any]

Sklearn-compatible encoder for categorical features.

source_column: Optional[str]

Single input column name.

source_columns: Optional[list[str]]

List of input column names for derived features.

derived_feature_fn: Optional[Callable]

Custom function for feature engineering.

drop: bool

If True, exclude feature from final output.

description: Optional[str]

Optional human-readable feature description.

name: str
158    @property
159    def name(self) -> str:
160        """Get feature name."""
161        return self._name or "unnamed"

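Get feature name: the class attribute name the feature was assigned to, or "unnamed" if it was never bound to a class.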

input_columns: list[str]
163    @property
164    def input_columns(self) -> list[str]:
165        """Get list of input column names."""
166        if self.source_columns:
167            return self.source_columns
168        return [self.source_column or self.name]

Get list of input column names.

has_transformer: bool
170    @property
171    def has_transformer(self) -> bool:
172        """Check if feature has a transformer."""
173        return self.transformer is not None

Check if feature has a transformer.

has_encoder: bool
175    @property
176    def has_encoder(self) -> bool:
177        """Check if feature has an encoder."""
178        return self.encoder is not None

Check if feature has an encoder.

has_derived_fn: bool
180    @property
181    def has_derived_fn(self) -> bool:
182        """Check if feature has a derived feature function."""
183        return self.derived_feature_fn is not None

Check if feature has a derived feature function.

is_derived: bool
185    @property
186    def is_derived(self) -> bool:
187        """Check if feature is derived from custom function."""
188        return self.derived_feature_fn is not None or self.dtype == "derived"

Check if feature is derived, i.e. it has a derived_feature_fn or its dtype is "derived".

def apply(self, df) -> Any:
190    def apply(self, df) -> Any:
191        """Apply derived feature function to DataFrame.
192
193        Args:
194            df: Input DataFrame with source columns.
195
196        Returns:
197            Transformed feature values (Series or array).
198        """
199        if self.derived_feature_fn is not None:
200            return self.derived_feature_fn(df)
201        elif self.source_column:
202            return df[self.source_column]
203        else:
204            return df[self.name]

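Apply derived feature function to DataFrame. If no derived_feature_fn is set, the source column (or the column named after the feature) is returned unchanged.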

Arguments:
  • df: Input DataFrame with source columns.
Returns:

Transformed feature values (Series or array).