Source code for scitex_ml.metrics._calc_feature_importance

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-10-03 04:00:00 (ywatanabe)"
# File: /ssh:sp:/home/ywatanabe/proj/scitex_repo/src/scitex/ml/metrics/_calc_feature_importance.py

"""
Calculate feature importance from trained models.

This module provides a unified interface for extracting feature importance
from various model types (tree-based, linear models, etc.).
"""

from typing import Dict, List, Optional, Tuple

import numpy as np


def calc_feature_importance(
    model,
    feature_names: Optional[List[str]] = None,
    top_n: Optional[int] = None,
) -> Tuple[Dict[str, float], np.ndarray]:
    """
    Calculate feature importance from a trained model.

    Parameters
    ----------
    model : object
        Trained model with feature importance attributes
        Supports:
        - Tree-based: feature_importances_ (RandomForest, XGBoost, etc.)
        - Linear: coef_ (LogisticRegression, LinearSVC, etc.)
        When both attributes exist, feature_importances_ is used.
    feature_names : List[str], optional
        Names of features. If None, uses feature_0, feature_1, ...
        Duplicate names collapse into a single dictionary entry
        (the last occurrence wins).
    top_n : int, optional
        Return only top N most important features

    Returns
    -------
    importance_dict : Dict[str, float]
        Dictionary mapping feature names to importance scores,
        ordered from most to least important (truncated to top_n if given)
    importance_array : np.ndarray
        Array of importance scores (same order as feature_names);
        never truncated by top_n

    Raises
    ------
    ValueError
        If model doesn't support feature importance extraction, or if the
        number of feature names differs from the number of importances

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> import numpy as np
    >>> X = np.random.rand(100, 5)
    >>> y = np.random.randint(0, 2, 100)
    >>> model = RandomForestClassifier().fit(X, y)
    >>> importance_dict, importance_array = calc_feature_importance(
    ...     model, feature_names=['f1', 'f2', 'f3', 'f4', 'f5']
    ... )
    """
    # Extract importance scores based on model type
    if hasattr(model, "feature_importances_"):
        # Tree-based models (RandomForest, XGBoost, etc.)
        # np.asarray guarantees the documented ndarray return type even when
        # the attribute is a plain sequence.
        importances = np.asarray(model.feature_importances_)
    elif hasattr(model, "coef_"):
        # Linear models (LogisticRegression, LinearSVC, etc.)
        coef = model.coef_
        # Sparse coefficient containers expose toarray(); densify them so the
        # abs/mean reduction below yields a flat 1-D array.
        if hasattr(coef, "toarray"):
            coef = coef.toarray()
        # Coerce list/tuple coefficients so that .ndim is available.
        coef = np.asarray(coef)
        if coef.ndim == 2:
            # For multiclass, use mean absolute coefficient
            importances = np.abs(coef).mean(axis=0)
        else:
            importances = np.abs(coef)
    else:
        raise ValueError(
            f"Model {type(model).__name__} does not support feature importance extraction. "
            f"Model must have either 'feature_importances_' or 'coef_' attribute."
        )

    # Generate feature names if not provided
    if feature_names is None:
        feature_names = [f"feature_{i}" for i in range(len(importances))]

    # Validate feature names match importance array length
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Number of feature names ({len(feature_names)}) "
            f"does not match number of importances ({len(importances)})"
        )

    # Create dictionary
    importance_dict = {
        name: float(imp) for name, imp in zip(feature_names, importances)
    }

    # Sort by importance (descending; sorted() is stable, so ties keep
    # their original feature order)
    sorted_items = sorted(
        importance_dict.items(), key=lambda item: item[1], reverse=True
    )

    # Apply top_n filter if requested
    if top_n is not None:
        sorted_items = sorted_items[:top_n]

    # Return sorted dictionary and the unsorted, untruncated array
    return dict(sorted_items), importances
def calc_permutation_importance(
    model,
    X: np.ndarray,
    y: np.ndarray,
    feature_names: Optional[List[str]] = None,
    n_repeats: int = 10,
    random_state: Optional[int] = None,
    scoring: Optional[str] = None,
) -> Tuple[Dict[str, float], Dict[str, float]]:
    """
    Calculate permutation feature importance.

    More reliable than built-in importance for some models, but slower.

    Parameters
    ----------
    model : object
        Trained model
    X : np.ndarray
        Feature matrix (2-D; the second axis is the feature axis)
    y : np.ndarray
        Target vector
    feature_names : List[str], optional
        Names of features. If None, uses feature_0, feature_1, ...
    n_repeats : int, default 10
        Number of times to permute each feature
    random_state : int, optional
        Random seed for reproducibility
    scoring : str, optional
        Scoring metric (default uses model's score method)

    Returns
    -------
    importance_mean : Dict[str, float]
        Mean importance for each feature, ordered from most to least
        important
    importance_std : Dict[str, float]
        Standard deviation of importance for each feature, in the same
        order as importance_mean

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of
        columns in X
    """
    # Resolve and validate names up front: zip() below would otherwise
    # silently drop features on a length mismatch, and failing here avoids
    # running the (slow) permutation procedure for nothing.
    n_features = np.shape(X)[1]
    if feature_names is None:
        feature_names = [f"feature_{i}" for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError(
            f"Number of feature names ({len(feature_names)}) "
            f"does not match number of features in X ({n_features})"
        )

    # Imported lazily so scikit-learn is only required when this function
    # is actually called.
    from sklearn.inspection import permutation_importance

    # Calculate permutation importance
    result = permutation_importance(
        model,
        X,
        y,
        n_repeats=n_repeats,
        random_state=random_state,
        scoring=scoring,
    )

    # Create dictionaries
    importance_mean = {
        name: float(imp)
        for name, imp in zip(feature_names, result.importances_mean)
    }
    importance_std = {
        name: float(imp)
        for name, imp in zip(feature_names, result.importances_std)
    }

    # Sort by mean importance (descending) and apply the same order to both
    sorted_names = sorted(
        importance_mean, key=importance_mean.__getitem__, reverse=True
    )
    importance_mean = {name: importance_mean[name] for name in sorted_names}
    importance_std = {name: importance_std[name] for name in sorted_names}

    return importance_mean, importance_std
# EOF