Note
Go to the end to download the full example code.
Multiclass classification with under-sampling
Some balancing methods can balance datasets with multiple classes. We provide an example to illustrate the use of those methods, which does not differ from the binary case.
Training target statistics: Counter({np.int64(1): 38, np.int64(2): 38, np.int64(0): 17})
Testing target statistics: Counter({np.int64(1): 12, np.int64(2): 12, np.int64(0): 8})
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT
from collections import Counter
import skore
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.datasets import make_imbalance
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import NearMiss
# Print the module docstring.
print(__doc__)
# Single seed reused by the imbalancing step and the train/test split below.
RANDOM_STATE = 42
# Load the iris dataset and resample it into an imbalanced multiclass problem:
# class 0 keeps 25 samples while classes 1 and 2 keep 50 samples each.
iris = load_iris()
X, y = make_imbalance(
iris.data,
iris.target,
sampling_strategy={0: 25, 1: 50, 2: 50},
random_state=RANDOM_STATE,
)
# Split the imbalanced data into train and test sets, then report how many
# samples of each class ended up in each split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")
# Create a pipeline chaining NearMiss (version 2) under-sampling, feature
# standardization and a logistic regression classifier. The sampler comes
# first, so the later steps are fit on the under-sampled training data.
pipeline = make_pipeline(NearMiss(version=2), StandardScaler(), LogisticRegression())
pipeline.fit(X_train, y_train)
# Classify and report the results on the held-out test split.
# NOTE(review): splitter="prefit" presumably tells skore to evaluate the
# already-fitted pipeline as is, without refitting - confirm against skore docs.
report = skore.evaluate(pipeline, X_test, y_test, splitter="prefit")
# Summarize the computed metrics and expose the summary as a frame.
report.metrics.summarize().frame()
Total running time of the script: (0 minutes 4.532 seconds)
Estimated memory usage: 313 MB