Porto Seguro: balancing samples in mini-batches with Keras#

This example compares two strategies for training a neural network on the Porto Seguro Kaggle data set [1]. The data set is imbalanced, and we show that balancing each mini-batch improves performance and reduces the training time.

References#

[1] https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

print(__doc__)

Data loading#

from collections import Counter

import numpy as np
import pandas as pd

First, you should download the Porto Seguro data set from Kaggle. See the link in the introduction.
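A quick existence check (a minimal sketch, not part of the original example) can save a confusing traceback if the files are not where the example expects them:

from pathlib import Path

# Fail early with a clear message if the Kaggle files are missing.
for filename in ("train.csv", "test.csv"):
    assert (Path("./input") / filename).exists(), f"Missing ./input/{filename}"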

training_data = pd.read_csv("./input/train.csv")
testing_data = pd.read_csv("./input/test.csv")

y_train = training_data[["id", "target"]].set_index("id")
X_train = training_data.drop(["target"], axis=1).set_index("id")
X_test = testing_data.set_index("id")

The data set is imbalanced, which will affect the fitting.

print(f"The data set is imbalanced: {Counter(y_train['target'])}")

Define the pre-processing pipeline#

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler


def convert_float64(X):
    # Cast every column to float64 so that scaling operates on a
    # homogeneous numeric dtype.
    return X.astype(np.float64)

We want to standard-scale the numerical features and one-hot encode the categorical features. We use a ColumnTransformer to dispatch each pipeline to the appropriate columns.

numerical_columns = [
    name for name in X_train.columns if "_calc_" in name and "_bin" not in name
]
numerical_pipeline = make_pipeline(
    FunctionTransformer(func=convert_float64, validate=False), StandardScaler()
)

categorical_columns = [name for name in X_train.columns if "_cat" in name]
categorical_pipeline = make_pipeline(
    SimpleImputer(missing_values=-1, strategy="most_frequent"),
    OneHotEncoder(categories="auto"),
)

preprocessor = ColumnTransformer(
    [
        ("numerical_preprocessing", numerical_pipeline, numerical_columns),
        (
            "categorical_preprocessing",
            categorical_pipeline,
            categorical_columns,
        ),
    ],
    remainder="drop",
)
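As a quick sanity check (a minimal sketch fitted on a subsample to keep it cheap), we can verify the shape of the preprocessed feature matrix; the one-hot encoding expands the categorical columns:

# Fit on a small subsample and inspect the resulting feature matrix.
X_sample = preprocessor.fit_transform(X_train.iloc[:10_000])
print(f"Preprocessed sample shape: {X_sample.shape}")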

# Set an environment variable to hide the GPU from TensorFlow. Remove it to train on GPU.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

Create a neural network#

from tensorflow.keras.layers import Activation, BatchNormalization, Dense, Dropout
from tensorflow.keras.models import Sequential


def make_model(n_features):
    model = Sequential()
    model.add(Dense(200, input_shape=(n_features,), kernel_initializer="glorot_normal"))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.5))
    model.add(Dense(100, kernel_initializer="glorot_normal", use_bias=False))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.25))
    model.add(Dense(50, kernel_initializer="glorot_normal", use_bias=False))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.15))
    model.add(Dense(25, kernel_initializer="glorot_normal", use_bias=False))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    return model
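To sanity-check the architecture, one can instantiate the model with an arbitrary feature count (the value below is purely illustrative) and print its summary:

# Build a throwaway model just to inspect the layer stack and parameter counts.
demo_model = make_model(10)
demo_model.summary()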

We create a decorator to report the computation time.

import time
from functools import wraps


def timeit(f):
    @wraps(f)
    def wrapper(*args, **kwds):
        start_time = time.time()
        result = f(*args, **kwds)
        elapsed_time = time.time() - start_time
        print(f"Elapsed computation time: {elapsed_time:.3f} secs")
        return (elapsed_time, result)

    return wrapper
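For illustration, any decorated function now prints its elapsed time and returns an (elapsed_time, result) tuple; slow_sum below is a toy function introduced only for this demonstration:

@timeit
def slow_sum(n):
    # Toy workload used only to demonstrate the decorator.
    return sum(range(n))

elapsed, total = slow_sum(10_000_000)
print(f"slow_sum returned {total} after {elapsed:.3f} secs")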

The first model is trained with the plain fit method, i.e. on imbalanced mini-batches.

from sklearn.metrics import roc_auc_score


@timeit
def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):
    model = make_model(X_train.shape[1])
    model.fit(X_train, y_train, epochs=2, verbose=1, batch_size=1000)
    y_pred = model.predict(X_test, batch_size=1000)
    return roc_auc_score(y_test, y_pred)

In contrast, the second model uses imbalanced-learn's BalancedBatchGenerator to yield balanced mini-batches during training.

from imblearn.keras import BalancedBatchGenerator


@timeit
def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    model = make_model(X_train.shape[1])
    training_generator = BalancedBatchGenerator(
        X_train, y_train, batch_size=1000, random_state=42
    )
    model.fit(training_generator, epochs=5, verbose=1)
    y_pred = model.predict(X_test, batch_size=1000)
    return roc_auc_score(y_test, y_pred)
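To see what the generator produces, here is a minimal sketch (again fitting the preprocessor on a subsample to keep it cheap): BalancedBatchGenerator implements the Keras Sequence protocol, so individual mini-batches can be indexed directly and their class counts inspected:

# Draw the first mini-batch and check that both classes are
# (approximately) equally represented.
X_demo = preprocessor.fit_transform(X_train.iloc[:10_000])
y_demo = y_train.iloc[:10_000].values.ravel()
demo_generator = BalancedBatchGenerator(
    X_demo, y_demo, batch_size=64, random_state=42
)
X_batch, y_batch = demo_generator[0]
print(f"Class distribution in the first mini-batch: {Counter(y_batch)}")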

Classification loop#

We perform a 10-fold cross-validation and train the neural network with the two strategies presented above.

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)

cv_results_imbalanced = []
cv_time_imbalanced = []
cv_results_balanced = []
cv_time_balanced = []
for train_idx, valid_idx in skf.split(X_train, y_train):
    X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx])
    y_local_train = y_train.iloc[train_idx].values.ravel()
    X_local_test = preprocessor.transform(X_train.iloc[valid_idx])
    y_local_test = y_train.iloc[valid_idx].values.ravel()

    elapsed_time, roc_auc = fit_predict_imbalanced_model(
        X_local_train, y_local_train, X_local_test, y_local_test
    )
    cv_time_imbalanced.append(elapsed_time)
    cv_results_imbalanced.append(roc_auc)

    elapsed_time, roc_auc = fit_predict_balanced_model(
        X_local_train, y_local_train, X_local_test, y_local_test
    )
    cv_time_balanced.append(elapsed_time)
    cv_results_balanced.append(roc_auc)
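Before plotting, a short numeric summary of the cross-validated scores and timings:

# Aggregate the per-fold results into mean +/- standard deviation.
print(
    f"Imbalanced model: ROC-AUC {np.mean(cv_results_imbalanced):.3f} "
    f"+/- {np.std(cv_results_imbalanced):.3f}, "
    f"mean fit/predict time {np.mean(cv_time_imbalanced):.1f} secs"
)
print(
    f"Balanced model: ROC-AUC {np.mean(cv_results_balanced):.3f} "
    f"+/- {np.std(cv_results_balanced):.3f}, "
    f"mean fit/predict time {np.mean(cv_time_balanced):.1f} secs"
)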

Plot of the results and computation time#

df_results = pd.DataFrame(
    {
        "Balanced model": cv_results_balanced,
        "Imbalanced model": cv_results_imbalanced,
    }
)
df_results = df_results.unstack().reset_index()

df_time = pd.DataFrame(
    {"Balanced model": cv_time_balanced, "Imbalanced model": cv_time_imbalanced}
)
df_time = df_time.unstack().reset_index()

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure()
sns.boxplot(y="level_0", x=0, data=df_time)
sns.despine(top=True, right=True, left=True)
plt.xlabel("time [s]")
plt.ylabel("")
plt.title("Computation time difference using a random under-sampling")

plt.figure()
sns.boxplot(y="level_0", x=0, data=df_results, whis=10.0)
sns.despine(top=True, right=True, left=True)
ax = plt.gca()
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, pos: "%i%%" % (100 * x)))
plt.xlabel("ROC-AUC")
plt.ylabel("")
plt.title("Difference in terms of ROC-AUC using a random under-sampling")
