"""
Includes the implementation of SOMO.
"""
# Author: Georgios Douzas <gdouzas@icloud.com>
# License: MIT
from math import sqrt
from sklearn.base import clone
from sklearn.utils import check_random_state
from sklearn.utils import check_scalar
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling.base import BaseOverSampler
from imblearn.utils import Substitution
from imblearn.utils._docstring import _random_state_docstring, _n_jobs_docstring
from ._cluster import ClusterOverSampler
from ..distribution._density import DensityDistributor
@Substitution(
    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
    random_state=_random_state_docstring,
)
class SOMO(ClusterOverSampler):
    """Applies the SOM algorithm to the input space before applying SMOTE.

    This is an implementation of the algorithm described in [1]_.

    Read more in the :ref:`user guide <user_guide>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    k_neighbors : int or object, default=5
        Defines the number of nearest neighbors to be used by SMOTE.

        - If ``int``, this number is used to construct synthetic
          samples.

        - If ``object``, an estimator that inherits from
          :class:`sklearn.neighbors.base.KNeighborsMixin` that will be
          used to find the number of nearest neighbors.

    som_estimator : None or object or int or float, default=None
        Defines the SOM clusterer applied to the input space.

        - If ``None``, a :class:`somlearn.SOM` instance with its default
          grid parameters is used.

        - If ``object``, an instance of :class:`somlearn.SOM`. It is
          cloned before being used.

        - If ``int``, the approximate number of clusters to be used. The
          map is a square grid whose number of rows and columns is the
          rounded square root of this value.

        - If ``float``, the proportion of the number of clusters over the
          number of samples to be used. It is a number in the
          :math:`[0.0, 1.0]` range and the map is again a square grid.

    distribution_ratio : float, default=0.8
        The ratio of intra-cluster to inter-cluster generated samples. It is a
        number in the :math:`[0.0, 1.0]` range. The default value is ``0.8``, a
        number equal to the proportion of intra-cluster generated samples over
        the total number of generated samples. As the number decreases, less
        intra-cluster and more inter-cluster samples are generated.

    raise_error : boolean, default=True
        Stored unchanged on the instance; it is not used by this class
        directly. Presumably consumed by the parent
        ``ClusterOverSampler`` to decide whether a failure to generate
        samples raises an error (to be confirmed against the base class).

    {n_jobs}

    Attributes
    ----------
    clusterer_ : object
        A fitted :class:`somlearn.SOM` instance.

    distributor_ : object
        A fitted :class:`clover.distribution.DensityDistributor` instance.

    labels_ : array, shape (n_samples,)
        Labels of each sample.

    neighbors_ : array, (n_neighboring_pairs, 2) or None
        An array that contains all neighboring pairs with each row being
        a unique neighboring pair.

    oversampler_ : object
        A fitted :class:`imblearn.over_sampling.SMOTE` instance.

    random_state_ : object
        An instance of ``RandomState`` class.

    sampling_strategy_ : dict
        Actual sampling strategy.

    References
    ----------
    .. [1] Georgios Douzas, Fernando Bacao, "Self-Organizing Map
        Oversampling (SOMO) for imbalanced data set learning"
        https://www.sciencedirect.com/science/article/abs/pii/S0957417417302324?via%3Dihub

    Examples
    --------
    >>> import numpy as np
    >>> from clover.over_sampling import SOMO
    >>> from sklearn.datasets import make_blobs
    >>> blobs = [100, 800, 100]
    >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
    >>> # Add a single 0 sample in the middle blob
    >>> X = np.concatenate([X, [[0, 0]]])
    >>> y = np.append(y, 0)
    >>> # Make this a binary classification problem
    >>> y = y == 1
    >>> somo = SOMO(random_state=42)
    >>> X_res, y_res = somo.fit_resample(X, y)
    >>> # Find the number of new samples in the middle blob
    >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum()
    >>> print("Samples in the middle blob: %s" % n_res_in_middle)
    Samples in the middle blob: 801
    >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1))
    Middle blob unchanged: True
    >>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum()))
    More 0 samples: True
    """

    def __init__(
        self,
        sampling_strategy='auto',
        random_state=None,
        k_neighbors=5,
        som_estimator=None,
        distribution_ratio=0.8,
        raise_error=True,
        n_jobs=None,
    ):
        # Parameters are stored verbatim; validation is deferred to
        # ``_initialize_fitting``.
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.k_neighbors = k_neighbors
        self.som_estimator = som_estimator
        self.distribution_ratio = distribution_ratio
        self.raise_error = raise_error
        self.n_jobs = n_jobs

    def _initialize_fitting(self, X):
        """Initialize fitting process.

        Builds the ``random_state_``, ``oversampler_``, ``clusterer_`` and
        ``distributor_`` attributes from the constructor parameters.

        Parameters
        ----------
        X : array-like
            Input data. Only ``X.shape[0]`` (the number of samples) is
            read, and only when ``som_estimator`` is a ``float``.

        Returns
        -------
        self : object
            The instance itself.

        Raises
        ------
        ImportError
            If the ``somlearn`` package cannot be imported.
        TypeError
            If ``som_estimator`` is not ``None``, an ``int``, a ``float``
            or a ``SOM`` instance.
        """
        # SOM is an optional dependency, hence the local import.
        try:
            from somlearn import SOM
        except ImportError as import_error:
            raise ImportError(
                'SOMO class requires the package `som-learn` to be installed.'
            ) from import_error

        # Check random state
        self.random_state_ = check_random_state(self.random_state)

        # Check oversampler
        self.oversampler_ = SMOTE(
            sampling_strategy=self.sampling_strategy,
            k_neighbors=self.k_neighbors,
            random_state=self.random_state_,
            n_jobs=self.n_jobs,
        )

        # Check clusterer and number of clusters
        if self.som_estimator is None:
            self.clusterer_ = SOM(random_state=self.random_state_)
        elif isinstance(self.som_estimator, int):
            # Bounds are passed by keyword: they are keyword-only in
            # recent scikit-learn releases.
            check_scalar(self.som_estimator, 'som_estimator', int, min_val=1)
            # Square grid whose side approximates sqrt(number of clusters).
            n = round(sqrt(self.som_estimator))
            self.clusterer_ = SOM(
                n_columns=n, n_rows=n, random_state=self.random_state_
            )
        elif isinstance(self.som_estimator, float):
            check_scalar(
                self.som_estimator,
                'som_estimator',
                float,
                min_val=0.0,
                max_val=1.0,
            )
            # Maps the ratio linearly onto [1, n_samples] clusters before
            # taking the side of the square grid.
            n = round(sqrt((X.shape[0] - 1) * self.som_estimator + 1))
            self.clusterer_ = SOM(
                n_columns=n, n_rows=n, random_state=self.random_state_
            )
        elif isinstance(self.som_estimator, SOM):
            # Clone so that the user-supplied estimator is left untouched.
            self.clusterer_ = clone(self.som_estimator)
        else:
            raise TypeError(
                'Parameter `som_estimator` should be '
                'either `None` or the number of clusters '
                'or a float in the [0.0, 1.0] range equal to'
                ' the number of clusters over the number of '
                'samples or an instance of the `SOM` class.'
            )

        # Check distributor
        self.distributor_ = DensityDistributor(
            distribution_ratio=self.distribution_ratio,
            filtering_threshold=1.0,
            distances_exponent=2.0,
        )

        return self