"""
Includes the implementation of SOMO.
"""
# Author: Georgios Douzas <gdouzas@icloud.com>
# License: MIT
from math import sqrt
from sklearn.base import clone
from sklearn.utils import check_random_state
from sklearn.utils import check_scalar
from imblearn.over_sampling.base import BaseOverSampler
from imblearn.utils import Substitution
from imblearn.utils._docstring import _random_state_docstring, _n_jobs_docstring
from ._cluster import ClusterOverSampler
from ..distribution import DensityDistributor
[docs]@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class GeometricSOMO(ClusterOverSampler):
"""Applies the SOM algorithm to the input space before applying Geometric SMOTE.
Read more in the :ref:`user guide <user_guide>`.
Parameters
----------
{sampling_strategy}
{random_state}
k_neighbors : int or object, default=5
Defines the number of nearest neighbors to be used by Geometric SMOTE.
- If ``int``, this number is used to construct synthetic
samples.
- If ``object``, an estimator that inherits from
:class:`sklearn.neighbors.base.KNeighborsMixin` that will be
used to find the number of nearest neighbors.
truncation_factor : float, default=1.0
The type of truncation. The values should be in the [-1.0, 1.0] range.
deformation_factor : float, default=0.0
The type of geometry. The values should be in the [0.0, 1.0] range.
selection_strategy : str, default='combined'
The type of Geometric SMOTE algorithm with the following options:
``'combined'``, ``'majority'``, ``'minority'``.
som_estimator : None or object or int or float, default=None
Defines the SOM clusterer applied to the input space.
- If ``None``, :class:`` is used which
tends to be better with large number of samples.
- If KMeans object, then an instance from either
:class:`sklearn.cluster.KMeans` or :class:`sklearn.cluster.MiniBatchKMeans`.
- If ``int``, the number of clusters to be used.
- If ``float``, the proportion of the number of clusters over the number
of samples to be used.
imbalance_ratio_threshold : 'auto' or float, default='auto'
The threshold of a filtered cluster. It can be any non-negative number or
``'auto'`` to be calculated automatically.
- If ``'auto'``, the filtering threshold is calculated from the imbalance
ratio of the target for the binary case or the maximum of the target's
imbalance ratios for the multiclass case.
- If ``float`` then it is manually set to this number.
Any cluster that has an imbalance ratio smaller than the filtering threshold is
identified as a filtered cluster and can be potentially used to generate
minority class instances. Higher values increase the number of filtered
clusters.
distances_exponent : 'auto' or float, default='auto'
The exponent of the mean distance in the density calculation. It can be
any non-negative number or ``'auto'`` to be calculated automatically.
- If ``'auto'`` then it is set equal to the number of
features. Higher values make the calculation of density more sensitive
to the cluster's size i.e. clusters with large mean euclidean distance
between samples are penalized.
- If ``float`` then it is manually set to this number.
distribution_ratio : float, default=0.8
The ratio of intra-cluster to inter-cluster generated samples. It is a
number in the :math:`[0.0, 1.0]` range. The default value is ``0.8``, a
number equal to the proportion of intra-cluster generated samples over
the total number of generated samples. As the number decreases, less
intra-cluster and more inter-cluster samples are generated.
raise_error : bool, default=True
Raise an error when no samples are generated.
- If ``True``, it raises an error when no filtered clusters are
identified and therefore no samples are generated.
- If ``False``, it displays a warning.
{n_jobs}
Attributes
----------
clusterer_ : object
A fitted :class:`somlearn.SOM` instance.
distributor_ : object
A fitted :class:`clover.distribution.DensityDistributor` instance.
labels_ : array, shape (n_samples,)
Labels of each sample.
neighbors_ : array, (n_neighboring_pairs, 2) or None
An array that contains all neighboring pairs with each row being
a unique neighboring pair.
oversampler_ : object
A fitted :class:`imblearn.over_sampling.SMOTE` instance.
random_state_ : object
An instance of ``RandomState`` class.
sampling_strategy_ : dict
Actual sampling strategy.
Examples
--------
>>> import numpy as np
>>> from clover.over_sampling import GeometricSOMO
>>> from sklearn.datasets import make_blobs
>>> blobs = [100, 800, 100]
>>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
>>> # Add a single 0 sample in the middle blob
>>> X = np.concatenate([X, [[0, 0]]])
>>> y = np.append(y, 0)
>>> # Make this a binary classification problem
>>> y = y == 1
>>> somo = GeometricSOMO(random_state=42)
>>> X_res, y_res = somo.fit_resample(X, y)
>>> # Find the number of new samples in the middle blob
>>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum()
>>> print("Samples in the middle blob: %s" % n_res_in_middle)
Samples in the middle blob: 801
>>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1))
Middle blob unchanged: True
>>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum()))
More 0 samples: True
"""
[docs] def __init__(
self,
sampling_strategy='auto',
random_state=None,
k_neighbors=5,
truncation_factor=1.0,
deformation_factor=0.0,
selection_strategy='combined',
som_estimator=None,
imbalance_ratio_threshold='auto',
distances_exponent='auto',
distribution_ratio=0.8,
raise_error=True,
n_jobs=None,
):
self.sampling_strategy = sampling_strategy
self.random_state = random_state
self.k_neighbors = k_neighbors
self.truncation_factor = truncation_factor
self.deformation_factor = deformation_factor
self.selection_strategy = selection_strategy
self.som_estimator = som_estimator
self.distribution_ratio = distribution_ratio
self.imbalance_ratio_threshold = imbalance_ratio_threshold
self.distances_exponent = distances_exponent
self.raise_error = raise_error
self.n_jobs = n_jobs
def _initialize_fitting(self, X):
"""Initialize fitting process."""
# Import SOM and GeometricSMOTE
try:
from somlearn import SOM
except ImportError:
raise ImportError(
'GeometricSOMO class requires the package `som-learn` to be installed.'
)
try:
from gsmote import GeometricSMOTE
except ImportError:
raise ImportError(
'GeometricSOMO class requires the package `geometric-smote` to '
'be installed.'
)
# Check random state
self.random_state_ = check_random_state(self.random_state)
# Check oversampler
self.oversampler_ = GeometricSMOTE(
sampling_strategy=self.sampling_strategy,
k_neighbors=self.k_neighbors,
truncation_factor=self.truncation_factor,
deformation_factor=self.deformation_factor,
selection_strategy=self.selection_strategy,
random_state=self.random_state_,
n_jobs=self.n_jobs,
)
if self.som_estimator is None:
self.clusterer_ = SOM(random_state=self.random_state_)
elif isinstance(self.som_estimator, int):
check_scalar(self.som_estimator, 'som_estimator', int, 1)
n = round(sqrt(self.som_estimator))
self.clusterer_ = SOM(
n_columns=n, n_rows=n, random_state=self.random_state_
)
elif isinstance(self.som_estimator, float):
check_scalar(self.som_estimator, 'som_estimator', float, 0.0, 1.0)
n = round(sqrt((X.shape[0] - 1) * self.som_estimator + 1))
self.clusterer_ = SOM(
n_columns=n, n_rows=n, random_state=self.random_state_
)
elif isinstance(self.som_estimator, SOM):
self.clusterer_ = clone(self.som_estimator)
else:
raise TypeError(
'Parameter `som_estimator` should be '
'either `None` or the number of clusters '
'or a float in the [0.0, 1.0] range equal to'
' the number of clusters over the number of '
'samples or an instance of the `SOM` class.'
)
# Check distributor
self.distributor_ = DensityDistributor(
filtering_threshold=self.imbalance_ratio_threshold,
distances_exponent=self.distances_exponent,
distribution_ratio=self.distribution_ratio,
)
return self