Source code for imblearn.over_sampling.smote

"""Class to perform over-sampling using SMOTE."""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
#          Fernando Nogueira
#          Christos Aridas
# License: MIT

from __future__ import division

import numpy as np
from sklearn.svm import SVC
from sklearn.utils import check_random_state

from .base import BaseOverSampler
from ..exceptions import raise_isinstance_error
from ..utils import check_neighbors_object
from ..utils.deprecation import deprecate_parameter


SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm')


class SMOTE(BaseOverSampler):
    """Class to perform over-sampling using SMOTE.

    This object is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variants Borderline SMOTE 1, 2 and
    SVM-SMOTE.

    Parameters
    ----------
    ratio : str, dict, or callable, optional (default='auto')
        Ratio to use for resampling the data set.

        - If ``str``, has to be one of: (i) ``'minority'``: resample the
          minority class; (ii) ``'majority'``: resample the majority class,
          (iii) ``'not minority'``: resample all classes but the minority
          class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``:
          corresponds to ``'all'`` for over-sampling methods and ``'not
          minority'`` for under-sampling methods. The classes targeted will
          be over-sampled or under-sampled to achieve an equal number of
          samples with the majority or minority class.
        - If ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples.
        - If callable, function taking ``y`` and returning a ``dict``. The
          keys correspond to the targeted classes. The values correspond to
          the desired number of samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, ``random_state`` is the seed used by the random number
        generator; If ``RandomState`` instance, random_state is the random
        number generator; If ``None``, the random number generator is the
        ``RandomState`` instance used by ``np.random``.

    k : int, optional (default=None)
        Number of nearest neighbours to use to construct synthetic samples.

        .. deprecated:: 0.2
           ``k`` is deprecated from 0.2 and will be replaced in 0.4. Use
           ``k_neighbors`` instead.

    k_neighbors : int or object, optional (default=5)
        If ``int``, number of nearest neighbours to use to construct
        synthetic samples. If object, an estimator that inherits from
        :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used
        to find the k_neighbors.

    m : int, optional (default=None)
        Number of nearest neighbours to use to determine if a minority
        sample is in danger. Used with ``kind={'borderline1', 'borderline2',
        'svm'}``.

        .. deprecated:: 0.2
           ``m`` is deprecated from 0.2 and will be replaced in 0.4. Use
           ``m_neighbors`` instead.

    m_neighbors : int or object, optional (default=10)
        If int, number of nearest neighbours to use to determine if a
        minority sample is in danger. Used with ``kind={'borderline1',
        'borderline2', 'svm'}``. If object, an estimator that inherits from
        :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used
        to find the m_neighbors.

    out_step : float, optional (default=0.5)
        Step size when extrapolating. Used with ``kind='svm'``.

    kind : str, optional (default='regular')
        The type of SMOTE algorithm to use, one of the following options:
        ``'regular'``, ``'borderline1'``, ``'borderline2'``, ``'svm'``.

    svm_estimator : object, optional (default=SVC())
        If ``kind='svm'``, a parametrized :class:`sklearn.svm.SVC`
        classifier can be passed.

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    Notes
    -----
    See the original papers: [1]_, [2]_, [3]_ for more details.

    Supports multi-class resampling.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.over_sampling import SMOTE # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape {}'.format(Counter(y)))
    Original dataset shape Counter({1: 900, 0: 100})
    >>> sm = SMOTE(random_state=42)
    >>> X_res, y_res = sm.fit_sample(X, y)
    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
    Resampled dataset shape Counter({0: 900, 1: 900})

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of Artificial
       Intelligence Research, 321-357, 2002.

    .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
       over-sampling method in imbalanced data sets learning," Advances in
       Intelligent Computing, 878-887, 2005.

    .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling
       for imbalanced data classification," International Journal of
       Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2011.

    """
    def __init__(self,
                 ratio='auto',
                 random_state=None,
                 k=None,
                 k_neighbors=5,
                 m=None,
                 m_neighbors=10,
                 out_step=0.5,
                 kind='regular',
                 svm_estimator=None,
                 n_jobs=1):
        super(SMOTE, self).__init__(ratio=ratio, random_state=random_state)
        self.kind = kind
        self.k = k
        self.k_neighbors = k_neighbors
        self.m = m
        self.m_neighbors = m_neighbors
        self.out_step = out_step
        self.svm_estimator = svm_estimator
        self.n_jobs = n_jobs
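    # The borderline and SVM variants rely on the danger/noise test below:
    # among the m = m_neighbors nearest neighbours of a minority sample,
    # let m' be the number belonging to another class. The sample is "in
    # danger" when m/2 <= m' < m (borderline region) and "noise" when
    # m' == m (every neighbour is from another class). For example, with
    # m = 5, m' in {3, 4} flags danger and m' == 5 flags noise.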
    def _in_danger_noise(self, samples, target_class, y, kind='danger'):
        """Estimate if a set of samples are in danger or noise.

        Parameters
        ----------
        samples : ndarray, shape (n_samples, n_features)
            The samples to check if either they are in danger or not.

        target_class : int or str
            The target corresponding class being over-sampled.

        y : ndarray, shape (n_samples, )
            The true label in order to check the neighbour labels.

        kind : str, optional (default='danger')
            The type of classification to use. Can be either:

            - If 'danger', check if samples are in danger,
            - If 'noise', check if samples are noise.

        Returns
        -------
        output : ndarray, shape (n_samples, )
            A boolean array where True refers to samples in danger or noise.

        """
        x = self.nn_m_.kneighbors(samples, return_distance=False)[:, 1:]
        nn_label = (y[x] != target_class).astype(int)
        n_maj = np.sum(nn_label, axis=1)

        if kind == 'danger':
            # Samples are in danger for m/2 <= m' < m
            return np.bitwise_and(
                n_maj >= (self.nn_m_.n_neighbors - 1) / 2,
                n_maj < self.nn_m_.n_neighbors - 1)
        elif kind == 'noise':
            # Samples are noise for m = m'
            return n_maj == self.nn_m_.n_neighbors - 1
        else:
            raise NotImplementedError

    def _make_samples(self, X, y_type, nn_data, nn_num, n_samples,
                      step_size=1.):
        """A support function that returns artificial samples constructed
        along the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the points will be created.

        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic variables with correct length in
            a clear format.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used.

        nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in nn_data.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.

        """
        random_state = check_random_state(self.random_state)
        X_new = np.zeros((n_samples, X.shape[1]))
        # Pick `n_samples` (sample, neighbour) pairs at random and draw a
        # random step along the segment joining each pair.
        samples = random_state.randint(
            low=0, high=len(nn_num.flatten()), size=n_samples)
        steps = step_size * random_state.uniform(size=n_samples)
        rows = np.floor_divide(samples, nn_num.shape[1])
        cols = np.mod(samples, nn_num.shape[1])

        for i, (sample, row, col, step) in enumerate(zip(samples, rows,
                                                         cols, steps)):
            # Interpolate between the sample and its selected neighbour:
            # x_new = x - step * (x - x_neighbour)
            X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

        y_new = np.array([y_type] * len(X_new))
        return X_new, y_new

    def _validate_estimator(self):
        """Create the necessary objects for SMOTE."""
        # FIXME: Deprecated in 0.2, to be removed in 0.4
        deprecate_parameter(self, '0.2', 'k', 'k_neighbors')
        deprecate_parameter(self, '0.2', 'm', 'm_neighbors')

        if self.kind not in SMOTE_KIND:
            raise ValueError('Unknown kind for SMOTE algorithm.'
                             ' Choices are {}. Got {} instead.'.format(
                                 SMOTE_KIND, self.kind))

        self.nn_k_ = check_neighbors_object('k_neighbors',
                                            self.k_neighbors,
                                            additional_neighbor=1)
        self.nn_k_.set_params(**{'n_jobs': self.n_jobs})

        if self.kind != 'regular':
            self.nn_m_ = check_neighbors_object('m_neighbors',
                                                self.m_neighbors,
                                                additional_neighbor=1)
            self.nn_m_.set_params(**{'n_jobs': self.n_jobs})

        if self.kind == 'svm':
            if self.svm_estimator is None:
                self.svm_estimator_ = SVC(random_state=self.random_state)
            elif isinstance(self.svm_estimator, SVC):
                self.svm_estimator_ = self.svm_estimator
            else:
                raise_isinstance_error('svm_estimator', [SVC],
                                       self.svm_estimator)

    def _sample_regular(self, X, y):
        """Resample the dataset using the regular SMOTE implementation.

        Use the regular SMOTE algorithm proposed in [1]_.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new, )
            The corresponding label of `X_resampled`.

        References
        ----------
        .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer,
           "SMOTE: synthetic minority over-sampling technique," Journal of
           Artificial Intelligence Research, 321-357, 2002.

        """
        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.ratio_.items():
            if n_samples == 0:
                continue
            X_class = X[y == class_sample]

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_class,
                                        return_distance=False)[:, 1:]
            X_new, y_new = self._make_samples(X_class, class_sample, X_class,
                                              nns, n_samples, 1.0)

            X_resampled = np.concatenate((X_resampled, X_new), axis=0)
            y_resampled = np.concatenate((y_resampled, y_new), axis=0)

        return X_resampled, y_resampled

    def _sample_borderline(self, X, y):
        """Resample the dataset using the borderline SMOTE implementation.

        Use the borderline SMOTE algorithm proposed in [2]_. Two methods can
        be used: (i) borderline-1 or (ii) borderline-2. A nearest-neighbours
        algorithm is used to determine the samples forming the boundaries
        and will create samples next to those features depending on some
        criterion.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new, )
            The corresponding label of `X_resampled`.

        References
        ----------
        .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
           over-sampling method in imbalanced data sets learning," Advances
           in Intelligent Computing, 878-887, 2005.

        """
        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.ratio_.items():
            if n_samples == 0:
                continue
            X_class = X[y == class_sample]

            self.nn_m_.fit(X)
            danger_index = self._in_danger_noise(X_class, class_sample, y,
                                                 kind='danger')
            if not any(danger_index):
                continue

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_class[danger_index],
                                        return_distance=False)[:, 1:]

            # divergence between borderline-1 and borderline-2
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(X_class[danger_index],
                                                  class_sample, X_class,
                                                  nns, n_samples)
                X_resampled = np.concatenate((X_resampled, X_new), axis=0)
                y_resampled = np.concatenate((y_resampled, y_new), axis=0)

            else:
                random_state = check_random_state(self.random_state)
                fractions = random_state.beta(10, 10)

                # only minority
                X_new_1, y_new_1 = self._make_samples(
                    X_class[danger_index],
                    class_sample,
                    X_class,
                    nns,
                    int(fractions * (n_samples + 1)),
                    step_size=1.)

                # we use a one-vs-rest policy to handle the multiclass in
                # which new samples will be created considering not only the
                # majority class but all other classes
                X_new_2, y_new_2 = self._make_samples(
                    X_class[danger_index],
                    class_sample,
                    X[y != class_sample],
                    nns,
                    int((1 - fractions) * n_samples),
                    step_size=0.5)

                # Concatenate the newly generated samples to the original
                # data set
                X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2),
                                             axis=0)
                y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2),
                                             axis=0)

        return X_resampled, y_resampled

    def _sample_svm(self, X, y):
        """Resample the dataset using the SVM SMOTE implementation.

        Use the SVM SMOTE algorithm proposed in [3]_. An SVM classifier
        detects support vectors to get a notion of the boundary.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new, )
            The corresponding label of `X_resampled`.

        References
        ----------
        .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline
           over-sampling for imbalanced data classification," International
           Journal of Knowledge Engineering and Soft Data Paradigms, 3(1),
           pp.4-21, 2011.

        """
        random_state = check_random_state(self.random_state)
        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.ratio_.items():
            if n_samples == 0:
                continue
            X_class = X[y == class_sample]

            self.svm_estimator_.fit(X, y)
            support_index = self.svm_estimator_.support_[
                y[self.svm_estimator_.support_] == class_sample]
            support_vector = X[support_index]

            self.nn_m_.fit(X)
            noise_bool = self._in_danger_noise(support_vector, class_sample,
                                               y, kind='noise')
            support_vector = support_vector[np.logical_not(noise_bool)]
            danger_bool = self._in_danger_noise(support_vector, class_sample,
                                                y, kind='danger')
            safety_bool = np.logical_not(danger_bool)

            self.nn_k_.fit(X_class)
            fractions = random_state.beta(10, 10)

            if np.count_nonzero(danger_bool) > 0:
                nns = self.nn_k_.kneighbors(support_vector[danger_bool],
                                            return_distance=False)[:, 1:]

                X_new_1, y_new_1 = self._make_samples(
                    support_vector[danger_bool],
                    class_sample,
                    X_class,
                    nns,
                    int(fractions * (n_samples + 1)),
                    step_size=1.)
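            # Danger support vectors (above) are interpolated towards
            # minority neighbours with step_size=1., whereas safe support
            # vectors (below) are extrapolated away from them with a
            # negative step (-out_step), growing the minority region
            # outwards.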
            if np.count_nonzero(safety_bool) > 0:
                nns = self.nn_k_.kneighbors(support_vector[safety_bool],
                                            return_distance=False)[:, 1:]

                X_new_2, y_new_2 = self._make_samples(
                    support_vector[safety_bool],
                    class_sample,
                    X_class,
                    nns,
                    int((1 - fractions) * n_samples),
                    step_size=-self.out_step)

            if (np.count_nonzero(danger_bool) > 0 and
                    np.count_nonzero(safety_bool) > 0):
                X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2),
                                             axis=0)
                y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2),
                                             axis=0)
            elif np.count_nonzero(danger_bool) == 0:
                X_resampled = np.concatenate((X_resampled, X_new_2), axis=0)
                y_resampled = np.concatenate((y_resampled, y_new_2), axis=0)
            elif np.count_nonzero(safety_bool) == 0:
                X_resampled = np.concatenate((X_resampled, X_new_1), axis=0)
                y_resampled = np.concatenate((y_resampled, y_new_1), axis=0)

        return X_resampled, y_resampled

    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new, )
            The corresponding label of `X_resampled`.

        """
        self._validate_estimator()

        if self.kind == 'regular':
            return self._sample_regular(X, y)
        elif self.kind == 'borderline1' or self.kind == 'borderline2':
            return self._sample_borderline(X, y)
        elif self.kind == 'svm':
            return self._sample_svm(X, y)
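
The listing above ends the class. The following is a minimal usage sketch of
the variants it implements, assuming the 0.3-era ``fit_sample`` API shown in
the class docstring; the dataset and parameter values are illustrative only,
not part of the source.

    from collections import Counter

    from sklearn.datasets import make_classification
    from imblearn.over_sampling import SMOTE

    X, y = make_classification(n_classes=2, weights=[0.1, 0.9], flip_y=0,
                               n_features=20, n_samples=1000,
                               random_state=10)
    print('Original:', Counter(y))

    # Regular SMOTE: interpolate between each minority sample and one of
    # its k_neighbors nearest minority neighbours.
    X_reg, y_reg = SMOTE(kind='regular', random_state=42).fit_sample(X, y)

    # Borderline-1: only minority samples flagged as "in danger" by the
    # m_neighbors test seed new synthetic points.
    X_b1, y_b1 = SMOTE(kind='borderline1', m_neighbors=10,
                       random_state=42).fit_sample(X, y)

    # SVM-SMOTE: support vectors of an SVC seed interpolation (danger) and
    # extrapolation (safe, controlled by out_step).
    X_svm, y_svm = SMOTE(kind='svm', out_step=0.5,
                         random_state=42).fit_sample(X, y)

    for name, y_res in [('regular', y_reg), ('borderline1', y_b1),
                        ('svm', y_svm)]:
        print(name, Counter(y_res))

All keyword arguments used here come from ``__init__`` above; in each case
the targeted classes are over-sampled until they match the majority count,
as in the docstring example.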