"""Class to perform over-sampling using SMOTE."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Fernando Nogueira
# Christos Aridas
# License: MIT
from __future__ import division
import numpy as np
from sklearn.svm import SVC
from sklearn.utils import check_random_state
from .base import BaseOverSampler
from ..exceptions import raise_isinstance_error
from ..utils import check_neighbors_object
from ..utils.deprecation import deprecate_parameter
# Valid values accepted for the ``kind`` parameter of :class:`SMOTE`;
# checked in ``SMOTE._validate_estimator``.
SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm')
class SMOTE(BaseOverSampler):
    """Class to perform over-sampling using SMOTE.

    This object is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variants Borderline SMOTE 1, 2 and
    SVM-SMOTE.

    Parameters
    ----------
    ratio : str, dict, or callable, optional (default='auto')
        Ratio to use for resampling the data set.

        - If ``str``, has to be one of: (i) ``'minority'``: resample the
          minority class; (ii) ``'majority'``: resample the majority class,
          (iii) ``'not minority'``: resample all classes apart from the
          minority class, (iv) ``'all'``: resample all classes, and (v)
          ``'auto'``: correspond to ``'all'`` for over-sampling methods and
          ``'not minority'`` for under-sampling methods. The classes targeted
          will be over-sampled or under-sampled to achieve an equal number of
          sample with the majority or minority class.
        - If ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples.
        - If callable, function taking ``y`` and returns a ``dict``. The keys
          correspond to the targeted classes. The values correspond to the
          desired number of samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, ``random_state`` is the seed used by the random number
        generator; If ``RandomState`` instance, random_state is the random
        number generator; If ``None``, the random number generator is the
        ``RandomState`` instance used by ``np.random``.

    k : int, optional (default=None)
        Number of nearest neighbours to use to construct synthetic samples.

        .. deprecated:: 0.2
           ``k`` is deprecated from 0.2 and will be replaced in 0.4. Use
           ``k_neighbors`` instead.

    k_neighbors : int or object, optional (default=5)
        If ``int``, number of nearest neighbours to use to construct
        synthetic samples. If object, an estimator that inherits from
        :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors.

    m : int, optional (default=None)
        Number of nearest neighbours to use to determine if a minority sample
        is in danger. Used with ``kind={'borderline1', 'borderline2',
        'svm'}``.

        .. deprecated:: 0.2
           ``m`` is deprecated from 0.2 and will be replaced in 0.4. Use
           ``m_neighbors`` instead.

    m_neighbors : int or object, optional (default=10)
        If int, number of nearest neighbours to use to determine if a minority
        sample is in danger. Used with ``kind={'borderline1', 'borderline2',
        'svm'}``. If object, an estimator that inherits from
        :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used
        to find the m_neighbors.

    out_step : float, optional (default=0.5)
        Step size when extrapolating. Used with ``kind='svm'``.

    kind : str, optional (default='regular')
        The type of SMOTE algorithm to use one of the following options:
        ``'regular'``, ``'borderline1'``, ``'borderline2'``, ``'svm'``.

    svm_estimator : object, optional (default=SVC())
        If ``kind='svm'``, a parametrized :class:`sklearn.svm.SVC`
        classifier can be passed.

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    Notes
    -----
    See the original papers: [1]_, [2]_, [3]_ for more details.

    Supports multi-class resampling.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.over_sampling import \
SMOTE # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape {}'.format(Counter(y)))
    Original dataset shape Counter({1: 900, 0: 100})
    >>> sm = SMOTE(random_state=42)
    >>> X_res, y_res = sm.fit_sample(X, y)
    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
    Resampled dataset shape Counter({0: 900, 1: 900})

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of artificial
       intelligence research, 321-357, 2002.

    .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
       over-sampling method in imbalanced data sets learning," Advances in
       intelligent computing, 878-887, 2005.

    .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for
       imbalanced data classification," International Journal of Knowledge
       Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2011.

    """

    def __init__(self,
                 ratio='auto',
                 random_state=None,
                 k=None,
                 k_neighbors=5,
                 m=None,
                 m_neighbors=10,
                 out_step=0.5,
                 kind='regular',
                 svm_estimator=None,
                 n_jobs=1):
        super(SMOTE, self).__init__(ratio=ratio, random_state=random_state)
        self.kind = kind
        self.k = k
        self.k_neighbors = k_neighbors
        self.m = m
        self.m_neighbors = m_neighbors
        self.out_step = out_step
        self.svm_estimator = svm_estimator
        self.n_jobs = n_jobs

    def _in_danger_noise(self, samples, target_class, y, kind='danger'):
        """Estimate if a set of sample are in danger or noise.

        Parameters
        ----------
        samples : ndarray, shape (n_samples, n_features)
            The samples to check if either they are in danger or not.

        target_class : int or str,
            The target corresponding class being over-sampled.

        y : ndarray, shape (n_samples, )
            The true label in order to check the neighbour labels.

        kind : str, optional (default='danger')
            The type of classification to use. Can be either:

            - If 'danger', check if samples are in danger,
            - If 'noise', check if samples are noise.

        Returns
        -------
        output : ndarray, shape (n_samples, )
            A boolean array where True refer to samples in danger or noise.

        """
        # Column 0 is each sample itself (it was part of the fitted data),
        # hence the [:, 1:] slice to keep only the actual neighbours.
        x = self.nn_m_.kneighbors(samples, return_distance=False)[:, 1:]
        nn_label = (y[x] != target_class).astype(int)
        n_maj = np.sum(nn_label, axis=1)

        if kind == 'danger':
            # Samples are in danger for m/2 <= m' < m
            return np.bitwise_and(
                n_maj >= (self.nn_m_.n_neighbors - 1) / 2,
                n_maj < self.nn_m_.n_neighbors - 1)
        elif kind == 'noise':
            # Samples are noise for m = m'
            return n_maj == self.nn_m_.n_neighbors - 1
        else:
            raise NotImplementedError

    def _make_samples(self,
                      X,
                      y_type,
                      nn_data,
                      nn_num,
                      n_samples,
                      step_size=1.):
        """A support function that returns artificial samples constructed along
        the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the points will be created.

        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic variables with correct length in
            a clear format.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used

        nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in nn_data.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.

        """
        random_state = check_random_state(self.random_state)
        X_new = np.zeros((n_samples, X.shape[1]))

        # Draw a flat index into the (sample, neighbour) grid, then decode it
        # into the sample row and the neighbour column.
        samples = random_state.randint(
            low=0, high=len(nn_num.flatten()), size=n_samples)
        steps = step_size * random_state.uniform(size=n_samples)
        rows = np.floor_divide(samples, nn_num.shape[1])
        cols = np.mod(samples, nn_num.shape[1])

        for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
            # Interpolate (or extrapolate for a negative step) between the
            # sample and its selected neighbour.
            X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

        y_new = np.array([y_type] * len(X_new))
        return X_new, y_new

    def _validate_estimator(self):
        """Create the necessary objects for SMOTE."""
        # FIXME Deprecated in 0.2, to be removed in 0.4
        deprecate_parameter(self, '0.2', 'k', 'k_neighbors')
        deprecate_parameter(self, '0.2', 'm', 'm_neighbors')

        if self.kind not in SMOTE_KIND:
            raise ValueError('Unknown kind for SMOTE algorithm.'
                             ' Choices are {}. Got {} instead.'.format(
                                 SMOTE_KIND, self.kind))

        # One extra neighbour is requested since the query point itself is
        # returned by kneighbors and discarded afterwards.
        self.nn_k_ = check_neighbors_object('k_neighbors',
                                            self.k_neighbors,
                                            additional_neighbor=1)
        self.nn_k_.set_params(**{'n_jobs': self.n_jobs})

        if self.kind != 'regular':
            self.nn_m_ = check_neighbors_object('m_neighbors',
                                                self.m_neighbors,
                                                additional_neighbor=1)
            self.nn_m_.set_params(**{'n_jobs': self.n_jobs})

        if self.kind == 'svm':
            if self.svm_estimator is None:
                self.svm_estimator_ = SVC(random_state=self.random_state)
            elif isinstance(self.svm_estimator, SVC):
                self.svm_estimator_ = self.svm_estimator
            else:
                raise_isinstance_error('svm_estimator', [SVC],
                                       self.svm_estimator)

    def _sample_regular(self, X, y):
        """Resample the dataset using the regular SMOTE implementation.

        Use the regular SMOTE algorithm proposed in [1]_.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`.

        References
        ----------
        .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer,
           "SMOTE: synthetic minority over-sampling technique," Journal of
           artificial intelligence research, 321-357, 2002.

        """
        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.ratio_.items():
            if n_samples == 0:
                continue
            X_class = X[y == class_sample]

            # Neighbours are searched within the target class only.
            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
            X_new, y_new = self._make_samples(X_class, class_sample, X_class,
                                              nns, n_samples, 1.0)

            X_resampled = np.concatenate((X_resampled, X_new), axis=0)
            y_resampled = np.concatenate((y_resampled, y_new), axis=0)

        return X_resampled, y_resampled

    def _sample_borderline(self, X, y):
        """Resample the dataset using the borderline SMOTE implementation.

        Use the borderline SMOTE algorithm proposed in [2]_. Two methods can
        be used: (i) borderline-1 or (ii) borderline-2. A nearest-neighbours
        algorithm is used to determine the samples forming the boundaries and
        will create samples next to those features depending on some
        criterion.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`.

        References
        ----------
        .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
           over-sampling method in imbalanced data sets learning," Advances
           in intelligent computing, 878-887, 2005.

        """
        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.ratio_.items():
            if n_samples == 0:
                continue
            X_class = X[y == class_sample]

            # Danger is assessed against the full data set (all classes).
            self.nn_m_.fit(X)
            danger_index = self._in_danger_noise(X_class, class_sample, y,
                                                 kind='danger')
            if not any(danger_index):
                continue

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(
                X_class[danger_index], return_distance=False)[:, 1:]

            # divergence between borderline-1 and borderline-2
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(X_class[danger_index],
                                                  class_sample, X_class,
                                                  nns, n_samples)
                X_resampled = np.concatenate((X_resampled, X_new), axis=0)
                y_resampled = np.concatenate((y_resampled, y_new), axis=0)

            else:
                random_state = check_random_state(self.random_state)
                fractions = random_state.beta(10, 10)

                # only minority
                X_new_1, y_new_1 = self._make_samples(
                    X_class[danger_index], class_sample, X_class, nns,
                    int(fractions * (n_samples + 1)), step_size=1.)

                # we use a one-vs-rest policy to handle the multiclass in
                # which new samples will be created considering not only the
                # majority class but all over classes.
                X_new_2, y_new_2 = self._make_samples(
                    X_class[danger_index], class_sample, X[y != class_sample],
                    nns, int((1 - fractions) * n_samples), step_size=0.5)

                # Concatenate the newly generated samples to the original
                # data set
                X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2),
                                             axis=0)
                y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2),
                                             axis=0)

        return X_resampled, y_resampled

    def _sample_svm(self, X, y):
        """Resample the dataset using the SVM SMOTE implementation.

        Use the SVM SMOTE algorithm proposed in [3]_. A SVM classifier detect
        support vectors to get a notion of the boundary.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`.

        References
        ----------
        .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline
           over-sampling for imbalanced data classification," International
           Journal of Knowledge Engineering and Soft Data Paradigms, 3(1),
           pp.4-21, 2011.

        """
        random_state = check_random_state(self.random_state)
        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.ratio_.items():
            if n_samples == 0:
                continue
            X_class = X[y == class_sample]

            # Keep only the support vectors belonging to the class being
            # over-sampled.
            self.svm_estimator_.fit(X, y)
            support_index = self.svm_estimator_.support_[
                y[self.svm_estimator_.support_] == class_sample]
            support_vector = X[support_index]

            # Discard support vectors classified as noise, then split the
            # remainder into in-danger and safe subsets.
            self.nn_m_.fit(X)
            noise_bool = self._in_danger_noise(support_vector, class_sample,
                                               y, kind='noise')
            support_vector = support_vector[np.logical_not(noise_bool)]
            danger_bool = self._in_danger_noise(support_vector, class_sample,
                                                y, kind='danger')
            safety_bool = np.logical_not(danger_bool)

            self.nn_k_.fit(X_class)
            # Draw the split fraction before branching so the random stream
            # is consumed identically regardless of which subsets are empty.
            fractions = random_state.beta(10, 10)

            # Accumulate generated chunks; this also avoids referencing an
            # undefined array when one (or both) subsets are empty -- the
            # previous implementation raised NameError when neither danger
            # nor safe support vectors existed.
            X_news = []
            y_news = []

            if np.count_nonzero(danger_bool) > 0:
                # Interpolate between in-danger support vectors and their
                # same-class neighbours.
                nns = self.nn_k_.kneighbors(support_vector[danger_bool],
                                            return_distance=False)[:, 1:]
                X_new_1, y_new_1 = self._make_samples(
                    support_vector[danger_bool], class_sample, X_class,
                    nns, int(fractions * (n_samples + 1)), step_size=1.)
                X_news.append(X_new_1)
                y_news.append(y_new_1)

            if np.count_nonzero(safety_bool) > 0:
                # Extrapolate away from safe support vectors (negative step).
                nns = self.nn_k_.kneighbors(support_vector[safety_bool],
                                            return_distance=False)[:, 1:]
                X_new_2, y_new_2 = self._make_samples(
                    support_vector[safety_bool], class_sample, X_class,
                    nns, int((1 - fractions) * n_samples),
                    step_size=-self.out_step)
                X_news.append(X_new_2)
                y_news.append(y_new_2)

            if X_news:
                X_resampled = np.concatenate([X_resampled] + X_news, axis=0)
                y_resampled = np.concatenate([y_resampled] + y_news, axis=0)

        return X_resampled, y_resampled

    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        self._validate_estimator()

        # _validate_estimator guarantees self.kind is one of SMOTE_KIND, so
        # exactly one branch below is taken.
        if self.kind == 'regular':
            return self._sample_regular(X, y)
        elif self.kind == 'borderline1' or self.kind == 'borderline2':
            return self._sample_borderline(X, y)
        elif self.kind == 'svm':
            return self._sample_svm(X, y)