Source code for imblearn.datasets.zenodo

"""Collection of imbalanced datasets.

This collection of datasets has been proposed in [1]_. The
characteristics of the available datasets are presented in the table
below.

 ID    Name           Repository & Target           Ratio  #S       #F
 1     ecoli          UCI, target: imU              8.6:1  336      7
 2     optical_digits UCI, target: 8                9.1:1  5,620    64
 3     satimage       UCI, target: 4                9.3:1  6,435    36
 4     pen_digits     UCI, target: 5                9.4:1  10,992   16
 5     abalone        UCI, target: 7                9.7:1  4,177    10
 6     sick_euthyroid UCI, target: sick euthyroid   9.8:1  3,163    42
 7     spectrometer   UCI, target: >=44             11:1   531      93
 8     car_eval_34    UCI, target: good, v good     12:1   1,728    21
 9     isolet         UCI, target: A, B             12:1   7,797    617
 10    us_crime       UCI, target: >0.65            12:1   1,994    100
 11    yeast_ml8      LIBSVM, target: 8             13:1   2,417    103
 12    scene          LIBSVM, target: >one label    13:1   2,407    294
 13    libras_move    UCI, target: 1                14:1   360      90
 14    thyroid_sick   UCI, target: sick             15:1   3,772    52
 15    coil_2000      KDD, CoIL, target: minority   16:1   9,822    85
 16    arrhythmia     UCI, target: 06               17:1   452      278
 17    solar_flare_m0 UCI, target: M->0             19:1   1,389    32
 18    oil            UCI, target: minority         22:1   937      49
 19    car_eval_4     UCI, target: vgood            26:1   1,728    21
 20    wine_quality   UCI, wine, target: <=4        26:1   4,898    11
 21    letter_img     UCI, target: Z                26:1   20,000   16
 22    yeast_me2      UCI, target: ME2              28:1   1,484    8
 23    webpage        LIBSVM, w7a, target: minority 33:1   34,780   300
 24    ozone_level    UCI, ozone, data              34:1   2,536    72
 25    mammography    UCI, target: minority         42:1   11,183   6
 26    protein_homo   KDD CUP 2004, minority        111:1  145,751  74
 27    abalone_19     UCI, target: 19               130:1  4,177    10

References
----------
.. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
   Imbalanced Data Learning and their Application in Bioinformatics."
   Dissertation, Georgia State University, (2011).

"""

# Author: Guillaume Lemaitre
# License: BSD 3 clause

from collections import OrderedDict
import tarfile
from io import BytesIO
import logging
from os.path import join, isfile
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen

import numpy as np

from sklearn.datasets import get_data_home
from sklearn.datasets.base import Bunch
from sklearn.utils.fixes import makedirs
from sklearn.externals import six
from sklearn.utils import check_random_state

# Location of the archive bundling every benchmark dataset.
URL = ('https://zenodo.org/record/61452/files/'
       'benchmark-imbalanced-learn.tar.gz')
# Each dataset is stored in the archive as ``x<ID>data.npz``.
PRE_FILENAME = 'x'
POST_FILENAME = 'data.npz'

# Dataset names, listed in the order of their 1-based ID.
MAP_NAME_ID_KEYS = ['ecoli',
                    'optical_digits',
                    'satimage',
                    'pen_digits',
                    'abalone',
                    'sick_euthyroid',
                    'spectrometer',
                    'car_eval_34',
                    'isolet',
                    'us_crime',
                    'yeast_ml8',
                    'scene',
                    'libras_move',
                    'thyroid_sick',
                    'coil_2000',
                    'arrhythmia',
                    'solar_flare_m0',
                    'oil',
                    'car_eval_4',
                    'wine_quality',
                    'letter_img',
                    'yeast_me2',
                    'webpage',
                    'ozone_level',
                    'mammography',
                    'protein_homo',
                    'abalone_19']

# Two-way lookup between dataset name and 1-based ID. Both mappings are built
# from (key, value) pairs so that no loop variable leaks into the module
# namespace.
MAP_NAME_ID = OrderedDict(
    (name, idx) for idx, name in enumerate(MAP_NAME_ID_KEYS, 1))
MAP_ID_NAME = OrderedDict(
    (idx, name) for idx, name in enumerate(MAP_NAME_ID_KEYS, 1))

# Use a module-scoped logger rather than the root logger, so that applications
# can filter or silence the messages emitted by this module.
logger = logging.getLogger(__name__)


def fetch_datasets(data_home=None,
                   filter_data=None,
                   download_if_missing=True,
                   random_state=None,
                   shuffle=False):
    """Load the benchmark datasets from Zenodo, downloading it if necessary.

    Parameters
    ----------
    data_home : string, optional (default=None)
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    filter_data : tuple of str/int or None, optional (default=None)
        A tuple containing the ID or the name of the datasets to be returned.
        Refer to the table below to get the ID and name of the datasets. If
        None, all the datasets are returned.

    download_if_missing : boolean, optional (default=True)
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, optional (default=None)
        Random state for shuffling the dataset.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : bool, optional (default=False)
        Whether to shuffle dataset.

    Returns
    -------
    datasets : OrderedDict of Bunch object,
        The order is defined by ``filter_data``. Each Bunch object ---
        referred to as dataset --- has the following attributes:

    dataset.data : ndarray, shape (n_samples, n_features)

    dataset.target : ndarray, shape (n_samples, )

    dataset.DESCR : string
        The name of the dataset.

    Raises
    ------
    ValueError
        If an entry of ``filter_data`` is neither a known dataset name nor a
        valid dataset ID, or is not a str or an int.

    IOError
        If a dataset is not available locally and ``download_if_missing`` is
        False.

    Notes
    -----
    This collection of datasets has been proposed in [1]_. The
    characteristics of the available datasets are presented in the table
    below.

    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).

    """
    # Imported locally: ``closing`` lets the HTTP response be used as a
    # context manager on both urllib2 (Python 2) and urllib.request.
    from contextlib import closing

    data_home = get_data_home(data_home=data_home)
    zenodo_dir = join(data_home, "zenodo")
    datasets = OrderedDict()

    # Materialise the valid names/IDs as lists so that error messages show the
    # actual values instead of a ``odict_keys``/``range`` repr on Python 3.
    available_names = list(MAP_NAME_ID)
    available_ids = list(MAP_ID_NAME)

    if filter_data is None:
        filter_data_ = available_names
    else:
        # Normalise every requested entry (name or ID) to a dataset name.
        filter_data_ = []
        for it in filter_data:
            if isinstance(it, six.string_types):
                if it not in MAP_NAME_ID:
                    raise ValueError('{} is not a dataset available. '
                                     'The available datasets are {}'.format(
                                         it, available_names))
                filter_data_.append(it)
            elif isinstance(it, int):
                if it not in MAP_ID_NAME:
                    raise ValueError('The dataset with the ID={} is not an '
                                     'available dataset. The IDs are '
                                     '{}'.format(it, available_ids))
                filter_data_.append(MAP_ID_NAME[it])
            else:
                raise ValueError('The value in the tuple should be str or int.'
                                 ' Got {} instead.'.format(type(it)))

    # go through the list and check if the data are available
    for it in filter_data_:
        filename = join(
            zenodo_dir,
            PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME)

        if not isfile(filename):
            if not download_if_missing:
                raise IOError(
                    "Data not found and `download_if_missing` is False")
            makedirs(zenodo_dir, exist_ok=True)
            logger.warning("Downloading %s", URL)
            with closing(urlopen(URL)) as response:
                archive = BytesIO(response.read())
            # NOTE(review): ``extractall`` trusts the member paths stored in
            # the archive; this is only safe as long as the content served at
            # ``URL`` is trusted.
            with tarfile.open(fileobj=archive) as tar:
                tar.extractall(path=zenodo_dir)

        # The arrays are read while the archive is open; the ``with`` block
        # then releases the underlying file handle.
        with np.load(filename) as data:
            X, y = data['data'], data['label']

        if shuffle:
            ind = np.arange(X.shape[0])
            # Re-created for every dataset on purpose: with an integer seed,
            # the permutation of a dataset does not depend on which other
            # datasets were requested.
            rng = check_random_state(random_state)
            rng.shuffle(ind)
            X = X[ind]
            y = y[ind]

        datasets[it] = Bunch(data=X, target=y, DESCR=it)

    return datasets