# Source code for asreview.models.balance.undersample

# Copyright 2019-2020 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from math import ceil

import numpy as np

from asreview.models.balance.base import BaseBalance
from asreview.utils import get_random_state


class UndersampleBalance(BaseBalance):
    """Balancing class that undersamples the data with a given ratio.

    This undersamples the data, leaving out excluded papers so that the
    included and excluded papers are in some particular ratio (closer to
    one).

    Arguments
    ---------
    ratio: double
        Target ratio of ones (included papers) to zeros (excluded papers)
        in the resampled training set. All the ones are always kept; when
        the zeros are in excess, only ``ceil(n_one / ratio)`` of them are
        drawn. For example, with a ratio of 0.25 at most four zeros are
        kept for every one.
    random_state: optional
        Forwarded to ``get_random_state``; the object it returns drives
        the random selection of zeros and the final shuffle.
    """

    name = "undersample"

    def __init__(self, ratio=1.0, random_state=None):
        """Initialize the undersampling balance strategy."""
        super().__init__()
        self.ratio = ratio
        self._random_state = get_random_state(random_state)

    def sample(self, X, y, train_idx, shared):
        """Resample the training data.

        Arguments
        ---------
        X: numpy.ndarray
            Complete feature matrix.
        y: numpy.ndarray
            Labels for all papers.
        train_idx: numpy.ndarray
            Training indices, that is all papers that have been reviewed.
        shared: dict
            Dictionary to share data between balancing models and other
            models. It is not used by this strategy.

        Returns
        -------
        numpy.ndarray,numpy.ndarray:
            X_train, y_train: the resampled matrix, labels.
        """
        one_ind = train_idx[np.where(y[train_idx] == 1)]
        zero_ind = train_idx[np.where(y[train_idx] == 0)]

        n_one = len(one_ind)
        n_zero = len(zero_ind)

        # If we don't have an excess of 0's, give back all training samples.
        # Checking n_zero first avoids a ZeroDivisionError when the training
        # set contains no zeros at all.
        if n_zero == 0 or n_one / n_zero >= self.ratio:
            shuf_ind = np.append(one_ind, zero_ind)
        else:
            # Keep just enough zeros to reach the requested ratio. In this
            # branch n_one / ratio < n_zero, so the draw without replacement
            # never asks for more zeros than are available.
            n_zero_epoch = ceil(n_one / self.ratio)
            zero_under = self._random_state.choice(
                np.arange(n_zero), n_zero_epoch, replace=False)
            shuf_ind = np.append(one_ind, zero_ind[zero_under])

        # Shuffle in place so ones and zeros are interleaved.
        self._random_state.shuffle(shuf_ind)
        return X[shuf_ind], y[shuf_ind]

    def full_hyper_space(self):
        """Return the hyperparameter search space of this strategy.

        Returns
        -------
        dict, dict:
            A dictionary with a log-normal prior for ``bal_ratio``, and an
            empty dictionary.
        """
        # Imported here so hyperopt is only needed when this method is called.
        from hyperopt import hp

        parameter_space = {
            "bal_ratio": hp.lognormal('bal_ratio', 0, 1),
        }
        return parameter_space, {}