Source code for asreview.models.balance.triple

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = []

import logging

import numpy as np

from asreview.models.balance.base import BaseBalance
from asreview.models.balance.double import DoubleBalance
from asreview.models.balance.double import _one_weight
from asreview.models.balance.double import _zero_weight
from asreview.models.balance.double import fill_training
from asreview.models.balance.double import random_round
from asreview.utils import get_random_state



[docs]
class TripleBalance(BaseBalance):
    """Triple balance strategy (``triple``).

    Broken. Only for internal and experimental use.

    This divides the training data into three sets: included papers, excluded
    papers found with random sampling and papers found with max sampling. They
    are balanced according to formulas depending on the percentage of papers
    read in the dataset, the number of papers with random/max sampling etc.
    Works best for stochastic training algorithms. Reduces to both full
    sampling and undersampling with corresponding parameters.

    Arguments
    ---------
    a: float
        Governs the weight of the 1's. Higher values mean linearly more 1's
        in your training sample.
    alpha: float
        Governs the scaling the weight of the 1's, as a function of the
        ratio of ones to zeros. A positive value means that the lower the
        ratio of zeros to ones, the higher the weight of the ones.
    b: float
        Governs how strongly we want to sample depending on the total
        number of samples. A value of 1 means no dependence on the total
        number of samples, while lower values mean increasingly stronger
        dependence on the number of samples.
    beta: float
        Governs the scaling of the weight of the zeros depending on the
        number of samples. Higher values means that larger samples are more
        strongly penalizing zeros.
    c: float
        Value between one and zero that governs the weight of samples done
        with maximal sampling. Higher values mean higher weight.
    gamma: float
        Governs the scaling of the weight of the max samples as a function
        of the % of papers read. Higher values mean stronger scaling.
    """

    name = "triple"
    label = "Dynamic resampling (Triple)"

    def __init__(
        self,
        a=2.155,
        alpha=0.94,
        b=0.789,
        beta=1.0,
        c=0.835,
        gamma=2.0,
        shuffle=True,
        random_state=None,
    ):
        """Initialize the triple balance strategy."""

        super().__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        self.c = c
        self.gamma = gamma
        self.shuffle = shuffle
        self.fallback_model = DoubleBalance(
            a=a, alpha=alpha, b=b, beta=beta, random_state=random_state
        )
        self._random_state = get_random_state(random_state)


[docs]
    def sample(self, X, y, train_idx, shared):
        """Resample the training data.

        Arguments
        ---------
        X: numpy.ndarray
            Complete feature matrix.
        y: numpy.ndarray
            Labels for all papers.
        train_idx: numpy.ndarray
            Training indices, that is all papers that have been reviewed.
        shared: dict
            Dictionary to share data between balancing models and other models.

        Returns
        -------
        numpy.ndarray,numpy.ndarray:
            X_train, y_train: the resampled matrix, labels.
        """
        max_idx = np.array(shared["query_src"].get("max", []), dtype=int)
        rand_idx = np.array([], dtype=int)
        for qtype in shared["query_src"]:
            if qtype != "max":
                rand_idx = np.append(rand_idx, shared["query_src"][qtype])

        rand_idx = rand_idx.astype(int)
        # Write them back for next round.
        if self.shuffle:
            self._random_state.shuffle(rand_idx)
            self._random_state.shuffle(max_idx)

        if len(rand_idx) == 0 or len(max_idx) == 0:
            logging.debug(
                "Warning: trying to use triple balance, but unable"
                f" to, because we have {len(max_idx)} max samples "
                f"and {len(rand_idx)} random samples."
            )
            return self.fallback_model.sample(X, y, train_idx, shared)

        # Split the idx into three groups: 1's, random 0's, max 0's.
        one_idx = train_idx[np.where(y[train_idx] == 1)]
        zero_max_idx = max_idx[np.where(y[max_idx] == 0)]
        zero_rand_idx = rand_idx[np.where(y[rand_idx] == 0)]

        if len(zero_rand_idx) == 0 or len(zero_max_idx) == 0:
            logging.debug(
                "Warning: trying to use triple balance, but unable "
                f"to, because we have {len(zero_max_idx)} zero max"
                f"samples and {len(zero_rand_idx)} random samples."
            )
            return self.fallback_model.sample(X, y, train_idx, shared)

        n_one = len(one_idx)
        n_zero_rand = len(zero_rand_idx)
        n_zero_max = len(zero_max_idx)
        n_samples = len(y)
        n_train = len(train_idx)

        # Get the distribution of 1's, and random 0's and max 0's.
        n_one_train, n_zero_rand_train, n_zero_max_train = _get_triple_dist(
            n_one,
            n_zero_rand,
            n_zero_max,
            n_samples,
            n_train,
            self.a,
            self.alpha,
            self.b,
            self.beta,
            self.c,
            self.gamma,
            self._random_state,
        )
        logging.debug(
            f"(1, 0_rand, 0_max) = ({n_one_train}, "
            f"{n_zero_rand_train}, {n_zero_max_train})"
        )

        one_train_idx = fill_training(one_idx, n_one_train, self._random_state)
        zero_rand_train_idx = fill_training(
            zero_rand_idx, n_zero_rand_train, self._random_state
        )
        zero_max_train_idx = fill_training(
            zero_max_idx, n_zero_max_train, self._random_state
        )

        all_idx = np.concatenate(
            [one_train_idx, zero_rand_train_idx, zero_max_train_idx]
        )
        self._random_state.shuffle(all_idx)

        return X[all_idx], y[all_idx]




def _zero_max_weight(fraction_read, c, gamma):
    """
    Get the weight ratio between ones and zeros.

    Parameters
    ----------
    beta:
        Exponent governing decay of 1's to 0's.
    delta:
        Asymptotic ratio for infinite number of samples.

    Returns
    -------
    float:
        Weight ratio between 1's and 0's
    """
    weight = 1 - (1 - c) * (1 - fraction_read) ** gamma
    return weight


def _get_triple_dist(
    n_one,
    n_zero_rand,
    n_zero_max,
    n_samples,
    n_train,
    one_a,
    one_alpha,
    zero_b,
    zero_beta,
    zero_max_c,
    zero_max_gamma,
    random_state,
):
    "Get the number of 1's, random 0's and max 0's in each mini epoch."
    n_zero = n_zero_rand + n_zero_max
    n_read = n_one + n_zero
    one_weight = _one_weight(n_one, n_zero, one_a, one_alpha)
    zero_weight = _zero_weight(n_read, zero_b, zero_beta)
    zero_max_weight = _zero_max_weight(n_read / n_samples, zero_max_c, zero_max_gamma)

    tot_zo_weight = one_weight * n_one + zero_weight * n_zero

    n_one_train = random_round(
        one_weight * n_one * n_train / tot_zo_weight, random_state
    )
    n_one_train = max(1, min(n_train - 2, n_one_train))
    n_zero_train = n_train - n_one_train

    tot_rm_weight = 1 * n_zero_rand + zero_max_weight * n_zero_max
    n_zero_rand_train = random_round(
        n_zero_train * 1 * n_zero_rand / tot_rm_weight, random_state
    )
    n_zero_rand_train = max(1, min(n_zero_rand - 1, n_zero_rand_train))
    n_zero_max_train = n_zero_train - n_zero_rand_train

    return n_one_train, n_zero_rand_train, n_zero_max_train