# Source code for asreview.models.query.mixed

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["MixedQuery", "MaxRandomQuery", "MaxUncertaintyQuery"]

import numpy as np

from asreview.models.query.base import BaseQueryStrategy
from asreview.models.query.utils import get_query_model
from asreview.utils import get_random_state


def _parse_mixed_kwargs(kwargs, strategy_name):
    kwargs_new = {}
    for key, value in kwargs.items():
        if key.startswith(strategy_name):
            new_key = key[len(strategy_name) + 1 :]
            kwargs_new[new_key] = value

    return kwargs_new



[docs]
class MixedQuery(BaseQueryStrategy):
    """Mixed query strategy.

    Use two different query strategies at the same time with a
    ratio of one to the other. For example, mixing max and random sampling
    with a mix ratio of 0.95 means that each position of the returned ranking
    is filled from the max query strategy with probability 0.95 and from the
    random query strategy with probability 0.05. It would be called the
    `max_random` query strategy. Every combination of primitive query
    strategies is possible.

    Arguments
    ---------
    strategy_1: str
        Name of the first query strategy. Default 'max'.
    strategy_2: str
        Name of the second query strategy. Default 'random'.
    mix_ratio: float
        Sampling from strategy_1 and strategy_2 according to a Bernoulli
        distribution. E.g. for mix_ratio=0.95, this implies strategy_1
        with probability 0.95 and strategy_2 with probability 0.05.
        Default 0.95.
    random_state:
        Seed or random state, converted with ``get_random_state``. The
        resulting generator drives the mixing and is handed to every
        sub-strategy that lists ``random_state`` among its default parameters.
    **kwargs: dict
        Keyword arguments for the two strategies. To specify which of the
        strategies an argument is for, prepend the name of the query
        strategy and an underscore, e.g. ``max_<parameter>`` for an argument
        of the 'max' strategy.
    """

    def __init__(
        self,
        strategy_1="max",
        strategy_2="random",
        mix_ratio=0.95,
        random_state=None,
        **kwargs,
    ):
        """Initialize the Mixed query strategy."""
        super().__init__()

        self.strategy_1 = strategy_1
        self.strategy_2 = strategy_2

        self.mix_ratio = mix_ratio
        self._random_state = get_random_state(random_state)

        self.kwargs_1 = _parse_mixed_kwargs(kwargs, strategy_1)
        self.kwargs_2 = _parse_mixed_kwargs(kwargs, strategy_2)

        self.query_model1 = self._build_query_model(strategy_1, self.kwargs_1)
        self.query_model2 = self._build_query_model(strategy_2, self.kwargs_2)

    def _build_query_model(self, strategy, strategy_kwargs):
        """Create a sub-strategy that shares this object's random state if it can."""
        model = get_query_model(strategy, **strategy_kwargs)
        # Whether the strategy accepts a random state is only known from the
        # instance's default parameters, so it is built once to inspect them
        # and rebuilt with the shared random state when applicable.
        if "random_state" in model.default_param:
            model = get_query_model(
                strategy, random_state=self._random_state, **strategy_kwargs
            )
        return model

    @staticmethod
    def _rank_with(query_model, X, classifier, predictions, n_instances):
        """Rank instances with one sub-strategy, reusing predictions if possible."""
        # Strategies exposing ``_query`` work on the already computed
        # predictions; others (random, for example) only offer ``query``.
        # The attribute is looked up explicitly instead of catching
        # AttributeError around the call, so that an AttributeError raised
        # *inside* a sub-strategy is not silently turned into a fallback.
        proba_query = getattr(query_model, "_query", None)
        if proba_query is None:
            return query_model.query(
                X, classifier, n_instances=n_instances, return_classifier_scores=False
            )
        return proba_query(predictions, n_instances=n_instances)

    def query(
        self, X, classifier, n_instances=None, return_classifier_scores=False, **kwargs
    ):
        """Rank instances by randomly interleaving the two sub-strategies.

        Arguments
        ---------
        X:
            Feature matrix; ``X.shape[0]`` is used as the number of instances
            when ``n_instances`` is None.
        classifier:
            Object providing ``predict_proba(X)``.
        n_instances: int
            Maximum length of the returned ranking. Default: all instances.
        return_classifier_scores: bool
            If True, also return the classifier predictions.
        **kwargs: dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        list or tuple
            The mixed ranking without duplicates, or ``(ranking, predictions)``
            when ``return_classifier_scores`` is True.
        """
        # set the number of instances to len(X) if None
        if n_instances is None:
            n_instances = X.shape[0]

        # compute the predictions
        predictions = classifier.predict_proba(X)

        # Rank with strategy 1 first and strategy 2 second; the order matters
        # for reproducibility when the strategies share the random state.
        query_idx_1 = self._rank_with(
            self.query_model1, X, classifier, predictions, n_instances
        )
        query_idx_2 = self._rank_with(
            self.query_model2, X, classifier, predictions, n_instances
        )

        # Mix the 2 rankings into one list: every step takes the next item of
        # strategy 1 with probability ``mix_ratio``, otherwise the next item of
        # strategy 2. The loop stops as soon as one ranking is used up; at that
        # point every item of that ranking is in the mix.
        query_idx_mix = []
        i = 0
        j = 0
        while i < len(query_idx_1) and j < len(query_idx_2):
            if self._random_state.rand() < self.mix_ratio:
                query_idx_mix.append(query_idx_1[i])
                i = i + 1
            else:
                query_idx_mix.append(query_idx_2[j])
                j = j + 1

        # Drop duplicates while keeping the first occurrence of each index
        # (dicts preserve insertion order), then cut to the requested length.
        ranking = list(dict.fromkeys(query_idx_mix))[:n_instances]

        if return_classifier_scores:
            return ranking, predictions
        return ranking

    @property
    def name(self):
        """Name of the mixed strategy: the two strategy names joined by '_'."""
        return "_".join([self.strategy_1, self.strategy_2])




[docs]
class MaxRandomQuery(MixedQuery):
    """Mixed (95% Maximum and 5% Random) query strategy (``max_random``).

    Combination of the maximum and the random query strategy. With the
    default mix ratio of 0.95, the maximum query strategy is favoured with
    probability 0.95 and the random query strategy with probability 0.05.

    Arguments
    ---------
    mix_ratio: float
        Probability of sampling from the maximum strategy. Default 0.95.
    random_state:
        Forwarded unchanged to ``MixedQuery``.
    **kwargs: dict
        Forwarded unchanged to ``MixedQuery``.
    """

    name = "max_random"
    label = "Mixed (95% Maximum and 5% Random)"

    def __init__(self, mix_ratio=0.95, random_state=None, **kwargs):
        """Initialize the Mixed (Maximum and Random) query strategy."""
        # The pair of sub-strategies is fixed; only the ratio, the random
        # state and the per-strategy keyword arguments are configurable.
        super().__init__(
            mix_ratio=mix_ratio,
            random_state=random_state,
            strategy_1="max",
            strategy_2="random",
            **kwargs,
        )




[docs]
class MaxUncertaintyQuery(MixedQuery):
    """Mixed (95% Maximum and 5% Uncertainty) query strategy (``max_uncertainty``).

    Combination of the maximum and the uncertainty query strategy. With the
    default mix ratio of 0.95, the maximum query strategy is favoured with
    probability 0.95 and the uncertainty query strategy with probability 0.05.

    Arguments
    ---------
    mix_ratio: float
        Probability of sampling from the maximum strategy. Default 0.95.
    random_state:
        Forwarded unchanged to ``MixedQuery``.
    **kwargs: dict
        Forwarded unchanged to ``MixedQuery``.
    """

    name = "max_uncertainty"
    label = "Mixed (95% Maximum and 5% Uncertainty)"

    def __init__(self, mix_ratio=0.95, random_state=None, **kwargs):
        """Initialize the Mixed (Maximum and Uncertainty) query strategy."""
        # The pair of sub-strategies is fixed; only the ratio, the random
        # state and the per-strategy keyword arguments are configurable.
        super().__init__(
            mix_ratio=mix_ratio,
            random_state=random_state,
            strategy_1="max",
            strategy_2="uncertainty",
            **kwargs,
        )