Source code for asreview.models.query.mixed

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["MixedQuery", "MaxRandomQuery", "MaxUncertaintyQuery"]

import numpy as np

from asreview.models.query.base import BaseQueryStrategy
from asreview.models.query.utils import get_query_model
from asreview.utils import get_random_state


def _parse_mixed_kwargs(kwargs, strategy_name):
    kwargs_new = {}
    for key, value in kwargs.items():
        if key.startswith(strategy_name):
            new_key = key[len(strategy_name) + 1 :]
            kwargs_new[new_key] = value

    return kwargs_new


class MixedQuery(BaseQueryStrategy):
    """Mixed query strategy.

    Use two different query strategies at the same time with a ratio of one
    to the other.

    Both strategies produce their own ranking of the instances. The final
    ranking is built position by position: for each position, the next
    instance of strategy 1 is taken with probability ``mix_ratio`` and the
    next instance of strategy 2 otherwise. Instances that were already
    selected are skipped when they show up a second time. For example,
    mixing max and random sampling with a mix ratio of 0.95 is called the
    `max_random` query strategy.

    Arguments
    ---------
    strategy_1: str
        Name of the first query strategy. Default 'max'.
    strategy_2: str
        Name of the second query strategy. Default 'random'
    mix_ratio: float
        Sampling from strategy_1 and strategy_2 according a Bernoulli
        distribution. E.g. for mix_ratio=0.95, this implies strategy_1
        with probability 0.95 and strategy_2 with probability 0.05.
        Default 0.95.
    random_state:
        Seed for the numpy random number generator; it is handed to
        ``get_random_state``. Default None.
    **kwargs: dict
        Keyword arguments for the two strategies. To specify which of the
        strategies an argument is for, prepend it with the name of the
        query strategy and an underscore, e.g. ``max_<parameter>`` for an
        argument of the 'max' strategy.
    """

    def __init__(
        self,
        strategy_1="max",
        strategy_2="random",
        mix_ratio=0.95,
        random_state=None,
        **kwargs
    ):
        """Initialize the Mixed query strategy."""
        super().__init__()

        self.strategy_1 = strategy_1
        self.strategy_2 = strategy_2
        self.mix_ratio = mix_ratio
        self._random_state = get_random_state(random_state)

        self.kwargs_1 = _parse_mixed_kwargs(kwargs, strategy_1)
        self.kwargs_2 = _parse_mixed_kwargs(kwargs, strategy_2)

        self.query_model1 = self._build_query_model(strategy_1, self.kwargs_1)
        self.query_model2 = self._build_query_model(strategy_2, self.kwargs_2)

    def _build_query_model(self, strategy, strategy_kwargs):
        """Create a sub-strategy, sharing this object's random state if it takes one."""
        model = get_query_model(strategy, **strategy_kwargs)
        # The model has to exist before its parameters can be inspected, so
        # it is rebuilt when it turns out to accept a ``random_state``.
        if "random_state" in model.default_param:
            model = get_query_model(
                strategy, random_state=self._random_state, **strategy_kwargs
            )
        return model

    @staticmethod
    def _rank_with(query_model, X, classifier, predictions, n_instances):
        """Return the ranking of one sub-strategy.

        Strategies exposing ``_query`` are ranked from the already computed
        predictions; strategies without it (the random strategy, for
        example) are asked through their public ``query`` method.
        """
        # Only the attribute lookup is guarded, so an AttributeError raised
        # *inside* ``_query`` is not silently turned into a fallback call.
        try:
            rank_from_predictions = query_model._query
        except AttributeError:
            return query_model.query(
                X, classifier, n_instances=n_instances, return_classifier_scores=False
            )
        return rank_from_predictions(predictions, n_instances=n_instances)

    def query(
        self, X, classifier, n_instances=None, return_classifier_scores=False, **kwargs
    ):
        """Rank instances by mixing the rankings of both strategies.

        Arguments
        ---------
        X:
            Feature matrix; ``X.shape[0]`` is used as the number of
            instances when ``n_instances`` is None.
        classifier:
            Object providing ``predict_proba(X)``.
        n_instances: int
            Maximum number of instances to return. Default None, meaning
            ``X.shape[0]``.
        return_classifier_scores: bool
            If True, also return the output of ``predict_proba``.
        **kwargs: dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        list or (list, predictions):
            The mixed ranking without duplicates, truncated to
            ``n_instances``; together with the classifier predictions when
            ``return_classifier_scores`` is True.
        """
        # set the number of instances to len(X) if None
        if n_instances is None:
            n_instances = X.shape[0]

        # compute the predictions
        predictions = classifier.predict_proba(X)

        # Strategy 1 is always queried before strategy 2: both may draw from
        # the shared random state, so the order affects reproducibility.
        query_idx_1 = self._rank_with(
            self.query_model1, X, classifier, predictions, n_instances
        )
        query_idx_2 = self._rank_with(
            self.query_model2, X, classifier, predictions, n_instances
        )

        # mix the 2 query strategies into one list; stop as soon as one of
        # the rankings is used up
        query_idx_mix = []
        n_1, n_2 = len(query_idx_1), len(query_idx_2)
        i = j = 0
        while i < n_1 and j < n_2:
            if self._random_state.rand() < self.mix_ratio:
                query_idx_mix.append(query_idx_1[i])
                i += 1
            else:
                query_idx_mix.append(query_idx_2[j])
                j += 1

        # keep the first occurrence of every instance, in mixing order
        first_positions = np.unique(query_idx_mix, return_index=True)[1]
        ranking = [query_idx_mix[pos] for pos in sorted(first_positions)][:n_instances]

        if return_classifier_scores:
            return ranking, predictions
        return ranking

    def full_hyper_space(self):
        """Return the combined hyperparameter space of both sub-strategies.

        Keys of the sub-strategies are renamed from ``qry_<parameter>`` to
        ``qry_<strategy>_<parameter>``; ``qry_mix_ratio`` is added on top.

        Returns
        -------
        (dict, dict):
            Parameter space and hyper choices.
        """
        from hyperopt import hp

        space_1, choices_1 = self.query_model1.hyper_space()
        space_2, choices_2 = self.query_model2.hyper_space()

        parameter_space = {}
        hyper_choices = {}
        for strategy, space, choices in (
            (self.strategy_1, space_1, choices_1),
            (self.strategy_2, space_2, choices_2),
        ):
            for key, value in space.items():
                # key[4:] drops the leading "qry_"; the underscore after the
                # strategy name is required for _parse_mixed_kwargs to route
                # the parameter back to the right strategy.
                new_key = "qry_" + strategy + "_" + key[4:]
                parameter_space[new_key] = value
                hyper_choices[new_key] = choices[key]

        parameter_space["qry_mix_ratio"] = hp.uniform("qry_mix_ratio", 0, 1)

        return parameter_space, hyper_choices

    @property
    def name(self):
        """Name of the mixed strategy: both strategy names joined by an underscore."""
        return "_".join([self.strategy_1, self.strategy_2])
class MaxRandomQuery(MixedQuery):
    """Mixed (95% Maximum and 5% Random) query strategy (``max_random``).

    Combines the maximum and the random query strategy. With the default
    mix ratio of 0.95, the maximum strategy is used with probability 0.95
    and the random strategy with probability 0.05.
    """

    name = "max_random"
    label = "Mixed (95% Maximum and 5% Random)"

    def __init__(self, mix_ratio=0.95, random_state=None, **kwargs):
        """Initialize the Mixed (Maximum and Random) query strategy."""
        super().__init__(
            strategy_1="max",
            strategy_2="random",
            mix_ratio=mix_ratio,
            random_state=random_state,
            **kwargs
        )
class MaxUncertaintyQuery(MixedQuery):
    """Mixed (95% Maximum and 5% Uncertainty) query strategy (``max_uncertainty``).

    Combines the maximum and the uncertainty query strategy. With the
    default mix ratio of 0.95, the maximum strategy is used with probability
    0.95 and the uncertainty strategy with probability 0.05.
    """

    name = "max_uncertainty"
    label = "Mixed (95% Maximum and 5% Uncertainty)"

    def __init__(self, mix_ratio=0.95, random_state=None, **kwargs):
        """Initialize the Mixed (Maximum and Uncertainty) query strategy."""
        super().__init__(
            strategy_1="max",
            strategy_2="uncertainty",
            mix_ratio=mix_ratio,
            random_state=random_state,
            **kwargs
        )