# Copyright 2019-2020 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from asreview.compat import convert_id_to_idx
from asreview.init_sampling import sample_prior_knowledge
from asreview.review import BaseReview
class ReviewSimulate(BaseReview):
    """ASReview Simulation mode class.

    Simulates a review on a fully labeled dataset: the labels of queried
    records are looked up in ``self.y`` instead of being asked from a
    reviewer (see ``_get_labels``).

    The indices the review starts with are chosen in this order:

    1. ``prior_idx``, if it is given and not empty;
    2. otherwise ``as_data.prior_data_idx``;
    3. if that is empty too and ``n_prior_included + n_prior_excluded > 0``,
       indices sampled with ``sample_prior_knowledge``.

    All positional and keyword arguments not listed in the signature of
    ``__init__`` are forwarded unchanged to ``BaseReview.__init__``.

    Arguments
    ---------
    as_data: asreview.ASReviewData
        The data object which contains the text, labels, etc. Every record
        must be labeled 0 or 1.
    model: BaseModel
        Initialized model to fit the data during active learning.
        See asreview.models.utils.py for possible models.
    query_model: BaseQueryModel
        Initialized model to query new instances for review, such as random
        sampling or max sampling.
        See asreview.query_strategies.utils.py for query models.
    balance_model: BaseBalanceModel
        Initialized model to redistribute the training data during the
        active learning process. They might either resample or undersample
        specific papers.
    feature_model: BaseFeatureModel
        Feature extraction model that converts texts and keywords to
        feature matrices.
    n_prior_included: int
        Sample n prior included papers.
    n_prior_excluded: int
        Sample n prior excluded papers.
    prior_idx: list, numpy.ndarray
        Prior indices by row number. ``None`` or an empty sequence means
        "not given".
    n_papers: int
        Number of papers to review during the active learning process,
        excluding the number of initial priors. To review all papers, set
        n_papers to None.
    n_instances: int
        Number of papers to query at each step in the active learning
        process.
    n_queries: int
        Number of steps/queries to perform. Set to None for no limit.
    start_idx: numpy.ndarray
        Start the simulation/review with these indices. They are assumed to
        be already labeled; failing to do so might result in bad behaviour.
        This value is computed by this class (see the order above) and
        passed to ``BaseReview``; callers must not pass it themselves,
        doing so raises a TypeError for the duplicated keyword.
    init_seed: int
        Seed for sampling the prior indices. Only used when the prior
        indices are sampled, i.e. it is ignored when ``prior_idx`` holds
        one or more indices or when the dataset provides prior indices.
    state_file: str
        Path to state file. Replaces log_file argument.

    Raises
    ------
    ValueError
        If the dataset has no labels or is only partly labeled.
    """

    name = "simulate"

    def __init__(self,
                 as_data,
                 *args,
                 n_prior_included=0,
                 n_prior_excluded=0,
                 prior_idx=None,
                 init_seed=None,
                 **kwargs):
        """Validate the labels, pick the start indices and set up the review.

        See the class docstring for the meaning of the arguments.
        """
        self.n_prior_included = n_prior_included
        self.n_prior_excluded = n_prior_excluded

        # A simulation needs the true label of every record. Missing labels
        # (None) are reported with the same error as a partly labeled
        # dataset instead of failing later on ``len(None)``.
        labels = as_data.labels
        if labels is None:
            raise ValueError("Expected fully labeled dataset.")
        n_labeled = np.count_nonzero((labels == 0) | (labels == 1))
        if n_labeled != len(labels):
            raise ValueError("Expected fully labeled dataset.")

        if prior_idx is not None and len(prior_idx) != 0:
            # Explicit prior indices take precedence over everything else.
            start_idx = prior_idx
        else:
            start_idx = as_data.prior_data_idx
            # ``len(...) == 0`` rather than truthiness: the indices may be a
            # numpy array, whose truth value is ambiguous.
            if (len(start_idx) == 0
                    and n_prior_included + n_prior_excluded > 0):
                start_idx = sample_prior_knowledge(labels,
                                                   n_prior_included,
                                                   n_prior_excluded,
                                                   random_state=init_seed)

        super().__init__(as_data, *args, start_idx=start_idx, **kwargs)

    def _get_labels(self, ind):
        """Get the labels directly from memory.

        Arguments
        ---------
        ind: list, numpy.ndarray
            A list with indices.

        Returns
        -------
        list, numpy.ndarray
            The corresponding true labels for each index.
        """
        # The trailing comma indexes with the 1-tuple ``(ind,)``. Kept as is
        # so that ``ind`` is applied to the first axis only (assuming
        # ``self.y`` is a numpy array -- TODO confirm in BaseReview).
        return self.y[ind, ]