# Source code for asreview.search

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["fuzzy_find"]

import re
from difflib import SequenceMatcher

import numpy as np
import pandas as pd

from asreview.utils import format_to_str


def _create_inverted_index(match_strings):
    index = {}
    word = re.compile(r"['\w]+")
    for i, match in enumerate(match_strings):
        tokens = word.findall(match.lower())
        for token in tokens:
            if token in index:
                if index[token][-1] != i:
                    index[token].append(i)
            else:
                index[token] = [i]
    return index


def _get_fuzzy_scores(keywords, match_strings, threshold=0.9):
    """Score a list of strings against a set of keywords.

    Each keyword is compared with every distinct token of ``match_strings``
    using ``difflib.SequenceMatcher.quick_ratio``. For every string, the best
    ratio per keyword (ignoring ratios below ``threshold``) is summed over
    all keywords and then averaged over the number of keywords.

    Arguments
    ---------
    keywords: str
        Keywords that we are trying to find in the string list.
    match_strings: sequence of str
        Strings that should be scored according to the keywords.
    threshold: float
        Minimum similarity ratio (between 0 and 1) a token must reach with a
        keyword to count as a match for that keyword.

    Returns
    -------
    numpy.ndarray
        Array of scores between 0 and 100, ordered in the same way as the
        ``match_strings`` input. All scores are zero when ``keywords``
        contains no searchable token.
    """
    n_match = len(match_strings)
    word = re.compile(r"['\w]+")
    key_list = word.findall(keywords.lower())

    ratios = np.zeros(n_match)
    if not key_list:
        # Nothing to search for. Dividing by len(key_list) == 0 below would
        # turn every score into NaN, so report "no match" instead.
        return ratios

    inv_index = _create_inverted_index(match_strings)

    s = SequenceMatcher()
    for key in key_list:
        # Best ratio found for this keyword, per string index.
        cur_ratios = {}
        # SequenceMatcher caches information about the second sequence, so
        # the keyword goes there and the tokens are swapped in as seq1.
        s.set_seq2(key)
        for token, indices in inv_index.items():
            s.set_seq1(token)
            ratio = s.quick_ratio()
            if ratio < threshold:
                continue
            for idx in indices:
                if ratio > cur_ratios.get(idx, 0.0):
                    cur_ratios[idx] = ratio

        for idx, rat in cur_ratios.items():
            ratios[idx] += rat

    return (100 * ratios) / len(key_list)


def fuzzy_find(
    as_data,
    keywords,
    threshold=60,
    max_return=10,
    exclude=None,
):
    """Find a record using keywords.

    It looks for keywords in the title/authors/keywords (for as much is
    available). Using the difflib package it creates a ranking based on
    token-based fuzzy matching.

    Arguments
    ---------
    as_data: asreview.Dataset
        ASReview data object to search
    keywords: str
        A string of keywords together, can be a combination.
    threshold: float
        Don't return records scoring below this threshold (scores range from
        0 to 100). The best-scoring record that is not excluded is still
        returned when it falls below the threshold, provided ``max_return``
        allows at least one result.
    max_return: int
        Maximum number of records to return.
    exclude: list, numpy.ndarray
        List of indices that should be excluded in the search. You would
        put papers that were already labeled here for example.

    Returns
    -------
    list
        List of record indices that best match the keywords, best match
        first.

    Raises
    ------
    ValueError
        If the dataset has no titles.
    """
    if as_data.title is None:
        raise ValueError("Cannot search dataset without titles.")

    # One searchable string per record: title, then authors and keywords
    # when the dataset provides them.
    all_strings = pd.Series(as_data.title).fillna("")
    if as_data.authors is not None:
        all_strings += " " + pd.Series(as_data.authors).map(format_to_str).fillna("")
    if as_data.keywords is not None:
        all_strings += " " + pd.Series(as_data.keywords).map(format_to_str).fillna("")

    new_ranking = _get_fuzzy_scores(keywords, all_strings.values)
    # Negate the scores so argsort yields the highest score first.
    sorted_idx = np.argsort(-new_ranking)

    # Build the exclusion set once so every membership test is O(1) instead
    # of rescanning the whole list/array for each candidate.
    if exclude is None:
        excluded = set()
    elif isinstance(exclude, np.ndarray):
        excluded = set(exclude.ravel().tolist())
    else:
        excluded = set(exclude)

    best_idx = []
    for idx in sorted_idx:
        if idx in excluded:
            continue
        if len(best_idx) >= max_return:
            break
        # The threshold only applies once one result has been collected, so
        # the top-ranked candidate is kept regardless of its score.
        if best_idx and new_ranking[idx] < threshold:
            break
        best_idx.append(idx)

    return np.array(best_idx, dtype=int).tolist()