Source code for asreview.search

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["fuzzy_find"]

import re
from difflib import SequenceMatcher

import numpy as np
import pandas as pd

from asreview.utils import format_to_str


def _create_inverted_index(match_strings):
    index = {}
    word = re.compile(r"['\w]+")
    for i, match in enumerate(match_strings):
        tokens = word.findall(match.lower())
        for token in tokens:
            if token in index:
                if index[token][-1] != i:
                    index[token].append(i)
            else:
                index[token] = [i]
    return index


def _get_fuzzy_scores(keywords, match_strings, threshold=0.9):
    """Rank a list of strings, depending on a set of keywords.

    Arguments
    ---------
    keywords: str
        Keywords that we are trying to find in the string list.
    str_list: list
        List of strings that should be scored according to the keywords.

    Returns
    -------
    numpy.ndarray
        Array of scores ordered in the same way as the str_list input.
    """
    inv_index = _create_inverted_index(match_strings)

    n_match = len(match_strings)
    word = re.compile(r"['\w]+")
    key_list = word.findall(keywords.lower())

    ratios = np.zeros(n_match)
    for key in key_list:
        cur_ratios = {}
        s = SequenceMatcher()
        s.set_seq2(key)
        for token in inv_index:
            s.set_seq1(token)
            ratio = s.quick_ratio()
            if ratio < threshold:
                continue
            for idx in inv_index[token]:
                if ratio > cur_ratios.get(idx, 0.0):
                    cur_ratios[idx] = ratio

        for idx, rat in cur_ratios.items():
            ratios[idx] += rat

    return (100 * ratios) / len(key_list)



[docs]
def fuzzy_find(
    as_data,
    keywords,
    threshold=60,
    max_return=10,
    exclude=None,
):
    """Find a record using keywords.

    It looks for keywords in the title/authors/keywords
    (for as much is available). Using the diflib package it creates
    a ranking based on token set matching.

    Arguments
    ---------
    as_data: asreview.Dataset
        ASReview data object to search
    keywords: str
        A string of keywords together, can be a combination.
    threshold: float
        Don't return records below this threshold.
    max_return: int
        Maximum number of records to return.
    exclude: list, numpy.ndarray
        List of indices that should be excluded in the search. You would
        put papers that were already labeled here for example.

    Returns
    -------
    list
        Sorted list of indexes that match best the keywords.
    """

    if as_data.title is None:
        raise ValueError("Cannot search dataset without titles.")

    all_strings = pd.Series(as_data.title).fillna("")

    if as_data.authors is not None:
        all_strings += " " + pd.Series(as_data.authors).map(format_to_str).fillna("")

    if as_data.keywords is not None:
        all_strings += " " + pd.Series(as_data.keywords).map(format_to_str).fillna("")

    new_ranking = _get_fuzzy_scores(keywords, all_strings.values)
    sorted_idx = np.argsort(-new_ranking)
    best_idx = []
    if exclude is None:
        exclude = np.array([], dtype=int)
    for idx in sorted_idx:
        if idx in exclude:
            continue
        if len(best_idx) >= max_return:
            break
        if len(best_idx) > 0 and new_ranking[idx] < threshold:
            break
        best_idx.append(idx)
    fuzz_idx = np.array(best_idx, dtype=int)
    return fuzz_idx.tolist()