# Source code for asreview.datasets

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Public names of this module, i.e. what ``from asreview.datasets import *``
# exposes. Every entry is a class defined further down in this file.
__all__ = [
    "BaseDataGroup",
    "BaseDataSet",
    "BenchmarkDataGroup",
    "DatasetManager",
    "DatasetNotFoundError",
    "NaturePublicationDataGroup",
    "SynergyDataGroup",
    "SynergyDataSet",
]

import json
import socket
import tempfile
import warnings
from abc import ABC
from abc import abstractmethod
from pathlib import Path
from urllib.error import URLError
from urllib.request import urlopen
from urllib.request import urlretrieve

import synergy_dataset as sd

from asreview.io import CSVReader
from asreview.utils import _entry_points
from asreview.utils import _get_filename_from_url
from asreview.utils import is_iterable


class DatasetNotFoundError(Exception):
    """Raised when no dataset matches the requested dataset identifier."""


def _download_from_metadata(url):
    """Download metadata to dataset."""

    try:
        with urlopen(url, timeout=10) as f:
            meta_data = json.loads(f.read().decode())
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception("Connection time out.")
        raise e

    datasets = []
    for data in meta_data.values():
        # raise error on versioned datasets
        if "type" in data and data["type"] == "versioned":
            raise ValueError("Datasets of type 'versioned' are deprecated")

        datasets.append(BaseDataSet(**data))

    return datasets


class BaseDataSet:
    def __init__(
        self,
        dataset_id,
        filepath=None,
        title=None,
        description=None,
        authors=None,
        topic=None,
        link=None,
        reference=None,
        img_url=None,
        license=None,
        year=None,
        aliases=None,
        **kwargs,
    ):
        """Base class for metadata of dataset.

        A BaseDataSet is a class with metadata about a (labeled)
        dataset used in ASReview LAB. The dataset can be used via
        the frontend or via command line interface.

        In general, a BaseDataSet is part of a group (BaseDataGroup).

        Examples
        --------

        The following example simulates a dataset with dataset_id
        'cord_19'. The name of the group is 'covid'.

        >>> asreview simulate covid:cord_19

        Parameters
        ----------
        dataset_id: str
            Identifier of the dataset. The value is an alphanumeric
            string used to identify the dataset via the command line
            interface. Example: 'groupname:DATASET_ID' where DATASET_ID
            is the value of dataset_id.
        filepath: str
            Path to file or URL to the dataset. See
            asreview.readthedocs.io/{URL} for information about valid
            datasets.
        title: str
            Title of the dataset.
        description: str
            Description of the dataset. Optional.
        authors: list
            Authors of the dataset. Optional.
        topic: str
            Topics of the dataset. Optional.
        link: str
            Link to a website or additional information.
        reference: str
            (Academic) reference describing the dataset. Optional.
        license: str
            License of the dataset. Optional.
        year: str
            Year of publication of the dataset. Optional.
        img_url: str
            Image for display in graphical interfaces. Optional.
        aliases: list
            Additional identifiers for the dataset_id. This can be
            useful for long or complex dataset_id's. Optional; when
            omitted (or None) every instance gets its own empty list.
        **kwargs:
            Any additional metadata. Stored as-is and included in the
            output of ``__dict__()``.
        """

        self.dataset_id = dataset_id
        self.filepath = filepath
        self.title = title
        self.description = description
        self.authors = authors
        self.topic = topic
        self.link = link
        self.reference = reference
        self.license = license
        self.year = year
        self.img_url = img_url
        # A list literal as default argument would be shared between all
        # instances; create a fresh list per instance instead.
        self.aliases = [] if aliases is None else aliases
        self.kwargs = kwargs

    def __str__(self):
        return f"<BaseDataSet dataset_id='{self.dataset_id}' title='{self.title}'>"

    def __dict__(self):
        """Return the metadata, including extra keyword arguments, as dict."""
        return {
            "dataset_id": self.dataset_id,
            "filepath": self.filepath,
            "title": self.title,
            "description": self.description,
            "authors": self.authors,
            "topic": self.topic,
            "link": self.link,
            "reference": self.reference,
            "license": self.license,
            "year": self.year,
            "img_url": self.img_url,
            "aliases": self.aliases,
            **self.kwargs,
        }

    @property
    def reader(self):
        """Reader for this dataset; the base class does not provide one."""
        return None

    @property
    def filename(self):
        """Filename derived from ``filepath``; computed once, then cached."""
        if not hasattr(self, "_filename"):
            self._filename = _get_filename_from_url(self.filepath)

        return self._filename

    def to_file(self, path):
        """Retrieve ``filepath`` with ``urlretrieve`` and store it at ``path``."""
        # todo return without store
        urlretrieve(self.filepath, path)
class BaseDataGroup(ABC):
    def __init__(self, *datasets):
        """Group of datasets.

        Group containing one or more datasets. Subclasses have to
        provide ``group_id`` and ``description``.

        Parameters
        ----------
        *datasets:
            One or more datasets.
        """
        self.datasets = list(datasets)

    @property
    @abstractmethod
    def group_id(cls):
        pass

    @property
    @abstractmethod
    def description(cls):
        pass

    def __str__(self):
        return f"<BaseDataGroup group_id='{self.group_id}'>"

    def __dict__(self):
        """Return the datasets of the group keyed by their dataset_id."""
        return {d.dataset_id: d for d in self.datasets}

    def append(self, dataset):
        """Append dataset to group.

        Parameters
        ----------
        dataset: asreview.datasets.BaseDataSet
            A asreview BaseDataSet-like object.

        Raises
        ------
        ValueError
            If dataset is not an instance of BaseDataSet or a subclass.
        """
        # The group stores dataset *instances*, so this has to be an
        # isinstance check; issubclass() raises TypeError for instances.
        if not isinstance(dataset, BaseDataSet):
            raise ValueError("Expected BaseDataSet or subclass of BaseDataSet.")
        self.datasets.append(dataset)

    def find(self, dataset_id):
        """Find dataset in the group.

        Parameters
        ----------
        dataset_id: str
            Identifier of the dataset to look for. It can also be one
            of the aliases. Case insensitive.

        Returns
        -------
        asreview.datasets.BaseDataSet:
            Returns base dataset with the given dataset_id.

        Raises
        ------
        ValueError
            If more than one dataset in the group matches.
        DatasetNotFoundError
            If no dataset in the group matches.
        """
        wanted = dataset_id.lower()

        results = [
            d
            for d in self.datasets
            if wanted == d.dataset_id.lower()
            or wanted in [a.lower() for a in d.aliases]
        ]

        if len(results) > 1:
            raise ValueError(
                f"Broken dataset group '{self.group_id}' containing multiple"
                f" datasets with the same name/alias '{dataset_id}'."
            )
        elif len(results) == 1:
            return results[0]

        raise DatasetNotFoundError(f"Dataset {dataset_id} not found")
class DatasetManager:
    """Find and list datasets of the 'asreview.datasets' entry point groups."""

    @property
    def groups(self):
        """Names of the entry points in the 'asreview.datasets' group."""
        return list(_entry_points(group="asreview.datasets").names)

    def find(self, dataset_id):
        """Find a dataset.

        Parameters
        ----------
        dataset_id: str, iterable
            Look for this term in aliases within any dataset. A group can
            be specified by setting dataset_id to 'group_id:dataset_id'.
            This can be helpful if the dataset_id is not unique.
            The dataset_id can also be a non-string iterable, in which
            case a list will be returned with all terms.
            Dataset_ids should not contain colons (:).
            A path to an existing file is wrapped in a new BaseDataSet.

        Returns
        -------
        BaseDataSet:
            Return the dataset with dataset_id.

        Raises
        ------
        DatasetNotFoundError
            If the dataset could not be found.
        ValueError
            If more than one group contains a matching dataset.
        """

        # If dataset_id is a non-string iterable, return a list.
        if is_iterable(dataset_id):
            return [self.find(x) for x in dataset_id]

        # If dataset_id is a valid path, create a dataset from it.
        if Path(dataset_id).is_file():
            return BaseDataSet(dataset_id)

        dataset_id = str(dataset_id)

        # get installed dataset groups
        dataset_groups = _entry_points(group="asreview.datasets")

        # Split into group/dataset if possible.
        split_dataset_id = dataset_id.split(":")
        if len(split_dataset_id) == 2:
            data_group, name_in_group = split_dataset_id
            if data_group in self.groups:
                return dataset_groups[data_group].load()().find(name_in_group)

        # Look through all available/installed groups for the name.
        all_results = {}
        for data_group in dataset_groups:
            try:
                all_results[data_group.name] = data_group.load()().find(dataset_id)
            except Exception:
                # Deliberately best-effort: a group that fails to load, or
                # that does not contain the dataset, must not prevent the
                # lookup in the remaining groups.
                pass

        # If we have multiple results, throw an error.
        if len(all_results) > 1:
            raise ValueError(
                f"Multiple datasets found: {list(all_results)}. "
                "Use DATAGROUP:DATASET format to specify which one"
                " you want."
            )

        if len(all_results) == 1:
            return next(iter(all_results.values()))

        # Could not find dataset
        raise DatasetNotFoundError(f"Dataset {dataset_id} not found")

    def list(self, include=None, exclude=None, serialize=True, raise_on_error=False):
        """List the available datasets.

        Parameters
        ----------
        include: str, iterable
            List of groups to include
        exclude: str, iterable
            List of groups to exclude from all groups.
        serialize: bool
            Make returned list serializable.
        raise_on_error: bool
            Raise error when entry point can't be loaded.

        Returns
        -------
        list:
            If serialize is True, one dictionary per group with the keys
            'group_id', 'description' and 'datasets'. Otherwise the
            loaded group objects.

        Raises
        ------
        ValueError
            If both include and exclude are given.
        """

        if include is not None and exclude is not None:
            raise ValueError("Cannot exclude groups when include is not None.")

        if include is not None:
            groups = include if is_iterable(include) else [include]
        elif exclude is not None:
            excluded = set(exclude if is_iterable(exclude) else [exclude])
            groups = [name for name in self.groups if name not in excluded]
        else:
            # The property already returns a new list on every access.
            groups = self.groups

        dataset_groups = _entry_points(group="asreview.datasets")

        group_list = []
        for group in groups:
            try:
                group_list.append(dataset_groups[group].load()())
            except Exception:
                # don't raise error on loading entry point
                if raise_on_error:
                    raise

        if not serialize:
            return group_list

        dataset_list_ser = []
        for data_group in group_list:
            try:
                group_ser = [dataset.__dict__() for dataset in data_group.datasets]
                dataset_list_ser.append(
                    {
                        "group_id": data_group.group_id,
                        "description": data_group.description,
                        "datasets": group_ser,
                    }
                )
            except Exception:
                # don't raise error when a group can't be serialized
                if raise_on_error:
                    raise

        return dataset_list_ser
class NaturePublicationDataGroup(BaseDataGroup):
    """Datasets of the validation paper by van de Schoot et al.

    The datasets are built from a metadata index that is downloaded when
    the group is instantiated.
    """

    group_id = "benchmark-nature"
    description = (
        "Datasets used in the validation paper published in Nature Machine "
        "Intelligence (van de Schoot et al. 2021)"
    )

    def __init__(self):
        index_url = "https://raw.githubusercontent.com/asreview/paper-asreview/master/index_v1.json"  # noqa
        super().__init__(*_download_from_metadata(index_url))
class SynergyDataSet(BaseDataSet):
    """Dataset that is obtained through the ``synergy_dataset`` package."""

    @property
    def filename(self):
        """The dataset_id with a '.csv' extension."""
        return self.dataset_id + ".csv"

    @property
    def reader(self):
        """Reader class for this dataset (``CSVReader``)."""
        return CSVReader

    def to_file(self, path=None):
        """Build the dataset and write it as CSV.

        If ``synergy_dataset`` raises FileNotFoundError for the dataset,
        the raw subset is downloaded into a temporary directory first.
        That directory is removed again once the CSV has been written.

        Parameters
        ----------
        path: str
            Passed on to ``to_csv`` of the frame built by
            ``synergy_dataset`` (presumably a pandas DataFrame; verify
            against synergy_dataset).

        Returns
        -------
        Whatever ``to_csv`` returns.

        Raises
        ------
        ValueError
            If the dataset is not part of the downloaded subset.
        """
        # download, build, and store to local file

        try:
            return sd.Dataset(self.dataset_id).to_frame().to_csv(path)
        except FileNotFoundError:
            # The context manager cleans up the download directory, also
            # on the early return and when an error is raised.
            with tempfile.TemporaryDirectory() as tmp_synergy_folder:
                sd.download_raw_subset(self.dataset_id, path=tmp_synergy_folder)

                for d in sd.iter_datasets(path=tmp_synergy_folder):
                    if d.name == self.dataset_id:
                        return d.to_frame().to_csv(path)

        raise ValueError("Synergy dataset not found")
class SynergyDataGroup(BaseDataGroup):
    """Group with the datasets of the SYNERGY collection."""

    group_id = "synergy"
    description = "SYNERGY datasets (asreview.ai/synergy)"

    def __init__(self):
        # The following code was used to generate the metadata
        #
        # import synergy_dataset as sd
        # from pprint import pprint
        # meta_synergy = {}
        # for x in sd.iter_datasets():
        #     meta_synergy[x.name] = {
        #         "title": x.metadata["publication"]["display_name"],
        #         "authors": x.cite.split(",")[0] + " et al.",
        #         "topic": x.metadata
        #                  ["data"]["concepts"]["included"][0]["display_name"],
        #         "link": "https://doi.org/10.34894/HE6NAQ",
        #         "reference": x.metadata["publication"]["doi"],
        #         "license": "See Synergy dataset",
        #         "year": x.metadata["publication"]["publication_year"]
        #     }
        # pprint(meta_synergy)

        # These two values are the same for every dataset, so they are
        # passed once below instead of being repeated in each entry.
        shared_license = "See Synergy dataset"
        shared_link = "https://doi.org/10.34894/HE6NAQ"

        # Per-dataset metadata, keyed by dataset_id.
        per_dataset = {
            "Appenzeller-Herzog_2019": {
                "authors": "Appenzeller‐Herzog et al.",
                "reference": "https://doi.org/10.1111/liv.14179",
                "title": "Comparative effectiveness of common "
                "therapies for Wilson disease: A "
                "systematic review and meta‐analysis of "
                "controlled studies",
                "topic": "Medicine",
                "year": 2019,
            },
            "Bos_2018": {
                "authors": "Bos et al.",
                "reference": "https://doi.org/10.1016/j.jalz.2018.04.007",
                "title": "Cerebral small vessel disease and the risk of "
                "dementia: A systematic review and meta‐analysis of "
                "population‐based evidence",
                "topic": "Medicine",
                "year": 2018,
            },
            "Brouwer_2019": {
                "authors": "Brouwer et al.",
                "reference": "https://doi.org/10.1016/j.cpr.2019.101773",
                "title": "Psychological theories of depressive relapse and "
                "recurrence: A systematic review and meta-analysis "
                "of prospective studies",
                "topic": "Psychology",
                "year": 2019,
            },
            "Chou_2003": {
                "authors": "Chou et al.",
                "reference": "https://doi.org/10.1016/j.jpainsymman.2003.03.003",
                "title": "Comparative efficacy and safety of long-acting oral "
                "opioids for chronic non-cancer pain: a systematic "
                "review",
                "topic": "Medicine",
                "year": 2003,
            },
            "Chou_2004": {
                "authors": "Chou et al.",
                "reference": "https://doi.org/10.1016/j.jpainsymman.2004.05.002",
                "title": "Comparative efficacy and safety of skeletal muscle "
                "relaxants for spasticity and musculoskeletal "
                "conditions: a systematic review",
                "topic": "Medicine",
                "year": 2004,
            },
            "Donners_2021": {
                "authors": "Donners et al.",
                "reference": "https://doi.org/10.1007/s40262-021-01042-w",
                "title": "Pharmacokinetics and Associated Efficacy of "
                "Emicizumab in Humans: A Systematic Review",
                "topic": "Medicine",
                "year": 2021,
            },
            "Hall_2012": {
                "authors": "Hall et al.",
                "reference": "https://doi.org/10.1109/tse.2011.103",
                "title": "A Systematic Literature Review on Fault Prediction "
                "Performance in Software Engineering",
                "topic": "Computer science",
                "year": 2012,
            },
            "Jeyaraman_2020": {
                "authors": "Jeyaraman et al.",
                "reference": "https://doi.org/10.1177/1947603520951623",
                "title": "Does the Source of Mesenchymal Stem Cell Have an "
                "Effect in the Management of Osteoarthritis of "
                "the Knee? Meta-Analysis of Randomized Controlled "
                "Trials",
                "topic": "Medicine",
                "year": 2020,
            },
            "Leenaars_2019": {
                "authors": "Leenaars et al.",
                "reference": "https://doi.org/10.5334/jcr.183",
                "title": "Sleep and Microdialysis: An Experiment and a "
                "Systematic Review of Histamine and Several Amino "
                "Acids",
                "topic": "Psychology",
                "year": 2019,
            },
            "Leenaars_2020": {
                "authors": "Leenaars et al.",
                "reference": "https://doi.org/10.3390/ani10061047",
                "title": "A Systematic Review Comparing Experimental Design "
                "of Animal and Human Methotrexate Efficacy Studies "
                "for Rheumatoid Arthritis: Lessons for the "
                "Translational Value of Animal Studies",
                "topic": "Medicine",
                "year": 2020,
            },
            "Meijboom_2021": {
                "authors": "Meijboom et al.",
                "reference": "https://doi.org/10.1007/s40259-021-00508-4",
                "title": "Patients Retransitioning from Biosimilar TNFα "
                "Inhibitor to the Corresponding Originator After "
                "Initial Transitioning to the Biosimilar: A "
                "Systematic Review",
                "topic": "Medicine",
                "year": 2021,
            },
            "Menon_2022": {
                "authors": "Menon et al.",
                "reference": "https://doi.org/10.1080/10408444.2022.2082917",
                "title": "The methodological rigour of systematic reviews in "
                "environmental health",
                "topic": "Medicine",
                "year": 2022,
            },
            "Moran_2021": {
                "authors": "Moran et al.",
                "reference": "https://doi.org/10.1111/brv.12655",
                "title": "Poor nutritional condition promotes high‐risk "
                "behaviours: a systematic review and meta‐analysis",
                "topic": "Biology",
                "year": 2021,
            },
            "Muthu_2021": {
                "authors": "Muthu et al.",
                "reference": "https://doi.org/10.1097/brs.0000000000003645",
                "title": "Fragility Analysis of Statistically Significant "
                "Outcomes of Randomized Control Trials in Spine "
                "Surgery",
                "topic": "Medicine",
                "year": 2021,
            },
            "Nelson_2002": {
                "authors": "Nelson et al.",
                "reference": "https://doi.org/10.1001/jama.288.7.872",
                "title": "Postmenopausal Hormone Replacement Therapy",
                "topic": "Medicine",
                "year": 2002,
            },
            "Oud_2018": {
                "authors": "Oud et al.",
                "reference": "https://doi.org/10.1177/0004867418791257",
                "title": "Specialized psychotherapies for adults with borderline "
                "personality disorder: A systematic review and "
                "meta-analysis",
                "topic": "Psychology",
                "year": 2018,
            },
            "Radjenovic_2013": {
                "authors": "Radjenović et al.",
                "reference": "https://doi.org/10.1016/j.infsof.2013.02.009",
                "title": "Software fault prediction metrics: A systematic "
                "literature review",
                "topic": "Computer science",
                "year": 2013,
            },
            "Sep_2021": {
                "authors": "Sep et al.",
                "reference": "https://doi.org/10.1371/journal.pone.0249102",
                "title": "The rodent object-in-context task: A systematic review "
                "and meta-analysis of important variables",
                "topic": "Psychology",
                "year": 2021,
            },
            "Smid_2020": {
                "authors": "Smid et al.",
                "reference": "https://doi.org/10.1080/10705511.2019.1577140",
                "title": "Bayesian Versus Frequentist Estimation for Structural "
                "Equation Models in Small Sample Contexts: A "
                "Systematic Review",
                "topic": "Computer science",
                "year": 2020,
            },
            "Walker_2018": {
                "authors": "Walker et al.",
                "reference": "https://doi.org/10.1016/j.envint.2017.12.032",
                "title": "Human and animal evidence of potential "
                "transgenerational inheritance of health effects: An "
                "evidence map and state-of-the-science evaluation",
                "topic": "Biology",
                "year": 2018,
            },
            "Wassenaar_2017": {
                "authors": "Wassenaar et al.",
                "reference": "https://doi.org/10.1289/ehp1233",
                "title": "Systematic Review and Meta-Analysis of "
                "Early-Life Exposure to Bisphenol A and "
                "Obesity-Related Outcomes in Rodents",
                "topic": "Medicine",
                "year": 2017,
            },
            "Wolters_2018": {
                "authors": "Wolters et al.",
                "reference": "https://doi.org/10.1016/j.jalz.2018.01.007",
                "title": "Coronary heart disease, heart failure, and the "
                "risk of dementia: A systematic review and "
                "meta‐analysis",
                "topic": "Medicine",
                "year": 2018,
            },
            "van_Dis_2020": {
                "authors": "van Dis et al.",
                "reference": "https://doi.org/10.1001/jamapsychiatry.2019.3986",
                "title": "Long-term Outcomes of Cognitive Behavioral Therapy "
                "for Anxiety-Related Disorders",
                "topic": "Psychology",
                "year": 2020,
            },
            "van_de_Schoot_2018": {
                "authors": "van de Schoot et al.",
                "reference": "https://doi.org/10.1080/00273171.2017.1412293",
                "title": "Bayesian PTSD-Trajectory Analysis with "
                "Informed Priors Based on a Systematic "
                "Literature Search and Expert Elicitation",
                "topic": "Psychology",
                "year": 2018,
            },
            "van_der_Valk_2021": {
                "authors": "Valk et al.",
                "reference": "https://doi.org/10.1111/obr.13376",
                "title": "Cross‐sectional relation of long‐term "
                "glucocorticoids in hair with anthropometric "
                "measurements and their possible determinants: "
                "A systematic review and meta‐analysis",
                "topic": "Medicine",
                "year": 2021,
            },
            "van_der_Waal_2022": {
                "authors": "van der Waal et al.",
                "reference": "https://doi.org/10.1016/j.jgo.2022.09.012",
                "title": "A meta-analysis on the role older adults with "
                "cancer favour in treatment decision making",
                "topic": "Medicine",
                "year": 2022,
            },
        }

        members = []
        for name, fields in per_dataset.items():
            members.append(
                SynergyDataSet(
                    name,
                    license=shared_license,
                    link=shared_link,
                    **fields,
                )
            )

        super().__init__(*members)
class BenchmarkDataGroup(BaseDataGroup):
    """Deprecated group with the datasets of the benchmark platform.

    Instantiating the group emits a UserWarning and downloads the
    metadata index of the datasets.
    """

    group_id = "benchmark"
    description = "DEPRECATED: Datasets available in the online benchmark platform"

    def __init__(self):
        deprecation_note = (
            "The use of 'benchmark' datasets is deprecated, "
            "use SYNERGY dataset instead. For more information, see "
            "https://github.com/asreview/synergy-dataset."
        )
        warnings.warn(deprecation_note, category=UserWarning)

        index_url = "https://raw.githubusercontent.com/asreview/systematic-review-datasets/master/index_v1.json"  # noqa
        super().__init__(*_download_from_metadata(index_url))