# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Public names of this module; this list controls what
# `from <module> import *` exports.
__all__ = [
"BaseDataGroup",
"BaseDataSet",
"BenchmarkDataGroup",
"DatasetManager",
"DatasetNotFoundError",
"NaturePublicationDataGroup",
"SynergyDataGroup",
"SynergyDataSet",
]
import json
import socket
import tempfile
import warnings
from abc import ABC
from abc import abstractmethod
from pathlib import Path
from urllib.error import URLError
from urllib.request import urlopen
from urllib.request import urlretrieve
import synergy_dataset as sd
from asreview.io import CSVReader
from asreview.utils import _entry_points
from asreview.utils import _get_filename_from_url
from asreview.utils import is_iterable
class DatasetNotFoundError(Exception):
    """Raised when a dataset identifier cannot be resolved to a dataset."""
def _download_from_metadata(url):
    """Download a metadata index and build a dataset for every entry.

    Parameters
    ----------
    url: str
        Location of a JSON document whose values are mappings with
        dataset metadata. Every mapping is passed as keyword
        arguments to BaseDataSet.

    Returns
    -------
    list:
        One BaseDataSet for each value in the metadata document.

    Raises
    ------
    Exception
        If opening the URL timed out.
    urllib.error.URLError
        If the URL could not be opened for another reason.
    ValueError
        If an entry has the deprecated type 'versioned'.
    """
    try:
        with urlopen(url, timeout=10) as f:
            meta_data = json.loads(f.read().decode())
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            # Chain the URLError so the underlying cause stays visible.
            raise Exception("Connection time out.") from e
        # Bare raise keeps the original traceback untouched.
        raise

    datasets = []
    for data in meta_data.values():
        # Versioned datasets are no longer supported; fail loudly.
        if data.get("type") == "versioned":
            raise ValueError("Datasets of type 'versioned' are deprecated")
        datasets.append(BaseDataSet(**data))

    return datasets
[docs]
class BaseDataSet:
    """Metadata describing a single (labeled) dataset."""

    def __init__(
        self,
        dataset_id,
        filepath=None,
        title=None,
        description=None,
        authors=None,
        topic=None,
        link=None,
        reference=None,
        img_url=None,
        license=None,
        year=None,
        aliases=None,
        **kwargs,
    ):
        """Base class for metadata of dataset.

        A BaseDataSet is a class with metadata about a (labeled)
        dataset used in ASReview LAB. The dataset can be used via
        the frontend or via command line interface.

        In general, a BaseDataSet is part of a group (BaseDataGroup).

        Examples
        --------

        The following example simulates a dataset with dataset_id
        'cord19'. The name of the group is 'covid'.

        >>> asreview simulate covid:cord_19

        Parameters
        ----------
        dataset_id: str
            Identifier of the dataset. The value is an alphanumeric
            string used to identify the dataset via the command line
            interface. Example: 'groupname:DATASET_ID' where DATASET_ID
            is the value of dataset_id.
        filepath: str
            Path to file or URL to the dataset. See
            asreview.readthedocs.io/{URL} for information about valid
            datasets.
        title: str
            Title of the dataset.
        description: str
            Description of the dataset. Optional.
        authors: list
            Authors of the dataset. Optional.
        topic: str
            Topics of the dataset. Optional.
        link: str
            Link to a website or additional information.
        reference: str
            (Academic) reference describing the dataset. Optional.
        license: str
            License of the dataset. Optional
        year: str
            Year of publication of the dataset. Optional.
        img_url: str
            Image for display in graphical interfaces. Optional.
        aliases: list
            Additional identifiers for the dataset_id. This can be
            useful for long or complex dataset_id's. Optional; an
            empty list is used when omitted.
        **kwargs:
            Any additional metadata. Stored as-is and included in the
            output of ``__dict__()``.
        """
        self.dataset_id = dataset_id
        self.filepath = filepath
        self.title = title
        self.description = description
        self.authors = authors
        self.topic = topic
        self.link = link
        self.reference = reference
        self.license = license
        self.year = year
        self.img_url = img_url
        # Create a fresh list per instance: a mutable default argument
        # would be shared (and mutated) across all datasets.
        self.aliases = [] if aliases is None else aliases
        self.kwargs = kwargs

    def __str__(self):
        return f"<BaseDataSet dataset_id='{self.dataset_id}' title='{self.title}'>"

    def __dict__(self):
        """Return the metadata as a plain dict.

        Note that this is a method and must be called
        (``dataset.__dict__()``). Extra keyword arguments given to the
        constructor are merged into the result.
        """
        return {
            "dataset_id": self.dataset_id,
            "filepath": self.filepath,
            "title": self.title,
            "description": self.description,
            "authors": self.authors,
            "topic": self.topic,
            "link": self.link,
            "reference": self.reference,
            "license": self.license,
            "year": self.year,
            "img_url": self.img_url,
            "aliases": self.aliases,
            **self.kwargs,
        }

    @property
    def reader(self):
        """Reader for this dataset; the base class does not specify one."""
        return None

    @property
    def filename(self):
        """Filename derived from ``filepath``; computed once and cached."""
        if not hasattr(self, "_filename"):
            self._filename = _get_filename_from_url(self.filepath)

        return self._filename

    def to_file(self, path):
        """Retrieve the dataset at ``filepath`` and store it at ``path``."""
        # todo return without store
        urlretrieve(self.filepath, path)
[docs]
class BaseDataGroup(ABC):
    """Abstract base class for a named group of datasets.

    Subclasses must provide ``group_id`` and ``description``.
    """

    def __init__(self, *datasets):
        """Group of datasets.

        Group containing one or more datasets.

        Parameters
        ----------
        *datasets:
            One or more datasets.
        """
        self.datasets = list(datasets)

    @property
    @abstractmethod
    def group_id(self):
        """Identifier of the group."""
        pass

    @property
    @abstractmethod
    def description(self):
        """Description of the group."""
        pass

    def __str__(self):
        return f"<BaseDataGroup group_id='{self.group_id}'>"

    def __dict__(self):
        """Return a dict mapping every dataset_id to its dataset.

        Note that this is a method and must be called.
        """
        return {d.dataset_id: d for d in self.datasets}

    def append(self, dataset):
        """Append dataset to group.

        Parameters
        ----------
        dataset: asreview.datasets.BaseDataSet
            A asreview BaseDataSet-like object.

        Raises
        ------
        ValueError
            If dataset is not an instance of BaseDataSet (or of a
            subclass).
        """
        # The argument is a dataset *object*, so this must be an
        # isinstance check; issubclass raises TypeError on instances.
        if not isinstance(dataset, BaseDataSet):
            raise ValueError("Expected BaseDataSet or subclass of BaseDataSet.")
        self.datasets.append(dataset)

    def find(self, dataset_id):
        """Find dataset in the group.

        Parameters
        ----------
        dataset_id: str
            Identifier of the dataset to look for. It can also be one
            of the aliases. Case insensitive.

        Returns
        -------
        asreview.datasets.BaseDataSet:
            Returns base dataset with the given dataset_id.

        Raises
        ------
        ValueError
            If more than one dataset in the group matches.
        DatasetNotFoundError
            If no dataset in the group matches.
        """
        # Lowercase the search term once instead of once per comparison.
        wanted = dataset_id.lower()
        results = [
            d
            for d in self.datasets
            if wanted == d.dataset_id.lower()
            or wanted in (a.lower() for a in d.aliases)
        ]

        if len(results) > 1:
            raise ValueError(
                f"Broken dataset group '{self.group_id}' containing multiple"
                f" datasets with the same name/alias '{dataset_id}'."
            )
        if results:
            return results[0]

        raise DatasetNotFoundError(f"Dataset {dataset_id} not found")
[docs]
class DatasetManager:
    """Find and list datasets of the installed dataset groups.

    Dataset groups are looked up in the 'asreview.datasets' entry
    point group.
    """

    @property
    def groups(self):
        """Names of the entry points in the 'asreview.datasets' group."""
        return list(_entry_points(group="asreview.datasets").names)

    def find(self, dataset_id):
        """Find a dataset.

        Parameters
        ----------
        dataset_id: str, iterable
            Look for this term in aliases within any dataset. A group can
            be specified by setting dataset_id to 'group_id:dataset_id'.
            This can be helpful if the dataset_id is not unique.
            The dataset_id can also be a non-string iterable, in which case
            a list will be returned with all terms.
            Dataset_ids should not contain colons (:).

        Returns
        -------
        BaseDataSet:
            Return the dataset with dataset_id.

        Raises
        ------
        ValueError
            If the dataset_id is found in more than one group.
        DatasetNotFoundError
            If the dataset could not be found.
        """
        # If dataset_id is a non-string iterable, return a list.
        if is_iterable(dataset_id):
            return [self.find(x) for x in dataset_id]

        # If dataset_id is a valid path, create a dataset from it.
        # NOTE(review): the path is stored as dataset_id while filepath
        # stays None - confirm this is intended.
        if Path(dataset_id).is_file():
            return BaseDataSet(dataset_id)

        dataset_id = str(dataset_id)

        # get installed dataset groups
        dataset_groups = _entry_points(group="asreview.datasets")

        # Split into group/dataset if possible.
        split_dataset_id = dataset_id.split(":")
        if len(split_dataset_id) == 2:
            data_group, dataset_name = split_dataset_id
            if data_group in self.groups:
                return dataset_groups[data_group].load()().find(dataset_name)

        # Look through all available/installed groups for the name.
        all_results = {}
        for data_group in dataset_groups:
            try:
                all_results[data_group.name] = data_group.load()().find(dataset_id)
            except Exception:
                # Best effort: skip groups that cannot be loaded or that
                # do not contain the dataset.
                continue

        # If we have multiple results, throw an error.
        if len(all_results) > 1:
            raise ValueError(
                f"Multiple datasets found: {list(all_results)}. "
                "Use DATAGROUP:DATASET format to specify which one"
                " you want."
            )

        if len(all_results) == 1:
            return list(all_results.values())[0]

        # Could not find dataset
        raise DatasetNotFoundError(f"Dataset {dataset_id} not found")

    def list(self, include=None, exclude=None, serialize=True, raise_on_error=False):
        """List the available datasets.

        Parameters
        ----------
        include: str, iterable
            List of groups to include
        exclude: str, iterable
            List of groups to exclude from all groups.
        serialize: bool
            Make returned list serializable.
        raise_on_error: bool
            Raise error when entry point can't be loaded.

        Returns
        -------
        list:
            If serialize is True, one dict per group with the keys
            'group_id', 'description' and 'datasets'. Otherwise the
            loaded group objects.

        Raises
        ------
        ValueError
            If both include and exclude are given.
        """
        if include is not None and exclude is not None:
            raise ValueError("Cannot exclude groups when include is not None.")

        if include is not None:
            groups = include if is_iterable(include) else [include]
        elif exclude is not None:
            excluded = set(exclude if is_iterable(exclude) else [exclude])
            # Filter instead of a set difference so the order of the
            # installed groups is preserved.
            groups = [name for name in self.groups if name not in excluded]
        else:
            groups = self.groups.copy()

        dataset_groups = _entry_points(group="asreview.datasets")

        group_list = []
        for group in groups:
            try:
                group_list.append(dataset_groups[group].load()())
            except Exception:
                # don't raise error on loading entry point
                if raise_on_error:
                    raise

        if not serialize:
            return group_list

        dataset_list_ser = []
        for data_group in group_list:
            try:
                group_ser = [dataset.__dict__() for dataset in data_group.datasets]
                dataset_list_ser.append(
                    {
                        "group_id": data_group.group_id,
                        "description": data_group.description,
                        "datasets": group_ser,
                    }
                )
            except Exception:
                # don't raise error on serializing a group
                if raise_on_error:
                    raise

        return dataset_list_ser
[docs]
class NaturePublicationDataGroup(BaseDataGroup):
    """Datasets of the Nature Machine Intelligence validation paper."""

    group_id = "benchmark-nature"
    description = (
        "Datasets used in the validation paper published in "
        "Nature Machine Intelligence (van de Schoot et al. 2021)"
    )

    def __init__(self):
        """Download the metadata index and register its datasets."""
        index_url = (
            "https://raw.githubusercontent.com/asreview/paper-asreview"
            "/master/index_v1.json"
        )
        super().__init__(*_download_from_metadata(index_url))
class SynergyDataSet(BaseDataSet):
    """Dataset that is built with the synergy_dataset package."""

    @property
    def filename(self):
        """The dataset_id with a '.csv' extension."""
        return self.dataset_id + ".csv"

    @property
    def reader(self):
        """Reader class for this dataset (CSVReader)."""
        return CSVReader

    def to_file(self, path=None):
        """Build the dataset and write it as CSV.

        Parameters
        ----------
        path: str
            Destination passed to ``to_csv``.

        Returns
        -------
        The return value of ``to_csv(path)``.

        Raises
        ------
        ValueError
            If the dataset is not among the downloaded datasets.
        """
        # download, build, and store to local file
        try:
            return sd.Dataset(self.dataset_id).to_frame().to_csv(path)
        except FileNotFoundError:
            # Presumably the dataset is not available locally; download
            # it into a scratch folder. The context manager removes the
            # folder again, so no temporary files are left behind.
            with tempfile.TemporaryDirectory() as tmp_synergy_folder:
                sd.download_raw_subset(self.dataset_id, path=tmp_synergy_folder)

                for d in sd.iter_datasets(path=tmp_synergy_folder):
                    if d.name == self.dataset_id:
                        return d.to_frame().to_csv(path)

            raise ValueError("Synergy dataset not found")
[docs]
class SynergyDataGroup(BaseDataGroup):
    """Group with the datasets of the SYNERGY dataset."""

    group_id = "synergy"
    description = "SYNERGY datasets (asreview.ai/synergy)"

    def __init__(self):
        """Create a SynergyDataSet for every hard-coded metadata entry."""
        # The metadata below was generated with the following code (the
        # 'license' and 'link' values are the same for every dataset and
        # are therefore kept in a single shared dict here):
        #
        # import synergy_dataset as sd
        # from pprint import pprint
        # meta_synergy = {}
        # for x in sd.iter_datasets():
        #     meta_synergy[x.name] = {
        #         "title": x.metadata["publication"]["display_name"],
        #         "authors": x.cite.split(",")[0] + " et al.",
        #         "topic": x.metadata
        #         ["data"]["concepts"]["included"][0]["display_name"],
        #         "link": "https://doi.org/10.34894/HE6NAQ",
        #         "reference": x.metadata["publication"]["doi"],
        #         "license": "See Synergy dataset",
        #         "year": x.metadata["publication"]["publication_year"]
        #     }
        # pprint(meta_synergy)
        shared_fields = {
            "license": "See Synergy dataset",
            "link": "https://doi.org/10.34894/HE6NAQ",
        }

        per_dataset = {
            "Appenzeller-Herzog_2019": {
                "authors": "Appenzeller‐Herzog et al.",
                "reference": "https://doi.org/10.1111/liv.14179",
                "title": "Comparative effectiveness of common "
                "therapies for Wilson disease: A "
                "systematic review and meta‐analysis of "
                "controlled studies",
                "topic": "Medicine",
                "year": 2019,
            },
            "Bos_2018": {
                "authors": "Bos et al.",
                "reference": "https://doi.org/10.1016/j.jalz.2018.04.007",
                "title": "Cerebral small vessel disease and the risk of "
                "dementia: A systematic review and meta‐analysis of "
                "population‐based evidence",
                "topic": "Medicine",
                "year": 2018,
            },
            "Brouwer_2019": {
                "authors": "Brouwer et al.",
                "reference": "https://doi.org/10.1016/j.cpr.2019.101773",
                "title": "Psychological theories of depressive relapse and "
                "recurrence: A systematic review and meta-analysis "
                "of prospective studies",
                "topic": "Psychology",
                "year": 2019,
            },
            "Chou_2003": {
                "authors": "Chou et al.",
                "reference": "https://doi.org/10.1016/j.jpainsymman.2003.03.003",
                "title": "Comparative efficacy and safety of long-acting oral "
                "opioids for chronic non-cancer pain: a systematic "
                "review",
                "topic": "Medicine",
                "year": 2003,
            },
            "Chou_2004": {
                "authors": "Chou et al.",
                "reference": "https://doi.org/10.1016/j.jpainsymman.2004.05.002",
                "title": "Comparative efficacy and safety of skeletal muscle "
                "relaxants for spasticity and musculoskeletal "
                "conditions: a systematic review",
                "topic": "Medicine",
                "year": 2004,
            },
            "Donners_2021": {
                "authors": "Donners et al.",
                "reference": "https://doi.org/10.1007/s40262-021-01042-w",
                "title": "Pharmacokinetics and Associated Efficacy of "
                "Emicizumab in Humans: A Systematic Review",
                "topic": "Medicine",
                "year": 2021,
            },
            "Hall_2012": {
                "authors": "Hall et al.",
                "reference": "https://doi.org/10.1109/tse.2011.103",
                "title": "A Systematic Literature Review on Fault Prediction "
                "Performance in Software Engineering",
                "topic": "Computer science",
                "year": 2012,
            },
            "Jeyaraman_2020": {
                "authors": "Jeyaraman et al.",
                "reference": "https://doi.org/10.1177/1947603520951623",
                "title": "Does the Source of Mesenchymal Stem Cell Have an "
                "Effect in the Management of Osteoarthritis of "
                "the Knee? Meta-Analysis of Randomized Controlled "
                "Trials",
                "topic": "Medicine",
                "year": 2020,
            },
            "Leenaars_2019": {
                "authors": "Leenaars et al.",
                "reference": "https://doi.org/10.5334/jcr.183",
                "title": "Sleep and Microdialysis: An Experiment and a "
                "Systematic Review of Histamine and Several Amino "
                "Acids",
                "topic": "Psychology",
                "year": 2019,
            },
            "Leenaars_2020": {
                "authors": "Leenaars et al.",
                "reference": "https://doi.org/10.3390/ani10061047",
                "title": "A Systematic Review Comparing Experimental Design "
                "of Animal and Human Methotrexate Efficacy Studies "
                "for Rheumatoid Arthritis: Lessons for the "
                "Translational Value of Animal Studies",
                "topic": "Medicine",
                "year": 2020,
            },
            "Meijboom_2021": {
                "authors": "Meijboom et al.",
                "reference": "https://doi.org/10.1007/s40259-021-00508-4",
                "title": "Patients Retransitioning from Biosimilar TNFα "
                "Inhibitor to the Corresponding Originator After "
                "Initial Transitioning to the Biosimilar: A "
                "Systematic Review",
                "topic": "Medicine",
                "year": 2021,
            },
            "Menon_2022": {
                "authors": "Menon et al.",
                "reference": "https://doi.org/10.1080/10408444.2022.2082917",
                "title": "The methodological rigour of systematic reviews in "
                "environmental health",
                "topic": "Medicine",
                "year": 2022,
            },
            "Moran_2021": {
                "authors": "Moran et al.",
                "reference": "https://doi.org/10.1111/brv.12655",
                "title": "Poor nutritional condition promotes high‐risk "
                "behaviours: a systematic review and meta‐analysis",
                "topic": "Biology",
                "year": 2021,
            },
            "Muthu_2021": {
                "authors": "Muthu et al.",
                "reference": "https://doi.org/10.1097/brs.0000000000003645",
                "title": "Fragility Analysis of Statistically Significant "
                "Outcomes of Randomized Control Trials in Spine "
                "Surgery",
                "topic": "Medicine",
                "year": 2021,
            },
            "Nelson_2002": {
                "authors": "Nelson et al.",
                "reference": "https://doi.org/10.1001/jama.288.7.872",
                "title": "Postmenopausal Hormone Replacement Therapy",
                "topic": "Medicine",
                "year": 2002,
            },
            "Oud_2018": {
                "authors": "Oud et al.",
                "reference": "https://doi.org/10.1177/0004867418791257",
                "title": "Specialized psychotherapies for adults with borderline "
                "personality disorder: A systematic review and "
                "meta-analysis",
                "topic": "Psychology",
                "year": 2018,
            },
            "Radjenovic_2013": {
                "authors": "Radjenović et al.",
                "reference": "https://doi.org/10.1016/j.infsof.2013.02.009",
                "title": "Software fault prediction metrics: A systematic "
                "literature review",
                "topic": "Computer science",
                "year": 2013,
            },
            "Sep_2021": {
                "authors": "Sep et al.",
                "reference": "https://doi.org/10.1371/journal.pone.0249102",
                "title": "The rodent object-in-context task: A systematic review "
                "and meta-analysis of important variables",
                "topic": "Psychology",
                "year": 2021,
            },
            "Smid_2020": {
                "authors": "Smid et al.",
                "reference": "https://doi.org/10.1080/10705511.2019.1577140",
                "title": "Bayesian Versus Frequentist Estimation for Structural "
                "Equation Models in Small Sample Contexts: A "
                "Systematic Review",
                "topic": "Computer science",
                "year": 2020,
            },
            "Walker_2018": {
                "authors": "Walker et al.",
                "reference": "https://doi.org/10.1016/j.envint.2017.12.032",
                "title": "Human and animal evidence of potential "
                "transgenerational inheritance of health effects: An "
                "evidence map and state-of-the-science evaluation",
                "topic": "Biology",
                "year": 2018,
            },
            "Wassenaar_2017": {
                "authors": "Wassenaar et al.",
                "reference": "https://doi.org/10.1289/ehp1233",
                "title": "Systematic Review and Meta-Analysis of "
                "Early-Life Exposure to Bisphenol A and "
                "Obesity-Related Outcomes in Rodents",
                "topic": "Medicine",
                "year": 2017,
            },
            "Wolters_2018": {
                "authors": "Wolters et al.",
                "reference": "https://doi.org/10.1016/j.jalz.2018.01.007",
                "title": "Coronary heart disease, heart failure, and the "
                "risk of dementia: A systematic review and "
                "meta‐analysis",
                "topic": "Medicine",
                "year": 2018,
            },
            "van_Dis_2020": {
                "authors": "van Dis et al.",
                "reference": "https://doi.org/10.1001/jamapsychiatry.2019.3986",
                "title": "Long-term Outcomes of Cognitive Behavioral Therapy "
                "for Anxiety-Related Disorders",
                "topic": "Psychology",
                "year": 2020,
            },
            "van_de_Schoot_2018": {
                "authors": "van de Schoot et al.",
                "reference": "https://doi.org/10.1080/00273171.2017.1412293",
                "title": "Bayesian PTSD-Trajectory Analysis with "
                "Informed Priors Based on a Systematic "
                "Literature Search and Expert Elicitation",
                "topic": "Psychology",
                "year": 2018,
            },
            "van_der_Valk_2021": {
                # NOTE(review): the other "van ..." entries keep the name
                # prefix in 'authors'; confirm "Valk et al." is intended.
                "authors": "Valk et al.",
                "reference": "https://doi.org/10.1111/obr.13376",
                "title": "Cross‐sectional relation of long‐term "
                "glucocorticoids in hair with anthropometric "
                "measurements and their possible determinants: "
                "A systematic review and meta‐analysis",
                "topic": "Medicine",
                "year": 2021,
            },
            "van_der_Waal_2022": {
                "authors": "van der Waal et al.",
                "reference": "https://doi.org/10.1016/j.jgo.2022.09.012",
                "title": "A meta-analysis on the role older adults with "
                "cancer favour in treatment decision making",
                "topic": "Medicine",
                "year": 2022,
            },
        }

        members = []
        for name, fields in per_dataset.items():
            members.append(SynergyDataSet(name, **shared_fields, **fields))

        super().__init__(*members)
class BenchmarkDataGroup(BaseDataGroup):
    """Deprecated group with the datasets of the benchmark platform."""

    group_id = "benchmark"
    description = "DEPRECATED: Datasets available in the online benchmark platform"

    def __init__(self):
        """Warn about the deprecation, then load the datasets of the index."""
        deprecation_message = (
            "The use of 'benchmark' datasets is deprecated, "
            "use SYNERGY dataset instead. For more information, see "
            "https://github.com/asreview/synergy-dataset."
        )
        warnings.warn(deprecation_message, category=UserWarning)

        index_url = (
            "https://raw.githubusercontent.com/asreview"
            "/systematic-review-datasets/master/index_v1.json"
        )
        super().__init__(*_download_from_metadata(index_url))