# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["Dataset", "Record"]
import logging
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
from pandas.api.types import is_string_dtype
from asreview.config import COLUMN_DEFINITIONS
from asreview.config import LABEL_NA
from asreview.exceptions import BadFileFormatError
from asreview.utils import _entry_points
from asreview.utils import is_iterable
def _type_from_column(col_name, col_definitions):
"""Transform a column name to its standardized form.
Arguments
---------
col_name: str
Name of the column in the dataframe.
col_definitions: dict
Dictionary of {standardized_name: [list of possible names]}.
Ex. {"title": ["title", "primary_title"],
"authors": ["authors", "author names", "first_authors"]}
Returns
-------
str:
The standardized name. If it wasn't found, return None.
"""
for name, definition in col_definitions.items():
if col_name.lower() in definition:
return name
return None
def _convert_keywords(keywords):
"""Split keywords separated by commas etc to lists."""
if not isinstance(keywords, str):
return keywords
current_best = [keywords]
for splitter in [", ", "; ", ": ", ";", ":"]:
new_split = keywords.split(splitter)
if len(new_split) > len(current_best):
current_best = new_split
return current_best
@dataclass
class Record:
    """A single record of the dataset.

    Only the fields that are relevant for the systematic review are part
    of the record; other fields are not included.

    Arguments
    ---------
    record_id: int
        Identifier for this record.
    title: str
        Title of the record.
    abstract: str
        Abstract of the record.
    authors: str
        Authors of the record.
    notes: str
        Notes of the record.
    keywords: str
        Keywords of the record.
    type_of_reference: str
        Type of reference.
    year: int
        Year of publication.
    doi: str
        DOI of the record.
    url: str
        URL of the record.
    included: int
        Label of the record.
    is_prior: bool
        Whether the record is a prior record.
    """

    # Only the identifier is mandatory; every other field defaults to
    # "not available".
    record_id: int
    title: str = None
    abstract: str = None
    authors: str = None
    notes: str = None
    keywords: str = None
    type_of_reference: str = None
    year: int = None
    doi: str = None
    url: str = None
    included: int = None
    is_prior: bool = False
[docs]
class Dataset:
    """Dataset object holding the records with texts, labels, DOIs etc.

    The data lives in a pandas DataFrame (``df``); ``column_spec`` maps the
    standardized field names ("title", "abstract", "included", ...) onto the
    actual column names of that dataframe.

    Arguments
    ---------
    df: pandas.DataFrame
        Dataframe containing the data for the ASReview data object. The
        dataframe is modified in place: column names are stripped of
        surrounding whitespace, the label column (if any) is converted to
        integers and the index is replaced by a fresh ``record_id`` index
        (0..n-1).
    column_spec: dict
        Specification for which column corresponds to which standard
        specification. Key is the standard specification, value is which
        column it is actually in. If None, the specification is inferred
        from the column names. Default: None.

    Attributes
    ----------
    record_ids: numpy.ndarray
        Return an array representing the data in the Index.
    texts: numpy.ndarray
        Returns an array with either headings, bodies, or both.
    headings: numpy.ndarray
        Returns an array with dataset headings.
    title: numpy.ndarray
        Identical to headings.
    bodies: numpy.ndarray
        Returns an array with dataset bodies.
    abstract: numpy.ndarray
        Identical to bodies.
    notes: numpy.ndarray
        Returns an array with dataset notes.
    keywords: numpy.ndarray
        Returns an array with dataset keywords.
    authors: numpy.ndarray
        Returns an array with dataset authors.
    doi: numpy.ndarray
        Returns an array with dataset DOI.
    url: numpy.ndarray
        Returns an array with dataset URLs.
    included: numpy.ndarray
        Returns an array with document inclusion markers.
    labels: numpy.ndarray
        Identical to included.

    Raises
    ------
    BadFileFormatError
        If neither an 'abstract' nor a 'title' column is available.
    """

    def __init__(self, df=None, column_spec=None):
        self.df = df
        self.column_spec = column_spec

        # Normalise the headers *before* inferring the column specification.
        # Otherwise a padded header such as " title" is not recognised, and
        # an inferred name could differ from the stripped column name.
        self.df.columns = self.df.columns.str.strip()

        if column_spec is None:
            self._get_column_spec_df()

        # Convert labels to integers; missing labels become LABEL_NA.
        if "included" in self.column_spec:
            col = self.column_spec["included"]
            self.df[col] = self.df[col].fillna(LABEL_NA).astype(int)

        # Records are always identified by their position in the dataframe.
        self.df["record_id"] = np.arange(len(self.df.index)).astype("int64")
        self.df.set_index("record_id", inplace=True)

        # Check if we either have abstracts or titles.
        has_abstract = "abstract" in self.column_spec
        has_title = "title" in self.column_spec
        if not has_abstract and not has_title:
            raise BadFileFormatError(
                "File supplied without 'abstract' or 'title' fields."
            )
        if not has_abstract:
            logging.warning("Unable to detect abstracts in dataset.")
        if not has_title:
            logging.warning("Unable to detect titles in dataset.")

    def _get_column_spec_df(self):
        """Infer ``column_spec`` from the dataframe's column names."""
        self.column_spec = {}
        for col_name in list(self.df):
            data_type = _type_from_column(col_name, COLUMN_DEFINITIONS)
            if data_type is not None:
                # A later column of the same type overrides an earlier one.
                self.column_spec[data_type] = col_name

    def _column(self, name):
        """Return the Series for standardized field ``name``, or None.

        None is returned when the field is not part of ``column_spec`` or
        the mapped column does not exist in the dataframe.
        """
        try:
            return self.df[self.column_spec[name]]
        except KeyError:
            return None

    def __len__(self):
        if self.df is None:
            return 0
        return len(self.df.index)

    def record(self, i):
        """Create a record from an index.

        Arguments
        ---------
        i: int, iterable
            Position (row number) of the record, or iterable of positions.

        Returns
        -------
        Record
            The corresponding record if i was an integer, or a list of records
            if i was an iterable.
        """
        many = is_iterable(i)
        index_list = i if many else [i]

        # Rename the specified columns to their standardized names once,
        # instead of rebuilding the renamed frame for every record.
        column_spec_inv = {v: k for k, v in self.column_spec.items()}
        fields_df = self.df.rename(columns=column_spec_inv)[list(self.column_spec)]
        record_ids = self.df.index.values

        records = [
            Record(
                record_id=int(record_ids[j]),
                # Missing values are handed to Record as None, not NaN.
                **fields_df.iloc[j].replace(np.nan, None).to_dict(),
            )
            for j in index_list
        ]

        return records if many else records[0]

    @property
    def record_ids(self):
        return self.df.index.values

    @property
    def texts(self):
        title = self.title
        abstract = self.abstract
        if title is None:
            return abstract
        if abstract is None:
            return title

        # "<title> <abstract>", stripped so that an empty title or abstract
        # leaves no stray space.
        cur_texts = (pd.Series(title) + " " + pd.Series(abstract)).str.strip()
        return cur_texts.values

    @property
    def headings(self):
        return self.title

    @property
    def title(self):
        column = self._column("title")
        return None if column is None else column.fillna("").values

    @property
    def bodies(self):
        return self.abstract

    @property
    def abstract(self):
        column = self._column("abstract")
        return None if column is None else column.fillna("").values

    @property
    def notes(self):
        column = self._column("notes")
        return None if column is None else column.values

    @property
    def keywords(self):
        column = self._column("keywords")
        return None if column is None else column.apply(_convert_keywords).values

    @property
    def authors(self):
        column = self._column("authors")
        return None if column is None else column.values

    @property
    def doi(self):
        column = self._column("doi")
        return None if column is None else column.values

    @property
    def url(self):
        column = self._column("url")
        return None if column is None else column.values

    def get(self, name):
        """Get column with name.

        ``name`` is first looked up as a standardized field name in
        ``column_spec``; if that fails it is used as a literal column name.
        Raises KeyError if neither exists.
        """
        try:
            return self.df[self.column_spec[name]].values
        except KeyError:
            return self.df[name].values

    @property
    def included(self):
        return self.labels

    @property
    def labels(self):
        column = self._column("included")
        return None if column is None else column.values

    @labels.setter
    def labels(self, labels):
        # Without a known label column, a new "included" column is created.
        column = self.column_spec.get("included", "included")
        self.df[column] = labels
        # Register the column so the getter finds the labels that were just
        # written (previously it kept returning None for a new column).
        self.column_spec["included"] = column

    def is_prior(self):
        """Get the labels that are marked as 'prior'.

        Returns
        -------
        pandas.Series
            Boolean series, True where the 'is_prior' column equals 1.

        Raises
        ------
        KeyError
            If the dataset has no 'is_prior' column specification.
        """
        column = self.column_spec["is_prior"]
        return self.df[column] == 1

    def to_file(
        self, fp, labels=None, ranking=None, writer=None, keep_old_labels=False
    ):
        """Export data object to file.

        If no writer is given, the writer is looked up by the suffix of
        ``fp`` among the 'asreview.writers' entry points; which formats are
        available depends on the installed writers.

        Arguments
        ---------
        fp: str
            Filepath to export to.
        labels: list, numpy.ndarray
            Labels to be inserted into the dataframe before export, see
            ``to_dataframe``.
        ranking: list, numpy.ndarray
            Optionally, dataframe rows can be reordered.
        writer: class
            Writer to export the file.
        keep_old_labels: bool
            If True, the old labels are kept in a column 'asreview_label_to_validate'.
            Default False.

        Raises
        ------
        BadFileFormatError
            If no writer is registered for the suffix of ``fp``.
        """
        df = self.to_dataframe(
            labels=labels, ranking=ranking, keep_old_labels=keep_old_labels
        )

        if writer is not None:
            # NOTE(review): an explicit writer is instantiated, a looked-up
            # writer is used as loaded - confirm both forms are intended.
            writer().write_data(df, fp)
            return

        suffix = Path(fp).suffix
        writers = _entry_points(group="asreview.writers")
        if not any(entry.name == suffix for entry in writers):
            raise BadFileFormatError(
                f"Error exporting file {fp}, no capabilities "
                "for exporting such a file."
            )

        writers[suffix].load().write_data(df, fp)

    def to_dataframe(self, labels=None, ranking=None, keep_old_labels=False):
        """Create new dataframe with updated label (order).

        Arguments
        ---------
        labels: list
            Sequence of (record_id, label) pairs. The current labels are
            overwritten: listed records get the given label, all other
            records become unlabelled. No effect if labels is None or if
            the dataset has no label column.
        ranking: list
            Reorder the dataframe according to these record_ids.
            Default ordering if ranking is None.
        keep_old_labels: bool
            If True, the old labels are kept in a column 'asreview_label_to_validate'.
            Default False.

        Returns
        -------
        pandas.DataFrame
            Dataframe of all available record data.
        """
        result_df = self.df.copy()

        # if there are labels, add them to the frame
        if "included" in self.column_spec and labels is not None:
            col_label = self.column_spec["included"]

            # unnest list of nested (record_id, label) tuples
            labeled_record_ids = [x[0] for x in labels]
            labeled_values = [x[1] for x in labels]

            if keep_old_labels:
                result_df["asreview_label_to_validate"] = (
                    result_df[col_label].replace(LABEL_NA, None).astype("Int64")
                )

            # remove the old results and write the values
            result_df[col_label] = LABEL_NA
            result_df.loc[labeled_record_ids, col_label] = labeled_values
            # unlabelled records are exported as missing values
            result_df[col_label] = (
                result_df[col_label].replace(LABEL_NA, None).astype("Int64")
            )

        # if there is a ranking, apply this ranking as order
        if ranking is not None:
            # sort the datasets based on the ranking
            result_df = result_df.loc[ranking]
            # append a column with 1 to n
            result_df["asreview_ranking"] = np.arange(1, len(result_df) + 1)

        return result_df

    def duplicated(self, pid="doi"):
        """Return boolean Series denoting duplicate rows.

        Identify duplicates based on titles and abstracts and if available,
        on a persistent identifier (PID) such as the Digital Object Identifier
        (`DOI <https://www.doi.org/>`_).

        Arguments
        ---------
        pid: string
            Name of the dataframe column holding the persistent identifier
            to use for deduplication. Default is 'doi'.

        Returns
        -------
        pandas.Series
            Boolean series, indexed like the dataframe, that is True for
            every row duplicating an earlier row.
        """
        if pid in self.df.columns:
            pid_column = self.df[pid]
            if is_string_dtype(pid_column) or is_object_dtype(pid_column):
                s_pid = pid_column.str.strip()
                if pid == "doi":
                    # compare DOIs case-insensitively and without URL prefix
                    s_pid = s_pid.str.lower().str.replace(
                        r"^https?://(www\.)?doi\.org/", "", regex=True
                    )
                # Replace empty strings with None only after normalising, so
                # a bare "https://doi.org/" also counts as a missing PID.
                s_pid = s_pid.replace("", None)
            else:
                s_pid = pid_column

            # save boolean series for duplicates based on persistent identifiers
            s_dups_pid = s_pid.duplicated() & s_pid.notnull()
        else:
            s_dups_pid = None

        # get the texts, clean them and replace empty strings with None;
        # the dataframe index is reused so the result aligns with self.df
        # and with the PID mask even when the index is not 0..n-1
        s = (
            pd.Series(self.texts, index=self.df.index)
            .str.replace("[^A-Za-z0-9]", "", regex=True)
            .str.lower()
            .str.strip()
            .replace("", None)
        )

        # save boolean series for duplicates based on titles/abstracts
        s_dups_text = s.duplicated() & s.notnull()

        # final boolean series for all duplicates
        if s_dups_pid is None:
            return s_dups_text
        return s_dups_pid | s_dups_text

    def drop_duplicates(self, pid="doi", inplace=False, reset_index=True):
        """Drop duplicate records.

        Drop duplicates based on titles and abstracts and if available,
        on a persistent identifier (PID) such the Digital Object Identifier
        (`DOI <https://www.doi.org/>`_).

        Arguments
        ---------
        pid: string, default 'doi'
            Which persistent identifier to use for deduplication.
        inplace: boolean, default False
            Whether to modify this dataset rather than creating a new one.
        reset_index: boolean, default True
            If True, the existing index column is reset to the default integer index.
            A newly created dataset (inplace=False) always receives a fresh
            ``record_id`` index from the constructor.

        Returns
        -------
        Dataset or None
            Dataset with duplicates removed or None if inplace=True
        """
        df = self.df[~self.duplicated(pid)]

        if reset_index:
            df = df.reset_index(drop=True)
        else:
            # detach the filtered frame from self.df before it is modified
            df = df.copy()

        if inplace:
            self.df = df
            return None

        # Pass a copy of the specification so both datasets can change
        # their column_spec independently.
        return Dataset(df, dict(self.column_spec))