Source code for asreview.data.base

from abc import ABC
from abc import abstractmethod

import numpy as np
import pandas as pd

from asreview.data.record import Record
from asreview.data.utils import convert_value_to_int
from asreview.data.utils import convert_value_to_list
from asreview.data.utils import standardize_included_label


class BaseReader(ABC):
    """Base class for data readers.

    Reading data from a file happens in three steps: read the raw data, perform
    data cleaning and turn it into `Record` instances. This happens in
    `read_data`, `clean_data` and `to_records`. Anyone implementing a
    `BaseReader` should provide an implementation of `read_data`. There are
    default implementations of `clean_data` and `to_records`. They assume that
    `read_data` produces a pandas DataFrame. There are a number of ways to
    customize the default cleaning behavior, see the comments next to the class
    attributes.
    """

    # When a data reader reads a file and turns it into records, it needs to know
    # which columns of the input data to put into which columns of the record. By
    # default these should be the same, but you can allow for alternative input
    # column names. For example, ASReview allows both 'title' or 'primary_title'
    # for the title column. The format is
    # {record_column_name: [list of input column names]}, where the list of input
    # column names is in order from most important to least important. So when the
    # input dataset contains two possible input columns for a record column, it
    # will pick the first it finds in the list.
    # If a field is not in this mapping, only the record column is allowed as
    # input column.
    __alternative_column_names__ = {
        "abstract": ["abstract", "notes_abstract", "abstract note"],
        "authors": ["authors", "first_authors", "author names"],
        "included": [
            "asreview_label",
            "included",
            "label",
            "final_included",
            "label_included",
            "included_label",
            "included_final",
            "included_flag",
            "include",
        ],
        "title": ["title", "primary_title"],
    }

    # Dictionary {column name: [functions to apply to the column]} of functions
    # that clean the data after reading it. Each function should act on
    # individual values; the functions are applied in list order.
    __cleaning_methods__ = {
        "authors": [convert_value_to_list],
        "keywords": [convert_value_to_list],
        "included": [standardize_included_label],
        "year": [convert_value_to_int],
    }

    # Fill missing values with this value. It should be a tuple with one entry
    # which is the value that will be used to fill all missing values. To disable
    # filling the missing values, put `None` instead of `(None,)`.
    __fillna_default__ = (None,)

    @classmethod
    def _fill_missing_values(cls, df):
        """Replace `pd.NA`/`np.nan` in `df` in place by the configured default.

        Does nothing when `__fillna_default__` is `None`, which is the
        documented way to disable filling of missing values.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe to modify in place.
        """
        if cls.__fillna_default__ is None:
            return
        df.replace([pd.NA, np.nan], cls.__fillna_default__[0], inplace=True)

    @classmethod
    def read_records(cls, fp, dataset_id, record_cls=Record, *args, **kwargs):
        """Read a file and turn its contents into records.

        Runs the full pipeline: `read_data`, then `clean_data`, then
        `to_records`.

        Parameters
        ----------
        fp : Path
            Filepath of the file to read.
        dataset_id : str
            Identifier of the dataset, passed on to `to_records`.
        record_cls : asreview.data.record.Base, optional
            Record class to use, by default Record
        *args, **kwargs
            Forwarded unchanged to `read_data`.

        Returns
        -------
        list[Record]
            List of records.
        """
        df = cls.read_data(fp, *args, **kwargs)
        # Missing values are already replaced before cleaning, so that the
        # per-value cleaning functions receive the fill value instead of NaN/NA.
        # This honours `__fillna_default__ = None` (filling disabled) instead of
        # failing on the tuple lookup.
        cls._fill_missing_values(df)
        df = cls.clean_data(df)
        return cls.to_records(df, dataset_id=dataset_id, record_cls=record_cls)

    @classmethod
    @abstractmethod
    def read_data(cls, fp, *args, **kwargs):
        """Read the raw data from a file.

        The data type of the output should be equal to the data type of the
        input of `clean_data`. Typically this will mean a pandas DataFrame, but
        anyone creating a custom class can choose a different data type.

        This method should not perform any cleaning of the data. That way data
        writers can add columns to a dataset without changing the original
        data: Use `reader.read_data` to get the data, then add the column, then
        write away the data to a file.

        Parameters
        ----------
        fp : Path
            Filepath of the file to read.

        Returns
        -------
        pd.DataFrame
            A dataframe of user input data that has not been cleaned yet.
        """
        raise NotImplementedError

    @classmethod
    def clean_data(cls, df):
        """Clean the raw data.

        Parameters
        ----------
        df : pd.DataFrame
            Data to clean. This should be of the same type as the output of
            `read_data`. The dataframe is modified in place.

        Returns
        -------
        pd.DataFrame
            Cleaned data. By default it standardizes the column names, some
            data types and missing values.
        """
        df = cls.standardize_column_names(df)
        for column, cleaning_methods in cls.__cleaning_methods__.items():
            if column not in df.columns:
                continue
            for cleaning_method in cleaning_methods:
                df[column] = df[column].apply(cleaning_method)
        # Cleaning functions may themselves produce NaN/NA values, so fill again.
        cls._fill_missing_values(df)
        return df

    @classmethod
    def to_records(cls, df, dataset_id=None, record_cls=Record):
        """Turn the cleaned data into records.

        Parameters
        ----------
        df : pd.DataFrame
            Cleaned data.
        dataset_id : str, optional
            Identifier of the dataset, by default None
        record_cls : asreview.data.record.Base, optional
            Record class to use, by default Record

        Returns
        -------
        list[Record]
            List of records.

        Raises
        ------
        ValueError
            If `record_cls` raises a `ValueError` for a row. The message
            contains the index of the offending row.
        """
        # Only pass on columns the record class knows about. 'record_id' is
        # never taken from the input data.
        columns_present = set(df.columns).intersection(set(record_cls.get_columns()))
        columns_present.discard("record_id")

        records = []
        for row in df[list(columns_present)].itertuples():
            row_dict = row._asdict()
            # `itertuples` puts the dataframe index in the 'Index' field.
            idx = row_dict.pop("Index")
            try:
                records.append(
                    record_cls(dataset_row=idx, dataset_id=dataset_id, **row_dict)
                )
            except ValueError as e:
                raise ValueError(f"Error when reading row {idx} of dataset: {e}") from e
        return records

    @classmethod
    def standardize_column_names(cls, df):
        """Standardize column names of input data.

        The reader can accept multiple names for a specific type of data, for
        example both 'title' and 'primary_title' could refer to the column
        containing the title data. This function makes sure the correct columns
        are used. See also the attribute `__alternative_column_names__` for
        customizing this behavior.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe containing raw data. The dataframe is modified in place.

        Returns
        -------
        pd.DataFrame
            Dataframe with column names lowercased and stripped of white space.
            In addition, for the columns in `__alternative_column_names__`, the
            first alternative column name in the data will be used as input for
            the column values.
        """
        # The original dataset object allowed for uppercase column names and
        # surrounding white space. All column names are lowercased and stripped
        # here, which might cause bugs if this results in two columns with the
        # same name, i.e. 'Title' & 'title' -> 'title'. This is assumed not to
        # happen in practice.
        df.columns = [col.lower().strip() for col in df.columns]

        # Allow for alternative column names.
        for column, alternative_columns in cls.__alternative_column_names__.items():
            # If the record column itself is present it is always used,
            # regardless of its position in the list of alternatives.
            if column in df.columns:
                continue
            for alternative_column in alternative_columns:
                if alternative_column in df.columns:
                    df[column] = df[alternative_column]
                    break
        return df