Source code for asreview.data.tabular

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Names exported by ``from asreview.data.tabular import *``.
# NOTE(review): CSVWriter, ExcelReader, ExcelWriter and TSVWriter are also
# defined in this module but are not listed here — confirm this is intentional.
__all__ = ["CSVReader"]


import pandas as pd

from asreview.config import COLUMN_DEFINITIONS
from asreview.data.base import Dataset


class CSVReader:
    """CSV file reader."""

    read_format = [".csv", ".tab", ".tsv"]
    write_format = [".csv", ".tsv", ".xlsx"]

    @classmethod
    def read_data(cls, fp):
        """Import dataset.

        The file is parsed with ``utf-8`` first and with ``ISO-8859-1`` as a
        fallback. ``sep=None`` together with the python engine lets pandas
        detect the delimiter itself.

        Arguments
        ---------
        fp: str, pathlib.Path
            File path to the CSV file.

        Returns
        -------
        Dataset:
            Dataset wrapping the dataframe read from the file.

        Raises
        ------
        UnicodeDecodeError
            If the file cannot be decoded with any of the attempted
            encodings.
        """
        last_error = None
        for encoding in ["utf-8", "ISO-8859-1"]:
            try:
                df = pd.read_csv(fp, sep=None, encoding=encoding, engine="python")
            except UnicodeDecodeError as err:
                # Remember the failure and retry with the next encoding.
                last_error = err
                continue
            return Dataset(df)

        # UnicodeDecodeError takes five positional arguments; building it from
        # a bare message would itself fail with a TypeError. Reuse the details
        # of the last decoding failure and keep the explanatory message as the
        # reason.
        raise UnicodeDecodeError(
            last_error.encoding,
            last_error.object,
            last_error.start,
            last_error.end,
            "The encoding of the file is not supported.",
        ) from last_error
class CSVWriter:
    """Writer that serialises record data as CSV."""

    name = "csv"
    label = "CSV (UTF-8)"
    write_format = ".csv"

    @classmethod
    def write_data(cls, df, fp, sep=","):
        """Write the records to a delimited text file.

        Arguments
        ---------
        df: pandas.DataFrame
            Dataframe holding all available record data.
        fp: str, NoneType
            Destination file path, or None to get the output back as a
            string instead of writing a file.
        sep: str
            Field separator placed between the columns.

        Returns
        -------
        str or None
            The result of ``DataFrame.to_csv``: the CSV text when ``fp`` is
            None, otherwise None.
        """
        # The dataframe index is always written as the first column.
        csv_options = {"sep": sep, "index": True}
        return df.to_csv(fp, **csv_options)
class ExcelReader:
    """Excel file reader."""

    read_format = [".xlsx"]
    write_format = [".csv", ".tsv", ".xlsx"]

    @classmethod
    def read_data(cls, fp):
        """Import dataset.

        All sheets of the workbook are loaded (``sheet_name=None``) and the
        sheet whose lower-cased column names overlap most with the names
        listed in ``COLUMN_DEFINITIONS`` is used. On a tie the earliest
        sheet wins.

        Arguments
        ---------
        fp: str, pathlib.Path
            File path to the Excel file (.xlsx).

        Returns
        -------
        Dataset:
            Dataset wrapping the dataframe of the best matching sheet.
        """
        try:
            dfs = pd.read_excel(fp, sheet_name=None)
        except UnicodeDecodeError:
            # NOTE(review): recent pandas releases do not document an
            # ``encoding`` argument for read_excel — confirm that this
            # fallback still works with the supported pandas versions.
            dfs = pd.read_excel(fp, sheet_name=None, encoding="ISO-8859-1")

        # Build the set of recognised column names once, not once per sheet.
        wanted_columns = {
            col_name
            for type_list in COLUMN_DEFINITIONS.values()
            for col_name in type_list
        }

        best_sheet = None
        sheet_obj_val = -1
        for sheet_name, sheet_df in dfs.items():
            # Column labels are not guaranteed to be strings (e.g. numeric
            # header cells), so convert them before lower-casing.
            col_names = {str(col).lower() for col in sheet_df.columns}
            obj_val = len(col_names & wanted_columns)
            if obj_val > sheet_obj_val:
                sheet_obj_val = obj_val
                best_sheet = sheet_name

        return Dataset(dfs[best_sheet])
class ExcelWriter:
    """Writer that serialises record data as an Excel workbook."""

    name = "xlsx"
    label = "Excel"
    write_format = ".xlsx"

    @classmethod
    def write_data(cls, df, fp):
        """Write the records to an Excel file.

        Arguments
        ---------
        df: pandas.DataFrame
            Dataframe holding all available record data.
        fp: str, NoneType
            Destination passed on to ``DataFrame.to_excel``.
            NOTE(review): confirm whether None is actually accepted here.

        Returns
        -------
        The result of ``DataFrame.to_excel``.
        """
        # The index is written as the first column; the workbook is produced
        # with the xlsxwriter engine.
        excel_options = {"index": True, "engine": "xlsxwriter"}
        return df.to_excel(fp, **excel_options)
class TSVWriter:
    """Writer that serialises record data as tab-separated values."""

    name = "tsv"
    label = "TSV (UTF-8)"
    write_format = ".tsv"

    @classmethod
    def write_data(cls, df, fp, sep="\t"):
        """Write the records to a delimited text file.

        Arguments
        ---------
        df: pandas.DataFrame
            Dataframe holding all available record data.
        fp: str, NoneType
            Destination file path, or None to get the output back as a
            string instead of writing a file.
        sep: str
            Field separator placed between the columns (a tab by default).

        Returns
        -------
        str or None
            The result of ``DataFrame.to_csv``: the delimited text when
            ``fp`` is None, otherwise None.
        """
        # The dataframe index is always written as the first column.
        tsv_options = {"sep": sep, "index": True}
        return df.to_csv(fp, **tsv_options)