Source code for asreview.data.ris

# Copyright 2019-2025 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Public names exported when this module is star-imported.
__all__ = ["RISReader", "RISWriter"]

import copy
import io
import json
import re
from urllib.request import urlopen

import pandas as pd
import rispy

from asreview.data.base import BaseReader
from asreview.data.utils import convert_value_to_list
from asreview.utils import _is_url

# Marker texts that can occur in the RIS "notes" field, mapped to the label
# value they stand for (1, 0, or None).
RIS_NOTE_LABEL_MAPPING = dict(
    ASReview_relevant=1,
    ASReview_irrelevant=0,
    ASReview_not_seen=None,
)

# Inverse lookup: label value -> marker text to write into the notes.
LABEL_RIS_NOTE_MAPPING = {
    label: marker for marker, label in RIS_NOTE_LABEL_MAPPING.items()
}


def _parse_label_from_notes(note_list):
    """Extract the ASReview label from a list of notes.

    Parameters
    ----------
    note_list: list
        A list of notes, coming from the Dataframe's "notes" column. Any value
        that is not a list is treated as "no notes".

    Returns
    -------
    int or None
        The label belonging to the first note that contains one of the keys of
        ``RIS_NOTE_LABEL_MAPPING`` as a substring. ``None`` is returned when no
        note matches or when `note_list` is not a list.
    """
    if isinstance(note_list, list):
        for note in note_list:
            # Markers are tested in the mapping's own order; the first hit wins.
            hits = [marker for marker in RIS_NOTE_LABEL_MAPPING if marker in note]
            if hits:
                return RIS_NOTE_LABEL_MAPPING[hits[0]]
    return None


def _remove_asreview_data_from_notes(note_list):
    """Drop the notes that were written by ASReview.

    Parameters
    ----------
    note_list: list
        A list of notes, coming from the Dataframe's "notes" column.

    Returns
    -------
    list
        The notes that are neither exactly equal to one of the keys of
        ``RIS_NOTE_LABEL_MAPPING`` nor start with ``asreview_``. An empty list
        is returned when `note_list` is not a list.
    """
    # Anything that is not a list (e.g. a missing value) carries no notes.
    if not isinstance(note_list, list):
        return []

    kept_notes = []
    for note in note_list:
        if note in RIS_NOTE_LABEL_MAPPING or note.startswith("asreview_"):
            continue
        kept_notes.append(note)
    return kept_notes



[docs]
class RISReader(BaseReader):
    """RIS file reader.

    Parses RIS data with ``rispy`` into a pandas dataframe. Labels that are
    stored as notes (see ``RIS_NOTE_LABEL_MAPPING``) are extracted into an
    ``included`` column, after which the ASReview specific notes are removed.
    """

    read_format = [".ris", ".txt"]
    write_format = [".csv", ".tsv", ".xlsx", ".ris"]

    mime_types = {
        "application/x-research-info-systems": [".ris", ".txt"],
        "text/plain": [".txt", ".ris"],
    }

    @staticmethod
    def _strip_zotero_p_tags(note_list):
        """Converter function for removing the XHTML <p></p> tags from Zotero export.

        Parameters
        ----------
        note_list: list
            A list of notes, coming from the Dataframe's "notes" column.

        Returns
        -------
        new_notes: list
            A list of updated notes, where a leading ``<p>`` and a trailing
            ``</p>`` have been stripped from every string note. Notes that are
            not strings are kept unchanged.
        note_list: object
            The original value, when it is not a list.
        """
        if not isinstance(note_list, list):
            return note_list
        # Only strings can be handed to the regex; other values pass through.
        return [
            re.sub(r"^<p>|<\/p>$", "", note) if isinstance(note, str) else note
            for note in note_list
        ]

    @classmethod
    def _read_from_file(cls, fp, encoding="utf8"):
        """Parse a local RIS file into a list of entries.

        Parameters
        ----------
        fp: str, pathlib.Path
            Path of the file to open.
        encoding: str
            Text encoding used to open the file.

        Returns
        -------
        list
            The entries produced by ``rispy.load``.
        """
        with open(fp, encoding=encoding) as bibliography_file:
            return list(rispy.load(bibliography_file, skip_unknown_tags=True))

    @classmethod
    def _read_from_url(cls, fp, encoding="utf8"):
        """Download RIS data from a URL and parse it into a list of entries.

        Parameters
        ----------
        fp: str
            URL to download.
        encoding: str
            Text encoding used to decode the downloaded bytes.

        Returns
        -------
        list
            The entries produced by ``rispy.load``.
        """
        # Close the HTTP response as soon as the payload has been read.
        with urlopen(fp) as url_input:
            content = url_input.read().decode(encoding)

        with io.StringIO(content) as bibliography_file:
            return list(rispy.load(bibliography_file, skip_unknown_tags=True))

    @classmethod
    def read_data(cls, fp):
        """Import dataset.

        Parameters
        ----------
        fp: str, pathlib.Path
            File path or URL to the RIS file.

        Returns
        -------
        pd.DataFrame:
            Dataframe with entries. If the notes field contains a note with the text
            `ASReview_relevant`, `ASReview_irrelevant` or `ASReview_not_seen`, the
            data frame will have a column `included` with the value `1`, `0` or `None`.

        Raises
        ------
        ValueError
            File with unrecognized encoding is used as input, or any other
            error occurred while reading or parsing the data.
        """
        # Try the candidate encodings in order until one decodes the data.
        for encoding in ("utf-8", "utf-8-sig", "ISO-8859-1"):
            try:
                if _is_url(fp):
                    entries = cls._read_from_url(fp, encoding=encoding)
                else:
                    entries = cls._read_from_file(fp, encoding=encoding)
            except UnicodeDecodeError:
                continue
            except Exception as e:
                # Keep the original exception attached as the cause.
                raise ValueError(f"Error reading RIS file: {e}") from e
            break
        else:
            raise ValueError("Cannot find proper encoding for data file")

        # Turn the list of entry dictionaries into a Pandas dataframe
        df = pd.DataFrame(entries)

        if "notes" in df:
            # Strip Zotero XHTML <p> tags on "notes"
            df["notes"] = df["notes"].apply(cls._strip_zotero_p_tags)
            # Parse the labels from the notes if present. The column is only
            # created when at least one record carries a label.
            labels = df["notes"].apply(_parse_label_from_notes)
            if not labels.isna().all():
                df["included"] = labels
            df["notes"] = df["notes"].apply(_remove_asreview_data_from_notes)

        if "included" in df:
            # Nullable integer dtype, so records without a label stay missing.
            df["included"] = df["included"].astype("Int64")

        return df

    @classmethod
    def clean_data(cls, df):
        """Clean a dataframe that was read from a RIS file.

        The dataframe is modified in place: a `label` column is dropped and
        the `authors` and `first_authors` columns are created when absent.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe as returned by `read_data`.

        Returns
        -------
        The result of the base class ``clean_data`` applied to the dataframe.
        """
        # We drop the 'label' column if it's available. For RIS files ASReview stores
        # and loads the labels from the notes field.
        df.drop("label", axis=1, inplace=True, errors="ignore")

        # We combine the values of 'authors' and 'first_authors' into one list of
        # authors. Internally we only use the authors when searching for a record, so
        # it does not matter too much if combining the two lists leads to the wrong
        # order of authors, or duplicate authors appearing in the list.
        if "authors" not in df:
            df["authors"] = None
        if "first_authors" not in df:
            df["first_authors"] = None
        df["authors"] = df["authors"].apply(convert_value_to_list)
        df["first_authors"] = df["first_authors"].apply(convert_value_to_list)
        df["authors"] = df["authors"] + df["first_authors"]

        return super().clean_data(df)





[docs]
class RISWriter:
    """RIS file writer.

    Converts the records of a dataframe into RIS entries using ``rispy``.
    Columns whose name starts with ``asreview_`` are stored as notes in the
    ``notes`` field of each entry.
    """

    name = "ris"
    label = "RIS"
    caution = "Available only if you imported a RIS file when creating the project"
    write_format = ".ris"

    @staticmethod
    def _isnull(value):
        """Return True when `value` is missing; an empty list counts as missing."""
        if isinstance(value, list):
            return len(value) == 0

        return pd.isnull(value)

    @staticmethod
    def _is_missing_scalar(value):
        """Return True for scalar missing markers such as None, NaN or pd.NA."""
        return pd.api.types.is_scalar(value) and bool(pd.isnull(value))

    @classmethod
    def _as_note_list(cls, notes):
        """Return the content of a "notes" cell as a list of notes."""
        if isinstance(notes, list):
            return notes
        if pd.api.types.is_scalar(notes):
            # A missing cell carries no notes; any other scalar is one note.
            return [] if cls._is_missing_scalar(notes) else [notes]
        return list(notes)

    @classmethod
    def _to_ris_record(cls, rec):
        """Convert one dataframe record (dict) into a record for ``rispy``."""
        notes = cls._as_note_list(rec.pop("notes", []))
        ris_record = {"notes": notes}

        for key, val in rec.items():
            if key == "asreview_label":
                # Pandas represents a missing label as NaN or pd.NA depending
                # on the column dtype; the mapping only has None as a key.
                label = None if cls._is_missing_scalar(val) else val
                notes.insert(0, LABEL_RIS_NOTE_MAPPING[label])
            elif cls._isnull(val):
                continue
            elif key.startswith("asreview_"):
                notes.append(f"{key}: {json.dumps(val)}")
            elif key != "included":
                # The 'included' column can not be exported to RIS.
                ris_record[key] = val

        if not notes:
            del ris_record["notes"]

        return ris_record

    @classmethod
    def write_data(cls, df, fp):
        """Export dataset.

        Any column from the data frame that starts with `asreview_` is added to
        the RIS output as note in the notes field of the form:
        `asreview_{column_name}: json.dumps({column_value})`. If the dataframe
        contains a column `asreview_label`, a note is inserted at the front of
        the notes with the value `ASReview_relevant`, `ASReview_irrelevant` or
        `ASReview_not_seen` corresponding to the value `1`, `0` or a missing
        value (`None`, `NaN`, `pd.NA`) in that column. The column `included` is
        not exported.

        Parameters
        ----------
        df: pd.Dataframe
            Dataframe of all available record data.
        fp: str, pathlib.Path, None
            File path to write the RIS data to. When `None`, nothing is written
            and the RIS content is returned instead.

        Returns
        -------
        str or None
            The RIS content as produced by ``rispy.dumps`` when `fp` is `None`,
            otherwise `None`.

        Raises
        ------
        KeyError
            A value in the `asreview_label` column is not `1`, `0` or missing.
        """
        # Turn pandas DataFrame into records (list of dictionaries) for rispy.
        # The deep copy keeps the note lists inside `df` from being mutated.
        records = copy.deepcopy(df.to_dict("records"))
        ris_records = [cls._to_ris_record(rec) for rec in records]

        if fp is None:
            # Return the whole content as a string
            return rispy.dumps(ris_records)

        # Write the whole content to a file
        with open(fp, "w", encoding="utf8") as ris_file:
            rispy.dump(ris_records, ris_file)