Source code for asreview.data.statistics

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Public names of this module: the dataset statistics functions that a
# star-import (``from <module> import *``) exposes.
__all__ = [
    "n_records",
    "n_relevant",
    "n_irrelevant",
    "n_unlabeled",
    "n_missing_title",
    "n_missing_abstract",
    "title_length",
    "abstract_length",
    "n_keywords",
    "n_duplicates",
]

import numpy as np



[docs]
def n_records(data):
    """Count the records in the dataset.

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.

    Return
    ------
    int:
        The size of ``data`` as reported by ``len``.
    """
    record_count = len(data)
    return record_count




[docs]
def n_relevant(data):
    """Count the records that are labeled as relevant (label 1).

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.

    Return
    ------
    int:
        How many entries of ``data.labels`` equal 1, or None when the
        dataset has no labels.
    """
    labels = data.labels
    if labels is None:
        return None

    # np.where with a single argument yields one index array per dimension;
    # the first one has an entry for every matching label.
    relevant_positions = np.where(labels == 1)[0]
    return len(relevant_positions)




[docs]
def n_irrelevant(data):
    """Count the records that are labeled as irrelevant (label 0).

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.

    Return
    ------
    int:
        How many entries of ``data.labels`` equal 0, or None when the
        dataset has no labels.
    """
    labels = data.labels
    if labels is not None:
        # First index array of np.where has one entry per matching label.
        irrelevant_positions = np.where(labels == 0)[0]
        return len(irrelevant_positions)
    return None




[docs]
def n_unlabeled(data):
    """Count the records that are neither relevant nor irrelevant.

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.

    Return
    ------
    int:
        The number of labels minus the relevant and irrelevant counts, or
        None when the dataset has no labels.
    """
    labels = data.labels
    if labels is None:
        return None

    # Whatever is left after removing both labeled groups is unlabeled.
    n_labeled = n_relevant(data) + n_irrelevant(data)
    return len(labels) - n_labeled




[docs]
def n_missing_title(data):
    """Count the records with a missing (empty) title.

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.

    Return
    ------
    tuple:
        ``(n_missing, n_missing_included)``. ``n_missing`` is the number
        of records whose title has length 0 and ``n_missing_included`` is
        how many of those have label 1. ``n_missing_included`` is None
        when the dataset has no labels; both values are None when the
        dataset has no titles.
    """
    titles = data.title
    if titles is None:
        return None, None

    labels = data.labels
    if labels is not None:
        # Plain array so that labels[i] is always a positional lookup.
        labels = np.asarray(labels)

    n_missing = 0
    n_missing_included = None if labels is None else 0

    # Iterate by position instead of indexing titles with [i], so the result
    # does not depend on how the title container implements item access.
    for position, title in enumerate(titles):
        if len(title) != 0:
            continue
        n_missing += 1
        if labels is not None and labels[position] == 1:
            n_missing_included += 1

    return n_missing, n_missing_included




[docs]
def n_missing_abstract(data):
    """Count the records with a missing (empty) abstract.

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.

    Return
    ------
    tuple:
        ``(n_missing, n_missing_included)``. ``n_missing`` is the number
        of records whose abstract has length 0 and ``n_missing_included``
        is how many of those have label 1. ``n_missing_included`` is None
        when the dataset has no labels; both values are None when the
        dataset has no abstracts.
    """
    abstracts = data.abstract
    if abstracts is None:
        return None, None

    labels = data.labels
    if labels is not None:
        # Plain array so that labels[i] is always a positional lookup.
        labels = np.asarray(labels)

    n_missing = 0
    n_missing_included = None if labels is None else 0

    # Iterate by position instead of indexing abstracts with [i], so the
    # result does not depend on how the container implements item access.
    for position, abstract in enumerate(abstracts):
        if len(abstract) != 0:
            continue
        n_missing += 1
        if labels is not None and labels[position] == 1:
            n_missing_included += 1

    return n_missing, n_missing_included




[docs]
def title_length(data):
    """Return the average length of the titles.

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.

    Return
    ------
    float:
        The summed length of all titles divided by the number of titles,
        or None when the dataset has no titles.

    Raises
    ------
    ZeroDivisionError:
        If the dataset has titles but contains zero records.
    """
    titles = data.title
    if titles is None:
        return None

    # Iterate by position rather than indexing with [i], so the result does
    # not depend on how the title container implements item access.
    total_length = sum(len(title) for title in titles)
    return total_length / len(titles)




[docs]
def abstract_length(data):
    """Return the average length of the abstracts.

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.

    Return
    ------
    float:
        The summed length of all abstracts divided by the number of
        abstracts, or None when the dataset has no abstracts.

    Raises
    ------
    ZeroDivisionError:
        If the dataset has abstracts but contains zero records.
    """
    abstracts = data.abstract
    if abstracts is None:
        return None

    # Iterate by position rather than indexing with [i], so the result does
    # not depend on how the abstract container implements item access.
    total_length = sum(len(abstract) for abstract in abstracts)
    return total_length / len(abstracts)




[docs]
def n_keywords(data):
    """Return the average number of keywords per record.

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.

    Return
    ------
    float:
        The mean length of the entries in ``data.keywords``, or None when
        the dataset has no keywords.
    """
    keywords_per_record = data.keywords
    if keywords_per_record is None:
        return None

    keyword_counts = [
        len(record_keywords) for record_keywords in keywords_per_record
    ]
    return np.average(keyword_counts)




[docs]
def n_duplicates(data, pid="doi"):
    """Count the duplicate records.

    Duplicate detection can be a very challenging task. Multiple
    algorithms can be used and results can vary.

    Arguments
    ---------
    data: asreview.Dataset
        The Dataset object holding the records.
    pid: string
        Which persistent identifier (PID) to use for deduplication.
        Default is 'doi'.

    Return
    ------
    int:
        The sum of the result of ``data.duplicated(pid)``, i.e. the
        number of records flagged as duplicates.
    """
    duplicate_flags = data.duplicated(pid)
    n_flagged = duplicate_flags.sum()
    return int(n_flagged)