# Source code for asreview.models.feature_extraction.base

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["BaseFeatureExtraction"]

from abc import abstractmethod

import numpy as np
from scipy.sparse import hstack
from scipy.sparse import issparse

from asreview.models.base import BaseModel



# [docs]
class BaseFeatureExtraction(BaseModel):
    """Base class for feature extraction methods.

    Subclasses implement ``transform`` (abstract) and may override ``fit``,
    which does nothing by default. ``fit_transform`` fits on the texts and
    builds the feature matrix, optionally from separate title, abstract and
    keyword inputs depending on ``split_ta`` and ``use_keywords``.
    """

    # Name of this model; presumably the key used to look it up elsewhere --
    # confirm against BaseModel and its callers.
    name = "base-feature"

    def __init__(self, split_ta=0, use_keywords=0):
        self.split_ta = split_ta
        self.use_keywords = use_keywords


# [docs]
    def fit_transform(self, texts, titles=None, abstracts=None, keywords=None):
        """Fit on the texts and return the feature matrix.

        Arguments
        ---------
        texts: numpy.ndarray
            A sequence of texts to be transformed. They are not yet tokenized.
            The model is always fitted on these texts.
        titles: numpy.ndarray
            Titles to transform separately. Required when
            ``self.split_ta > 0``, ignored otherwise.
        abstracts: numpy.ndarray
            Abstracts to transform separately. Required when
            ``self.split_ta > 0``, ignored otherwise.
        keywords: iterable
            One iterable of keyword strings per record. Each one is joined
            with spaces and transformed; the resulting features are appended
            as extra columns. Only used when ``self.use_keywords`` is truthy.

        Returns
        -------
        numpy.ndarray or scipy.sparse matrix
            Feature matrix representing the texts. Blocks are stacked
            column-wise; sparse blocks are stacked into CSR format.

        Raises
        ------
        ValueError
            If ``self.split_ta > 0`` and ``titles`` or ``abstracts`` is
            missing.
        """
        split_fields = self.split_ta > 0

        # Validate before fitting so that bad input fails fast, without
        # spending time on (or changing the state of) the model first.
        if split_fields and (titles is None or abstracts is None):
            raise ValueError(
                "Error: if splitting titles and abstracts, supply them!"
            )

        self.fit(texts)

        if split_fields:
            X_titles = self.transform(titles)
            X_abstracts = self.transform(abstracts)
            if issparse(X_titles) and issparse(X_abstracts):
                X = hstack([X_titles, X_abstracts]).tocsr()
            else:
                X = np.concatenate((X_titles, X_abstracts), axis=1)
        else:
            X = self.transform(texts)

        if self.use_keywords and keywords is not None:
            # Each record's keywords become one space-separated string so
            # they can go through the same transform as ordinary texts.
            join_keys = np.array([" ".join(key) for key in keywords])
            X_keywords = self.transform(join_keys)
            if issparse(X_keywords):
                X = hstack([X, X_keywords]).tocsr()
            else:
                X = np.concatenate((X, X_keywords), axis=1)

        return X



# [docs]
    def fit(self, texts):
        """Fit the model to the texts.

        The base implementation does nothing. Subclasses only need to
        override it when there is real fitting to be done.

        Arguments
        ---------
        texts: numpy.ndarray
            Texts to be fitted.
        """
        return None



# [docs]
    @abstractmethod
    def transform(self, texts):
        """Convert a list of texts into a feature matrix.

        Abstract: concrete feature extraction methods must override this.

        Arguments
        ---------
        texts: numpy.ndarray
            A sequence of texts to be transformed. They are not yet tokenized.

        Returns
        -------
        numpy.ndarray
            Feature matrix representing the texts.

        Raises
        ------
        NotImplementedError
            Always, in this base implementation.
        """
        raise NotImplementedError()