# Source code for asreview.models.feature_extractors

# Copyright 2019-2025 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

__all__ = ["Tfidf", "OneHot"]


class TextMerger(TransformerMixin, BaseEstimator):
    """Concatenate several text columns into one string per row.

    The transformer learns nothing during ``fit``. In ``transform`` it
    selects the configured columns, replaces missing values with empty
    strings and joins the values of every row with the separator.

    Parameters
    ----------
    columns: list
        Names of the columns whose values are joined.
    sep: str
        String placed between the joined column values.
    """

    def __init__(self, columns, sep=" "):
        self.sep = sep
        self.columns = columns

    def fit(self, X, y=None):
        """Return the transformer itself; no state is fitted."""
        return self

    def transform(self, X):
        """Join the selected columns of ``X`` row by row.

        ``X`` is indexed with ``self.columns`` and must support ``fillna``
        and a row-wise ``apply`` (presumably a pandas DataFrame).
        """

        def join_row(row):
            # One merged string per row, values separated by ``sep``.
            return self.sep.join(row)

        selected = X[self.columns]
        without_missing = selected.fillna("")
        return without_missing.apply(join_row, axis=1)



# [docs]
class Tfidf(Pipeline):
    """TF-IDF feature extraction.

    Two-step pipeline: a TextMerger that joins the selected text columns,
    followed by sklearn.feature_extraction.text.TfidfVectorizer. Both steps
    are built in the constructor from the arguments given there.

    Parameters
    ----------
    columns: list or None, default=None
        See TextMerger. ``None`` selects ``["title", "abstract"]``.
    sep: str, default=" "
        See TextMerger
    lowercase: bool, default=True
        See ScikitLearn TfidfVectorizer
    stop_words: {'english'} or list or None, default=None
        See ScikitLearn TfidfVectorizer
    token_pattern: str or None, default=r"(?u)\\b\\w\\w+\\b"
        See ScikitLearn TfidfVectorizer
    ngram_range: tuple (min_n, max_n), default=(1,1)
        See ScikitLearn TfidfVectorizer. Converted to a tuple on assignment.
    max_df: float in range [0.0, 1.0] or int, default=1.0
        See ScikitLearn TfidfVectorizer
    min_df: float in range [0.0, 1.0] or int, default=1
        See ScikitLearn TfidfVectorizer
    max_features: int, default=None
        See ScikitLearn TfidfVectorizer
    vocabulary: Mapping or iterable, default=None
        See ScikitLearn TfidfVectorizer
    binary: bool, default=False
        See ScikitLearn TfidfVectorizer
    norm: {"l1", "l2"} or None, default="l2"
        See ScikitLearn TfidfVectorizer
    use_idf: bool, default=True
        See ScikitLearn TfidfVectorizer
    smooth_idf: bool, default=True
        See ScikitLearn TfidfVectorizer
    sublinear_tf: bool, default=False
        See ScikitLearn TfidfVectorizer
    **kwargs: dict
        Additional keyword arguments, forwarded unchanged to
        ScikitLearn TfidfVectorizer. They are not stored on the instance.
    """

    name = "tfidf"
    label = "TF-IDF"

    def __init__(
        self,
        columns=None,
        sep=" ",
        lowercase=True,
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 1),
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        binary=False,
        norm="l2",
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False,
        **kwargs,
    ):
        # Build the default list per instance. A list literal used as the
        # default value is created once and would be shared by every instance
        # (and its TextMerger step) constructed without ``columns``. A list
        # passed by the caller is stored as given, not copied.
        self.columns = ["title", "abstract"] if columns is None else columns
        self.sep = sep
        self.lowercase = lowercase
        self.stop_words = stop_words
        self.token_pattern = token_pattern
        self.ngram_range = tuple(ngram_range)
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.vocabulary = vocabulary
        self.binary = binary
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf
        super().__init__(
            [
                ("text_merger", TextMerger(columns=self.columns, sep=self.sep)),
                (
                    "tfidf",
                    TfidfVectorizer(
                        lowercase=self.lowercase,
                        stop_words=self.stop_words,
                        token_pattern=self.token_pattern,
                        ngram_range=self.ngram_range,
                        max_df=self.max_df,
                        min_df=self.min_df,
                        max_features=self.max_features,
                        vocabulary=self.vocabulary,
                        binary=self.binary,
                        norm=self.norm,
                        use_idf=self.use_idf,
                        smooth_idf=self.smooth_idf,
                        sublinear_tf=self.sublinear_tf,
                        **kwargs,
                    ),
                ),
            ]
        )




# [docs]
class OneHot(Pipeline):
    """One-hot feature extraction.

    Two-step pipeline: a TextMerger that joins the selected text columns,
    followed by sklearn.feature_extraction.text.CountVectorizer, which is
    always created with binary=True. Both steps are built in the constructor
    from the arguments given there.

    Parameters
    ----------
    columns: list or None, default=None
        See TextMerger. ``None`` selects ``["title", "abstract"]``.
    sep: str, default=" "
        See TextMerger
    lowercase: bool, default=True
        See ScikitLearn CountVectorizer
    stop_words: {'english'} or list or None, default=None
        See ScikitLearn CountVectorizer
    token_pattern: str or None, default=r"(?u)\\b\\w\\w+\\b"
        See ScikitLearn CountVectorizer
    ngram_range: tuple (min_n, max_n), default=(1,1)
        See ScikitLearn CountVectorizer. Converted to a tuple on assignment.
    max_df: float in range [0.0, 1.0] or int, default=1.0
        See ScikitLearn CountVectorizer
    min_df: float in range [0.0, 1.0] or int, default=1
        See ScikitLearn CountVectorizer
    max_features: int, default=None
        See ScikitLearn CountVectorizer
    vocabulary: Mapping or iterable, default=None
        See ScikitLearn CountVectorizer
    **kwargs: dict
        Additional keyword arguments, forwarded unchanged to
        ScikitLearn CountVectorizer. They are not stored on the instance.
    """

    name = "onehot"
    label = "OneHot"

    def __init__(
        self,
        columns=None,
        sep=" ",
        lowercase=True,
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 1),
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        **kwargs,
    ):
        # Build the default list per instance. A list literal used as the
        # default value is created once and would be shared by every instance
        # (and its TextMerger step) constructed without ``columns``. A list
        # passed by the caller is stored as given, not copied.
        self.columns = ["title", "abstract"] if columns is None else columns
        self.sep = sep
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.ngram_range = tuple(ngram_range)
        self.vocabulary = vocabulary
        super().__init__(
            [
                ("text_merger", TextMerger(columns=self.columns, sep=self.sep)),
                (
                    "onehot",
                    CountVectorizer(
                        lowercase=self.lowercase,
                        token_pattern=self.token_pattern,
                        stop_words=self.stop_words,
                        max_df=self.max_df,
                        min_df=self.min_df,
                        max_features=self.max_features,
                        ngram_range=self.ngram_range,
                        vocabulary=self.vocabulary,
                        # Presence/absence instead of counts: this is what
                        # makes the representation one-hot.
                        binary=True,
                        **kwargs,
                    ),
                ),
            ]
        )