Source code for asreview.models.feature_extraction.doc2vec

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["Doc2Vec"]

import numpy as np

try:
    from gensim.models.doc2vec import Doc2Vec as GenSimDoc2Vec
    from gensim.models.doc2vec import TaggedDocument
    from gensim.utils import simple_preprocess

    GENSIM_AVAILABLE = True
except ImportError:
    GENSIM_AVAILABLE = False

from asreview.models.feature_extraction.base import BaseFeatureExtraction


def _check_gensim():
    if not GENSIM_AVAILABLE:
        raise ImportError("Install gensim package to use" " Doc2Vec.")


def _train_model(corpus, *args, **kwargs):
    model = GenSimDoc2Vec(*args, **kwargs)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model


def _transform_text(model, corpus):
    X = []
    for record_id in range(len(corpus)):
        doc_vec = model.infer_vector(corpus[record_id].words)
        X.append(doc_vec)
    return np.array(X)



[docs]
class Doc2Vec(BaseFeatureExtraction):
    """Doc2Vec feature extraction technique (``doc2vec``).

    Feature extraction technique provided by the `gensim
    <https://radimrehurek.com/gensim/>`__ package. It takes relatively long to
    create a feature matrix with this method. However, this only has to be
    done once per simulation/review. The upside of this method is the
    dimension- reduction that generally takes place, which makes the modelling
    quicker.

    .. note::

        This feature extraction technique requires ``gensim`` to be installed.
        Use ``pip install asreview[gensim]`` or install all optional ASReview
        dependencies with ``pip install asreview[all]``

    Arguments
    ---------
    vector_size: int
        Output size of the vector.
    epochs: int
        Number of epochs to train the doc2vec model.
    min_count: int
        Minimum number of occurences for a word in the corpus for it to
        be included in the model.
    n_jobs: int
        Number of threads to train the model with.
    window: int
        Maximum distance over which word vectors influence each other.
    dm_concat: int
        Whether to concatenate word vectors or not.
        See paper for more detail.
    dm: int
        Model to use.
        0: Use distribute bag of words (DBOW).
        1: Use distributed memory (DM).
        2: Use both of the above with half the vector size and concatenate
        them.
    dbow_words: int
        Whether to train the word vectors using the skipgram method.
    """

    name = "doc2vec"
    label = "Doc2Vec"

    def __init__(
        self,
        *args,
        vector_size=40,
        epochs=33,
        min_count=1,
        n_jobs=1,
        window=7,
        dm_concat=0,
        dm=2,
        dbow_words=0,
        **kwargs,
    ):
        """Initialize the doc2vec model."""
        super().__init__(*args, **kwargs)
        self.vector_size = int(vector_size)
        self.epochs = int(epochs)
        self.min_count = int(min_count)
        self.n_jobs = int(n_jobs)
        self.window = int(window)
        self.dm_concat = int(dm_concat)
        self.dm = int(dm)
        self.dbow_words = int(dbow_words)
        self._model = None
        self._model_dm = None
        self._model_dbow = None


[docs]
    def fit(self, texts):
        # check is gensim is available
        _check_gensim()

        model_param = {
            "vector_size": self.vector_size,
            "epochs": self.epochs,
            "min_count": self.min_count,
            "workers": self.n_jobs,
            "window": self.window,
            "dm_concat": self.dm_concat,
            "dbow_words": self.dbow_words,
        }

        corpus = [
            TaggedDocument(simple_preprocess(text), [i]) for i, text in enumerate(texts)
        ]

        # If self.dm is 2, train both models and concatenate the feature
        # vectors later. Resulting vector size should be the same.
        if self.dm == 2:
            model_param["vector_size"] = int(model_param["vector_size"] / 2)
            self.model_dm = _train_model(corpus, **model_param, dm=1)
            self.model_dbow = _train_model(corpus, **model_param, dm=0)
        else:
            self.model = _train_model(corpus, **model_param, dm=self.dm)



[docs]
    def transform(self, texts):
        # check is gensim is available
        _check_gensim()

        corpus = [
            TaggedDocument(simple_preprocess(text), [i]) for i, text in enumerate(texts)
        ]

        if self.dm == 2:
            X_dm = _transform_text(self.model_dm, corpus)
            X_dbow = _transform_text(self.model_dbow, corpus)
            X = np.concatenate((X_dm, X_dbow), axis=1)
        else:
            X = _transform_text(self.model, corpus)
        return X