# Source code for asreview.models.feature_extraction.doc2vec

# Copyright 2019-2022 The ASReview Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["Doc2Vec"]

import numpy as np

    from gensim.models.doc2vec import Doc2Vec as GenSimDoc2Vec
    from gensim.models.doc2vec import TaggedDocument
    from gensim.utils import simple_preprocess

except ImportError:

from asreview.models.feature_extraction.base import BaseFeatureExtraction

def _check_gensim():
        raise ImportError("Install gensim package to use" " Doc2Vec.")

def _train_model(corpus, *args, **kwargs):
    """Build a gensim Doc2Vec model and train it on *corpus*.

    Positional and keyword arguments are forwarded verbatim to the
    ``GenSimDoc2Vec`` constructor; the trained model is returned.
    """
    d2v = GenSimDoc2Vec(*args, **kwargs)
    d2v.train(
        corpus,
        total_examples=d2v.corpus_count,
        epochs=d2v.epochs,
    )
    return d2v

def _transform_text(model, corpus):
    X = []
    for record_id in range(len(corpus)):
        doc_vec = model.infer_vector(corpus[record_id].words)
    return np.array(X)

class Doc2Vec(BaseFeatureExtraction):
    """Doc2Vec feature extraction technique (``doc2vec``).

    Feature extraction technique provided by the `gensim
    <https://radimrehurek.com/gensim/>`__ package. It takes relatively long to
    create a feature matrix with this method. However, this only has to be
    done once per simulation/review. The upside of this method is the
    dimension-reduction that generally takes place, which makes the modelling
    quicker.

    .. note::

        This feature extraction technique requires ``gensim`` to be
        installed. Use ``pip install asreview[gensim]`` or install all
        optional ASReview dependencies with ``pip install asreview[all]``

    Arguments
    ---------
    vector_size: int
        Output size of the vector.
    epochs: int
        Number of epochs to train the doc2vec model.
    min_count: int
        Minimum number of occurences for a word in the corpus for it to
        be included in the model.
    n_jobs: int
        Number of threads to train the model with.
    window: int
        Maximum distance over which word vectors influence each other.
    dm_concat: int
        Whether to concatenate word vectors or not.
        See paper for more detail.
    dm: int
        Model to use.
        0: Use distribute bag of words (DBOW).
        1: Use distributed memory (DM).
        2: Use both of the above with half the vector size and concatenate
        them.
    dbow_words: int
        Whether to train the word vectors using the skipgram method.
    """

    name = "doc2vec"
    label = "Doc2Vec"

    def __init__(
        self,
        *args,
        vector_size=40,
        epochs=33,
        min_count=1,
        n_jobs=1,
        window=7,
        dm_concat=0,
        dm=2,
        dbow_words=0,
        **kwargs,
    ):
        """Initialize the doc2vec model."""
        super().__init__(*args, **kwargs)
        self.vector_size = int(vector_size)
        self.epochs = int(epochs)
        self.min_count = int(min_count)
        self.n_jobs = int(n_jobs)
        self.window = int(window)
        self.dm_concat = int(dm_concat)
        self.dm = int(dm)
        self.dbow_words = int(dbow_words)
        self._model = None
        self._model_dm = None
        self._model_dbow = None

    def fit(self, texts):
        """Train the doc2vec model(s) on an iterable of raw text strings."""
        # check if gensim is available
        _check_gensim()

        model_param = {
            "vector_size": self.vector_size,
            "epochs": self.epochs,
            "min_count": self.min_count,
            "workers": self.n_jobs,
            "window": self.window,
            "dm_concat": self.dm_concat,
            "dbow_words": self.dbow_words,
        }

        corpus = [
            TaggedDocument(simple_preprocess(text), [i])
            for i, text in enumerate(texts)
        ]

        # If self.dm is 2, train both models and concatenate the feature
        # vectors later. Resulting vector size should be the same.
        if self.dm == 2:
            model_param["vector_size"] = int(model_param["vector_size"] / 2)
            self.model_dm = _train_model(corpus, **model_param, dm=1)
            self.model_dbow = _train_model(corpus, **model_param, dm=0)
        else:
            self.model = _train_model(corpus, **model_param, dm=self.dm)

    def transform(self, texts):
        """Infer a feature matrix for *texts* using the fitted model(s).

        Returns
        -------
        numpy.ndarray
            One row per text; columns are the (possibly concatenated)
            doc2vec feature vectors.
        """
        # check if gensim is available
        _check_gensim()

        corpus = [
            TaggedDocument(simple_preprocess(text), [i])
            for i, text in enumerate(texts)
        ]

        if self.dm == 2:
            # Concatenate the DM and DBOW half-size vectors trained in fit().
            X_dm = _transform_text(self.model_dm, corpus)
            X_dbow = _transform_text(self.model_dbow, corpus)
            X = np.concatenate((X_dm, X_dbow), axis=1)
        else:
            X = _transform_text(self.model, corpus)
        return X