Source code for pvops.text.nlp_utils

from sklearn.base import BaseEstimator
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import scipy
import numpy as np
from importlib import resources
from gensim.models import Word2Vec

from pvops.text import preprocess



[docs]
class Doc2VecModel(BaseEstimator):
    """Performs a gensim Doc2Vec transformation of the input documents to create
    embedded representations of the documents. See gensim's
    Doc2Vec model for information regarding the hyperparameters: https://radimrehurek.com/gensim/models/doc2vec.html.
    Inherits from `sklearn.base.BaseEstimator`. This class is built specifically to work inside a sklearn pipeline.
    Therefore, it uses the default ``transform``, ``fit``, ``fit_transform`` method structure.
    """

    def __init__(
        self,
        vector_size=100,
        dm_mean=None,
        dm=1,
        dbow_words=0,
        dm_concat=0,
        dm_tag_count=1,
        dv=None,
        dv_mapfile=None,
        comment=None,
        trim_rule=None,
        callbacks=(),
        window=5,
        epochs=10,
    ):
        """Store the Doc2Vec hyperparameters; no model is built here.

        Every argument is kept unchanged on the instance under an attribute
        of the same name. Refer to gensim's Doc2Vec documentation for the
        meaning of each hyperparameter.
        """
        # Placeholder for the underlying gensim model; ``fit`` assigns it.
        self.d2v_model = None

        # Embedding size and training schedule.
        self.vector_size = vector_size
        self.window = window
        self.epochs = epochs

        # Training-algorithm switches.
        self.dm = dm
        self.dm_mean = dm_mean
        self.dm_concat = dm_concat
        self.dm_tag_count = dm_tag_count
        self.dbow_words = dbow_words

        # Document-vector storage options.
        self.dv = dv
        self.dv_mapfile = dv_mapfile

        # Miscellaneous options.
        self.comment = comment
        self.trim_rule = trim_rule
        self.callbacks = callbacks


[docs]
    def fit(self, raw_documents, y=None):
        """
        Fits the Doc2Vec model.

        Each document is lowercased, tokenized with
        ``preprocess.regex_tokenize`` and tagged with its position in
        ``raw_documents`` (as a string) before the vocabulary is built and
        the model is trained.

        Parameters
        ----------
        raw_documents : list
            Input documents.
        y : None
            Placeholder; not utilized.

        Returns
        -------
        Doc2VecModel object
        """
        # A fresh model is created on every call, so refitting starts over.
        self.d2v_model = Doc2Vec(
            vector_size=self.vector_size,
            dm_mean=self.dm_mean,
            dm=self.dm,
            dbow_words=self.dbow_words,
            dm_concat=self.dm_concat,
            dm_tag_count=self.dm_tag_count,
            dv=self.dv,
            dv_mapfile=self.dv_mapfile,
            comment=self.comment,
            trim_rule=self.trim_rule,
            window=self.window,
            epochs=self.epochs,
        )
        # Tag each document with its index so it has a unique identifier.
        tagged_documents = [
            TaggedDocument(words=preprocess.regex_tokenize(_d.lower()), tags=[str(i)])
            for i, _d in enumerate(raw_documents)
        ]
        # Build vocabulary
        self.d2v_model.build_vocab(tagged_documents)
        # Forward the user-supplied callbacks; otherwise the ``callbacks``
        # hyperparameter would be stored but never handed to gensim.
        callbacks = () if self.callbacks is None else self.callbacks
        # Train model
        self.d2v_model.train(
            tagged_documents,
            total_examples=len(tagged_documents),
            epochs=self.d2v_model.epochs,
            callbacks=callbacks,
        )
        return self



[docs]
    def transform(self, raw_documents):
        """
        Transforms the documents into Doc2Vec vectors.

        Documents are lowercased before tokenization, matching the
        preprocessing applied in ``fit``; otherwise tokens containing
        uppercase characters would not match the lowercase training
        vocabulary.

        Parameters
        ----------
        raw_documents : list
            Input documents.

        Returns
        -------
        list
            Embeddings of the input documents, one inferred vector per
            document and in the same order.
        """
        return [
            self.d2v_model.infer_vector(preprocess.regex_tokenize(doc.lower()))
            for doc in raw_documents
        ]



[docs]
    def fit_transform(self, raw_documents, y=None):
        """
        Fit the model on the documents, then embed those same documents.

        Parameters
        ----------
        raw_documents : list
            Input documents.
        y : None
            Placeholder; not utilized.

        Returns
        -------
        list
            Embeddings of the input documents.
        """
        # ``fit`` hands back this instance, so the two steps can be chained.
        fitted = self.fit(raw_documents)
        return fitted.transform(raw_documents)





[docs]
class DataDensifier(BaseEstimator):
    """A data structure transformer which converts sparse data to dense data.
    This process is usually incorporated in this library when doing unsupervised machine learning.
    This class is built specifically to work inside a sklearn pipeline.
    Therefore, it uses the default ``transform``, ``fit``, ``fit_transform`` method structure.
    """


[docs]
    def transform(self, X, y=None):
        """Convert sparse input to a dense array; copy anything else.

        Parameters
        ----------
        X : array
            Input data of numerical values. For this package, these values could
            represent embedded representations of documents.
        y : None
            Placeholder; not utilized.

        Returns
        -------
        dense array
            ``X.toarray()`` when ``X`` is a scipy sparse container,
            otherwise ``X.copy()``.
        """
        if not scipy.sparse.issparse(X):
            # Not sparse: hand back a copy rather than the caller's object.
            return X.copy()
        return X.toarray()



[docs]
    def fit(self, X, y=None):
        """No-op fit provided so the class matches the sklearn method structure.

        Nothing is learned from the data; the arguments are ignored.

        Parameters
        ----------
        X : array
            Input data; not utilized.
        y : None
            Placeholder; not utilized.

        Returns
        -------
        DataDensifier object
            This same instance.
        """
        return self



[docs]
    def fit_transform(self, X, y=None):
        """Delegate to ``DataDensifier.transform()``; there is nothing to fit.

        Parameters
        ----------
        X : array
            Input data
        y : None
            Placeholder; not utilized.

        Returns
        -------
        dense array
            Whatever ``transform`` returns for ``X``.
        """
        dense = self.transform(X, y)
        return dense





[docs]
def create_stopwords(lst_add_words=None, lst_keep_words=None):
    """Concatenate a list of stopwords using both words grabbed from nltk and user-specified words.
    The nltk stopwords are those that were current at the release of pvOps version 0.5.0 on
    February 19th, 2025. See below for more on nltk.

    Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O'Reilly Media Inc.

    https://www.nltk.org/

    Parameters
    ----------
    lst_add_words : list, optional
        List of words (e.g., "road" or "street") to add to stopwords list. If these words are
        already included in the nltk list, a duplicate will not be added.
        ``None`` (the default) adds nothing.
    lst_keep_words : list, optional
        List of words (e.g., "before" or "until") to remove from stopwords list. This is usually
        used to modify default stop words that might be of interest to PV.
        ``None`` (the default) removes nothing. Removal is applied after
        addition, so a word present in both lists is excluded.

    Returns
    -------
    list
        List of alphabetized stopwords
    """
    # The default stopwords ship with the package as a whitespace-separated
    # text file; read it with an explicit encoding so the result does not
    # depend on the platform's default.
    default_text = (
        resources.files("pvops.text")
        .joinpath("stopwords.txt")
        .read_text(encoding="utf-8")
    )

    stopwords = set(default_text.split())
    if lst_add_words is not None:
        stopwords.update(lst_add_words)
    if lst_keep_words is not None:
        stopwords.difference_update(lst_keep_words)
    return sorted(stopwords)




[docs]
def summarize_text_data(om_df, colname):
    """Display information about a set of documents located in a dataframe, including
    the number of samples, average number of words, vocabulary size, and number of words
    in total.

    Documents are split on whitespace. Missing entries (``None``, NaN and
    other pandas missing values) and entries whose string form is ``"nan"``
    are counted as invalid and excluded from the word statistics.

    Parameters
    ----------
    om_df : DataFrame
        A pandas dataframe containing O&M data, which contains at least the colname of interest
    colname : str
        Column name of column with text

    Returns
    -------
    dict
        dictionary containing printed summary data, with keys ``n_samples``,
        ``n_nan_docs``, ``n_words_doc_average``, ``n_unique_words`` and
        ``n_total_words``. When there are no valid documents the average is
        reported as ``0.0``.
    """
    documents = om_df[colname]

    # Drop pandas-recognized missing values first, then the literal "nan"
    # text that the original string-based check also treated as invalid.
    valid_docs = [
        doc for doc in documents[documents.notna()] if str(doc) != "nan"
    ]

    tokenized = [doc.split() for doc in valid_docs]
    doc_lengths = np.array([len(tokens) for tokens in tokenized], dtype=int)

    # The vocabulary is just the set of distinct tokens; no embedding model
    # needs to be trained to count it.
    unique_words = {token for tokens in tokenized for token in tokens}

    n_samples = len(om_df)
    info = {
        "n_samples": n_samples,
        "n_nan_docs": n_samples - len(valid_docs),
        # Guard the mean so an empty corpus yields 0.0 instead of NaN.
        "n_words_doc_average": doc_lengths.mean() if doc_lengths.size else 0.0,
        "n_unique_words": len(unique_words),
        "n_total_words": doc_lengths.sum(),
    }

    # Bold title.
    print("\033[1m" + "DETAILS" + "\033[0m")

    # Display information.
    print(f'  {info["n_samples"]} samples')
    print(f'  {info["n_nan_docs"]} invalid documents')
    print("  {:.2f} words per sample on average".format(
        info["n_words_doc_average"]))
    print(f'  Number of unique words {info["n_unique_words"]}')
    print("  {:.2f} total words".format(info["n_total_words"]))

    return info