# Source code for pvops.text.classify

# Classifiers
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from scipy.sparse import issparse

import numpy as np
import pandas as pd
import copy

from pvops.text.preprocess import get_keywords_of_interest


# [docs]
def classification_deployer(
    X,
    y,
    n_splits,
    classifiers,
    search_space,
    pipeline_steps,
    scoring,
    greater_is_better=True,
    verbose=3,
):
    """
    Run a cross-validated hyperparameter grid search for every classifier
    and summarize the results.

    Each classifier in ``classifiers`` is placed in the ``'clf'`` slot of the
    pipeline and tuned with ``GridSearchCV`` over its entry in
    ``search_space``. One summary row is produced per classifier and
    hyperparameter configuration, and the best refit pipeline found across
    all classifiers is returned alongside the summary.

    To see an example of this method's application, see
    ``tutorials/tutorial_text_preprocess_classify.ipynb``.

    Parameters
    ----------
    X : list of str
        List of documents (str). The documents will be passed through the
        pipeline_steps, where they will be transformed into vectors.
        If a scipy sparse matrix is passed instead, it is converted to a
        dense array before fitting.
    y : list
        List of labels corresponding with the documents in X
    n_splits : int
        Integer defining the number of splits in the cross validation split
        during training
    classifiers : dict
        Dictionary with key as classifier identifier (str) and value as
        classifier instance following sklearn's base model convention:
        sklearn_docs_. Must not be empty.

        .. _sklearn_docs: https://scikit-learn.org/stable/modules/generated/sklearn.base.is_classifier.html

        .. code-block:: python

            classifiers = {
                'LinearSVC' : LinearSVC(),
                'AdaBoostClassifier' : AdaBoostClassifier(),
                'RidgeClassifier' : RidgeClassifier()
            }

        See ``supervised_classifier_defs.py`` or
        ``unsupervised_classifier_defs.py`` for this package's defaults.
    search_space : dict
        Dictionary with classifier identifiers, as used in ``classifiers``,
        mapped to its hyperparameters. Every key of ``classifiers`` must
        be present.

        .. code-block:: python

            search_space = {
                'LinearSVC' : {
                'clf__C' : [1e-2,1e-1],
                'clf__max_iter':[800,1000],
                },
                'AdaBoostClassifier' : {
                'clf__n_estimators' : [50,100],
                'clf__learning_rate':[1.,0.9,0.8],
                'clf__algorithm' : ['SAMME.R']
                },
                'RidgeClassifier' : {
                'clf__alpha' : [0.,1e-3,1.],
                'clf__normalize' : [False,True]
                }
            }

        See ``supervised_classifier_defs.py`` or
        ``unsupervised_classifier_defs.py`` for this package's defaults.
    pipeline_steps : list of tuples
        Define embedding and machine learning pipeline. The last tuple must
        be ``('clf', None)`` so that the output of the pipeline is a
        prediction.
        For supervised classifiers using a TFIDF embedding, one could specify

        .. code-block:: python

            pipeline_steps = [('tfidf', TfidfVectorizer()),
                              ('clf', None)]

        For unsupervised clusterers using a TFIDF embedding, one could specify

        .. code-block:: python

            pipeline_steps = [('tfidf', TfidfVectorizer()),
                              ('to_dense', DataDensifier.DataDensifier()),
                              ('clf', None)]

        A densifier is required from some clusters, which fail if sparse
        data is passed.
    scoring : sklearn callable scorer
        Any statistic that summarizes predictions relative to observations,
        created using ``sklearn.metrics.make_scorer``. Example scorers
        include f1_score, accuracy, etc.
        For supervised classifiers, one could specify

        .. code-block:: python

            scoring = make_scorer(f1_score, average = 'weighted', pos_label = None)

        For unsupervised classifiers, one could specify

        .. code-block:: python

            scoring = make_scorer(homogeneity_score)

    greater_is_better : bool
        Whether the scoring parameter is better when greater (i.e. accuracy)
        or not. This flag only controls how the best scores of different
        classifiers are compared with each other; within one classifier's
        grid, ``GridSearchCV`` always selects the configuration with the
        highest mean test score.

    verbose : int
        Control the specificity of the prints. If greater than 1, a print
        out is shown when a new "best classifier" is found while iterating.
        Additionally, the verbosity during the grid search follows sklearn's
        definitions. The frequency of the messages increase with the
        verbosity level.

    Returns
    -------
    DataFrame
        Summarization of results from all of the classifiers. One row per
        (classifier, hyperparameter configuration) holding the
        hyperparameter values, ``mean_fit_time``, ``estimator`` and the
        ``min_score``, ``max_score``, ``mean_score`` and ``std_score`` of
        the per-split test scores.
    sklearn.pipeline.Pipeline
        The ``best_estimator_`` of the grid search whose best score was the
        best across all classifiers.

    Raises
    ------
    ValueError
        If ``classifiers`` is empty, or if no grid search produced a score
        that could be compared (e.g. every score was NaN).
    IndexError
        If ``pipeline_steps`` has no step named ``'clf'``.
    KeyError
        If a classifier identifier is missing from ``search_space``.
    """
    if not classifiers:
        raise ValueError(
            "classifiers must contain at least one estimator to evaluate."
        )

    if issparse(X):
        print("Converting passed data to dense array...")
        X = X.toarray()

    # Position of the placeholder 'clf' step that each classifier is
    # swapped into.
    clf_positions = [
        i for i, step in enumerate(pipeline_steps) if step[0] == "clf"
    ]
    if not clf_positions:
        raise IndexError(
            "pipeline_steps must contain a ('clf', None) placeholder step."
        )
    idx_clf_pipeline = clf_positions[0]

    rows = []
    best_gs_instance = None
    # Start from the worst possible value so that the first valid score
    # always wins, including negative scores such as sklearn's negated
    # loss scorers.
    best_model_score = -np.inf if greater_is_better else np.inf

    for key, clas in classifiers.items():
        space = search_space[key]

        # Copy the steps so the caller's pipeline definition is untouched.
        iter_pipeline_steps = copy.deepcopy(pipeline_steps)
        iter_pipeline_steps[idx_clf_pipeline] = ("clf", clas)
        pipe = Pipeline(iter_pipeline_steps)

        gs_clf = GridSearchCV(
            pipe,
            space,
            scoring=scoring,
            cv=n_splits,
            n_jobs=-1,
            return_train_score=True,
            verbose=verbose,
        )
        gs_clf.fit(X, y)

        cv_results = gs_clf.cv_results_
        # Rows are hyperparameter configurations, columns are CV splits.
        split_scores = np.column_stack(
            [cv_results[f"split{i}_test_score"] for i in range(n_splits)]
        )

        for param, score, fit_time in zip(
            cv_results["params"], split_scores, cv_results["mean_fit_time"]
        ):
            # Build a fresh dict rather than writing into ``param``: those
            # dicts belong to the fitted search (``best_params_`` aliases
            # one of them).
            summary = {
                **param,
                "mean_fit_time": fit_time,
                "estimator": key,
                "min_score": score.min(),
                "max_score": score.max(),
                "mean_score": np.mean(score),
                "std_score": np.std(score),
            }
            rows.append(pd.Series(summary))

        if greater_is_better:
            is_new_best = gs_clf.best_score_ > best_model_score
        else:
            is_new_best = gs_clf.best_score_ < best_model_score

        if is_new_best:
            if verbose > 1:
                print(
                    f"Better score ({gs_clf.best_score_:.3f}) "
                    f"found on classifier: {key}"
                )
            best_model_score = gs_clf.best_score_
            best_gs_instance = gs_clf

    if best_gs_instance is None:
        # Only reachable when every best score failed the comparison,
        # which happens for NaN scores.
        raise ValueError(
            "No classifier produced a comparable score; "
            "check the scorer and the cross-validation fits."
        )

    return pd.concat(rows, axis=1).T, best_gs_instance.best_estimator_



# [docs]
def get_attributes_from_keywords(om_df, col_dict, reference_df, reference_col_dict):
    """
    Label each text entry with the keywords of interest it contains.

    ``get_keywords_of_interest`` is applied to every entry of the text
    column and the result is stored in a new label column. Entries that
    matched several keywords are then expanded so that each keyword sits
    in its own row, and a short report on those entries is printed.

    Parameters
    ----------
    om_df : pd.DataFrame
        Dataframe to search for keywords of interest. It must contain the
        column named by ``col_dict['data']``. The label column is written
        onto this dataframe in place before the expanded copy is built.
    col_dict : dict of {str: str}
        A dictionary that contains the column names needed:

        - data : string, name of the column which stores the tokenized
          text logs
        - predicted_col : string, name of the column that will be created
          to hold the keyword search labels
    reference_df : DataFrame
        Holds the columns that define the reference dictionary used to
        search for keywords of interest.
        Note: This function can currently only handle single words, no
        n-gram functionality.
    reference_col_dict : dict of {str: str}
        A dictionary that contains the column names describing how the
        referencing is done:

        - reference_col_from : string, name of the column in reference_df
          with the possible input reference values.
          Example: pd.Series(['inverter', 'invert', 'inv'])
        - reference_col_to : string, name of the column in reference_df
          with the output reference values of interest.
          Example: pd.Series(['inverter', 'inverter', 'inverter'])

    Returns
    -------
    om_df: pd.DataFrame
        The dataframe with the label column added and exploded, so each
        found keyword is its own row. Rows are duplicated when more than
        one keyword of interest was found in an entry.
    """
    text_col = col_dict['data']
    label_col = col_dict['predicted_col']

    om_df[label_col] = om_df[text_col].apply(
        get_keywords_of_interest,
        reference_df=reference_df,
        reference_col_dict=reference_col_dict,
    )

    # Record the multi-keyword entries before exploding, while the index
    # still identifies each original log once.
    has_several_keywords = om_df[label_col].str.len() > 1
    multi_hits = om_df[has_several_keywords]

    # Some logs mention several equipment issues; give each its own row.
    exploded = om_df.explode(label_col)

    print(
        f'{len(multi_hits)} entries had multiple keywords of interest. '
        f'Reference: {multi_hits.index} in original dataframe.'
    )

    return exploded