Text Module Tutorial 3: Machine Learning Classification

This tutorial demonstrates how to perform machine learning analysis of text data.

[1]:
# basic manipulation and plotting
import pandas as pd
import matplotlib.pyplot as plt

# machine learning imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

# pvops functionality
from pvops.text import utils as text_utils
from pvops.text import nlp_utils as text_nlp_utils
from pvops.text import visualize as text_visualize
from pvops.text import preprocess as text_preprocess
from pvops.text import defaults as text_defaults
from pvops.text import classify as text_classify
WARNING:tensorflow:From c:\Users\agmoore\AppData\Local\anaconda3\envs\pvops\Lib\site-packages\keras\src\backend\common\global_state.py:82: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.

Preprocessing

See text module tutorial 1 for more information on preprocessing steps. Here, we quickly run through them.

[2]:
# load the example ticket dataset
df = pd.read_csv('example_data/example_ML_ticket_data.csv')

# remap the asset labels using the mapping table stored alongside the data
remapping_df = pd.read_csv('example_data/remappings_asset.csv')
remapping_col_dict = {
    'attribute_col': 'Asset',
    'remapping_col_from': 'in',
    'remapping_col_to': 'out_',
}
df = text_utils.remap_attributes(
    df,
    remapping_df,
    remapping_col_dict,
    allow_missing_mappings=True,
)

# keep only the rows whose asset label appears more than once in the dataset
label_counts = df.value_counts('Asset')
labels_with_multiple_occurrences = label_counts[label_counts > 1].index
has_repeated_label = df['Asset'].isin(labels_with_multiple_occurrences)
df = df.loc[has_repeated_label]

# build a custom set of stopwords by passing a few extra words
extra_stopwords = ['dtype', 'say', 'length', 'object', 'u', 'ha', 'wa']
stopwords = text_nlp_utils.create_stopwords(lst_add_words=extra_stopwords)

# run the preprocessing function to clean up the text data and prepare it for ML;
# col_dict names the input columns and the columns the results are saved to
col_dict = {
    'data': 'CompletionDesc',
    'eventstart': 'Date_EventStart',
    'save_data_column': 'CleanDesc',
    'save_date_column': 'ExtractedDates',
}
df = text_preprocess.preprocessor(df, stopwords, col_dict)