Source code for pvops.text.preprocess

import re

import numpy as np
import datefinder
import traceback
from datetime import datetime, timedelta


def regex_tokenize(doc, pattern=r'(?<=\w)([^\w\s]+)(?=\s)|(?<=\s)([^\w\s]+)(?=\w)'):
    """
    Tokenize a document into words and punctuation using a regular
    expression pattern.

    Every match of ``pattern`` is surrounded with spaces, runs of whitespace
    are collapsed, and the text is split along spaces. The default pattern
    detaches leading and trailing punctuation from words while leaving
    punctuation interior to words or numbers (e.g. ``3.5``) untouched.

    Parameters
    ----------
    doc : str
        The input document (text) to be tokenized.
    pattern : str, optional
        The regular expression pattern to be used. The substitution
        re-inserts capture groups 1 and 2, so a custom pattern must define
        at least two capture groups.

    Returns
    -------
    list of str
        The tokens extracted from the input document, including both words
        and punctuation. An empty or whitespace-only document yields an
        empty list.
    """
    # Pad with spaces so punctuation at the very start or end of the text
    # has the whitespace neighbor that the pattern's lookarounds require.
    doc = ' ' + doc + ' '

    # Buffer anything matching the pattern with spaces on either side.
    doc = re.sub(pattern, r' \1\2 ', doc)

    # Collapse contiguous whitespace and drop the padding again.
    doc = re.sub(r'\s+', ' ', doc).strip()

    # ''.split(' ') would produce [''], i.e. one bogus empty token.
    return doc.split(' ') if doc else []
def _filter_dates_near_event(nlp_dates, eventstart):
    """Keep only the extracted dates lying within about a year of ``eventstart``.

    ``nlp_dates`` is returned unchanged when it is not a non-empty list (for
    example the NaN stored for a row whose date extraction failed) or when
    ``eventstart`` cannot be parsed as ``%Y-%m-%d %H:%M:%S``.
    """
    if not isinstance(nlp_dates, list) or not nlp_dates:
        return nlp_dates
    try:
        event_time = datetime.strptime(eventstart, "%Y-%m-%d %H:%M:%S")
        # 3.154e7 seconds is roughly one year.
        return [
            dt
            for dt in nlp_dates
            if abs((event_time - dt).total_seconds()) < 3.154e7
        ]
    except Exception:
        # Best effort: NaN / unparsable event start or incomparable
        # datetimes leave the extracted dates unfiltered.
        return nlp_dates


def preprocessor(
    om_df, lst_stopwords, col_dict, print_info=False, extract_dates_only=False
):
    """
    Preprocessing function which processes the raw text data into processed
    text data and extracts dates

    Parameters
    ----------
    om_df : DataFrame
        A pandas dataframe containing O&M data, which contains at least the
        columns within col_dict.
    lst_stopwords : list
        List of stop words which will be filtered in final preprocessing step
    col_dict : dict of {str: str}
        A dictionary that contains the column names relevant for the
        get_dates fn

        - data : string, should be assigned to associated column which
          stores the text logs
        - eventstart : string, should be assigned to associated column which
          stores the log submission datetime
        - save_data_column : string, should be assigned to associated column
          where the processed text should be stored
        - save_date_column : string, should be assigned to associated column
          where the extracted dates from the text should be stored
    print_info : bool
        Flag indicating whether to print information about the preprocessing
        progress
    extract_dates_only : bool
        If True, return after extracting dates in each ticket
        If False, return with preprocessed text and extracted dates

    Returns
    -------
    df : DataFrame
        Copy of ``om_df`` (with a fresh positional index) containing the
        original columns as well as the processed data, located in columns
        defined by the inputs. Rows whose date extraction raised hold NaN in
        the date column; rows whose text cleaning raised hold an empty string
        in the text column.
    """
    DATA_COLUMN = col_dict["data"]
    EVENTSTART_COLUMN = col_dict["eventstart"]
    SAVE_DATA_COLUMN = col_dict["save_data_column"]
    SAVE_DATE_COLUMN = col_dict["save_date_column"]

    # Work on a copy with a 0..n-1 index so that `ind` from iterrows() is a
    # valid positional index for get_dates().
    df = om_df.copy()
    df.reset_index(drop=True, inplace=True)

    dates_extracted = []
    clean_corpus = []

    n_total = len(df.index)
    n_nans = 0
    n_fails_date = 0
    n_fails_prep = 0
    tally = 0  # never updated; kept so the summary line format is unchanged

    for ind, row in df.iterrows():
        document = row[DATA_COLUMN]

        # NaN never compares equal to itself, so `document == np.nan` cannot
        # detect missing text; test explicitly instead.
        if isinstance(document, float) and np.isnan(document):
            n_nans += 1

        try:
            document = str(document).lower()
            document = text_remove_nondate_nums(document, PRINT_INFO=print_info)
            dts = get_dates(document, df, ind, col_dict, print_info)
            if print_info:
                print("Dates: ", dts)
        except Exception as e:
            print(e)
            dts = np.nan
            n_fails_date += 1
        dates_extracted.append(dts)

        if not extract_dates_only:
            try:
                clean_corpus.append(
                    text_remove_numbers_stopwords(document, lst_stopwords)
                )
            except Exception:
                print(traceback.format_exc())
                clean_corpus.append("")
                n_fails_prep += 1

    if print_info:
        print(
            f"len clean corpus: {len(clean_corpus)}, len deduced dates: {len(dates_extracted)}"
        )
        print(
            f"num_total {n_total}, num_nans {n_nans}, num_fails_date {n_fails_date}, num_fails_prep {n_fails_prep}, tally {tally}"
        )

    # Drop extracted dates that lie more than about a year away from the
    # ticket's event start. Failed rows (NaN) pass through untouched rather
    # than crashing on len(NaN).
    filtered_dates = [
        _filter_dates_near_event(nlp_dates, eventstart)
        for nlp_dates, eventstart in zip(dates_extracted, df[EVENTSTART_COLUMN])
    ]

    df[SAVE_DATE_COLUMN] = filtered_dates
    if not extract_dates_only:
        df[SAVE_DATA_COLUMN] = clean_corpus

    return df
def _is_missing_eventstart(value):
    """Return True when ``value`` is the float NaN marking a missing event start."""
    return isinstance(value, float) and np.isnan(value)


def _nearest_base_date(om_df, ind, eventstart_column, print_info):
    """Search outward from row ``ind`` for the closest usable event start.

    At each distance the row behind is tried before the row ahead. Rows whose
    event start is NaN, or contains no date that datefinder can parse, are
    skipped. Returns the first date found, or None once both ends of the
    dataframe have been passed.
    """
    n_rows = len(om_df.index)
    offset = 1
    while ind - offset >= 0 or ind + offset < n_rows:
        for candidate in (ind - offset, ind + offset):
            if not 0 <= candidate < n_rows:
                continue
            if print_info:
                print("checking index: ", candidate)
            value = om_df.iloc[candidate][eventstart_column]
            if _is_missing_eventstart(value):
                continue
            found = list(datefinder.find_dates(value))
            if found:
                return found[0]
        offset += 1
    return None


def get_dates(
    document, om_df, ind, col_dict, print_info, infer_date_surrounding_rows=True
):
    """Extract dates from the input document.

    This method is utilized within ``preprocessor.py``. For an easy way to
    extract dates, utilize the preprocessor and set extract_dates_only = True.

    Parameters
    ----------
    document : str
        String representation of a document
    om_df : DataFrame
        A pandas dataframe containing O&M data, which contains at least the
        columns within col_dict.
    ind : integer
        Designates the (positional) row of the dataframe which is currently
        being observed. This is required because if the current row does not
        have a valid date in the `eventstart`, then an iterative search is
        conducted by first starting at the nearest rows.
    col_dict : dict of {str: str}
        A dictionary that contains the column names relevant for the
        get_dates fn

        - data : string, should be assigned to associated column which
          stores the text logs
        - eventstart : string, should be assigned to associated column which
          stores the log submission datetime
    print_info : bool
        Flag indicating whether to print information about the preprocessing
        progress
    infer_date_surrounding_rows : bool
        If True, utilizes iterative search in dataframe to infer the datetime
        from surrounding rows if the current row's date value is nan
        If False, does not utilize the base datetime. Consequentially,
        today's date is used to replace the missing parts of the datetime.
        Recommendation: set True if you frequently publish documents and your
        dataframe is ordered chronologically

    Returns
    -------
    list
        List of dates found in text. Only dates after 1970-01-01 and earlier
        than roughly 100 years from now are kept. Any error during extraction
        results in an empty list.
    """
    DATA_COLUMN = col_dict["data"]
    EVENTSTART_COLUMN = col_dict["eventstart"]

    # Cached for the diagnostic output below, so the except handler never
    # touches a name that may not have been bound yet.
    eventstart = None

    try:
        row = om_df.iloc[ind]
        eventstart = row[EVENTSTART_COLUMN]
        if print_info:
            print("Start time: ", eventstart)

        basedate = None
        if _is_missing_eventstart(eventstart):
            # Was given a NaN value as event start date, so look before and
            # after this row for a date -- but only when the caller asked for
            # it. (The original branch was inverted relative to the
            # documented meaning of `infer_date_surrounding_rows`.)
            if infer_date_surrounding_rows:
                if print_info:
                    print("found nan")
                basedate = _nearest_base_date(
                    om_df, ind, EVENTSTART_COLUMN, print_info
                )
        else:
            basedate = list(datefinder.find_dates(eventstart))[0]

        if basedate is None:
            matches = list(datefinder.find_dates(document))
        else:
            matches = list(datefinder.find_dates(document, base_date=basedate))

    except Exception as e:
        matches = []
        if print_info:
            print(traceback.format_exc())
            print("\n")
            print("date")
            print(eventstart)
            print("proc")
            print(document)
            print("raw")
            print(om_df.iloc[[ind]][DATA_COLUMN].tolist()[0])
            print(ind)
            print(e)
            print(traceback.format_exc())

    # Plausibility window, computed once instead of once per match.
    earliest = datetime.strptime("01/01/1970", "%m/%d/%Y")
    latest = datetime.now() + timedelta(days=365 * 100)

    valid_matches = []
    for mtch in matches:
        try:
            if (mtch > earliest) and (mtch < latest):
                valid_matches.append(mtch)
        except Exception as e:
            # e.g. a timezone-aware match cannot be compared with the naive
            # bounds; such matches are dropped.
            if print_info:
                print(e)

    return valid_matches
def text_remove_nondate_nums(document, PRINT_INFO=False):
    """Conduct initial text processing steps to prepare the text for date
    extractions.

    Function mostly uses regex-based text substitution to remove numerical
    structures within the text, which may be mistaken as a date by the date
    extractor. The substitutions run in two ordered passes: one on the raw
    text, and one after the text has been tokenized, stripped of
    single-character tokens and re-joined.

    Parameters
    ----------
    document : str
        String representation of a document
    PRINT_INFO : bool
        Flag indicating whether to print information about the preprocessing
        progress

    Returns
    -------
    string
        string of processed document
    """
    if PRINT_INFO:
        print()
        print()
        print("IN: ", document)

    # Remove URLs
    find_URL = r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""
    document = re.sub(find_URL, " ", document)

    # Each pattern is paired with its replacement so the two can never drift
    # out of alignment. The order of the entries matters.
    first_pass = [
        # Take out 'd%', 'd %'
        (r"\d+(\%|\s\%|\bpercent\b|\s\bpercent\b)", ""),
        # '#d', '# d'
        (r"(#\s|#)\d+", ""),
        # take out lists of numbers with no space
        (r"-?\d+(,\d+)+(\.\d*)?", ""),
        # numeric with 3 digits
        (r"\s\d{3}\s", " "),
        # [1-12]-4DIGIT allowed only
        (r"\b(0|00|1[3-9]|[2-9]\d)\b-\d{4}", " "),
        # e.g.: 10.1 and 10.2 with space before and after
        (r"\s+\d+\.\d+\s+", " "),
        # Take out numbers longer than 8 digits (8 because datetimes
        # 20190320 should stay)
        (r"\d{9,}", ""),
        # Take out IP numbers
        # NOTE(review): the final `[\d?]` consumes a single character only,
        # so trailing digits of the last octet remain -- confirm intent.
        (r"\d+[.]+\d+[.]\d+[.][\d?]", ""),
        # Take out single digit-hyphen trios e.g. 3-1-4 but leave 10-20-18
        # (possible date)
        (r"\d-\d-\d", ""),
        # Same for single digit-dot trios, e.g. 3.1.4
        (r"\d[.]\d[.]\d", ""),
        # Power readings: the number and unit collapse to ' kW'
        (r"\d+(\.\d*)?\s*[kK]?[wW]\s", " kW"),
        # take out e.g. webbox-10, unless the word is a month name
        (
            r"\b(?!([jJ]an(uary)?|[fF]eb(r)?(uary)?|[mM]ar(ch)?|[aA]pr(il)?|[mM]ay|[jJ]un(e)?|[jJ]ul(y)?|[aA]ug(ust)?|[sS]ep(t)?(ember)?|[oO]ct(ober)?|[nN]ov(ember)?|[dD]ec(ember)?\b))[a-zA-Z]+-\d+",
            " ",
        ),
        # take out email addresses
        (r"[\w\.-]+@[\w\.-]+\.\w+", " "),
        # take out phone numbers
        (
            r"(\s\d{3}[-\.\s]?\d{3}[-\.\s]?\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]?\d{4}|\s\d{3}[-\.\s]\d{4})",
            " ",
        ),
        # e.g. neff - cb 2.1b.16 - forced outage ; unknown. at 1645 26-jun
        # cb 2.1b.16 offline.. 0000 - unknown
        (r"\s\d+[.]\d+\D+[.]\d+\s", " "),
        (r"\s\D[.]\d[,\s]", " "),
    ]

    # Pad with one space per side so the \s-anchored patterns can match at
    # the very start and end of the text.
    document = " " + document + " "
    for regex, repl in first_pass:
        document = re.sub(regex, repl, document)

    if PRINT_INFO:
        print("SUB1:", document)

    document = str(document).lower()

    document = regex_tokenize(document)
    if PRINT_INFO:
        print("TOKENED: ", document)

    # Remove single-character tokens (mostly punctuation)
    document = [word for word in document if len(word) > 1]
    if PRINT_INFO:
        print("FLTRD: ", document)

    document = " ".join(document)
    if PRINT_INFO:
        print("JOINED: ", document)

    # Second pass: dropping punctuation tokens can expose new whitespace
    # delimited numbers, so a similar set of rules is applied again.
    second_pass = [
        # Take out 'd%', 'd %'
        (r"\d+(\%|\s\%|\bpercent\b|\s\bpercent\b)", ""),
        # '#d', '# d'
        (r"(#\s|#)\d+", ""),
        # take out lists of numbers with no space
        # (optional decimal tail was the typo `(].]d*)?`)
        (r"\d+(,\d+)+(\.\d*)?", ""),
        # take out list of numbers with space (same typo fixed)
        (r"\d+(,\s\d+)+(\.\d*)?", ""),
        # numeric with 3 digits
        (r"\s\d{3}\s", " "),
        # [1-12]-4DIGIT allowed only: 91-1010
        (r"\s\b(0|00|1[3-9]|[2-9]\d)\b[-/]\d{4}\s", " "),
        # 4DIGIT-[1-12] allowed only: 4301/43
        (r"\s\d{4}[-/]\b(0|00|1[3-9]|[2-9]\d)\b\s", " "),
        # e.g.: ' 10.1 ' and 10.2 (was `\[.]`, which required a literal '[')
        (r"\s\d+[.]\d+\s", " "),
        # Take out numbers longer than 8 digits (8 because datetimes
        # 20190320 should stay)
        (r"\d{9,}", ""),
        # Take out " m. " for maybe, ' c. ' for cerca, etc.
        (r"\s\D[.]\s", " "),
        # Take out 123/29 because not a date format, usually indicating
        # temperature/etc.
        (r"\s\d{3}\/\d{2}\s", " "),
        # this and next one: e-a4 they are e7-1
        (r"\s[a-zA-Z]+-[a-zA-Z]+\d\s", " "),
        (r"\s[a-zA-Z]\d+-\d+\s", " "),
        # take out examples like `j23`
        (r"\s[a-zA-Z]\d+\s", " "),
    ]

    document = " " + document + " "
    for regex, repl in second_pass:
        document = re.sub(regex, repl, document)

    if PRINT_INFO:
        print("TO DFINDER: ", document)

    return document
def text_remove_numbers_stopwords(document, lst_stopwords):
    """Conduct final processing steps after date extraction

    Punctuation characters are replaced by spaces, ASCII digits are removed,
    the text is tokenized with ``regex_tokenize`` and every token found in
    ``lst_stopwords`` is dropped.

    Parameters
    ----------
    document : str
        String representation of a document
    lst_stopwords : list
        List of stop words which will be filtered in final preprocessing step

    Returns
    -------
    string
        string of processed document, tokens joined by single spaces
    """
    # One translation pass replaces the chain of str.replace() calls and the
    # digit-stripping regex: punctuation -> space, digits -> removed.
    punctuation = "<>,.*?!/\\:\"'@#$%^&(){}[]|~`_-"
    table = str.maketrans(punctuation, " " * len(punctuation), "0123456789")
    stripped = document.translate(table)

    # Build the lookup once so each token is an O(1) membership test; fall
    # back to the container as given if it holds unhashable entries.
    try:
        stopwords = frozenset(lst_stopwords)
    except TypeError:
        stopwords = lst_stopwords

    kept = [tok for tok in regex_tokenize(stripped) if tok not in stopwords]
    return " ".join(kept)
def get_keywords_of_interest(document_tok, reference_df, reference_col_dict):
    """Find keywords of interest in list of strings from reference dict.

    Each token that appears in the reference "from" column is translated to
    the matching entry of the reference "to" column, and the distinct
    translated values are returned. For example, if the string 'inverter' is
    in the list of text, return ['inverter'].

    Parameters
    ----------
    document_tok : list of str
        Tokenized text, functionally a list of string values.
    reference_df : DataFrame
        Holds columns that define the reference dictionary to search for
        keywords of interest.
        Note: This function can currently only handle single words, no
        n-gram functionality.
    reference_col_dict : dict of {str: str}
        A dictionary that contains the column names that describes how
        referencing is going to be done

        - reference_col_from : string, should be assigned to associated
          column name in reference_df that are possible input reference
          values
          Example: pd.Series(['inverter', 'invert', 'inv'])
        - reference_col_to : string, should be assigned to associated column
          name in reference_df that are the output reference values of
          interest
          Example: pd.Series(['inverter', 'inverter', 'inverter'])

    Returns
    -------
    included_equipment: list of str
        List of keywords from reference_dict found in list_of_txt, can be
        more than one value. The order of the entries is not defined.
    """
    source_col = reference_col_dict["reference_col_from"]
    target_col = reference_col_dict["reference_col_to"]

    # Map every accepted spelling to its output keyword; a spelling listed
    # more than once keeps its last mapping.
    alias_to_keyword = {}
    for alias, keyword in zip(reference_df[source_col], reference_df[target_col]):
        alias_to_keyword[alias] = keyword

    # Spellings that actually occur in the document.
    aliases_present = alias_to_keyword.keys() & document_tok

    # Distinct output keywords for those spellings.
    found_keywords = set()
    for alias in aliases_present:
        found_keywords.add(alias_to_keyword[alias])
    return list(found_keywords)