Source code for pvops.text.preprocess

import re

import numpy as np
import datefinder
import traceback
from datetime import datetime, timedelta



[docs]
def regex_tokenize(doc, pattern=r'(?<=\w)([^\w\s]+)(?=\s)|(?<=\s)([^\w\s]+)(?=\w)'):
    """
    Split a document into word and punctuation tokens.

    Every substring matching ``pattern`` is surrounded with spaces, runs of
    whitespace are collapsed, and the result is split on single spaces.
    With the default pattern, punctuation attached to the start or end of a
    word becomes its own token, while punctuation inside a word or number
    (e.g. ``don't``, ``3.14``) is left in place.

    Parameters
    ----------
    doc : str
        The text to tokenize.

    pattern : str, optional
        Regular expression marking the substrings to isolate. The
        replacement refers to capture groups 1 and 2, so the pattern must
        define both.

    Returns
    -------
    list of str
        The tokens in document order. An empty or whitespace-only document
        yields ``['']``.
    """

    # Pad both ends so punctuation at the very start or end of the text is
    # adjacent to whitespace and can satisfy the pattern's lookarounds.
    padded = "".join((" ", doc, " "))
    # Isolate every match by putting a space on each side of it.
    isolated = re.sub(pattern, r' \1\2 ', padded)
    # Squash whitespace runs, drop the outer padding, then cut on spaces.
    normalized = re.sub(r'\s+', ' ', isolated).strip()
    return normalized.split(' ')




[docs]
def preprocessor(
    om_df, lst_stopwords, col_dict, print_info=False, extract_dates_only=False
):
    """
    Clean the raw text of every O&M record and extract the dates it mentions.

    For each row the text is lower-cased, stripped of number patterns that
    are not dates, and scanned for dates. Unless ``extract_dates_only`` is
    set, the text is additionally stripped of punctuation, digits and stop
    words. Finally, extracted dates lying about a year or more away from the
    row's event start are discarded.

    Parameters
    ----------
    om_df : DataFrame
        A pandas dataframe containing O&M data, which contains at least the columns within col_dict.
        It is not modified; a copy with a reset index is returned.
    lst_stopwords : list
        List of stop words which will be filtered in final preprocessing step
    col_dict : dict of {str: str}
        A dictionary that contains the column names relevant for the get_dates fn
        - data : string, should be assigned to associated column which stores the text logs
        - eventstart : string, should be assigned to associated column which stores the log submission datetime
        - save_data_column : string, should be assigned to associated column where the processed text should be stored
        - save_date_column : string, should be assigned to associated column where the extracted dates from the text should be stored

    print_info : bool
        Flag indicating whether to print information about the preprocessing progress
    extract_dates_only : bool
        If True, return after extracting dates in each ticket
        If False, return with preprocessed text and extracted dates

    Returns
    -------
    df : DataFrame
        Contains the original columns as well as the processed data, located in columns defined by the inputs.
        A row whose date extraction raised holds NaN in the date column; a
        row whose text cleaning raised holds an empty string in the text
        column.
    """

    DATA_COLUMN = col_dict["data"]
    EVENTSTART_COLUMN = col_dict["eventstart"]
    SAVE_DATA_COLUMN = col_dict["save_data_column"]
    SAVE_DATE_COLUMN = col_dict["save_date_column"]

    # Extracted dates at least this many seconds (about one year) away from
    # the event start are dropped in the filtering step.
    max_offset_seconds = 3.154e7

    # Work on a copy with a fresh 0..n-1 index, because get_dates addresses
    # rows by position.
    df = om_df.copy()
    df.reset_index(drop=True, inplace=True)

    dates_extracted = []
    clean_corpus = []

    n_total = len(df.index)
    n_nans = 0
    n_fails_date = 0
    n_fails_prep = 0
    tally = 0  # never incremented; only echoed in the summary line below

    for ind, row in df.iterrows():
        document = row[DATA_COLUMN]
        # NaN never compares equal to anything (itself included), so an
        # equality test against np.nan would never count a missing text.
        if isinstance(document, float) and np.isnan(document):
            n_nans += 1

        try:
            document = str(document).lower()
            document = text_remove_nondate_nums(
                document, PRINT_INFO=print_info)
            dts = get_dates(document, df, ind, col_dict, print_info)
            if print_info:
                print("Dates: ", dts)
        except Exception as e:
            # Best effort: report the problem, mark the row, keep going.
            print(e)
            dts = np.nan
            n_fails_date += 1

        dates_extracted.append(dts)

        if not extract_dates_only:
            try:
                out = text_remove_numbers_stopwords(document, lst_stopwords)
                clean_corpus.append(out)
            except Exception:
                print(traceback.format_exc())
                clean_corpus.append("")
                n_fails_prep += 1

    if print_info:
        print(
            f"len clean corpus: {len(clean_corpus)}, len deduced dates: {len(dates_extracted)}"
        )
        print(
            f"num_total {n_total}, num_nans {n_nans}, num_fails_date {n_fails_date}, num_fails_prep {n_fails_prep}, tally {tally}"
        )

    df[SAVE_DATE_COLUMN] = dates_extracted
    if not extract_dates_only:
        df[SAVE_DATA_COLUMN] = clean_corpus

    filtered_dates = []
    for _, row in df.iterrows():
        nlp_dates = row[SAVE_DATE_COLUMN]

        # Rows whose extraction failed carry the NaN marker (a float, which
        # has no len()); rows without dates carry an empty list. Neither has
        # anything to filter.
        extraction_failed = isinstance(nlp_dates, float)
        if extraction_failed or len(nlp_dates) == 0:
            filtered_dates.append(nlp_dates)
            continue

        try:
            event_start = datetime.strptime(
                row[EVENTSTART_COLUMN], "%Y-%m-%d %H:%M:%S")
            nearby = [
                dt
                for dt in nlp_dates
                if abs((event_start - dt).total_seconds()) < max_offset_seconds
            ]
        except Exception:
            # Missing or differently formatted event start, or a date that
            # cannot be subtracted from it: keep the unfiltered dates.
            nearby = nlp_dates
        filtered_dates.append(nearby)

    df[SAVE_DATE_COLUMN] = filtered_dates

    return df




[docs]
def _is_missing_event_start(value):
    """Return True when ``value`` is a float NaN, i.e. a missing event start."""
    return isinstance(value, float) and np.isnan(value)


def _nearest_base_date(om_df, ind, eventstart_column, print_info):
    """Return the event start parsed from the row closest to ``ind``, or None.

    Rows are visited at growing distance from ``ind``; at each distance the
    earlier row is tried before the later one.
    """
    n_rows = len(om_df.index)
    offset = 1
    while ind - offset >= 0 or ind + offset < n_rows:
        for candidate in (ind - offset, ind + offset):
            if not 0 <= candidate < n_rows:
                continue
            if print_info:
                print("checking index: ", candidate)
            value = om_df.iloc[candidate][eventstart_column]
            if _is_missing_event_start(value):
                continue
            return list(datefinder.find_dates(value))[0]
        offset += 1
    return None


def get_dates(
    document, om_df, ind, col_dict, print_info, infer_date_surrounding_rows=True
):
    """Extract dates from the input document.

    This function is used by ``preprocessor``. For an easy way to extract
    dates, call the preprocessor with ``extract_dates_only=True``.

    The event start of row ``ind`` is parsed and handed to the date finder
    as the base date for the document. Any exception raised while doing so
    is swallowed and results in an empty list.

    Parameters
    ----------
    document : str
        String representation of a document
    om_df : DataFrame
        A pandas dataframe containing O&M data, which contains at least the columns within col_dict.
    ind : integer
        Position of the row currently being processed. It is needed because,
        when that row has no valid `eventstart`, the surrounding rows are
        searched for one, nearest rows first.
    col_dict : dict of {str: str}
        A dictionary that contains the column names relevant for the get_dates fn

        - data : string, should be assigned to associated column which stores the text logs
        - eventstart : string, should be assigned to associated column which stores the log submission datetime

    print_info : bool
        Flag indicating whether to print information about the preprocessing progress
    infer_date_surrounding_rows : bool
        If True and the current row's event start is NaN, search the
        surrounding rows for the nearest available event start and use it as
        the base date.
        If False, or if no row has an event start, no base date is passed to
        the date finder.
        Recommendation: set True if you frequently publish documents and your dataframe is ordered chronologically

    Returns
    -------
    list
        Dates found in the text that fall strictly between 1970-01-01 and
        roughly 100 years from now.
    """

    DATA_COLUMN = col_dict["data"]
    EVENTSTART_COLUMN = col_dict["eventstart"]

    # Bound before the try block so the diagnostics below can always refer
    # to it, even when fetching the row itself is what failed.
    row = None

    try:
        row = om_df.iloc[ind]
        event_start = row[EVENTSTART_COLUMN]
        if print_info:
            print("Start time: ", event_start)

        if not _is_missing_event_start(event_start):
            basedate = list(datefinder.find_dates(event_start))[0]
        elif infer_date_surrounding_rows:
            if print_info:
                print("found nan")
            basedate = _nearest_base_date(
                om_df, ind, EVENTSTART_COLUMN, print_info)
        else:
            basedate = None

        if basedate is None:
            matches = list(datefinder.find_dates(document))
        else:
            matches = list(datefinder.find_dates(document, base_date=basedate))

    except Exception as e:
        matches = []
        if print_info:
            print(traceback.format_exc())
            print("\n")
            print("date")
            print(row.get(EVENTSTART_COLUMN) if row is not None else None)
            print("proc")
            print(document)
            print("raw")
            print(row.get(DATA_COLUMN) if row is not None else None)
            print(ind)
            print(e)

    earliest = datetime.strptime("01/01/1970", "%m/%d/%Y")
    latest = datetime.now() + timedelta(days=365 * 100)

    valid_matches = []
    for mtch in matches:
        try:
            if earliest < mtch < latest:
                valid_matches.append(mtch)
        except TypeError as e:
            # Raised when a timezone-aware match is compared with the naive
            # bounds; such matches are skipped.
            if print_info:
                print(e)

    return valid_matches




[docs]
def _apply_substitutions(document, substitutions):
    """Apply each ``(pattern, replacement)`` pair to ``document``, in order."""
    for pattern, replacement in substitutions:
        document = re.sub(pattern, replacement, document)
    return document


def text_remove_nondate_nums(document, PRINT_INFO=False):
    """Conduct initial text processing steps to prepare the text for date
    extractions. Function mostly uses regex-based text substitution to
    remove numerical structures within the text, which may be mistaken
    as a date by the date extractor.

    The work happens in three stages: URLs and a first set of numeric
    patterns are removed, the text is lower-cased and tokenized with
    single-character tokens dropped, and a second set of patterns is removed
    from the re-joined text.

    Parameters
    ----------
    document : str
        String representation of a document
    PRINT_INFO : bool
        Flag indicating whether to print information about the preprocessing
        progress

    Returns
    -------
    string
        string of processed document, lower-cased; it may carry leading or
        trailing spaces left over from the padding used while substituting
    """

    if PRINT_INFO:
        print()
        print()
        print("IN: ", document)

    # Remove URLs
    find_URL = r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""
    document = re.sub(find_URL, " ", document)

    # Order matters: each substitution runs on the output of the previous one.
    first_pass = [
        # Take out 'd%', 'd %'
        (r"\d+(\%|\s\%|\bpercent\b|\s\bpercent\b)", ""),
        # '#d', '# d'
        (r"(#\s|#)\d+", ""),
        # take out lists of numbers with no space
        (r"-?\d+(,\d+)+(\.\d*)?", ""),
        # numeric with 3 digits
        (r"\s\d{3}\s", " "),
        # [1-12]-4DIGIT  allowed only
        (r"\b(0|00|1[3-9]|[2-9]\d)\b-\d{4}", " "),
        # e.g.: 10.1 and 10.2 with space before and after
        (r"\s+\d+\.\d+\s+", " "),
        # Take out numbers longer than 8 digits (8 because datetimes 20190320 should stay)
        (r"\d{9,}", ""),
        # Take out IP numbers
        # NOTE(review): the last octet only consumes one character
        # ([\d?]) - confirm whether \d+ was intended.
        (r"\d+[.]+\d+[.]\d+[.][\d?]", ""),
        # Take out single digit-hyphen trios e.g. 3-1-4  but leave 10-20-18 (possible date)
        (r"\d-\d-\d", ""),
        # Same for single digits separated by dots, e.g. 3.1.4
        (r"\d[.]\d[.]\d", ""),
        # Normalize power readings such as '5.2 kw ' to ' kW'
        (r"\d+(\.\d*)?\s*[kK]?[wW]\s", " kW"),
        # take out e.g. webbox-10, unless the word is a month name
        (
            r"\b(?!([jJ]an(uary)?|[fF]eb(r)?(uary)?|[mM]ar(ch)?|[aA]pr(il)?|[mM]ay|[jJ]un(e)?|[jJ]ul(y)?|[aA]ug(ust)?|[sS]ep(t)?(ember)?|[oO]ct(ober)?|[nN]ov(ember)?|[dD]ec(ember)?\b))[a-zA-Z]+-\d+",
            " ",
        ),
        # take out email addresses
        (r"[\w\.-]+@[\w\.-]+\.\w+", " "),
        # take out phone numbers
        (
            r"(\s\d{3}[-\.\s]?\d{3}[-\.\s]?\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]?\d{4}|\s\d{3}[-\.\s]\d{4})",
            " ",
        ),
        # e.g. neff - cb 2.1b.16 - forced outage ; unknown. at 1645 26-jun cb 2.1b.16 offline.. 0000 - unknown
        (r"\s\d+[.]\d+\D+[.]\d+\s", " "),
        (r"\s\D[.]\d[,\s]", " "),
    ]

    document = document.center(len(document) + 2)  # add spaces on either side
    document = _apply_substitutions(document, first_pass)

    if PRINT_INFO:
        print("SUB1:", document)

    document = str(document).lower()
    document = regex_tokenize(document)

    if PRINT_INFO:
        print("TOKENED: ", document)

    # Remove single-character tokens (mostly punctuation)
    document = [word for word in document if len(word) > 1]
    if PRINT_INFO:
        print("FLTRD: ", document)
    document = " ".join(document)
    if PRINT_INFO:
        print("JOINED: ", document)

    second_pass = [
        # Take out 'd%', 'd %'
        (r"\d+(\%|\s\%|\bpercent\b|\s\bpercent\b)", ""),
        # '#d', '# d'
        (r"(#\s|#)\d+", ""),
        # take out lists of numbers with no space (plus a trailing decimal part)
        (r"\d+(,\d+)+([.]\d*)?", ""),
        # take out list of numbers with space (plus a trailing decimal part)
        (r"\d+(,\s\d+)+([.]\d*)?", ""),
        # numeric with 3 digits
        (r"\s\d{3}\s", " "),
        # [1-12]-4DIGIT  allowed only: 91-1010
        (r"\s\b(0|00|1[3-9]|[2-9]\d)\b[-/]\d{4}\s", " "),
        # 4DIGIT-[1-12]  allowed only: 4301/43
        (r"\s\d{4}[-/]\b(0|00|1[3-9]|[2-9]\d)\b\s", " "),
        # e.g.: ' 10.1 ' and 10.2
        (r"\s\d+[.]\d+\s", " "),
        # Take out numbers longer than 8 digits (8 because datetimes
        # 20190320 should stay)
        (r"\d{9,}", ""),
        # Take out " m. " for maybe, ' c. ' for cerca, etc.
        (r"\s\D[.]\s", " "),
        # Take out 123/29 because not a date format, usually indicating
        # temperature/etc.
        (r"\s\d{3}\/\d{2}\s", " "),
        # this and next one: e-a4 they are e7-1
        (r"\s[a-zA-Z]+-[a-zA-Z]+\d\s", " "),
        (r"\s[a-zA-Z]\d+-\d+\s", " "),
        # take out examples like `j23`
        (r"\s[a-zA-Z]\d+\s", " "),
    ]

    document = document.center(len(document) + 2)  # add spaces on either side
    document = _apply_substitutions(document, second_pass)

    if PRINT_INFO:
        print("TO DFINDER: ", document)

    return document




[docs]
def text_remove_numbers_stopwords(document, lst_stopwords):
    """Strip punctuation, digits and stop words from a document.

    Intended as the final cleaning step, after dates have been extracted.

    Parameters
    ----------
    document : str
        String representation of a document
    lst_stopwords : list
        Tokens to drop from the result

    Returns
    -------
    string
        The remaining tokens joined by single spaces
    """

    # Turn each listed punctuation character into a space in a single pass.
    punctuation = "<>,.*?!/\\:\"'@#$%^&(){}[]|~`_-"
    to_space = str.maketrans(punctuation, " " * len(punctuation))
    without_punctuation = document.translate(to_space)

    # Drop every run of digits.
    without_digits = re.sub("[0-9]+", "", without_punctuation)

    # Tokenize, then keep only the tokens that are not stop words.
    kept = []
    for token in regex_tokenize(without_digits):
        if token in lst_stopwords:
            continue
        kept.append(token)

    return " ".join(kept)




[docs]
def get_keywords_of_interest(document_tok, reference_df, reference_col_dict):
    """Find keywords of interest in list of strings from reference dict.

    If keywords of interest given in a reference dict are in the list of
    strings, return the keyword category, or categories. For example,
    if the string 'inverter' is in the list of text, return ['inverter'].

    Parameters
    ----------
    document_tok : list of str
        Tokenized text, functionally a list of string values.
    reference_df : DataFrame
        Holds columns that define the reference dictionary to search for keywords of interest,
        Note: This function can currently only handle single words, no n-gram functionality.
    reference_col_dict : dict of {str: str}
        A dictionary that contains the column names that describes how
        referencing is going to be done

        - reference_col_from : string, should be assigned to
          associated column name in reference_df that are possible input reference values
          Example: pd.Series(['inverter', 'invert', 'inv'])
        - reference_col_to : string, should be assigned to
          associated column name in reference_df that are the output reference values
          of interest
          Example: pd.Series(['inverter', 'inverter', 'inverter'])

    Returns
    -------
    included_equipment: list of str
        Distinct output reference values whose input reference value occurs
        in ``document_tok``, can be more than one value. They are listed in
        the order their input values appear in ``reference_df``, so the
        result is the same from run to run.
    """
    REFERENCE_COL_FROM = reference_col_dict["reference_col_from"]
    REFERENCE_COL_TO = reference_col_dict["reference_col_to"]

    # If an input value is listed more than once, its last mapping wins.
    reference_dict = dict(
        zip(reference_df[REFERENCE_COL_FROM], reference_df[REFERENCE_COL_TO])
    )

    tokens = set(document_tok)

    # dict.fromkeys removes duplicates while keeping first-seen order, which
    # a plain set would not guarantee.
    included_keywords = dict.fromkeys(
        keyword for term, keyword in reference_dict.items() if term in tokens
    )
    return list(included_keywords)