# Source code for pvops.text.utils

import pandas as pd
import numpy as np



# [docs]
def remap_attributes(om_df, remapping_df, remapping_col_dict,
                     allow_missing_mappings=False, print_info=False):
    """
    A utility function which remaps the attributes of om_df using columns
    within remapping_df.

    Matching is case-insensitive: the attribute column of `om_df` and both
    mapping columns of `remapping_df` are lowercased before the lookup, so
    the remapped column in the result is lowercase. Neither input
    dataframe is modified; both are copied before any change is made.

    Parameters
    ----------
    om_df : DataFrame
        A pandas dataframe containing O&M data, which needs to be remapped.
    remapping_df : DataFrame
        Holds columns that define the remappings
    remapping_col_dict : dict of {str: str}
        A dictionary that contains the column names that describes how
        remapping is going to be done

        - attribute_col : string, should be assigned to associated
          column name in om_df which will be remapped
        - remapping_col_from : string, should be assigned
          to associated column name in remapping_df that matches
          original attribute of interest in om_df
        - remapping_col_to : string, should be assigned to
          associated column name in remapping_df that contains the
          final mapped entries
    allow_missing_mappings : bool
        If True, attributes without a specified mapping keep their own
        (lowercased) value in the final dataframe.
        If False, attributes without a specified mapping are set to NaN
        in the final dataframe (their rows are not removed).
    print_info : bool
        If True, print information about remapping.

    Returns
    -------
    DataFrame
        Copy of `om_df` with the attribute column remapped.
    """
    df = om_df.copy()
    # Copy the mapping table as well, so that lowercasing it below does not
    # leak back into the caller's dataframe.
    remapping_df = remapping_df.copy()

    ATTRIBUTE_COL = remapping_col_dict["attribute_col"]
    REMAPPING_COL_FROM = remapping_col_dict["remapping_col_from"]
    REMAPPING_COL_TO = remapping_col_dict["remapping_col_to"]

    # Lowercase both sides of the lookup so matching ignores case
    df[ATTRIBUTE_COL] = df[ATTRIBUTE_COL].str.lower()

    if print_info:
        print("Initial value counts:")
        print(df[ATTRIBUTE_COL].value_counts())

    remapping_df[REMAPPING_COL_FROM] = remapping_df[REMAPPING_COL_FROM].str.lower()
    remapping_df[REMAPPING_COL_TO] = remapping_df[REMAPPING_COL_TO].str.lower()

    if allow_missing_mappings:
        # Attributes present in the data but absent from the mapping table
        # are mapped onto themselves so they survive the lookup below.
        unique_words_in_data = set(df[ATTRIBUTE_COL].tolist())
        missing_mappings = list(
            unique_words_in_data - set(remapping_df[REMAPPING_COL_FROM])
        )
        # Skip the concat when nothing is missing: appending an empty frame
        # adds no mappings and only provokes pandas dtype warnings.
        if missing_mappings:
            temp_remapping_df = pd.DataFrame()
            temp_remapping_df[REMAPPING_COL_FROM] = missing_mappings
            temp_remapping_df[REMAPPING_COL_TO] = missing_mappings
            remapping_df = pd.concat([remapping_df, temp_remapping_df])

    if print_info:
        print("All mappings:\n", remapping_df)

    # If a "from" value appears more than once, the last row wins.
    renamer = dict(
        zip(remapping_df[REMAPPING_COL_FROM], remapping_df[REMAPPING_COL_TO])
    )
    # Series.map yields NaN for every value that has no key in `renamer`.
    df[ATTRIBUTE_COL] = df[ATTRIBUTE_COL].map(renamer)

    if print_info:
        print("Final attribute distribution:")
        print(df[ATTRIBUTE_COL].value_counts())

        # Both pieces must be f-strings; otherwise the count is printed as
        # literal placeholder text instead of a number.
        print(f"Number of nan definitions of {ATTRIBUTE_COL}:"
              f"{df[ATTRIBUTE_COL].isna().sum()}")

    return df



# [docs]
def remap_words_in_text(om_df, remapping_df, remapping_col_dict):
    """
    Rewrite a free-text column of om_df by substituting terms according to
    a lookup table held in remapping_df.

    The text column and both mapping columns are lowercased, then every
    "from" entry is applied to the text as a regular-expression pattern
    and each match is replaced by the corresponding "to" entry.

    Parameters
    ----------
    om_df : DataFrame
        A pandas dataframe containing O&M note data
    remapping_df : DataFrame
        Holds columns that define the remappings
    remapping_col_dict : dict of {str: str}
        Names of the columns involved in the remapping

        - data : string, name of the om_df column whose text is remapped
        - remapping_col_from : string, name of the remapping_df column
          holding the terms to look for
        - remapping_col_to : string, name of the remapping_df column
          holding the replacement terms

    Returns
    -------
    DataFrame
        Copy of om_df with the text column lowercased and remapped
    """
    result = om_df.copy()
    text_col = remapping_col_dict["data"]
    from_col = remapping_col_dict["remapping_col_from"]
    to_col = remapping_col_dict["remapping_col_to"]

    # Rows whose source and target are already identical (compared before
    # lowercasing) contribute nothing, so they are left out of the lookup.
    is_real_change = remapping_df[from_col] != remapping_df[to_col]
    effective = remapping_df.loc[is_real_change]

    # Lowercase both sides of the lookup and the text itself, so matching
    # does not depend on letter case.
    patterns = effective[from_col].str.lower()
    replacements = effective[to_col].str.lower()
    renamer = dict(zip(patterns, replacements))

    lowered_text = result[text_col].str.lower()
    result[text_col] = lowered_text.replace(renamer, regex=True)

    return result