Source code for pvops.text.utils

import pandas as pd
import numpy as np


def remap_attributes(om_df, remapping_df, remapping_col_dict,
                     allow_missing_mappings=False, print_info=False):
    """
    A utility function which remaps the attributes of om_df using columns
    within remapping_df.

    The attribute column and both remapping columns are lower-cased before
    matching, so the remapping is case-insensitive and the returned values
    are lower case. Neither ``om_df`` nor ``remapping_df`` is modified.

    Parameters
    ----------
    om_df : DataFrame
        A pandas dataframe containing O&M data, which needs to be remapped.
    remapping_df : DataFrame
        Holds columns that define the remappings
    remapping_col_dict : dict of {str: str}
        A dictionary that contains the column names that describes how
        remapping is going to be done

        - attribute_col : string, should be assigned to associated column
          name in om_df which will be remapped
        - remapping_col_from : string, should be assigned to associated
          column name in remapping_df that matches original attribute of
          interest in om_df
        - remapping_col_to : string, should be assigned to associated
          column name in remapping_df that contains the final mapped entries
    allow_missing_mappings : bool
        If True, attributes without a specified mapping are kept (mapped to
        themselves, lower-cased). If False, attributes that are not listed
        in `remapping_df` become NaN in the returned dataframe.
    print_info : bool
        If True, print information about remapping.

    Returns
    -------
    DataFrame
        Copy of `om_df` with the attribute column remapped.
    """
    df = om_df.copy()
    # Work on a private copy: the columns are lower-cased below and the
    # caller's mapping table must not change as a side effect.
    remapping_df = remapping_df.copy()

    ATTRIBUTE_COL = remapping_col_dict["attribute_col"]
    REMAPPING_COL_FROM = remapping_col_dict["remapping_col_from"]
    REMAPPING_COL_TO = remapping_col_dict["remapping_col_to"]

    # Lower all columns so that matching is case-insensitive
    df[ATTRIBUTE_COL] = df[ATTRIBUTE_COL].str.lower()

    if print_info:
        print("Initial value counts:")
        print(df[ATTRIBUTE_COL].value_counts())

    remapping_df[REMAPPING_COL_FROM] = remapping_df[REMAPPING_COL_FROM].str.lower()
    remapping_df[REMAPPING_COL_TO] = remapping_df[REMAPPING_COL_TO].str.lower()

    if allow_missing_mappings:
        # Attributes present in the data but absent from the mapping table
        # are mapped to themselves so they survive the `.map` call below.
        unique_words_in_data = set(df[ATTRIBUTE_COL].tolist())
        missing_mappings = list(
            unique_words_in_data - set(remapping_df[REMAPPING_COL_FROM])
        )
        if missing_mappings:
            temp_remapping_df = pd.DataFrame({
                REMAPPING_COL_FROM: missing_mappings,
                REMAPPING_COL_TO: missing_mappings,
            })
            remapping_df = pd.concat([remapping_df, temp_remapping_df])

    if print_info:
        print("All mappings:\n", remapping_df)

    renamer = dict(
        zip(remapping_df[REMAPPING_COL_FROM], remapping_df[REMAPPING_COL_TO])
    )
    # Values with no key in `renamer` become NaN here.
    df[ATTRIBUTE_COL] = df[ATTRIBUTE_COL].map(renamer)

    if print_info:
        print("Final attribute distribution:")
        print(df[ATTRIBUTE_COL].value_counts())

        num_nan = int(df[ATTRIBUTE_COL].isna().sum())
        print(f"Number of nan definitions of {ATTRIBUTE_COL}: {num_nan}")

    return df
def remap_words_in_text(om_df, remapping_df, remapping_col_dict):
    """
    A utility function which remaps a text column of om_df using columns
    within remapping_df.

    The text column and both remapping columns are lower-cased, then every
    match of a ``remapping_col_from`` entry inside the text is replaced by
    its ``remapping_col_to`` entry. Entries are passed to
    ``Series.replace(..., regex=True)``, so they are interpreted as regular
    expressions and may match inside longer words. Neither ``om_df`` nor
    ``remapping_df`` is modified.

    Parameters
    ----------
    om_df : DataFrame
        A pandas dataframe containing O&M note data
    remapping_df : DataFrame
        Holds columns that define the remappings
    remapping_col_dict : dict of {str: str}
        A dictionary that contains the column names that describes how
        remapping is going to be done

        - data : string, should be assigned to associated column name in
          om_df whose text will be lower-cased and remapped
        - remapping_col_from : string, should be assigned to associated
          column name in remapping_df that matches original attribute of
          interest in om_df
        - remapping_col_to : string, should be assigned to associated
          column name in remapping_df that contains the final mapped entries

    Returns
    -------
    DataFrame
        Copy of `om_df` with the text column lower-cased and remapped.
    """
    df = om_df.copy()
    TEXT_COL = remapping_col_dict["data"]
    REMAPPING_COL_FROM = remapping_col_dict["remapping_col_from"]
    REMAPPING_COL_TO = remapping_col_dict["remapping_col_to"]

    # Private copy so the caller's mapping table is left untouched.
    remapping_df = remapping_df.copy()

    # Make the remapping case-insensitive by lower-casing everything.
    remapping_df[REMAPPING_COL_FROM] = remapping_df[REMAPPING_COL_FROM].str.lower()
    remapping_df[REMAPPING_COL_TO] = remapping_df[REMAPPING_COL_TO].str.lower()
    df[TEXT_COL] = df[TEXT_COL].str.lower()

    # Drop any pairs whose input equals the output. This must happen AFTER
    # lower-casing: a pair that differs only by case would otherwise survive
    # as an identity pattern, and because patterns are regexes an identity
    # pattern such as "fus." -> "fus." can still rewrite text ("fuse").
    is_noop = remapping_df[REMAPPING_COL_FROM] == remapping_df[REMAPPING_COL_TO]
    remapping_df = remapping_df[~is_noop]

    renamer = dict(
        zip(remapping_df[REMAPPING_COL_FROM], remapping_df[REMAPPING_COL_TO])
    )
    # Nothing to substitute when the mapping is empty.
    if renamer:
        df[TEXT_COL] = df[TEXT_COL].replace(renamer, regex=True)
    return df