Source code for pvops.text2time.visualize

"""These functions focus on visualizing the processed O&M and production data"""
import pandas as pd
import numpy as np
from scipy.signal import find_peaks
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator


def visualize_counts(om_df, om_col_dict, count_var, fig_sets):
    """
    Produces a seaborn countplot of an O&M categorical column using sns.countplot()

    Parameters
    ----------
    om_df : DataFrame
        A data frame corresponding to the O&M data after having been pre-processed
        to address NANs and date consistency, and after applying the
        ``om_summary_stats`` function.
        This data frame needs at least the columns specified in om_col_dict.
        It is not modified by this function.
    om_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the O&M data

        - **siteid** (*string*), should be assigned to column name for associated
          site-ID in om_df.
        - **modatestart** (*string*), should be assigned to column name desired for
          month of event start. This column is calculated by ``om_summary_stats``

    count_var : str
        Column name that contains categorical variable to be plotted.
        When it equals the **modatestart** column, bars are additionally split
        (``hue``) by the **siteid** column.
    fig_sets : dict
        A dictionary that contains the settings to be used for the
        figure to be generated, and those settings should include:

        - **figsize** (*tuple*), which is a tuple of the figure settings
          (e.g. *(12,10)* )
        - **fontsize** (*int*), which is the desired font-size for the figure

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the count plot.
    """
    # assigning dictionary items to local variables for cleaner code
    om_site = om_col_dict["siteid"]
    om_mo_st = om_col_dict["modatestart"]
    my_figsize = fig_sets["figsize"]
    my_fontsize = fig_sets["fontsize"]

    # For plot title labels
    if count_var == om_site:
        ttl_key = "Site"
        hue = None
    elif count_var == om_mo_st:
        ttl_key = "Month"
        hue = om_site
    else:
        ttl_key = count_var
        hue = None

    fig = plt.figure(figsize=my_figsize)

    # Cast on a private copy so the caller's frame keeps its original dtype.
    plot_df = om_df.copy()
    plot_df[count_var] = plot_df[count_var].astype("category")
    ax = sns.countplot(x=count_var, data=plot_df, hue=hue)

    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment="center",
        fontweight="medium",
        fontsize=my_fontsize - 2,
    )

    # Counts are integers: let the locator choose integer ticks and keep the
    # default formatter, so every label is derived from its own tick value.
    # (Pinning label strings with set_yticklabels and swapping the locator
    # afterwards assigns labels by position and can mislabel the axis.)
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.tick_params(axis="y", labelsize=my_fontsize - 2)
    plt.setp(ax.get_yticklabels(), verticalalignment="center", fontweight="medium")

    ax.set_xlabel(count_var, fontsize=my_fontsize, fontweight="bold")
    ax.set_ylabel("Count", fontsize=my_fontsize, fontweight="bold")
    ax.set_title("Number of Reported Events by " + str(ttl_key), fontsize=my_fontsize)

    fig.tight_layout()
    return fig


def visualize_categorical_scatter(om_df, om_col_dict, cat_varx, cat_vary, fig_sets):
    """
    Produces a seaborn categorical scatter plot to show the relationship between
    an O&M numerical column and a categorical column using sns.catplot()

    Parameters
    ----------
    om_df : DataFrame
        A data frame corresponding to the O&M data after having been pre-processed
        to address NANs and date consistency, and after applying the
        ``om_summary_stats`` function.
        This data frame needs at least the columns specified in om_col_dict.
    om_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the O&M data

        - **eventdur** (*string*), should be assigned to column name desired for
          repair duration. This column is calculated by ``om_summary_stats``
        - **agedatestart** (*string*), should be assigned to column name desired for
          age of site when event started. This column is calculated by
          ``om_summary_stats``

    cat_varx : str
        Column name that contains categorical variable to be plotted
        (also used as the ``hue`` variable).
    cat_vary : str
        Column name that contains numerical variable to be plotted.
        It selects the plot title: the **agedatestart** and **eventdur** columns
        get descriptive titles, any other column name is used as the title itself.
    fig_sets : dict
        A dictionary that contains the settings to be used for the
        figure to be generated, and those settings should include:

        - **figsize** (*tuple*), which is a tuple of the figure settings
          (e.g. *(12,10)* )
        - **fontsize** (*int*), which is the desired font-size for the figure

    Returns
    -------
    matplotlib.figure.Figure
        The figure created by ``sns.catplot``, resized to **figsize**.

    Notes
    -----
    The axis labels are fixed to "Site ID" (x) and "Days" (y) regardless of the
    columns that are passed in.
    """
    # assigning dictionary items to local variables for cleaner code
    om_rep_dur = om_col_dict["eventdur"]
    om_age_st = om_col_dict["agedatestart"]
    my_figsize = fig_sets["figsize"]
    my_fontsize = fig_sets["fontsize"]

    # catplot builds its own figure; the current axes/figure are fetched below
    sns.catplot(x=cat_varx, y=cat_vary, data=om_df, hue=cat_varx)
    ax = plt.gca()

    # Re-style the x tick labels while keeping the category names seaborn put
    # there (labelling with the raw tick positions would show 0, 1, 2, ...).
    xticks = ax.get_xticks()
    xlabels = [label.get_text() for label in ax.get_xticklabels()]
    if not any(xlabels):
        # No label text available yet: fall back to the tick positions.
        xlabels = xticks
    ax.set_xticks(xticks)
    ax.set_xticklabels(
        xlabels,
        rotation=45,
        horizontalalignment="center",
        fontweight="medium",
        fontsize=my_fontsize - 2,
    )

    # y ticks are truncated to integers and used both as positions and labels
    yticks = [int(tick) for tick in ax.get_yticks()]
    ax.set_yticks(yticks)
    ax.set_yticklabels(
        yticks,
        verticalalignment="center",
        fontweight="medium",
        fontsize=my_fontsize - 2,
    )

    if cat_vary == om_age_st:
        ttl_key = "Age of System at Event Occurence"
    elif cat_vary == om_rep_dur:
        ttl_key = "Duration of Event"
    else:
        ttl_key = cat_vary

    ax.set_xlabel("Site ID", fontsize=my_fontsize, fontweight="bold")
    ax.set_ylabel("Days", fontsize=my_fontsize, fontweight="bold")
    ax.set_title(ttl_key, fontsize=my_fontsize)

    fig = plt.gcf()
    fig.set_size_inches(my_figsize)
    fig.tight_layout()
    return fig


def _resample_per_period(series, samp_freq, cumulative):
    """Aggregate a time-indexed series to one value per ``samp_freq`` bin.

    Cumulative input is reduced to (max - min) within each bin; per-time-step
    input is summed within each bin. Bins are labelled by their left edge.
    """
    resampler = series.resample(samp_freq, label="left")
    if cumulative:
        return resampler.max() - resampler.min()
    return resampler.sum()


def visualize_om_prod_overlap(
    prod_df,
    om_df,
    prod_col_dict,
    om_col_dict,
    prod_fldr,
    e_cumu,
    be_cumu,
    samp_freq="H",
    pshift=0.0,
    baselineflag=True,
):
    """
    Creates Plotly figures of performance data overlaid with coinciding O&M tickets.
    A separate figure for each site in the production data frame (prod_df) is generated
    and also written to ``<prod_fldr>/<site>.html``.

    Parameters
    ----------
    prod_df : DataFrame
        A data frame corresponding to the performance data after (ideally) having been
        processed by the perf_om_NA_qc and overlappingDFs functions. This data
        frame needs to contain the columns specified in prod_col_dict.
        The input frame is copied and left unmodified.
    om_df : DataFrame
        A data frame corresponding to the O&M data after (ideally) having been processed
        by the perf_om_NA_qc and overlappingDFs functions. This data frame needs
        to contain the columns specified in om_col_dict.
        The input frame is copied and left unmodified.
    prod_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the production data

        - **siteid** (*string*), should be assigned to associated site-ID column name
          in prod_df
        - **timestamp** (*string*), should be assigned to associated time-stamp column
          name in prod_df
        - **energyprod** (*string*), should be assigned to associated production column
          name in prod_df
        - **baseline** (*string*), should be assigned to the baseline (expected
          production) column name in prod_df. Only read when ``baselineflag`` is True.

    om_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the O&M data

        - **siteid** (*string*), should be assigned to column name for user's site-ID
        - **datestart** (*string*), should be assigned to column name for user's
          O&M event start-date
        - **dateend** (*string*), should be assigned to column name for user's
          O&M event end-date
        - **workID** (*string*), should be assigned to column name for user's
          O&M unique event ID
        - **worktype** (*string*), should be assigned to column name for user's
          O&M ticket type (corrective, predictive, etc)
        - **asset** (*string*), should be assigned to column name for affected
          asset in user's O&M ticket

    prod_fldr : str
        Path to directory where plots should be saved.
    e_cumu : bool
        Boolean flag that specifies whether the production (energy output)
        data is input as cumulative information ("True") or on a per time-step
        basis ("False").
    be_cumu : bool
        Boolean that specifies whether the baseline production data is input
        as cumulative information ("True") or on a per time-step basis ("False").
    samp_freq : str
        Specifies how the performance data should be resampled.
        String value is any frequency that is valid for pandas.DataFrame.resample().
        For example, a value of 'D' will resample on a daily basis, and a
        value of 'H' will resample on an hourly basis.
    pshift : float
        Value that specifies how many hours the performance data
        should be shifted by to help align performance data with O&M data.
        Mostly necessary when resampling frequencies are larger than an hour
    baselineflag : bool
        Boolean that specifies whether or not to display the baseline (i.e.,
        expected production profile). A value of 'True' will display the
        baseline production profile on the generated Plotly figures, and a
        value of 'False' will not.

    Returns
    -------
    list
        List of Plotly figure handles generated by function for each site within prod_df.

    Notes
    -----
    Sites that have production data but no O&M tickets get a figure with the
    production (and baseline) traces only. When no production dip can be
    located for a site, the "Nearest Prod Dip" hover entry reads "NA".
    """
    # assigning dictionary items to local variables for cleaner code
    om_site = om_col_dict["siteid"]
    om_date_s = om_col_dict["datestart"]
    om_date_e = om_col_dict["dateend"]
    om_wo_id = om_col_dict["workID"]
    om_wtype = om_col_dict["worktype"]
    om_asset = om_col_dict["asset"]

    prod_site = prod_col_dict["siteid"]
    prod_ts = prod_col_dict["timestamp"]
    prod_ener = prod_col_dict["energyprod"]
    # The baseline column is only needed when it is actually drawn.
    # If none is provided, using iec_calc() is recommended.
    prod_baseline = prod_col_dict["baseline"] if baselineflag else None

    # creating local dataframes to not modify originals
    prod_df = prod_df.copy()
    om_df = om_df.copy()

    # Setting multi-indices for ease of plotting
    prod_df.set_index([prod_site, prod_ts], inplace=True)
    prod_df.sort_index(inplace=True)
    om_df.set_index([om_site, om_date_s], inplace=True)
    om_df.sort_index(inplace=True)

    # Accept both the legacy and the current pandas aliases for hour/minute.
    is_hourly = samp_freq in ("H", "h")
    is_minutely = samp_freq in ("T", "min")

    # Legend names; any other frequency falls back to un-prefixed names so the
    # traces can always be created.
    if samp_freq == "D":
        freq_label = "Daily "
    elif is_hourly:
        freq_label = "Hourly "
    else:
        freq_label = ""
    perf_name = freq_label + "Energy"
    baseline_name = freq_label + "Baseline"

    om_start_h = 0.75  # To place StartDate points in visible region
    om_end_h = 0.5  # To place EndDate points below the StartDate points
    om_reg_h = 1.05  # To make the om-region slightly higher than the perf data

    shift = pd.Timedelta(hours=pshift)
    sites_with_tickets = set(om_df.index.get_level_values(0))

    figs = []
    for i in prod_df.index.get_level_values(0).unique():
        raw_energy = prod_df.loc[i, prod_ener]

        # Resampling the performance data (different between cumulative and
        # non-cumulative energy output)
        if e_cumu:
            # The third sampling interval is used as the representative time
            # step; shorter series fall back to the last interval available.
            steps = np.diff(raw_energy.index)
            if len(steps) > 0:
                tstep = steps[min(2, len(steps) - 1)] / np.timedelta64(1, "s")
            else:
                tstep = 0.0

            # 3599 / 59.9 to consider roundoff error
            already_at_freq = (is_hourly and tstep >= 3599) or (
                is_minutely and tstep >= 59.9
            )
            if already_at_freq:
                # data is already at (or coarser than) the requested frequency,
                # so successive differences give the per-step energy
                enrg_site = raw_energy.diff()
            else:
                enrg_site = _resample_per_period(raw_energy, samp_freq, True)
        else:
            # energy data is given on a per time-step basis
            enrg_site = _resample_per_period(raw_energy, samp_freq, False)

        # shifting time to help align performance data with O&M data
        enrg_site.index = enrg_site.index + shift
        energy_peak = enrg_site.max()

        # Correction for om-region if baseline for production data is plotted
        if baselineflag:
            baseline_site = _resample_per_period(
                prod_df.loc[i, prod_baseline], samp_freq, be_cumu
            )
            baseline_site.index = baseline_site.index + shift
            om_reg_hcorr = baseline_site.max() / energy_peak
        else:
            om_reg_hcorr = 1.0
        region_top = energy_peak * om_reg_h * om_reg_hcorr

        # initializing plotly-figure
        fig = go.Figure(layout_yaxis_range=[-5, region_top])

        # plotting all Perf data for i-th site (captured in enrg_site)
        fig.add_trace(go.Scatter(x=enrg_site.index, y=enrg_site.values, name=perf_name))
        if baselineflag:
            fig.add_trace(
                go.Scatter(
                    x=baseline_site.index, y=baseline_site.values, name=baseline_name
                )
            )

        if i in sites_with_tickets:
            # finding where energy dips: first by integer position, then
            # converted to dates using the (shifted) index
            edips_all = find_peaks(enrg_site * -1)[0]
            edips_all_dates = enrg_site.index[edips_all]

            # Nearest dip to each ticket's start date
            if len(edips_all_dates) > 0:
                edips_nearom_indices = [
                    np.argmin(abs(edips_all_dates - xx)) for xx in om_df.loc[i].index
                ]
                om_df.loc[i, "corr_perfDip"] = edips_all_dates[edips_nearom_indices]
            else:
                om_df.loc[i, "corr_perfDip"] = pd.NaT

            # Taking largest value of output (perf-data) to create a [fictitious]
            # plot-value/column for OM-data
            om_df.loc[i, "perfval_plotcol"] = energy_peak

            # Tickets of this site, indexed by their start date
            site_om = om_df.loc[i]

            # Shaded region per ticket: left edge is the event start (the
            # index), right edge the event end. Both corners share one height,
            # which also keeps x and y the same length for single-ticket sites.
            for start, end in zip(site_om.index, site_om[om_date_e]):
                fig.add_trace(
                    dict(
                        type="scatter",
                        x=[start, end],
                        y=[region_top, region_top],
                        mode="markers+lines",
                        line=dict(width=0),
                        marker=dict(size=[0, 0]),
                        fill="tozeroy",
                        fillcolor="rgba(190,0,0,.15)",
                        hoverinfo="none",
                        showlegend=False,
                        name="OM Ticket",
                    )
                )

            # Hover text shared by the start and end markers
            ticket_txt = (
                "WO#: "
                + site_om[om_wo_id].astype(str)
                + "<br>"
                + "Type: "
                + site_om[om_wtype].astype(str)
                + "<br>"
                + "Asset: "
                + site_om[om_asset].fillna("Asset_NA").astype(str)
            )
            dip_txt = (
                pd.to_datetime(site_om["corr_perfDip"])
                .dt.strftime("%b %d, %Y")
                .fillna("NA")
            )
            marker_base = site_om["perfval_plotcol"].values

            # Adding EventStart Points with hover-text
            fig.add_trace(
                go.Scatter(
                    x=site_om.index,
                    y=marker_base * om_start_h,
                    mode="markers",
                    hovertemplate="Start: "
                    + "%{x} <br>"
                    + ticket_txt
                    + "<br>"
                    + "Nearest Prod Dip: "
                    + dip_txt,
                    name="OM_start",
                )
            )

            # Adding EventEnd Points with hover-text
            fig.add_trace(
                go.Scatter(
                    x=site_om[om_date_e],
                    y=marker_base * om_end_h,
                    mode="markers",
                    hovertemplate="End: " + "%{x} <br>" + ticket_txt,
                    name="OM_end",
                )
            )

        # Setting y-axes and title (str() so non-string site IDs work too)
        fig.update_yaxes(title_text="Energy Delivered (kWh)")
        fig.update_layout(title_text="Site: " + str(i))

        # appending fig object to figs list
        figs.append(fig)

        # Saving Figure
        fig.write_html(prod_fldr + "/" + str(i) + ".html")

    return figs