Source code for pvops.text2time.utils

"""
These helper functions focus on performing secondary
calculations from the O&M and production data to create
visualizations of the merged data.
"""
import pandas as pd
import numpy as np
import tqdm


[docs]
def interpolate_data(prod_df, om_df, prod_col_dict, om_col_dict, om_cols_to_translate=None):
    """
    Tags every production time-stamp with the O&M tickets that were open at that time.

    For each O&M ticket that overlaps the production record (as determined by
    ``overlapping_data``), the production rows of the same site whose time-stamp
    falls in the half-open interval ``[datestart, dateend)`` are marked and the
    requested O&M fields are appended to per-asset list columns.

    Parameters
    ----------
    prod_df : DataFrame
        A data frame corresponding to the production data. This data frame
        needs the columns specified in prod_col_dict.
    om_df : DataFrame
        A data frame corresponding to the O&M data. This data frame
        needs the columns specified in om_col_dict.
    prod_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the production data

        - **siteid** (*string*), should be assigned to associated site-ID column name in prod_df
        - **timestamp** (*string*), should be assigned to associated time-stamp column name in
          prod_df

    om_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the O&M data

        - **siteid** (*string*), should be assigned to associated site-ID column name in om_df
        - **datestart** (*string*), should be assigned to associated O&M event start-date
          column name in om_df
        - **dateend** (*string*), should be assigned to associated O&M event end-date
          column name in om_df
        - **asset** (*string*), should be assigned to associated asset column name in om_df
        - Others specified in om_cols_to_translate

    om_cols_to_translate : list, optional
        List of om_col_dict keys to translate into prod_df.
        Defaults to ``["asset", "prod_impact"]``.

    Returns
    -------
    prod_df : DataFrame
        A copy of the input production data (all rows, not only the overlapping
        ones) with the following columns added:

        - **has_ticket** (*bool*), True where at least one ticket covers the time-stamp
        - **<asset>_<column>** (*list*), one column per unique O&M asset and per
          translated O&M column; each cell holds the values of that O&M column for
          all tickets of that asset covering the time-stamp (empty list if none)

    om_df_overlap : DataFrame
        The O&M tickets that overlap the production data, as returned by
        ``overlapping_data``.

    """
    if om_cols_to_translate is None:
        om_cols_to_translate = ["asset", "prod_impact"]

    prod_df = prod_df.copy()
    om_df = om_df.copy()

    om_site = om_col_dict["siteid"]
    om_date_s = om_col_dict["datestart"]
    om_date_e = om_col_dict["dateend"]
    om_asset = om_col_dict["asset"]

    prod_site = prod_col_dict["siteid"]
    prod_ts = prod_col_dict["timestamp"]

    unique_assets = om_df[om_asset].unique()
    translation_keys = [om_col_dict[key] for key in om_cols_to_translate]

    # Prime the columns in prod_df; every cell needs its own list object
    for asset in unique_assets:
        for key in translation_keys:
            prod_df[str(asset) + "_" + key] = [[] for _ in range(len(prod_df))]

    prod_df["has_ticket"] = False

    # Only tickets that overlap the production record need to be processed
    _, om_df_overlap = overlapping_data(
        prod_df, om_df, prod_col_dict, om_col_dict)

    print(f'processing {len(om_df_overlap)} rows')
    for _, row in tqdm.tqdm(om_df_overlap.iterrows(), total=len(om_df_overlap)):
        # The site comparison must use the production frame's own site column
        # name, which may differ from the O&M one.
        mask = ((prod_df[prod_ts] >= row[om_date_s]) &
                (prod_df[prod_ts] < row[om_date_e]) &
                (prod_df[prod_site] == row[om_site]))
        hit_positions = np.flatnonzero(mask.to_numpy())
        if hit_positions.size == 0:
            continue
        for key in translation_keys:
            # The object array holds references to the list cells, so appending
            # mutates the cells stored in prod_df without copying whole rows.
            cells = prod_df[str(row[om_asset]) + "_" + key].to_numpy()
            for pos in hit_positions:
                cells[pos].append(row[key])
        prod_df.loc[mask, "has_ticket"] = True
    return prod_df, om_df_overlap




[docs]
def summarize_overlaps(prod_df, om_df, prod_col_dict, om_col_dict):
    """
    Provides general overview of the overlapping production and O&M data.

    Both inputs are first truncated to their common time range per site via
    ``overlapping_data``; the per-site statistics are computed on the result.

    Parameters
    ----------
    prod_df : DataFrame
        A data frame corresponding to the production
        data after having been processed by the perf_om_NA_qc function. This
        data frame needs the columns specified in prod_col_dict.
    om_df : DataFrame
        A data frame corresponding to the O&M data after
        having been processed by the perf_om_NA_qc function. This data frame
        needs the columns specified in om_col_dict.

    prod_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the production data

        - **siteid** (*string*), should be assigned to associated site-ID column name in prod_df
        - **timestamp** (*string*), should be assigned to associated time-stamp column name in
          prod_df

    om_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the O&M data

        - **siteid** (*string*), should be assigned to associated site-ID column name in om_df
        - **datestart** (*string*), should be assigned to associated O&M event start-date
          column name in om_df
        - **dateend** (*string*), should be assigned to associated O&M event end-date
          column name in om_df

    Returns
    -------
    prod_output : DataFrame
        Per-site statistics of the overlapping production data, indexed by site:

        - **Actual # Time Stamps**, number of non-null production time-stamps
        - **Max # Time Stamps**, number of production rows, including null time-stamps

    om_out : DataFrame
        Per-site statistics of the overlapping O&M data, indexed by site:

        - **Earliest Event Start**, earliest start date of all events of the site
        - **Latest Event End**, latest end date of all events of the site
        - **Total Events**, number of non-null event start dates of the site

    """
    om_site = om_col_dict["siteid"]
    om_date_s = om_col_dict["datestart"]
    om_date_e = om_col_dict["dateend"]

    prod_site = prod_col_dict["siteid"]
    prod_ts = prod_col_dict["timestamp"]

    # Restrict both frames to the period in which they overlap
    prod_overlap, om_overlap = overlapping_data(
        prod_df, om_df, prod_col_dict, om_col_dict)

    # O&M side: earliest start, latest end and number of events per site
    starts_by_site = om_overlap[[om_site, om_date_s]].groupby([om_site])
    ends_by_site = om_overlap[[om_site, om_date_e]].groupby([om_site])
    om_output = pd.concat(
        [starts_by_site.min(), ends_by_site.max(), starts_by_site.count()],
        axis=1,
    )
    om_output.columns = ["Earliest Event Start",
                         "Latest Event End", "Total Events"]

    # Production side: count() skips null time-stamps, size() counts every row
    stamps_by_site = prod_overlap[[prod_site, prod_ts]].groupby([prod_site])
    prod_output = pd.concat(
        [stamps_by_site.count(), stamps_by_site.size()], axis=1)
    prod_output.columns = ["Actual # Time Stamps", "Max # Time Stamps"]

    return prod_output, om_output




[docs]
def om_summary_stats(om_df, meta_df, om_col_dict, meta_col_dict):
    """
    Adds columns to OM dataframe capturing statistics (e.g., event duration, month of
    occurrence, and age).
    Latter is calculated by using corresponding site commissioning date within the
    metadata dataframe.

    Parameters
    ----------
    om_df : DataFrame
        A data frame corresponding to the O&M data after having been pre-processed
        by the QC and overlappingDFs functions. This data frame needs
        to have the columns specified in om_col_dict.

    meta_df : DataFrame
        A data frame corresponding to the metadata that contains columns specified in
        meta_col_dict. It must contain every site present in om_df.

    om_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the O&M data which consist of
        at least:

        - **siteid** (*string*), should be assigned to column name for associated site-ID
        - **datestart** (*string*), should be assigned to column name for associated O&M event
          start-date
        - **dateend** (*string*), should be assigned to column name for associated O&M event
          end-date
        - **eventdur** (*string*), should be assigned to column name desired for calculated
          event duration (calculated here, in hours)
        - **modatestart** (*string*), should be assigned to column name desired for month of
          event start (calculated here)
        - **agedatestart** (*string*), should be assigned to column name desired for calculated
          age of site when event started (calculated here, in days)

    meta_col_dict : dict
        A dictionary that contains the column names relevant for the meta-data

        - **siteid** (*string*), should be assigned to associated site-ID column name in meta_df
        - **COD** (*string*), should be assigned to column name corresponding to associated
          commissioning dates for all sites captured in om_df

    Returns
    -------
    om_df : DataFrame
        An updated copy of the input dataframe with the site-ID as first column and
        four new columns: event duration (total hours, including whole days),
        month of event start, the site's commissioning date (floored to the day,
        stored under the COD column name) and age of system in whole days at the
        time of event start. See om_col_dict for mapping of expected variables
        to user-defined variables.

    Raises
    ------
    KeyError
        If a site in om_df is not present in meta_df.
    """

    # assigning dictionary items to local variables for cleaner code
    om_site = om_col_dict["siteid"]
    om_date_s = om_col_dict["datestart"]
    om_date_e = om_col_dict["dateend"]
    om_rep_dur = om_col_dict["eventdur"]
    om_mo_st = om_col_dict["modatestart"]
    om_age_st = om_col_dict["agedatestart"]

    meta_site = meta_col_dict["siteid"]
    meta_cod = meta_col_dict["COD"]

    # set_index returns a new frame, so the caller's om_df is not modified
    om_df = om_df.set_index(om_site)

    # Duration of each event in hours. total_seconds() is required here:
    # .dt.seconds only holds the sub-day remainder and would silently drop
    # whole days from events lasting longer than 24 hours.
    om_df[om_rep_dur] = (
        om_df[om_date_e] - om_df[om_date_s]
    ).dt.total_seconds() / 3600.0

    # Month in which the event starts, as int so plots label the axis with ints
    om_df[om_mo_st] = om_df[om_date_s].dt.month.astype(int)

    # Commissioning date of each event's site (meta_df may hold more sites than
    # om_df). Looking up by the event index yields one COD per event, in order.
    cod_by_site = meta_df.set_index(meta_site)[meta_cod]
    cod_per_event = pd.to_datetime(cod_by_site.loc[om_df.index])
    # hour of commissioning is unimportant for this analysis
    om_df[meta_cod] = cod_per_event.dt.floor("D").array

    # Age of the system at event start, rounded to whole days and stored as a
    # number because timedeltas cannot be plotted on a y-axis
    om_df[om_age_st] = (
        om_df[om_date_s] - om_df[meta_cod]
    ).dt.round("D").dt.days

    return om_df.reset_index()




[docs]
def overlapping_data(prod_df, om_df, prod_col_dict, om_col_dict):
    """
    Finds the overlapping time-range between the production data and O&M data
    for any given site.  The outputs are a truncated version of the input data
    frames, that contains only data with overlapping dates between the two DFs.

    Only sites present in both data frames are retained.

    Parameters
    ----------
    prod_df : DataFrame
        A data frame corresponding to the production
        data after having been processed by the perf_om_NA_qc function. This
        data frame needs the columns specified in prod_col_dict. The
        time-stamp column should not have any NANs for proper operation
        of this function.
    om_df : DataFrame
        A data frame corresponding to the O&M data after
        having been processed by the perf_om_NA_qc function. This data frame needs
        the columns specified in om_col_dict. The time-stamp columns should not
        have any NANs for proper operation of this function.
    prod_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the production data

        - **siteid** (*string*), should be assigned to associated site-ID column name in prod_df
        - **timestamp** (*string*), should be assigned to associated time-stamp
          column name in prod_df

    om_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the O&M data

        - **siteid** (*string*), should be assigned to associated site-ID column name in om_df
        - **datestart** (*string*), should be assigned to associated O&M event start-date
          column name in om_df
        - **dateend** (*string*), should be assigned to associated O&M event end-date
          column name in om_df

    Returns
    -------
    prod_df : DataFrame
        Production data frame similar to the input data frame, but truncated
        to only contain data that overlaps in time with the O&M data. The
        site-ID column is the first column. If no site is shared between the
        inputs, an empty frame with the input's columns is returned.
    om_df : DataFrame
        O&M data frame similar to the input data frame, but truncated to only
        contain data that overlaps in time with the production data. The
        site-ID column is the first column. If no site is shared between the
        inputs, an empty frame with the input's columns is returned.

    """
    # assigning dictionary items to local variables for cleaner code
    om_site = om_col_dict["siteid"]
    om_date_s = om_col_dict["datestart"]
    om_date_e = om_col_dict["dateend"]

    prod_site = prod_col_dict["siteid"]
    prod_ts = prod_col_dict["timestamp"]

    # set_index returns new frames, so the originals are never modified
    om_by_site = om_df.set_index(om_site)
    prod_by_site = prod_df.set_index(prod_site)

    # Sections are collected and concatenated once at the end; concatenating
    # inside the loop would copy the accumulated data on every iteration.
    om_sections = []
    prod_sections = []

    for rid in prod_by_site.index.unique():
        if rid not in om_by_site.index:
            continue

        # A list indexer always yields a DataFrame, even when the site has a
        # single row, so one-row sites need no special handling.
        site_om = om_by_site.loc[[rid]]
        site_prod = prod_by_site.loc[[rid]]
        prod_times = site_prod[prod_ts]

        # OM Keepers:
        # Only OM tickets that have: (1) an end-date not earlier than the
        # earliest perf-date AND (2) a start-date not later than the last perf-date
        om_keep = ((site_om[om_date_e] >= prod_times.min()) &
                   (site_om[om_date_s] <= prod_times.max()))

        # Perf Keepers:
        # Only Perf data between the START of the earliest OM ticket and the
        # END of the latest OM ticket of the site. Time-stamps are widened to
        # the full day so production is shown for the whole day of an event.
        prod_keep = ((prod_times.dt.ceil("D") >= site_om[om_date_s].min()) &
                     (prod_times.dt.floor("D") <= site_om[om_date_e].max()))

        # Positional boolean arrays avoid label alignment on the repeated index
        om_sections.append(site_om[om_keep.to_numpy()])
        prod_sections.append(site_prod[prod_keep.to_numpy()])

    if om_sections:
        om_common = pd.concat(om_sections)
        prod_common = pd.concat(prod_sections)
    else:
        # No shared site: keep the schema so downstream column look-ups work
        om_common = om_by_site.iloc[0:0]
        prod_common = prod_by_site.iloc[0:0]

    # resetting index of DFs before return
    return prod_common.reset_index(), om_common.reset_index()




[docs]
def prod_anomalies(prod_df, prod_col_dict, minval=1.0, repval=np.nan, ffill=True):
    """
    For production data with cumulative energy entries, 1) addresses time-stamps where production
    unexpectedly drops to near zero and 2) replaces unexpected production drops with NANs or with
    user-specified value.  If unexpected production drops are replaced with NANs and if 'ffill'
    is set to 'True' in the input argument, a forward-fill method is used to replace the
    unexpected drops.

    Parameters
    ----------
    prod_df : DataFrame
        A data frame corresponding to production data were production is logged on
        a cumulative basis.
    prod_col_dict : dict of {str : str}
        A dictionary that contains the column names associated with the production data,
        which consist of at least:

        - **energyprod** (*string*), should be assigned to the associated cumulative
          production column name in prod_df

    minval : float
        Cutoff value for production data that determines where anomalies are defined. Any production
        values below minval will be addressed by this function. Default minval is 1.0
    repval : float
        Value that should replace the anomalies in a cumulative production data format.
        Default value is numpy's NAN.
    ffill : boolean
        Boolean flag that determines whether NANs in production column in prod_df
        should be filled using a forward-fill method. The fill is applied to every
        NAN in the column, including NANs already present in the input.

    Returns
    -------
    prod_df : DataFrame
        An updated copy of the input dataframe, but with production values below
        minval converted to user's preference.
    addressed : DataFrame
        A data frame showing rows from the input (with their original values) that
        were addressed by this function: the rows below minval and, when ffill is
        True, also the rows whose production value was already NAN.
    """

    prod_ener = prod_col_dict["energyprod"]

    prod_df = prod_df.copy()
    below_min = prod_df[prod_ener] < minval
    already_na = prod_df[prod_ener].isna()

    # Snapshot the affected rows before they are overwritten. With ffill the
    # pre-existing NANs get filled as well, so they count as addressed too.
    addressed = prod_df[(below_min | already_na) if ffill else below_min]

    prod_df.loc[below_min, prod_ener] = repval

    if ffill:
        # Assign the filled column back: an in-place fill on the Series
        # returned by an indexer acts on a temporary and may never reach
        # prod_df (it never does under pandas Copy-on-Write).
        prod_df[prod_ener] = prod_df[prod_ener].ffill()

    return prod_df, addressed




[docs]
def prod_quant(prod_df, prod_col_dict, comp_type, ecumu=True):
    """
    Compares performance of observed production data in relation to an expected baseline

    Parameters
    ----------
    prod_df : DataFrame
        A data frame corresponding to the production data after having been
        processed by the QC and overlappingDFs functions. This data
        frame needs at least the columns specified in prod_col_dict.
    prod_col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for the production data

        - **siteid** (*string*), should be assigned to associated site-ID column name in prod_df
        - **energyprod** (*string*), should be assigned to associated production
          column name in prod_df
        - **baseline** (*string*), should be assigned to associated expected baseline
          production column name in prod_df
        - **compared** (*string*), should be assigned to column name desired for
          quantified production data (calculated here)
        - **energy_pstep** (*string*), should be assigned to column name desired for
          energy per time-step (calculated here)

    comp_type : str
        Flag that specifies how the energy production should be compared to the
        expected baseline. A flag of 'diff' shows the subtracted difference between
        the two (baseline - observed). A flag of 'norm' shows the ratio of the two
        (observed/baseline). Any other value leaves the comparison column out.
    ecumu : bool
        Boolean flag that specifies whether the production (energy output)
        data is input as cumulative information ("True") or on a per time-step basis ("False").
        For cumulative data the per time-step energy is the difference between
        consecutive rows of the same site, so the first row of every site is NAN.

    Returns
    -------
    DataFrame
        A copy of the input with the site-ID as first column, an added column for
        the energy per time-step and an added column for the performance comparisons
    """

    prod_site = prod_col_dict["siteid"]
    prod_ener = prod_col_dict["energyprod"]
    baseline_ener = prod_col_dict["baseline"]
    quant_ener = prod_col_dict["compared"]
    pstep_ener = prod_col_dict["energy_pstep"]

    # set_index returns a new frame, so the original is not modified
    prod_df = prod_df.set_index(prod_site)

    if ecumu:
        # Difference between consecutive readings within each site. The
        # group-wise diff keeps the row order and also works for sites with a
        # single row, where a per-site label lookup would return a scalar.
        prod_df[pstep_ener] = (
            prod_df.groupby(level=0)[prod_ener].diff().to_numpy()
        )
    else:
        prod_df[pstep_ener] = prod_df[prod_ener]

    if comp_type == "diff":
        prod_df[quant_ener] = prod_df[baseline_ener] - prod_df[pstep_ener]
    elif comp_type == "norm":
        prod_df[quant_ener] = prod_df[pstep_ener] / prod_df[baseline_ener]

    return prod_df.reset_index()