Source code for pvops.text2time.preprocess

"""
These functions focus on pre-processing user O&M and production data to
create visualizations of the merged data
"""
from datetime import datetime
import pandas as pd



[docs]
def data_site_na(pom_df, df_col_dict):
    """
    Drops rows where site-ID is missing (NAN) within either production 
    or O&M data.

    Parameters
    ----------
    pom_df : DataFrame
        A data frame corresponding to either the production or O&M 
        data.
    df_col_dict : dict of {str : str}
        A dictionary that contains the column names associated with 
        the 
        input `pom_df` and contains at least:

        - **siteid** (*string*), should be assigned to column name 
          for user's site-ID

    Returns
    -------
    pom_df : DataFrame
        An updated version of the input data frame, where rows with 
        site-IDs of NAN are dropped.
    addressed : DataFrame
        A data frame showing rows from the input that were removed 
        by this function.
    """

    df_site = df_col_dict["siteid"]

    pom_df = pom_df.copy()

    namask = pom_df.loc[:, df_site].isna()
    addressed = pom_df.loc[namask]

    pom_df.dropna(subset=[df_site], inplace=True)

    return pom_df, addressed




[docs]
def om_date_convert(om_df, om_col_dict, toffset=0.0):
    """
    Converts dates from string format to date time object in O&M
    dataframe.

    Parameters
    ----------
    om_df : DataFrame
        A data frame corresponding to O&M data.
    om_col_dict : dict of {str : str}
        A dictionary that contains the column names associated with
        the O&M data, which consist of at least:

          - **datestart** (*string*), should be assigned to column
            name for O&M event start date in om_df
          - **dateend** (*string*), should be assigned to column name
            for O&M event end date  in om_df

    toffset : float
       Value that specifies how many hours the O&M data should be
       shifted by in case time-stamps in production data and O&M data
       don't align as they should

    Returns
    -------
    DataFrame
        An updated version of the input dataframe, but with 
        time-stamps converted to localized (time-zone agnostic) 
        date-time objects.
    """

    om_df = om_df.copy()

    om_date_s = om_col_dict["datestart"]
    om_date_e = om_col_dict["dateend"]

    # Converting date-data from string data to DateTime objects
    om_df[om_date_s] = pd.to_datetime(
        om_df[om_date_s]) + pd.Timedelta(hours=toffset)
    om_df[om_date_e] = pd.to_datetime(
        om_df[om_date_e]) + pd.Timedelta(hours=toffset)

    # localizing timestamp
    om_df[om_date_s] = om_df[om_date_s].dt.tz_localize(None)
    om_df[om_date_e] = om_df[om_date_e].dt.tz_localize(None)

    return om_df




[docs]
def om_datelogic_check(om_df, om_col_dict, om_dflag="swap"):
    """
    Addresses issues with O&M dates where the start
    of an event is listed as occurring after its end.  These row are
    either dropped or the dates are swapped, depending on the user's
    preference.

    Parameters
    ----------
    om_df : DataFrame
        A data frame corresponding to O&M data.
    om_col_dict : dict of {str : str}
        A dictionary that contains the column names associated with
        the O&M data, which consist of at least:

          - **datestart** (*string*), should be assigned to column
            name for associated O&M event start date in om_df
          - **dateend** (*string*), should be assigned to column name
            for associated O&M event end date in om_df

    om_dflag : str
       A flag that specifies how to address rows where the start of
       an event occurs after its conclusion. A flag of 'drop' will
       drop those rows, and a flag of 'swap' swap the two dates for
       that row.

    Returns
    -------
    om_df : DataFrame
        An updated version of the input dataframe, but with O&M data
        quality issues addressed to ensure the start of an event
        precedes the event end date.
    addressed : DataFrame
        A data frame showing rows from the input that were addressed
        by this function.
    """

    # assigning dictionary items to local variables for cleaner code
    om_date_s = om_col_dict["datestart"]
    om_date_e = om_col_dict["dateend"]

    om_df = om_df.copy()

    # addressing cases where Date_EventEnd ocurrs before Date_EventStart
    mask = om_df.loc[:, om_date_e] < om_df.loc[:, om_date_s]
    addressed = om_df.loc[mask]
    # swap dates for rows where End < Start
    if any(mask) and om_dflag == "swap":
        om_df.loc[mask, [om_date_s, om_date_e]] = om_df.loc[
            mask, [om_date_e, om_date_s]
        ].values[0]
    # drop rows where End < Start
    elif any(mask) and om_dflag == "drop":
        om_df = om_df[~mask]

    return om_df, addressed




[docs]
def om_nadate_process(om_df, om_col_dict, om_dendflag="drop"):
    """
    Addresses issues with O&M dataframe where dates are missing
    (NAN). Two operations are performed : 1) rows are dropped 
    where start of an event is missing and (2) rows where the 
    conclusion of an event is NAN can either be dropped or marked 
    with the time at which program is run, depending on the user's
    preference.

    Parameters
    ----------
    om_df : DataFrame
        A data frame corresponding to O&M data.

    om_col_dict : dict of {str : str}
        A dictionary that contains the column names associated with
        the O&M data, which consist of at least:

          - **datestart** (*string*), should be assigned to column
            name for user's O&M event start-date
          - **dateend** (*string*), should be assigned to column name
            for user's O&M event end-date

    om_dendflag : str
       A flag that specifies how to address rows where the conclusion
       of an event is missing (NAN). A flag of 'drop' will drop those
       rows, and a flag of 'today' will replace the NAN with the time
       at which the program is run.  Any other value will leave the
       rows untouched.

    Returns
    -------
    om_df : DataFrame
        An updated version of the input dataframe, but with no
        missing time-stamps in the O&M data.

    addressed : DataFrame
        A data frame showing rows from the input that were addressed
        by this function.
    """

    om_df = om_df.copy()

    # assigning dictionary items to local variables for cleaner code
    om_date_s = om_col_dict["datestart"]
    om_date_e = om_col_dict["dateend"]

    # Dropping rows where om_date_s has values of NA in om_df
    mask1 = om_df.loc[:, om_date_s].isna()
    om_df.dropna(
        subset=[om_date_s], inplace=True
    )  # drops rows with om_date_e of NA in om_df

    # Addressing rows with 'om_date_e' values of NA in om_df
    mask2 = om_df.loc[:, om_date_e].isna()
    mask = mask1 | mask2
    addressed = om_df.loc[mask]

    if om_dendflag == "drop":
        om_df.dropna(
            subset=[om_date_e], inplace=True
        )  # drops rows with om_date_e of NA in om_df
    elif om_dendflag == "today":
        om_df[om_date_e].fillna(
            pd.to_datetime(str(datetime.now())[:20]), inplace=True
        )  # replacing NANs with today's date
    else:
        raise SyntaxError('Undefined om_dendflag')

    return om_df, addressed




[docs]
def prod_date_convert(prod_df, prod_col_dict, toffset=0.0):
    """Converts dates from string format to datetime format in
    production dataframe.


    Parameters
    ----------
    prod_df : DataFrame
        A data frame corresponding to production data.

    prod_col_dict : dict of {str : str}
        A dictionary that contains the column names associated with 
        the production data, which consist of at least:

          - **timestamp** (*string*), should be assigned to user's
            time-stamp column name

    toffset : float
       Value that specifies how many hours the production data 
       should be shifted by in case time-stamps in production data
       and O&M data don't align as they should.

    Returns
    -------
    DataFrame
        An updated version of the input dataframe, but with
        time-stamps converted to localized (time-zone agnostic)
        date-time objects.
    """

    # creating local dataframes to not modify originals
    prod_df = prod_df.copy()

    prod_ts = prod_col_dict["timestamp"]

    # Converting date-data from string data to DateTime objects
    prod_df[prod_ts] = pd.to_datetime(
        prod_df[prod_ts]) + pd.Timedelta(hours=toffset)

    # localizing timestamp
    prod_df[prod_ts] = prod_df[prod_ts].dt.tz_localize(None)

    return prod_df




[docs]
def prod_nadate_process(prod_df, prod_col_dict, pnadrop=False):
    """
    Processes rows of production data frame for missing time-stamp
    info (NAN).


    Parameters
    ----------
    prod_df : DataFrame
        A data frame corresponding to production data.

    prod_df_col_dict : dict of {str : str}
        A dictionary that contains the column names associated with
        the production data, which consist of at least:

          - **timestamp** (*string*), should be assigned to
            associated time-stamp column name in prod_df

    pnadrop : bool
        Boolean flag that determines what to do with rows where
        time-stamp is missing. A value of `True` will drop these
        rows.  Leaving the default value of `False` will identify
        rows with missing time-stamps for the user, but the function
        will output the same input data frame with no modifications.

    Returns
    -------
    prod_df : DataFrame
        The output data frame.  If pflag = 'drop', an updated version
        of the input data frame is output, but rows with missing
        time-stamps are removed. If default value is maintained, the
        input data frame is output with no modifications.

    addressed : DataFrame
        A data frame showing rows from the input that were addressed
        or identified by this function.
    """

    prod_df = prod_df.copy()

    # creating local dataframes to not modify originals
    prod_df = prod_df.copy()

    prod_ts = prod_col_dict["timestamp"]

    # Dropping rows
    mask = prod_df.loc[:, prod_ts].isna()
    addressed = prod_df[mask]
    if pnadrop:
        prod_df.dropna(subset=[prod_ts], inplace=True)

    return prod_df, addressed