Source code for pvops.timeseries.models.survival

from scipy import stats
from sksurv.nonparametric import kaplan_meier_estimator


[docs]
def fit_survival_function(df, col_dict, method):
    """
    Calculate the survival function for different groups in a DataFrame using specified methods.

    This function computes the survival function for each unique group in the input DataFrame 
    based on the specified method. It supports the Kaplan-Meier estimator and Weibull distribution 
    fitting for survival analysis. The Kaplan-Meier estimator is a non-parametric statistic,
    while the Weibull distribution is a parametric model.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame containing failure data with at least three columns specified in `col_dict`:
        one for grouping, one for the time to failure, and one indicating whether the failure was observed

    col_dict : dict of {str : str}
        A dictionary that contains the column names relevant for survival analysis

        - **group_by** (*string*), should be assigned to the column to group by
        - **time_to_fail** (*string*), should be assigned to the column containing the time until failure
        - **was_observed** (*string*), should be assigned to the column indicating whether the failure was observed

    method : str
        The method to use for calculating the survival function. Must be one of:

        - 'kaplan-meier': Uses the Kaplan-Meier estimator for survival analysis.
        - 'weibull': Fits a Weibull distribution to the data.

    Returns
    -------
    dict

        - If `method` is `'kaplan-meier'`, contains keys `'times'`, `'fail_prob'`, and `'conf_int'`, which denote the times, failure probabilities, and confidence intervals on the failure probabilities.
        - If `method` is `'weibull'`, contains keys `'shape'`, `'scale'`, and `'distribution'`, which denote the shape parameter, scale parameter, and corresponding fitted `stats.weibull_min` distribution.
    """

    implemented_methods = ['kaplan-meier', 'weibull']
    if method not in implemented_methods:
        raise ValueError(f'method argument must be one of {implemented_methods}, got {method}')

    df = df.reset_index()

    group_by = col_dict['group_by']
    time_to_fail = col_dict['time_to_fail']
    was_observed = col_dict['was_observed']

    results = {}

    unique_group_by = df[group_by].unique()
    for group in unique_group_by:
        group_df = df[df[group_by] == group]

        if method == 'kaplan-meier':
            km_result = kaplan_meier_estimator(group_df[was_observed], group_df[time_to_fail], conf_type='log-log')
            group_result = {'times': km_result[0], 'fail_prob': km_result[1], 'conf_int': km_result[2]}

        elif method == 'weibull':
            uncensored_times = group_df[group_df[was_observed]][time_to_fail]
            censored_times = group_df[~group_df[was_observed]][time_to_fail]
            data = stats.CensoredData(uncensored=uncensored_times, right=censored_times)
            shape, _, scale = stats.weibull_min.fit(data, floc=0)
            group_result = {'shape': shape, 'scale': scale, 'distribution': stats.weibull_min(c=shape, scale=scale)}

        results[group] = group_result

    return results