import pandas as pd
def sum_count(input_file, output_file, data_element_name, encoding=None):
    """Sum each data element's counts across all months.
    :param input_file: temporary CSV file created by the awk command, which stores data element counts for each month
    :param output_file: path of the output CSV file with the summed counts
    :param data_element_name: name of the data element column to group by
    :param encoding: encoding of the input CSV file
    :return: None; the sorted, summed counts are written to output_file
    """
    df = pd.read_csv(input_file, encoding=encoding)
    # total each data element's monthly counts, then sort by the summed count
    df_sum = df.groupby(data_element_name)['count'].sum().reset_index()
    df_sort = df_sum.sort_values(by='count', ascending=False).reset_index(drop=True)
    # index=False keeps the auto-generated row index out of the output CSV
    df_sort.to_csv(output_file, index=False)
    return None
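# Illustrative usage sketch (not part of the pipeline): the 'icd10_code' column
# and the in-memory CSV below are assumptions; real inputs are the awk-generated
# monthly count files on disk.
def _example_sum_count():
    import io
    monthly_counts = io.StringIO(
        'icd10_code,month,count\n'
        'I10,2020-01,5\n'
        'I10,2020-02,3\n'
        'E11,2020-01,2\n'
    )
    summed = io.StringIO()
    sum_count(monthly_counts, summed, 'icd10_code')
    print(summed.getvalue())  # I10 sums to 8, E11 to 2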
def clean_data_element_distribution(data_counts, data_element_name, data_distribution_df, outcome_data_distribution_df, grouper, cohort_count):
    '''Attach patient counts, outcome patient counts, and grouper labels to the top data element counts.
    :param data_counts: dataframe that stores the data element counts
    :param data_element_name: string of data element name
    :param data_distribution_df: dataframe that stores the data element names and corresponding patient counts
    :param outcome_data_distribution_df: dataframe that stores the data element names and corresponding counts of patients who had the outcome
    :param grouper: dataframe that stores the current grouper of each data element
    :param cohort_count: total count of patients in the cohort
    :return: the enriched data element dataframe
    '''
    top_100 = data_counts.merge(data_distribution_df, how='left', on=data_element_name)
    top_100 = top_100.merge(outcome_data_distribution_df, how='left', on=data_element_name)
    # fraction of the cohort that has each data element
    top_100['pat_count/cohort_pat_count'] = top_100['pat_count'] / cohort_count
    # fraction of those patients who had the outcome (ICU stay)
    top_100['icu_pat_count/pat_count'] = top_100['icu_pat_count'] / top_100['pat_count']
    top_100 = top_100.merge(grouper, how='left', on=data_element_name)
    top_100 = top_100.rename(columns={'grouper': 'current_grouper'})
    return top_100
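# Minimal sketch of the expected inputs (hypothetical values; the column names
# 'pat_count', 'icu_pat_count', and 'grouper' follow the code above).
def _example_clean_data_element_distribution():
    counts = pd.DataFrame({'icd10_code': ['I10', 'E11'], 'count': [80, 25]})
    pat_counts = pd.DataFrame({'icd10_code': ['I10', 'E11'], 'pat_count': [40, 10]})
    icu_pat_counts = pd.DataFrame({'icd10_code': ['I10', 'E11'], 'icu_pat_count': [12, 3]})
    groupers = pd.DataFrame({'icd10_code': ['I10', 'E11'], 'grouper': ['Hypertension', 'Diabetes']})
    enriched = clean_data_element_distribution(counts, 'icd10_code', pat_counts, icu_pat_counts, groupers, cohort_count=100)
    # e.g. I10: 40% of the cohort has it, and 30% of those patients had an ICU stay
    print(enriched[['icd10_code', 'pat_count/cohort_pat_count', 'icu_pat_count/pat_count', 'current_grouper']])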
def clean_data_element_prev(data_element_name, prev_df, total_enc_count):
    '''Turn the raw awk-generated data element / encounter pairs into a prevalence dataframe for the data elements.
    :param data_element_name: string of data element name
    :param prev_df: dataframe of data element and pat_enc_csn_id pairs
    :param total_enc_count: total number of encounters
    :return: dataframe with each data element's encounter count and prevalence, sorted by prevalence
    '''
    # count each data element at most once per encounter
    prev_df = prev_df.drop_duplicates()
    prev_df = prev_df.groupby(data_element_name)['pat_enc_csn_id'].count().to_frame(name='enc_count').reset_index()
    # prevalence = share of all encounters in which the data element appears
    prev_df['prev'] = prev_df['enc_count'] / total_enc_count
    prev_df = prev_df.sort_values(by='prev', ascending=False).reset_index(drop=True)
    return prev_df
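# Minimal sketch: rows are (data element, encounter) pairs as they would come
# out of the awk step; the values are hypothetical.
def _example_clean_data_element_prev():
    pairs = pd.DataFrame({'icd10_code': ['I10', 'I10', 'I10', 'E11'],
                          'pat_enc_csn_id': [1001, 1001, 1002, 1001]})
    prevalence = clean_data_element_prev('icd10_code', pairs, total_enc_count=50)
    # I10 appears in 2 of 50 encounters (prev 0.04), E11 in 1 of 50 (prev 0.02)
    print(prevalence)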
def gap_analysis_freq(data_element_name, icu_df, no_icu_df, icu_enc_num, no_icu_enc_num):
    '''Compare how often each data element appears in ICU vs. non-ICU encounters.
    icu_df/no_icu_df hold per-element counts; icu_enc_num/no_icu_enc_num are the
    encounter totals used to normalize those counts before taking the ratio.
    '''
    freq_df = icu_df.merge(no_icu_df, on=data_element_name, how='inner', suffixes=('_1', '_0'))
    # normalize each group's counts by its number of encounters before comparing
    freq_df['count_1_std'] = freq_df['count_1'] / icu_enc_num
    freq_df['count_0_std'] = freq_df['count_0'] / no_icu_enc_num
    freq_df['ratio'] = freq_df['count_1_std'] / freq_df['count_0_std']
    freq_df = freq_df.sort_values(by='ratio', ascending=False).reset_index(drop=True)
    return freq_df
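# Minimal sketch with hypothetical counts: I10 occurs 3x more often per ICU
# encounter than per non-ICU encounter, E11 only half as often.
def _example_gap_analysis_freq():
    icu = pd.DataFrame({'icd10_code': ['I10', 'E11'], 'count': [30, 5]})
    no_icu = pd.DataFrame({'icd10_code': ['I10', 'E11'], 'count': [40, 40]})
    gaps = gap_analysis_freq('icd10_code', icu, no_icu, icu_enc_num=100, no_icu_enc_num=400)
    print(gaps[['icd10_code', 'count_1_std', 'count_0_std', 'ratio']])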
def gap_analysis_prev(data_element_name, icu_df, no_icu_df):
    '''Compare each data element's prevalence between ICU and non-ICU encounters.
    icu_df/no_icu_df hold the per-element prevalence ('prev') for each group.
    '''
    prev_df = icu_df.merge(no_icu_df, on=data_element_name, how='inner', suffixes=('_1', '_0'))
    prev_df['ratio'] = prev_df['prev_1'] / prev_df['prev_0']
    prev_df = prev_df.sort_values(by='ratio', ascending=False).reset_index(drop=True)
    return prev_df
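# Minimal sketch reusing hypothetical prevalences such as those produced by
# clean_data_element_prev for the ICU and non-ICU encounter groups.
def _example_gap_analysis_prev():
    icu_prev = pd.DataFrame({'icd10_code': ['I10', 'E11'], 'prev': [0.30, 0.05]})
    no_icu_prev = pd.DataFrame({'icd10_code': ['I10', 'E11'], 'prev': [0.10, 0.10]})
    gaps = gap_analysis_prev('icd10_code', icu_prev, no_icu_prev)
    # I10 is 3x as prevalent in ICU encounters, E11 only half as prevalent
    print(gaps[['icd10_code', 'prev_1', 'prev_0', 'ratio']])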