"""
Data Processing Module for Diversity in Head and Neck Cancer Clinical Trials Analysis

This module contains functions for data loading, filtering, and processing
to support the analysis of diversity in head and neck cancer clinical trials.

The main analysis focuses on computing a diversity score for each study,
defined as:
    Diversity Score = (# non-white participants) / (# total participants) × 100
where total participants = # white participants + # non-white participants

Studies are then categorized into top 20% diverse and bottom 20% diverse groups
for comparative analysis of various features.
"""

import warnings
import pandas as pd
import numpy as np

# Suppress warnings for cleaner output.
# NOTE: this runs at import time and silences *all* warnings for the whole
# process, not only those raised by this module.
warnings.simplefilter(action='ignore')

# Columns summed into ``eligibility_score`` by extract_eligibility_features.
_ELIGIBILITY_COLUMNS = (
    'eligibility_age_restrict',
    'eligibility_stage_size',
    'eligibility_site',
    'eligibility_histological_type',
    'eligibility_performance_score',
    'eligibility_comorbidities',
    'eligibility_hx_of_tt',
    'eligibility_lab_values',
    'eligibility_pregnancy_or_contraception',
    'eligibility_misc',
)


def _diversity_thresholds(metric, top_percentile, bottom_percentile):
    """Return the (top, bottom) quantile cut-offs of a diversity-score column.

    NaN scores are ignored. ``np.quantile`` would return NaN as soon as a
    single score is NaN, which silently empties both groups.
    """
    scores = np.asarray(metric, dtype=float)
    return (
        np.nanquantile(scores, top_percentile),
        np.nanquantile(scores, bottom_percentile),
    )


def load_data(filepath):
    """
    Load clinical trial data from a CSV file.

    Args:
        filepath (str): Path to the CSV file

    Returns:
        pandas.DataFrame: Loaded data
    """
    return pd.read_csv(filepath)


def filter_usa_studies(df):
    """
    Keep only the studies whose 'Area Offered' is exactly "United States".

    Args:
        df (pandas.DataFrame): Original dataset with an 'Area Offered' column

    Returns:
        pandas.DataFrame: Rows of ``df`` offered only in the United States
    """
    is_usa_only = df['Area Offered'] == "United States"
    return df[is_usa_only]


def filter_race_information(df):
    """
    Keep only the studies that report both race counts.

    Args:
        df (pandas.DataFrame): Dataset with '# White' and '# Non White' columns

    Returns:
        pandas.DataFrame: Rows of ``df`` where neither race count is missing
    """
    has_white = df["# White"].notna()
    has_non_white = df["# Non White"].notna()
    return df[has_white & has_non_white]


def calculate_success_metric(df):
    """
    Calculate the success metric (percentage of non-white participants).

    The diversity score is calculated as:
        Diversity Score = (# non-white participants) / (# total participants) × 100
    where total participants = # white participants + # non-white participants

    A study whose two counts are both zero has no defined score and gets NaN
    (0 / 0). Downstream categorization ignores such rows when computing its
    thresholds.

    Args:
        df (pandas.DataFrame): Dataset with '# White' and '# Non White' columns

    Returns:
        pandas.DataFrame: Copy of ``df`` with an added success_metric column
    """
    df_with_metric = df.copy()
    non_white = df_with_metric["# Non White"]
    total = df_with_metric["# White"] + non_white
    df_with_metric["success_metric"] = non_white / total * 100.0
    return df_with_metric


def categorize_by_diversity(df, top_percentile=0.8, bottom_percentile=0.2):
    """
    Categorize studies as top, bottom, or neither based on the success metric.

    Thresholds are quantiles of the non-NaN success_metric values. A study is
    labelled "Top20" when its score is >= the top threshold, otherwise
    "Bottom20" when its score is <= the bottom threshold, otherwise "Neither".
    Studies with a NaN score are always "Neither". The label strings are fixed
    and do not change with the percentile arguments.

    Args:
        df (pandas.DataFrame): Dataset with success_metric column
        top_percentile (float): Quantile threshold for top category (default: 0.8)
        bottom_percentile (float): Quantile threshold for bottom category (default: 0.2)

    Returns:
        pandas.DataFrame: Copy of ``df`` with an added success_category column
    """
    top_threshold, bottom_threshold = _diversity_thresholds(
        df["success_metric"], top_percentile, bottom_percentile
    )

    def get_category_label(score):
        # The top check comes first, so a score satisfying both bounds
        # (e.g. when the thresholds coincide) is reported as "Top20".
        if score >= top_threshold:
            return "Top20"
        if score <= bottom_threshold:
            return "Bottom20"
        return "Neither"

    df_categorized = df.copy()
    df_categorized["success_category"] = (
        df_categorized["success_metric"].apply(get_category_label)
    )
    return df_categorized


def get_top_bottom_datasets(df, top_percentile=0.8, bottom_percentile=0.2):
    """
    Extract top and bottom diversity datasets based on success_metric thresholds.

    Uses the same threshold rule as categorize_by_diversity: quantiles of the
    non-NaN success_metric values, with inclusive comparisons. Rows with a NaN
    score fall in neither dataset.

    Args:
        df (pandas.DataFrame): Dataset with success_metric column
        top_percentile (float): Quantile threshold for top dataset (default: 0.8)
        bottom_percentile (float): Quantile threshold for bottom dataset (default: 0.2)

    Returns:
        tuple: (top_dataset, bottom_dataset)
    """
    top_threshold, bottom_threshold = _diversity_thresholds(
        df["success_metric"], top_percentile, bottom_percentile
    )

    df_top = df[df["success_metric"] >= top_threshold]
    df_bottom = df[df["success_metric"] <= bottom_threshold]

    return df_top, df_bottom


def extract_eligibility_features(df):
    """
    Add an overall eligibility score when the per-criterion flags are present.

    This is a placeholder: it does not parse eligibility criteria text. It
    only sums already-existing flag columns into 'eligibility_score'.

    The score is computed only if 'eligibility_score' is not already a column
    and all of the following columns exist:
        - eligibility_age_restrict
        - eligibility_stage_size
        - eligibility_site
        - eligibility_histological_type
        - eligibility_performance_score
        - eligibility_comorbidities
        - eligibility_hx_of_tt
        - eligibility_lab_values
        - eligibility_pregnancy_or_contraception
        - eligibility_misc
    Otherwise the data is returned unchanged (as a copy).

    Args:
        df (pandas.DataFrame): Dataset to process

    Returns:
        pandas.DataFrame: Copy of ``df``, with eligibility_score added when it
        could be computed
    """
    df_with_features = df.copy()
    columns = df_with_features.columns

    # Never overwrite a score that was supplied with the data.
    if 'eligibility_score' in columns:
        return df_with_features

    # A partial set of flags would give a misleading total, so require all.
    if any(col not in columns for col in _ELIGIBILITY_COLUMNS):
        return df_with_features

    flag_columns = list(_ELIGIBILITY_COLUMNS)
    df_with_features['eligibility_score'] = df_with_features[flag_columns].sum(axis=1)
    return df_with_features


def preprocess_data(input_filepath):
    """
    Complete preprocessing pipeline for diversity analysis.

    Steps, in order: load the CSV, keep USA-only studies, keep studies with
    race counts, compute the success metric, add the eligibility score,
    categorize studies, and split out the top and bottom groups.

    Args:
        input_filepath (str): Path to input CSV file

    Returns:
        tuple: (processed_df, top_df, bottom_df)
    """
    df = load_data(input_filepath)
    df_usa = filter_usa_studies(df)
    df_final = filter_race_information(df_usa)

    df_final = calculate_success_metric(df_final)
    df_final = extract_eligibility_features(df_final)
    df_final = categorize_by_diversity(df_final)

    df_top, df_bottom = get_top_bottom_datasets(df_final)

    return df_final, df_top, df_bottom