Diff of /src/data_processing.py [000000] .. [3cdecf]

Switch to side-by-side view

--- a
+++ b/src/data_processing.py
@@ -0,0 +1,210 @@
+"""
+Data Processing Module for Diversity in Head and Neck Cancer Clinical Trials Analysis
+
+This module contains functions for data loading, filtering, and processing
+to support the analysis of diversity in head and neck cancer clinical trials.
+
+The main analysis focuses on computing a diversity score for each study,
+defined as:
+    Diversity Score = (# non-white participants) / (# total participants) × 100
+where total participants = # white participants + # non-white participants
+
+Studies are then categorized into top 20% diverse and bottom 20% diverse groups
+for comparative analysis of various features.
+"""
+
+import warnings
+import pandas as pd
+import numpy as np
+
+# Silence every warning category process-wide as soon as this module is imported (cleaner output)
+warnings.simplefilter(action='ignore')
+
+def load_data(filepath):
+    """
+    Read the clinical trial CSV into a DataFrame using pandas' default parsing.
+
+    Args:
+        filepath (str): Location of the CSV file, handed directly to pd.read_csv
+
+    Returns:
+        pandas.DataFrame: Parsed contents of the file
+    """
+    return pd.read_csv(filepath_or_buffer=filepath)
+
+def filter_usa_studies(df):
+    """
+    Keep only the rows whose 'Area Offered' value is exactly "United States".
+
+    Args:
+        df (pandas.DataFrame): Dataset with an 'Area Offered' column
+
+    Returns:
+        pandas.DataFrame: Matching rows, with their original index labels
+    """
+    return df.loc[df['Area Offered'].eq("United States")]
+
+def filter_race_information(df):
+    """
+    Drop studies that are missing either of the two race counts.
+
+    Args:
+        df (pandas.DataFrame): Dataset with "# White" and "# Non White" columns
+
+    Returns:
+        pandas.DataFrame: Rows where both race counts are non-missing
+    """
+    return df[df["# White"].notna() & df["# Non White"].notna()]
+
+def calculate_success_metric(df):
+    """
+    Add each study's diversity score (percentage of non-white participants).
+
+    The score is evaluated in this order:
+        success_metric = (# Non White / (# White + # Non White)) * 100.0
+    With numeric columns, a study whose two counts sum to zero gets NaN.
+
+    Args:
+        df (pandas.DataFrame): Dataset with "# White" and "# Non White" columns
+
+    Returns:
+        pandas.DataFrame: Copy of df with a success_metric column; df is not modified
+    """
+    non_white = df["# Non White"]
+    participants = df["# White"] + non_white
+    non_white_share = non_white / participants
+
+    # Write into a copy so the caller's frame never gains the new column
+    scored = df.copy()
+    scored["success_metric"] = non_white_share * 100.0
+    return scored
+
+def categorize_by_diversity(df, top_percentile=0.8, bottom_percentile=0.2):
+    """
+    Label each study "Top20", "Bottom20" or "Neither" from its success_metric.
+
+    Thresholds use Series.quantile, which skips NaN scores; one undefined score
+    therefore no longer makes both thresholds NaN and every label "Neither".
+
+    Args:
+        df (pandas.DataFrame): Dataset with success_metric column
+        top_percentile (float): Quantile at or above which a study is "Top20" (default: 0.8)
+        bottom_percentile (float): Quantile at or below which a study is "Bottom20" (default: 0.2)
+
+    Returns:
+        pandas.DataFrame: Copy of df with added success_category column
+    """
+    scores = df["success_metric"]
+    top_threshold, bottom_threshold = scores.quantile([top_percentile, bottom_percentile])
+    # np.select keeps the first matching label, so the "top" test still wins ties;
+    # NaN scores fail both comparisons and stay "Neither".
+    labels = np.select(
+        [scores >= top_threshold, scores <= bottom_threshold],
+        ["Top20", "Bottom20"], default="Neither",
+    )
+    df_categorized = df.copy()
+    df_categorized["success_category"] = labels
+    return df_categorized
+
+def get_top_bottom_datasets(df, top_percentile=0.8, bottom_percentile=0.2):
+    """
+    Split studies into the most and least diverse subsets by success_metric.
+
+    Thresholds are pandas quantiles, which skip NaN scores instead of returning NaN.
+
+    Args:
+        df (pandas.DataFrame): Dataset with success_metric column
+        top_percentile (float): Quantile at or above which a study is "top" (default: 0.8)
+        bottom_percentile (float): Quantile at or below which a study is "bottom" (default: 0.2)
+
+    Returns:
+        tuple: (top_dataset, bottom_dataset)
+    """
+    scores = df["success_metric"]
+    top_cut, bottom_cut = scores.quantile([top_percentile, bottom_percentile])
+    return df[scores >= top_cut], df[scores <= bottom_cut]
+
+def extract_eligibility_features(df):
+    """
+    Add an overall eligibility_score column when the per-criterion flags exist.
+
+    This is a placeholder: it does not parse eligibility criteria text. It only
+    sums eligibility flag columns that are already present in df.
+
+    Columns summed into eligibility_score (meanings as given in the module notes):
+    - eligibility_age_restrict: 0 if only 18+, 1 for other restrictions (e.g., 18-75)
+    - eligibility_stage_size: Restrictions on cancer stage and tumor size
+    - eligibility_site: Restrictions on cancer site
+    - eligibility_histological_type: Limited to specific histology (e.g., SCC only)
+    - eligibility_performance_score: ECOG or other performance score requirements
+    - eligibility_comorbidities: Restrictions on comorbidities
+    - eligibility_hx_of_tt: Restrictions on treatment history
+    - eligibility_lab_values: Restrictions on laboratory values
+    - eligibility_pregnancy_or_contraception: Pregnancy or contraception requirements
+    - eligibility_misc: Other restrictions (smoking, ethnicity, etc.)
+
+    Args:
+        df (pandas.DataFrame): Dataset to process
+
+    Returns:
+        pandas.DataFrame: Copy of df; eligibility_score is added only if it was
+            absent and all ten flag columns are present, otherwise nothing changes
+    """
+    flag_columns = [
+        'eligibility_age_restrict',
+        'eligibility_stage_size',
+        'eligibility_site',
+        'eligibility_histological_type',
+        'eligibility_performance_score',
+        'eligibility_comorbidities',
+        'eligibility_hx_of_tt',
+        'eligibility_lab_values',
+        'eligibility_pregnancy_or_contraception',
+        'eligibility_misc',
+    ]
+
+    features = df.copy()
+    # An existing score is kept as-is rather than recomputed
+    if 'eligibility_score' in features.columns:
+        return features
+
+    # The score is only computed when every one of the ten flags is available
+    missing_flags = [name for name in flag_columns if name not in features.columns]
+    if missing_flags:
+        return features
+
+    features['eligibility_score'] = features[flag_columns].sum(axis=1)
+    return features
+
+def preprocess_data(input_filepath):
+    """
+    Run the full preprocessing pipeline for the diversity analysis.
+
+    Stages, in order: load the CSV, keep United States studies, keep studies
+    with both race counts, add success_metric, add eligibility_score (when
+    possible), add success_category, then split off the top/bottom groups.
+
+    Args:
+        input_filepath (str): Path to input CSV file
+
+    Returns:
+        tuple: (processed_df, top_df, bottom_df), where top_df and bottom_df
+            are the subsets returned by get_top_bottom_datasets(processed_df)
+    """
+    # Each stage receives the previous stage's DataFrame and returns a DataFrame
+    stages = (
+        filter_usa_studies,
+        filter_race_information,
+        calculate_success_metric,
+        extract_eligibility_features,
+        categorize_by_diversity,
+    )
+
+    processed = load_data(input_filepath)
+    for stage in stages:
+        processed = stage(processed)
+
+    # Top/bottom groups are cut from the fully processed frame
+    top_group, bottom_group = get_top_bottom_datasets(processed)
+
+    return processed, top_group, bottom_group
\ No newline at end of file