Head-and-Neck-Trials / Git / [3cdecf] /src/data

Models:
joseph-gordon/
Head-and-Neck-Trials
Downloads: 1
[3cdecf]: / src / data_processing.py
History
Download this file
210 lines (163 with data), 7.0 kB

"""
Data Processing Module for Diversity in Head and Neck Cancer Clinical Trials Analysis

This module contains functions for data loading, filtering, and processing
to support the analysis of diversity in head and neck cancer clinical trials.

The main analysis focuses on computing a diversity score for each study,
defined as:
    Diversity Score = (# non-white participants) / (# total participants) × 100
where total participants = # white participants + # non-white participants

Studies are then categorized into top 20% diverse and bottom 20% diverse groups
for comparative analysis of various features.
"""

import warnings
import pandas as pd
import numpy as np

# Suppress warnings for cleaner output.
# NOTE(review): this executes at import time and installs an "ignore" filter
# with no category restriction, so importing this module silences warnings
# for the whole process, not only for the functions defined here.
warnings.simplefilter(action='ignore')

def load_data(filepath):
    """
    Read the clinical trial dataset from a CSV source.

    Args:
        filepath (str): Location of the CSV file; handed unchanged to
            ``pandas.read_csv`` with all default parsing options.

    Returns:
        pandas.DataFrame: The parsed table, one row per CSV record.
    """
    trials = pd.read_csv(filepath)
    return trials

def filter_usa_studies(df):
    """
    Keep only the studies whose 'Area Offered' is exactly "United States".

    Rows with any other value (including missing values) are dropped; the
    original index labels of the surviving rows are preserved.

    Args:
        df (pandas.DataFrame): Dataset containing an 'Area Offered' column

    Returns:
        pandas.DataFrame: The rows of ``df`` offered in the United States
    """
    is_usa = df['Area Offered'].eq("United States")
    return df.loc[is_usa]

def filter_race_information(df):
    """
    Keep only the studies that report both race counts.

    A study is retained when neither '# White' nor '# Non White' is missing.

    Args:
        df (pandas.DataFrame): Dataset with '# White' and '# Non White' columns

    Returns:
        pandas.DataFrame: The rows of ``df`` where both counts are present
    """
    has_white = df["# White"].notna()
    has_non_white = df["# Non White"].notna()
    return df.loc[has_white & has_non_white]

def calculate_success_metric(df):
    """
    Add the diversity score of each study as a 'success_metric' column.

    For every row the score is
        (# Non White) / (# White + # Non White) × 100
    i.e. the percentage of non-white participants among all participants
    with a recorded race.

    Args:
        df (pandas.DataFrame): Dataset with '# White' and '# Non White' columns

    Returns:
        pandas.DataFrame: A copy of ``df`` with the success_metric column
        added; the input frame is left unmodified
    """
    non_white = df["# Non White"]
    enrolled = df["# White"] + non_white
    # Divide first, then scale, so the result is a percentage in float form.
    return df.assign(success_metric=non_white / enrolled * 100.0)

def categorize_by_diversity(df, top_percentile=0.8, bottom_percentile=0.2):
    """
    Categorize studies as top, bottom, or middle based on the success metric.

    Thresholds are the ``top_percentile`` and ``bottom_percentile`` quantiles
    of the non-missing success_metric values. Missing (NaN) scores, such as
    those of a study with zero recorded participants, are excluded from the
    threshold computation so that one undefined score cannot turn both
    thresholds into NaN and collapse every study into "Neither".

    Labels:
        - "Top20":    success_metric >= top threshold
        - "Bottom20": success_metric <= bottom threshold (and not Top20)
        - "Neither":  everything else, including rows with a NaN score

    When both thresholds coincide, "Top20" takes precedence.

    Args:
        df (pandas.DataFrame): Dataset with success_metric column
        top_percentile (float): Quantile in [0, 1] used as the top threshold
            (default: 0.8)
        bottom_percentile (float): Quantile in [0, 1] used as the bottom
            threshold (default: 0.2)

    Returns:
        pandas.DataFrame: Copy of ``df`` with added success_category column
    """
    df_categorized = df.copy()
    scores = df_categorized["success_metric"].to_numpy(dtype=float, na_value=np.nan)
    valid_scores = scores[~np.isnan(scores)]

    if valid_scores.size:
        top_threshold = np.quantile(valid_scores, top_percentile)
        bottom_threshold = np.quantile(valid_scores, bottom_percentile)
    else:
        # No usable score: NaN thresholds make every comparison False,
        # so all rows fall through to "Neither".
        top_threshold = bottom_threshold = np.nan

    # The outer condition is checked first, so ties resolve to "Top20".
    labels = np.where(
        scores >= top_threshold,
        "Top20",
        np.where(scores <= bottom_threshold, "Bottom20", "Neither"),
    )
    df_categorized["success_category"] = labels.astype(object)
    return df_categorized

def get_top_bottom_datasets(df, top_percentile=0.8, bottom_percentile=0.2):
    """
    Extract top and bottom diversity datasets based on success_metric thresholds.

    Thresholds are the ``top_percentile`` and ``bottom_percentile`` quantiles
    of the non-missing success_metric values. Rows with a NaN score are
    ignored when computing the thresholds and never appear in either output;
    without this, a single NaN would make both thresholds NaN and both
    returned datasets empty.

    A row can appear in both outputs when the two thresholds coincide.

    Args:
        df (pandas.DataFrame): Dataset with success_metric column
        top_percentile (float): Quantile in [0, 1] at or above which a study
            belongs to the top dataset (default: 0.8)
        bottom_percentile (float): Quantile in [0, 1] at or below which a
            study belongs to the bottom dataset (default: 0.2)

    Returns:
        tuple: (top_dataset, bottom_dataset)
    """
    scores = df["success_metric"]
    valid_scores = scores.dropna()

    if valid_scores.empty:
        # Nothing to rank: both groups are empty but keep the column layout.
        return df.iloc[0:0], df.iloc[0:0]

    top_threshold = np.quantile(valid_scores, top_percentile)
    bottom_threshold = np.quantile(valid_scores, bottom_percentile)

    df_top = df[scores >= top_threshold]
    df_bottom = df[scores <= bottom_threshold]

    return df_top, df_bottom

def extract_eligibility_features(df):
    """
    Add an overall 'eligibility_score' column when it can be derived.

    This is a placeholder: it does not parse eligibility criteria text.
    It only aggregates eligibility flag columns that are already present in
    the data. The score is the row-wise sum of these ten columns:

    - eligibility_age_restrict: 0 if only 18+, 1 for other restrictions
      (e.g., 18-75)
    - eligibility_stage_size: Restrictions on cancer stage and tumor size
    - eligibility_site: Restrictions on cancer site
    - eligibility_histological_type: Limited to specific histology
      (e.g., SCC only)
    - eligibility_performance_score: ECOG or other performance score
      requirements
    - eligibility_comorbidities: Restrictions on comorbidities
    - eligibility_hx_of_tt: Restrictions on treatment history
    - eligibility_lab_values: Restrictions on laboratory values
    - eligibility_pregnancy_or_contraception: Pregnancy or contraception
      requirements
    - eligibility_misc: Other restrictions (smoking, ethnicity, etc.)

    The score is only computed when 'eligibility_score' is absent and all
    ten flag columns exist; otherwise the data is returned as an unchanged
    copy.

    Args:
        df (pandas.DataFrame): Dataset to process

    Returns:
        pandas.DataFrame: Copy of ``df``, with eligibility_score added when
        it could be computed
    """
    flag_columns = (
        'eligibility_age_restrict',
        'eligibility_stage_size',
        'eligibility_site',
        'eligibility_histological_type',
        'eligibility_performance_score',
        'eligibility_comorbidities',
        'eligibility_hx_of_tt',
        'eligibility_lab_values',
        'eligibility_pregnancy_or_contraception',
        'eligibility_misc',
    )

    result = df.copy()
    available = result.columns

    # An existing score is kept as-is rather than recomputed.
    if 'eligibility_score' in available:
        return result

    # Without the full set of flags the sum would be misleading, so skip it.
    if any(name not in available for name in flag_columns):
        return result

    result['eligibility_score'] = result[list(flag_columns)].sum(axis=1)
    return result

def preprocess_data(input_filepath):
    """
    Run the full preprocessing pipeline for the diversity analysis.

    The CSV is loaded and then passed through each stage in order:
    USA-only filter, race-information filter, success metric calculation,
    eligibility feature extraction, and diversity categorization. The top
    and bottom diversity subsets are then taken from the processed data.

    Args:
        input_filepath (str): Path to input CSV file

    Returns:
        tuple: (processed_df, top_df, bottom_df)
    """
    # Order matters: each stage consumes columns produced by the previous ones.
    stages = (
        filter_usa_studies,
        filter_race_information,
        calculate_success_metric,
        extract_eligibility_features,
        categorize_by_diversity,
    )

    processed = load_data(input_filepath)
    for stage in stages:
        processed = stage(processed)

    top_studies, bottom_studies = get_top_bottom_datasets(processed)
    return processed, top_studies, bottom_studies