Head-and-Neck-Trials / Git / [3cdecf] /src/analysis.py

Models:
joseph-gordon/
Head-and-Neck-Trials
Downloads: 1
[3cdecf]: / src / analysis.py
History
Download this file
314 lines (267 with data), 12.7 kB

"""
Analysis Module for Diversity in Head and Neck Cancer Clinical Trials

This module contains functions for analyzing the factors that contribute to
diversity in head and neck cancer clinical trials, with a focus on eligibility
criteria and other study characteristics.
"""

import pandas as pd
import numpy as np
from scipy import stats

def compare_eligibility_scores(df_top, df_bottom):
    """
    Compare eligibility scores between top and bottom diverse studies.

    The eligibility score is the sum of binary flags for all eligibility criteria,
    with higher scores indicating more restrictive criteria.

    Missing scores (NaN) are dropped from each group before anything is
    computed, so the statistical test and every descriptive statistic are
    based on the same observations.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies; must contain
            an 'eligibility_score' column
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies; must
            contain an 'eligibility_score' column

    Returns:
        dict: Dictionary with keys 'top_stats' and 'bottom_stats' (each a dict
            with 'mean', 'median', 'std', 'min', 'max'), 'u_statistic',
            'p_value' and 'significant' (p_value < 0.05)

    Raises:
        KeyError: If either dataset has no 'eligibility_score' column
    """
    # Drop missing values once, up front. Without this, np.median and the
    # Mann-Whitney U test would return NaN for a group containing a missing
    # score while the mean/min/max silently skipped it.
    top_scores = df_top['eligibility_score'].dropna()
    bottom_scores = df_bottom['eligibility_score'].dropna()

    # Two-sided Mann-Whitney U test: non-parametric comparison of the groups
    u_stat, p_value = stats.mannwhitneyu(top_scores, bottom_scores, alternative='two-sided')

    def _describe(scores):
        """Summarise one group of scores (std uses np.std's default ddof)."""
        return {
            'mean': np.mean(scores),
            'median': np.median(scores),
            'std': np.std(scores),
            'min': np.min(scores),
            'max': np.max(scores)
        }

    return {
        'top_stats': _describe(top_scores),
        'bottom_stats': _describe(bottom_scores),
        'u_statistic': u_stat,
        'p_value': p_value,
        'significant': p_value < 0.05
    }

def compare_eligibility_criteria(df_top, df_bottom):
    """
    Compare the prevalence of specific eligibility criteria between top and bottom diverse studies.

    This function examines the following eligibility criteria (each read from
    a column named 'eligibility_<name>'):
    - age_restrict: 0 if the restriction is age>18, 1 for other restrictions (e.g., 18<age<75)
    - stage_size: Restrictions on the cancer stage and the size of the tumor
    - site: Restrictions on the cancer site
    - histological_type: Whether the study was limited to SCC or any other type
    - performance_score: Restrictions on performance score (e.g., ECOG performance)
    - comorbidities: Restrictions on comorbidities
    - hx_of_tt: Restrictions on treatment history for cancer
    - lab_values: Restrictions on lab test values
    - pregnancy_or_contraception: Restrictions on pregnancy or particular contraceptives
    - misc: Other restrictions (smoking status, ethnicity requirements)

    For each criterion the prevalence (mean of the flag, as a percentage) is
    reported per group, and a two-sided Fisher's exact test is run on the 2x2
    table of present/absent counts by group. Missing values are ignored and
    any non-zero flag value counts as "present".

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping each criterion column name to a dict with
            'top_prevalence', 'bottom_prevalence', 'difference' (top minus
            bottom, in percentage points), 'p_value' and 'significant'
            (p_value < 0.05)

    Raises:
        KeyError: If a criterion column is missing from either dataset
    """
    # Define eligibility criteria fields to compare
    criteria_fields = [
        'eligibility_age_restrict',
        'eligibility_stage_size',
        'eligibility_site',
        'eligibility_histological_type',
        'eligibility_performance_score',
        'eligibility_comorbidities',
        'eligibility_hx_of_tt',
        'eligibility_lab_values',
        'eligibility_pregnancy_or_contraception',
        'eligibility_misc'
    ]

    def _flag_counts(flags):
        """Return (n_present, n_absent) for a flag column, ignoring NaN."""
        observed = flags.dropna().astype(bool)
        present = int(observed.sum())
        return present, len(observed) - present

    results = {}
    for criterion in criteria_fields:
        # Prevalence of the criterion in each group, as a percentage
        top_prevalence = df_top[criterion].mean() * 100
        bottom_prevalence = df_bottom[criterion].mean() * 100

        # Build the 2x2 table explicitly. A cross-tabulation of the observed
        # values collapses to a single row when a criterion is constant
        # across both groups (e.g. every study restricts lab values), which
        # is not a valid 2x2 input for Fisher's exact test.
        top_present, top_absent = _flag_counts(df_top[criterion])
        bottom_present, bottom_absent = _flag_counts(df_bottom[criterion])
        contingency_table = [
            [top_present, bottom_present],
            [top_absent, bottom_absent],
        ]

        _, p_value = stats.fisher_exact(contingency_table)

        results[criterion] = {
            'top_prevalence': top_prevalence,
            'bottom_prevalence': bottom_prevalence,
            'difference': top_prevalence - bottom_prevalence,
            'p_value': p_value,
            'significant': p_value < 0.05
        }

    return results

def _compare_numeric_characteristic(top_values, bottom_values):
    """Compare one numeric characteristic between groups with a Mann-Whitney U test."""
    # Missing values are excluded; the cast to float gives the test a plain
    # numeric input regardless of the column's integer/float flavour.
    top_values = top_values.dropna().astype(float)
    bottom_values = bottom_values.dropna().astype(float)

    top_mean = top_values.mean()
    bottom_mean = bottom_values.mean()
    result = {
        'top_mean': top_mean,
        'bottom_mean': bottom_mean,
        'difference': top_mean - bottom_mean,
    }

    # The test needs at least one observation per group; report it as not
    # performed instead of failing, mirroring the categorical fallback.
    if top_values.empty or bottom_values.empty:
        result.update(test='Not performed', statistic=None, p_value=None, significant=None)
        return result

    u_stat, p_value = stats.mannwhitneyu(top_values, bottom_values, alternative='two-sided')
    result.update(
        test='Mann-Whitney U',
        statistic=u_stat,
        p_value=p_value,
        significant=p_value < 0.05
    )
    return result


def _compare_categorical_characteristic(top_values, bottom_values):
    """Compare one categorical characteristic between groups with a Chi-square test."""
    # Share of each category within each group
    result = {
        'top_distribution': top_values.value_counts(normalize=True).to_dict(),
        'bottom_distribution': bottom_values.value_counts(normalize=True).to_dict(),
    }

    try:
        contingency_table = pd.crosstab(
            pd.Series(top_values.tolist() + bottom_values.tolist()),
            pd.Series(['top'] * len(top_values) + ['bottom'] * len(bottom_values))
        )
        chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
    except (ValueError, np.linalg.LinAlgError):
        # Handle cases where the Chi-square test cannot be performed
        result.update(test='Not performed', statistic=None, p_value=None, significant=None)
    else:
        result.update(
            test='Chi-square',
            statistic=chi2,
            p_value=p_value,
            significant=p_value < 0.05
        )
    return result


def compare_study_characteristics(df_top, df_bottom):
    """
    Compare general study characteristics between top and bottom diverse studies.

    The following columns are compared when present in BOTH datasets (columns
    missing from either dataset are skipped silently): num_sites,
    is_single_institution, num_participants, num_male_participants,
    num_female_participants, modalities, trial_type, trial_phase, tumor_type,
    cancer_site, study_start_date, apc_date.

    A column that has a numeric (non-boolean) dtype in both datasets is
    compared with a two-sided Mann-Whitney U test on its non-missing values;
    every other column is treated as categorical and compared with a
    Chi-square test of independence.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping characteristic names to comparison results.
            Numeric results contain 'top_mean', 'bottom_mean', 'difference';
            categorical results contain 'top_distribution' and
            'bottom_distribution'. Both contain 'test', 'statistic',
            'p_value' and 'significant'; the last three are None and 'test'
            is 'Not performed' when the test could not be run.
    """
    # Define characteristics to compare
    characteristics = [
        'num_sites',
        'is_single_institution',
        'num_participants',
        'num_male_participants',
        'num_female_participants',
        'modalities',
        'trial_type',
        'trial_phase',
        'tumor_type',
        'cancer_site',
        'study_start_date',
        'apc_date'
    ]

    def _is_numeric(values):
        # Accept every numeric dtype (int32, float32, nullable ints, ...),
        # not just int64/float64. Booleans stay categorical.
        return (
            pd.api.types.is_numeric_dtype(values)
            and not pd.api.types.is_bool_dtype(values)
        )

    results = {}
    for char in characteristics:
        if char not in df_top.columns or char not in df_bottom.columns:
            continue

        top_values = df_top[char]
        bottom_values = df_bottom[char]

        # Both groups must be numeric for a rank-based comparison to be valid
        if _is_numeric(top_values) and _is_numeric(bottom_values):
            results[char] = _compare_numeric_characteristic(top_values, bottom_values)
        else:
            results[char] = _compare_categorical_characteristic(top_values, bottom_values)

    return results

def analyze_geographic_distribution(df_top, df_bottom):
    """
    Summarise where the top and bottom diverse studies were conducted.

    Counts how often each value of the 'location' column occurs in each
    group and reports which locations are shared between the groups and
    which appear in only one of them.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Per-group location counts ('top_locations', 'bottom_locations'),
            the lists 'all_locations', 'common_locations',
            'top_only_locations' and 'bottom_only_locations' (in no
            particular order), and the number of distinct locations per group
            ('num_top_locations', 'num_bottom_locations'). If the 'location'
            column is missing from either dataset, a dict with a single
            'error' message is returned instead.
    """
    location_field = 'location'

    # Guard clause: both datasets must carry the location column
    has_location = location_field in df_top.columns and location_field in df_bottom.columns
    if not has_location:
        return {
            'error': f"Location field '{location_field}' not found in datasets"
        }

    # Occurrences of each location within each group
    top_locations = df_top[location_field].value_counts().to_dict()
    bottom_locations = df_bottom[location_field].value_counts().to_dict()

    # Distinct location names per group, used for the overlap summaries
    top_names = set(top_locations)
    bottom_names = set(bottom_locations)

    return {
        'top_locations': top_locations,
        'bottom_locations': bottom_locations,
        'all_locations': list(top_names | bottom_names),
        'common_locations': list(top_names & bottom_names),
        'top_only_locations': list(top_names - bottom_names),
        'bottom_only_locations': list(bottom_names - top_names),
        'num_top_locations': len(top_locations),
        'num_bottom_locations': len(bottom_locations)
    }

def analyze_institutional_setting(df_top, df_bottom):
    """
    Analyze the impact of institutional setting (single vs. multi-institution) on diversity.

    This function examines whether single-institution or multi-institution studies
    tend to have higher diversity. It reports the percentage of
    single-institution studies in each group (mean of the
    'is_single_institution' flag) and runs a two-sided Fisher's exact test on
    the 2x2 table of single/multi counts by group. Missing values are ignored
    and any non-zero flag value counts as single-institution.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with 'top_single_institution_percent',
            'bottom_single_institution_percent', 'difference' (top minus
            bottom, in percentage points), 'p_value' and 'significant'
            (p_value < 0.05). If the 'is_single_institution' column is
            missing from either dataset, a dict with a single 'error'
            message is returned instead.
    """
    institution_field = 'is_single_institution'

    if institution_field not in df_top.columns or institution_field not in df_bottom.columns:
        return {
            'error': f"Institution field '{institution_field}' not found in datasets"
        }

    def _flag_counts(flags):
        """Return (n_single, n_multi) for the flag column, ignoring NaN."""
        observed = flags.dropna().astype(bool)
        single = int(observed.sum())
        return single, len(observed) - single

    # Calculate prevalence of single-institution studies in each group
    top_single_inst_perc = df_top[institution_field].mean() * 100
    bottom_single_inst_perc = df_bottom[institution_field].mean() * 100

    # Build the 2x2 table explicitly. A cross-tabulation of the observed
    # values collapses to a single row when every study shares the same
    # setting, which is not a valid 2x2 input for Fisher's exact test.
    top_single, top_multi = _flag_counts(df_top[institution_field])
    bottom_single, bottom_multi = _flag_counts(df_bottom[institution_field])
    contingency_table = [
        [top_single, bottom_single],
        [top_multi, bottom_multi],
    ]

    _, p_value = stats.fisher_exact(contingency_table)

    return {
        'top_single_institution_percent': top_single_inst_perc,
        'bottom_single_institution_percent': bottom_single_inst_perc,
        'difference': top_single_inst_perc - bottom_single_inst_perc,
        'p_value': p_value,
        'significant': p_value < 0.05
    }

def analyze_all_factors(df_top, df_bottom):
    """
    Run every individual analysis in this module and collect the results.

    Each analysis is called with the same pair of datasets, in the order
    listed below, and its return value is stored under a fixed key.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with the keys 'eligibility_scores',
            'eligibility_criteria', 'study_characteristics',
            'geographic_distribution' and 'institutional_setting', each
            holding the result of the corresponding analysis function
    """
    # (result key, analysis function) pairs, evaluated in this order
    analyses = (
        ('eligibility_scores', compare_eligibility_scores),
        ('eligibility_criteria', compare_eligibility_criteria),
        ('study_characteristics', compare_study_characteristics),
        ('geographic_distribution', analyze_geographic_distribution),
        ('institutional_setting', analyze_institutional_setting),
    )

    return {label: analysis(df_top, df_bottom) for label, analysis in analyses}