"""
Analysis Module for Diversity in Head and Neck Cancer Clinical Trials

This module contains functions for analyzing the factors that contribute to
diversity in head and neck cancer clinical trials, with a focus on eligibility
criteria and other study characteristics.

Every public function takes two DataFrames -- the "top" (most diverse) and
"bottom" (least diverse) studies -- and returns a plain dictionary of results.
"""

import numpy as np
import pandas as pd
from scipy import stats

# Two-sided p-value threshold below which a difference is flagged significant.
SIGNIFICANCE_LEVEL = 0.05

# Columns examined by compare_eligibility_criteria.
ELIGIBILITY_CRITERIA_FIELDS = (
    'eligibility_age_restrict',
    'eligibility_stage_size',
    'eligibility_site',
    'eligibility_histological_type',
    'eligibility_performance_score',
    'eligibility_comorbidities',
    'eligibility_hx_of_tt',
    'eligibility_lab_values',
    'eligibility_pregnancy_or_contraception',
    'eligibility_misc',
)

# Columns examined by compare_study_characteristics (missing ones are skipped).
STUDY_CHARACTERISTICS = (
    'num_sites',
    'is_single_institution',
    'num_participants',
    'num_male_participants',
    'num_female_participants',
    'modalities',
    'trial_type',
    'trial_phase',
    'tumor_type',
    'cancer_site',
    'study_start_date',
    'apc_date',
)


def _is_significant(p_value):
    """Return True when *p_value* is below SIGNIFICANCE_LEVEL (plain bool)."""
    # bool() turns numpy.bool_ into a builtin bool; a NaN p-value yields False.
    return bool(p_value < SIGNIFICANCE_LEVEL)


def _describe(values):
    """Return mean/median/std/min/max of a NaN-free numeric Series.

    An empty Series yields NaN for every statistic instead of raising.
    """
    if values.empty:
        return dict.fromkeys(('mean', 'median', 'std', 'min', 'max'), float('nan'))

    data = values.to_numpy()
    return {
        'mean': np.mean(data),
        'median': np.median(data),
        'std': np.std(data),  # population standard deviation (ddof=0)
        'min': np.min(data),
        'max': np.max(data),
    }


def _mann_whitney_u(top_values, bottom_values):
    """Run a two-sided Mann-Whitney U test on two numeric Series.

    Missing values are dropped first. Returns ``(statistic, p_value)``, or
    ``None`` when the test cannot be performed (an empty group, or SciPy
    rejecting the input with ValueError).
    """
    top = top_values.dropna().to_numpy(dtype=float)
    bottom = bottom_values.dropna().to_numpy(dtype=float)
    if top.size == 0 or bottom.size == 0:
        return None
    try:
        u_stat, p_value = stats.mannwhitneyu(top, bottom, alternative='two-sided')
    except ValueError:
        return None
    return u_stat, p_value


def _fisher_exact_p_value(top_flags, bottom_flags):
    """Return the two-sided Fisher's exact p-value for two binary columns.

    The 2x2 table is built explicitly (flag set / not set per group) so it
    stays 2x2 even when a flag is constant across both groups; a crosstab
    would collapse to a single row in that case. Missing values are dropped.
    Any non-zero value counts as "set" -- assumes 0/1 flags, verify against
    the data preparation step.
    """
    top = top_flags.dropna().astype(bool)
    bottom = bottom_flags.dropna().astype(bool)
    table = [
        [int(top.sum()), int((~top).sum())],
        [int(bottom.sum()), int((~bottom).sum())],
    ]
    _, p_value = stats.fisher_exact(table)
    return p_value


def _chi_square(top_values, bottom_values):
    """Run a chi-square test of independence between category and group.

    Returns ``(chi2, p_value)``, or ``None`` when the test cannot be
    performed (for example when the contingency table is empty).
    """
    try:
        contingency_table = pd.crosstab(
            pd.Series(top_values.tolist() + bottom_values.tolist()),
            pd.Series(['top'] * len(top_values) + ['bottom'] * len(bottom_values)),
        )
        chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
    except (ValueError, np.linalg.LinAlgError):
        return None
    return chi2, p_value


def _is_numeric_column(column):
    """Return True for numeric, non-boolean columns of any width."""
    # Booleans stay categorical, as they were under the int64/float64 check.
    return (
        pd.api.types.is_numeric_dtype(column.dtype)
        and not pd.api.types.is_bool_dtype(column.dtype)
    )


def compare_eligibility_scores(df_top, df_bottom):
    """
    Compare eligibility scores between top and bottom diverse studies.

    The eligibility score is the sum of binary flags for all eligibility criteria,
    with higher scores indicating more restrictive criteria. Missing scores are
    ignored for both the descriptive statistics and the test.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies; must contain
            an 'eligibility_score' column
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies; must
            contain an 'eligibility_score' column

    Returns:
        dict: 'top_stats' and 'bottom_stats' (mean, median, std, min, max),
            plus 'u_statistic', 'p_value' and 'significant' from a two-sided
            Mann-Whitney U test. The three test entries are None when the test
            cannot be performed (e.g. a group has no scores).

    Raises:
        KeyError: If either dataset lacks the 'eligibility_score' column.
    """
    top_scores = df_top['eligibility_score'].dropna()
    bottom_scores = df_bottom['eligibility_score'].dropna()

    outcome = _mann_whitney_u(top_scores, bottom_scores)
    if outcome is None:
        u_stat = p_value = significant = None
    else:
        u_stat, p_value = outcome
        significant = _is_significant(p_value)

    return {
        'top_stats': _describe(top_scores),
        'bottom_stats': _describe(bottom_scores),
        'u_statistic': u_stat,
        'p_value': p_value,
        'significant': significant,
    }


def compare_eligibility_criteria(df_top, df_bottom):
    """
    Compare the prevalence of specific eligibility criteria between top and bottom diverse studies.

    This function examines the following columns (each a binary flag):
    - eligibility_age_restrict: 0 if the restriction is age>18, 1 for other
      restrictions (e.g., 18<age<75)
    - eligibility_stage_size: Restrictions on the cancer stage and the size of the tumor
    - eligibility_site: Restrictions on the cancer site
    - eligibility_histological_type: Whether the study was limited to SCC or any other type
    - eligibility_performance_score: Restrictions on performance score (e.g., ECOG performance)
    - eligibility_comorbidities: Restrictions on comorbidities
    - eligibility_hx_of_tt: Restrictions on treatment history for cancer
    - eligibility_lab_values: Restrictions on lab test values
    - eligibility_pregnancy_or_contraception: Restrictions on pregnancy or
      particular contraceptives
    - eligibility_misc: Other restrictions (smoking status, ethnicity requirements)

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping each column name to a dict with
            'top_prevalence' and 'bottom_prevalence' (percent), their
            'difference', the Fisher's exact 'p_value' and 'significant'.

    Raises:
        KeyError: If either dataset lacks one of the criteria columns.
    """
    results = {}
    for criterion in ELIGIBILITY_CRITERIA_FIELDS:
        top_flags = df_top[criterion]
        bottom_flags = df_bottom[criterion]

        # Prevalence is the share of studies with the flag set, in percent.
        top_prevalence = top_flags.mean() * 100
        bottom_prevalence = bottom_flags.mean() * 100
        p_value = _fisher_exact_p_value(top_flags, bottom_flags)

        results[criterion] = {
            'top_prevalence': top_prevalence,
            'bottom_prevalence': bottom_prevalence,
            'difference': top_prevalence - bottom_prevalence,
            'p_value': p_value,
            'significant': _is_significant(p_value),
        }

    return results


def compare_study_characteristics(df_top, df_bottom):
    """
    Compare general study characteristics between top and bottom diverse studies.

    The columns listed in STUDY_CHARACTERISTICS are examined; a column is
    skipped unless it is present in both datasets. They cover:
    1. Study start date and 'apc_date'
    2. Number of sites and single vs. multi-institutional study
    3. Number of participants (total, male, female)
    4. Modality (Drug/Radiation/Biologic/Combination)
    5. Trial type (Primary/Palliative/Recurrent/Metastatic) and trial phase
    6. Tumor type and cancer site

    Columns that are numeric in both datasets are compared with a two-sided
    Mann-Whitney U test; all others are treated as categorical and compared
    with a chi-square test of independence.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping characteristic names to comparison results.
            Numeric entries carry 'top_mean', 'bottom_mean' and 'difference';
            categorical entries carry 'top_distribution' and
            'bottom_distribution'. Both carry 'test', 'statistic', 'p_value'
            and 'significant'; when the test cannot be run, 'test' is
            'Not performed' and the other three are None.
    """
    results = {}
    for char in STUDY_CHARACTERISTICS:
        if char not in df_top.columns or char not in df_bottom.columns:
            continue

        top_column = df_top[char]
        bottom_column = df_bottom[char]

        if _is_numeric_column(top_column) and _is_numeric_column(bottom_column):
            top_mean = top_column.mean()
            bottom_mean = bottom_column.mean()
            entry = {
                'top_mean': top_mean,
                'bottom_mean': bottom_mean,
                'difference': top_mean - bottom_mean,
            }
            outcome = _mann_whitney_u(top_column, bottom_column)
            test_name = 'Mann-Whitney U'
        else:
            entry = {
                'top_distribution': top_column.value_counts(normalize=True).to_dict(),
                'bottom_distribution': bottom_column.value_counts(normalize=True).to_dict(),
            }
            outcome = _chi_square(top_column, bottom_column)
            test_name = 'Chi-square'

        if outcome is None:
            entry.update({
                'test': 'Not performed',
                'statistic': None,
                'p_value': None,
                'significant': None,
            })
        else:
            statistic, p_value = outcome
            entry.update({
                'test': test_name,
                'statistic': statistic,
                'p_value': p_value,
                'significant': _is_significant(p_value),
            })

        results[char] = entry

    return results


def analyze_geographic_distribution(df_top, df_bottom):
    """
    Analyze the geographic distribution of studies and its impact on diversity.

    This function examines how geographic location relates to diversity in clinical trials,
    considering that areas with more diverse populations may have higher potential for
    diverse study recruitment.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Per-group location counts ('top_locations', 'bottom_locations'),
            lists of 'all_locations', 'common_locations', 'top_only_locations'
            and 'bottom_only_locations', and the number of distinct locations
            per group. If the 'location' column is missing from either
            dataset, a dict with a single 'error' message is returned instead.
    """
    location_field = 'location'
    if location_field not in df_top.columns or location_field not in df_bottom.columns:
        return {
            'error': f"Location field '{location_field}' not found in datasets"
        }

    top_locations = df_top[location_field].value_counts().to_dict()
    bottom_locations = df_bottom[location_field].value_counts().to_dict()

    # Build the lists in value_counts order rather than via sets, so the
    # output does not depend on hash ordering between runs.
    common_locations = [loc for loc in top_locations if loc in bottom_locations]
    top_only_locations = [loc for loc in top_locations if loc not in bottom_locations]
    bottom_only_locations = [loc for loc in bottom_locations if loc not in top_locations]
    all_locations = list(top_locations) + bottom_only_locations

    return {
        'top_locations': top_locations,
        'bottom_locations': bottom_locations,
        'all_locations': all_locations,
        'common_locations': common_locations,
        'top_only_locations': top_only_locations,
        'bottom_only_locations': bottom_only_locations,
        'num_top_locations': len(top_locations),
        'num_bottom_locations': len(bottom_locations),
    }


def analyze_institutional_setting(df_top, df_bottom):
    """
    Analyze the impact of institutional setting (single vs. multi-institution) on diversity.

    This function examines whether single-institution or multi-institution studies
    tend to have higher diversity, using Fisher's exact test on the
    'is_single_institution' flag.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Percentage of single-institution studies in each group, their
            'difference', the Fisher's exact 'p_value' and 'significant'. If
            the column is missing from either dataset, a dict with a single
            'error' message is returned instead.
    """
    institution_field = 'is_single_institution'
    if institution_field not in df_top.columns or institution_field not in df_bottom.columns:
        return {
            'error': f"Institution field '{institution_field}' not found in datasets"
        }

    top_flags = df_top[institution_field]
    bottom_flags = df_bottom[institution_field]

    top_single_inst_perc = top_flags.mean() * 100
    bottom_single_inst_perc = bottom_flags.mean() * 100
    p_value = _fisher_exact_p_value(top_flags, bottom_flags)

    return {
        'top_single_institution_percent': top_single_inst_perc,
        'bottom_single_institution_percent': bottom_single_inst_perc,
        'difference': top_single_inst_perc - bottom_single_inst_perc,
        'p_value': p_value,
        'significant': _is_significant(p_value),
    }


def analyze_all_factors(df_top, df_bottom):
    """
    Perform a comprehensive analysis of all factors that may affect diversity.

    Runs every comparison in this module on the same pair of datasets.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Results keyed by 'eligibility_scores', 'eligibility_criteria',
            'study_characteristics', 'geographic_distribution' and
            'institutional_setting'.
    """
    return {
        'eligibility_scores': compare_eligibility_scores(df_top, df_bottom),
        'eligibility_criteria': compare_eligibility_criteria(df_top, df_bottom),
        'study_characteristics': compare_study_characteristics(df_top, df_bottom),
        'geographic_distribution': analyze_geographic_distribution(df_top, df_bottom),
        'institutional_setting': analyze_institutional_setting(df_top, df_bottom),
    }