"""
Main Script for Diversity in Head and Neck Cancer Clinical Trials Analysis

This script orchestrates the complete analysis workflow by calling functions
from the data_processing, analysis, and visualization modules.

The analysis examines factors associated with diversity in head and neck cancer
clinical trials, with a focus on eligibility criteria and other study characteristics.

Diversity is measured as the percentage of non-white participants:
    Diversity Score = (# non-white participants) / (# total participants) × 100
"""

import os
import pandas as pd
import plotly.io as pio

from data_processing import preprocess_data, get_top_bottom_datasets
from analysis import analyze_all_factors
from visualization import (
    plot_cdf,
    plot_box_by_category,
    plot_box_by_category_color,
    compare_field_by_category,
    create_geo_distribution_plot,
    create_participant_distribution_by_gender,
    create_eligibility_score_comparison
)

# Define input and output paths.
# NOTE: these are relative paths, so they resolve against the current working
# directory of the process, not against the location of this file.
INPUT_DATA_PATH = "../all_studies.csv"  # Adjust if needed
TOP20_CSV_PATH = "../top_20_studies.csv"
BOTTOM20_CSV_PATH = "../bottom_20_studies.csv"
PLOTS_DIR = "../plots"

# Eligibility columns that each get their own Top20-vs-Bottom20 plot.
_ELIGIBILITY_FIELDS = (
    'eligibility_age_restrict',
    'eligibility_stage_size',
    'eligibility_site',
    'eligibility_histological_type',
    'eligibility_performance_score',
    'eligibility_comorbidities',
    'eligibility_hx_of_tt',
    'eligibility_lab_values',
    'eligibility_misc',
)

# Number of eligibility criteria listed in the "largest differences" report.
_TOP_CRITERIA_SHOWN = 3

# Ensure plots directory exists
os.makedirs(PLOTS_DIR, exist_ok=True)


def save_figure(fig, filename):
    """Save a plotly figure to both HTML and PNG formats.

    Args:
        fig: Figure object accepted by ``plotly.io.write_html`` and
            ``plotly.io.write_image``.
        filename: Base file name without extension; ``.html`` and ``.png``
            are appended and both files are written inside ``PLOTS_DIR``.
    """
    html_path = os.path.join(PLOTS_DIR, f"{filename}.html")
    png_path = os.path.join(PLOTS_DIR, f"{filename}.png")

    # Save as HTML for interactive viewing
    pio.write_html(fig, html_path)

    # Save as PNG for reports/presentations.
    # NOTE(review): static image export depends on plotly's image backend
    # being installed; nothing here catches a failure, so an error from this
    # call stops the whole run.
    pio.write_image(fig, png_path)

    print(f"Saved figure to {html_path} and {png_path}")


def _save_category_distribution(df_final, field, filename):
    """Plot ``field`` for the Top20 and Bottom20 categories and save it."""
    # The category labels are passed through unchanged; presumably they match
    # labels produced during preprocessing — verify against visualization.py.
    fig = compare_field_by_category(
        df_final,
        field,
        categories=["Top20", "Bottom20"],
        height=600
    )
    save_figure(fig, filename)


def _generate_plots(df_final, df_top, df_bottom):
    """Create and save every figure of the analysis (steps 2-8)."""
    # Step 2: Create success metric distribution plot
    print("\nGenerating diversity metric distribution plot...")
    fig_cdf = plot_cdf(
        df_final,
        "success_metric",
        title="Distribution of Diversity Success Metric",
        x_title="Success Metric: % Non-White participants",
        y_title="Fraction of studies"
    )
    save_figure(fig_cdf, "success_metric_cdf")

    # Step 3: Create eligibility score comparison plot
    print("\nGenerating eligibility score comparison plot...")
    fig_elig_score = create_eligibility_score_comparison(df_top, df_bottom)
    save_figure(fig_elig_score, "box_plot_eligbility_score_diverse_vs_non_diverse")

    # Step 4: Create number of participants comparison plot
    print("\nGenerating participant count comparison plot...")
    # Tag each subset with its category so one box plot can split on it.
    df_top_bottom = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom")
    ])
    fig_participants = plot_box_by_category(
        df_top_bottom,
        "num_participants",
        x_column="success_category",
        title="Number of Participants in Top vs Bottom Studies",
        x_title="Success Category",
        y_title="Number of Participants"
    )
    save_figure(fig_participants, "box_plot_num_participants_top_vs_bottom")

    # Step 5: Create gender distribution plot
    print("\nGenerating gender distribution plot...")
    fig_gender = create_participant_distribution_by_gender(df_top, df_bottom)
    save_figure(fig_gender, "distribution_num_participants_top_vs_bottom_studies_strat_gender")

    # Step 6: Create eligibility criteria comparison plots
    print("\nGenerating eligibility criteria comparison plots...")
    for field in _ELIGIBILITY_FIELDS:
        print(f"  - Generating plot for {field}...")
        field_name = field.replace('eligibility_', '')
        _save_category_distribution(df_final, field, f"distribution_{field_name}")

    # Step 7: Create single institution comparison plot
    print("\nGenerating institutional setting plot...")
    _save_category_distribution(
        df_final,
        "is_single_institution",
        "distribution_is_single_institution"
    )

    # Step 8: Create geographic distribution plot
    print("\nGenerating geographic distribution plot...")
    fig_geo = create_geo_distribution_plot(df_top, df_bottom)
    save_figure(fig_geo, "geo_distribution")


def _report_findings(analysis_results):
    """Print the key findings contained in ``analysis_results``.

    Reads the keys ``'eligibility_scores'``, ``'eligibility_criteria'``,
    ``'study_characteristics'`` and ``'geographic_distribution'`` from the
    mapping returned by ``analyze_all_factors``.
    """
    print("\nKey Findings:")

    # Eligibility score comparison
    elig_scores = analysis_results['eligibility_scores']
    top_stats = elig_scores['top_stats']
    bottom_stats = elig_scores['bottom_stats']
    print("\nEligibility Score Comparison:")
    print(f"  Top Studies: Mean = {top_stats['mean']:.2f}, Median = {top_stats['median']:.2f}")
    print(f"  Bottom Studies: Mean = {bottom_stats['mean']:.2f}, Median = {bottom_stats['median']:.2f}")
    print(f"  Significant Difference: {elig_scores['significant']}")

    # Most significant eligibility criteria, ranked by absolute difference
    print("\nEligibility Criteria with Largest Differences:")
    criteria_results = analysis_results['eligibility_criteria']
    ranked_criteria = sorted(
        criteria_results.items(),
        key=lambda item: abs(item[1]['difference']),
        reverse=True
    )
    for criterion, result in ranked_criteria[:_TOP_CRITERIA_SHOWN]:
        criterion_name = criterion.replace('eligibility_', '')
        print(f"  {criterion_name}: Top = {result['top_prevalence']:.1f}%, Bottom = {result['bottom_prevalence']:.1f}%")
        print(f"    Difference = {result['difference']:.1f}%, Significant: {result['significant']}")

    # Study characteristics
    print("\nStudy Characteristics Comparison:")
    char_results = analysis_results['study_characteristics']
    for char, result in char_results.items():
        # Entries without a 'difference' key are skipped.
        if 'difference' in result:  # Numeric characteristic
            print(f"  {char}: Top = {result['top_mean']:.2f}, Bottom = {result['bottom_mean']:.2f}")
            print(f"    Difference = {result['difference']:.2f}, Significant: {result['significant']}")

    # Geographic distribution
    geo_results = analysis_results['geographic_distribution']
    print("\nGeographic Distribution:")
    print(f"  Number of unique locations in top studies: {geo_results['num_top_locations']}")
    print(f"  Number of unique locations in bottom studies: {geo_results['num_bottom_locations']}")
    print(f"  Number of locations in both categories: {len(geo_results['common_locations'])}")


def _print_static_summary():
    """Print the closing message and the fixed summary statements."""
    print("\nAnalysis complete. All figures saved to the 'plots' directory.")
    # NOTE: the statements below are fixed text. They are printed on every run
    # and are not derived from the analysis results computed above.
    print("\nSummary of findings:")
    print("1. More restrictive eligibility criteria are associated with lower diversity in clinical trials.")
    print("2. Geographic location appears to be a significant factor in trial diversity.")
    print("3. There are differences in participant demographics between high and low diversity studies.")
    print("4. Specific eligibility restrictions may disproportionately affect minority participation.")


def main():
    """Main analysis workflow.

    Preprocesses the input data, writes the top/bottom subsets to CSV,
    generates and saves all figures, runs the statistical analysis and prints
    the findings to stdout.
    """
    print("Starting analysis of diversity in head and neck cancer clinical trials...")

    # Step 1: Preprocess data
    print("\nPreprocessing data...")
    df_final, df_top, df_bottom = preprocess_data(INPUT_DATA_PATH)

    # Save processed datasets to CSV
    df_top.to_csv(TOP20_CSV_PATH, index=False)
    df_bottom.to_csv(BOTTOM20_CSV_PATH, index=False)

    print("\nData processing complete.")
    print(f"Total studies analyzed: {len(df_final)}")
    print(f"Top diverse studies (top 20%): {len(df_top)}")
    print(f"Bottom diverse studies (bottom 20%): {len(df_bottom)}")

    # Steps 2-8: Create and save all figures
    _generate_plots(df_final, df_top, df_bottom)

    # Step 9: Perform comprehensive analysis
    print("\nPerforming comprehensive statistical analysis...")
    analysis_results = analyze_all_factors(df_top, df_bottom)

    _report_findings(analysis_results)
    _print_static_summary()


if __name__ == "__main__":
    main()