Diff of /src/main.py [000000] .. [3cdecf]

Switch to side-by-side view

--- a
+++ b/src/main.py
@@ -0,0 +1,191 @@
+"""
+Main Script for Diversity in Head and Neck Cancer Clinical Trials Analysis
+
+This script orchestrates the complete analysis workflow by calling functions
+from the data_processing, analysis, and visualization modules.
+
+The analysis examines factors associated with diversity in head and neck cancer
+clinical trials, with a focus on eligibility criteria and other study characteristics.
+
+Diversity is measured as the percentage of non-white participants:
+    Diversity Score = (# non-white participants) / (# total participants) × 100
+"""
+
+import os
+import pandas as pd
+import plotly.io as pio
+
+from data_processing import preprocess_data, get_top_bottom_datasets
+from analysis import analyze_all_factors
+from visualization import (
+    plot_cdf, 
+    plot_box_by_category,
+    plot_box_by_category_color,
+    compare_field_by_category,
+    create_geo_distribution_plot,
+    create_participant_distribution_by_gender,
+    create_eligibility_score_comparison
+)
+
+# Input/output locations (relative paths: resolved against the working directory).
+INPUT_DATA_PATH = "../all_studies.csv"  # Adjust if needed
+TOP20_CSV_PATH = "../top_20_studies.csv"
+BOTTOM20_CSV_PATH = "../bottom_20_studies.csv"
+PLOTS_DIR = "../plots"
+
+# Create the plots directory when this module loads; an existing one is tolerated.
+os.makedirs(PLOTS_DIR, exist_ok=True)
+
+def save_figure(fig, filename):
+    """Write ``fig`` into PLOTS_DIR as ``<filename>.html`` and ``<filename>.png``.
+
+    The HTML copy is written first (interactive viewing), then the PNG copy
+    (reports/presentations); a confirmation line naming both paths is printed.
+    """
+    # One output path per extension, all rooted at PLOTS_DIR.
+    outputs = {ext: os.path.join(PLOTS_DIR, f"{filename}.{ext}") for ext in ("html", "png")}
+    pio.write_html(fig, outputs["html"])
+    pio.write_image(fig, outputs["png"])
+
+    print("Saved figure to {} and {}".format(outputs["html"], outputs["png"]))
+
+def main():
+    """Run the diversity analysis end to end: preprocess, plot, analyse, report.
+
+    Takes no arguments and returns nothing. Progress and findings are printed,
+    two CSV files are written, and every figure is stored through save_figure().
+    """
+    print("Starting analysis of diversity in head and neck cancer clinical trials...")
+
+    # ---- Step 1: preprocessing ----
+    print("\nPreprocessing data...")
+    studies, top_studies, bottom_studies = preprocess_data(INPUT_DATA_PATH)
+
+    # Persist the top and bottom subsets as CSV files (row index omitted).
+    top_studies.to_csv(TOP20_CSV_PATH, index=False)
+    bottom_studies.to_csv(BOTTOM20_CSV_PATH, index=False)
+
+    print("\nData processing complete.")
+    print(f"Total studies analyzed: {len(studies)}")
+    print(f"Top diverse studies (top 20%): {len(top_studies)}")
+    print(f"Bottom diverse studies (bottom 20%): {len(bottom_studies)}")
+
+    # ---- Step 2: distribution of the success metric ----
+    print("\nGenerating diversity metric distribution plot...")
+    cdf_options = {
+        "title": "Distribution of Diversity Success Metric",
+        "x_title": "Success Metric: % Non-White participants",
+        "y_title": "Fraction of studies",
+    }
+    save_figure(plot_cdf(studies, "success_metric", **cdf_options), "success_metric_cdf")
+
+    # ---- Step 3: eligibility score, top vs bottom ----
+    print("\nGenerating eligibility score comparison plot...")
+    save_figure(
+        create_eligibility_score_comparison(top_studies, bottom_studies),
+        # Output name kept exactly as-is (including its spelling) so existing files match.
+        "box_plot_eligbility_score_diverse_vs_non_diverse",
+    )
+
+    # ---- Step 4: participant counts, top vs bottom ----
+    print("\nGenerating participant count comparison plot...")
+    # Tag each subset with its group label, then stack them into one frame.
+    labelled_top = top_studies.assign(success_category="top")
+    labelled_bottom = bottom_studies.assign(success_category="bottom")
+    combined = pd.concat([labelled_top, labelled_bottom])
+    participants_fig = plot_box_by_category(
+        combined,
+        "num_participants",
+        x_column="success_category",
+        title="Number of Participants in Top vs Bottom Studies",
+        x_title="Success Category",
+        y_title="Number of Participants",
+    )
+    save_figure(participants_fig, "box_plot_num_participants_top_vs_bottom")
+
+    # ---- Step 5: participant distribution by gender ----
+    print("\nGenerating gender distribution plot...")
+    gender_fig = create_participant_distribution_by_gender(top_studies, bottom_studies)
+    save_figure(gender_fig, "distribution_num_participants_top_vs_bottom_studies_strat_gender")
+
+    def category_comparison(column):
+        # Shared call shape for steps 6 and 7; a fresh category list is built per call.
+        return compare_field_by_category(
+            studies,
+            column,
+            categories=["Top20", "Bottom20"],
+            height=600,
+        )
+
+    # ---- Step 6: one comparison plot per eligibility criterion ----
+    print("\nGenerating eligibility criteria comparison plots...")
+    eligibility_columns = (
+        'eligibility_age_restrict',
+        'eligibility_stage_size',
+        'eligibility_site',
+        'eligibility_histological_type',
+        'eligibility_performance_score',
+        'eligibility_comorbidities',
+        'eligibility_hx_of_tt',
+        'eligibility_lab_values',
+        'eligibility_misc',
+    )
+    for column in eligibility_columns:
+        print(f"  - Generating plot for {column}...")
+        column_fig = category_comparison(column)
+        short_name = column.replace('eligibility_', '')
+        save_figure(column_fig, f"distribution_{short_name}")
+
+    # ---- Step 7: single-institution comparison ----
+    print("\nGenerating institutional setting plot...")
+    save_figure(category_comparison("is_single_institution"), "distribution_is_single_institution")
+
+    # ---- Step 8: geographic distribution ----
+    print("\nGenerating geographic distribution plot...")
+    save_figure(create_geo_distribution_plot(top_studies, bottom_studies), "geo_distribution")
+
+    # ---- Step 9: statistical analysis and printed report ----
+    print("\nPerforming comprehensive statistical analysis...")
+    results = analyze_all_factors(top_studies, bottom_studies)
+
+    print("\nKey Findings:")
+
+    # Eligibility score summary: one line per group, then the significance flag.
+    score_summary = results['eligibility_scores']
+    print("\nEligibility Score Comparison:")
+    for label, stats_key in (("Top", "top_stats"), ("Bottom", "bottom_stats")):
+        group_stats = score_summary[stats_key]
+        print(f"  {label} Studies: Mean = {group_stats['mean']:.2f}, Median = {group_stats['median']:.2f}")
+    print(f"  Significant Difference: {score_summary['significant']}")
+
+    # The three criteria with the largest absolute difference, largest first.
+    print("\nEligibility Criteria with Largest Differences:")
+    ranked_criteria = list(results['eligibility_criteria'].items())
+    ranked_criteria.sort(key=lambda item: abs(item[1]['difference']), reverse=True)
+    for criterion, outcome in ranked_criteria[:3]:
+        criterion_name = criterion.replace('eligibility_', '')
+        print(f"  {criterion_name}: Top = {outcome['top_prevalence']:.1f}%, Bottom = {outcome['bottom_prevalence']:.1f}%")
+        print(f"    Difference = {outcome['difference']:.1f}%, Significant: {outcome['significant']}")
+
+    # Study characteristics: entries without a 'difference' key are skipped.
+    print("\nStudy Characteristics Comparison:")
+    for name, outcome in results['study_characteristics'].items():
+        if 'difference' not in outcome:
+            continue
+        print(f"  {name}: Top = {outcome['top_mean']:.2f}, Bottom = {outcome['bottom_mean']:.2f}")
+        print(f"    Difference = {outcome['difference']:.2f}, Significant: {outcome['significant']}")
+
+    # Geographic distribution counts.
+    geo = results['geographic_distribution']
+    print("\nGeographic Distribution:")
+    print(f"  Number of unique locations in top studies: {geo['num_top_locations']}")
+    print(f"  Number of unique locations in bottom studies: {geo['num_bottom_locations']}")
+    print(f"  Number of locations in both categories: {len(geo['common_locations'])}")
+
+    print("\nAnalysis complete. All figures saved to the 'plots' directory.")
+    print("\nSummary of findings:")
+    # NOTE: the statements below are fixed text; they are not computed from `results`.
+    closing_statements = (
+        "1. More restrictive eligibility criteria are associated with lower diversity in clinical trials.",
+        "2. Geographic location appears to be a significant factor in trial diversity.",
+        "3. There are differences in participant demographics between high and low diversity studies.",
+        "4. Specific eligibility restrictions may disproportionately affect minority participation.",
+    )
+    for statement in closing_statements:
+        print(statement)
+
+if __name__ == "__main__":  # run the workflow only when executed as a script
+    main()
\ No newline at end of file