Head-and-Neck-Trials / Git / [3cdecf] /src/main.py

Models:
joseph-gordon/
Head-and-Neck-Trials
Downloads: 1
[3cdecf]: / src / main.py
History
Download this file
191 lines (160 with data), 8.1 kB

"""
Main Script for Diversity in Head and Neck Cancer Clinical Trials Analysis

This script orchestrates the complete analysis workflow by calling functions
from the data_processing, analysis, and visualization modules.

The analysis examines factors associated with diversity in head and neck cancer
clinical trials, with a focus on eligibility criteria and other study characteristics.

Diversity is measured as the percentage of non-white participants:
    Diversity Score = (# non-white participants) / (# total participants) × 100
"""

import os
import pandas as pd
import plotly.io as pio

from data_processing import preprocess_data, get_top_bottom_datasets
from analysis import analyze_all_factors
from visualization import (
    plot_cdf, 
    plot_box_by_category,
    plot_box_by_category_color,
    compare_field_by_category,
    create_geo_distribution_plot,
    create_participant_distribution_by_gender,
    create_eligibility_score_comparison
)

# Define input and output paths.
# All of these are relative paths ("../..."), so they are resolved against the
# process's current working directory, not against this file's location.
INPUT_DATA_PATH = "../all_studies.csv"  # Adjust if needed
TOP20_CSV_PATH = "../top_20_studies.csv"  # main() writes df_top here
BOTTOM20_CSV_PATH = "../bottom_20_studies.csv"  # main() writes df_bottom here
PLOTS_DIR = "../plots"  # save_figure() writes every .html/.png file here

# Ensure plots directory exists.
# NOTE: this executes at import time (it is outside main()), so merely
# importing this module creates the directory; exist_ok=True makes it a no-op
# when the directory is already present.
os.makedirs(PLOTS_DIR, exist_ok=True)

def save_figure(fig, filename):
    """Export a plotly figure into PLOTS_DIR as an HTML file and a PNG file.

    Args:
        fig: The plotly figure to export.
        filename: Base name for the output files, without an extension;
            ".html" and ".png" are appended to it.
    """
    # Build both destination paths up front, before anything is written.
    html_path, png_path = (
        os.path.join(PLOTS_DIR, f"{filename}.{extension}")
        for extension in ("html", "png")
    )

    # Interactive version for viewing in a browser.
    pio.write_html(fig, html_path)

    # Static version for reports/presentations.
    # NOTE(review): plotly's static image export usually needs an extra
    # rendering engine (e.g. kaleido) installed -- confirm for this project.
    pio.write_image(fig, png_path)

    print(f"Saved figure to {html_path} and {png_path}")

def _generate_figures(df_final, df_top, df_bottom):
    """Build every figure of the workflow (steps 2-8) and save each one.

    Each figure is written through save_figure(), i.e. as both HTML and PNG.

    Args:
        df_final: The full processed dataset returned by preprocess_data.
        df_top: The "top" studies dataset returned by preprocess_data.
        df_bottom: The "bottom" studies dataset returned by preprocess_data.
    """
    # Step 2: Create success metric distribution plot
    print("\nGenerating diversity metric distribution plot...")
    fig_cdf = plot_cdf(
        df_final,
        "success_metric",
        title="Distribution of Diversity Success Metric",
        x_title="Success Metric: % Non-White participants",
        y_title="Fraction of studies"
    )
    save_figure(fig_cdf, "success_metric_cdf")

    # Step 3: Create eligibility score comparison plot
    print("\nGenerating eligibility score comparison plot...")
    fig_elig_score = create_eligibility_score_comparison(df_top, df_bottom)
    # NOTE: the "eligbility" misspelling is kept on purpose so that the output
    # filename stays the same as in earlier runs.
    save_figure(fig_elig_score, "box_plot_eligbility_score_diverse_vs_non_diverse")

    # Step 4: Create number of participants comparison plot
    print("\nGenerating participant count comparison plot...")
    # Stack both groups into one frame, tagging each row with its group so the
    # box plot can split on the "success_category" column.
    df_top_bottom = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom")
    ])
    fig_participants = plot_box_by_category(
        df_top_bottom,
        "num_participants",
        x_column="success_category",
        title="Number of Participants in Top vs Bottom Studies",
        x_title="Success Category",
        y_title="Number of Participants"
    )
    save_figure(fig_participants, "box_plot_num_participants_top_vs_bottom")

    # Step 5: Create gender distribution plot
    print("\nGenerating gender distribution plot...")
    fig_gender = create_participant_distribution_by_gender(df_top, df_bottom)
    save_figure(fig_gender, "distribution_num_participants_top_vs_bottom_studies_strat_gender")

    # Step 6: Create eligibility criteria comparison plots
    print("\nGenerating eligibility criteria comparison plots...")
    eligibility_fields = [
        'eligibility_age_restrict',
        'eligibility_stage_size',
        'eligibility_site',
        'eligibility_histological_type',
        'eligibility_performance_score',
        'eligibility_comorbidities',
        'eligibility_hx_of_tt',
        'eligibility_lab_values',
        'eligibility_misc'
    ]

    for field in eligibility_fields:
        print(f"  - Generating plot for {field}...")
        fig_field = compare_field_by_category(
            df_final,
            field,
            categories=["Top20", "Bottom20"],
            height=600
        )
        # Output files are named after the criterion without its prefix.
        field_name = field.replace('eligibility_', '')
        save_figure(fig_field, f"distribution_{field_name}")

    # Step 7: Create single institution comparison plot
    print("\nGenerating institutional setting plot...")
    fig_single_inst = compare_field_by_category(
        df_final,
        "is_single_institution",
        categories=["Top20", "Bottom20"],
        height=600
    )
    save_figure(fig_single_inst, "distribution_is_single_institution")

    # Step 8: Create geographic distribution plot
    print("\nGenerating geographic distribution plot...")
    fig_geo = create_geo_distribution_plot(df_top, df_bottom)
    save_figure(fig_geo, "geo_distribution")


def _print_key_findings(analysis_results):
    """Print the key findings contained in the analysis results.

    Args:
        analysis_results: Mapping returned by analyze_all_factors. This
            function reads the keys 'eligibility_scores',
            'eligibility_criteria', 'study_characteristics' and
            'geographic_distribution'.

    Raises:
        KeyError: If one of the keys read below is missing.
    """
    print("\nKey Findings:")

    # Eligibility score comparison
    elig_scores = analysis_results['eligibility_scores']
    print("\nEligibility Score Comparison:")
    print(f"  Top Studies: Mean = {elig_scores['top_stats']['mean']:.2f}, Median = {elig_scores['top_stats']['median']:.2f}")
    print(f"  Bottom Studies: Mean = {elig_scores['bottom_stats']['mean']:.2f}, Median = {elig_scores['bottom_stats']['median']:.2f}")
    print(f"  Significant Difference: {elig_scores['significant']}")

    # Most significant eligibility criteria: the three criteria with the
    # largest absolute top-vs-bottom difference.
    print("\nEligibility Criteria with Largest Differences:")
    criteria_results = analysis_results['eligibility_criteria']
    largest_differences = sorted(
        criteria_results.items(),
        key=lambda item: abs(item[1]['difference']),
        reverse=True
    )[:3]
    for criterion, result in largest_differences:
        criterion_name = criterion.replace('eligibility_', '')
        print(f"  {criterion_name}: Top = {result['top_prevalence']:.1f}%, Bottom = {result['bottom_prevalence']:.1f}%")
        print(f"    Difference = {result['difference']:.1f}%, Significant: {result['significant']}")

    # Study characteristics
    print("\nStudy Characteristics Comparison:")
    char_results = analysis_results['study_characteristics']
    for char, result in char_results.items():
        # Only entries carrying a 'difference' key are printed; the rest are
        # skipped.
        if 'difference' in result:  # Numeric characteristic
            print(f"  {char}: Top = {result['top_mean']:.2f}, Bottom = {result['bottom_mean']:.2f}")
            print(f"    Difference = {result['difference']:.2f}, Significant: {result['significant']}")

    # Geographic distribution
    geo_results = analysis_results['geographic_distribution']
    print("\nGeographic Distribution:")
    print(f"  Number of unique locations in top studies: {geo_results['num_top_locations']}")
    print(f"  Number of unique locations in bottom studies: {geo_results['num_bottom_locations']}")
    print(f"  Number of locations in both categories: {len(geo_results['common_locations'])}")


def main():
    """Run the complete analysis workflow.

    Steps, in order:
        1. Preprocess INPUT_DATA_PATH and write the top/bottom datasets to
           TOP20_CSV_PATH and BOTTOM20_CSV_PATH.
        2-8. Generate and save all figures (see _generate_figures).
        9. Run analyze_all_factors on the top/bottom datasets and print the
           key findings, followed by a fixed summary text.
    """
    print("Starting analysis of diversity in head and neck cancer clinical trials...")

    # Step 1: Preprocess data
    print("\nPreprocessing data...")
    df_final, df_top, df_bottom = preprocess_data(INPUT_DATA_PATH)

    # Save processed datasets to CSV
    df_top.to_csv(TOP20_CSV_PATH, index=False)
    df_bottom.to_csv(BOTTOM20_CSV_PATH, index=False)

    print("\nData processing complete.")
    print(f"Total studies analyzed: {len(df_final)}")
    print(f"Top diverse studies (top 20%): {len(df_top)}")
    print(f"Bottom diverse studies (bottom 20%): {len(df_bottom)}")

    # Steps 2-8: Generate and save all figures
    _generate_figures(df_final, df_top, df_bottom)

    # Step 9: Perform comprehensive analysis
    print("\nPerforming comprehensive statistical analysis...")
    analysis_results = analyze_all_factors(df_top, df_bottom)

    # Print key findings
    _print_key_findings(analysis_results)

    print("\nAnalysis complete. All figures saved to the 'plots' directory.")
    # NOTE: the summary below is static text; it is not derived from
    # analysis_results.
    print("\nSummary of findings:")
    print("1. More restrictive eligibility criteria are associated with lower diversity in clinical trials.")
    print("2. Geographic location appears to be a significant factor in trial diversity.")
    print("3. There are differences in participant demographics between high and low diversity studies.")
    print("4. Specific eligibility restrictions may disproportionately affect minority participation.")

# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()