Diff of /src/main.py [000000] .. [3cdecf]

Switch to unified view

a b/src/main.py
1
"""
Main Script for Diversity in Head and Neck Cancer Clinical Trials Analysis

This script orchestrates the complete analysis workflow by calling functions
from the data_processing, analysis, and visualization modules.

The analysis examines factors associated with diversity in head and neck cancer
clinical trials, with a focus on eligibility criteria and other study characteristics.

Diversity is measured as the percentage of non-white participants:
    Diversity Score = (# non-white participants) / (# total participants) × 100
"""
13
14
import os
15
import pandas as pd
16
import plotly.io as pio
17
18
from data_processing import preprocess_data, get_top_bottom_datasets
19
from analysis import analyze_all_factors
20
from visualization import (
21
    plot_cdf, 
22
    plot_box_by_category,
23
    plot_box_by_category_color,
24
    compare_field_by_category,
25
    create_geo_distribution_plot,
26
    create_participant_distribution_by_gender,
27
    create_eligibility_score_comparison
28
)
29
30
# File locations used by the workflow. All of them are relative paths, so they
# resolve against the current working directory of the Python process.
INPUT_DATA_PATH = "../all_studies.csv"  # raw study table; adjust if needed
TOP20_CSV_PATH = "../top_20_studies.csv"  # export target for the top group
BOTTOM20_CSV_PATH = "../bottom_20_studies.csv"  # export target for the bottom group
PLOTS_DIR = "../plots"  # every figure written by save_figure lands here

# Create the figure directory up front (no error if it is already there) so
# that later writes into it cannot fail on a missing folder.
os.makedirs(PLOTS_DIR, exist_ok=True)
38
39
def save_figure(fig, filename):
    """Write ``fig`` into PLOTS_DIR as ``<filename>.html`` and ``<filename>.png``.

    Args:
        fig: Figure object handed to ``pio.write_html`` and ``pio.write_image``.
        filename: Base name of the output files, without an extension.

    The HTML file is written first, then the PNG; a confirmation line naming
    both paths is printed afterwards. Errors raised by either writer are not
    caught here.
    """
    # Build both destinations before writing anything.
    targets = {
        extension: os.path.join(PLOTS_DIR, "{}.{}".format(filename, extension))
        for extension in ("html", "png")
    }

    # Interactive version for viewing in a browser.
    pio.write_html(fig, targets["html"])

    # Static version for reports/presentations.
    pio.write_image(fig, targets["png"])

    print("Saved figure to {} and {}".format(targets["html"], targets["png"]))
51
52
# Eligibility-criterion columns that each get their own Top20-vs-Bottom20 plot.
_ELIGIBILITY_FIELDS = (
    'eligibility_age_restrict',
    'eligibility_stage_size',
    'eligibility_site',
    'eligibility_histological_type',
    'eligibility_performance_score',
    'eligibility_comorbidities',
    'eligibility_hx_of_tt',
    'eligibility_lab_values',
    'eligibility_misc',
)


def _save_category_comparison(df_final, field, figure_name):
    """Plot ``field`` for the "Top20"/"Bottom20" categories and save the figure."""
    fig = compare_field_by_category(
        df_final,
        field,
        # A fresh list per call, exactly as each original call site passed one.
        categories=["Top20", "Bottom20"],
        height=600,
    )
    save_figure(fig, figure_name)


def _generate_figures(df_final, df_top, df_bottom):
    """Create and save every figure of the workflow (steps 2-8), in order."""
    # Step 2: Create success metric distribution plot
    print("\nGenerating diversity metric distribution plot...")
    fig_cdf = plot_cdf(
        df_final,
        "success_metric",
        title="Distribution of Diversity Success Metric",
        x_title="Success Metric: % Non-White participants",
        y_title="Fraction of studies",
    )
    save_figure(fig_cdf, "success_metric_cdf")

    # Step 3: Create eligibility score comparison plot
    print("\nGenerating eligibility score comparison plot...")
    fig_elig_score = create_eligibility_score_comparison(df_top, df_bottom)
    # The "eligbility" spelling is kept on purpose: renaming the output file
    # would break anything that already refers to it.
    save_figure(fig_elig_score, "box_plot_eligbility_score_diverse_vs_non_diverse")

    # Step 4: Create number of participants comparison plot
    print("\nGenerating participant count comparison plot...")
    # Stack both groups into one frame, labelled by the group they came from.
    df_top_bottom = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom"),
    ])
    fig_participants = plot_box_by_category(
        df_top_bottom,
        "num_participants",
        x_column="success_category",
        title="Number of Participants in Top vs Bottom Studies",
        x_title="Success Category",
        y_title="Number of Participants",
    )
    save_figure(fig_participants, "box_plot_num_participants_top_vs_bottom")

    # Step 5: Create gender distribution plot
    print("\nGenerating gender distribution plot...")
    fig_gender = create_participant_distribution_by_gender(df_top, df_bottom)
    save_figure(fig_gender, "distribution_num_participants_top_vs_bottom_studies_strat_gender")

    # Step 6: Create eligibility criteria comparison plots
    print("\nGenerating eligibility criteria comparison plots...")
    for field in _ELIGIBILITY_FIELDS:
        print(f"  - Generating plot for {field}...")
        field_name = field.replace('eligibility_', '')
        _save_category_comparison(df_final, field, f"distribution_{field_name}")

    # Step 7: Create single institution comparison plot
    print("\nGenerating institutional setting plot...")
    _save_category_comparison(
        df_final, "is_single_institution", "distribution_is_single_institution"
    )

    # Step 8: Create geographic distribution plot
    print("\nGenerating geographic distribution plot...")
    fig_geo = create_geo_distribution_plot(df_top, df_bottom)
    save_figure(fig_geo, "geo_distribution")


def _print_key_findings(analysis_results):
    """Print the "Key Findings" report from the mapping returned by the analysis.

    Reads the keys 'eligibility_scores', 'eligibility_criteria',
    'study_characteristics' and 'geographic_distribution'; a missing key
    raises KeyError.
    """
    print("\nKey Findings:")

    # Eligibility score comparison
    elig_scores = analysis_results['eligibility_scores']
    print("\nEligibility Score Comparison:")
    for label, stats_key in (("Top", 'top_stats'), ("Bottom", 'bottom_stats')):
        stats = elig_scores[stats_key]
        print(f"  {label} Studies: Mean = {stats['mean']:.2f}, Median = {stats['median']:.2f}")
    print(f"  Significant Difference: {elig_scores['significant']}")

    # The three eligibility criteria with the largest absolute difference
    print("\nEligibility Criteria with Largest Differences:")
    criteria_results = analysis_results['eligibility_criteria']
    ranked = sorted(
        criteria_results.items(),
        key=lambda item: abs(item[1]['difference']),
        reverse=True,
    )
    for criterion, result in ranked[:3]:
        criterion_name = criterion.replace('eligibility_', '')
        print(f"  {criterion_name}: Top = {result['top_prevalence']:.1f}%, Bottom = {result['bottom_prevalence']:.1f}%")
        print(f"    Difference = {result['difference']:.1f}%, Significant: {result['significant']}")

    # Study characteristics
    print("\nStudy Characteristics Comparison:")
    char_results = analysis_results['study_characteristics']
    for char, result in char_results.items():
        # Entries without a 'difference' key are skipped; only the remaining
        # (numeric) characteristics are reported.
        if 'difference' not in result:
            continue
        print(f"  {char}: Top = {result['top_mean']:.2f}, Bottom = {result['bottom_mean']:.2f}")
        print(f"    Difference = {result['difference']:.2f}, Significant: {result['significant']}")

    # Geographic distribution
    geo_results = analysis_results['geographic_distribution']
    print("\nGeographic Distribution:")
    print(f"  Number of unique locations in top studies: {geo_results['num_top_locations']}")
    print(f"  Number of unique locations in bottom studies: {geo_results['num_bottom_locations']}")
    print(f"  Number of locations in both categories: {len(geo_results['common_locations'])}")


def main():
    """Run the complete workflow: preprocess, export CSVs, plot, analyze, report.

    Reads INPUT_DATA_PATH, writes the top/bottom study tables to
    TOP20_CSV_PATH and BOTTOM20_CSV_PATH, saves all figures through
    save_figure, and prints progress messages and findings to stdout.
    Takes no arguments and returns None.
    """
    print("Starting analysis of diversity in head and neck cancer clinical trials...")

    # Step 1: Preprocess data
    print("\nPreprocessing data...")
    df_final, df_top, df_bottom = preprocess_data(INPUT_DATA_PATH)

    # Save processed datasets to CSV
    df_top.to_csv(TOP20_CSV_PATH, index=False)
    df_bottom.to_csv(BOTTOM20_CSV_PATH, index=False)

    print("\nData processing complete.")
    print(f"Total studies analyzed: {len(df_final)}")
    print(f"Top diverse studies (top 20%): {len(df_top)}")
    print(f"Bottom diverse studies (bottom 20%): {len(df_bottom)}")

    # Steps 2-8: Create and save all figures
    _generate_figures(df_final, df_top, df_bottom)

    # Step 9: Perform comprehensive analysis
    print("\nPerforming comprehensive statistical analysis...")
    analysis_results = analyze_all_factors(df_top, df_bottom)

    _print_key_findings(analysis_results)

    print("\nAnalysis complete. All figures saved to the 'plots' directory.")
    # NOTE(review): the four statements below are fixed text. They are printed
    # regardless of what analysis_results contains, so they should be checked
    # against the figures above before being quoted as results.
    print("\nSummary of findings:")
    print("1. More restrictive eligibility criteria are associated with lower diversity in clinical trials.")
    print("2. Geographic location appears to be a significant factor in trial diversity.")
    print("3. There are differences in participant demographics between high and low diversity studies.")
    print("4. Specific eligibility restrictions may disproportionately affect minority participation.")


if __name__ == "__main__":
    main()