|
a |
|
b/src/main.py |
|
|
1 |
""" |
|
|
2 |
Main Script for Diversity in Head and Neck Cancer Clinical Trials Analysis |
|
|
3 |
|
|
|
4 |
This script orchestrates the complete analysis workflow by calling functions |
|
|
5 |
from the data_processing, analysis, and visualization modules. |
|
|
6 |
|
|
|
7 |
The analysis examines factors associated with diversity in head and neck cancer |
|
|
8 |
clinical trials, with a focus on eligibility criteria and other study characteristics. |
|
|
9 |
|
|
|
10 |
Diversity is measured as the percentage of non-white participants: |
|
|
11 |
Diversity Score = (# non-white participants) / (# total participants) × 100 |
|
|
12 |
""" |
|
|
13 |
|
|
|
14 |
import os |
|
|
15 |
import pandas as pd |
|
|
16 |
import plotly.io as pio |
|
|
17 |
|
|
|
18 |
from data_processing import preprocess_data, get_top_bottom_datasets |
|
|
19 |
from analysis import analyze_all_factors |
|
|
20 |
from visualization import ( |
|
|
21 |
plot_cdf, |
|
|
22 |
plot_box_by_category, |
|
|
23 |
plot_box_by_category_color, |
|
|
24 |
compare_field_by_category, |
|
|
25 |
create_geo_distribution_plot, |
|
|
26 |
create_participant_distribution_by_gender, |
|
|
27 |
create_eligibility_score_comparison |
|
|
28 |
) |
|
|
29 |
|
|
|
30 |
# Locations of the input dataset and of every generated artifact. All of them
# are relative paths, so they resolve against the directory the script is
# launched from.
PLOTS_DIR = "../plots"
INPUT_DATA_PATH = "../all_studies.csv"  # Adjust if needed
TOP20_CSV_PATH = "../top_20_studies.csv"
BOTTOM20_CSV_PATH = "../bottom_20_studies.csv"

# Create the plots directory at import time so later figure writes have a
# destination; an already-existing directory is not an error.
os.makedirs(PLOTS_DIR, exist_ok=True)
|
|
38 |
|
|
|
39 |
def save_figure(fig, filename):
    """Write *fig* into ``PLOTS_DIR`` as both an HTML and a PNG file.

    Args:
        fig: Figure object handed to ``pio.write_html`` and
            ``pio.write_image``.
        filename: Base name of the output files, without extension; the
            ``.html`` and ``.png`` suffixes are appended here.

    Returns:
        None. The two output paths are reported on stdout.
    """
    # Same base name, one path per output format.
    html_path, png_path = (
        os.path.join(PLOTS_DIR, f"{filename}.{ext}") for ext in ("html", "png")
    )

    # Interactive version for viewing in a browser.
    pio.write_html(fig, html_path)

    # Static version for reports/presentations.
    # NOTE(review): static image export presumably needs an extra rendering
    # engine installed alongside plotly -- confirm for the target environment.
    pio.write_image(fig, png_path)

    print("Saved figure to {} and {}".format(html_path, png_path))
|
|
51 |
|
|
|
52 |
# Eligibility-criterion columns; each one gets its own Top20-vs-Bottom20 plot.
_ELIGIBILITY_FIELDS = (
    'eligibility_age_restrict',
    'eligibility_stage_size',
    'eligibility_site',
    'eligibility_histological_type',
    'eligibility_performance_score',
    'eligibility_comorbidities',
    'eligibility_hx_of_tt',
    'eligibility_lab_values',
    'eligibility_misc',
)


def _load_and_export_datasets():
    """Preprocess the input data and export the top/bottom subsets to CSV.

    Calls ``preprocess_data(INPUT_DATA_PATH)``, writes the second and third
    returned tables to ``TOP20_CSV_PATH`` and ``BOTTOM20_CSV_PATH`` (without
    the index), and prints the size of each table.

    Returns:
        tuple: ``(df_final, df_top, df_bottom)`` exactly as returned by
        ``preprocess_data`` (presumably pandas DataFrames -- they are used
        with ``to_csv``, ``assign`` and ``pd.concat``).
    """
    print("\nPreprocessing data...")
    df_final, df_top, df_bottom = preprocess_data(INPUT_DATA_PATH)

    # Persist the two subsets so they can be inspected outside this script.
    df_top.to_csv(TOP20_CSV_PATH, index=False)
    df_bottom.to_csv(BOTTOM20_CSV_PATH, index=False)

    print("\nData processing complete.")
    print(f"Total studies analyzed: {len(df_final)}")
    print(f"Top diverse studies (top 20%): {len(df_top)}")
    print(f"Bottom diverse studies (bottom 20%): {len(df_bottom)}")

    return df_final, df_top, df_bottom


def _save_field_comparison(df_final, field, output_name):
    """Plot *field* for the Top20/Bottom20 categories and save the figure."""
    fig = compare_field_by_category(
        df_final,
        field,
        categories=["Top20", "Bottom20"],
        height=600,
    )
    save_figure(fig, output_name)


def _generate_plots(df_final, df_top, df_bottom):
    """Create and save every figure of the workflow (steps 2-8).

    Args:
        df_final: Table of all analyzed studies.
        df_top: Subset of the most diverse studies.
        df_bottom: Subset of the least diverse studies.
    """
    # Step 2: Success metric distribution plot
    print("\nGenerating diversity metric distribution plot...")
    fig_cdf = plot_cdf(
        df_final,
        "success_metric",
        title="Distribution of Diversity Success Metric",
        x_title="Success Metric: % Non-White participants",
        y_title="Fraction of studies",
    )
    save_figure(fig_cdf, "success_metric_cdf")

    # Step 3: Eligibility score comparison plot
    print("\nGenerating eligibility score comparison plot...")
    fig_elig_score = create_eligibility_score_comparison(df_top, df_bottom)
    # NOTE: the "eligbility" spelling is kept on purpose so the output
    # filename stays the same as in earlier runs.
    save_figure(fig_elig_score, "box_plot_eligbility_score_diverse_vs_non_diverse")

    # Step 4: Number of participants comparison plot
    print("\nGenerating participant count comparison plot...")
    # Label each subset before stacking so the box plot can group by it.
    df_top_bottom = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom"),
    ])
    fig_participants = plot_box_by_category(
        df_top_bottom,
        "num_participants",
        x_column="success_category",
        title="Number of Participants in Top vs Bottom Studies",
        x_title="Success Category",
        y_title="Number of Participants",
    )
    save_figure(fig_participants, "box_plot_num_participants_top_vs_bottom")

    # Step 5: Gender distribution plot
    print("\nGenerating gender distribution plot...")
    fig_gender = create_participant_distribution_by_gender(df_top, df_bottom)
    save_figure(fig_gender, "distribution_num_participants_top_vs_bottom_studies_strat_gender")

    # Step 6: One comparison plot per eligibility criterion
    print("\nGenerating eligibility criteria comparison plots...")
    for field in _ELIGIBILITY_FIELDS:
        print(f" - Generating plot for {field}...")
        field_name = field.replace('eligibility_', '')
        _save_field_comparison(df_final, field, f"distribution_{field_name}")

    # Step 7: Single institution comparison plot
    print("\nGenerating institutional setting plot...")
    _save_field_comparison(
        df_final,
        "is_single_institution",
        "distribution_is_single_institution",
    )

    # Step 8: Geographic distribution plot
    print("\nGenerating geographic distribution plot...")
    fig_geo = create_geo_distribution_plot(df_top, df_bottom)
    save_figure(fig_geo, "geo_distribution")


def _print_key_findings(analysis_results):
    """Print the headline numbers from the statistical analysis.

    Args:
        analysis_results: Mapping read under the keys
            ``'eligibility_scores'``, ``'eligibility_criteria'``,
            ``'study_characteristics'`` and ``'geographic_distribution'``.

    Raises:
        KeyError: If one of the keys read below is missing.
    """
    print("\nKey Findings:")

    # Eligibility score comparison
    elig_scores = analysis_results['eligibility_scores']
    print("\nEligibility Score Comparison:")
    print(
        f" Top Studies: Mean = {elig_scores['top_stats']['mean']:.2f}, "
        f"Median = {elig_scores['top_stats']['median']:.2f}"
    )
    print(
        f" Bottom Studies: Mean = {elig_scores['bottom_stats']['mean']:.2f}, "
        f"Median = {elig_scores['bottom_stats']['median']:.2f}"
    )
    print(f" Significant Difference: {elig_scores['significant']}")

    # The three criteria with the largest absolute top-vs-bottom difference
    print("\nEligibility Criteria with Largest Differences:")
    criteria_results = analysis_results['eligibility_criteria']
    ranked_criteria = sorted(
        criteria_results.items(),
        key=lambda item: abs(item[1]['difference']),
        reverse=True,
    )
    for criterion, result in ranked_criteria[:3]:
        criterion_name = criterion.replace('eligibility_', '')
        print(
            f" {criterion_name}: Top = {result['top_prevalence']:.1f}%, "
            f"Bottom = {result['bottom_prevalence']:.1f}%"
        )
        print(f" Difference = {result['difference']:.1f}%, Significant: {result['significant']}")

    # Study characteristics
    print("\nStudy Characteristics Comparison:")
    char_results = analysis_results['study_characteristics']
    for char, result in char_results.items():
        # Entries without a 'difference' key are skipped; only the ones
        # treated as numeric characteristics are reported.
        if 'difference' not in result:
            continue
        print(f" {char}: Top = {result['top_mean']:.2f}, Bottom = {result['bottom_mean']:.2f}")
        print(f" Difference = {result['difference']:.2f}, Significant: {result['significant']}")

    # Geographic distribution
    geo_results = analysis_results['geographic_distribution']
    print("\nGeographic Distribution:")
    print(f" Number of unique locations in top studies: {geo_results['num_top_locations']}")
    print(f" Number of unique locations in bottom studies: {geo_results['num_bottom_locations']}")
    print(f" Number of locations in both categories: {len(geo_results['common_locations'])}")


def main():
    """Run the full analysis workflow.

    Preprocesses the study data and exports the top/bottom subsets, generates
    and saves all figures, runs ``analyze_all_factors`` on the two subsets,
    and prints the key findings followed by a summary.
    """
    print("Starting analysis of diversity in head and neck cancer clinical trials...")

    # Step 1: Preprocess data
    df_final, df_top, df_bottom = _load_and_export_datasets()

    # Steps 2-8: Figures
    _generate_plots(df_final, df_top, df_bottom)

    # Step 9: Perform comprehensive analysis
    print("\nPerforming comprehensive statistical analysis...")
    analysis_results = analyze_all_factors(df_top, df_bottom)
    _print_key_findings(analysis_results)

    print("\nAnalysis complete. All figures saved to the 'plots' directory.")

    # NOTE(review): the summary below is fixed text; it is not derived from
    # analysis_results, so it prints the same conclusions for any input.
    print("\nSummary of findings:")
    print("1. More restrictive eligibility criteria are associated with lower diversity in clinical trials.")
    print("2. Geographic location appears to be a significant factor in trial diversity.")
    print("3. There are differences in participant demographics between high and low diversity studies.")
    print("4. Specific eligibility restrictions may disproportionately affect minority participation.")
|
|
189 |
|
|
|
190 |
# Script entry point: run the workflow only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()