[3cdecf]: / src / main.py

Download this file

191 lines (160 with data), 8.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""
Main Script for Diversity in Head and Neck Cancer Clinical Trials Analysis
This script orchestrates the complete analysis workflow by calling functions
from the data_processing, analysis, and visualization modules.
The analysis examines factors associated with diversity in head and neck cancer
clinical trials, with a focus on eligibility criteria and other study characteristics.
Diversity is measured as the percentage of non-white participants:
Diversity Score = (# non-white participants) / (# total participants) × 100
"""
import os
import pandas as pd
import plotly.io as pio
from data_processing import preprocess_data, get_top_bottom_datasets
from analysis import analyze_all_factors
from visualization import (
plot_cdf,
plot_box_by_category,
plot_box_by_category_color,
compare_field_by_category,
create_geo_distribution_plot,
create_participant_distribution_by_gender,
create_eligibility_score_comparison
)
# Input and output locations. All are relative paths, so they resolve against
# the current working directory of the process running this script.
# INPUT_DATA_PATH is the CSV handed to preprocess_data() in main().
INPUT_DATA_PATH = "../all_studies.csv" # Adjust if needed
# CSV files main() writes for the top and bottom subsets of studies.
TOP20_CSV_PATH = "../top_20_studies.csv"
BOTTOM20_CSV_PATH = "../bottom_20_studies.csv"
# Directory save_figure() writes every HTML/PNG figure into.
PLOTS_DIR = "../plots"
# Create the plots directory up front. This runs when the module is loaded
# (also on import); exist_ok=True makes it a no-op if the directory exists.
os.makedirs(PLOTS_DIR, exist_ok=True)
def save_figure(fig, filename):
    """Write ``fig`` into ``PLOTS_DIR`` as an HTML file and as a PNG file.

    Args:
        fig: Figure object passed to ``plotly.io.write_html`` and
            ``plotly.io.write_image``.
        filename: Base name without extension; ``.html`` and ``.png`` are
            appended to it.

    Returns:
        None. The two output paths are reported on stdout.

    NOTE(review): plotly's static image export generally relies on an extra
    rendering engine (e.g. kaleido) -- confirm it is installed where this runs.
    """
    # Build both targets from the same base name so they always stay paired.
    html_target, png_target = (
        os.path.join(PLOTS_DIR, f"{filename}.{extension}")
        for extension in ("html", "png")
    )

    # HTML keeps the figure interactive; PNG is the static copy.
    pio.write_html(fig, html_target)
    pio.write_image(fig, png_target)

    print(f"Saved figure to {html_target} and {png_target}")
def main():
    """Run the whole workflow: preprocess, plot, analyze, and report.

    Steps, in order:
      1. Preprocess ``INPUT_DATA_PATH`` and write the top and bottom subsets
         to ``TOP20_CSV_PATH`` and ``BOTTOM20_CSV_PATH``.
      2-8. Build each figure and persist it through ``save_figure``.
      9. Call ``analyze_all_factors`` and print selected results to stdout.

    Takes no arguments and returns ``None``; every result is emitted as a
    file or as console output.
    """
    print("Starting analysis of diversity in head and neck cancer clinical trials...")

    # Step 1: preprocessing yields the full table plus its top/bottom subsets.
    print("\nPreprocessing data...")
    all_studies, top_studies, bottom_studies = preprocess_data(INPUT_DATA_PATH)

    top_studies.to_csv(TOP20_CSV_PATH, index=False)
    bottom_studies.to_csv(BOTTOM20_CSV_PATH, index=False)

    print("\nData processing complete.")
    print(f"Total studies analyzed: {len(all_studies)}")
    print(f"Top diverse studies (top 20%): {len(top_studies)}")
    print(f"Bottom diverse studies (bottom 20%): {len(bottom_studies)}")

    # Step 2: cumulative distribution of the success metric over all studies.
    print("\nGenerating diversity metric distribution plot...")
    save_figure(
        plot_cdf(
            all_studies,
            "success_metric",
            title="Distribution of Diversity Success Metric",
            x_title="Success Metric: % Non-White participants",
            y_title="Fraction of studies",
        ),
        "success_metric_cdf",
    )

    # Step 3: eligibility score, top vs bottom.
    # NOTE(review): "eligbility" is misspelled in the output name; it is kept
    # verbatim here so the generated file names do not change.
    print("\nGenerating eligibility score comparison plot...")
    save_figure(
        create_eligibility_score_comparison(top_studies, bottom_studies),
        "box_plot_eligbility_score_diverse_vs_non_diverse",
    )

    # Step 4: participant counts; tag each subset, then stack them into one
    # frame so the box plot can group on the tag column.
    print("\nGenerating participant count comparison plot...")
    tagged_studies = pd.concat([
        subset.assign(success_category=tag)
        for tag, subset in (("top", top_studies), ("bottom", bottom_studies))
    ])
    save_figure(
        plot_box_by_category(
            tagged_studies,
            "num_participants",
            x_column="success_category",
            title="Number of Participants in Top vs Bottom Studies",
            x_title="Success Category",
            y_title="Number of Participants",
        ),
        "box_plot_num_participants_top_vs_bottom",
    )

    # Step 5: participant distribution stratified by gender.
    print("\nGenerating gender distribution plot...")
    save_figure(
        create_participant_distribution_by_gender(top_studies, bottom_studies),
        "distribution_num_participants_top_vs_bottom_studies_strat_gender",
    )

    def compare_and_save(column):
        """Plot ``column`` for the Top20/Bottom20 categories and save it.

        The file is named ``distribution_<column>`` with any ``eligibility_``
        text removed from the column name.
        """
        figure = compare_field_by_category(
            all_studies,
            column,
            categories=["Top20", "Bottom20"],
            height=600,
        )
        save_figure(figure, f"distribution_{column.replace('eligibility_', '')}")

    # Step 6: one comparison plot per eligibility criterion column.
    print("\nGenerating eligibility criteria comparison plots...")
    criterion_columns = (
        'eligibility_age_restrict',
        'eligibility_stage_size',
        'eligibility_site',
        'eligibility_histological_type',
        'eligibility_performance_score',
        'eligibility_comorbidities',
        'eligibility_hx_of_tt',
        'eligibility_lab_values',
        'eligibility_misc',
    )
    for column in criterion_columns:
        print(f" - Generating plot for {column}...")
        compare_and_save(column)

    # Step 7: same comparison for the single-institution flag; the column has
    # no "eligibility_" text, so the file is distribution_is_single_institution.
    print("\nGenerating institutional setting plot...")
    compare_and_save("is_single_institution")

    # Step 8: geographic distribution, top vs bottom.
    print("\nGenerating geographic distribution plot...")
    save_figure(
        create_geo_distribution_plot(top_studies, bottom_studies),
        "geo_distribution",
    )

    # Step 9: statistical comparison of the two subsets.
    print("\nPerforming comprehensive statistical analysis...")
    findings = analyze_all_factors(top_studies, bottom_studies)

    print("\nKey Findings:")

    # Eligibility score: one summary line per subset, then the significance flag.
    score_summary = findings['eligibility_scores']
    print("\nEligibility Score Comparison:")
    for label, stats_key in (("Top", 'top_stats'), ("Bottom", 'bottom_stats')):
        stats = score_summary[stats_key]
        print(f" {label} Studies: Mean = {stats['mean']:.2f}, Median = {stats['median']:.2f}")
    print(f" Significant Difference: {score_summary['significant']}")

    # Eligibility criteria: report the three with the largest absolute difference.
    print("\nEligibility Criteria with Largest Differences:")
    per_criterion = findings['eligibility_criteria']

    def absolute_difference(entry):
        """Sort key: magnitude of the 'difference' value of a (name, result) pair."""
        return abs(entry[1]['difference'])

    ranked = sorted(per_criterion.items(), key=absolute_difference, reverse=True)
    for name, outcome in ranked[:3]:
        short_name = name.replace('eligibility_', '')
        print(f" {short_name}: Top = {outcome['top_prevalence']:.1f}%, Bottom = {outcome['bottom_prevalence']:.1f}%")
        print(f" Difference = {outcome['difference']:.1f}%, Significant: {outcome['significant']}")

    # Study characteristics: only entries that carry a 'difference' key are
    # printed (the original author labels these the numeric characteristics).
    print("\nStudy Characteristics Comparison:")
    per_characteristic = findings['study_characteristics']
    for name, outcome in per_characteristic.items():
        if 'difference' not in outcome:
            continue
        print(f" {name}: Top = {outcome['top_mean']:.2f}, Bottom = {outcome['bottom_mean']:.2f}")
        print(f" Difference = {outcome['difference']:.2f}, Significant: {outcome['significant']}")

    # Geographic distribution: location counts per subset and their overlap.
    geo_summary = findings['geographic_distribution']
    print("\nGeographic Distribution:")
    print(f" Number of unique locations in top studies: {geo_summary['num_top_locations']}")
    print(f" Number of unique locations in bottom studies: {geo_summary['num_bottom_locations']}")
    print(f" Number of locations in both categories: {len(geo_summary['common_locations'])}")

    print("\nAnalysis complete. All figures saved to the 'plots' directory.")

    # NOTE(review): the statements below are fixed text; they are printed as-is
    # and are not computed from `findings`.
    print("\nSummary of findings:")
    for statement in (
        "1. More restrictive eligibility criteria are associated with lower diversity in clinical trials.",
        "2. Geographic location appears to be a significant factor in trial diversity.",
        "3. There are differences in participant demographics between high and low diversity studies.",
        "4. Specific eligibility restrictions may disproportionately affect minority participation.",
    ):
        print(statement)
# Run the workflow only when this file is executed as a script; importing the
# module defines main() without starting the analysis.
if __name__ == "__main__":
    main()