"""
Main Script for Diversity in Head and Neck Cancer Clinical Trials Analysis
This script orchestrates the complete analysis workflow by calling functions
from the data_processing, analysis, and visualization modules.
The analysis examines factors associated with diversity in head and neck cancer
clinical trials, with a focus on eligibility criteria and other study characteristics.
Diversity is measured as the percentage of non-white participants:
Diversity Score = (# non-white participants) / (# total participants) × 100
"""
import os
import pandas as pd
import plotly.io as pio
from data_processing import preprocess_data, get_top_bottom_datasets
from analysis import analyze_all_factors
from visualization import (
plot_cdf,
plot_box_by_category,
plot_box_by_category_color,
compare_field_by_category,
create_geo_distribution_plot,
create_participant_distribution_by_gender,
create_eligibility_score_comparison
)
# Define input and output paths.
# NOTE: every path below is relative, so it is resolved against the current
# working directory of the process that runs this script.
# CSV passed to preprocess_data() as the raw study table.
INPUT_DATA_PATH = "../all_studies.csv" # Adjust if needed
# Destination CSV for the "top" subset returned by preprocess_data().
TOP20_CSV_PATH = "../top_20_studies.csv"
# Destination CSV for the "bottom" subset returned by preprocess_data().
BOTTOM20_CSV_PATH = "../bottom_20_studies.csv"
# Directory into which save_figure() writes every HTML/PNG figure.
PLOTS_DIR = "../plots"
# Ensure plots directory exists. This runs at import time; exist_ok=True makes
# it a no-op when the directory is already present.
os.makedirs(PLOTS_DIR, exist_ok=True)
def save_figure(fig, filename):
    """Write *fig* into PLOTS_DIR as ``<filename>.html`` and ``<filename>.png``.

    The HTML export is the interactive version of the figure; the PNG export
    is a static image intended for reports/presentations.  After both files
    have been written, a confirmation line naming the two paths is printed.

    Args:
        fig: The plotly figure to export.
        filename: Base name of the output files, without extension.
    """
    # Build both destination paths up front, keyed by file extension.
    out_paths = {
        ext: os.path.join(PLOTS_DIR, f"{filename}.{ext}")
        for ext in ("html", "png")
    }

    # Interactive export first, then the static image.
    pio.write_html(fig, out_paths["html"])
    pio.write_image(fig, out_paths["png"])

    print(f"Saved figure to {out_paths['html']} and {out_paths['png']}")
# Eligibility-criterion columns that each get their own Top20-vs-Bottom20
# distribution plot in step 6 of the workflow.
_ELIGIBILITY_FIELDS = (
    'eligibility_age_restrict',
    'eligibility_stage_size',
    'eligibility_site',
    'eligibility_histological_type',
    'eligibility_performance_score',
    'eligibility_comorbidities',
    'eligibility_hx_of_tt',
    'eligibility_lab_values',
    'eligibility_misc',
)


def _save_field_comparison(df_final, field, filename):
    """Plot *field* for the "Top20"/"Bottom20" categories and save the figure."""
    fig = compare_field_by_category(
        df_final,
        field,
        categories=["Top20", "Bottom20"],
        height=600,
    )
    save_figure(fig, filename)


def _generate_plots(df_final, df_top, df_bottom):
    """Create and save every figure of the workflow (steps 2-8).

    Args:
        df_final: Full processed study table returned by preprocess_data().
        df_top: Subset of studies reported as the "top diverse" group.
        df_bottom: Subset of studies reported as the "bottom diverse" group.
    """
    # Step 2: Create success metric distribution plot
    print("\nGenerating diversity metric distribution plot...")
    fig_cdf = plot_cdf(
        df_final,
        "success_metric",
        title="Distribution of Diversity Success Metric",
        x_title="Success Metric: % Non-White participants",
        y_title="Fraction of studies",
    )
    save_figure(fig_cdf, "success_metric_cdf")

    # Step 3: Create eligibility score comparison plot
    print("\nGenerating eligibility score comparison plot...")
    fig_elig_score = create_eligibility_score_comparison(df_top, df_bottom)
    # NOTE(review): "eligbility" is misspelled, but the name is kept as-is so
    # the output file path does not change for existing consumers.
    save_figure(fig_elig_score, "box_plot_eligbility_score_diverse_vs_non_diverse")

    # Step 4: Create number of participants comparison plot
    print("\nGenerating participant count comparison plot...")
    # Stack both subsets into one frame, tagging each row with its group so
    # the box plot can split on "success_category".
    df_top_bottom = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom"),
    ])
    fig_participants = plot_box_by_category(
        df_top_bottom,
        "num_participants",
        x_column="success_category",
        title="Number of Participants in Top vs Bottom Studies",
        x_title="Success Category",
        y_title="Number of Participants",
    )
    save_figure(fig_participants, "box_plot_num_participants_top_vs_bottom")

    # Step 5: Create gender distribution plot
    print("\nGenerating gender distribution plot...")
    fig_gender = create_participant_distribution_by_gender(df_top, df_bottom)
    save_figure(fig_gender, "distribution_num_participants_top_vs_bottom_studies_strat_gender")

    # Step 6: Create eligibility criteria comparison plots
    print("\nGenerating eligibility criteria comparison plots...")
    for field in _ELIGIBILITY_FIELDS:
        print(f" - Generating plot for {field}...")
        # Output files are named after the criterion without its prefix.
        field_name = field.replace('eligibility_', '')
        _save_field_comparison(df_final, field, f"distribution_{field_name}")

    # Step 7: Create single institution comparison plot
    print("\nGenerating institutional setting plot...")
    _save_field_comparison(
        df_final,
        "is_single_institution",
        "distribution_is_single_institution",
    )

    # Step 8: Create geographic distribution plot
    print("\nGenerating geographic distribution plot...")
    fig_geo = create_geo_distribution_plot(df_top, df_bottom)
    save_figure(fig_geo, "geo_distribution")


def _print_key_findings(analysis_results):
    """Print a console report of the results returned by analyze_all_factors().

    Reads the 'eligibility_scores', 'eligibility_criteria',
    'study_characteristics' and 'geographic_distribution' entries of
    *analysis_results*.
    """
    print("\nKey Findings:")

    # Eligibility score comparison
    elig_scores = analysis_results['eligibility_scores']
    print("\nEligibility Score Comparison:")
    top_stats = elig_scores['top_stats']
    print(f" Top Studies: Mean = {top_stats['mean']:.2f}, Median = {top_stats['median']:.2f}")
    bottom_stats = elig_scores['bottom_stats']
    print(f" Bottom Studies: Mean = {bottom_stats['mean']:.2f}, Median = {bottom_stats['median']:.2f}")
    print(f" Significant Difference: {elig_scores['significant']}")

    # Most significant eligibility criteria: the three with the largest
    # absolute top-vs-bottom difference.
    print("\nEligibility Criteria with Largest Differences:")
    criteria_results = analysis_results['eligibility_criteria']
    ranked_criteria = sorted(
        criteria_results.items(),
        key=lambda item: abs(item[1]['difference']),
        reverse=True,
    )
    for criterion, result in ranked_criteria[:3]:
        criterion_name = criterion.replace('eligibility_', '')
        print(f" {criterion_name}: Top = {result['top_prevalence']:.1f}%, Bottom = {result['bottom_prevalence']:.1f}%")
        print(f" Difference = {result['difference']:.1f}%, Significant: {result['significant']}")

    # Study characteristics
    print("\nStudy Characteristics Comparison:")
    char_results = analysis_results['study_characteristics']
    for char, result in char_results.items():
        # Only entries carrying a 'difference' key (numeric characteristics)
        # are reported; all others are skipped.
        if 'difference' not in result:
            continue
        print(f" {char}: Top = {result['top_mean']:.2f}, Bottom = {result['bottom_mean']:.2f}")
        print(f" Difference = {result['difference']:.2f}, Significant: {result['significant']}")

    # Geographic distribution
    geo_results = analysis_results['geographic_distribution']
    print("\nGeographic Distribution:")
    print(f" Number of unique locations in top studies: {geo_results['num_top_locations']}")
    print(f" Number of unique locations in bottom studies: {geo_results['num_bottom_locations']}")
    print(f" Number of locations in both categories: {len(geo_results['common_locations'])}")


def main():
    """Run the complete analysis workflow.

    Steps, in order:
      1. Preprocess INPUT_DATA_PATH and write the top/bottom subsets to
         TOP20_CSV_PATH / BOTTOM20_CSV_PATH.
      2-8. Generate and save all figures via save_figure().
      9. Run analyze_all_factors() on the two subsets and print the results.

    Progress and results are reported on stdout; nothing is returned.
    """
    print("Starting analysis of diversity in head and neck cancer clinical trials...")

    # Step 1: Preprocess data
    print("\nPreprocessing data...")
    df_final, df_top, df_bottom = preprocess_data(INPUT_DATA_PATH)

    # Save processed datasets to CSV
    df_top.to_csv(TOP20_CSV_PATH, index=False)
    df_bottom.to_csv(BOTTOM20_CSV_PATH, index=False)

    print("\nData processing complete.")
    print(f"Total studies analyzed: {len(df_final)}")
    print(f"Top diverse studies (top 20%): {len(df_top)}")
    print(f"Bottom diverse studies (bottom 20%): {len(df_bottom)}")

    # Steps 2-8: figures
    _generate_plots(df_final, df_top, df_bottom)

    # Step 9: Perform comprehensive analysis
    print("\nPerforming comprehensive statistical analysis...")
    analysis_results = analyze_all_factors(df_top, df_bottom)
    _print_key_findings(analysis_results)

    print("\nAnalysis complete. All figures saved to the 'plots' directory.")

    # NOTE(review): the summary below is static text; it is NOT derived from
    # analysis_results and is printed even when the tests above report no
    # significant difference. Confirm it against the printed statistics
    # before quoting it.
    print("\nSummary of findings:")
    print("1. More restrictive eligibility criteria are associated with lower diversity in clinical trials.")
    print("2. Geographic location appears to be a significant factor in trial diversity.")
    print("3. There are differences in participant demographics between high and low diversity studies.")
    print("4. Specific eligibility restrictions may disproportionately affect minority participation.")
# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()