"""
Analysis Module for Diversity in Head and Neck Cancer Clinical Trials
This module contains functions for analyzing the factors that contribute to
diversity in head and neck cancer clinical trials, with a focus on eligibility
criteria and other study characteristics.
"""
import pandas as pd
import numpy as np
from scipy import stats
def compare_eligibility_scores(df_top, df_bottom):
    """
    Compare eligibility scores between top and bottom diverse studies.

    The eligibility score is the sum of binary flags for all eligibility criteria,
    with higher scores indicating more restrictive criteria.

    Missing scores are dropped from each group before anything is computed, so
    the descriptive statistics and the statistical test are always based on the
    same observations.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies; must contain
            an 'eligibility_score' column.
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies; must
            contain an 'eligibility_score' column.

    Returns:
        dict: Dictionary with the keys
            - 'top_stats' / 'bottom_stats': dicts holding 'mean', 'median',
              'std' (population standard deviation, ddof=0), 'min' and 'max';
              every entry is NaN when the group has no non-missing score
            - 'u_statistic' / 'p_value': result of a two-sided Mann-Whitney U
              test; NaN when either group has no non-missing score
            - 'significant': True when p_value < 0.05, otherwise False

    Raises:
        KeyError: If either dataset lacks the 'eligibility_score' column.
    """
    nan = float('nan')

    def describe(scores):
        # An empty group has no meaningful summary; report NaN everywhere
        # instead of relying on numpy's empty-slice warnings.
        if len(scores) == 0:
            return {'mean': nan, 'median': nan, 'std': nan, 'min': nan, 'max': nan}
        return {
            'mean': np.mean(scores),
            'median': np.median(scores),
            'std': np.std(scores),
            'min': np.min(scores),
            'max': np.max(scores),
        }

    # Drop missing values once so that every statistic below sees the same
    # data (np.median and the U test would otherwise propagate NaN while the
    # other reductions silently skip it).
    top_scores = df_top['eligibility_score'].dropna()
    bottom_scores = df_bottom['eligibility_score'].dropna()

    # The Mann-Whitney U test is undefined when a group is empty.
    if len(top_scores) > 0 and len(bottom_scores) > 0:
        u_stat, p_value = stats.mannwhitneyu(
            top_scores, bottom_scores, alternative='two-sided'
        )
    else:
        u_stat = p_value = nan

    return {
        'top_stats': describe(top_scores),
        'bottom_stats': describe(bottom_scores),
        'u_statistic': u_stat,
        'p_value': p_value,
        # NaN compares False, so an undefined test is never "significant".
        'significant': bool(p_value < 0.05),
    }
def compare_eligibility_criteria(df_top, df_bottom):
    """
    Compare the prevalence of specific eligibility criteria between top and bottom diverse studies.

    Each criterion is read from a binary flag column (1 = restriction present,
    0 = absent). The columns examined are:
    - eligibility_age_restrict: 0 if the restriction is age>18, 1 for other
      restrictions (e.g., 18<age<75)
    - eligibility_stage_size: Restrictions on the cancer stage and the size of the tumor
    - eligibility_site: Restrictions on the cancer site
    - eligibility_histological_type: Whether the study was limited to SCC or any other type
    - eligibility_performance_score: Restrictions on performance score (e.g., ECOG performance)
    - eligibility_comorbidities: Restrictions on comorbidities
    - eligibility_hx_of_tt: Restrictions on treatment history for cancer
    - eligibility_lab_values: Restrictions on lab test values
    - eligibility_pregnancy_or_contraception: Restrictions on pregnancy or
      particular contraceptives
    - eligibility_misc: Other restrictions (smoking status, ethnicity requirements)

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping each criterion column name to a dict with
            - 'top_prevalence' / 'bottom_prevalence': column mean times 100
            - 'difference': top_prevalence minus bottom_prevalence
            - 'p_value': two-sided Fisher's exact test on the 2x2 table of
              group versus flag value (only entries equal to 1 or 0 are counted)
            - 'significant': True when p_value < 0.05

    Raises:
        KeyError: If either dataset lacks one of the criterion columns.
    """
    # Define eligibility criteria fields to compare
    criteria_fields = [
        'eligibility_age_restrict',
        'eligibility_stage_size',
        'eligibility_site',
        'eligibility_histological_type',
        'eligibility_performance_score',
        'eligibility_comorbidities',
        'eligibility_hx_of_tt',
        'eligibility_lab_values',
        'eligibility_pregnancy_or_contraception',
        'eligibility_misc',
    ]

    results = {}
    for criterion in criteria_fields:
        top_flags = df_top[criterion]
        bottom_flags = df_bottom[criterion]

        # Prevalence (percentage of studies with the flag set) in each group
        top_prevalence = top_flags.mean() * 100
        bottom_prevalence = bottom_flags.mean() * 100

        # Count the cells explicitly so the table is always 2x2. A crosstab
        # only emits rows for values that actually occur, so a criterion that
        # is present (or absent) in every study would yield a 1x2 table that
        # Fisher's exact test rejects.
        contingency_table = [
            [int((top_flags == 1).sum()), int((top_flags == 0).sum())],
            [int((bottom_flags == 1).sum()), int((bottom_flags == 0).sum())],
        ]
        _, p_value = stats.fisher_exact(contingency_table)

        results[criterion] = {
            'top_prevalence': top_prevalence,
            'bottom_prevalence': bottom_prevalence,
            'difference': top_prevalence - bottom_prevalence,
            'p_value': p_value,
            'significant': bool(p_value < 0.05),
        }
    return results
def compare_study_characteristics(df_top, df_bottom):
    """
    Compare general study characteristics between top and bottom diverse studies.

    The following columns are examined when present in BOTH datasets (columns
    missing from either dataset are skipped silently):
    num_sites, is_single_institution, num_participants, num_male_participants,
    num_female_participants, modalities, trial_type, trial_phase, tumor_type,
    cancer_site, study_start_date, apc_date.

    A column is treated as numeric when it has a non-boolean numeric dtype in
    both datasets; every other column is treated as categorical.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping characteristic names to comparison results.
            Numeric entries hold 'top_mean', 'bottom_mean' and 'difference';
            categorical entries hold 'top_distribution' and
            'bottom_distribution' (category -> relative frequency). Both kinds
            also hold 'test' ('Mann-Whitney U', 'Chi-square' or
            'Not performed'), 'statistic', 'p_value' and 'significant'; the
            last three are None when the test could not be performed.
    """
    # Define characteristics to compare
    characteristics = [
        'num_sites',
        'is_single_institution',
        'num_participants',
        'num_male_participants',
        'num_female_participants',
        'modalities',
        'trial_type',
        'trial_phase',
        'tumor_type',
        'cancer_site',
        'study_start_date',
        'apc_date',
    ]
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype

    results = {}
    for char in characteristics:
        if char not in df_top.columns or char not in df_bottom.columns:
            continue

        top_col = df_top[char]
        bottom_col = df_bottom[char]

        # Accept any numeric dtype (int32, float32, nullable ints, ...), not
        # just int64/float64, and require it in both groups. Boolean columns
        # stay categorical.
        numeric = all(
            is_numeric(col) and not is_bool(col) for col in (top_col, bottom_col)
        )

        if numeric:
            top_mean = top_col.mean()
            bottom_mean = bottom_col.mean()
            entry = {
                'top_mean': top_mean,
                'bottom_mean': bottom_mean,
                'difference': top_mean - bottom_mean,
            }
            top_values = top_col.dropna()
            bottom_values = bottom_col.dropna()
            if len(top_values) > 0 and len(bottom_values) > 0:
                # Numeric characteristic - use Mann-Whitney U test
                u_stat, p_value = stats.mannwhitneyu(
                    top_values, bottom_values, alternative='two-sided'
                )
                entry.update({
                    'test': 'Mann-Whitney U',
                    'statistic': u_stat,
                    'p_value': p_value,
                    'significant': bool(p_value < 0.05),
                })
            else:
                # The test is undefined when a group has no observed values.
                entry.update({
                    'test': 'Not performed',
                    'statistic': None,
                    'p_value': None,
                    'significant': None,
                })
            results[char] = entry
            continue

        # Categorical characteristic - relative frequency of each category
        entry = {
            'top_distribution': top_col.value_counts(normalize=True).to_dict(),
            'bottom_distribution': bottom_col.value_counts(normalize=True).to_dict(),
        }
        try:
            contingency_table = pd.crosstab(
                pd.Series(top_col.tolist() + bottom_col.tolist()),
                pd.Series(['top'] * len(df_top) + ['bottom'] * len(df_bottom))
            )
            chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
        except (ValueError, np.linalg.LinAlgError):
            # Handle cases where Chi-square test cannot be performed
            entry.update({
                'test': 'Not performed',
                'statistic': None,
                'p_value': None,
                'significant': None,
            })
        else:
            entry.update({
                'test': 'Chi-square',
                'statistic': chi2,
                'p_value': p_value,
                'significant': bool(p_value < 0.05),
            })
        results[char] = entry
    return results
def analyze_geographic_distribution(df_top, df_bottom):
    """
    Summarize where the top and bottom diverse studies are located.

    Studies are counted per value of the 'location' column in each dataset,
    and the location names are split into those shared by both groups and
    those that occur in only one of them.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Per-group location counts ('top_locations', 'bottom_locations'),
            lists of location names ('all_locations', 'common_locations',
            'top_only_locations', 'bottom_only_locations'; list order is not
            defined) and the number of distinct locations per group
            ('num_top_locations', 'num_bottom_locations'). If the 'location'
            column is missing from either dataset, a dict with a single
            'error' message is returned instead.
    """
    location_field = 'location'

    # Bail out early when either dataset has no location column.
    has_field = location_field in df_top.columns and location_field in df_bottom.columns
    if not has_field:
        return {
            'error': f"Location field '{location_field}' not found in datasets"
        }

    # Number of studies per location in each group
    top_counts = df_top[location_field].value_counts().to_dict()
    bottom_counts = df_bottom[location_field].value_counts().to_dict()

    # Distinct location names per group, used for the overlap breakdown
    top_names = set(top_counts)
    bottom_names = set(bottom_counts)

    return {
        'top_locations': top_counts,
        'bottom_locations': bottom_counts,
        'all_locations': list(top_names | bottom_names),
        'common_locations': list(top_names & bottom_names),
        'top_only_locations': list(top_names - bottom_names),
        'bottom_only_locations': list(bottom_names - top_names),
        'num_top_locations': len(top_counts),
        'num_bottom_locations': len(bottom_counts)
    }
def analyze_institutional_setting(df_top, df_bottom):
    """
    Analyze the impact of institutional setting (single vs. multi-institution) on diversity.

    The share of single-institution studies (mean of the binary
    'is_single_institution' column, times 100) is computed for each group and
    the two groups are compared with a two-sided Fisher's exact test.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with 'top_single_institution_percent',
            'bottom_single_institution_percent', 'difference' (top minus
            bottom), 'p_value' and 'significant' (True when p_value < 0.05).
            If the 'is_single_institution' column is missing from either
            dataset, a dict with a single 'error' message is returned instead.
    """
    institution_field = 'is_single_institution'
    if institution_field not in df_top.columns or institution_field not in df_bottom.columns:
        return {
            'error': f"Institution field '{institution_field}' not found in datasets"
        }

    top_flags = df_top[institution_field]
    bottom_flags = df_bottom[institution_field]

    # Calculate prevalence of single-institution studies in each group
    top_single_inst_perc = top_flags.mean() * 100
    bottom_single_inst_perc = bottom_flags.mean() * 100

    # Count the cells explicitly so the table is always 2x2. A crosstab only
    # emits rows for values that actually occur, so data in which every study
    # has the same setting would yield a 1x2 table that Fisher's exact test
    # rejects. Only entries equal to 1 or 0 are counted.
    contingency_table = [
        [int((top_flags == 1).sum()), int((top_flags == 0).sum())],
        [int((bottom_flags == 1).sum()), int((bottom_flags == 0).sum())],
    ]
    _, p_value = stats.fisher_exact(contingency_table)

    return {
        'top_single_institution_percent': top_single_inst_perc,
        'bottom_single_institution_percent': bottom_single_inst_perc,
        'difference': top_single_inst_perc - bottom_single_inst_perc,
        'p_value': p_value,
        'significant': bool(p_value < 0.05),
    }
def analyze_all_factors(df_top, df_bottom):
    """
    Run every factor analysis in this module and collect the results.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: One entry per analysis, keyed 'eligibility_scores',
            'eligibility_criteria', 'study_characteristics',
            'geographic_distribution' and 'institutional_setting'; each value
            is whatever the corresponding analysis function returns.
    """
    # Result key -> analysis function, listed in the order they are run.
    analyses = (
        ('eligibility_scores', compare_eligibility_scores),
        ('eligibility_criteria', compare_eligibility_criteria),
        ('study_characteristics', compare_study_characteristics),
        ('geographic_distribution', analyze_geographic_distribution),
        ('institutional_setting', analyze_institutional_setting),
    )
    return {label: analysis(df_top, df_bottom) for label, analysis in analyses}