[3cdecf]: / src / analysis.py

Download this file

314 lines (267 with data), 12.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
"""
Analysis Module for Diversity in Head and Neck Cancer Clinical Trials
This module contains functions for analyzing the factors that contribute to
diversity in head and neck cancer clinical trials, with a focus on eligibility
criteria and other study characteristics.
"""
import pandas as pd
import numpy as np
from scipy import stats
def compare_eligibility_scores(df_top, df_bottom):
    """
    Compare eligibility scores between top and bottom diverse studies.

    The eligibility score is the sum of binary flags for all eligibility
    criteria, with higher scores indicating more restrictive criteria.

    Missing scores (NaN) are dropped from each group before anything is
    computed, so the descriptive statistics and the Mann-Whitney U test are
    all based on the same observations.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies; must
            contain an 'eligibility_score' column.
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies; must
            contain an 'eligibility_score' column.

    Returns:
        dict: 'top_stats' and 'bottom_stats' (each holding 'mean', 'median',
        'std', 'min' and 'max'), the Mann-Whitney 'u_statistic' and
        'p_value', and 'significant' (True when p_value < 0.05).

    Raises:
        KeyError: If either frame lacks the 'eligibility_score' column.
    """
    def _describe(scores):
        # ddof=0 keeps the population standard deviation that np.std
        # reports by default.
        return {
            'mean': scores.mean(),
            'median': scores.median(),
            'std': scores.std(ddof=0),
            'min': scores.min(),
            'max': scores.max(),
        }

    # Drop missing scores once so every statistic sees the same data.
    top_scores = df_top['eligibility_score'].dropna()
    bottom_scores = df_bottom['eligibility_score'].dropna()

    # Two-sided Mann-Whitney U test between the two groups.
    u_stat, p_value = stats.mannwhitneyu(
        top_scores, bottom_scores, alternative='two-sided'
    )

    return {
        'top_stats': _describe(top_scores),
        'bottom_stats': _describe(bottom_scores),
        'u_statistic': u_stat,
        'p_value': p_value,
        # Builtin bool rather than numpy.bool_ so the result serialises.
        'significant': bool(p_value < 0.05),
    }
def compare_eligibility_criteria(df_top, df_bottom):
    """
    Compare the prevalence of specific eligibility criteria between top and
    bottom diverse studies.

    This function examines the following eligibility criteria:
    - age_restrict: 0 if the restriction is age>18, 1 for other restrictions
      (e.g., 18<age<75)
    - stage_size: Restrictions on the cancer stage and the size of the tumor
    - site: Restrictions on the cancer site
    - histological_type: Whether the study was limited to SCC or any other type
    - performance_score: Restrictions on performance score (e.g., ECOG
      performance)
    - comorbidities: Restrictions on comorbidities
    - hx_of_tt: Restrictions on treatment history for cancer
    - lab_values: Restrictions on lab test values
    - pregnancy_or_contraception: Restrictions on pregnancy or particular
      contraceptives
    - misc: Other restrictions (smoking status, ethnicity requirements)

    Each criterion is read from the column 'eligibility_<name>'. The columns
    are treated as binary flags: any non-zero, non-missing value counts as
    "criterion present".

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Maps each criterion column name to a dict with
        'top_prevalence' and 'bottom_prevalence' (percent), 'difference'
        (top minus bottom), the Fisher's exact 'p_value', and 'significant'
        (True when p_value < 0.05).

    Raises:
        KeyError: If a criterion column is missing from either frame.
    """
    criteria_fields = (
        'eligibility_age_restrict',
        'eligibility_stage_size',
        'eligibility_site',
        'eligibility_histological_type',
        'eligibility_performance_score',
        'eligibility_comorbidities',
        'eligibility_hx_of_tt',
        'eligibility_lab_values',
        'eligibility_pregnancy_or_contraception',
        'eligibility_misc',
    )

    def _flag_counts(flags):
        # Return [present, absent] counts, ignoring missing values.
        observed = flags.dropna()
        present = int(observed.astype(bool).sum())
        return [present, len(observed) - present]

    results = {}
    for criterion in criteria_fields:
        top_flags = df_top[criterion]
        bottom_flags = df_bottom[criterion]

        # Prevalence of the criterion in each group, as a percentage.
        top_prevalence = top_flags.mean() * 100
        bottom_prevalence = bottom_flags.mean() * 100

        # Build the 2x2 table explicitly. A cross-tabulation collapses to a
        # single row or column when the flag is constant across both groups
        # or when a group is empty, and fisher_exact rejects such a table.
        # With explicit counts the table is always 2x2; a zero margin is
        # accepted and reported as p = 1.0.
        table = [_flag_counts(top_flags), _flag_counts(bottom_flags)]
        _, p_value = stats.fisher_exact(table)

        results[criterion] = {
            'top_prevalence': top_prevalence,
            'bottom_prevalence': bottom_prevalence,
            'difference': top_prevalence - bottom_prevalence,
            'p_value': p_value,
            'significant': bool(p_value < 0.05),
        }
    return results
def compare_study_characteristics(df_top, df_bottom):
    """
    Compare general study characteristics between top and bottom diverse
    studies.

    The following columns are compared when present in BOTH frames; columns
    missing from either frame are silently skipped:
        num_sites, is_single_institution, num_participants,
        num_male_participants, num_female_participants, modalities,
        trial_type, trial_phase, tumor_type, cancer_site,
        study_start_date, apc_date

    Columns that are numeric (and not boolean) in both frames are compared
    with a two-sided Mann-Whitney U test on the non-missing values. All other
    columns are treated as categorical and compared with a Chi-square test
    of independence.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Maps characteristic names to comparison results. Numeric
        entries hold 'top_mean', 'bottom_mean' and 'difference'; categorical
        entries hold 'top_distribution' and 'bottom_distribution'. Both hold
        'test', 'statistic', 'p_value' and 'significant'. When a test cannot
        be run, 'test' is 'Not performed' and the other three are None.
    """
    characteristics = [
        'num_sites',
        'is_single_institution',
        'num_participants',
        'num_male_participants',
        'num_female_participants',
        'modalities',
        'trial_type',
        'trial_phase',
        'tumor_type',
        'cancer_site',
        'study_start_date',
        'apc_date',
    ]

    def _is_numeric(series):
        # Accept any numeric dtype (int32, uint8, nullable Int64, ...), not
        # only int64/float64. Booleans stay on the categorical path.
        return (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )

    # Group label for every row of the stacked top + bottom values.
    group_labels = ['top'] * len(df_top) + ['bottom'] * len(df_bottom)

    results = {}
    for char in characteristics:
        if char not in df_top.columns or char not in df_bottom.columns:
            continue

        top_col = df_top[char]
        bottom_col = df_bottom[char]

        if _is_numeric(top_col) and _is_numeric(bottom_col):
            top_values = top_col.dropna()
            bottom_values = bottom_col.dropna()
            top_mean = top_col.mean()
            bottom_mean = bottom_col.mean()

            if top_values.empty or bottom_values.empty:
                # Nothing to rank in at least one group.
                test_name, statistic, p_value, significant = (
                    'Not performed', None, None, None
                )
            else:
                statistic, p_value = stats.mannwhitneyu(
                    top_values, bottom_values, alternative='two-sided'
                )
                test_name = 'Mann-Whitney U'
                significant = bool(p_value < 0.05)

            results[char] = {
                'top_mean': top_mean,
                'bottom_mean': bottom_mean,
                'difference': top_mean - bottom_mean,
                'test': test_name,
                'statistic': statistic,
                'p_value': p_value,
                'significant': significant,
            }
            continue

        # Categorical characteristic - Chi-square test of independence.
        try:
            contingency_table = pd.crosstab(
                pd.Series(top_col.tolist() + bottom_col.tolist()),
                pd.Series(group_labels),
            )
            statistic, p_value, _, _ = stats.chi2_contingency(contingency_table)
        except (ValueError, np.linalg.LinAlgError):
            # The test cannot be run on this table.
            test_name, statistic, p_value, significant = (
                'Not performed', None, None, None
            )
        else:
            test_name = 'Chi-square'
            significant = bool(p_value < 0.05)

        # Share of each category within its own group.
        results[char] = {
            'top_distribution': top_col.value_counts(normalize=True).to_dict(),
            'bottom_distribution': bottom_col.value_counts(normalize=True).to_dict(),
            'test': test_name,
            'statistic': statistic,
            'p_value': p_value,
            'significant': significant,
        }
    return results
def analyze_geographic_distribution(df_top, df_bottom):
    """
    Analyze the geographic distribution of studies and its impact on diversity.

    This function examines how geographic location relates to diversity in
    clinical trials, considering that areas with more diverse populations may
    have higher potential for diverse study recruitment.

    Locations are read from the 'location' column of each frame. The location
    lists in the result are sorted by their string form, so the output is
    reproducible from run to run.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Per-group location counts ('top_locations', 'bottom_locations'),
        sorted lists of 'all_locations', 'common_locations',
        'top_only_locations' and 'bottom_only_locations', and the number of
        distinct locations per group. If the 'location' column is missing
        from either frame, a dict with a single 'error' message is returned
        instead.
    """
    location_field = 'location'
    if location_field not in df_top.columns or location_field not in df_bottom.columns:
        return {
            'error': f"Location field '{location_field}' not found in datasets"
        }

    # Number of studies per location in each group.
    top_locations = df_top[location_field].value_counts().to_dict()
    bottom_locations = df_bottom[location_field].value_counts().to_dict()

    top_names = set(top_locations)
    bottom_names = set(bottom_locations)

    def _ordered(names):
        # Set iteration order is not stable between runs; sort on the string
        # form so mixed-type labels cannot raise while sorting.
        return sorted(names, key=str)

    return {
        'top_locations': top_locations,
        'bottom_locations': bottom_locations,
        'all_locations': _ordered(top_names | bottom_names),
        'common_locations': _ordered(top_names & bottom_names),
        'top_only_locations': _ordered(top_names - bottom_names),
        'bottom_only_locations': _ordered(bottom_names - top_names),
        'num_top_locations': len(top_locations),
        'num_bottom_locations': len(bottom_locations),
    }
def analyze_institutional_setting(df_top, df_bottom):
    """
    Analyze the impact of institutional setting (single vs. multi-institution)
    on diversity.

    This function examines whether single-institution or multi-institution
    studies tend to have higher diversity, using the 'is_single_institution'
    column. The column is treated as a binary flag: any non-zero, non-missing
    value counts as single-institution.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Percentage of single-institution studies in each group, their
        'difference' (top minus bottom), the Fisher's exact 'p_value' and
        'significant' (True when p_value < 0.05). If the column is missing
        from either frame, a dict with a single 'error' message is returned
        instead.
    """
    institution_field = 'is_single_institution'
    if (institution_field not in df_top.columns
            or institution_field not in df_bottom.columns):
        return {
            'error': f"Institution field '{institution_field}' not found in datasets"
        }

    def _flag_counts(flags):
        # Return [single, multi] counts, ignoring missing values.
        observed = flags.dropna()
        single = int(observed.astype(bool).sum())
        return [single, len(observed) - single]

    top_flags = df_top[institution_field]
    bottom_flags = df_bottom[institution_field]

    # Share of single-institution studies in each group, as a percentage.
    top_single_inst_perc = top_flags.mean() * 100
    bottom_single_inst_perc = bottom_flags.mean() * 100

    # Build the 2x2 table explicitly. A cross-tabulation collapses to one row
    # or column when every study has the same setting or a group is empty,
    # and fisher_exact rejects such a table. With explicit counts the table
    # is always 2x2; a zero margin is accepted and reported as p = 1.0.
    table = [_flag_counts(top_flags), _flag_counts(bottom_flags)]
    _, p_value = stats.fisher_exact(table)

    return {
        'top_single_institution_percent': top_single_inst_perc,
        'bottom_single_institution_percent': bottom_single_inst_perc,
        'difference': top_single_inst_perc - bottom_single_inst_perc,
        'p_value': p_value,
        'significant': bool(p_value < 0.05),
    }
def analyze_all_factors(df_top, df_bottom):
    """
    Run every diversity-factor analysis in this module and bundle the results.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: One entry per analysis, keyed 'eligibility_scores',
        'eligibility_criteria', 'study_characteristics',
        'geographic_distribution' and 'institutional_setting'.
    """
    # The analyses run in this order, each on the same pair of frames.
    report = {}
    report['eligibility_scores'] = compare_eligibility_scores(df_top, df_bottom)
    report['eligibility_criteria'] = compare_eligibility_criteria(df_top, df_bottom)
    report['study_characteristics'] = compare_study_characteristics(df_top, df_bottom)
    report['geographic_distribution'] = analyze_geographic_distribution(df_top, df_bottom)
    report['institutional_setting'] = analyze_institutional_setting(df_top, df_bottom)
    return report