|
a |
|
b/src/analysis.py |
|
|
1 |
""" |
|
|
2 |
Analysis Module for Diversity in Head and Neck Cancer Clinical Trials |
|
|
3 |
|
|
|
4 |
This module contains functions for analyzing the factors that contribute to |
|
|
5 |
diversity in head and neck cancer clinical trials, with a focus on eligibility |
|
|
6 |
criteria and other study characteristics. |
|
|
7 |
""" |
|
|
8 |
|
|
|
9 |
import pandas as pd |
|
|
10 |
import numpy as np |
|
|
11 |
from scipy import stats |
|
|
12 |
|
|
|
13 |
def compare_eligibility_scores(df_top, df_bottom):
    """
    Compare eligibility scores between top and bottom diverse studies.

    The eligibility score is the sum of binary flags for all eligibility criteria,
    with higher scores indicating more restrictive criteria.

    Missing scores (NaN) are dropped from each group first, so the descriptive
    statistics and the Mann-Whitney U test are all computed on the same studies.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies; must contain
            an 'eligibility_score' column
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies; must
            contain an 'eligibility_score' column

    Returns:
        dict: Dictionary with keys 'top_stats' and 'bottom_stats' (each a dict of
            mean, median, std, min, max), 'u_statistic', 'p_value' and
            'significant' (p < 0.05). The last three are None when either group
            has no non-missing score, because the test cannot be run.

    Raises:
        KeyError: If either DataFrame lacks the 'eligibility_score' column.
    """
    def _describe(scores):
        # ddof=0 keeps the population standard deviation used previously (np.std).
        return {
            'mean': scores.mean(),
            'median': scores.median(),
            'std': scores.std(ddof=0),
            'min': scores.min(),
            'max': scores.max(),
        }

    # Drop missing scores once, up front: otherwise the median and the test see
    # NaN while mean/std/min/max silently skip it.
    top_scores = df_top['eligibility_score'].dropna()
    bottom_scores = df_bottom['eligibility_score'].dropna()

    if top_scores.empty or bottom_scores.empty:
        # Mann-Whitney U is undefined for an empty sample.
        u_stat = None
        p_value = None
        significant = None
    else:
        u_stat, p_value = stats.mannwhitneyu(
            top_scores, bottom_scores, alternative='two-sided'
        )
        # Plain bool so the result can be serialised (e.g. to JSON).
        significant = bool(p_value < 0.05)

    return {
        'top_stats': _describe(top_scores),
        'bottom_stats': _describe(bottom_scores),
        'u_statistic': u_stat,
        'p_value': p_value,
        'significant': significant,
    }
|
|
58 |
|
|
|
59 |
def compare_eligibility_criteria(df_top, df_bottom):
    """
    Compare the prevalence of specific eligibility criteria between top and bottom diverse studies.

    This function examines the following eligibility criteria:
    - age_restrict: 0 if the restriction is age>18, 1 for other restrictions (e.g., 18<age<75)
    - stage_size: Restrictions on the cancer stage and the size of the tumor
    - cancer_site: Restrictions on the cancer site
    - histological_type: Whether the study was limited to SCC or any other type
    - performance_score: Restrictions on performance score (e.g., ECOG performance)
    - comorbidities: Restrictions on comorbidities
    - hx_of_tt: Restrictions on treatment history for cancer
    - lab_values: Restrictions on lab test values
    - pregnancy_or_contraception: Restrictions on pregnancy or particular contraceptives
    - misc: Other restrictions (smoking status, ethnicity requirements)

    Each criterion column is treated as a binary flag. Prevalence is the mean of
    the flag times 100, and significance comes from a two-sided Fisher's exact
    test on the 2x2 table of (group) x (flag present / absent). Missing values
    are ignored.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping criteria names to comparison results. Each
            result has 'top_prevalence', 'bottom_prevalence', 'difference'
            (top minus bottom, in percentage points), 'p_value' and
            'significant' (p < 0.05).

    Raises:
        KeyError: If a criterion column is missing from either DataFrame.
    """
    # Define eligibility criteria fields to compare
    criteria_fields = [
        'eligibility_age_restrict',
        'eligibility_stage_size',
        'eligibility_site',
        'eligibility_histological_type',
        'eligibility_performance_score',
        'eligibility_comorbidities',
        'eligibility_hx_of_tt',
        'eligibility_lab_values',
        'eligibility_pregnancy_or_contraception',
        'eligibility_misc',
    ]

    def _flag_counts(flags):
        """Return [n_present, n_absent] for a binary-flag column, ignoring NaN."""
        observed = flags.dropna().astype(bool)
        present = int(observed.sum())
        return [present, len(observed) - present]

    results = {}
    for criterion in criteria_fields:
        top_flags = df_top[criterion]
        bottom_flags = df_bottom[criterion]

        # Calculate prevalence in each group
        top_prevalence = top_flags.mean() * 100
        bottom_prevalence = bottom_flags.mean() * 100

        # Build the 2x2 table by hand. A cross-tabulation of the raw values
        # collapses to a single row when a criterion is constant across both
        # groups, and fisher_exact only accepts 2x2 input.
        contingency_table = [_flag_counts(top_flags), _flag_counts(bottom_flags)]
        _, p_value = stats.fisher_exact(contingency_table)

        results[criterion] = {
            'top_prevalence': top_prevalence,
            'bottom_prevalence': bottom_prevalence,
            'difference': top_prevalence - bottom_prevalence,
            'p_value': p_value,
            'significant': bool(p_value < 0.05),
        }

    return results
|
|
121 |
|
|
|
122 |
def compare_study_characteristics(df_top, df_bottom):
    """
    Compare general study characteristics between top and bottom diverse studies.

    This function examines the following characteristics:
    1. Study start date and end date
    2. Single vs. multi-institutional study
    3. Number of participants (total, male, female)
    4. Modality (Drug/Radiation/Biologic/Combination)
    5. Trial type (Primary/Palliative/Recurrent/Metastatic)

    Characteristics missing from either DataFrame are skipped. A column that is
    numeric (any integer or float dtype, but not boolean) in both DataFrames is
    compared with a two-sided Mann-Whitney U test; every other column is
    treated as categorical and compared with a Chi-square test.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping characteristic names to comparison results.
            Numeric results hold 'top_mean', 'bottom_mean' and 'difference';
            categorical results hold 'top_distribution' and
            'bottom_distribution'. Both kinds hold 'test', 'statistic',
            'p_value' and 'significant'. When a test cannot be run, 'test' is
            'Not performed' and the other three are None.
    """
    # Define characteristics to compare
    characteristics = [
        'num_sites',
        'is_single_institution',
        'num_participants',
        'num_male_participants',
        'num_female_participants',
        'modalities',
        'trial_type',
        'trial_phase',
        'tumor_type',
        'cancer_site',
        'study_start_date',
        'apc_date',
    ]

    results = {}
    for char in characteristics:
        if char not in df_top.columns or char not in df_bottom.columns:
            continue

        top_col = df_top[char]
        bottom_col = df_bottom[char]
        # Both groups must be numeric: a column that is numeric on one side
        # only cannot be rank-compared.
        if _is_quantitative(top_col) and _is_quantitative(bottom_col):
            results[char] = _compare_numeric(top_col, bottom_col)
        else:
            results[char] = _compare_categorical(top_col, bottom_col)

    return results


def _is_quantitative(column):
    """Return True for integer/float columns of any width; booleans stay categorical."""
    dtype = column.dtype
    return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)


def _compare_numeric(top_col, bottom_col):
    """Compare two numeric columns with a two-sided Mann-Whitney U test."""
    top_mean = top_col.mean()
    bottom_mean = bottom_col.mean()
    result = {
        'top_mean': top_mean,
        'bottom_mean': bottom_mean,
        'difference': top_mean - bottom_mean,
    }

    # Plain float arrays so nullable pandas dtypes reach scipy as numbers.
    top_values = top_col.dropna().to_numpy(dtype=float)
    bottom_values = bottom_col.dropna().to_numpy(dtype=float)

    if len(top_values) == 0 or len(bottom_values) == 0:
        # A group with no observed values cannot be tested.
        result.update({
            'test': 'Not performed',
            'statistic': None,
            'p_value': None,
            'significant': None,
        })
        return result

    u_stat, p_value = stats.mannwhitneyu(
        top_values, bottom_values, alternative='two-sided'
    )
    result.update({
        'test': 'Mann-Whitney U',
        'statistic': u_stat,
        'p_value': p_value,
        'significant': bool(p_value < 0.05),
    })
    return result


def _compare_categorical(top_col, bottom_col):
    """Compare two categorical columns with a Chi-square test of independence."""
    # The distributions are reported whether or not the test can be run.
    result = {
        'top_distribution': top_col.value_counts(normalize=True).to_dict(),
        'bottom_distribution': bottom_col.value_counts(normalize=True).to_dict(),
    }

    try:
        contingency_table = pd.crosstab(
            pd.Series(top_col.tolist() + bottom_col.tolist()),
            pd.Series(['top'] * len(top_col) + ['bottom'] * len(bottom_col))
        )
        chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
    except (ValueError, np.linalg.LinAlgError):
        # Handle cases where Chi-square test cannot be performed
        result.update({
            'test': 'Not performed',
            'statistic': None,
            'p_value': None,
            'significant': None,
        })
    else:
        result.update({
            'test': 'Chi-square',
            'statistic': chi2,
            'p_value': p_value,
            'significant': bool(p_value < 0.05),
        })
    return result
|
|
213 |
|
|
|
214 |
def analyze_geographic_distribution(df_top, df_bottom):
    """
    Analyze the geographic distribution of studies and its impact on diversity.

    This function examines how geographic location relates to diversity in clinical trials,
    considering that areas with more diverse populations may have higher potential for
    diverse study recruitment.

    The location lists in the result are sorted so that repeated runs on the
    same data give identical output.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with geographic analysis results: per-group location
            counts ('top_locations', 'bottom_locations'), sorted lists of
            'all_locations', 'common_locations', 'top_only_locations' and
            'bottom_only_locations', and the number of distinct locations per
            group. If the 'location' column is missing from either DataFrame,
            a dict with a single 'error' message is returned instead.
    """
    location_field = 'location'
    if location_field not in df_top.columns or location_field not in df_bottom.columns:
        return {
            'error': f"Location field '{location_field}' not found in datasets"
        }

    # Extract location information
    top_locations = df_top[location_field].value_counts().to_dict()
    bottom_locations = df_bottom[location_field].value_counts().to_dict()

    top_names = set(top_locations)
    bottom_names = set(bottom_locations)

    def _ordered(names):
        # Set iteration order is arbitrary. Sorting on the string form keeps
        # the output reproducible and tolerates mixed-type location values.
        return sorted(names, key=str)

    # Identify common and unique locations
    return {
        'top_locations': top_locations,
        'bottom_locations': bottom_locations,
        'all_locations': _ordered(top_names | bottom_names),
        'common_locations': _ordered(top_names & bottom_names),
        'top_only_locations': _ordered(top_names - bottom_names),
        'bottom_only_locations': _ordered(bottom_names - top_names),
        'num_top_locations': len(top_locations),
        'num_bottom_locations': len(bottom_locations),
    }
|
|
255 |
|
|
|
256 |
def analyze_institutional_setting(df_top, df_bottom):
    """
    Analyze the impact of institutional setting (single vs. multi-institution) on diversity.

    This function examines whether single-institution or multi-institution studies
    tend to have higher diversity.

    The 'is_single_institution' column is treated as a binary flag. Its
    prevalence is compared between the groups with a two-sided Fisher's exact
    test on the 2x2 table of (group) x (single / not single). Missing values
    are ignored.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with institutional setting analysis results:
            'top_single_institution_percent',
            'bottom_single_institution_percent', 'difference' (top minus
            bottom, in percentage points), 'p_value' and 'significant'
            (p < 0.05). If the column is missing from either DataFrame, a dict
            with a single 'error' message is returned instead.
    """
    institution_field = 'is_single_institution'

    if institution_field not in df_top.columns or institution_field not in df_bottom.columns:
        return {
            'error': f"Institution field '{institution_field}' not found in datasets"
        }

    def _flag_counts(flags):
        """Return [n_single, n_not_single] for the flag column, ignoring NaN."""
        observed = flags.dropna().astype(bool)
        single = int(observed.sum())
        return [single, len(observed) - single]

    top_flags = df_top[institution_field]
    bottom_flags = df_bottom[institution_field]

    # Calculate prevalence of single-institution studies in each group
    top_single_inst_perc = top_flags.mean() * 100
    bottom_single_inst_perc = bottom_flags.mean() * 100

    # Build the 2x2 table by hand. A cross-tabulation of the raw values has
    # only one row when every study shares the same setting, and fisher_exact
    # only accepts 2x2 input.
    contingency_table = [_flag_counts(top_flags), _flag_counts(bottom_flags)]
    _, p_value = stats.fisher_exact(contingency_table)

    return {
        'top_single_institution_percent': top_single_inst_perc,
        'bottom_single_institution_percent': bottom_single_inst_perc,
        'difference': top_single_inst_perc - bottom_single_inst_perc,
        'p_value': p_value,
        'significant': bool(p_value < 0.05),
    }
|
|
296 |
|
|
|
297 |
def analyze_all_factors(df_top, df_bottom):
    """
    Run every factor analysis in this module and collect the results.

    Each analysis receives the same pair of DataFrames; its return value is
    stored unchanged under a fixed key.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with all analysis results, keyed by
            'eligibility_scores', 'eligibility_criteria',
            'study_characteristics', 'geographic_distribution' and
            'institutional_setting'
    """
    # (result key, analysis function) pairs, run in this order.
    analyses = (
        ('eligibility_scores', compare_eligibility_scores),
        ('eligibility_criteria', compare_eligibility_criteria),
        ('study_characteristics', compare_study_characteristics),
        ('geographic_distribution', analyze_geographic_distribution),
        ('institutional_setting', analyze_institutional_setting),
    )

    combined = {}
    for result_key, run_analysis in analyses:
        combined[result_key] = run_analysis(df_top, df_bottom)
    return combined