Diff of /src/analysis.py [000000] .. [3cdecf]

Switch to unified view

a b/src/analysis.py
1
"""
2
Analysis Module for Diversity in Head and Neck Cancer Clinical Trials
3
4
This module contains functions for analyzing the factors that contribute to
5
diversity in head and neck cancer clinical trials, with a focus on eligibility
6
criteria and other study characteristics.
7
"""
8
9
import pandas as pd
10
import numpy as np
11
from scipy import stats
12
13
def compare_eligibility_scores(df_top, df_bottom):
    """
    Compare eligibility scores between top and bottom diverse studies.

    The eligibility score is the sum of binary flags for all eligibility criteria,
    with higher scores indicating more restrictive criteria.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with descriptive statistics for both groups
            ('top_stats' / 'bottom_stats'), the Mann-Whitney 'u_statistic',
            its 'p_value', and a 'significant' flag (alpha = 0.05).
    """
    # Drop missing scores so NaNs cannot propagate into the rank test or the
    # descriptive statistics (mirrors the dropna() handling used for numeric
    # columns in compare_study_characteristics).
    top_scores = df_top['eligibility_score'].dropna()
    bottom_scores = df_bottom['eligibility_score'].dropna()

    # Rank-based (non-parametric) comparison: eligibility scores are ordinal
    # counts, so Mann-Whitney U is preferred over a t-test.
    u_stat, p_value = stats.mannwhitneyu(top_scores, bottom_scores, alternative='two-sided')

    def _describe(scores):
        # Descriptive summary of one group's scores. np.std uses ddof=0
        # (population SD), preserving this module's original convention.
        return {
            'mean': np.mean(scores),
            'median': np.median(scores),
            'std': np.std(scores),
            'min': np.min(scores),
            'max': np.max(scores)
        }

    return {
        'top_stats': _describe(top_scores),
        'bottom_stats': _describe(bottom_scores),
        'u_statistic': u_stat,
        'p_value': p_value,
        'significant': p_value < 0.05
    }
58
59
def compare_eligibility_criteria(df_top, df_bottom):
    """
    Compare the prevalence of specific eligibility criteria between top and bottom diverse studies.

    This function examines the following eligibility criteria:
    - age_restrict: 0 if the restriction is age>18, 1 for other restrictions (e.g., 18<age<75)
    - stage_size: Restrictions on the cancer stage and the size of the tumor
    - cancer_site: Restrictions on the cancer site
    - histological_type: Whether the study was limited to SCC or any other type
    - performance_score: Restrictions on performance score (e.g., ECOG performance)
    - comorbidities: Restrictions on comorbidities
    - hx_of_tt: Restrictions on treatment history for cancer
    - lab_values: Restrictions on lab test values
    - pregnancy_or_contraception: Restrictions on pregnancy or particular contraceptives
    - misc: Other restrictions (smoking status, ethnicity requirements)

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping criteria names to comparison results
            (prevalence percentages, difference, Fisher's exact p-value,
            and a significance flag at alpha = 0.05)
    """
    # Binary (0/1) eligibility criteria columns to compare.
    criteria_fields = [
        'eligibility_age_restrict',
        'eligibility_stage_size',
        'eligibility_site',
        'eligibility_histological_type',
        'eligibility_performance_score',
        'eligibility_comorbidities',
        'eligibility_hx_of_tt',
        'eligibility_lab_values',
        'eligibility_pregnancy_or_contraception',
        'eligibility_misc'
    ]

    n_top = len(df_top)
    n_bottom = len(df_bottom)

    results = {}
    for criterion in criteria_fields:
        top_vals = df_top[criterion]
        bottom_vals = df_bottom[criterion]

        # Percentage of studies in each group that apply this restriction.
        top_prevalence = top_vals.mean() * 100
        bottom_prevalence = bottom_vals.mean() * 100

        # Build the 2x2 contingency table explicitly from counts instead of
        # pd.crosstab: crosstab collapses to a 2x1 table when the criterion is
        # constant across both groups, which makes stats.fisher_exact raise
        # ValueError. The explicit table is always 2x2, and Fisher's two-sided
        # p-value is invariant to row/column permutation, so results match the
        # original wherever it succeeded.
        top_pos = int(top_vals.sum())
        bottom_pos = int(bottom_vals.sum())
        table = [
            [top_pos, bottom_pos],
            [n_top - top_pos, n_bottom - bottom_pos]
        ]
        _, p_value = stats.fisher_exact(table)

        results[criterion] = {
            'top_prevalence': top_prevalence,
            'bottom_prevalence': bottom_prevalence,
            'difference': top_prevalence - bottom_prevalence,
            'p_value': p_value,
            'significant': p_value < 0.05
        }

    return results
121
122
def compare_study_characteristics(df_top, df_bottom):
    """
    Compare general study characteristics between top and bottom diverse studies.

    This function examines the following characteristics:
    1. Study start date and end date
    2. Single vs. multi-institutional study
    3. Number of participants (total, male, female)
    4. Modality (Drug/Radiation/Biologic/Combination)
    5. Trial type (Primary/Palliative/Recurrent/Metastatic)

    Numeric characteristics are compared with a Mann-Whitney U test;
    everything else is treated as categorical and compared with a
    Chi-square test. Characteristics missing from either dataset are
    silently skipped.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary mapping characteristic names to comparison results
    """
    # Characteristics to compare; only those present in BOTH frames are used.
    characteristics = [
        'num_sites',
        'is_single_institution',
        'num_participants',
        'num_male_participants',
        'num_female_participants',
        'modalities',
        'trial_type',
        'trial_phase',
        'tumor_type',
        'cancer_site',
        'study_start_date',
        'apc_date'
    ]

    results = {}
    for char in characteristics:
        if char not in df_top.columns or char not in df_bottom.columns:
            continue
        # Route to the numeric test only when BOTH columns are numeric; the
        # original checked df_top's dtype alone, which could feed a
        # non-numeric bottom column into mannwhitneyu.
        if _is_numeric_characteristic(df_top[char]) and _is_numeric_characteristic(df_bottom[char]):
            results[char] = _compare_numeric(df_top[char], df_bottom[char])
        else:
            results[char] = _compare_categorical(df_top[char], df_bottom[char])
    return results


def _is_numeric_characteristic(series):
    # Recognize any numeric dtype (int32/int64, float32/float64, nullable
    # numerics), not just np.int64/np.float64 as the original did. Booleans
    # count as numeric in pandas but are kept categorical here to preserve
    # the original routing for 0/1 flag columns stored as bool.
    return (pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series))


def _compare_numeric(top, bottom):
    # Numeric characteristic: rank-based Mann-Whitney U test on the
    # non-missing values of each group.
    u_stat, p_value = stats.mannwhitneyu(
        top.dropna(),
        bottom.dropna(),
        alternative='two-sided'
    )
    return {
        'top_mean': top.mean(),
        'bottom_mean': bottom.mean(),
        'difference': top.mean() - bottom.mean(),
        'test': 'Mann-Whitney U',
        'statistic': u_stat,
        'p_value': p_value,
        'significant': p_value < 0.05
    }


def _compare_categorical(top, bottom):
    # Categorical characteristic: Chi-square test on the category-by-group
    # contingency table; fall back to distributions only when the test
    # cannot be performed (e.g., degenerate table).
    top_distribution = top.value_counts(normalize=True).to_dict()
    bottom_distribution = bottom.value_counts(normalize=True).to_dict()
    result = {
        'top_distribution': top_distribution,
        'bottom_distribution': bottom_distribution,
        'test': 'Not performed',
        'statistic': None,
        'p_value': None,
        'significant': None
    }
    try:
        contingency_table = pd.crosstab(
            pd.Series(top.tolist() + bottom.tolist()),
            pd.Series(['top'] * len(top) + ['bottom'] * len(bottom))
        )
        chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
    except (ValueError, np.linalg.LinAlgError):
        return result
    result.update({
        'test': 'Chi-square',
        'statistic': chi2,
        'p_value': p_value,
        'significant': p_value < 0.05
    })
    return result
213
214
def analyze_geographic_distribution(df_top, df_bottom):
    """
    Analyze the geographic distribution of studies and its impact on diversity.

    This function examines how geographic location relates to diversity in clinical trials,
    considering that areas with more diverse populations may have higher potential for
    diverse study recruitment.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with geographic analysis results, or an 'error' entry
            when the location column is absent from either dataset
    """
    location_field = 'location'

    # Guard clause: without the location column in both frames there is
    # nothing to analyze.
    if location_field not in df_top.columns or location_field not in df_bottom.columns:
        return {
            'error': f"Location field '{location_field}' not found in datasets"
        }

    # Per-group counts of studies at each location.
    top_locations = df_top[location_field].value_counts().to_dict()
    bottom_locations = df_bottom[location_field].value_counts().to_dict()

    # Set algebra over the location names: union, intersection, and the
    # locations exclusive to each group.
    top_set = set(top_locations)
    bottom_set = set(bottom_locations)

    return {
        'top_locations': top_locations,
        'bottom_locations': bottom_locations,
        'all_locations': list(top_set | bottom_set),
        'common_locations': list(top_set & bottom_set),
        'top_only_locations': list(top_set - bottom_set),
        'bottom_only_locations': list(bottom_set - top_set),
        'num_top_locations': len(top_locations),
        'num_bottom_locations': len(bottom_locations)
    }
255
256
def analyze_institutional_setting(df_top, df_bottom):
    """
    Analyze the impact of institutional setting (single vs. multi-institution) on diversity.

    This function examines whether single-institution or multi-institution studies
    tend to have higher diversity.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with institutional setting analysis results
            (single-institution percentages, their difference, Fisher's exact
            p-value, and a significance flag at alpha = 0.05), or an 'error'
            entry when the column is absent from either dataset
    """
    institution_field = 'is_single_institution'

    if institution_field not in df_top.columns or institution_field not in df_bottom.columns:
        return {
            'error': f"Institution field '{institution_field}' not found in datasets"
        }

    top_flags = df_top[institution_field]
    bottom_flags = df_bottom[institution_field]

    # Percentage of single-institution studies in each group.
    top_single_inst_perc = top_flags.mean() * 100
    bottom_single_inst_perc = bottom_flags.mean() * 100

    # Build the 2x2 table explicitly rather than via pd.crosstab: crosstab
    # collapses to a 2x1 table when every study shares the same flag, which
    # makes stats.fisher_exact raise ValueError. Fisher's two-sided p-value
    # is invariant to row/column permutation, so results match the original
    # wherever it succeeded.
    top_single = int(top_flags.sum())
    bottom_single = int(bottom_flags.sum())
    table = [
        [top_single, bottom_single],
        [len(top_flags) - top_single, len(bottom_flags) - bottom_single]
    ]
    _, p_value = stats.fisher_exact(table)

    return {
        'top_single_institution_percent': top_single_inst_perc,
        'bottom_single_institution_percent': bottom_single_inst_perc,
        'difference': top_single_inst_perc - bottom_single_inst_perc,
        'p_value': p_value,
        'significant': p_value < 0.05
    }
296
297
def analyze_all_factors(df_top, df_bottom):
    """
    Perform a comprehensive analysis of all factors that may affect diversity.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        dict: Dictionary with all analysis results
    """
    # Map each result key to the analysis routine that produces it, then run
    # every routine against the same pair of datasets.
    analyses = {
        'eligibility_scores': compare_eligibility_scores,
        'eligibility_criteria': compare_eligibility_criteria,
        'study_characteristics': compare_study_characteristics,
        'geographic_distribution': analyze_geographic_distribution,
        'institutional_setting': analyze_institutional_setting,
    }
    return {name: run(df_top, df_bottom) for name, run in analyses.items()}