[3cdecf]: / src / data_processing.py

Download this file

210 lines (163 with data), 7.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
"""
Data Processing Module for Diversity in Head and Neck Cancer Clinical Trials Analysis
This module contains functions for data loading, filtering, and processing
to support the analysis of diversity in head and neck cancer clinical trials.
The main analysis focuses on computing a diversity score for each study,
defined as:
Diversity Score = (# non-white participants) / (# total participants) × 100
where total participants = # white participants + # non-white participants
Studies are then categorized into top 20% diverse and bottom 20% diverse groups
for comparative analysis of various features.
"""
import warnings
import pandas as pd
import numpy as np
# Suppress warnings for cleaner output.
# NOTE(review): no category is given, so this ignores every warning category
# (DeprecationWarning, RuntimeWarning, ...) for the whole process once this
# module is imported, not just warnings raised by this module.
warnings.simplefilter(action='ignore')
def load_data(filepath):
    """
    Read the clinical trial CSV export into a DataFrame.

    Args:
        filepath (str): Location of the CSV file; handed unchanged to
            ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table, using pandas' default parsing
        options.
    """
    trials = pd.read_csv(filepath)
    return trials
def filter_usa_studies(df):
    """
    Keep only the studies offered exclusively in the United States.

    A row is retained when its 'Area Offered' value is exactly the string
    "United States"; multi-country values and missing values are dropped.

    Args:
        df (pandas.DataFrame): Full dataset with an 'Area Offered' column

    Returns:
        pandas.DataFrame: The matching rows, original index labels preserved
    """
    offered_in_usa_only = df['Area Offered'].eq("United States")
    return df.loc[offered_in_usa_only]
def filter_race_information(df):
    """
    Keep only the studies that report race counts.

    A study is retained when both its "# White" and "# Non White" values are
    present (non-missing).

    Args:
        df (pandas.DataFrame): Dataset with "# White" and "# Non White" columns

    Returns:
        pandas.DataFrame: Rows where both race counts are available
    """
    has_white_count = df["# White"].notna()
    has_non_white_count = df["# Non White"].notna()
    return df[has_white_count & has_non_white_count]
def calculate_success_metric(df):
    """
    Add the diversity score of each study as a ``success_metric`` column.

    The score is the percentage of non-white participants:
        success_metric = # Non White / (# White + # Non White) * 100

    Args:
        df (pandas.DataFrame): Dataset with "# White" and "# Non White" columns

    Returns:
        pandas.DataFrame: A copy of ``df`` with the ``success_metric`` column
        added; the input frame is left untouched.
    """
    non_white = df["# Non White"]
    total_participants = df["# White"] + non_white

    scored = df.copy()
    # Same evaluation order as the formula: ratio first, then scale to percent.
    scored["success_metric"] = non_white / total_participants * 100.0
    return scored
def categorize_by_diversity(df, top_percentile=0.8, bottom_percentile=0.2):
    """
    Categorize studies as top, bottom, or middle based on the success metric.

    Thresholds are the ``top_percentile`` and ``bottom_percentile`` quantiles
    of ``success_metric``, computed over the non-missing scores only. A NaN
    score (e.g. from a study whose white and non-white counts are both 0)
    therefore no longer turns both thresholds into NaN and silently pushes
    every study into "Neither".

    Args:
        df (pandas.DataFrame): Dataset with success_metric column
        top_percentile (float): Quantile (0-1) at or above which a study is
            labelled "Top20" (default: 0.8)
        bottom_percentile (float): Quantile (0-1) at or below which a study is
            labelled "Bottom20" (default: 0.2)

    Returns:
        pandas.DataFrame: Copy of ``df`` with an added ``success_category``
        column holding "Top20", "Bottom20" or "Neither". Rows with a missing
        score are labelled "Neither".
    """
    scores = df["success_metric"].to_numpy(dtype=float, na_value=np.nan)

    # nanquantile ignores missing scores instead of propagating NaN.
    top_threshold = np.nanquantile(scores, top_percentile)
    bottom_threshold = np.nanquantile(scores, bottom_percentile)

    df_categorized = df.copy()
    # np.select takes the first matching condition, so a score that satisfies
    # both thresholds (possible when they coincide) is labelled "Top20".
    # Comparisons against NaN are False, so missing scores fall to the default.
    df_categorized["success_category"] = np.select(
        [scores >= top_threshold, scores <= bottom_threshold],
        ["Top20", "Bottom20"],
        default="Neither",
    )
    return df_categorized
def get_top_bottom_datasets(df, top_percentile=0.8, bottom_percentile=0.2):
    """
    Extract top and bottom diversity datasets based on success_metric thresholds.

    Thresholds are quantiles of ``success_metric`` computed over the
    non-missing scores only, so a single NaN score does not turn both
    thresholds into NaN and yield two empty datasets. Rows with a missing
    score are in neither dataset.

    Args:
        df (pandas.DataFrame): Dataset with success_metric column
        top_percentile (float): Quantile (0-1) at or above which a study is
            in the top dataset (default: 0.8)
        bottom_percentile (float): Quantile (0-1) at or below which a study is
            in the bottom dataset (default: 0.2)

    Returns:
        tuple: (top_dataset, bottom_dataset). A study can appear in both when
        the two thresholds coincide (e.g. all scores equal).
    """
    scores = df["success_metric"]
    score_values = scores.to_numpy(dtype=float, na_value=np.nan)

    # nanquantile ignores missing scores instead of propagating NaN.
    top_threshold = np.nanquantile(score_values, top_percentile)
    bottom_threshold = np.nanquantile(score_values, bottom_percentile)

    df_top = df[scores >= top_threshold]
    df_bottom = df[scores <= bottom_threshold]
    return df_top, df_bottom
def extract_eligibility_features(df):
    """
    Derive the overall eligibility score from the per-criterion flags.

    This is a placeholder for real feature extraction (which would parse the
    eligibility criteria text). For now it only adds an ``eligibility_score``
    column, the row-wise sum of the ten ``eligibility_*`` flag columns, and
    only when that column is absent and all ten flag columns are present.

    The eligibility flags cover:
    - age_restrict: 0 if only 18+, 1 for other restrictions (e.g., 18-75)
    - stage_size: Restrictions on cancer stage and tumor size
    - site: Restrictions on cancer site
    - histological_type: Limited to specific histology (e.g., SCC only)
    - performance_score: ECOG or other performance score requirements
    - comorbidities: Restrictions on comorbidities
    - hx_of_tt: Restrictions on treatment history
    - lab_values: Restrictions on laboratory values
    - pregnancy_or_contraception: Pregnancy or contraception requirements
    - misc: Other restrictions (smoking, ethnicity, etc.)

    Args:
        df (pandas.DataFrame): Dataset to process

    Returns:
        pandas.DataFrame: A copy of ``df``, with ``eligibility_score`` added
        when it could be computed; otherwise an unchanged copy.
    """
    flag_columns = [
        'eligibility_age_restrict',
        'eligibility_stage_size',
        'eligibility_site',
        'eligibility_histological_type',
        'eligibility_performance_score',
        'eligibility_comorbidities',
        'eligibility_hx_of_tt',
        'eligibility_lab_values',
        'eligibility_pregnancy_or_contraception',
        'eligibility_misc',
    ]

    enriched = df.copy()
    score_missing = 'eligibility_score' not in enriched.columns
    absent_flags = [name for name in flag_columns if name not in enriched.columns]

    # Only compute the score when it is not there yet and every flag exists.
    if score_missing and not absent_flags:
        enriched['eligibility_score'] = enriched[flag_columns].sum(axis=1)
    return enriched
def preprocess_data(input_filepath):
    """
    Run the full preprocessing pipeline for the diversity analysis.

    Steps, in order: load the CSV, keep USA-only studies, keep studies with
    race counts, compute the success metric, derive eligibility features,
    label each study's diversity category, then split off the top and bottom
    groups.

    Args:
        input_filepath (str): Path to input CSV file

    Returns:
        tuple: (processed_df, top_df, bottom_df)
    """
    processed = (
        load_data(input_filepath)
        .pipe(filter_usa_studies)
        .pipe(filter_race_information)
        .pipe(calculate_success_metric)
        .pipe(extract_eligibility_features)
        .pipe(categorize_by_diversity)
    )
    top_studies, bottom_studies = get_top_bottom_datasets(processed)
    return processed, top_studies, bottom_studies