[3cdecf]: / src / visualization.py

Download this file

363 lines (295 with data), 12.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
"""
Visualization Module for Diversity in Head and Neck Cancer Clinical Trials Analysis
This module contains functions for creating visualizations to help analyze
diversity in head and neck cancer clinical trials, with a particular focus
on comparing eligibility criteria and other study characteristics between
high-diversity and low-diversity studies.
"""
import numpy as np
import pandas as pd
import plotly.express as px
from plotly import graph_objects as go
from plotly.subplots import make_subplots
def plot_cdf(df, column_name, title=None, x_title=None, y_title=None, width=800,
             bins=100):
    """
    Create a cumulative distribution function plot for a specific column.

    This is useful for visualizing the distribution of the success metric
    (percentage of non-white participants) across all studies.

    The CDF is approximated from a histogram. The cumulative share of
    observations is drawn at the right edge of each bin, and the curve is
    anchored at 0 on the left edge of the first bin, so it runs from 0 to 1
    over the observed value range.

    Args:
        df (pandas.DataFrame): Dataset
        column_name (str): Column to plot; its values must be convertible
            to float. Missing values are ignored.
        title (str, optional): Plot title
        x_title (str, optional): X-axis label (defaults to ``column_name``)
        y_title (str, optional): Y-axis label (defaults to
            "Cumulative Probability")
        width (int, optional): Plot width
        bins (int, optional): Number of histogram bins used to approximate
            the CDF; forwarded to ``numpy.histogram``

    Returns:
        plotly.graph_objects.Figure: CDF plot
    """
    # NaN would make np.histogram fail while auto-detecting the value range.
    values = df[column_name].dropna().to_numpy(dtype=float)
    counts, edges = np.histogram(values, bins=bins)

    cumulative = np.cumsum(counts)
    total = cumulative[-1]
    if total > 0:
        cdf = cumulative / total
    else:
        # No observations: draw a flat zero curve instead of dividing by zero.
        cdf = np.zeros(len(counts), dtype=float)

    # cumulative[i] covers everything up to the RIGHT edge of bin i, so pair
    # it with edges[i + 1]; the leading 0 pins the curve to the left edge.
    fig = px.line(x=edges, y=np.concatenate(([0.0], cdf)))

    fig.update_xaxes(title=x_title or column_name)
    fig.update_yaxes(title=y_title or "Cumulative Probability")
    if title:
        fig.update_layout(title=title)
    fig.update_layout(width=width)
    return fig
def plot_box_by_category(df, y_column, x_column="success_category",
                         title=None, x_title=None, y_title=None, width=700):
    """
    Draw a box plot of one variable, grouped by a category column.

    Useful for comparing eligibility scores or number of participants between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column whose distribution is shown on the y-axis
        x_column (str, optional): Column that defines the groups on the x-axis
        title (str, optional): Plot title; left untouched when empty
        x_title (str, optional): X-axis label; left untouched when empty
        y_title (str, optional): Y-axis label; left untouched when empty
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot
    """
    fig = px.box(df, x=x_column, y=y_column)

    # Each optional label is paired with the figure method that applies it;
    # labels that are None or empty are skipped.
    optional_labels = (
        (title, fig.update_layout),
        (x_title, fig.update_xaxes),
        (y_title, fig.update_yaxes),
    )
    for label, apply_label in optional_labels:
        if label:
            apply_label(title=label)

    fig.update_layout(width=width)
    return fig
def plot_box_by_category_color(df, y_column, x_column="success_category",
                               color_column=None, title=None,
                               x_title=None, y_title=None, width=700):
    """
    Draw a box plot of one variable by category, split further by color.

    Useful for comparing participant demographics (e.g., by gender) between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column whose distribution is shown on the y-axis
        x_column (str, optional): Column that defines the groups on the x-axis
        color_column (str, optional): Column used to color-split each group
        title (str, optional): Plot title; left untouched when empty
        x_title (str, optional): X-axis label; left untouched when empty
        y_title (str, optional): Y-axis label; left untouched when empty
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot with color grouping
    """
    fig = px.box(df, x=x_column, y=y_column, color=color_column)

    # Axis labels are only overridden when the caller supplied one.
    if y_title:
        fig.update_yaxes(title=y_title)
    if x_title:
        fig.update_xaxes(title=x_title)

    # Width is always applied; the title joins the same layout update
    # only when it was given.
    layout_settings = {"width": width}
    if title:
        layout_settings["title"] = title
    fig.update_layout(**layout_settings)
    return fig
# Human-readable explanations shown above the subplots for known fields.
_FIELD_DESCRIPTIONS = {
    'eligibility_age_restrict': 'Age restriction beyond standard 18+ requirement',
    'eligibility_stage_size': 'Restrictions on cancer stage or tumor size',
    'eligibility_site': 'Restrictions on specific cancer sites',
    'eligibility_histological_type': 'Restrictions on cancer histology (e.g., SCC only)',
    'eligibility_performance_score': 'ECOG or other performance status requirements',
    'eligibility_comorbidities': 'Restrictions on patient comorbidities',
    'eligibility_hx_of_tt': 'Restrictions on previous treatment history',
    'eligibility_lab_values': 'Restrictions on laboratory test values',
    'eligibility_pregnancy_or_contraception': 'Pregnancy or contraception requirements',
    'eligibility_misc': 'Other restrictions (smoking, ethnicity, etc.)',
    'is_single_institution': 'Whether the study was conducted at a single institution'
}


def compare_field_by_category(df, field, categories=None, height=900, width=1200,
                              category_column="success_category"):
    """
    Create subplot comparing the distribution of a field across categories.

    This function is especially useful for comparing eligibility criteria between
    diverse and non-diverse studies. It creates a vertical stack of bar charts,
    one per category, each showing how many studies take each value of ``field``.

    Args:
        df (pandas.DataFrame): Dataset
        field (str): Column to compare, typically an eligibility criterion
        categories (list, optional): Categories to include in comparison;
            defaults to ``["Top20", "Bottom20", "Neither"]``
        height (int, optional): Plot height
        width (int, optional): Plot width
        category_column (str, optional): Column holding the category labels
            that ``categories`` is matched against

    Returns:
        plotly.graph_objects.Figure: Subplot with field distributions by category

    Raises:
        ValueError: If ``categories`` is empty.
    """
    if categories is None:
        categories = ["Top20", "Bottom20", "Neither"]
    categories = list(categories)
    if not categories:
        raise ValueError("categories must contain at least one category")

    n_rows = len(categories)
    # make_subplots rejects vertical_spacing above 1 / (rows - 1). Capping at
    # half of that limit keeps at least half the figure height for the bars
    # when many categories are requested (0.1 is kept for up to 6 rows).
    if n_rows > 1:
        spacing = min(0.1, 0.5 / (n_rows - 1))
    else:
        spacing = 0.1

    fig = make_subplots(
        rows=n_rows,
        subplot_titles=categories,
        vertical_spacing=spacing,
        shared_xaxes=True
    )

    for row, category in enumerate(categories, start=1):
        in_category = df[category_column] == category
        # Index = distinct values of the field, values = number of studies.
        counts = df.loc[in_category, field].value_counts()
        fig.add_trace(
            go.Bar(
                x=counts.index,
                y=counts.to_numpy(),
                name=category
            ),
            row=row,
            col=1
        )

    # If we have a description for this field, add it as an annotation
    description = _FIELD_DESCRIPTIONS.get(field)
    if description is not None:
        fig.add_annotation(
            xref="paper", yref="paper",
            x=0.5, y=1.05,
            text=f"{field.replace('eligibility_', '')}: {description}",
            showarrow=False,
            font=dict(size=12)
        )

    fig.update_layout(height=height, width=width)
    return fig
def create_geo_distribution_plot(df_top, df_bottom, location_column='location',
                                 title='Top and Bottom Study Locations'):
    """
    Create a map showing the geographic distribution of top and bottom studies.

    This visualization helps identify whether geographic location correlates with
    study diversity, considering that areas with more diverse populations may
    have higher potential for diverse study recruitment.

    This remains a placeholder implementation: how markers should be placed
    depends on how the location data is stored.

    NOTE(review): the traces use ``locationmode='USA-states'``, so
    ``location_column`` is assumed to hold US state codes (e.g. "CA") --
    confirm against the data. Values plotly cannot resolve are not drawn.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
        location_column (str, optional): Column with location information
        title (str, optional): Plot title

    Returns:
        plotly.graph_objects.Figure: Geographic distribution plot
    """
    fig = go.Figure()

    # One trace per study group: (data, marker color, legend label).
    study_groups = (
        (df_top, 'blue', 'Top Diverse Studies'),
        (df_bottom, 'red', 'Bottom Diverse Studies'),
    )
    for studies, marker_color, legend_name in study_groups:
        places = studies[location_column]
        fig.add_trace(go.Scattergeo(
            locationmode='USA-states',
            # Without `locations` (or lat/lon) the trace has no coordinates
            # and no marker is rendered; `text` only supplies hover labels.
            locations=places,
            text=places,
            mode='markers',
            marker=dict(
                size=10,
                color=marker_color,
                line=dict(width=1, color='black')
            ),
            name=legend_name
        ))

    fig.update_layout(
        title=title,
        geo=dict(
            scope='usa',
            showland=True,
            landcolor='rgb(217, 217, 217)',
            countrycolor='rgb(255, 255, 255)',
            lakecolor='rgb(255, 255, 255)',
            showlakes=True
        )
    )
    return fig
def create_participant_distribution_by_gender(df_top, df_bottom):
    """
    Box-plot male and female participant counts for top versus bottom studies.

    This visualization helps identify whether gender distribution differs between
    high-diversity and low-diversity studies. The inputs are not modified.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies; must contain
            ``num_male_participants`` and ``num_female_participants``
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies; same
            columns required

    Returns:
        plotly.graph_objects.Figure: Box plot of participant distribution by gender
    """
    # Tag each study with its group; assign() returns new frames, so the
    # caller's data stays untouched.
    tagged_studies = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom"),
    ])

    # Reshape to long form: one row per (study, sex) with a shared count column.
    sex_columns = (
        ("male", "num_male_participants"),
        ("female", "num_female_participants"),
    )
    long_frames = []
    for sex_label, count_column in sex_columns:
        subset = tagged_studies[["success_category", count_column]]
        subset = subset.rename(columns={count_column: "num_participants"})
        long_frames.append(subset.assign(sex=sex_label))
    participants_long = pd.concat(long_frames)

    axis_labels = {
        "success_category": "Success Category",
        "num_participants": "Number of Participants",
        "sex": "Gender"
    }
    fig = px.box(
        participants_long,
        x="success_category",
        y="num_participants",
        color="sex",
        title="Distribution of Participants by Gender in Top vs Bottom Studies",
        labels=axis_labels
    )

    # Short explanatory caption just above the plotting area.
    fig.add_annotation(
        text="This plot shows gender distribution in high vs. low diversity studies",
        x=0.5, y=1.05,
        xref="paper", yref="paper",
        showarrow=False,
        font={"size": 12}
    )
    fig.update_layout(width=700)
    return fig
def create_eligibility_score_comparison(df_top, df_bottom):
    """
    Box-plot the eligibility score of top versus bottom diverse studies.

    This visualization helps determine if more restrictive eligibility criteria
    (higher scores) correlate with lower diversity. The inputs are not modified.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies; must contain
            an ``eligibility_score`` column
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies; same
            column required

    Returns:
        plotly.graph_objects.Figure: Box plot of eligibility scores
    """
    # Label each group on a fresh copy, then stack them into one frame.
    labelled_groups = [
        studies.assign(diversity_category=label)
        for studies, label in ((df_top, "diverse"), (df_bottom, "not_diverse"))
    ]
    all_studies = pd.concat(labelled_groups)

    axis_labels = {
        "diversity_category": "Diversity Category",
        "eligibility_score": "Eligibility Score"
    }
    fig = px.box(
        all_studies,
        x="diversity_category",
        y="eligibility_score",
        title="Distribution of eligibility score for diverse studies vs non-diverse studies",
        labels=axis_labels
    )

    # Caption explaining how to read the score.
    fig.add_annotation(
        text="Eligibility score is the sum of all eligibility restrictions (higher = more restrictive)",
        x=0.5, y=1.05,
        xref="paper", yref="paper",
        showarrow=False,
        font={"size": 12}
    )
    return fig