|
a |
|
b/src/visualization.py |
|
|
1 |
""" |
|
|
2 |
Visualization Module for Diversity in Head and Neck Cancer Clinical Trials Analysis |
|
|
3 |
|
|
|
4 |
This module contains functions for creating visualizations to help analyze |
|
|
5 |
diversity in head and neck cancer clinical trials, with a particular focus |
|
|
6 |
on comparing eligibility criteria and other study characteristics between |
|
|
7 |
high-diversity and low-diversity studies. |
|
|
8 |
""" |
|
|
9 |
|
|
|
10 |
import numpy as np |
|
|
11 |
import pandas as pd |
|
|
12 |
import plotly.express as px |
|
|
13 |
from plotly import graph_objects as go |
|
|
14 |
from plotly.subplots import make_subplots |
|
|
15 |
|
|
|
16 |
def plot_cdf(df, column_name, title=None, x_title=None, y_title=None, width=800):
    """
    Create a cumulative distribution function plot for a specific column.

    This is useful for visualizing the distribution of the success metric
    (percentage of non-white participants) across all studies.

    Missing values (NaN/None) in the column are dropped before binning,
    because ``np.histogram`` cannot derive a bin range from them. Each
    cumulative probability is plotted at the right edge of its bin, so the
    curve reaches 1.0 at the maximum observed value.

    Args:
        df (pandas.DataFrame): Dataset
        column_name (str): Column to plot
        title (str, optional): Plot title
        x_title (str, optional): X-axis label; falls back to ``column_name``
        y_title (str, optional): Y-axis label; falls back to
            "Cumulative Probability"
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: CDF plot
    """
    values = df[column_name].dropna()

    counts, bin_edges = np.histogram(values, bins=100)
    cumulative = np.cumsum(counts)
    total = cumulative[-1]
    if total > 0:
        cdf = cumulative / total
    else:
        # No usable observations: leave the curve blank instead of dividing 0/0.
        cdf = np.full(cumulative.shape, np.nan)

    # cumulative[i] counts every value up to the RIGHT edge of bin i, so pair
    # it with bin_edges[i + 1] rather than the left edge.
    fig = px.line(x=bin_edges[1:], y=cdf)

    fig.update_xaxes(title=x_title if x_title else column_name)
    fig.update_yaxes(title=y_title if y_title else "Cumulative Probability")

    if title:
        fig.update_layout(title=title)

    fig.update_layout(width=width)

    return fig
|
|
56 |
|
|
|
57 |
def plot_box_by_category(df, y_column, x_column="success_category",
                         title=None, x_title=None, y_title=None, width=700):
    """
    Draw a box plot of one variable, grouped by a categorical column.

    Useful for comparing eligibility scores or number of participants between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column whose distribution is shown on the y-axis
        x_column (str, optional): Column that defines the groups on the x-axis
        title (str, optional): Plot title; left unset when empty or None
        x_title (str, optional): X-axis label; left unset when empty or None
        y_title (str, optional): Y-axis label; left unset when empty or None
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot
    """
    fig = px.box(df, x=x_column, y=y_column)

    # Width is always applied; the title only when one was supplied.
    layout_options = {"width": width}
    if title:
        layout_options["title"] = title
    fig.update_layout(**layout_options)

    if x_title:
        fig.update_xaxes(title=x_title)
    if y_title:
        fig.update_yaxes(title=y_title)

    return fig
|
|
91 |
|
|
|
92 |
def plot_box_by_category_color(df, y_column, x_column="success_category",
                               color_column=None, title=None,
                               x_title=None, y_title=None, width=700):
    """
    Draw a box plot of one variable grouped by category, split further by color.

    Useful for comparing participant demographics (e.g., by gender) between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column whose distribution is shown on the y-axis
        x_column (str, optional): Column that defines the groups on the x-axis
        color_column (str, optional): Column passed to plotly as the color
            grouping
        title (str, optional): Plot title; left unset when empty or None
        x_title (str, optional): X-axis label; left unset when empty or None
        y_title (str, optional): Y-axis label; left unset when empty or None
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot with color grouping
    """
    fig = px.box(df, x=x_column, y=y_column, color=color_column)

    # Pair each optional axis label with the updater that applies it.
    axis_labels = (
        (x_title, fig.update_xaxes),
        (y_title, fig.update_yaxes),
    )
    for label, apply_axis_update in axis_labels:
        if label:
            apply_axis_update(title=label)

    if title:
        fig.update_layout(title=title)
    fig.update_layout(width=width)

    return fig
|
|
128 |
|
|
|
129 |
def compare_field_by_category(df, field, categories=None, height=900, width=1200,
                              category_column="success_category"):
    """
    Create subplot comparing the distribution of a field across categories.

    This function is especially useful for comparing eligibility criteria between
    diverse and non-diverse studies. It creates a vertical subplot with bar charts
    showing the distribution of values for a specific field in each category.

    Args:
        df (pandas.DataFrame): Dataset
        field (str): Column to compare, typically an eligibility criterion
        categories (list, optional): Categories to include in comparison;
            defaults to ["Top20", "Bottom20", "Neither"]
        height (int, optional): Plot height
        width (int, optional): Plot width
        category_column (str, optional): Column of ``df`` that holds the
            category label of each study

    Returns:
        plotly.graph_objects.Figure: Subplot with field distributions by category
    """
    if categories is None:
        categories = ["Top20", "Bottom20", "Neither"]

    # Subplot titles and trace names must be strings; coerce so that
    # non-string category labels are accepted as well.
    category_labels = [str(category) for category in categories]

    fig = make_subplots(
        rows=len(categories),
        subplot_titles=category_labels,
        vertical_spacing=0.1,
        shared_xaxes=True
    )

    for row, (category, label) in enumerate(zip(categories, category_labels), start=1):
        # value_counts() yields one entry per distinct field value with the
        # number of studies in this category that have it.
        study_counts = df.loc[df[category_column] == category, field].value_counts()

        fig.add_trace(
            go.Bar(
                x=study_counts.index,
                y=study_counts.values,
                name=label
            ),
            row=row,
            col=1
        )

    # Add field name explanation as annotation
    field_descriptions = {
        'eligibility_age_restrict': 'Age restriction beyond standard 18+ requirement',
        'eligibility_stage_size': 'Restrictions on cancer stage or tumor size',
        'eligibility_site': 'Restrictions on specific cancer sites',
        'eligibility_histological_type': 'Restrictions on cancer histology (e.g., SCC only)',
        'eligibility_performance_score': 'ECOG or other performance status requirements',
        'eligibility_comorbidities': 'Restrictions on patient comorbidities',
        'eligibility_hx_of_tt': 'Restrictions on previous treatment history',
        'eligibility_lab_values': 'Restrictions on laboratory test values',
        'eligibility_pregnancy_or_contraception': 'Pregnancy or contraception requirements',
        'eligibility_misc': 'Other restrictions (smoking, ethnicity, etc.)',
        'is_single_institution': 'Whether the study was conducted at a single institution'
    }

    # If we have a description for this field, add it as an annotation
    description = field_descriptions.get(field)
    if description is not None:
        fig.add_annotation(
            xref="paper", yref="paper",
            x=0.5, y=1.05,
            text=f"{field.replace('eligibility_', '')}: {description}",
            showarrow=False,
            font=dict(size=12)
        )

    fig.update_layout(height=height, width=width)

    return fig
|
|
199 |
|
|
|
200 |
def create_geo_distribution_plot(df_top, df_bottom, location_column='location',
                                 title='Top and Bottom Study Locations'):
    """
    Create a map showing the geographic distribution of top and bottom studies.

    This visualization helps identify whether geographic location correlates with
    study diversity, considering that areas with more diverse populations may
    have higher potential for diverse study recruitment.

    The values of ``location_column`` are passed to plotly as ``locations``
    with ``locationmode='USA-states'``; without them the traces carry no
    coordinates and no markers are drawn.
    NOTE(review): this assumes the column holds US state abbreviations
    (e.g. "CA") -- confirm against how the location data is actually stored.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
        location_column (str, optional): Column with location information
        title (str, optional): Plot title

    Returns:
        plotly.graph_objects.Figure: Geographic distribution plot
    """
    fig = go.Figure()

    # One marker trace per study group: (data, marker color, legend name).
    trace_specs = (
        (df_top, 'blue', 'Top Diverse Studies'),
        (df_bottom, 'red', 'Bottom Diverse Studies'),
    )
    for studies, marker_color, trace_name in trace_specs:
        study_locations = studies[location_column]
        fig.add_trace(go.Scattergeo(
            locationmode='USA-states',
            # Markers are positioned from these location IDs; `text` alone
            # only supplies hover labels.
            locations=study_locations,
            text=study_locations,
            mode='markers',
            marker=dict(
                size=10,
                color=marker_color,
                line=dict(width=1, color='black')
            ),
            name=trace_name
        ))

    fig.update_layout(
        title=title,
        geo=dict(
            scope='usa',
            showland=True,
            landcolor='rgb(217, 217, 217)',
            countrycolor='rgb(255, 255, 255)',
            lakecolor='rgb(255, 255, 255)',
            showlakes=True
        )
    )

    return fig
|
|
263 |
|
|
|
264 |
def create_participant_distribution_by_gender(df_top, df_bottom):
    """
    Build a box plot of male and female participant counts for top vs bottom studies.

    This visualization helps identify whether gender distribution differs between
    high-diversity and low-diversity studies.

    The inputs are not modified. Both frames must provide the columns
    "num_male_participants" and "num_female_participants".

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        plotly.graph_objects.Figure: Box plot of participant distribution by gender
    """
    # Label each group on a copy (assign never mutates its input) and stack them.
    labelled_studies = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom"),
    ])

    # Reshape to long form: one row per (study, sex) with a shared count column.
    sex_to_count_column = (
        ("male", "num_male_participants"),
        ("female", "num_female_participants"),
    )
    per_sex_frames = [
        labelled_studies[["success_category", count_column]]
        .assign(sex=sex_label)
        .rename(columns={count_column: "num_participants"})
        for sex_label, count_column in sex_to_count_column
    ]
    participants_long = pd.concat(per_sex_frames)

    axis_labels = {
        "success_category": "Success Category",
        "num_participants": "Number of Participants",
        "sex": "Gender"
    }
    fig = px.box(
        participants_long,
        x="success_category",
        y="num_participants",
        color="sex",
        title="Distribution of Participants by Gender in Top vs Bottom Studies",
        labels=axis_labels
    )

    # Explanatory caption placed just above the plotting area.
    caption = {
        "xref": "paper",
        "yref": "paper",
        "x": 0.5,
        "y": 1.05,
        "text": "This plot shows gender distribution in high vs. low diversity studies",
        "showarrow": False,
        "font": {"size": 12},
    }
    fig.add_annotation(**caption)

    fig.update_layout(width=700)

    return fig
|
|
321 |
|
|
|
322 |
def create_eligibility_score_comparison(df_top, df_bottom):
    """
    Build a box plot of eligibility scores for top vs bottom diverse studies.

    This visualization helps determine if more restrictive eligibility criteria
    (higher scores) correlate with lower diversity.

    The inputs are not modified. Both frames must provide an
    "eligibility_score" column.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        plotly.graph_objects.Figure: Box plot of eligibility scores
    """
    # Tag each group on a copy (assign never mutates its input) and stack them.
    scored_studies = pd.concat([
        df_top.assign(diversity_category="diverse"),
        df_bottom.assign(diversity_category="not_diverse"),
    ])

    axis_labels = {
        "diversity_category": "Diversity Category",
        "eligibility_score": "Eligibility Score"
    }
    fig = px.box(
        scored_studies,
        x="diversity_category",
        y="eligibility_score",
        title="Distribution of eligibility score for diverse studies vs non-diverse studies",
        labels=axis_labels
    )

    # Caption explaining how to read the score, placed just above the plot area.
    caption = {
        "xref": "paper",
        "yref": "paper",
        "x": 0.5,
        "y": 1.05,
        "text": "Eligibility score is the sum of all eligibility restrictions (higher = more restrictive)",
        "showarrow": False,
        "font": {"size": 12},
    }
    fig.add_annotation(**caption)

    return fig