Diff of /src/visualization.py [000000] .. [3cdecf]

Switch to unified view

a b/src/visualization.py
1
"""
2
Visualization Module for Diversity in Head and Neck Cancer Clinical Trials Analysis
3
4
This module contains functions for creating visualizations to help analyze
5
diversity in head and neck cancer clinical trials, with a particular focus
6
on comparing eligibility criteria and other study characteristics between
7
high-diversity and low-diversity studies.
8
"""
9
10
import numpy as np
11
import pandas as pd
12
import plotly.express as px
13
from plotly import graph_objects as go
14
from plotly.subplots import make_subplots
15
16
def plot_cdf(df, column_name, title=None, x_title=None, y_title=None, width=800):
    """
    Create a cumulative distribution function plot for a specific column.

    The column is split into 100 equal-width bins with ``np.histogram``; the
    running sum of the bin counts is normalised by the total count and drawn
    as a line. This is useful for visualizing the distribution of the success
    metric (percentage of non-white participants) across all studies.

    Args:
        df (pandas.DataFrame): Dataset
        column_name (str): Column to plot; missing values are ignored
        title (str, optional): Plot title
        x_title (str, optional): X-axis label (defaults to ``column_name``)
        y_title (str, optional): Y-axis label (defaults to
            "Cumulative Probability")
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: CDF plot
    """
    # NaN entries break np.histogram's automatic range detection, so only the
    # observed values are binned.
    values = df[column_name].dropna()

    hist, bins = np.histogram(values, bins=100)
    cdf = np.cumsum(hist)
    total = cdf[-1]
    # With no observations the total is zero; keep a flat zero curve instead
    # of dividing by zero and plotting NaNs.
    cdf = cdf / total if total > 0 else cdf.astype(float)

    # cdf[i] accounts for every value up to the *right* edge of bin i, so each
    # point belongs at bins[i + 1] rather than at the bin's left edge.
    fig = px.line(x=bins[1:], y=cdf)

    fig.update_xaxes(title=x_title if x_title else column_name)
    fig.update_yaxes(title=y_title if y_title else "Cumulative Probability")

    if title:
        fig.update_layout(title=title)

    fig.update_layout(width=width)

    return fig
56
57
def plot_box_by_category(df, y_column, x_column="success_category",
                         title=None, x_title=None, y_title=None, width=700):
    """
    Draw one box per category for a single numeric column.

    Typical use is comparing eligibility scores or participant counts between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column whose values are summarised by the boxes
        x_column (str, optional): Column that defines the categories
        title (str, optional): Plot title; skipped when empty or None
        x_title (str, optional): X-axis label; skipped when empty or None
        y_title (str, optional): Y-axis label; skipped when empty or None
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot
    """
    figure = px.box(df, x=x_column, y=y_column)

    # The width is always applied; the title only when one was supplied.
    layout_options = {"width": width}
    if title:
        layout_options["title"] = title
    figure.update_layout(**layout_options)

    # Axis titles are left untouched when no label is supplied.
    if x_title:
        figure.update_xaxes(title=x_title)
    if y_title:
        figure.update_yaxes(title=y_title)

    return figure
91
92
def plot_box_by_category_color(df, y_column, x_column="success_category",
                               color_column=None, title=None,
                               x_title=None, y_title=None, width=700):
    """
    Draw a box plot per category, optionally split into colored sub-groups.

    Typical use is comparing participant demographics (e.g., by gender)
    between diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column whose values are summarised by the boxes
        x_column (str, optional): Column that defines the categories
        color_column (str, optional): Column passed to Plotly Express as the
            ``color`` grouping
        title (str, optional): Plot title; skipped when empty or None
        x_title (str, optional): X-axis label; skipped when empty or None
        y_title (str, optional): Y-axis label; skipped when empty or None
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot with color grouping
    """
    figure = px.box(df, x=x_column, y=y_column, color=color_column)

    # The width is always applied; the title only when one was supplied.
    layout_options = {"width": width}
    if title:
        layout_options["title"] = title
    figure.update_layout(**layout_options)

    # Axis titles are left untouched when no label is supplied.
    if x_title:
        figure.update_xaxes(title=x_title)
    if y_title:
        figure.update_yaxes(title=y_title)

    return figure
128
129
def compare_field_by_category(df, field, categories=None, height=900, width=1200,
                              category_column="success_category"):
    """
    Create subplot comparing the distribution of a field across categories.

    This function is especially useful for comparing eligibility criteria between
    diverse and non-diverse studies. It creates a vertical stack of bar charts
    (one row per category, sharing the x-axis) showing how many studies take
    each value of ``field`` within that category.

    Args:
        df (pandas.DataFrame): Dataset
        field (str): Column to compare, typically an eligibility criterion
        categories (list, optional): Categories to include in comparison;
            defaults to ``["Top20", "Bottom20", "Neither"]``
        height (int, optional): Plot height
        width (int, optional): Plot width
        category_column (str, optional): Column holding the category label of
            each study

    Returns:
        plotly.graph_objects.Figure: Subplot with field distributions by category

    Raises:
        ValueError: If ``categories`` is empty.
    """
    if categories is None:
        categories = ["Top20", "Bottom20", "Neither"]
    categories = list(categories)
    if not categories:
        raise ValueError("categories must contain at least one category")

    # make_subplots rejects a vertical spacing larger than 1 / (rows - 1), so
    # the usual 0.1 gap is shrunk only when there are too many rows for it.
    n_rows = len(categories)
    spacing = 0.1
    if n_rows > 1:
        max_spacing = 1 / (n_rows - 1)
        if spacing > max_spacing:
            spacing = max_spacing / 2

    fig = make_subplots(
        rows=n_rows,
        subplot_titles=categories,
        vertical_spacing=spacing,
        shared_xaxes=True
    )

    for row, category in enumerate(categories, start=1):
        in_category = df[category_column] == category
        df_category = df.loc[in_category, field].value_counts().reset_index()
        # Rename explicitly: the column names produced by
        # value_counts().reset_index() are not relied upon here.
        df_category.columns = [field, "Num. Studies"]

        fig.add_trace(
            go.Bar(
                x=df_category[field],
                y=df_category["Num. Studies"],
                name=category
            ),
            row=row,
            col=1
        )

    # Human-readable explanations for the known fields
    field_descriptions = {
        'eligibility_age_restrict': 'Age restriction beyond standard 18+ requirement',
        'eligibility_stage_size': 'Restrictions on cancer stage or tumor size',
        'eligibility_site': 'Restrictions on specific cancer sites',
        'eligibility_histological_type': 'Restrictions on cancer histology (e.g., SCC only)',
        'eligibility_performance_score': 'ECOG or other performance status requirements',
        'eligibility_comorbidities': 'Restrictions on patient comorbidities',
        'eligibility_hx_of_tt': 'Restrictions on previous treatment history',
        'eligibility_lab_values': 'Restrictions on laboratory test values',
        'eligibility_pregnancy_or_contraception': 'Pregnancy or contraception requirements',
        'eligibility_misc': 'Other restrictions (smoking, ethnicity, etc.)',
        'is_single_institution': 'Whether the study was conducted at a single institution'
    }

    # If we have a description for this field, add it as an annotation
    # centered just above the plotting area.
    if field in field_descriptions:
        fig.add_annotation(
            xref="paper", yref="paper",
            x=0.5, y=1.05,
            text=f"{field.replace('eligibility_', '')}: {field_descriptions[field]}",
            showarrow=False,
            font=dict(size=12)
        )

    fig.update_layout(height=height, width=width)

    return fig
199
200
def create_geo_distribution_plot(df_top, df_bottom, location_column='location',
                                title='Top and Bottom Study Locations'):
    """
    Create a map showing the geographic distribution of top and bottom studies.

    This visualization helps identify whether geographic location correlates with
    study diversity, considering that areas with more diverse populations may
    have higher potential for diverse study recruitment.

    One marker trace is added per dataset (blue for top studies, red for
    bottom studies) on a map restricted to the USA. Markers are positioned by
    passing the location column as Plotly location IDs with
    ``locationmode='USA-states'``.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
        location_column (str, optional): Column with location information.
            NOTE(review): assumed to hold two-letter US state abbreviations,
            which is what the 'USA-states' location mode matches - confirm
            against the real data.
        title (str, optional): Plot title

    Returns:
        plotly.graph_objects.Figure: Geographic distribution plot
    """
    # This function would normally use the actual location data to create a map
    # The implementation would depend on how the location data is stored

    # Placeholder implementation
    fig = go.Figure()

    trace_specs = (
        (df_top, 'blue', 'Top Diverse Studies'),
        (df_bottom, 'red', 'Bottom Diverse Studies'),
    )
    for frame, color, name in trace_specs:
        locations = frame[location_column]
        fig.add_trace(go.Scattergeo(
            locationmode='USA-states',
            # Without `locations` (or lon/lat) a Scattergeo trace has no
            # coordinates and draws nothing; the same values stay as hover text.
            locations=locations,
            text=locations,
            mode='markers',
            marker=dict(
                size=10,
                color=color,
                line=dict(width=1, color='black')
            ),
            name=name
        ))

    fig.update_layout(
        title=title,
        geo=dict(
            scope='usa',
            showland=True,
            landcolor='rgb(217, 217, 217)',
            countrycolor='rgb(255, 255, 255)',
            lakecolor='rgb(255, 255, 255)',
            showlakes=True
        )
    )

    return fig
263
264
def create_participant_distribution_by_gender(df_top, df_bottom):
    """
    Build a box plot of male and female participant counts for top vs bottom studies.

    Both inputs are labelled ("top" / "bottom") on copies, stacked, and
    reshaped so that every study contributes one row per sex. The inputs
    themselves are not modified.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies; must provide
            the ``num_male_participants`` and ``num_female_participants`` columns
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies, with
            the same columns

    Returns:
        plotly.graph_objects.Figure: Box plot of participant distribution by gender
    """
    # assign() returns a labelled copy, leaving the caller's frames untouched.
    labelled = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom"),
    ])

    # One slice per sex (male first, then female), each renamed to a common
    # "num_participants" column so the slices can be stacked.
    per_sex_frames = [
        labelled[["success_category", f"num_{sex}_participants"]]
        .assign(sex=sex)
        .rename(columns={f"num_{sex}_participants": "num_participants"})
        for sex in ("male", "female")
    ]
    long_counts = pd.concat(per_sex_frames)

    axis_labels = {
        "success_category": "Success Category",
        "num_participants": "Number of Participants",
        "sex": "Gender"
    }
    figure = px.box(
        long_counts,
        x="success_category",
        y="num_participants",
        color="sex",
        title="Distribution of Participants by Gender in Top vs Bottom Studies",
        labels=axis_labels
    )

    # Short explanatory caption centered just above the plotting area.
    caption_font = dict(size=12)
    figure.add_annotation(
        text="This plot shows gender distribution in high vs. low diversity studies",
        x=0.5, y=1.05,
        xref="paper", yref="paper",
        font=caption_font,
        showarrow=False
    )

    figure.update_layout(width=700)

    return figure
321
322
def create_eligibility_score_comparison(df_top, df_bottom):
    """
    Build a box plot of eligibility scores for diverse vs non-diverse studies.

    The two inputs are labelled ("diverse" / "not_diverse") on copies and
    stacked, then the ``eligibility_score`` column is plotted per label. The
    inputs themselves are not modified.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies; must provide
            an ``eligibility_score`` column
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies, with
            the same column

    Returns:
        plotly.graph_objects.Figure: Box plot of eligibility scores
    """
    # assign() returns a labelled copy, leaving the caller's frames untouched.
    labelled = pd.concat([
        df_top.assign(diversity_category="diverse"),
        df_bottom.assign(diversity_category="not_diverse"),
    ])

    axis_labels = {
        "diversity_category": "Diversity Category",
        "eligibility_score": "Eligibility Score"
    }
    figure = px.box(
        labelled,
        x="diversity_category",
        y="eligibility_score",
        title="Distribution of eligibility score for diverse studies vs non-diverse studies",
        labels=axis_labels
    )

    # Caption explaining how to read the score, centered above the plot.
    caption_font = dict(size=12)
    figure.add_annotation(
        text="Eligibility score is the sum of all eligibility restrictions (higher = more restrictive)",
        x=0.5, y=1.05,
        xref="paper", yref="paper",
        font=caption_font,
        showarrow=False
    )

    return figure