"""
Visualization Module for Diversity in Head and Neck Cancer Clinical Trials Analysis
This module contains functions for creating visualizations to help analyze
diversity in head and neck cancer clinical trials, with a particular focus
on comparing eligibility criteria and other study characteristics between
high-diversity and low-diversity studies.
"""
import numpy as np
import pandas as pd
import plotly.express as px
from plotly import graph_objects as go
from plotly.subplots import make_subplots
def plot_cdf(df, column_name, title=None, x_title=None, y_title=None, width=800):
    """
    Create a cumulative distribution function plot for a specific column.

    This is useful for visualizing the distribution of the success metric
    (percentage of non-white participants) across all studies.

    The column is binned into 100 equal-width bins. Missing values are
    ignored, and each cumulative share is plotted at the right edge of its
    bin, i.e. the value by which that share of observations has been reached.

    Args:
        df (pandas.DataFrame): Dataset
        column_name (str): Column to plot
        title (str, optional): Plot title
        x_title (str, optional): X-axis label; defaults to ``column_name``
        y_title (str, optional): Y-axis label; defaults to
            "Cumulative Probability"
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: CDF plot
    """
    # np.histogram cannot auto-detect a bin range when NaNs are present,
    # so drop missing values before binning.
    values = pd.Series(df[column_name]).dropna()
    hist, bins = np.histogram(values, bins=100)

    cumulative_counts = np.cumsum(hist)
    total = cumulative_counts[-1]
    # With no observations, keep a flat zero line instead of dividing by zero
    # and filling the trace with NaNs.
    if total:
        cdf = cumulative_counts / total
    else:
        cdf = np.zeros(len(cumulative_counts))

    # cumulative_counts[i] covers everything up to the RIGHT edge of bin i,
    # so pair it with bins[1:] rather than the left edges.
    fig = px.line(x=bins[1:], y=cdf)

    fig.update_xaxes(title=x_title or column_name)
    fig.update_yaxes(title=y_title or "Cumulative Probability")
    if title:
        fig.update_layout(title=title)
    fig.update_layout(width=width)
    return fig
def plot_box_by_category(df, y_column, x_column="success_category",
                         title=None, x_title=None, y_title=None, width=700):
    """
    Draw a box plot of one variable, grouped by a category column.

    Useful for comparing eligibility scores or number of participants between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column to display on y-axis
        x_column (str, optional): Column to group by on x-axis
        title (str, optional): Plot title; left unset when falsy
        x_title (str, optional): X-axis label; left unset when falsy
        y_title (str, optional): Y-axis label; left unset when falsy
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot
    """
    figure = px.box(df, x=x_column, y=y_column)

    # Width is always applied; the title only when one was supplied.
    layout_kwargs = {"width": width}
    if title:
        layout_kwargs["title"] = title
    figure.update_layout(**layout_kwargs)

    # Apply whichever axis labels were supplied.
    axis_updates = (
        (x_title, figure.update_xaxes),
        (y_title, figure.update_yaxes),
    )
    for axis_title, apply_axis_title in axis_updates:
        if axis_title:
            apply_axis_title(title=axis_title)

    return figure
def plot_box_by_category_color(df, y_column, x_column="success_category",
                               color_column=None, title=None,
                               x_title=None, y_title=None, width=700):
    """
    Draw a box plot of one variable by category, split further by color.

    Useful for comparing participant demographics (e.g., by gender) between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column to display on y-axis
        x_column (str, optional): Column to group by on x-axis
        color_column (str, optional): Column to use for color grouping
        title (str, optional): Plot title; left unset when falsy
        x_title (str, optional): X-axis label; left unset when falsy
        y_title (str, optional): Y-axis label; left unset when falsy
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot with color grouping
    """
    box_fig = px.box(df, x=x_column, y=y_column, color=color_column)

    # The width is unconditional, so set it up front.
    box_fig.update_layout(width=width)

    # Optional decorations: each is applied only when a value was given.
    if y_title:
        box_fig.update_yaxes(title=y_title)
    if x_title:
        box_fig.update_xaxes(title=x_title)
    if title:
        box_fig.update_layout(title=title)

    return box_fig
def compare_field_by_category(df, field, categories=None, height=900, width=1200,
                              category_column="success_category"):
    """
    Create subplot comparing the distribution of a field across categories.

    This function is especially useful for comparing eligibility criteria between
    diverse and non-diverse studies. It creates a vertical subplot with bar charts
    showing the distribution of values for a specific field in each category.

    Args:
        df (pandas.DataFrame): Dataset
        field (str): Column to compare, typically an eligibility criterion
        categories (list, optional): Categories to include in comparison;
            defaults to ["Top20", "Bottom20", "Neither"]
        height (int, optional): Plot height
        width (int, optional): Plot width
        category_column (str, optional): Column holding the category label of
            each study; defaults to "success_category"

    Returns:
        plotly.graph_objects.Figure: Subplot with field distributions by category
    """
    if categories is None:
        categories = ["Top20", "Bottom20", "Neither"]

    # One stacked row per category, all sharing the same x-axis.
    fig = make_subplots(
        rows=len(categories),
        subplot_titles=categories,
        vertical_spacing=0.1,
        shared_xaxes=True
    )

    for row, category in enumerate(categories, start=1):
        # Count how many studies in this category take each value of `field`.
        in_category = df[category_column] == category
        df_category = df.loc[in_category, field].value_counts().reset_index()
        df_category.columns = [field, "Num. Studies"]
        fig.add_trace(
            go.Bar(
                x=df_category[field],
                y=df_category["Num. Studies"],
                name=category
            ),
            row=row,
            col=1
        )

    # Human-readable explanations for the known fields
    field_descriptions = {
        'eligibility_age_restrict': 'Age restriction beyond standard 18+ requirement',
        'eligibility_stage_size': 'Restrictions on cancer stage or tumor size',
        'eligibility_site': 'Restrictions on specific cancer sites',
        'eligibility_histological_type': 'Restrictions on cancer histology (e.g., SCC only)',
        'eligibility_performance_score': 'ECOG or other performance status requirements',
        'eligibility_comorbidities': 'Restrictions on patient comorbidities',
        'eligibility_hx_of_tt': 'Restrictions on previous treatment history',
        'eligibility_lab_values': 'Restrictions on laboratory test values',
        'eligibility_pregnancy_or_contraception': 'Pregnancy or contraception requirements',
        'eligibility_misc': 'Other restrictions (smoking, ethnicity, etc.)',
        'is_single_institution': 'Whether the study was conducted at a single institution'
    }

    # If we have a description for this field, add it as an annotation
    # placed just above the plotting area.
    if field in field_descriptions:
        fig.add_annotation(
            xref="paper", yref="paper",
            x=0.5, y=1.05,
            text=f"{field.replace('eligibility_', '')}: {field_descriptions[field]}",
            showarrow=False,
            font=dict(size=12)
        )

    fig.update_layout(height=height, width=width)
    return fig
def create_geo_distribution_plot(df_top, df_bottom, location_column='location',
                                 title='Top and Bottom Study Locations'):
    """
    Create a map showing the geographic distribution of top and bottom studies.

    This visualization helps identify whether geographic location correlates with
    study diversity, considering that areas with more diverse populations may
    have higher potential for diverse study recruitment.

    Each group is drawn as a marker trace on a USA-scoped map: top studies in
    blue and bottom studies in red. The location column is used both to
    position the markers and as their hover text.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
        location_column (str, optional): Column with location information
        title (str, optional): Plot title

    Returns:
        plotly.graph_objects.Figure: Geographic distribution plot
    """
    # Placeholder implementation: the final version depends on how the
    # location data is stored.
    # NOTE(review): with locationmode='USA-states' the location column is
    # presumably expected to hold two-letter state abbreviations -- confirm
    # against the dataset. Values that do not match are simply not drawn.
    fig = go.Figure()

    study_groups = (
        (df_top, 'blue', 'Top Diverse Studies'),
        (df_bottom, 'red', 'Bottom Diverse Studies'),
    )
    for studies, marker_color, trace_name in study_groups:
        fig.add_trace(go.Scattergeo(
            locationmode='USA-states',
            # Without `locations` (or lon/lat) the trace has no coordinates
            # and no markers can be rendered.
            locations=studies[location_column],
            text=studies[location_column],
            mode='markers',
            marker=dict(
                size=10,
                color=marker_color,
                line=dict(width=1, color='black')
            ),
            name=trace_name
        ))

    fig.update_layout(
        title=title,
        geo=dict(
            scope='usa',
            showland=True,
            landcolor='rgb(217, 217, 217)',
            countrycolor='rgb(255, 255, 255)',
            lakecolor='rgb(255, 255, 255)',
            showlakes=True
        )
    )
    return fig
def create_participant_distribution_by_gender(df_top, df_bottom):
    """
    Build a box plot of participant counts by gender for top and bottom studies.

    This visualization helps identify whether gender distribution differs between
    high-diversity and low-diversity studies. The inputs are not modified; the
    function reads their "num_male_participants" and "num_female_participants"
    columns.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        plotly.graph_objects.Figure: Box plot of participant distribution by gender
    """
    # Tag each group with its category on a copy, then stack them.
    labelled_studies = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom"),
    ])

    # Reshape to long form: one row per (study, sex) with a shared count column.
    sex_to_count_column = (
        ("male", "num_male_participants"),
        ("female", "num_female_participants"),
    )
    long_frames = [
        labelled_studies[["success_category", count_column]]
        .rename(columns={count_column: "num_participants"})
        .assign(sex=sex_label)
        for sex_label, count_column in sex_to_count_column
    ]
    participants_long = pd.concat(long_frames)

    axis_labels = {
        "success_category": "Success Category",
        "num_participants": "Number of Participants",
        "sex": "Gender"
    }
    figure = px.box(
        participants_long,
        x="success_category",
        y="num_participants",
        color="sex",
        title="Distribution of Participants by Gender in Top vs Bottom Studies",
        labels=axis_labels
    )

    # Explanatory caption placed just above the plotting area
    figure.add_annotation(
        text="This plot shows gender distribution in high vs. low diversity studies",
        x=0.5, y=1.05,
        xref="paper", yref="paper",
        showarrow=False,
        font=dict(size=12)
    )
    figure.update_layout(width=700)
    return figure
def create_eligibility_score_comparison(df_top, df_bottom):
    """
    Build a box plot of eligibility scores for top versus bottom diverse studies.

    This visualization helps determine if more restrictive eligibility criteria
    (higher scores) correlate with lower diversity. The inputs are not modified;
    the function reads their "eligibility_score" column.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        plotly.graph_objects.Figure: Box plot of eligibility scores
    """
    # Label each group on a copy and stack them into one frame.
    combined_studies = pd.concat([
        df_top.assign(diversity_category="diverse"),
        df_bottom.assign(diversity_category="not_diverse"),
    ])

    axis_labels = {
        "diversity_category": "Diversity Category",
        "eligibility_score": "Eligibility Score"
    }
    figure = px.box(
        combined_studies,
        x="diversity_category",
        y="eligibility_score",
        title="Distribution of eligibility score for diverse studies vs non-diverse studies",
        labels=axis_labels
    )

    # Caption explaining how to read the score, placed above the plotting area
    figure.add_annotation(
        text="Eligibility score is the sum of all eligibility restrictions (higher = more restrictive)",
        x=0.5, y=1.05,
        xref="paper", yref="paper",
        showarrow=False,
        font=dict(size=12)
    )
    return figure