# src/visualization.py
"""
Visualization Module for Diversity in Head and Neck Cancer Clinical Trials Analysis

This module contains functions for creating visualizations to help analyze
diversity in head and neck cancer clinical trials, with a particular focus
on comparing eligibility criteria and other study characteristics between
high-diversity and low-diversity studies.
"""

import numpy as np
import pandas as pd
import plotly.express as px
from plotly import graph_objects as go
from plotly.subplots import make_subplots

# Human-readable explanations shown as an annotation by
# ``compare_field_by_category`` when the compared field is one of these keys.
_FIELD_DESCRIPTIONS = {
    'eligibility_age_restrict': 'Age restriction beyond standard 18+ requirement',
    'eligibility_stage_size': 'Restrictions on cancer stage or tumor size',
    'eligibility_site': 'Restrictions on specific cancer sites',
    'eligibility_histological_type': 'Restrictions on cancer histology (e.g., SCC only)',
    'eligibility_performance_score': 'ECOG or other performance status requirements',
    'eligibility_comorbidities': 'Restrictions on patient comorbidities',
    'eligibility_hx_of_tt': 'Restrictions on previous treatment history',
    'eligibility_lab_values': 'Restrictions on laboratory test values',
    'eligibility_pregnancy_or_contraception': 'Pregnancy or contraception requirements',
    'eligibility_misc': 'Other restrictions (smoking, ethnicity, etc.)',
    'is_single_institution': 'Whether the study was conducted at a single institution',
}


def plot_cdf(df, column_name, title=None, x_title=None, y_title=None, width=800):
    """
    Create a cumulative distribution function plot for a specific column.

    This is useful for visualizing the distribution of the success metric
    (percentage of non-white participants) across all studies.

    The CDF is approximated from a 100-bin histogram. Missing values (NaN)
    in the column are ignored. If no values remain, a flat zero curve is
    plotted rather than dividing by zero.

    Args:
        df (pandas.DataFrame): Dataset
        column_name (str): Column to plot
        title (str, optional): Plot title
        x_title (str, optional): X-axis label; defaults to ``column_name``
        y_title (str, optional): Y-axis label; defaults to
            "Cumulative Probability"
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: CDF plot
    """
    # np.histogram raises on NaN because the auto-detected range is not
    # finite, so missing values are removed up front.
    values = df[column_name].dropna().to_numpy()

    counts, edges = np.histogram(values, bins=100)
    cumulative = np.cumsum(counts)
    total = cumulative[-1]
    if total:
        cdf = cumulative / total
    else:
        cdf = np.zeros(len(cumulative), dtype=float)

    # cumulative[i] counts every value up to the RIGHT edge of bin i, so the
    # curve is drawn against edges[1:]; it then reaches 1.0 at the maximum.
    fig = px.line(x=edges[1:], y=cdf)

    fig.update_xaxes(title=x_title or column_name)
    fig.update_yaxes(title=y_title or "Cumulative Probability")

    if title:
        fig.update_layout(title=title)

    fig.update_layout(width=width)

    return fig


def plot_box_by_category(df, y_column, x_column="success_category",
                         title=None, x_title=None, y_title=None, width=700):
    """
    Create a box plot comparing a variable across categories.

    Useful for comparing eligibility scores or number of participants between
    diverse and non-diverse studies.

    This is the uncolored variant of ``plot_box_by_category_color`` and
    delegates to it so both plots share one implementation.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column to display on y-axis
        x_column (str, optional): Column to group by on x-axis
        title (str, optional): Plot title
        x_title (str, optional): X-axis label
        y_title (str, optional): Y-axis label
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot
    """
    return plot_box_by_category_color(
        df,
        y_column,
        x_column=x_column,
        color_column=None,
        title=title,
        x_title=x_title,
        y_title=y_title,
        width=width,
    )


def plot_box_by_category_color(df, y_column, x_column="success_category",
                               color_column=None, title=None,
                               x_title=None, y_title=None, width=700):
    """
    Create a box plot comparing a variable across categories with color grouping.

    Useful for comparing participant demographics (e.g., by gender) between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column to display on y-axis
        x_column (str, optional): Column to group by on x-axis
        color_column (str, optional): Column to use for color grouping;
            ``None`` produces a single-color box plot
        title (str, optional): Plot title
        x_title (str, optional): X-axis label
        y_title (str, optional): Y-axis label
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot with color grouping
    """
    fig = px.box(df, x=x_column, y=y_column, color=color_column)

    # Only override titles that were explicitly supplied; otherwise the
    # labels chosen by plotly express are kept.
    if title:
        fig.update_layout(title=title)

    if x_title:
        fig.update_xaxes(title=x_title)

    if y_title:
        fig.update_yaxes(title=y_title)

    fig.update_layout(width=width)

    return fig


def compare_field_by_category(df, field, categories=None, height=900, width=1200):
    """
    Create subplot comparing the distribution of a field across categories.

    This function is especially useful for comparing eligibility criteria between
    diverse and non-diverse studies. It creates a vertical subplot with bar charts
    showing the distribution of values for a specific field in each category.

    Rows of ``df`` are assigned to a category through the
    ``"success_category"`` column.

    Args:
        df (pandas.DataFrame): Dataset; must contain ``"success_category"``
            and ``field`` columns
        field (str): Column to compare, typically an eligibility criterion
        categories (list, optional): Categories to include in comparison;
            defaults to ``["Top20", "Bottom20", "Neither"]``
        height (int, optional): Plot height
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Subplot with field distributions by category
    """
    if categories is None:
        categories = ["Top20", "Bottom20", "Neither"]

    fig = make_subplots(
        rows=len(categories),
        subplot_titles=categories,
        vertical_spacing=0.1,
        shared_xaxes=True
    )

    for row, category in enumerate(categories, start=1):
        in_category = df["success_category"] == category
        # Number of studies per distinct value of ``field`` in this category.
        counts = df.loc[in_category, field].value_counts()

        fig.add_trace(
            go.Bar(
                x=counts.index,
                y=counts.to_numpy(),
                name=category
            ),
            row=row,
            col=1
        )

    # If we have a description for this field, add it as an annotation
    description = _FIELD_DESCRIPTIONS.get(field)
    if description is not None:
        fig.add_annotation(
            xref="paper", yref="paper",
            x=0.5, y=1.05,
            text=f"{field.replace('eligibility_', '')}: {description}",
            showarrow=False,
            font=dict(size=12)
        )

    fig.update_layout(height=height, width=width)

    return fig


def _study_location_trace(df, location_column, color, name):
    """Build one Scattergeo marker trace for the locations in ``df``."""
    locations = df[location_column]
    return go.Scattergeo(
        locationmode='USA-states',
        # Without ``locations`` (or lon/lat) a Scattergeo trace has no
        # coordinates and draws nothing; ``text`` only sets the hover label.
        locations=locations,
        text=locations,
        mode='markers',
        marker=dict(
            size=10,
            color=color,
            line=dict(width=1, color='black')
        ),
        name=name
    )


def create_geo_distribution_plot(df_top, df_bottom, location_column='location',
                                 title='Top and Bottom Study Locations'):
    """
    Create a map showing the geographic distribution of top and bottom studies.

    This visualization helps identify whether geographic location correlates with
    study diversity, considering that areas with more diverse populations may
    have higher potential for diverse study recruitment.

    NOTE(review): this is still a placeholder implementation. The traces use
    ``locationmode='USA-states'``, so ``location_column`` is assumed to hold
    US state abbreviations -- confirm against how the location data is
    actually stored. Values plotly cannot match are simply not drawn.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
        location_column (str, optional): Column with location information
        title (str, optional): Plot title

    Returns:
        plotly.graph_objects.Figure: Geographic distribution plot
    """
    fig = go.Figure()

    # Top studies in blue, bottom studies in red.
    fig.add_trace(
        _study_location_trace(df_top, location_column, 'blue', 'Top Diverse Studies')
    )
    fig.add_trace(
        _study_location_trace(df_bottom, location_column, 'red', 'Bottom Diverse Studies')
    )

    fig.update_layout(
        title=title,
        geo=dict(
            scope='usa',
            showland=True,
            landcolor='rgb(217, 217, 217)',
            countrycolor='rgb(255, 255, 255)',
            lakecolor='rgb(255, 255, 255)',
            showlakes=True
        )
    )

    return fig


def create_participant_distribution_by_gender(df_top, df_bottom):
    """
    Create a boxplot showing the distribution of participants by gender for top and bottom studies.

    This visualization helps identify whether gender distribution differs between
    high-diversity and low-diversity studies.

    Both inputs must contain ``"num_male_participants"`` and
    ``"num_female_participants"`` columns. The inputs are not modified.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        plotly.graph_objects.Figure: Box plot of participant distribution by gender
    """
    # ``assign`` returns new frames, so the callers' data is left untouched.
    df_all = pd.concat(
        [
            df_top.assign(success_category="top"),
            df_bottom.assign(success_category="bottom"),
        ],
        ignore_index=True,
    )

    # Reshape to long format: one row per (study, sex) with a shared
    # "num_participants" column, so that sex can drive the box color.
    per_sex_frames = [
        df_all[["success_category", source_column]]
        .assign(sex=sex)
        .rename(columns={source_column: "num_participants"})
        for sex, source_column in (
            ("male", "num_male_participants"),
            ("female", "num_female_participants"),
        )
    ]
    # ignore_index avoids duplicated index labels in the stacked frame.
    df_num_participants = pd.concat(per_sex_frames, ignore_index=True)

    fig = px.box(
        df_num_participants,
        x="success_category",
        y="num_participants",
        color="sex",
        title="Distribution of Participants by Gender in Top vs Bottom Studies",
        labels={
            "success_category": "Success Category",
            "num_participants": "Number of Participants",
            "sex": "Gender"
        }
    )

    # Add annotation explaining the visualization
    fig.add_annotation(
        xref="paper", yref="paper",
        x=0.5, y=1.05,
        text="This plot shows gender distribution in high vs. low diversity studies",
        showarrow=False,
        font=dict(size=12)
    )

    fig.update_layout(width=700)

    return fig


def create_eligibility_score_comparison(df_top, df_bottom):
    """
    Create a boxplot comparing eligibility scores between top and bottom diverse studies.

    This visualization helps determine if more restrictive eligibility criteria
    (higher scores) correlate with lower diversity.

    Both inputs must contain an ``"eligibility_score"`` column. The inputs
    are not modified.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        plotly.graph_objects.Figure: Box plot of eligibility scores
    """
    # Label each group, then stack them; ``assign`` leaves the inputs untouched.
    df_combined = pd.concat(
        [
            df_top.assign(diversity_category="diverse"),
            df_bottom.assign(diversity_category="not_diverse"),
        ],
        ignore_index=True,
    )

    fig = px.box(
        df_combined,
        x="diversity_category",
        y="eligibility_score",
        title="Distribution of eligibility score for diverse studies vs non-diverse studies",
        labels={
            "diversity_category": "Diversity Category",
            "eligibility_score": "Eligibility Score"
        }
    )

    # Add annotation explaining the eligibility score
    fig.add_annotation(
        xref="paper", yref="paper",
        x=0.5, y=1.05,
        text="Eligibility score is the sum of all eligibility restrictions (higher = more restrictive)",
        showarrow=False,
        font=dict(size=12)
    )

    return fig