Head-and-Neck-Trials / Git / [3cdecf] /src/visualization.py

Models:
joseph-gordon/
Head-and-Neck-Trials
Downloads: 1
[3cdecf]: / src / visualization.py
History
Download this file
363 lines (295 with data), 12.4 kB

"""
Visualization Module for Diversity in Head and Neck Cancer Clinical Trials Analysis

This module contains functions for creating visualizations to help analyze
diversity in head and neck cancer clinical trials, with a particular focus
on comparing eligibility criteria and other study characteristics between
high-diversity and low-diversity studies.
"""

import numpy as np
import pandas as pd
import plotly.express as px
from plotly import graph_objects as go
from plotly.subplots import make_subplots

def plot_cdf(df, column_name, title=None, x_title=None, y_title=None, width=800):
    """
    Create a cumulative distribution function (CDF) plot for a specific column.

    The finite values of the column are binned into 100 equal-width bins with
    ``np.histogram``. The running total of the bin counts, divided by the
    number of binned values, is drawn against the upper edge of each bin.

    This is useful for visualizing the distribution of the success metric
    (percentage of non-white participants) across all studies.

    Args:
        df (pandas.DataFrame): Dataset
        column_name (str): Numeric column to plot
        title (str, optional): Plot title; no title is set when omitted
        x_title (str, optional): X-axis label; defaults to ``column_name``
        y_title (str, optional): Y-axis label; defaults to
            "Cumulative Probability"
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: CDF plot
    """
    # np.histogram cannot auto-detect a bin range when the data contains NaN
    # or infinity, so only finite values are binned.
    values = np.asarray(df[column_name], dtype=float)
    values = values[np.isfinite(values)]

    counts, edges = np.histogram(values, bins=100)
    cumulative = np.cumsum(counts)
    total = cumulative[-1]

    # With no usable values the total is zero; keep a flat zero curve instead
    # of dividing by zero and plotting NaN.
    if total > 0:
        cdf = cumulative / total
    else:
        cdf = cumulative.astype(float)

    # cumulative[i] covers every value up to the *upper* edge of bin i, so the
    # matching x coordinate is edges[i + 1], not the lower edge edges[i].
    fig = px.line(x=edges[1:], y=cdf)

    fig.update_xaxes(title=x_title or column_name)
    fig.update_yaxes(title=y_title or "Cumulative Probability")

    if title:
        fig.update_layout(title=title)

    fig.update_layout(width=width)

    return fig

def plot_box_by_category(df, y_column, x_column="success_category", 
                         title=None, x_title=None, y_title=None, width=700):
    """
    Draw a box plot of one variable, with one box per category.

    Handy for comparing eligibility scores or participant counts between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column whose values are summarized on the y-axis
        x_column (str, optional): Column that defines the groups on the x-axis
        title (str, optional): Plot title; left unset when omitted
        x_title (str, optional): X-axis label; left unset when omitted
        y_title (str, optional): Y-axis label; left unset when omitted
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot
    """
    figure = px.box(df, x=x_column, y=y_column)
    figure.update_layout(width=width)

    # Pair each optional label with the figure method that applies it; a label
    # is only applied when a non-empty value was supplied.
    optional_labels = (
        (figure.update_layout, title),
        (figure.update_xaxes, x_title),
        (figure.update_yaxes, y_title),
    )
    for apply_label, label in optional_labels:
        if label:
            apply_label(title=label)

    return figure

def plot_box_by_category_color(df, y_column, x_column="success_category", 
                               color_column=None, title=None, 
                               x_title=None, y_title=None, width=700):
    """
    Draw a box plot of one variable per category, split further by color.

    Handy for comparing participant demographics (e.g., by gender) between
    diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        y_column (str): Column whose values are summarized on the y-axis
        x_column (str, optional): Column that defines the groups on the x-axis
        color_column (str, optional): Column that defines the color groups
        title (str, optional): Plot title; left unset when omitted
        x_title (str, optional): X-axis label; left unset when omitted
        y_title (str, optional): Y-axis label; left unset when omitted
        width (int, optional): Plot width

    Returns:
        plotly.graph_objects.Figure: Box plot with color grouping
    """
    chart = px.box(df, x=x_column, y=y_column, color=color_column)

    # Collect the layout settings first so they are applied in one call; the
    # title is included only when one was supplied.
    layout_options = {"width": width}
    if title:
        layout_options["title"] = title
    chart.update_layout(**layout_options)

    if x_title:
        chart.update_xaxes(title=x_title)
    if y_title:
        chart.update_yaxes(title=y_title)

    return chart

# Human-readable explanations of known fields, shown as a figure annotation.
_FIELD_DESCRIPTIONS = {
    'eligibility_age_restrict': 'Age restriction beyond standard 18+ requirement',
    'eligibility_stage_size': 'Restrictions on cancer stage or tumor size',
    'eligibility_site': 'Restrictions on specific cancer sites',
    'eligibility_histological_type': 'Restrictions on cancer histology (e.g., SCC only)',
    'eligibility_performance_score': 'ECOG or other performance status requirements',
    'eligibility_comorbidities': 'Restrictions on patient comorbidities',
    'eligibility_hx_of_tt': 'Restrictions on previous treatment history',
    'eligibility_lab_values': 'Restrictions on laboratory test values',
    'eligibility_pregnancy_or_contraception': 'Pregnancy or contraception requirements',
    'eligibility_misc': 'Other restrictions (smoking, ethnicity, etc.)',
    'is_single_institution': 'Whether the study was conducted at a single institution'
}


def compare_field_by_category(df, field, categories=None, height=900, width=1200,
                              category_column="success_category"):
    """
    Create subplots comparing the distribution of a field across categories.

    One bar chart is drawn per category, stacked vertically with a shared
    x-axis. Each chart shows how many rows of that category take each value of
    ``field``. This is especially useful for comparing eligibility criteria
    between diverse and non-diverse studies.

    Args:
        df (pandas.DataFrame): Dataset
        field (str): Column to compare, typically an eligibility criterion
        categories (list, optional): Categories to include in comparison;
            defaults to ["Top20", "Bottom20", "Neither"]
        height (int, optional): Plot height
        width (int, optional): Plot width
        category_column (str, optional): Column holding the category of each
            row; defaults to "success_category"

    Returns:
        plotly.graph_objects.Figure: Subplot with field distributions by category

    Raises:
        ValueError: If ``categories`` is empty.
    """
    if categories is None:
        categories = ["Top20", "Bottom20", "Neither"]
    categories = list(categories)

    n_rows = len(categories)
    if n_rows == 0:
        raise ValueError("categories must contain at least one category")

    # A fixed 0.1 gap stops fitting once there are many rows (make_subplots
    # rejects spacing above 1 / (rows - 1)); capping it at 1 / rows always
    # leaves some height for the bars themselves.
    fig = make_subplots(
        rows=n_rows,
        subplot_titles=categories,
        vertical_spacing=min(0.1, 1.0 / n_rows),
        shared_xaxes=True
    )

    for row, category in enumerate(categories, start=1):
        # Number of rows per distinct value of `field` within this category.
        counts = df.loc[df[category_column] == category, field].value_counts()

        fig.add_trace(
            go.Bar(
                x=counts.index,
                y=counts.to_numpy(),
                name=category
            ),
            row=row,
            col=1
        )

    # If we have a description for this field, add it above the plot area.
    if field in _FIELD_DESCRIPTIONS:
        fig.add_annotation(
            xref="paper", yref="paper",
            x=0.5, y=1.05,
            text=f"{field.replace('eligibility_', '')}: {_FIELD_DESCRIPTIONS[field]}",
            showarrow=False,
            font=dict(size=12)
        )

    fig.update_layout(height=height, width=width)

    return fig

def _state_marker_trace(df, location_column, color, name):
    """Build one Scattergeo marker trace from the location column of ``df``."""
    locations = df[location_column]
    return go.Scattergeo(
        locationmode='USA-states',
        # Without `locations` (or lat/lon) the trace has no positions and
        # nothing is drawn.
        # NOTE(review): 'USA-states' matches two-letter state abbreviations;
        # this assumes the column holds such codes -- confirm against the data.
        locations=locations,
        text=locations,
        mode='markers',
        marker=dict(
            size=10,
            color=color,
            line=dict(width=1, color='black')
        ),
        name=name
    )


def create_geo_distribution_plot(df_top, df_bottom, location_column='location', 
                                title='Top and Bottom Study Locations'):
    """
    Create a map showing the geographic distribution of top and bottom studies.

    This visualization helps identify whether geographic location correlates with
    study diversity, considering that areas with more diverse populations may
    have higher potential for diverse study recruitment.

    Top studies are drawn as blue markers and bottom studies as red markers on
    a map restricted to the USA. The values of ``location_column`` are used
    both as marker positions and as hover text.

    NOTE(review): this is still a placeholder-level implementation. Markers
    are positioned with ``locationmode='USA-states'``, so it presumably only
    works when the location column contains two-letter US state
    abbreviations -- confirm how locations are stored.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
        location_column (str, optional): Column with location information
        title (str, optional): Plot title

    Returns:
        plotly.graph_objects.Figure: Geographic distribution plot
    """
    fig = go.Figure()

    fig.add_trace(
        _state_marker_trace(df_top, location_column, 'blue', 'Top Diverse Studies')
    )
    fig.add_trace(
        _state_marker_trace(df_bottom, location_column, 'red', 'Bottom Diverse Studies')
    )

    fig.update_layout(
        title=title,
        geo=dict(
            scope='usa',
            showland=True,
            landcolor='rgb(217, 217, 217)',
            countrycolor='rgb(255, 255, 255)',
            lakecolor='rgb(255, 255, 255)',
            showlakes=True
        )
    )

    return fig

def create_participant_distribution_by_gender(df_top, df_bottom):
    """
    Box plot of male and female participant counts for top vs bottom studies.

    The two datasets are labelled "top" and "bottom", stacked, and reshaped so
    that every study contributes one row per sex. The result helps show
    whether gender distribution differs between high-diversity and
    low-diversity studies. The input frames are not modified.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        plotly.graph_objects.Figure: Box plot of participant distribution by gender
    """
    # assign() returns labelled copies, leaving the callers' frames untouched.
    combined = pd.concat([
        df_top.assign(success_category="top"),
        df_bottom.assign(success_category="bottom"),
    ])

    # Reshape to long form: one frame per sex, male rows first, then female.
    count_columns = {
        "male": "num_male_participants",
        "female": "num_female_participants",
    }
    per_sex_frames = []
    for sex_label, count_column in count_columns.items():
        subset = combined[["success_category", count_column]]
        subset = subset.assign(sex=sex_label)
        subset = subset.rename(columns={count_column: "num_participants"})
        per_sex_frames.append(subset)
    participants_long = pd.concat(per_sex_frames)

    axis_labels = {
        "success_category": "Success Category",
        "num_participants": "Number of Participants",
        "sex": "Gender"
    }
    figure = px.box(
        participants_long,
        x="success_category",
        y="num_participants",
        color="sex",
        title="Distribution of Participants by Gender in Top vs Bottom Studies",
        labels=axis_labels
    )

    # Short explanatory caption placed just above the plot area.
    figure.add_annotation(
        xref="paper", yref="paper",
        x=0.5, y=1.05,
        text="This plot shows gender distribution in high vs. low diversity studies",
        showarrow=False,
        font=dict(size=12)
    )
    figure.update_layout(width=700)

    return figure

def create_eligibility_score_comparison(df_top, df_bottom):
    """
    Box plot of eligibility scores for diverse vs non-diverse studies.

    Rows from ``df_top`` are labelled "diverse" and rows from ``df_bottom``
    "not_diverse"; the ``eligibility_score`` column of the stacked data is
    then plotted per label. This helps determine if more restrictive
    eligibility criteria (higher scores) correlate with lower diversity.
    The input frames are not modified.

    Args:
        df_top (pandas.DataFrame): Dataset of top diverse studies
        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies

    Returns:
        plotly.graph_objects.Figure: Box plot of eligibility scores
    """
    # assign() returns labelled copies, leaving the callers' frames untouched.
    labelled_frames = [
        df_top.assign(diversity_category="diverse"),
        df_bottom.assign(diversity_category="not_diverse"),
    ]
    scores = pd.concat(labelled_frames)

    axis_labels = {
        "diversity_category": "Diversity Category",
        "eligibility_score": "Eligibility Score"
    }
    figure = px.box(
        scores,
        x="diversity_category",
        y="eligibility_score",
        title="Distribution of eligibility score for diverse studies vs non-diverse studies",
        labels=axis_labels
    )

    # Caption explaining how to read the score, placed just above the plot area.
    caption = dict(
        xref="paper", yref="paper",
        x=0.5, y=1.05,
        text="Eligibility score is the sum of all eligibility restrictions (higher = more restrictive)",
        showarrow=False,
        font=dict(size=12)
    )
    figure.add_annotation(**caption)

    return figure