--- a
+++ b/src/visualization.py
@@ -0,0 +1,363 @@
+"""
+Visualization Module for Diversity in Head and Neck Cancer Clinical Trials Analysis
+
+This module contains functions for creating visualizations to help analyze
+diversity in head and neck cancer clinical trials, with a particular focus
+on comparing eligibility criteria and other study characteristics between
+high-diversity and low-diversity studies.
+"""
+
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from plotly import graph_objects as go
+from plotly.subplots import make_subplots
+
+def plot_cdf(df, column_name, title=None, x_title=None, y_title=None, width=800,
+             bins=100):
+    """
+    Create a binned cumulative distribution function plot for a column.
+
+    This is useful for visualizing the distribution of the success metric
+    (percentage of non-white participants) across all studies.
+
+    Missing values are dropped before binning. Each cumulative fraction is
+    plotted at the right edge of its bin, because the running total through
+    a bin counts every value up to that bin's upper bound.
+
+    Args:
+        df (pandas.DataFrame): Dataset
+        column_name (str): Numeric column to plot
+        title (str, optional): Plot title
+        x_title (str, optional): X-axis label; falls back to column_name
+        y_title (str, optional): Y-axis label; falls back to
+            "Cumulative Probability"
+        width (int, optional): Plot width
+        bins (int, optional): Number of histogram bins
+
+    Returns:
+        plotly.graph_objects.Figure: CDF plot
+    """
+    # NaN makes np.histogram's automatic range detection raise ValueError.
+    values = df[column_name].dropna().to_numpy()
+    counts, edges = np.histogram(values, bins=bins)
+    running = np.cumsum(counts)
+    # Guard the normalization so an empty column gives a flat zero curve.
+    total = running[-1]
+    cdf = running / total if total else np.zeros(len(running))
+
+    fig = px.line(x=edges[1:], y=cdf)
+    fig.update_xaxes(title=x_title or column_name)
+    fig.update_yaxes(title=y_title or "Cumulative Probability")
+    if title:
+        fig.update_layout(title=title)
+    fig.update_layout(width=width)
+    return fig
+
+def plot_box_by_category(df, y_column, x_column="success_category",
+                         title=None, x_title=None, y_title=None, width=700):
+    """
+    Draw one box per category for a single column.
+
+    Handy for contrasting eligibility scores or participant counts between
+    diverse and non-diverse studies.
+
+    Args:
+        df (pandas.DataFrame): Dataset
+        y_column (str): Column plotted on the y-axis
+        x_column (str, optional): Grouping column placed on the x-axis
+        title (str, optional): Plot title; skipped when empty
+        x_title (str, optional): X-axis label; skipped when empty
+        y_title (str, optional): Y-axis label; skipped when empty
+        width (int, optional): Plot width
+
+    Returns:
+        plotly.graph_objects.Figure: Box plot
+    """
+    box_fig = px.box(df, x=x_column, y=y_column)
+
+    # The width is always applied; the title only when one was supplied.
+    layout_options = {"width": width}
+    if title:
+        layout_options["title"] = title
+    box_fig.update_layout(**layout_options)
+
+    # Axes without a supplied label are left untouched.
+    axis_setters = ((box_fig.update_xaxes, x_title), (box_fig.update_yaxes, y_title))
+    for set_axis, label in axis_setters:
+        if label:
+            set_axis(title=label)
+    return box_fig
+
+def plot_box_by_category_color(df, y_column, x_column="success_category",
+                               color_column=None, title=None,
+                               x_title=None, y_title=None, width=700):
+    """
+    Draw a box plot per category, optionally split into colored sub-groups.
+
+    Handy for contrasting participant demographics (e.g., by gender) between
+    diverse and non-diverse studies.
+
+    Args:
+        df (pandas.DataFrame): Dataset
+        y_column (str): Column plotted on the y-axis
+        x_column (str, optional): Grouping column placed on the x-axis
+        color_column (str, optional): Column that drives the color grouping
+        title (str, optional): Plot title; skipped when empty
+        x_title (str, optional): X-axis label; skipped when empty
+        y_title (str, optional): Y-axis label; skipped when empty
+        width (int, optional): Plot width
+
+    Returns:
+        plotly.graph_objects.Figure: Box plot with color grouping
+    """
+    grouped_fig = px.box(df, x=x_column, y=y_column, color=color_column)
+
+    # Each optional label is paired with the figure method that applies it.
+    optional_labels = (
+        (title, grouped_fig.update_layout),
+        (x_title, grouped_fig.update_xaxes),
+        (y_title, grouped_fig.update_yaxes),
+    )
+    for label, apply_label in optional_labels:
+        if label:
+            apply_label(title=label)
+
+    grouped_fig.update_layout(width=width)
+    return grouped_fig
+
+def compare_field_by_category(df, field, categories=None, height=900, width=1200):
+    """
+    Stack one bar chart per category showing how often each value occurs.
+
+    Intended for comparing an eligibility criterion between diverse and
+    non-diverse studies. Rows of df are selected through the
+    "success_category" column, and every category gets its own subplot row
+    with a shared x-axis. When field has an entry in the built-in description
+    table, a one-line explanation is added as an annotation above the
+    subplots.
+
+    Args:
+        df (pandas.DataFrame): Dataset with a "success_category" column
+        field (str): Column to compare, typically an eligibility criterion
+        categories (list, optional): Categories to include in comparison;
+            defaults to ["Top20", "Bottom20", "Neither"]
+        height (int, optional): Plot height
+        width (int, optional): Plot width
+
+    Returns:
+        plotly.graph_objects.Figure: Subplot with field distributions by category
+    """
+    # Human-readable explanations for the fields this function knows about
+    known_fields = {
+        'eligibility_age_restrict': 'Age restriction beyond standard 18+ requirement',
+        'eligibility_stage_size': 'Restrictions on cancer stage or tumor size',
+        'eligibility_site': 'Restrictions on specific cancer sites',
+        'eligibility_histological_type': 'Restrictions on cancer histology (e.g., SCC only)',
+        'eligibility_performance_score': 'ECOG or other performance status requirements',
+        'eligibility_comorbidities': 'Restrictions on patient comorbidities',
+        'eligibility_hx_of_tt': 'Restrictions on previous treatment history',
+        'eligibility_lab_values': 'Restrictions on laboratory test values',
+        'eligibility_pregnancy_or_contraception': 'Pregnancy or contraception requirements',
+        'eligibility_misc': 'Other restrictions (smoking, ethnicity, etc.)',
+        'is_single_institution': 'Whether the study was conducted at a single institution'
+    }
+
+    selected = ["Top20", "Bottom20", "Neither"] if categories is None else categories
+
+    fig = make_subplots(
+        rows=len(selected),
+        subplot_titles=selected,
+        vertical_spacing=0.1,
+        shared_xaxes=True,
+    )
+
+    for row_number, label in enumerate(selected, start=1):
+        in_category = df["success_category"] == label
+        counts = df[in_category][field].value_counts().reset_index()
+        # Name both columns explicitly instead of relying on reset_index naming
+        counts.columns = [field, "Num. Studies"]
+        bars = go.Bar(x=counts[field], y=counts["Num. Studies"], name=label)
+        fig.add_trace(bars, row=row_number, col=1)
+
+    # Only fields with a known description receive the explanatory note
+    description = known_fields.get(field)
+    if description is not None:
+        short_name = field.replace('eligibility_', '')
+        fig.add_annotation(
+            xref="paper",
+            yref="paper",
+            x=0.5,
+            y=1.05,
+            text=f"{short_name}: {description}",
+            showarrow=False,
+            font=dict(size=12),
+        )
+
+    fig.update_layout(height=height, width=width)
+    return fig
+
+def create_geo_distribution_plot(df_top, df_bottom, location_column='location',
+                                title='Top and Bottom Study Locations'):
+    """
+    Create a US map showing where top and bottom diverse studies are located.
+
+    This visualization helps identify whether geographic location correlates with
+    study diversity, considering that areas with more diverse populations may
+    have higher potential for diverse study recruitment.
+
+    Each group is drawn as its own marker trace: blue for df_top and red for
+    df_bottom. The values of location_column are used both as the marker
+    positions and as the hover text.
+
+    Args:
+        df_top (pandas.DataFrame): Dataset of top diverse studies
+        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
+        location_column (str, optional): Column with location information.
+            NOTE(review): markers are matched with locationmode
+            'USA-states', so this presumably needs two-letter state
+            abbreviations -- confirm against the actual data.
+        title (str, optional): Plot title
+
+    Returns:
+        plotly.graph_objects.Figure: Geographic distribution plot
+    """
+    # (data, marker color, legend name) for each trace, in drawing order
+    groups = (
+        (df_top, 'blue', 'Top Diverse Studies'),
+        (df_bottom, 'red', 'Bottom Diverse Studies'),
+    )
+
+    fig = go.Figure()
+    for studies, color, legend_name in groups:
+        places = studies[location_column]
+        fig.add_trace(go.Scattergeo(
+            # Without `locations` (or lat/lon) the trace has no coordinates
+            # and nothing is drawn; `text` alone only feeds the hover
+            # label, so the column is passed for both purposes.
+            locations=places,
+            locationmode='USA-states',
+            text=places,
+            mode='markers',
+            marker=dict(
+                size=10,
+                color=color,
+                line=dict(width=1, color='black')
+            ),
+            name=legend_name
+        ))
+
+    fig.update_layout(
+        title=title,
+        geo=dict(
+            scope='usa',
+            showland=True,
+            landcolor='rgb(217, 217, 217)',
+            countrycolor='rgb(255, 255, 255)',
+            lakecolor='rgb(255, 255, 255)',
+            showlakes=True
+        )
+    )
+
+    return fig
+
+def create_participant_distribution_by_gender(df_top, df_bottom):
+    """
+    Box-plot male and female participant counts for top versus bottom studies.
+
+    This visualization helps identify whether gender distribution differs between
+    high-diversity and low-diversity studies. The inputs are not modified; the
+    "num_male_participants" and "num_female_participants" columns are reshaped
+    into one long table with a "sex" column before plotting.
+
+    Args:
+        df_top (pandas.DataFrame): Dataset of top diverse studies
+        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
+
+    Returns:
+        plotly.graph_objects.Figure: Box plot of participant distribution by gender
+    """
+    # assign() returns a labelled copy, so the callers' frames stay untouched
+    labelled = pd.concat([
+        df_top.assign(success_category="top"),
+        df_bottom.assign(success_category="bottom"),
+    ])
+
+    # One (sex label, source column) pair per long-format slice, male first
+    count_columns = (
+        ("male", "num_male_participants"),
+        ("female", "num_female_participants"),
+    )
+    df_num_participants = pd.concat([
+        labelled[["success_category", source_column]]
+        .assign(sex=sex_label)
+        .rename(columns={source_column: "num_participants"})
+        for sex_label, source_column in count_columns
+    ])
+
+    axis_labels = {
+        "success_category": "Success Category",
+        "num_participants": "Number of Participants",
+        "sex": "Gender",
+    }
+    fig = px.box(
+        df_num_participants,
+        x="success_category",
+        y="num_participants",
+        color="sex",
+        title="Distribution of Participants by Gender in Top vs Bottom Studies",
+        labels=axis_labels,
+    )
+
+    # Caption positioned in paper coordinates, just above the plotting area
+    caption = "This plot shows gender distribution in high vs. low diversity studies"
+    fig.add_annotation(
+        xref="paper", yref="paper", x=0.5, y=1.05,
+        text=caption, showarrow=False, font=dict(size=12),
+    )
+
+    fig.update_layout(width=700)
+    return fig
+
+def create_eligibility_score_comparison(df_top, df_bottom):
+    """
+    Box-plot the eligibility score of diverse versus non-diverse studies.
+
+    This visualization helps determine if more restrictive eligibility criteria
+    (higher scores) correlate with lower diversity. Rows from df_top are
+    labelled "diverse" and rows from df_bottom "not_diverse"; the inputs
+    themselves are not modified.
+
+    Args:
+        df_top (pandas.DataFrame): Dataset of top diverse studies
+        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
+
+    Returns:
+        plotly.graph_objects.Figure: Box plot of eligibility scores
+    """
+    # assign() labels a copy of each frame, leaving the arguments untouched
+    df_combined = pd.concat([
+        df_top.assign(diversity_category="diverse"),
+        df_bottom.assign(diversity_category="not_diverse"),
+    ])
+
+    plot_title = "Distribution of eligibility score for diverse studies vs non-diverse studies"
+    axis_labels = {
+        "diversity_category": "Diversity Category",
+        "eligibility_score": "Eligibility Score",
+    }
+    fig = px.box(
+        df_combined,
+        x="diversity_category",
+        y="eligibility_score",
+        title=plot_title,
+        labels=axis_labels,
+    )
+
+    # Caption explaining how to read the score, placed above the plot area
+    caption = "Eligibility score is the sum of all eligibility restrictions (higher = more restrictive)"
+    fig.add_annotation(
+        xref="paper", yref="paper", x=0.5, y=1.05,
+        text=caption, showarrow=False, font=dict(size=12),
+    )
+    return fig
\ No newline at end of file