In [None]:
"""
Initialize environment and import necessary libraries for the analysis
of diversity in head and neck cancer clinical trials.
"""

import warnings
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 500)

# Suppress FutureWarning messages
warnings.simplefilter(action='ignore')

In [None]:
# Import visualization libraries for creating interactive plots
import plotly
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Exclusion criteria for the analysis

This analysis examines diversity in head and neck cancer clinical trials, focusing on studies conducted in the United States with reported race information.

The dataset contains studies conducted all over the world. Considering the cultural context of the United States, we include only studies conducted exclusively in the US. Further, because the analysis depends on the race of the participants, we apply an additional filter to keep only those studies where both "Num. White participants" and "Num. Non-white participants" were reported (i.e., neither field was blank).

The number of studies remaining after each successive filter is as follows:
- Total number of studies: 278
- Total number of studies performed in USA only: 187
- Total number of studies that contain information on race: 116

Our diversity metric (referred to as the "success metric" in the rest of this notebook) is defined as:
- Diversity Score = (# non-white participants) / (# total participants) × 100
- Where total participants = # white participants + # non-white participants

In [None]:
# Load the raw study table; the path is relative to the notebook's working directory.
df = pd.read_csv("all_studies.csv")
# Display the column names and the (rows, columns) shape as a quick sanity check.
df.columns, df.shape

In [None]:
# Number of studies per distinct value of 'Area Offered'.
df['Area Offered'].value_counts()

In [None]:
# Number of studies per area, as a two-column frame (Countries, Counts).
# rename_axis + reset_index(name=...) names both columns explicitly, so the
# result is the same before and after pandas 2.0 changed how value_counts()
# labels its output (a rename keyed on "index" silently stops matching there).
df_studies_per_area = (
    df['Area Offered']
    .value_counts()
    .rename_axis("Countries")
    .reset_index(name="Counts")
)

px.bar(
    df_studies_per_area,
    x="Countries",
    y="Counts"
).update_layout(
      xaxis=dict(title="place where the study happened"),
      yaxis=dict(title="how many studies per place"),
)


In [None]:
# Filter for USA studies only
df_usa = df[df['Area Offered'] == "United States"]

# Filter for presence of race information: keep rows where both counts are reported.
# .copy() makes df_final an independent frame, so columns added to it later are
# not chained assignments onto a slice of df_usa.
df_final = df_usa[df_usa["# White"].notna() & df_usa["# Non White"].notna()].copy()

print(f"Num. studies in USA: {df_usa.shape[0]}")
print(f"Num. studies in USA AND contains race information: {df_final.shape[0]}")

In [None]:
# NOTE(review): scratch cell that only exercises f-string formatting; it is not
# used by the analysis and looks like a leftover experiment.
a, b, c = 3, "meow", (1, 4, 8)
print(f"{a}hello{b}is{c}")

# Success metric and its distribution

This analysis defines the "success metric" of a study as the percentage of non-white participants in the given study. This metric helps us quantify diversity and compare studies objectively.

**Success Metric** = (# non-white participants) / (# total participants) × 100

Considering the above success metric, we can arrive at the following statistics:
- Avg. success percentage: 14.80%
- Median success percentage: 11.55%
- 20th percentile success percentage (low success): 4.76%
- 80th percentile success percentage (high success): 21.93%

We use the 20th and 80th percentiles to categorize studies into "Bottom20" (low diversity) and "Top20" (high diversity) groups for comparative analysis.

In [None]:
# Success metric: percentage of non-white participants among participants with a
# reported race (white + non-white).
# .assign() returns a new frame instead of writing a column into what may be a
# filtered slice of another frame, which avoids chained-assignment problems.
df_final = df_final.assign(
    success_metric=df_final["# Non White"] / (df_final["# White"] + df_final["# Non White"]) * 100.0
)

print(f"Avg. success percentage: {df_final.success_metric.mean()} %")
print(f"Median success percentage: {np.quantile(df_final.success_metric, 0.5)} %")
print(f"20th percentile success percentage (low success): {np.quantile(df_final.success_metric, 0.2)} %")
print(f"80th percentile success percentage (high success): {np.quantile(df_final.success_metric, 0.8)} %")

In [None]:
# Cumulative distribution function of the success metric
hist, bins = np.histogram(df_final["success_metric"], bins=100)
cdf = np.cumsum(hist)
cdf = cdf/cdf[-1]

# `bins` holds 101 edges for 100 bins. cdf[i] counts every study up to and
# including bin i, i.e. the fraction of studies with a metric <= the RIGHT edge
# of that bin, so the curve is plotted against bins[1:]. Using the left edges
# (bins[:-1]) shifts the whole CDF one bin width to the left.
px.line(x=bins[1:], y=cdf).update_xaxes(title="Success Metric: %age Non-White particpants").update_yaxes(title="Fraction of studies").update_layout(width = 800)

In [None]:
# 20th / 80th percentile cut-offs of the success metric.
success_metric_20th_perc = np.quantile(df_final["success_metric"], 0.2)
success_metric_80th_perc = np.quantile(df_final["success_metric"], 0.8)

# Studies at or beyond each cut-off (both comparisons are inclusive).
df_bottom_20 = df_final.loc[df_final["success_metric"].le(success_metric_20th_perc)]
df_top_20 = df_final.loc[df_final["success_metric"].ge(success_metric_80th_perc)]

In [None]:
# Display the studies at or above the 80th-percentile success metric.
df_top_20

In [None]:
# Display the studies at or below the 20th-percentile success metric.
df_bottom_20

In [None]:
# Persist both groups next to the notebook for inspection outside of it.
# to_csv is called with its defaults, so the frame index is written as the first column.
df_top_20.to_csv("top_20_studies.csv")
df_bottom_20.to_csv("bottom_20_studies.csv")

# Distribution by success categories

Based on the success metric (% of non-white participants), we categorize the studies into three groups:
- **Top20**: Top 20% of the studies by success metric (most diverse)
- **Bottom20**: Bottom 20% of the studies by success metric (least diverse) 
- **Neither**: Studies in the middle 60%

This categorization allows us to compare factors that might contribute to diversity by examining the differences between highly diverse and less diverse studies. The key factors we analyze include:

1. **Eligibility Criteria**: Restrictions on participant eligibility, including:
   - Age restrictions beyond standard 18+ requirement
   - Cancer stage or tumor size restrictions
   - Cancer site restrictions
   - Histological type restrictions (e.g., SCC only)
   - Performance score requirements
   - Comorbidity restrictions
   - Treatment history restrictions
   - Laboratory value requirements
   - Pregnancy/contraception requirements
   - Other restrictions (smoking status, ethnicity, etc.)

2. **Study Characteristics**:
   - Single vs. multi-institution studies
   - Number of participants 
   - Geographic location
   - Male/female ratio
   - Trial type (Primary/Palliative/Recurrent/Metastatic)
   - Modality (Drug/Radiation/Biological/Combination)

In [None]:
# Cut-offs used to label each study: the 20th and 80th percentiles of the success metric.
bottom_20_success_metric_threshold = np.quantile(df_final.success_metric, q=0.2)
top_20_success_metric_threshold = np.quantile(df_final.success_metric, q=0.8)

def get_category_label(x, top_threshold=None, bottom_threshold=None):
  """Return the success category ("Top20", "Bottom20" or "Neither") for one metric value.

  Args:
    x: Success metric of a single study.
    top_threshold: Values >= this are labelled "Top20". When None, the
      notebook-level `top_20_success_metric_threshold` is used.
    bottom_threshold: Values <= this are labelled "Bottom20". When None, the
      notebook-level `bottom_20_success_metric_threshold` is used.

  Returns:
    The category label as a string. The "Top20" test runs first, so it wins if
    the two thresholds coincide. A NaN value fails both comparisons and is
    labelled "Neither".
  """
  # Fall back to the notebook globals so existing one-argument calls keep working.
  if top_threshold is None:
    top_threshold = top_20_success_metric_threshold
  if bottom_threshold is None:
    bottom_threshold = bottom_20_success_metric_threshold

  if x >= top_threshold:
    return "Top20"
  if x <= bottom_threshold:
    return "Bottom20"
  return "Neither"

# Label every study with its success category (the function is applied element-wise).
df_final["success_category"] = df_final["success_metric"].apply(get_category_label)

In [None]:
categories = ["Top20", "Bottom20", "Neither"]

def compare_field_by_category(df, field, height=900, width=1200):
  """Show one bar chart per success category counting studies by `field` value.

  Args:
    df: Frame with a "success_category" column and the column named by `field`.
    field: Name of the column whose value counts are plotted.
    height: Figure height passed to `update_layout`.
    width: Figure width passed to `update_layout`.

  Returns:
    None. The figure is rendered with `.show()`.
  """
  fig = make_subplots(rows=len(categories), subplot_titles=categories, vertical_spacing=0.1, shared_xaxes=True)
  for i, category in enumerate(categories):
    # rename_axis + reset_index(name=...) names both columns explicitly, so the
    # frame has the same columns before and after pandas 2.0 changed how
    # value_counts() labels its output. A rename keyed on "index" / `field`
    # leaves no `field` column on pandas >= 2.0.
    df_category = (
        df.loc[df["success_category"] == category, field]
        .value_counts()
        .rename_axis(field)
        .reset_index(name="Num. Studies")
    )
    fig.add_trace(
        go.Bar(
            x=df_category[field],
            y=df_category["Num. Studies"],
            name=category
        ),
        row=i+1,  # subplot rows are 1-based
        col=1
    )

  fig.update_layout(height=height, width=width).show()


In [None]:
# Column to compare across the success categories. Exactly one assignment is
# active; the commented-out lines are alternative columns to inspect instead.
# NOTE: "Trial Type " ends with a trailing space, matching how that column is
# spelled in the column list used elsewhere in this notebook.
#field = "Modalities"
field = "Trial Type "
#field = "Cancer Site"
#field = "Trial Phase"
#field = "Tumor Type"

# Render the per-category bar charts for the selected column.
compare_field_by_category(df_final, field)

In [None]:
# Display all rows of the analysis frame restricted to the columns of interest.
# Several column names carry trailing spaces or irregular spacing; they are
# spelled exactly as they are referenced throughout this notebook.
df_final.loc[:, [
    'Study Title - Link to Page here', 'Study ID ',
    'Study Start Date', 'APC Date',
    'Cancer Site', 'Trial Type ', 'Trial Phase', 'Tumor Type', 'Modalities', 'Trial Status',
    'Total Included',
    'Median Age', 'Mean Age', 'Min Age', 'Max Age',
    '# Female', '# Male',
    '# White', '#Hispanic (ethnicity)', '# Non White', '# Asian', '#American Indian',
    '#Native Hawaiian or Pacifi Islande', '#Black ', '#Not Reported/Other',
    'notes', 'contact library?',
    'success_metric',
]]

In [None]:
# USA studies where both the white and the non-white participant counts are reported.
df_race_reported = df_usa[df_usa["# White"].notna() & df_usa["# Non White"].notna()]

In [None]:
# Studies with reported race counts whose non-white participant count is exactly zero.
df_race_reported[df_race_reported["# Non White"] == 0]

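The dataset can contain studies whose race counts are missing or zero. The next cell flags a study as having valid participants only when both race counts are reported and the white-participant count is non-zero.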

In [None]:
# A study has valid participants when both race counts are present and the white count is non-zero.
df["has_valid_participants"] = (
    df["# White"].notna()
    & df["# Non White"].notna()
    & df["# White"].ne(0)
)
df_filtered = df.loc[df["has_valid_participants"]]

In [None]:

# Number of studies per distinct value of 'Area Offered' (same expression as an earlier cell).
df['Area Offered'].value_counts()

In [None]:
# Same counts as a DataFrame, to inspect the column names reset_index() produces.
df['Area Offered'].value_counts().reset_index()


In [None]:
# NOTE(review): stray expression that only displays a dict; nothing in the
# notebook uses it, so it looks like leftover scratch work.
dict(title="Area the trial was offered", cat='Meow is sweet')

In [None]:
# Bar chart of the number of trials per area offered.
# The counts frame is built here with explicit column names, so the x / y
# references below always exist and do not depend on how another cell named
# the columns of df_studies_per_area.
px.bar(
    df['Area Offered'].value_counts().rename_axis("area_offered").reset_index(name="num_studies"),
    x="area_offered",
    y="num_studies"
).update_layout(
    xaxis=dict(title="Area the trial was offered"),
    yaxis=dict(title="Num. trials"),
)

