--- a
+++ b/src/analysis.py
@@ -0,0 +1,314 @@
+"""
+Analysis Module for Diversity in Head and Neck Cancer Clinical Trials
+
+This module contains functions for analyzing the factors that contribute to
+diversity in head and neck cancer clinical trials, with a focus on eligibility
+criteria and other study characteristics.
+"""
+
+import pandas as pd
+import numpy as np
+from scipy import stats
+
+def compare_eligibility_scores(df_top, df_bottom):
+    """
+    Compare eligibility scores between top and bottom diverse studies.
+
+    The eligibility score is the sum of binary flags for all eligibility criteria,
+    with higher scores indicating more restrictive criteria. Missing scores are
+    dropped up front so the summary statistics and the test share observations.
+
+    Args:
+        df_top (pandas.DataFrame): Dataset of top diverse studies
+        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
+
+    Returns:
+        dict: 'top_stats' / 'bottom_stats' (mean, median, population std, min,
+            max) plus 'u_statistic', 'p_value' and 'significant' (p < 0.05). A
+            group without scores gets None stats, and the test values are None.
+    """
+    def describe(scores):
+        # An empty group has no meaningful summary: report None for each stat.
+        if scores.empty:
+            return dict.fromkeys(('mean', 'median', 'std', 'min', 'max'))
+        return {
+            'mean': scores.mean(),
+            'median': scores.median(),
+            'std': scores.std(ddof=0),  # population std, same as np.std's default
+            'min': scores.min(),
+            'max': scores.max(),
+        }
+
+    top = df_top['eligibility_score'].dropna()
+    bottom = df_bottom['eligibility_score'].dropna()
+
+    u_stat = p_value = significant = None
+    if not top.empty and not bottom.empty:
+        # Mann-Whitney U is rank-based, so no normality assumption is needed.
+        u_stat, p_value = stats.mannwhitneyu(top, bottom, alternative='two-sided')
+        significant = bool(p_value < 0.05)
+
+    return {
+        'top_stats': describe(top),
+        'bottom_stats': describe(bottom),
+        'u_statistic': u_stat,
+        'p_value': p_value,
+        'significant': significant,
+    }
+
+def compare_eligibility_criteria(df_top, df_bottom):
+    """
+    Compare the prevalence of specific eligibility criteria between top and bottom diverse studies.
+
+    This function examines the following eligibility criteria:
+    - age_restrict: 0 if the restriction is age>18, 1 for other restrictions (e.g., 18<age<75)
+    - stage_size: Restrictions on the cancer stage and the size of the tumor
+    - site: Restrictions on the cancer site
+    - histological_type: Whether the study was limited to SCC or any other type
+    - performance_score: Restrictions on performance score (e.g., ECOG performance)
+    - comorbidities: Restrictions on comorbidities
+    - hx_of_tt: Restrictions on treatment history for cancer
+    - lab_values: Restrictions on lab test values
+    - pregnancy_or_contraception: Restrictions on pregnancy or particular contraceptives
+    - misc: Other restrictions (smoking status, ethnicity requirements)
+
+    Args:
+        df_top (pandas.DataFrame): Dataset of top diverse studies
+        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
+
+    Returns:
+        dict: Maps each 'eligibility_<criterion>' column name to its prevalence (in
+            percent) per group, the difference, and a two-sided Fisher's exact test
+    """
+    criteria_fields = [
+        'eligibility_age_restrict',
+        'eligibility_stage_size',
+        'eligibility_site',
+        'eligibility_histological_type',
+        'eligibility_performance_score',
+        'eligibility_comorbidities',
+        'eligibility_hx_of_tt',
+        'eligibility_lab_values',
+        'eligibility_pregnancy_or_contraception',
+        'eligibility_misc',
+    ]
+
+    def flag_counts(flags):
+        # [studies with the criterion, studies without]; missing flags are skipped.
+        flags = flags.dropna()
+        flagged = int((flags != 0).sum())
+        return [flagged, len(flags) - flagged]
+
+    results = {}
+    for criterion in criteria_fields:
+        top_flags, bottom_flags = df_top[criterion], df_bottom[criterion]
+        top_prevalence = top_flags.mean() * 100
+        bottom_prevalence = bottom_flags.mean() * 100
+
+        # Build the 2x2 table explicitly: pd.crosstab yields a single row when a
+        # criterion is constant across studies, and fisher_exact needs a 2x2 table.
+        _, p_value = stats.fisher_exact([flag_counts(top_flags), flag_counts(bottom_flags)])
+
+        results[criterion] = {
+            'top_prevalence': top_prevalence,
+            'bottom_prevalence': bottom_prevalence,
+            'difference': top_prevalence - bottom_prevalence,
+            'p_value': p_value,
+            'significant': bool(p_value < 0.05),
+        }
+
+    return results
+
+def compare_study_characteristics(df_top, df_bottom):
+    """
+    Compare general study characteristics between top and bottom diverse studies.
+
+    The characteristics examined, when the column exists in both datasets, are:
+    1. Number of sites and single vs. multi-institutional study
+    2. Number of participants (total, male, female)
+    3. Modalities, trial type, trial phase, tumor type and cancer site
+    4. Study start date and 'apc_date'
+
+    Columns that are numeric (and not boolean) in both datasets are compared with
+    a two-sided Mann-Whitney U test on their non-missing values; all others are
+    treated as categorical and compared with a chi-square test of independence.
+
+    Args:
+        df_top (pandas.DataFrame): Dataset of top diverse studies
+        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
+
+    Returns:
+        dict: Maps each compared characteristic to a dict holding group means and
+            their difference (numeric) or per-group category proportions
+            (categorical), then 'test', 'statistic', 'p_value' and 'significant'
+            (p < 0.05). If the test raises ValueError, 'test' is 'Not performed'
+            and the last three values are None.
+    """
+    characteristics = [
+        'num_sites',
+        'is_single_institution',
+        'num_participants',
+        'num_male_participants',
+        'num_female_participants',
+        'modalities',
+        'trial_type',
+        'trial_phase',
+        'tumor_type',
+        'cancer_site',
+        'study_start_date',
+        'apc_date',
+    ]
+
+    def is_measurement(column):
+        # Any numeric dtype qualifies, not only int64/float64; booleans stay categorical.
+        dtypes = pd.api.types
+        return dtypes.is_numeric_dtype(column) and not dtypes.is_bool_dtype(column)
+
+    results = {}
+    for char in characteristics:
+        if char not in df_top.columns or char not in df_bottom.columns:
+            continue
+        top_col, bottom_col = df_top[char], df_bottom[char]
+        numeric = is_measurement(top_col) and is_measurement(bottom_col)
+
+        if numeric:
+            test_name = 'Mann-Whitney U'
+            summary = {
+                'top_mean': top_col.mean(),
+                'bottom_mean': bottom_col.mean(),
+                'difference': top_col.mean() - bottom_col.mean(),
+            }
+        else:
+            test_name = 'Chi-square'
+            summary = {
+                'top_distribution': top_col.value_counts(normalize=True).to_dict(),
+                'bottom_distribution': bottom_col.value_counts(normalize=True).to_dict(),
+            }
+
+        try:
+            if numeric:
+                samples = [col.dropna().astype(float) for col in (top_col, bottom_col)]
+                statistic, p_value = stats.mannwhitneyu(*samples, alternative='two-sided')
+            else:
+                contingency_table = pd.crosstab(
+                    pd.Series(top_col.tolist() + bottom_col.tolist()),
+                    pd.Series(['top'] * len(top_col) + ['bottom'] * len(bottom_col)),
+                )
+                statistic, p_value, _, _ = stats.chi2_contingency(contingency_table)
+            significant = bool(p_value < 0.05)
+        except (ValueError, np.linalg.LinAlgError):
+            # The test is undefined for this data (e.g. a zero expected frequency);
+            # keep the descriptive summary and mark the test as skipped.
+            test_name, statistic, p_value, significant = 'Not performed', None, None, None
+
+        results[char] = {
+            **summary,
+            'test': test_name,
+            'statistic': statistic,
+            'p_value': p_value,
+            'significant': significant,
+        }
+
+    return results
+
+def analyze_geographic_distribution(df_top, df_bottom):
+    """
+    Summarise where the top and bottom diverse studies were conducted.
+
+    Counts the studies per value of the 'location' column in each dataset and
+    reports which locations the two groups share and which appear in only one
+    of them. The result is purely descriptive: no statistical test is run.
+
+    Args:
+        df_top (pandas.DataFrame): Dataset of top diverse studies
+        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
+
+    Returns:
+        dict: Per-group location counts, lists of all / common / group-only
+            locations (in no particular order) and the number of distinct locations
+            per group; or a single 'error' entry when either dataset lacks the column
+    """
+    location_field = 'location'
+
+    # Guard clause: without the column in both datasets there is nothing to compare.
+    if not (location_field in df_top.columns and location_field in df_bottom.columns):
+        return {'error': f"Location field '{location_field}' not found in datasets"}
+
+    # Number of studies recorded at each location.
+    top_locations = df_top[location_field].value_counts().to_dict()
+    bottom_locations = df_bottom[location_field].value_counts().to_dict()
+
+    # Set algebra on the location names; the lists below come from these sets.
+    top_names = set(top_locations)
+    bottom_names = set(bottom_locations)
+
+    return {
+        'top_locations': top_locations,
+        'bottom_locations': bottom_locations,
+        'all_locations': list(top_names | bottom_names),
+        'common_locations': list(top_names & bottom_names),
+        'top_only_locations': list(top_names - bottom_names),
+        'bottom_only_locations': list(bottom_names - top_names),
+        'num_top_locations': len(top_locations),
+        'num_bottom_locations': len(bottom_locations),
+    }
+
+def analyze_institutional_setting(df_top, df_bottom):
+    """
+    Analyze the impact of institutional setting (single vs. multi-institution) on diversity.
+
+    Uses a two-sided Fisher's exact test on the 'is_single_institution' column.
+
+    Args:
+        df_top (pandas.DataFrame): Dataset of top diverse studies
+        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
+
+    Returns:
+        dict: Percent of single-institution studies per group, their difference,
+            'p_value' and 'significant' (p < 0.05); or a single 'error' entry
+            when either dataset lacks the column
+    """
+    institution_field = 'is_single_institution'
+    if institution_field not in df_top.columns or institution_field not in df_bottom.columns:
+        return {'error': f"Institution field '{institution_field}' not found in datasets"}
+
+    def counts(flags):
+        # [single-institution studies, multi-institution studies]; missing flags skipped.
+        flags = flags.dropna()
+        single = int((flags != 0).sum())
+        return [single, len(flags) - single]
+
+    top_flags, bottom_flags = df_top[institution_field], df_bottom[institution_field]
+    top_single_inst_perc = top_flags.mean() * 100
+    bottom_single_inst_perc = bottom_flags.mean() * 100
+
+    # Build the 2x2 table explicitly: pd.crosstab yields a single row when every
+    # study has the same setting, and fisher_exact needs a 2x2 table.
+    _, p_value = stats.fisher_exact([counts(top_flags), counts(bottom_flags)])
+
+    return {
+        'top_single_institution_percent': top_single_inst_perc,
+        'bottom_single_institution_percent': bottom_single_inst_perc,
+        'difference': top_single_inst_perc - bottom_single_inst_perc,
+        'p_value': p_value,
+        'significant': bool(p_value < 0.05),
+    }
+
+def analyze_all_factors(df_top, df_bottom):
+    """
+    Run the five factor analyses on the two groups and bundle their results.
+
+    Args:
+        df_top (pandas.DataFrame): Dataset of top diverse studies
+        df_bottom (pandas.DataFrame): Dataset of bottom diverse studies
+
+    Returns:
+        dict: The result of each analysis, keyed by the analysis name
+    """
+    return dict(
+        eligibility_scores=compare_eligibility_scores(df_top, df_bottom),
+        eligibility_criteria=compare_eligibility_criteria(df_top, df_bottom),
+        study_characteristics=compare_study_characteristics(df_top, df_bottom),
+        geographic_distribution=analyze_geographic_distribution(df_top, df_bottom),
+        institutional_setting=analyze_institutional_setting(df_top, df_bottom),
+    )
\ No newline at end of file