selection-bias-benchmark / Git / [6ac965] /src/iterpretability/datasets/constructed

Models:
MarcoTheBlack/
selection-bias-benchmark
Downloads: 1
[6ac965]: / src / iterpretability / datasets / constructed_examples / save.py
History
Download this file
215 lines (162 with data), 7.2 kB

# %%
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from scipy.special import softmax
from scipy.stats import zscore

# %% [markdown]
# ### 1. Create Data

# %%
import numpy as np

## SETTINGS ##
# toy_example = "ex1"
propensity_scale = 100
num_points_grid = 10000
num_points = 200

toy_example = "ex2_nonlinear"
# ex1: high bias, high total bias, high outcome bias
# ex2: high bias, high total bias, low Y0, Y1 bias, high Y1-Y0 bias


## 1. PATIENT DATA ##

# Calculate the number of points along each aX_gridis
points_per_axis = int(np.sqrt(num_points_grid))

k = 10
logistic = lambda x: 1 / (1 + np.exp(-k * (x - 0.5))) #logistic
nonlinearity = logistic

# Generate linearly spaced points along each axis
x0_grid = np.linspace(0, 1, points_per_axis)
x1_grid = np.linspace(0, 1, points_per_axis)

# Sample numpy array from uniform distribution
x0 = np.random.uniform(0, 1, num_points)
x1 = np.random.uniform(0, 1, num_points)

# Create a meshgrid
xx0_grid, xx1_grid = np.meshgrid(x0_grid, x1_grid)

# Flatten the meshgrid to create a 1000x1 array
X_grid = np.vstack([xx0_grid.ravel(), xx1_grid.ravel()]).T
X = np.vstack([x0.ravel(), x1.ravel()]).T
X0_grid = X_grid[:, 0]
X1_grid = X_grid[:, 1]
X0 = X[:, 0]
X1 = X[:, 1]


## 2. OUTCOMES ##
# Create Y_grid
if toy_example.startswith("ex1"):
    Y_grid = np.array([X0_grid,1-X0_grid]).T
    Y = np.array([X0,1-X0]).T

elif toy_example.startswith("ex2"):
    Y_grid = np.array([X0_grid,1-X1_grid]).T
    Y = np.array([X0,1-X1]).T

if toy_example.endswith("nonlinear"):
    Y = nonlinearity(Y)
    Y_grid = nonlinearity(Y_grid)

Y0 = Y[:, 0]
Y1 = Y[:, 1]

## 3. TREATMENT ##
if toy_example.startswith("ex1"):
    scores_grid = np.array([X0_grid,1-X0_grid]).T
    scores = np.array([X0,1-X0]).T

    scores_grid = zscore(scores_grid, axis=0)
    scores = zscore(scores, axis=0)

elif toy_example.startswith("ex2"):
    scores_grid = np.array([X0_grid,1-X1_grid]).T
    scores = np.array([X0,1-X1]).T

    scores_grid = zscore(scores_grid, axis=0)
    scores = zscore(scores, axis=0)

# Apply the softmax function to each row to get probabilities
p_grid = softmax(propensity_scale*scores_grid, axis=1)
p = softmax(propensity_scale*scores, axis=1)

# Make sure rows add up to one again
row_sums_grid = p_grid.sum(axis=1, keepdims=True)
row_sums = p.sum(axis=1, keepdims=True)

p_grid = p_grid / row_sums_grid
p = p / row_sums

propensities_grid = p_grid
propensities = p

T = np.array([np.random.choice([tre for tre in range(propensities.shape[1])], p=row) for row in propensities])

# Create contourplot with X_grid and Y_grid
import matplotlib.pyplot as plt
import numpy as np

def plot_propensities(xx0_grid, xx1_grid, propensities_grid, points_per_axis):
    # Calculate the min and max values for the color scale
    vmin = min(propensities_grid[:, 0].min(), propensities_grid[:, 1].min())
    vmax = max(propensities_grid[:, 0].max(), propensities_grid[:, 1].max())

    # Create subplots
    fig, axs = plt.subplots(1, 2, figsize=(8, 3), sharex=True, sharey=True)

    # Plot the first propensity
    contour0 = axs[0].contourf(xx0_grid, xx1_grid, propensities_grid[:, 0].reshape(points_per_axis, points_per_axis), alpha=0.4, levels=20, vmin=vmin, vmax=vmax, cmap='Spectral')
    axs[0].scatter(X[:,0], X[:,1], c=T, s=2, cmap='Spectral')
    axs[0].set_title('A0 Distribution')
    axs[0].set_aspect('equal')

    # Plot the second propensity
    contour1 = axs[1].contourf(xx0_grid, xx1_grid, propensities_grid[:, 1].reshape(points_per_axis, points_per_axis), alpha=0.4, levels=20, vmin=vmin, vmax=vmax, cmap='Spectral')
    # plt.scatter(X, c=T, alpha=0.5)
    axs[1].scatter(X[:,0], X[:,1], c=T, s=2, cmap='Spectral')

    axs[1].set_title('A1 Distribution')
    axs[1].set_aspect('equal')

    # Add a single shared colorbar
    cbar = fig.colorbar(contour1, ax=axs, orientation='vertical', fraction=0.02, pad=0.04)

    plt.show()
    
def plot_outcomes(xx0_grid, xx1_grid, X0, X1, Y_grid, T, points_per_axis):
    # Calculate the min and max values for the color scale
    vmin = min(Y_grid[:, 0].min(), Y_grid[:, 1].min(), (Y_grid[:, 1] - Y_grid[:, 0]).min())
    vmax = max(Y_grid[:, 0].max(), Y_grid[:, 1].max(), (Y_grid[:, 1] - Y_grid[:, 0]).max())

    # Plot the above three plots in a single figure next to each other, all with the same colorbar
    fig, axs = plt.subplots(1, 3, figsize=(12, 3), sharex=True, sharey=True)
    contour0 = axs[0].contourf(xx0_grid, xx1_grid, Y_grid[:, 0].reshape(points_per_axis, points_per_axis), alpha=0.5, levels=20, vmin=vmin, vmax=vmax)
    axs[0].scatter(X0, X1, c=T, s=2, cmap='Spectral')
    axs[0].set_title('Y0')
    axs[0].set_aspect('equal')

    contour1 = axs[1].contourf(xx0_grid, xx1_grid, Y_grid[:, 1].reshape(points_per_axis, points_per_axis), alpha=0.5, levels=20, vmin=vmin, vmax=vmax)
    axs[1].scatter(X0, X1, c=T, s=2, cmap='Spectral')
    axs[1].set_title('Y1')
    axs[1].set_aspect('equal')

    contour_diff = axs[2].contourf(xx0_grid, xx1_grid, (Y_grid[:, 1] - Y_grid[:, 0]).reshape(points_per_axis, points_per_axis), alpha=0.5, levels=20, vmin=vmin, vmax=vmax)
    axs[2].scatter(X0, X1, c=T, s=2, cmap='Spectral')
    axs[2].set_title('Y1-Y0')
    axs[2].set_aspect('equal')

    # Add a single shared colorbar for all three plots
    cbar = fig.colorbar(contour_diff, ax=axs, orientation='vertical', fraction=0.02, pad=0.04)

    plt.show()

def plot_outcome_dists(Y0, Y1, T):
    # Calculate Y1 - Y0
    Y_diff = Y1 - Y0

    # Create subplots
    fig, axs = plt.subplots(1, 4, figsize=(20, 5))

    # Define colors for different values of T
    # colors = ['blue', 'orange']
    # Get extreme colors of spectral colormap
    colors = plt.cm.Spectral(np.linspace(0, 1, len(np.unique(T))))

    # Plot the distribution of Y0 colored by T
    for t in np.unique(T):
        axs[0].hist(Y0[T == t], bins=30, color=colors[t], alpha=0.7, label=f'T={t}')
    axs[0].set_title('Distribution of Y0')
    axs[0].set_xlabel('Y0')
    axs[0].set_ylabel('Frequency')
    axs[0].legend()

    # Plot the distribution of Y1 colored by T
    for t in np.unique(T):
        axs[1].hist(Y1[T == t], bins=30, color=colors[t], alpha=0.7, label=f'T={t}')
    axs[1].set_title('Distribution of Y1')
    axs[1].set_xlabel('Y1')
    axs[1].set_ylabel('Frequency')
    axs[1].legend()

    # Plot the distribution of Y1 - Y0 colored by T
    for t in np.unique(T):
        axs[2].hist(Y_diff[T == t], bins=30, color=colors[t], alpha=0.7, label=f'T={t}')
    axs[2].set_title('Distribution of Y1 - Y0')
    axs[2].set_xlabel('Y1 - Y0')
    axs[2].set_ylabel('Frequency')
    axs[2].legend()

    # Plot the joint distribution of Y0 and Y1 colored by T
    for t in np.unique(T):
        axs[3].scatter(Y0[T == t], Y1[T == t], color=colors[t], s=8, label=f'T={t}', alpha=0.7)
    axs[3].set_title('Joint Distribution of Y0 and Y1')
    axs[3].set_xlabel('Y0')
    axs[3].set_ylabel('Y1')
    axs[3].legend()

    # plt.tight_layout()
    plt.show()

plot_propensities(xx0_grid, xx1_grid, propensities_grid, points_per_axis)
plot_outcomes(xx0_grid, xx1_grid, X0, X1, Y_grid, T, points_per_axis)
plot_outcome_dists(Y0, Y1, T)

# %% [markdown]
# ### 2. Plotting