
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "55c59464-ef8f-4fb9-a0a6-ebb68475e2a9",
   "metadata": {},
   "source": [
    "# Using CLMBR to generate features and training models on those features\n",
    "\n",
    "We can use a trained CLMBR model to generate features and then use those features in a logistic regression model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fe93d59d-f135-46f6-b0a7-2d75d9b18e6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "import os\n",
    "\n",
    "TARGET_DIR = 'trash/tutorial_5'\n",
    "\n",
    "if os.path.exists(TARGET_DIR):\n",
    "    shutil.rmtree(TARGET_DIR)\n",
    "\n",
    "os.mkdir(TARGET_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "25d741a7-46a2-4760-a369-3efb01afd804",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/esteinberg/miniconda3/envs/debug_document_femr/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Map (num_proc=4): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 1403.57 examples/s]\n",
      "Some weights of the model checkpoint at input/clmbr_model were not used when initializing FEMRModel: ['task_model.final_layer.bias', 'task_model.final_layer.weight']\n",
      "- This IS expected if you are initializing FEMRModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing FEMRModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Map (num_proc=4): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 893.12 examples/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating batches 5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 5 examples [00:00, 19.30 examples/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "patient_ids (200,)\n",
      "feature_times (200,)\n",
      "features (200, 64)\n"
     ]
    }
   ],
   "source": [
    "import femr.models.transformer\n",
    "import pyarrow.csv\n",
    "import datasets\n",
    "\n",
    "# First, we compute our features\n",
    "\n",
    "# Load some labels\n",
    "labels = pyarrow.csv.read_csv('input/labels.csv').to_pylist()\n",
    "\n",
    "# Load our data\n",
    "dataset = datasets.Dataset.from_parquet(\"input/meds/data/*\")\n",
    "\n",
    "features = femr.models.transformer.compute_features(dataset,'input/clmbr_model', labels, num_proc=4, tokens_per_batch=128)\n",
    "\n",
    "# We have our features\n",
    "for k, v in features.items():\n",
    "    print(k, v.shape)"
   ]
  },
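  {
   "cell_type": "markdown",
   "id": "3f9a2c1b",
   "metadata": {},
   "source": [
    "Optionally, since computing features can be slow on larger datasets, you can cache the result. The next cell is a small sketch that does this with `pickle` from the standard library; the output path is just an example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d41e0aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "\n",
    "# Sketch: cache the computed features so they can be reloaded later without\n",
    "# re-running the CLMBR forward pass. The path below is just an example.\n",
    "features_path = os.path.join(TARGET_DIR, 'features.pkl')\n",
    "\n",
    "with open(features_path, 'wb') as f:\n",
    "    pickle.dump(features, f)\n",
    "\n",
    "with open(features_path, 'rb') as f:\n",
    "    reloaded_features = pickle.load(f)\n",
    "\n",
    "print(reloaded_features['features'].shape)"
   ]
  },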
  {
   "cell_type": "markdown",
   "id": "4c5c75a9",
   "metadata": {},
   "source": [
    "# Joining features and labels\n",
    "\n",
    "Given a feature set, it's important to be able to join a set of labels to those features.\n",
    "\n",
    "This can be done with femr.featurizers.join_labels"
   ]
  },
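  {
   "cell_type": "markdown",
   "id": "2b7de913",
   "metadata": {},
   "source": [
    "Conceptually, the join matches each label to the feature row computed for the same patient at the matching time. The next cell is a tiny toy sketch of that idea with made-up data and field names; `femr.featurizers.join_labels` performs this alignment over the real numpy arrays for you."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8f1a5d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy sketch of joining labels to features (illustration only, made-up data).\n",
    "# Each feature row is keyed by patient ID and the time it was computed for;\n",
    "# each label carries a patient ID, a time, and a boolean value.\n",
    "toy_features = {\n",
    "    (1, '2020-01-01'): [0.1, 0.2],\n",
    "    (2, '2020-03-05'): [0.3, 0.4],\n",
    "}\n",
    "toy_labels = [\n",
    "    {'patient_id': 1, 'time': '2020-01-01', 'value': True},\n",
    "    {'patient_id': 2, 'time': '2020-03-05', 'value': False},\n",
    "]\n",
    "\n",
    "joined = [\n",
    "    (toy_features[(label['patient_id'], label['time'])], label['value'])\n",
    "    for label in toy_labels\n",
    "]\n",
    "print(joined)"
   ]
  },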
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9ad882f7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "boolean_values (200,)\n",
      "patient_ids (200,)\n",
      "times (200,)\n",
      "features (200, 64)\n"
     ]
    }
   ],
   "source": [
    "import femr.featurizers\n",
    "\n",
    "features_and_labels = femr.featurizers.join_labels(features, labels)\n",
    "\n",
    "for k, v in features_and_labels.items():\n",
    "    print(k, v.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7192ccc8",
   "metadata": {},
   "source": [
    "# Data Splitting\n",
    "\n",
    "Your data split should be contained in a CSV with two columns: `patient_id` and `split_name`, where `patient_id` is the ID of the `meds.Patient` and `split_name` is `train` or `test`. When using a pretrained CLMBR model, we have to be very careful to use the splits used for the original model"
   ]
  },
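  {
   "cell_type": "markdown",
   "id": "5e9c3b77",
   "metadata": {},
   "source": [
    "For reference, the next cell is a small sketch of what such a split CSV looks like and how you could write one yourself; the patient IDs and path are made up for illustration. In this tutorial we instead load the split that was saved alongside the pretrained CLMBR model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a07b64f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import os\n",
    "\n",
    "# Sketch only: write a split CSV with the two expected columns.\n",
    "# The patient IDs below are made up for illustration.\n",
    "example_split_path = os.path.join(TARGET_DIR, 'example_split.csv')\n",
    "\n",
    "with open(example_split_path, 'w', newline='') as f:\n",
    "    writer = csv.writer(f)\n",
    "    writer.writerow(['patient_id', 'split_name'])\n",
    "    writer.writerow([100, 'train'])\n",
    "    writer.writerow([101, 'train'])\n",
    "    writer.writerow([102, 'test'])"
   ]
  },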
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b5c49417",
   "metadata": {},
   "outputs": [],
   "source": [
    "import femr.splits\n",
    "import numpy as np\n",
    "\n",
    "# We split into a global training and test set\n",
    "split = femr.splits.PatientSplit.load_from_csv('input/clmbr_model/main_split.csv')\n",
    "\n",
    "train_mask = np.isin(features_and_labels['patient_ids'], split.train_patient_ids)\n",
    "test_mask = np.isin(features_and_labels['patient_ids'], split.test_patient_ids)\n",
    "\n",
    "percent_train = .70\n",
    "X_train, y_train = (\n",
    "    features_and_labels['features'][train_mask],\n",
    "    features_and_labels['boolean_values'][train_mask],\n",
    ")\n",
    "X_test, y_test = (\n",
    "    features_and_labels['features'][test_mask],\n",
    "    features_and_labels['boolean_values'][test_mask],\n",
    ")"
   ]
  },
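  {
   "cell_type": "markdown",
   "id": "9d2c8e40",
   "metadata": {},
   "source": [
    "As an optional sanity check, you can confirm that the train and test masks do not overlap and look at the class balance in each split before fitting a model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6b20c55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check: the train and test masks should not overlap, and both\n",
    "# splits should contain a reasonable mix of positive and negative labels.\n",
    "assert not np.any(train_mask & test_mask), \"A patient appears in both splits\"\n",
    "\n",
    "print(\"Train size:\", len(y_train), \"positive rate:\", y_train.mean())\n",
    "print(\"Test size:\", len(y_test), \"positive rate:\", y_test.mean())"
   ]
  },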
  {
   "cell_type": "markdown",
   "id": "8deca785",
   "metadata": {},
   "source": [
    "# Building Models\n",
    "\n",
    "The generated features can then be used to build your standard models. In this case we construct both logistic regression and XGBoost models and evaluate them.\n",
    "\n",
    "Performance is perfect since our task (predicting gender) is 100% determined by the features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bad5ad4f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "---- Logistic Regression ----\n",
      "Train:\n",
      "\tAUROC: 1.0\n",
      "\tAPS: 1.0\n",
      "\tAccuracy: 1.0\n",
      "\tF1 Score: 1.0\n",
      "Test:\n",
      "\tAUROC: 1.0\n",
      "\tAPS: 1.0\n",
      "\tAccuracy: 1.0\n",
      "\tF1 Score: 1.0\n"
     ]
    }
   ],
   "source": [
    "import sklearn.linear_model\n",
    "import sklearn.metrics\n",
    "import sklearn.preprocessing\n",
    "\n",
    "def run_analysis(title: str, y_train, y_train_proba, y_test, y_test_proba):\n",
    "    print(f\"---- {title} ----\")\n",
    "    print(\"Train:\")\n",
    "    print_metrics(y_train, y_train_proba)\n",
    "    print(\"Test:\")\n",
    "    print_metrics(y_test, y_test_proba)\n",
    "\n",
    "def print_metrics(y_true, y_proba):\n",
    "    y_pred = y_proba > 0.5\n",
    "    auroc = sklearn.metrics.roc_auc_score(y_true, y_proba)\n",
    "    aps = sklearn.metrics.average_precision_score(y_true, y_proba)\n",
    "    accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)\n",
    "    f1 = sklearn.metrics.f1_score(y_true, y_pred)\n",
    "    print(\"\\tAUROC:\", auroc)\n",
    "    print(\"\\tAPS:\", aps)\n",
    "    print(\"\\tAccuracy:\", accuracy)\n",
    "    print(\"\\tF1 Score:\", f1)\n",
    "\n",
    "\n",
    "model = sklearn.linear_model.LogisticRegressionCV(penalty=\"l2\", solver=\"liblinear\").fit(X_train, y_train)\n",
    "y_train_proba = model.predict_proba(X_train)[::, 1]\n",
    "y_test_proba = model.predict_proba(X_test)[::, 1]\n",
    "run_analysis(\"Logistic Regression\", y_train, y_train_proba, y_test, y_test_proba)"
   ]
  }
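,
  {
   "cell_type": "markdown",
   "id": "4fd07a2e",
   "metadata": {},
   "source": [
    "As an additional sketch (assuming the `xgboost` package is installed), the same features can be fed to an XGBoost classifier and evaluated with the `run_analysis` helper defined above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c3a91b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "\n",
    "# Sketch: train an XGBoost classifier on the CLMBR features with default\n",
    "# hyperparameters, then reuse the run_analysis helper defined above.\n",
    "xgb_model = xgb.XGBClassifier()\n",
    "\n",
    "# Cast the boolean labels to 0/1 integers, which XGBoost expects.\n",
    "xgb_model.fit(X_train, y_train.astype(int))\n",
    "\n",
    "y_train_proba_xgb = xgb_model.predict_proba(X_train)[:, 1]\n",
    "y_test_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]\n",
    "run_analysis(\"XGBoost\", y_train, y_train_proba_xgb, y_test, y_test_proba_xgb)"
   ]
  }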
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}