--- a +++ b/tutorials/5_CLMBR Featurization And Modeling.ipynb @@ -0,0 +1,257 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "55c59464-ef8f-4fb9-a0a6-ebb68475e2a9", + "metadata": {}, + "source": [ + "# Using CLMBR to generate features and train models on those features\n", + "\n", + "We can use a trained CLMBR model to generate features and then use those features in a logistic regression model." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fe93d59d-f135-46f6-b0a7-2d75d9b18e6f", + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "import os\n", + "\n", + "TARGET_DIR = 'trash/tutorial_5'\n", + "\n", + "# Start from a clean output directory\n", + "if os.path.exists(TARGET_DIR):\n", + " shutil.rmtree(TARGET_DIR)\n", + "\n", + "os.makedirs(TARGET_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "25d741a7-46a2-4760-a369-3efb01afd804", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/esteinberg/miniconda3/envs/debug_document_femr/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Map (num_proc=4): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 1403.57 examples/s]\n", + "Some weights of the model checkpoint at input/clmbr_model were not used when initializing FEMRModel: ['task_model.final_layer.bias', 'task_model.final_layer.weight']\n", + "- This IS expected if you are initializing FEMRModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing FEMRModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Map (num_proc=4): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 893.12 examples/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating batches 5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating train split: 5 examples [00:00, 19.30 examples/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "patient_ids (200,)\n", + "feature_times (200,)\n", + "features (200, 64)\n" + ] + } + ], + "source": [ + "import femr.models.transformer\n", + "import pyarrow.csv\n", + "import datasets\n", + "\n", + "# First, we compute our features\n", + "\n", + "# Load some labels\n", + "labels = pyarrow.csv.read_csv('input/labels.csv').to_pylist()\n", + "\n", + "# Load our data\n", + "dataset = datasets.Dataset.from_parquet(\"input/meds/data/*\")\n", + "\n", + "features = femr.models.transformer.compute_features(dataset, 'input/clmbr_model', labels, num_proc=4, tokens_per_batch=128)\n", + "\n", + "# We have our features\n", + "for k, v in features.items():\n", + " print(k, v.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "4c5c75a9", + "metadata": {}, + "source": [ + "# Joining features and labels\n", + "\n", + "Given a feature set, it's important to be able to join a set of labels to those features.\n", + "\n", + "This can be done with `femr.featurizers.join_labels`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9ad882f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "boolean_values (200,)\n", + 
"patient_ids (200,)\n", + "times (200,)\n", + "features (200, 64)\n" + ] + } + ], + "source": [ + "import femr.featurizers\n", + "\n", + "features_and_labels = femr.featurizers.join_labels(features, labels)\n", + "\n", + "for k, v in features_and_labels.items():\n", + " print(k, v.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "7192ccc8", + "metadata": {}, + "source": [ + "# Data Splitting\n", + "\n", + "Your data split should be contained in a CSV with two columns: `patient_id` and `split_name`, where `patient_id` is the ID of the `meds.Patient` and `split_name` is `train` or `test`. When using a pretrained CLMBR model, we have to be very careful to reuse the splits from the original model; otherwise patients seen during pretraining could leak into the test set." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b5c49417", + "metadata": {}, + "outputs": [], + "source": [ + "import femr.splits\n", + "import numpy as np\n", + "\n", + "# Load the global train/test split that was used to train the CLMBR model\n", + "split = femr.splits.PatientSplit.load_from_csv('input/clmbr_model/main_split.csv')\n", + "\n", + "train_mask = np.isin(features_and_labels['patient_ids'], split.train_patient_ids)\n", + "test_mask = np.isin(features_and_labels['patient_ids'], split.test_patient_ids)\n", + "\n", + "X_train, y_train = (\n", + " features_and_labels['features'][train_mask],\n", + " features_and_labels['boolean_values'][train_mask],\n", + ")\n", + "X_test, y_test = (\n", + " features_and_labels['features'][test_mask],\n", + " features_and_labels['boolean_values'][test_mask],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8deca785", + "metadata": {}, + "source": [ + "# Building Models\n", + "\n", + "The generated features can then be used to build your standard models. 
In this case we construct a logistic regression model and evaluate it.\n", + "\n", + "Performance is perfect because our task (predicting gender) is 100% determined by the features." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bad5ad4f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---- Logistic Regression ----\n", + "Train:\n", + "\tAUROC: 1.0\n", + "\tAPS: 1.0\n", + "\tAccuracy: 1.0\n", + "\tF1 Score: 1.0\n", + "Test:\n", + "\tAUROC: 1.0\n", + "\tAPS: 1.0\n", + "\tAccuracy: 1.0\n", + "\tF1 Score: 1.0\n" + ] + } + ], + "source": [ + "import sklearn.linear_model\n", + "import sklearn.metrics\n", + "\n", + "def run_analysis(title: str, y_train, y_train_proba, y_test, y_test_proba):\n", + " print(f\"---- {title} ----\")\n", + " print(\"Train:\")\n", + " print_metrics(y_train, y_train_proba)\n", + " print(\"Test:\")\n", + " print_metrics(y_test, y_test_proba)\n", + "\n", + "def print_metrics(y_true, y_proba):\n", + " y_pred = y_proba > 0.5\n", + " auroc = sklearn.metrics.roc_auc_score(y_true, y_proba)\n", + " aps = sklearn.metrics.average_precision_score(y_true, y_proba)\n", + " accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)\n", + " f1 = sklearn.metrics.f1_score(y_true, y_pred)\n", + " print(\"\\tAUROC:\", auroc)\n", + " print(\"\\tAPS:\", aps)\n", + " print(\"\\tAccuracy:\", accuracy)\n", + " print(\"\\tF1 Score:\", f1)\n", + "\n", + "\n", + "model = sklearn.linear_model.LogisticRegressionCV(penalty=\"l2\", solver=\"liblinear\").fit(X_train, y_train)\n", + "y_train_proba = model.predict_proba(X_train)[:, 1]\n", + "y_test_proba = model.predict_proba(X_test)[:, 1]\n", + "run_analysis(\"Logistic Regression\", y_train, y_train_proba, y_test, y_test_proba)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}