[f54d94]: / tutorials / 7_MOTOR Featurization And Modeling.ipynb

Download this file

263 lines (262 with data), 8.7 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "55c59464-ef8f-4fb9-a0a6-ebb68475e2a9",
   "metadata": {},
   "source": [
    "# Using CLMBR to generate features and training models on those features\n",
    "\n",
    "We can use a trained CLMBR model to generate features and then use those features in a logistic regression model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fe93d59d-f135-46f6-b0a7-2d75d9b18e6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "import os\n",
    "\n",
    "TARGET_DIR = 'trash/tutorial_7'\n",
    "\n",
    "if os.path.exists(TARGET_DIR):\n",
    "    shutil.rmtree(TARGET_DIR)\n",
    "\n",
    "os.mkdir(TARGET_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "25d741a7-46a2-4760-a369-3efb01afd804",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/esteinberg/miniconda3/envs/debug_document_femr/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Map (num_proc=4): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 1222.23 examples/s]\n",
      "Some weights of the model checkpoint at input/motor_model were not used when initializing FEMRModel: ['task_model.final_layer.bias', 'task_model.final_layer.weight', 'task_model.task_layer.bias', 'task_model.task_layer.weight']\n",
      "- This IS expected if you are initializing FEMRModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing FEMRModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Map (num_proc=4): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 330.39 examples/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating batches 4\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 4 examples [00:00,  5.97 examples/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "patient_ids (200,)\n",
      "feature_times (200,)\n",
      "features (200, 64)\n"
     ]
    }
   ],
   "source": [
    "import femr.models.transformer\n",
    "import pyarrow.csv\n",
    "import datasets\n",
    "import pickle\n",
    "\n",
    "# First, we compute our features\n",
    "\n",
    "# Load some labels\n",
    "labels = pyarrow.csv.read_csv('input/labels.csv').to_pylist()\n",
    "\n",
    "# Load our data\n",
    "dataset = datasets.Dataset.from_parquet(\"input/meds/data/*\")\n",
    "\n",
    "# We need an ontology for MOTOR\n",
    "with open('input/meds/ontology.pkl', 'rb') as f:\n",
    "    ontology = pickle.load(f)\n",
    "\n",
    "features = femr.models.transformer.compute_features(dataset,'input/motor_model', labels, num_proc=4, tokens_per_batch=128, ontology=ontology)\n",
    "\n",
    "# We have our features\n",
    "for k, v in features.items():\n",
    "    print(k, v.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c5c75a9",
   "metadata": {},
   "source": [
    "# Joining features and labels\n",
    "\n",
    "Given a feature set, it's important to be able to join a set of labels to those features.\n",
    "\n",
    "This can be done with femr.featurizers.join_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9ad882f7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "boolean_values (200,)\n",
      "patient_ids (200,)\n",
      "times (200,)\n",
      "features (200, 64)\n"
     ]
    }
   ],
   "source": [
    "import femr.featurizers\n",
    "\n",
    "features_and_labels = femr.featurizers.join_labels(features, labels)\n",
    "\n",
    "for k, v in features_and_labels.items():\n",
    "    print(k, v.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7192ccc8",
   "metadata": {},
   "source": [
    "# Data Splitting\n",
    "\n",
    "When using a pretrained CLMBR model, we have to be very careful to use the splits used for the original model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b5c49417",
   "metadata": {},
   "outputs": [],
   "source": [
    "import femr.splits\n",
    "import numpy as np\n",
    "\n",
    "# We split into a global training and test set\n",
    "split = femr.splits.PatientSplit.load_from_csv('input/motor_model/main_split.csv')\n",
    "\n",
    "train_mask = np.isin(features_and_labels['patient_ids'], split.train_patient_ids)\n",
    "test_mask = np.isin(features_and_labels['patient_ids'], split.test_patient_ids)\n",
    "\n",
    "percent_train = .70\n",
    "X_train, y_train = (\n",
    "    features_and_labels['features'][train_mask],\n",
    "    features_and_labels['boolean_values'][train_mask],\n",
    ")\n",
    "X_test, y_test = (\n",
    "    features_and_labels['features'][test_mask],\n",
    "    features_and_labels['boolean_values'][test_mask],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8deca785",
   "metadata": {},
   "source": [
    "# Building Models\n",
    "\n",
    "The generated features can then be used to build your standard models. In this case we construct both logistic regression and XGBoost models and evaluate them.\n",
    "\n",
    "Performance is perfect since our task (predicting gender) is 100% determined by the features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bad5ad4f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "---- Logistic Regression ----\n",
      "Train:\n",
      "\tAUROC: 1.0\n",
      "\tAPS: 1.0\n",
      "\tAccuracy: 1.0\n",
      "\tF1 Score: 1.0\n",
      "Test:\n",
      "\tAUROC: 1.0\n",
      "\tAPS: 1.0\n",
      "\tAccuracy: 1.0\n",
      "\tF1 Score: 1.0\n"
     ]
    }
   ],
   "source": [
    "import sklearn.linear_model\n",
    "import sklearn.metrics\n",
    "import sklearn.preprocessing\n",
    "\n",
    "def run_analysis(title: str, y_train, y_train_proba, y_test, y_test_proba):\n",
    "    print(f\"---- {title} ----\")\n",
    "    print(\"Train:\")\n",
    "    print_metrics(y_train, y_train_proba)\n",
    "    print(\"Test:\")\n",
    "    print_metrics(y_test, y_test_proba)\n",
    "\n",
    "def print_metrics(y_true, y_proba):\n",
    "    y_pred = y_proba > 0.5\n",
    "    auroc = sklearn.metrics.roc_auc_score(y_true, y_proba)\n",
    "    aps = sklearn.metrics.average_precision_score(y_true, y_proba)\n",
    "    accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)\n",
    "    f1 = sklearn.metrics.f1_score(y_true, y_pred)\n",
    "    print(\"\\tAUROC:\", auroc)\n",
    "    print(\"\\tAPS:\", aps)\n",
    "    print(\"\\tAccuracy:\", accuracy)\n",
    "    print(\"\\tF1 Score:\", f1)\n",
    "\n",
    "\n",
    "model = sklearn.linear_model.LogisticRegressionCV(penalty=\"l2\", solver=\"liblinear\").fit(X_train, y_train)\n",
    "y_train_proba = model.predict_proba(X_train)[::, 1]\n",
    "y_test_proba = model.predict_proba(X_test)[::, 1]\n",
    "run_analysis(\"Logistic Regression\", y_train, y_train_proba, y_test, y_test_proba)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}