b/gpt_vs_ftT5.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import Dataset, DatasetDict\n",
+    "import torch\n",
+    "from random import randrange, sample\n",
+    "from transformers import DataCollatorForSeq2Seq, T5ForConditionalGeneration\n",
+    "import pandas as pd\n",
+    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
+    "from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType, PeftModel, PeftConfig\n",
+    "from transformers import DataCollatorForSeq2Seq\n",
+    "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments\n",
+    "from sklearn.preprocessing import MultiLabelBinarizer\n",
+    "from sklearn.metrics import classification_report, roc_auc_score, precision_recall_fscore_support\n",
+    "import json\n",
+    "import argparse\n",
+    "import tqdm\n",
+    "import numpy as np\n",
+    "import random\n",
+    "import os\n",
+    "from skllm.config import SKLLMConfig\n",
+    "from skllm import MultiLabelZeroShotGPTClassifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the two CSV files into train_data and test_data.\n",
+    "train_data, test_data = map(\n",
+    "    pd.read_csv,\n",
+    "    ('./synthetic_data/Iteration_1.csv',\n",
+    "     './synthetic_data/Partial_Iteration_2_annotated.csv'),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# keep only the rows whose 'label' column is populated\n",
+    "test_data = test_data[test_data['label'].notna()]"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Remove legacy categories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split each comma-separated label string, strip whitespace from every piece,\n",
+    "# drop the 'CAREGIVER' and 'EDUCATION' entries, and join what is left back\n",
+    "# into a comma-separated string.\n",
+    "test_data['label'] = (\n",
+    "    test_data['label']\n",
+    "    .str.split(',')\n",
+    "    .apply(lambda parts: ','.join(\n",
+    "        piece.strip() for piece in parts\n",
+    "        if piece.strip() not in ['CAREGIVER', 'EDUCATION']\n",
+    "    ))\n",
+    ")\n",
+    "\n",
+    "# Rows whose label string ended up empty are removed.\n",
+    "test_data = test_data[test_data['label'].ne('')]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pull the 'text' and 'label' columns out as plain lists, pair them in a new\n",
+    "# frame under the 'text' / 'SDOHlabels' column names, and wrap it as a Dataset.\n",
+    "test_text, test_labels = (test_data[col].tolist() for col in ('text', 'label'))\n",
+    "testdf = pd.DataFrame(dict(text=test_text, SDOHlabels=test_labels))\n",
+    "test_dataset = Dataset.from_pandas(testdf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Same construction as for the test split: column lists -> two-column frame\n",
+    "# ('text', 'SDOHlabels') -> Dataset.\n",
+    "train_text, train_labels = (train_data[col].tolist() for col in ('text', 'label'))\n",
+    "traindf = pd.DataFrame(dict(text=train_text, SDOHlabels=train_labels))\n",
+    "train_dataset = Dataset.from_pandas(traindf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Label names that normal_eval keeps from predictions and that are handed to\n",
+    "# the GPT classifier's fit call.\n",
+    "BROAD_LABELS = {\n",
+    "    'TRANSPORTATION',\n",
+    "    'HOUSING',\n",
+    "    'RELATIONSHIP',\n",
+    "    'PARENT',\n",
+    "    'EMPLOYMENT',\n",
+    "    'SUPPORT',\n",
+    "}"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fine-Tuned T5"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Location passed to AutoTokenizer.from_pretrained, PeftConfig.from_pretrained\n",
+    "# and PeftModel.from_pretrained in the cells below.\n",
+    "# NOTE(review): the value looks like a placeholder -- presumably it must be\n",
+    "# replaced with the real fine-tuned adapter directory before running; confirm.\n",
+    "model_path = 'path/to/finetuned/t5'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tokenizer loaded from model_path; preprocess_function and predict both use it.\n",
+    "TOKENIZER = AutoTokenizer.from_pretrained(model_path)\n",
+    "# max_length values used by preprocess_function: MAX_S_LEN for the input text,\n",
+    "# MAX_T_LEN for the target labels.\n",
+    "MAX_S_LEN, MAX_T_LEN = 100, 40"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the adapter configuration stored at model_path.\n",
+    "config = PeftConfig.from_pretrained(model_path)\n",
+    "\n",
+    "# Load the base seq2seq model named in that configuration, in 8-bit, using the\n",
+    "# device map {'': 0}.\n",
+    "reloaded_model = AutoModelForSeq2SeqLM.from_pretrained(\n",
+    "    config.base_model_name_or_path,\n",
+    "    load_in_8bit=True,\n",
+    "    device_map={'': 0},\n",
+    ")\n",
+    "\n",
+    "# Wrap the base model with the LoRA model stored at model_path, then switch it\n",
+    "# to eval mode.\n",
+    "reloaded_model = PeftModel.from_pretrained(\n",
+    "    reloaded_model, model_path, device_map={'': 0}\n",
+    ")\n",
+    "reloaded_model.eval()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Support functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def postprocess_function(preds):\n",
+    "    \"\"\"\n",
+    "    Expand truncated label strings in predictions to their full label names.\n",
+    "\n",
+    "    Each prediction is treated as a comma-separated list of labels. Every\n",
+    "    label is stripped of surrounding whitespace and, when it matches a known\n",
+    "    truncated form (e.g. 'REL', 'EMPLO', 'NO_SD'), replaced by the full label.\n",
+    "    Labels that are not recognised are kept unchanged. A missing prediction\n",
+    "    (None or a float NaN) becomes '<NO_SDOH>'.\n",
+    "\n",
+    "    Args:\n",
+    "        preds (list): Predictions, one comma-separated string per example.\n",
+    "\n",
+    "    Returns:\n",
+    "        list: One comma-separated string per input prediction, in input order.\n",
+    "\n",
+    "    Examples:\n",
+    "        >>> preds = ['REL', 'EMPLO', 'HOUS', 'UNKNOWN']\n",
+    "        >>> postprocess_function(preds)\n",
+    "        ['RELATIONSHIP', 'EMPLOYMENT', 'HOUSING', 'UNKNOWN']\n",
+    "\n",
+    "        >>> preds = ['NO_SD', np.nan, 'SUPP']\n",
+    "        >>> postprocess_function(preds)\n",
+    "        ['<NO_SDOH>', '<NO_SDOH>', 'SUPPORT']\n",
+    "\n",
+    "        >>> postprocess_function(['REL, HOUS'])\n",
+    "        ['RELATIONSHIP,HOUSING']\n",
+    "    \"\"\"\n",
+    "    lab_fixed_dict = {\n",
+    "        'REL': 'RELATIONSHIP',\n",
+    "        'RELAT': 'RELATIONSHIP',\n",
+    "        'EMP': 'EMPLOYMENT',\n",
+    "        'EMPLO': 'EMPLOYMENT',\n",
+    "        'SUPP': 'SUPPORT',\n",
+    "        'HOUS': 'HOUSING',\n",
+    "        'PAREN': 'PARENT',\n",
+    "        'TRANSPORT': 'TRANSPORTATION',\n",
+    "        'NO_SD': '<NO_SDOH>',\n",
+    "        'NO_SDOH>': '<NO_SDOH>',\n",
+    "        '<NO_SDOH': '<NO_SDOH>',\n",
+    "    }\n",
+    "\n",
+    "    new_preds = []\n",
+    "    for pred in preds:\n",
+    "        # A NaN key in the mapping can never match once the prediction has been\n",
+    "        # converted with str() (it becomes the text 'nan'), so missing values\n",
+    "        # are handled explicitly before any string processing.\n",
+    "        if pred is None or (isinstance(pred, float) and pred != pred):\n",
+    "            new_preds.append('<NO_SDOH>')\n",
+    "            continue\n",
+    "        # Strip each piece so that ' HOUS' is expanded the same way as 'HOUS'.\n",
+    "        pieces = (pp.strip() for pp in str(pred).split(','))\n",
+    "        new_preds.append(','.join(lab_fixed_dict.get(pp, pp) for pp in pieces))\n",
+    "\n",
+    "    return new_preds\n",
+    "\n",
+    "def preprocess_function(sample, padding=\"max_length\"):\n",
+    "    \"\"\"\n",
+    "    Tokenize a batch of examples into model inputs plus label ids.\n",
+    "\n",
+    "    Args:\n",
+    "        sample: Batch exposing a \"text\" sequence and an \"SDOHlabels\" sequence.\n",
+    "        padding: Padding strategy forwarded to TOKENIZER for inputs and targets.\n",
+    "\n",
+    "    Returns:\n",
+    "        The tokenized inputs with an added \"labels\" entry holding the target ids.\n",
+    "    \"\"\"\n",
+    "    # every input text gets the t5 task prefix before tokenization\n",
+    "    prefixed = [\"summarize: \" + text for text in sample[\"text\"]]\n",
+    "    model_inputs = TOKENIZER(prefixed, max_length=MAX_S_LEN, padding=padding, truncation=True)\n",
+    "\n",
+    "    # targets are tokenized through the `text_target` keyword argument\n",
+    "    target_ids = TOKENIZER(\n",
+    "        text_target=sample[\"SDOHlabels\"], max_length=MAX_T_LEN, padding=padding, truncation=True\n",
+    "    )[\"input_ids\"]\n",
+    "\n",
+    "    # With max_length padding, pad positions in the labels are replaced by -100\n",
+    "    # so that they are ignored in the loss.\n",
+    "    if padding == \"max_length\":\n",
+    "        pad_id = TOKENIZER.pad_token_id\n",
+    "        target_ids = [[-100 if tok == pad_id else tok for tok in seq] for seq in target_ids]\n",
+    "\n",
+    "    model_inputs[\"labels\"] = target_ids\n",
+    "    return model_inputs\n",
+    "\n",
+    "def normal_eval(preds, gold):\n",
+    "    \"\"\"\n",
+    "    Score comma-separated predictions against comma-separated gold labels.\n",
+    "\n",
+    "    Predicted labels that are not in BROAD_LABELS are discarded. The label\n",
+    "    binarizer is fitted on the gold labels and then applied to the predictions.\n",
+    "    A classification report is printed as a side effect.\n",
+    "\n",
+    "    Args:\n",
+    "        preds: Predictions, one comma-separated string per example.\n",
+    "        gold: Gold labels, one comma-separated string per example.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict: 'macro_f1', 'micro_f1', 'weighted_f1', followed by\n",
+    "        'precision_<label>', 'recall_<label>' and 'f1_<label>' for every class\n",
+    "        known to the binarizer.\n",
+    "    \"\"\"\n",
+    "    gold_list = [entry.split(',') for entry in gold]\n",
+    "    # keep only the predicted labels that belong to BROAD_LABELS\n",
+    "    pred_list = [\n",
+    "        [lab for lab in entry.split(\",\") if lab in BROAD_LABELS]\n",
+    "        for entry in preds\n",
+    "    ]\n",
+    "\n",
+    "    binarizer = MultiLabelBinarizer()\n",
+    "    oh_gold = binarizer.fit_transform(gold_list)\n",
+    "    oh_pred = binarizer.transform(pred_list)\n",
+    "\n",
+    "    def averaged_f1(mode):\n",
+    "        # index 2 of the returned tuple is the F1 score\n",
+    "        return precision_recall_fscore_support(oh_gold, oh_pred, average=mode)[2]\n",
+    "\n",
+    "    per_class = precision_recall_fscore_support(oh_gold, oh_pred)\n",
+    "    micro_f1 = averaged_f1('micro')\n",
+    "    weight_f1 = averaged_f1('weighted')\n",
+    "    macro_f1 = averaged_f1('macro')\n",
+    "\n",
+    "    metrics_out = {'macro_f1': macro_f1, 'micro_f1': micro_f1, 'weighted_f1': weight_f1}\n",
+    "    for lab, prec, rec, f1 in zip(binarizer.classes_, *per_class[:3]):\n",
+    "        metrics_out[f'precision_{lab}'] = prec\n",
+    "        metrics_out[f'recall_{lab}'] = rec\n",
+    "        metrics_out[f'f1_{lab}'] = f1\n",
+    "\n",
+    "    print(classification_report(oh_gold, oh_pred, target_names=binarizer.classes_))\n",
+    "    return metrics_out\n",
+    "\n",
+    "def predict(dataset, model, batch_size, max_new_tokens=5, num_beams=4):\n",
+    "    \"\"\"\n",
+    "    Generate a prediction for every example in `dataset`, batch by batch.\n",
+    "\n",
+    "    Args:\n",
+    "        dataset: Sliceable collection whose slices expose 'text' and\n",
+    "            'SDOHlabels' sequences.\n",
+    "        model: Model exposing `generate` and a `device` attribute.\n",
+    "        batch_size (int): Number of examples per generation call.\n",
+    "        max_new_tokens (int): Upper bound on generated tokens per example.\n",
+    "            Defaults to 5, the value that used to be hard-coded.\n",
+    "        num_beams (int): Beam width. Defaults to 4, the value that used to be\n",
+    "            hard-coded.\n",
+    "\n",
+    "    Returns:\n",
+    "        tuple: (predictions, references) -- the decoded generations and the\n",
+    "        matching 'SDOHlabels' entries, both in dataset order.\n",
+    "    \"\"\"\n",
+    "    predictions, references = [], []\n",
+    "    for start in tqdm.tqdm(range(0, len(dataset), batch_size)):\n",
+    "        batch = dataset[start:start + batch_size]\n",
+    "        # NOTE(review): preprocess_function prepends 'summarize: ' to each text,\n",
+    "        # while no prefix is added here -- confirm which form the model was\n",
+    "        # trained on.\n",
+    "        # padding=True pads to the longest text of the batch instead of the\n",
+    "        # tokenizer maximum; padded positions are masked out below.\n",
+    "        encoded = TOKENIZER(\n",
+    "            batch['text'], return_tensors='pt', truncation=True, padding=True\n",
+    "        ).to(model.device)\n",
+    "        # The attention mask is passed explicitly because the inputs are padded.\n",
+    "        # No top_p here: it only applies to sampling and do_sample is False.\n",
+    "        outputs = model.generate(\n",
+    "            input_ids=encoded.input_ids,\n",
+    "            attention_mask=encoded.attention_mask,\n",
+    "            do_sample=False,\n",
+    "            max_new_tokens=max_new_tokens,\n",
+    "            num_beams=num_beams,\n",
+    "        )\n",
+    "        predictions.extend(TOKENIZER.batch_decode(outputs.cpu(), skip_special_tokens=True))\n",
+    "        references.extend(batch['SDOHlabels'])\n",
+    "    return predictions, references"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Score the raw generations first, then score them again after the label\n",
+    "# strings have been post-processed.\n",
+    "predictions, references = predict(test_dataset, reloaded_model, batch_size=4)\n",
+    "metrics = normal_eval(predictions, references)\n",
+    "\n",
+    "print('=' * 30, 'POST PROCESSED', '=' * 30, sep='')\n",
+    "processed_predictions = postprocess_function(predictions)\n",
+    "processed_metrics = normal_eval(processed_predictions, references)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## GPT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Credentials are read from the environment so that they are never stored in\n",
+    "# the notebook file. Export OPENAI_API_KEY and OPENAI_ORGANIZATION before\n",
+    "# starting the kernel; a missing variable raises KeyError in this cell.\n",
+    "SKLLMConfig.set_openai_key(os.environ['OPENAI_API_KEY'])\n",
+    "SKLLMConfig.set_openai_org(os.environ['OPENAI_ORGANIZATION'])"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Multi-Label"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clf = MultiLabelZeroShotGPTClassifier(max_labels=4)\n",
+    "# BROAD_LABELS is a set, and the iteration order of a set of strings can change\n",
+    "# between interpreter runs; a sorted list keeps the candidate-label order the\n",
+    "# same on every run.\n",
+    "clf.fit(None, [sorted(BROAD_LABELS)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# classifier output for every test text\n",
+    "labels = clf.predict(test_data['text'])\n",
+    "# gold labels: each comma-separated string split into a list\n",
+    "y = [gold.split(',') for gold in test_data['label']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fit the binarizer on the gold label lists, then encode the predictions with\n",
+    "# the same fitted binarizer. Both names are rebound to the encoded arrays.\n",
+    "mlb2 = MultiLabelBinarizer()\n",
+    "y, labels = mlb2.fit_transform(y), mlb2.transform(labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print the classification report for the binarized predictions (labels)\n",
+    "# against the binarized gold labels (y), using the binarizer's class names.\n",
+    "print(classification_report(y, labels, target_names=mlb2.classes_))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "models2",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}