375 lines (374 with data), 11.7 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import Dataset, DatasetDict\n",
"import torch\n",
"from random import randrange, sample\n",
"from transformers import DataCollatorForSeq2Seq, T5ForConditionalGeneration\n",
"import pandas as pd\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType, PeftModel, PeftConfig\n",
"from transformers import DataCollatorForSeq2Seq\n",
"from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"from sklearn.metrics import classification_report, roc_auc_score, precision_recall_fscore_support\n",
"import json\n",
"import argparse\n",
"import tqdm\n",
"import numpy as np\n",
"import random\n",
"import os\n",
"from skllm.config import SKLLMConfig\n",
"from skllm import MultiLabelZeroShotGPTClassifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Locations of the training CSV and the annotated test CSV.\n",
"TRAIN_CSV = './synthetic_data/Iteration_1.csv'\n",
"TEST_CSV = './synthetic_data/Partial_Iteration_2_annotated.csv'\n",
"\n",
"train_data = pd.read_csv(TRAIN_CSV)\n",
"test_data = pd.read_csv(TEST_CSV)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the test rows whose 'label' value is present (not NaN).\n",
"has_label = test_data['label'].notna()\n",
"test_data = test_data[has_label]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove the legacy categories (`CAREGIVER` and `EDUCATION`) from the test labels and drop rows left without any label."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"LEGACY_LABELS = ['CAREGIVER', 'EDUCATION']\n",
"\n",
"\n",
"def _drop_legacy(parts):\n",
"    \"\"\"Strip every label in `parts`, discard legacy ones, and re-join the rest with commas.\"\"\"\n",
"    kept = [part.strip() for part in parts if part.strip() not in LEGACY_LABELS]\n",
"    return ','.join(kept)\n",
"\n",
"\n",
"# Split each comma-separated label string, filter it, and store it back as a string.\n",
"test_data['label'] = test_data['label'].str.split(',').apply(_drop_legacy)\n",
"\n",
"# Rows that only carried legacy categories now have an empty label: remove them.\n",
"test_data = test_data[test_data['label'] != '']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the evaluation Dataset with the 'text' / 'SDOHlabels' columns read by the T5 helpers.\n",
"test_text, test_labels = test_data['text'].tolist(), test_data['label'].tolist()\n",
"testdf = pd.DataFrame(dict(text=test_text, SDOHlabels=test_labels))\n",
"test_dataset = Dataset.from_pandas(testdf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same column layout as the test split: raw text plus its comma-separated label string.\n",
"train_columns = {\n",
"    'text': train_data['text'].tolist(),\n",
"    'SDOHlabels': train_data['label'].tolist(),\n",
"}\n",
"train_text, train_labels = train_columns['text'], train_columns['SDOHlabels']\n",
"traindf = pd.DataFrame(train_columns)\n",
"train_dataset = Dataset.from_pandas(traindf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Label names kept when scoring predictions and offered to the zero-shot classifier.\n",
"BROAD_LABELS = {\n",
"    'EMPLOYMENT',\n",
"    'HOUSING',\n",
"    'PARENT',\n",
"    'RELATIONSHIP',\n",
"    'SUPPORT',\n",
"    'TRANSPORTATION',\n",
"}"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fine-Tuned T5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Placeholder: point this at the directory holding the fine-tuned T5 adapter and tokenizer.\n",
"model_path = \"path/to/finetuned/t5\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Maximum token lengths for the source text and for the target label string.\n",
"MAX_S_LEN, MAX_T_LEN = 100, 40\n",
"\n",
"# The tokenizer is loaded from the same path as the fine-tuned model.\n",
"TOKENIZER = AutoTokenizer.from_pretrained(model_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The adapter config names the base checkpoint to load.\n",
"config = PeftConfig.from_pretrained(model_path)\n",
"\n",
"# Load the base seq2seq model in 8-bit with everything mapped to device 0.\n",
"base_model = AutoModelForSeq2SeqLM.from_pretrained(\n",
"    config.base_model_name_or_path,\n",
"    load_in_8bit=True,\n",
"    device_map={\"\": 0},\n",
")\n",
"\n",
"# Wrap the base model with the LoRA weights stored at model_path, then switch to eval mode.\n",
"reloaded_model = PeftModel.from_pretrained(base_model, model_path, device_map={\"\": 0})\n",
"reloaded_model.eval()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Support functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def postprocess_function(preds):\n",
"    \"\"\"\n",
"    Expand truncated label names in model predictions to their full form.\n",
"\n",
"    Each prediction is a comma-separated string of labels. Every label is\n",
"    stripped of surrounding whitespace and, when it is a known truncation\n",
"    (for example 'EMPLO'), replaced by the full label name. Labels that are\n",
"    not known truncations are kept unchanged. A missing prediction (None or\n",
"    a float NaN) becomes '<NO_SDOH>'.\n",
"\n",
"    Args:\n",
"        preds (list): Predictions, one comma-separated label string per example.\n",
"\n",
"    Returns:\n",
"        list: One comma-joined string of fixed labels per prediction, in input order.\n",
"\n",
"    Examples:\n",
"        >>> postprocess_function(['REL', 'EMPLO', 'HOUS', 'UNKNOWN'])\n",
"        ['RELATIONSHIP', 'EMPLOYMENT', 'HOUSING', 'UNKNOWN']\n",
"\n",
"        >>> postprocess_function(['NO_SD', float('nan'), 'HOUS, SUPP'])\n",
"        ['<NO_SDOH>', '<NO_SDOH>', 'HOUSING,SUPPORT']\n",
"    \"\"\"\n",
"    no_sdoh = '<NO_SDOH>'\n",
"    lab_fixed_dict = {\n",
"        'REL': 'RELATIONSHIP',\n",
"        'RELAT': 'RELATIONSHIP',\n",
"        'EMP': 'EMPLOYMENT',\n",
"        'EMPLO': 'EMPLOYMENT',\n",
"        'SUPP': 'SUPPORT',\n",
"        'HOUS': 'HOUSING',\n",
"        'PAREN': 'PARENT',\n",
"        'TRANSPORT': 'TRANSPORTATION',\n",
"        'NO_SD': no_sdoh,\n",
"        'NO_SDOH>': no_sdoh,\n",
"        '<NO_SDOH': no_sdoh,\n",
"    }\n",
"\n",
"    new_preds = []\n",
"    for pred in preds:\n",
"        # Missing values must be caught before str(): str(nan) is 'nan', which\n",
"        # no dictionary key matches. NaN is the only float unequal to itself.\n",
"        if pred is None or (isinstance(pred, float) and pred != pred):\n",
"            new_preds.append(no_sdoh)\n",
"            continue\n",
"\n",
"        # Strip each piece so that 'HOUS, SUPP' is handled like 'HOUS,SUPP'.\n",
"        pieces = [piece.strip() for piece in str(pred).split(',')]\n",
"        new_preds.append(','.join(lab_fixed_dict.get(piece, piece) for piece in pieces))\n",
"\n",
"    return new_preds\n",
"\n",
"def preprocess_function(sample, padding=\"max_length\"):\n",
"    \"\"\"\n",
"    Tokenize a batch of texts and their label strings for seq2seq training.\n",
"\n",
"    Args:\n",
"        sample: Batch with a \"text\" column (inputs) and an \"SDOHlabels\" column (targets).\n",
"        padding: Padding strategy forwarded to the tokenizer for inputs and targets.\n",
"\n",
"    Returns:\n",
"        The tokenized inputs, with the target token ids stored under \"labels\".\n",
"    \"\"\"\n",
"    # Every input gets the \"summarize: \" task prefix before tokenization.\n",
"    prefixed_texts = [\"summarize: \" + text for text in sample[\"text\"]]\n",
"    model_inputs = TOKENIZER(prefixed_texts, max_length=MAX_S_LEN, padding=padding, truncation=True)\n",
"\n",
"    # Targets go through the tokenizer's `text_target` path.\n",
"    targets = TOKENIZER(text_target=sample[\"SDOHlabels\"], max_length=MAX_T_LEN, padding=padding, truncation=True)\n",
"\n",
"    # With fixed-length padding, pad positions are set to -100 so the loss ignores them.\n",
"    if padding == \"max_length\":\n",
"        pad_id = TOKENIZER.pad_token_id\n",
"        targets[\"input_ids\"] = [\n",
"            [-100 if token_id == pad_id else token_id for token_id in sequence]\n",
"            for sequence in targets[\"input_ids\"]\n",
"        ]\n",
"\n",
"    model_inputs[\"labels\"] = targets[\"input_ids\"]\n",
"    return model_inputs\n",
"\n",
"def normal_eval(preds, gold):\n",
"    \"\"\"\n",
"    Score comma-separated multi-label predictions against gold label strings.\n",
"\n",
"    Predicted labels that are not in BROAD_LABELS are discarded before scoring.\n",
"    The binarizer is fitted on the gold labels only. A per-class classification\n",
"    report is printed as a side effect.\n",
"\n",
"    Args:\n",
"        preds (list): Predicted label strings, labels separated by ','.\n",
"        gold (list): Gold label strings, labels separated by ','.\n",
"\n",
"    Returns:\n",
"        dict: 'macro_f1', 'micro_f1' and 'weighted_f1', plus 'precision_<label>',\n",
"        'recall_<label>' and 'f1_<label>' for every class known to the binarizer.\n",
"    \"\"\"\n",
"    # Only predictions inside the broad label set take part in the evaluation.\n",
"    pred_list = [\n",
"        [lab for lab in pred.split(\",\") if lab in BROAD_LABELS]\n",
"        for pred in preds\n",
"    ]\n",
"    gold_list = [entry.split(',') for entry in gold]\n",
"\n",
"    binarizer = MultiLabelBinarizer()\n",
"    oh_gold = binarizer.fit_transform(gold_list)\n",
"    oh_pred = binarizer.transform(pred_list)\n",
"\n",
"    def averaged_f1(mode):\n",
"        # Index 2 of the returned tuple is the F1 score.\n",
"        return precision_recall_fscore_support(oh_gold, oh_pred, average=mode)[2]\n",
"\n",
"    class_prec, class_rec, class_f1, _ = precision_recall_fscore_support(oh_gold, oh_pred)\n",
"    micro_f1 = averaged_f1('micro')\n",
"    weight_f1 = averaged_f1('weighted')\n",
"    macro_f1 = averaged_f1('macro')\n",
"\n",
"    metrics_out = {'macro_f1': macro_f1, 'micro_f1': micro_f1, 'weighted_f1': weight_f1}\n",
"    for lab, p_val, r_val, f_val in zip(binarizer.classes_, class_prec, class_rec, class_f1):\n",
"        metrics_out['precision_' + str(lab)] = p_val\n",
"        metrics_out['recall_' + str(lab)] = r_val\n",
"        metrics_out['f1_' + str(lab)] = f_val\n",
"\n",
"    print(classification_report(oh_gold, oh_pred, target_names=binarizer.classes_))\n",
"    return metrics_out\n",
"\n",
"def predict(dataset, model, batch_size, max_new_tokens=5, num_beams=4):\n",
"    \"\"\"\n",
"    Generate a label string for every example in `dataset` using beam search.\n",
"\n",
"    Args:\n",
"        dataset: Sliceable dataset whose slices expose \"text\" and \"SDOHlabels\" columns.\n",
"        model: Model exposing `parameters()` and `generate()`.\n",
"        batch_size (int): Number of examples sent to `generate` at once.\n",
"        max_new_tokens (int): Cap on generated tokens per example. The default of 5\n",
"            can cut long label names short; see postprocess_function.\n",
"        num_beams (int): Beam width used for decoding.\n",
"\n",
"    Returns:\n",
"        tuple: (predictions, references), two lists aligned with the dataset order.\n",
"            predictions holds the decoded strings, references the \"SDOHlabels\" values.\n",
"    \"\"\"\n",
"    # Follow the device the model actually lives on instead of assuming CUDA.\n",
"    device = next(model.parameters()).device\n",
"\n",
"    predictions, references = [], []\n",
"    for start in tqdm.tqdm(range(0, len(dataset), batch_size)):\n",
"        batch = dataset[start:start + batch_size]\n",
"\n",
"        # Pad only to the longest text in the batch and hand the attention mask to\n",
"        # generate() so that pad positions are explicitly masked.\n",
"        encoded = TOKENIZER(batch[\"text\"], return_tensors=\"pt\", truncation=True, padding=True).to(device)\n",
"\n",
"        # Deterministic beam search; sampling options such as top_p are not passed\n",
"        # because they have no effect when do_sample is False.\n",
"        outputs = model.generate(\n",
"            input_ids=encoded[\"input_ids\"],\n",
"            attention_mask=encoded[\"attention_mask\"],\n",
"            do_sample=False,\n",
"            max_new_tokens=max_new_tokens,\n",
"            num_beams=num_beams,\n",
"        )\n",
"        decoded = TOKENIZER.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)\n",
"\n",
"        predictions.extend(decoded)\n",
"        references.extend(batch[\"SDOHlabels\"])\n",
"    return predictions, references"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"EVAL_BATCH_SIZE = 4\n",
"\n",
"# Score the raw model output first.\n",
"predictions, references = predict(test_dataset, reloaded_model, EVAL_BATCH_SIZE)\n",
"metrics = normal_eval(predictions, references)\n",
"\n",
"# Then score the same predictions after expanding truncated label names.\n",
"print('POST PROCESSED'.center(74, '='))\n",
"processed_predictions = postprocess_function(predictions)\n",
"processed_metrics = normal_eval(processed_predictions, references)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## GPT"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Credentials come from the environment so that no secret is ever saved in the notebook.\n",
"_missing_vars = [name for name in ('OPENAI_API_KEY', 'OPENAI_ORG_ID') if not os.environ.get(name)]\n",
"if _missing_vars:\n",
"    raise RuntimeError(\n",
"        'Set these environment variables before running the GPT section: ' + ', '.join(_missing_vars)\n",
"    )\n",
"\n",
"SKLLMConfig.set_openai_key(os.environ['OPENAI_API_KEY'])\n",
"SKLLMConfig.set_openai_org(os.environ['OPENAI_ORG_ID'])"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Multi-Label"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Zero-shot setup: no training texts are given, only the candidate label set.\n",
"MAX_LABELS_PER_TEXT = 4\n",
"candidate_label_sets = [BROAD_LABELS]\n",
"\n",
"clf = MultiLabelZeroShotGPTClassifier(max_labels=MAX_LABELS_PER_TEXT)\n",
"clf.fit(None, candidate_label_sets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Classifier predictions for every test text.\n",
"labels = clf.predict(test_data['text'])\n",
"\n",
"# Gold labels as one list of label names per test row.\n",
"y = [label_string.split(',') for label_string in test_data['label']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fit the binarizer on the gold label lists, then encode gold and predictions with it.\n",
"mlb2 = MultiLabelBinarizer().fit(y)\n",
"y, labels = mlb2.transform(y), mlb2.transform(labels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Per-class precision / recall / F1 for the GPT zero-shot predictions.\n",
"gpt_report = classification_report(y, labels, target_names=mlb2.classes_)\n",
"print(gpt_report)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "models2",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}