556 lines (555 with data), 75.7 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "f8302cf8",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import os\n",
"import json \n",
"\n",
"import spacy\n",
"from spacy import displacy\n",
"\n",
"import numpy as np\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd80913f",
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"\n",
"# Request the two NLTK resources used by this notebook, in a fixed order.\n",
"for resource_name in ('punkt', 'stopwords'):\n",
"    nltk.download(resource_name)\n",
"\n",
"from nltk.corpus import stopwords\n",
"\n",
"# English stop-word list as provided by the NLTK corpus reader.\n",
"STOP_WORDS = stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "6851f1a6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Error loading punkt: <urlopen error [SSL:\n",
"[nltk_data] CERTIFICATE_VERIFY_FAILED] certificate verify failed:\n",
"[nltk_data] unable to get local issuer certificate (_ssl.c:1131)>\n",
"[nltk_data] Error loading stopwords: <urlopen error [SSL:\n",
"[nltk_data] CERTIFICATE_VERIFY_FAILED] certificate verify failed:\n",
"[nltk_data] unable to get local issuer certificate (_ssl.c:1131)>\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append('../scripts')\n",
"from utils import predict, predict_multi_line_text, load_data\n",
"\n",
"sys.path.append('../')\n",
"from config import entity_to_acronyms, acronyms_to_entities"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "89395fb0",
"metadata": {},
"outputs": [],
"source": [
"# from importlib import reload\n",
"\n",
"# import utils\n",
"# reload(utils)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "55b67971",
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.utils import to_categorical\n",
"from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fab32dfc",
"metadata": {},
"outputs": [],
"source": [
"from transformers import TFBertModel\n",
"from transformers import BertTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f86da10e",
"metadata": {},
"outputs": [],
"source": [
"data_dir = \"../data/bio_data_files\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "51467642",
"metadata": {},
"outputs": [],
"source": [
"# Collect every regular file in `data_dir`.\n",
"# - The listing is sorted because os.listdir returns entries in arbitrary order, and the\n",
"#   file order decides the sample order (and therefore which samples end up in the\n",
"#   validation split taken later by `model.fit`).\n",
"# - Sub-directories (e.g. notebook checkpoint folders) are skipped so that the later\n",
"#   `open(file_path)` calls only ever receive files.\n",
"bio_files = [\n",
"    os.path.join(data_dir, file_name)\n",
"    for file_name in sorted(os.listdir(data_dir))\n",
"    if os.path.isfile(os.path.join(data_dir, file_name))\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "3f666677",
"metadata": {},
"source": [
"## Model parameters"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "5c0199db",
"metadata": {},
"outputs": [],
"source": [
"EMBEDDING_DIM = 200\n",
"MAX_LENGTH = 100"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "36c1529c",
"metadata": {},
"outputs": [],
"source": [
"# initialize tokenizer\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "f380a0d9",
"metadata": {},
"outputs": [],
"source": [
"# Parse the BIO files into parallel lists:\n",
"#   input_sequences[i] -> word-piece tokens of sentence i\n",
"#   labels[i]          -> one tag per word piece of sentence i\n",
"# Each non-blank line is expected to be '<word>\\t<label>'; a blank line ends a sentence.\n",
"input_sequences = []\n",
"labels = []\n",
"\n",
"for file_path in bio_files:\n",
"    # Explicit encoding so the result does not depend on the platform default.\n",
"    with open(file_path, 'r', encoding='utf-8') as f:\n",
"        lines = f.readlines()\n",
"\n",
"    sequence = []\n",
"    sequence_labels = []\n",
"    # The extra '' acts as a final sentence boundary, so the last sentence is kept even\n",
"    # when the file does not end with a blank line.\n",
"    for line in lines + ['']:\n",
"        line = line.strip()\n",
"        if not line:\n",
"            # Sentence boundary: store the sentence, ignoring empty ones caused by\n",
"            # consecutive blank lines.\n",
"            if sequence:\n",
"                input_sequences.append(sequence)\n",
"                labels.append(sequence_labels)\n",
"            sequence = []\n",
"            sequence_labels = []\n",
"            continue\n",
"\n",
"        fields = line.split('\\t')\n",
"        tokenized_word = tokenizer.tokenize(fields[0])\n",
"        if not tokenized_word:\n",
"            # The tokenizer produced nothing for this word; drop word and label together.\n",
"            continue\n",
"\n",
"        label = fields[1]\n",
"        sequence.extend(tokenized_word)\n",
"        # The first word piece keeps the original tag; every further piece of the same\n",
"        # word gets 'O' again, or the 'I-' form of the entity tag.\n",
"        continuation_label = label if label == 'O' else f'I-{label[2:]}'\n",
"        sequence_labels.append(label)\n",
"        sequence_labels.extend([continuation_label] * (len(tokenized_word) - 1))\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "18e1b557",
"metadata": {},
"outputs": [],
"source": [
"# Every distinct tag that occurs anywhere in the parsed sentences.\n",
"unique_labels = {tag for tag_sequence in labels for tag in tag_sequence}\n",
"\n",
"# Real tags are numbered from 1 in sorted order, which leaves index 0 free for padding.\n",
"label_to_index = {tag: idx for idx, tag in enumerate(sorted(unique_labels), start=1)}\n",
"index_to_label = dict(zip(label_to_index.values(), label_to_index.keys()))\n",
"\n",
"# Register the padding pseudo-label under index 0 in both directions.\n",
"label_to_index.update({'<PAD>': 0})\n",
"index_to_label.update({0: '<PAD>'})\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "f39385e9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"79"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(unique_labels)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "bef61aea",
"metadata": {},
"outputs": [],
"source": [
"# Turn word pieces into vocabulary ids and tags into class indices.\n",
"token_id_sequences = [tokenizer.convert_tokens_to_ids(seq) for seq in input_sequences]\n",
"label_id_sequences = [[label_to_index[label] for label in seq] for seq in labels]\n",
"\n",
"# Both arrays are brought to MAX_LENGTH with identical settings so that position i of a\n",
"# sentence always lines up with position i of its labels.\n",
"pad_options = dict(maxlen=MAX_LENGTH, padding='post')\n",
"padded_sequences = pad_sequences(token_id_sequences, **pad_options)\n",
"padded_labels = to_categorical(\n",
"    pad_sequences(label_id_sequences, **pad_options),\n",
"    num_classes=len(label_to_index))\n",
"\n",
"# Attention mask: 1 where the token id is non-zero, 0 elsewhere.\n",
"# NOTE(review): assumes id 0 is only ever produced by padding -- confirm for this vocabulary.\n",
"attention_masks = np.where(padded_sequences != 0, 1, 0)\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "de575658",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((4341, 100), (4341, 100, 80))"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"padded_sequences.shape, padded_labels.shape"
]
},
{
"cell_type": "markdown",
"id": "e4a386d2",
"metadata": {},
"source": [
"## Fine-tuning the BERT model"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "a48afd30",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']\n",
"- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n"
]
}
],
"source": [
"bert_model = TFBertModel.from_pretrained('bert-base-uncased')"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "67045d3b",
"metadata": {},
"outputs": [],
"source": [
"# Token-classification head on top of BERT: per-token dropout followed by a softmax\n",
"# over the label set.\n",
"\n",
"# Sequence length comes from MAX_LENGTH, the same constant used when padding the data.\n",
"input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype='int32')\n",
"# Deliberately NOT called `attention_masks`: that name holds the NumPy mask array that\n",
"# is passed to `model.fit`, and reusing it here would overwrite the training data.\n",
"attention_mask_input = tf.keras.Input(shape=(MAX_LENGTH,), dtype='int32')\n",
"\n",
"bert_output = bert_model(input_ids, attention_mask=attention_mask_input, return_dict=True)\n",
"embedding = tf.keras.layers.Dropout(0.3)(bert_output['last_hidden_state'])\n",
"\n",
"# One output unit per entry of `label_to_index` (all tags plus '<PAD>'), matching the\n",
"# width of the one-hot `padded_labels`.\n",
"output = tf.keras.layers.TimeDistributed(\n",
"    tf.keras.layers.Dense(len(label_to_index), activation='softmax'))(embedding)\n",
"\n",
"model = tf.keras.models.Model(inputs=[input_ids, attention_mask_input], outputs=[output])\n",
"model.compile(\n",
"    # `learning_rate` replaces the deprecated `lr` keyword.\n",
"    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),\n",
"    loss='categorical_crossentropy',\n",
"    metrics=['accuracy'])"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "521364ab",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/5\n",
"WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_4/bert/pooler/dense/kernel:0', 'tf_bert_model_4/bert/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?\n",
"WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_4/bert/pooler/dense/kernel:0', 'tf_bert_model_4/bert/pooler/dense/bias:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?\n",
"123/123 [==============================] - 552s 4s/step - loss: 1.0844 - accuracy: 0.7872 - val_loss: 0.6442 - val_accuracy: 0.8443\n",
"Epoch 2/5\n",
"123/123 [==============================] - 557s 5s/step - loss: 0.5870 - accuracy: 0.8590 - val_loss: 0.4519 - val_accuracy: 0.8811\n",
"Epoch 3/5\n",
"123/123 [==============================] - 562s 5s/step - loss: 0.4413 - accuracy: 0.8855 - val_loss: 0.3637 - val_accuracy: 0.9012\n",
"Epoch 4/5\n",
"123/123 [==============================] - 566s 5s/step - loss: 0.3634 - accuracy: 0.9030 - val_loss: 0.3158 - val_accuracy: 0.9114\n",
"Epoch 5/5\n",
"123/123 [==============================] - 545s 4s/step - loss: 0.3164 - accuracy: 0.9137 - val_loss: 0.2880 - val_accuracy: 0.9189\n"
]
}
],
"source": [
"# Inputs are given in the order the model declares them: token ids, then attention mask.\n",
"train_inputs = [padded_sequences, attention_masks]\n",
"\n",
"# Keep the returned History object so the curves can be plotted afterwards.\n",
"history = model.fit(\n",
"    x=train_inputs,\n",
"    y=padded_labels,\n",
"    batch_size=32,\n",
"    epochs=5,\n",
"    validation_split=0.1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "ec57b16d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"136/136 [==============================] - 197s 1s/step - loss: 0.2564 - accuracy: 0.9285\n",
"test loss, test acc: [0.25636425614356995, 0.9285141825675964]\n"
]
}
],
"source": [
"# NOTE: this scores the model on `padded_sequences` / `padded_labels`, i.e. the same\n",
"# arrays that were passed to `model.fit`. Despite the 'test' wording in the printed\n",
"# message, the numbers are therefore not held-out test metrics.\n",
"\n",
"# Rebuild the mask from the ids and actually use it, so this cell does not depend on\n",
"# whatever the name `attention_masks` currently refers to.\n",
"test_attention_masks = np.where(padded_sequences != 0, 1, 0)\n",
"results = model.evaluate(\n",
"    x=[padded_sequences, test_attention_masks],\n",
"    y=padded_labels,\n",
"    batch_size=32\n",
")\n",
"print('test loss, test acc:', results)\n"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "770388cb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 100)\n",
"1/1 [==============================] - 0s 99ms/step\n",
"[[79 10 10 6 45 45 36 75 6 6 45 12 51 51 51 51 51 79 11 33 72 72 72 79\n",
" 6 45 45 45 45 33 52 72 33 72 72 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0]]\n",
"first O\n",
"hospital Date\n",
"day Date\n",
"inter Biological_structure\n",
"##cos Biological_structure\n",
"##tal Biological_structure\n",
"drainage Therapeutic_procedure\n",
"tube Therapeutic_procedure\n",
"inserted Biological_structure\n",
"drainage Biological_structure\n",
"chest Biological_structure\n",
"computed Diagnostic_procedure\n",
"tom Diagnostic_procedure\n",
"##ography Diagnostic_procedure\n",
"ct Diagnostic_procedure\n",
"fig Diagnostic_procedure\n",
"##2 Diagnostic_procedure\n",
"revealed O\n",
"irregular Detailed_description\n",
"pl Sign_symptom\n",
"##eur Sign_symptom\n",
"##al Sign_symptom\n",
"mass Sign_symptom\n",
"invading O\n",
"left Biological_structure\n",
"chest Biological_structure\n",
"wall Biological_structure\n",
"rib Biological_structure\n",
"destruction Biological_structure\n",
"pl Sign_symptom\n",
"##eur Distance\n",
"##al Sign_symptom\n",
"e Sign_symptom\n",
"##ff Sign_symptom\n",
"##usion Sign_symptom\n"
]
}
],
"source": [
"from transformers import BertTokenizer\n",
"import numpy as np\n",
"\n",
"# input sentence\n",
"input_sentence = \"On the first hospital day, an intercostal drainage tube was inserted, and after drainage, chest computed tomography (CT) (Fig.2) revealed an irregular pleural mass invading her left chest wall with rib destruction and pleural effusion.\"\n",
"\n",
"# Split the sentence into words, then each word into BERT word pieces.\n",
"# NOTE(review): `tokenize_text` is not defined or imported in any visible cell of this\n",
"# notebook -- confirm where it comes from before relying on a fresh-kernel run.\n",
"tokens = [\n",
"    token\n",
"    for t in tokenize_text(input_sentence)\n",
"    for token in tokenizer.tokenize(t)\n",
"]\n",
"\n",
"# pad_sequences drops tokens from the FRONT when a sequence is longer than MAX_LENGTH\n",
"# (default truncating='pre'). Keep the same tail here so that tokens[i] still lines up\n",
"# with the i-th prediction when they are zipped together below.\n",
"tokens = tokens[-MAX_LENGTH:]\n",
"\n",
"padded_sequence = pad_sequences(\n",
"    [tokenizer.convert_tokens_to_ids(tokens)],\n",
"    maxlen=MAX_LENGTH, padding='post')\n",
"print(padded_sequence.shape)\n",
"\n",
"# create attention mask (1 for non-zero ids, 0 for padding)\n",
"attention_mask = np.where(padded_sequence != 0, 1, 0)\n",
"\n",
"# get predicted NER tags: one probability vector per position\n",
"predictions = model.predict([padded_sequence, attention_mask])\n",
"\n",
"# decode predictions: most probable class index per position, then its label string\n",
"predicted_ids = np.argmax(predictions, axis=-1)\n",
"print(predicted_ids)\n",
"predicted_labels = [index_to_label[i] for i in predicted_ids[0]]\n",
"\n",
"# zip stops at the shorter list, so trailing padded positions are not printed.\n",
"for token, label in zip(tokens, predicted_labels):\n",
"    if label in ('O', '<PAD>'):\n",
"        # Neither label has a 'B-'/'I-' prefix; slicing '<PAD>' with [2:] would look up\n",
"        # the meaningless key 'AD>', so both are printed as they are.\n",
"        print(f'{token} {label}')\n",
"    else:\n",
"        # Strip the 'B-'/'I-' prefix and expand the acronym to the full entity name.\n",
"        print(f'{token} {acronyms_to_entities[label[2:]]}')\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "c744592c",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"def plot_graphs(history):\n",
"    \"\"\"Show training and validation curves side by side.\n",
"\n",
"    The left panel plots 'accuracy' and 'val_accuracy', the right panel plots\n",
"    'loss' and 'val_loss'; all four series are read from ``history.history``.\n",
"\n",
"    Args:\n",
"        history: object whose ``history`` attribute maps those four keys to\n",
"            sequences of values (here: the result of ``model.fit``).\n",
"    \"\"\"\n",
"    fig, axs = plt.subplots(1, 2, figsize=(10, 5))\n",
"\n",
"    # (metric key, panel title, y-axis label) for each of the two panels.\n",
"    panels = (\n",
"        ('accuracy', 'Model Accuracy', 'Accuracy'),\n",
"        ('loss', 'Model Loss', 'Loss'),\n",
"    )\n",
"    for ax, (metric, title, y_label) in zip(axs, panels):\n",
"        ax.plot(history.history[metric])\n",
"        ax.plot(history.history[f'val_{metric}'])\n",
"        ax.set_title(title)\n",
"        ax.set_ylabel(y_label)\n",
"        ax.legend(['train', 'val'], loc='best')\n",
"\n",
"    plt.tight_layout()\n",
"    plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "4e14092b",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Plot the training history\n",
"plot_graphs(history)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}