[c0f169]: / notebooks / 03_Data_Preprocessing.ipynb

Download this file

700 lines (699 with data), 81.3 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 177,
   "id": "75a9cab2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import random\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.utils import to_categorical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "7d42d817",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Error loading stopwords: <urlopen error [SSL:\n",
      "[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:\n",
      "[nltk_data]     unable to get local issuer certificate (_ssl.c:1131)>\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import spacy\n",
    "\n",
    "import nltk\n",
    "import string\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "# Only contact the download server when the stopword corpus is not installed yet.\n",
    "# An unconditional nltk.download() needs network access on every run and reports\n",
    "# an error when offline or when TLS verification fails.\n",
    "try:\n",
    "    nltk.data.find('corpora/stopwords')\n",
    "except LookupError:\n",
    "    nltk.download('stopwords')\n",
    "\n",
    "# English stopword list used to filter tokens during cleaning.\n",
    "STOP_WORDS = stopwords.words('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "b8516e50",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the BIO-tagged input files.\n",
    "bio_files_dir = '../data/bio_data_files'\n",
    "\n",
    "# List the directory through the variable (not a second copy of the literal) so the\n",
    "# path only has to be changed in one place, and sort the result: os.listdir() returns\n",
    "# entries in arbitrary order, which would make later steps depend on the file system.\n",
    "bio_files = sorted(\n",
    "    os.path.join(bio_files_dir, name)\n",
    "    for name in os.listdir(bio_files_dir)\n",
    "    if name.endswith('.bio')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "cebfd940",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of .bio files is 200\n"
     ]
    }
   ],
   "source": [
    "print(f\"The number of .bio files is {len(bio_files)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bfe2b598",
   "metadata": {},
   "source": [
    "## Check whether any stopword token carries a B- tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "4d9f940a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print every token that is a stopword but opens an entity (tag starts with 'B'),\n",
    "# i.e. tokens whose removal would delete the first word of an entity.\n",
    "# The token is normalised the way clean_word() normalises it (punctuation removed,\n",
    "# lower-cased) before the lookup; comparing the raw token would miss capitalised\n",
    "# stopwords such as \"The\", since the NLTK English stopword list is lower-case.\n",
    "for bio_file in bio_files:\n",
    "    with open(bio_file, \"r\", encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            # Blank lines only separate sentences; they carry no token.\n",
    "            if line.strip() == '':\n",
    "                continue\n",
    "            word, tag = line.strip().split('\\t')\n",
    "            normalized = re.sub(r'[^\\w\\s]', '', word).lower()\n",
    "            if normalized in STOP_WORDS and tag.startswith('B'):\n",
    "                print(line)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2fbe4fa1",
   "metadata": {},
   "source": [
    "## Data Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "73f0e53d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_word(word):\n",
    "    \"\"\"Normalise one token: strip punctuation, squeeze whitespace, lower-case.\n",
    "\n",
    "    Returns the normalised token, or '' when it is an entry of STOP_WORDS.\n",
    "\n",
    "    NOTE: a later cell defines clean_word again (adding lemmatisation); that later\n",
    "    definition replaces this one once it has been executed.\n",
    "    \"\"\"\n",
    "    without_punct = re.sub(r'[^\\w\\s]','',word)\n",
    "    squeezed = re.sub(r'\\s+',' ',without_punct)\n",
    "    normalized = squeezed.lower()\n",
    "\n",
    "    # A stopword is signalled to the caller with an empty string.\n",
    "    return '' if normalized in STOP_WORDS else normalized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "09853edd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the small English spaCy pipeline; clean_word() uses it for lemmatisation.\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "\n",
    "def clean_word(word):\n",
    "    \"\"\"Normalise a single token and drop it if it is a stopword.\n",
    "\n",
    "    Steps: strip punctuation, collapse whitespace, lower-case, lemmatise with\n",
    "    spaCy, then filter against STOP_WORDS.\n",
    "\n",
    "    Args:\n",
    "    - word (str): the raw token text\n",
    "\n",
    "    Returns:\n",
    "    - str: the lemma of the cleaned token, or '' when nothing is left after\n",
    "      cleaning or when the lemma is a stopword\n",
    "    \"\"\"\n",
    "    # remove non-alphanumeric characters and extra whitespaces\n",
    "    word = re.sub(r'[^\\w\\s]','',word)\n",
    "    word = re.sub(r'\\s+',' ',word).strip()\n",
    "\n",
    "    # A punctuation-only token (e.g. ',' or '.') is empty at this point. spaCy\n",
    "    # returns a zero-length Doc for '', so indexing [0] below would raise IndexError.\n",
    "    if not word:\n",
    "        return ''\n",
    "\n",
    "    # convert to lowercase\n",
    "    word = word.lower()\n",
    "\n",
    "    # lemmatize the word (only the first spaCy token is kept)\n",
    "    lemma = nlp(word)[0].lemma_\n",
    "\n",
    "    # check if the lemma is a stop word\n",
    "    if lemma not in STOP_WORDS:\n",
    "        return lemma\n",
    "\n",
    "    return ''\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "141f77d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_data_from_file(bio_file):\n",
    "    \"\"\"\n",
    "    Reads a file in BIO format (one token per line, with tab-separated word and tag,\n",
    "    sentences separated by blank lines) and extracts the sentences and labels as\n",
    "    lists of lists.\n",
    "\n",
    "    Every word is passed through clean_word(); tokens that come back empty are\n",
    "    dropped together with their tag, and a sentence with no remaining tokens is\n",
    "    omitted. A 'B-' tag whose entity type equals the type of the previously kept\n",
    "    tag in the same sentence is rewritten to 'I-'.\n",
    "\n",
    "    Args:\n",
    "    - bio_file (str): the path to the BIO file to read\n",
    "\n",
    "    Returns:\n",
    "    - A tuple containing:\n",
    "        - sentences (List[List[str]]): one inner list of cleaned words per sentence\n",
    "        - labels (List[List[str]]): one inner list of BIO tags per sentence, aligned\n",
    "          index-for-index with the corresponding inner list in 'sentences'\n",
    "\n",
    "    Raises:\n",
    "    - ValueError: if a non-blank line does not consist of exactly two\n",
    "      tab-separated fields\n",
    "    \"\"\"\n",
    "    sentences = []\n",
    "    labels = []\n",
    "\n",
    "    current_sentences = []\n",
    "    current_labels = []\n",
    "\n",
    "    with open(bio_file, \"r\", encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            line = line.strip()\n",
    "\n",
    "            if not line:\n",
    "                # A blank line marks the end of a sentence. Store what has been\n",
    "                # collected (if anything) and start a new sentence.\n",
    "                if current_sentences:\n",
    "                    sentences.append(current_sentences)\n",
    "                    labels.append(current_labels)\n",
    "                    current_sentences = []\n",
    "                    current_labels = []\n",
    "                # Always skip the blank line itself, even when nothing was collected\n",
    "                # (consecutive blank lines, or a sentence whose tokens were all\n",
    "                # dropped); otherwise the split below would fail on an empty string.\n",
    "                continue\n",
    "\n",
    "            word, tag = line.split('\\t')\n",
    "            word = clean_word(word)\n",
    "\n",
    "            if word.strip():\n",
    "                current_sentences.append(word)\n",
    "\n",
    "                # Merge a 'B-' tag into the preceding entity when the previously\n",
    "                # kept tag has the same entity type.\n",
    "                if current_labels and tag[:2] == \"B-\" and tag[2:] == current_labels[-1][2:]:\n",
    "                    tag = f\"I-{tag[2:]}\"\n",
    "                current_labels.append(tag)\n",
    "\n",
    "    # The file may not end with a blank line; keep the final sentence in that case.\n",
    "    if current_sentences:\n",
    "        sentences.append(current_sentences)\n",
    "        labels.append(current_labels)\n",
    "\n",
    "    return sentences, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "fd844f67",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_bio_files(bio_files):\n",
    "    \"\"\"Parse every BIO file and pool the results into two flat lists.\n",
    "\n",
    "    Args:\n",
    "    - bio_files (Iterable[str]): paths handed one by one to parse_data_from_file()\n",
    "\n",
    "    Returns:\n",
    "    - (sentences, labels): the per-file sentence lists and label lists concatenated\n",
    "      in the order the files were given\n",
    "\n",
    "    Prints a progress line after every 20th file.\n",
    "    \"\"\"\n",
    "    all_sentences, all_labels = [], []\n",
    "\n",
    "    for file_number, path in enumerate(bio_files, start=1):\n",
    "        file_sentences, file_labels = parse_data_from_file(path)\n",
    "\n",
    "        # Files that yielded no sentence contribute nothing.\n",
    "        if file_sentences:\n",
    "            all_sentences += file_sentences\n",
    "            all_labels += file_labels\n",
    "\n",
    "        if file_number % 20 == 0:\n",
    "            print(f'{file_number} completed')\n",
    "\n",
    "    return all_sentences, all_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "45fd1f2c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20 completed\n",
      "40 completed\n",
      "60 completed\n",
      "80 completed\n",
      "100 completed\n",
      "120 completed\n",
      "140 completed\n",
      "160 completed\n",
      "180 completed\n",
      "200 completed\n"
     ]
    }
   ],
   "source": [
    "sentences, labels = parse_bio_files(bio_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "id": "b53eabb9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset contains 4341 examples\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(f\"Dataset contains {len(sentences)} examples\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "46342054",
   "metadata": {},
   "source": [
    "## Shuffle the sentences and labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "id": "56e01b35",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle sentences and labels together so each sentence keeps its own tags.\n",
    "# A dedicated, seeded generator makes the shuffle (and therefore the split that is\n",
    "# sliced from it) identical on every run for the same input order.\n",
    "SHUFFLE_SEED = 42\n",
    "\n",
    "combined = list(zip(sentences, labels))\n",
    "random.Random(SHUFFLE_SEED).shuffle(combined)\n",
    "\n",
    "# zip(*[]) yields nothing to unpack, so only rebuild the lists when there is data.\n",
    "if combined:\n",
    "    # Slice assignment keeps 'sentences' and 'labels' as the same list objects.\n",
    "    sentences[:], labels[:] = zip(*combined)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "722ff4a8",
   "metadata": {},
   "source": [
    "## Train Test Validation Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "30bb6251",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the data into training, validation, and test sets\n",
    "\n",
    "TEST_SIZE = 0.2   # fraction of sentences reserved for testing\n",
    "VALID_SIZE = 0.1  # fraction of sentences reserved for validation\n",
    "\n",
    "num_sentences = len(sentences)\n",
    "# Training gets the fraction left over after the two hold-out sets; int() rounds\n",
    "# both counts down.\n",
    "num_train = int(num_sentences * (1 - TEST_SIZE - VALID_SIZE))\n",
    "num_valid = int(num_sentences * VALID_SIZE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "id": "0c1d5432",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cut the data into three consecutive, non-overlapping ranges:\n",
    "# [0, num_train) -> train, [num_train, valid_end) -> validation, the rest -> test.\n",
    "valid_end = num_train + num_valid\n",
    "\n",
    "train_sentences, train_labels = sentences[:num_train], labels[:num_train]\n",
    "valid_sentences, valid_labels = sentences[num_train:valid_end], labels[num_train:valid_end]\n",
    "test_sentences, test_labels = sentences[valid_end:], labels[valid_end:]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d413c70",
   "metadata": {},
   "source": [
    "## Tokenization - Sequences and padding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "id": "de6ea2c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect every distinct tag that occurs anywhere in the label lists.\n",
    "unique_labels = {tag for sentence_tags in labels for tag in sentence_tags}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "id": "9e0dbaea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number the tags from 1 in sorted order (index 0 is not assigned here), then build\n",
    "# the reverse lookup from index back to tag.\n",
    "label_to_index = {label: index for index, label in enumerate(sorted(unique_labels), start=1)}\n",
    "index_to_label = {index: label for label, index in label_to_index.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "id": "9ba093e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Register the padding label under index 0, which the tag numbering above leaves unused\n",
    "label_to_index['<PAD>'] = 0\n",
    "index_to_label[0] = '<PAD>'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "id": "c4f7383a",
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_CLASSES = len(index_to_label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "id": "91f9e4df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every sentence is padded or cut to this many tokens.\n",
    "MAX_LENGTH = 100\n",
    "\n",
    "\n",
    "def encode_labels(label_sequences):\n",
    "    \"\"\"Convert per-sentence tag lists into a one-hot label array.\n",
    "\n",
    "    Args:\n",
    "    - label_sequences (List[List[str]]): one list of tags per sentence\n",
    "\n",
    "    Returns:\n",
    "    - array of shape (len(label_sequences), MAX_LENGTH, NUM_CLASSES)\n",
    "    \"\"\"\n",
    "    indexed = [[label_to_index[label] for label in sequence] for sequence in label_sequences]\n",
    "    padded = pad_sequences(\n",
    "        indexed,\n",
    "        maxlen=MAX_LENGTH,\n",
    "        padding='post',\n",
    "        # Cut over-long sentences at the end, exactly like the token sequences;\n",
    "        # the default ('pre') would keep the LAST tags while the tokens keep the\n",
    "        # FIRST words, leaving tokens and tags misaligned.\n",
    "        truncating='post',\n",
    "        # Fill with the dedicated padding label rather than NUM_CLASSES-1, which\n",
    "        # is the index of a real tag.\n",
    "        value=label_to_index['<PAD>'],\n",
    "    )\n",
    "    return to_categorical(padded, num_classes=NUM_CLASSES)\n",
    "\n",
    "\n",
    "# NOTE: these names are rebound from tag lists to arrays, so this cell must run\n",
    "# exactly once after the split cell.\n",
    "train_labels = encode_labels(train_labels)\n",
    "valid_labels = encode_labels(valid_labels)\n",
    "test_labels = encode_labels(test_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "id": "a5fc9b7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the input sentences to sequences of word indices.\n",
    "# The vocabulary is learned from the training split only. An explicit OOV token is\n",
    "# needed: without one, texts_to_sequences() silently drops words it has not seen,\n",
    "# which shortens validation/test sequences and shifts the following tokens out of\n",
    "# alignment with their labels.\n",
    "tokenizer = Tokenizer(oov_token='<OOV>')\n",
    "tokenizer.fit_on_texts(train_sentences)\n",
    "\n",
    "\n",
    "train_sequences = tokenizer.texts_to_sequences(train_sentences)\n",
    "val_sequences = tokenizer.texts_to_sequences(valid_sentences)\n",
    "test_sequences = tokenizer.texts_to_sequences(test_sentences)\n",
    "\n",
    "\n",
    "# Pad the sequences to a fixed length (short ones are filled at the end, long ones\n",
    "# are cut at the end)\n",
    "train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')\n",
    "val_sequences_padded = pad_sequences(val_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')\n",
    "test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cfcdb497",
   "metadata": {},
   "source": [
    "### Save to a .npz file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "id": "27d0e8b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather everything under the key it is stored with in the archive.\n",
    "# NOTE: the two mapping entries are dicts, not arrays; numpy stores such objects\n",
    "# pickled, so reading them back presumably needs np.load(..., allow_pickle=True)\n",
    "# followed by .item() - verify against the loading notebook.\n",
    "dataset_arrays = {\n",
    "    'train_sequences_padded': train_sequences_padded,\n",
    "    'train_labels': train_labels,\n",
    "    'val_sequences_padded': val_sequences_padded,\n",
    "    'val_labels': valid_labels,\n",
    "    'test_sequences_padded': test_sequences_padded,\n",
    "    'test_labels': test_labels,\n",
    "    'label_to_index': label_to_index,\n",
    "    'index_to_label': index_to_label,\n",
    "}\n",
    "np.savez('../data/data.npz', **dataset_arrays)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55c25a8f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "314f9f5d",
   "metadata": {},
   "source": [
    "## Building Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "id": "886f4882",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3038, 100, 79)"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_labels.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "id": "1cd0ea36",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/20\n",
      "95/95 [==============================] - 8s 63ms/step - loss: 0.8827 - accuracy: 0.8964 - val_loss: 0.3796 - val_accuracy: 0.9129\n",
      "Epoch 2/20\n",
      "95/95 [==============================] - 6s 64ms/step - loss: 0.3871 - accuracy: 0.9065 - val_loss: 0.3668 - val_accuracy: 0.9130\n",
      "Epoch 3/20\n",
      "95/95 [==============================] - 6s 62ms/step - loss: 0.3634 - accuracy: 0.9086 - val_loss: 0.3585 - val_accuracy: 0.9149\n",
      "Epoch 4/20\n",
      "95/95 [==============================] - 6s 63ms/step - loss: 0.3383 - accuracy: 0.9138 - val_loss: 0.3494 - val_accuracy: 0.9180\n",
      "Epoch 5/20\n",
      "95/95 [==============================] - 6s 65ms/step - loss: 0.3026 - accuracy: 0.9217 - val_loss: 0.3306 - val_accuracy: 0.9253\n",
      "Epoch 6/20\n",
      "95/95 [==============================] - 6s 65ms/step - loss: 0.2605 - accuracy: 0.9332 - val_loss: 0.3159 - val_accuracy: 0.9305\n",
      "Epoch 7/20\n",
      "95/95 [==============================] - 6s 63ms/step - loss: 0.2212 - accuracy: 0.9435 - val_loss: 0.3047 - val_accuracy: 0.9337\n",
      "Epoch 8/20\n",
      "95/95 [==============================] - 6s 64ms/step - loss: 0.1884 - accuracy: 0.9523 - val_loss: 0.3058 - val_accuracy: 0.9359\n",
      "Epoch 9/20\n",
      "95/95 [==============================] - 6s 63ms/step - loss: 0.1612 - accuracy: 0.9589 - val_loss: 0.3036 - val_accuracy: 0.9377\n",
      "Epoch 10/20\n",
      "95/95 [==============================] - 6s 66ms/step - loss: 0.1393 - accuracy: 0.9642 - val_loss: 0.3080 - val_accuracy: 0.9390\n",
      "Epoch 11/20\n",
      "95/95 [==============================] - 6s 65ms/step - loss: 0.1217 - accuracy: 0.9684 - val_loss: 0.3115 - val_accuracy: 0.9392\n",
      "Epoch 12/20\n",
      "95/95 [==============================] - 6s 62ms/step - loss: 0.1071 - accuracy: 0.9719 - val_loss: 0.3183 - val_accuracy: 0.9402\n",
      "Epoch 13/20\n",
      "95/95 [==============================] - 6s 62ms/step - loss: 0.0950 - accuracy: 0.9750 - val_loss: 0.3252 - val_accuracy: 0.9404\n",
      "Epoch 14/20\n",
      "95/95 [==============================] - 6s 63ms/step - loss: 0.0846 - accuracy: 0.9779 - val_loss: 0.3346 - val_accuracy: 0.9409\n",
      "Epoch 15/20\n",
      "95/95 [==============================] - 6s 65ms/step - loss: 0.0758 - accuracy: 0.9802 - val_loss: 0.3395 - val_accuracy: 0.9406\n",
      "Epoch 16/20\n",
      "95/95 [==============================] - 6s 67ms/step - loss: 0.0680 - accuracy: 0.9822 - val_loss: 0.3533 - val_accuracy: 0.9407\n",
      "Epoch 17/20\n",
      "95/95 [==============================] - 6s 65ms/step - loss: 0.0606 - accuracy: 0.9844 - val_loss: 0.3563 - val_accuracy: 0.9407\n",
      "Epoch 18/20\n",
      "95/95 [==============================] - 6s 64ms/step - loss: 0.0545 - accuracy: 0.9861 - val_loss: 0.3661 - val_accuracy: 0.9408\n",
      "Epoch 19/20\n",
      "95/95 [==============================] - 6s 66ms/step - loss: 0.0488 - accuracy: 0.9878 - val_loss: 0.3751 - val_accuracy: 0.9405\n",
      "Epoch 20/20\n",
      "95/95 [==============================] - 6s 62ms/step - loss: 0.0439 - accuracy: 0.9893 - val_loss: 0.3879 - val_accuracy: 0.9407\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "id": "c7dd3bba",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "id": "841fc432",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "id": "ded6974a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "28/28 [==============================] - 0s 17ms/step - loss: 0.4221 - accuracy: 0.9364\n",
      "Test accuracy: 0.9364326596260071\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "9f061245",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "id": "5c4b7d16",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1/1 [==============================] - 0s 9ms/step\n",
      "Predicted Named Entities:\n",
      "leptospira: Diagnostic_procedure\n",
      "igm: Diagnostic_procedure\n",
      "antibody: O\n",
      "perform: Date\n",
      "5th: Date\n",
      "day: O\n",
      "illness: Lab_value\n",
      "positive: Diagnostic_procedure\n",
      "igg: Lab_value\n",
      "negative: O\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "id": "2bf7f283",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the fitted tokenizer so its word-to-index mapping can be reloaded later.\n",
    "with open('../data/tokenizer.pickle', 'wb') as tokenizer_file:\n",
    "    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4de8cb8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}