
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "import spacy\n",
    "from sklearn.model_selection import train_test_split\n",
    "import json\n",
    "import os\n",
    "import shutil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting en_core_web_sm==2.2.5\n",
      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)\n",
      "  Preparing metadata (setup.py): started\n",
      "  Preparing metadata (setup.py): finished with status 'done'\n",
      "Requirement already satisfied: spacy>=2.2.2 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from en_core_web_sm==2.2.5) (2.2.4)\n",
      "Requirement already satisfied: setuptools in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (46.1.3)\n",
      "Requirement already satisfied: numpy>=1.15.0 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.21.5)\n",
      "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.2)\n",
      "Requirement already satisfied: plac<1.2.0,>=0.9.6 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.1.3)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n",
      "Requirement already satisfied: thinc==7.4.0 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.4.0)\n",
      "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.6.0)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.27.1)\n",
      "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.3)\n",
      "Requirement already satisfied: blis<0.5.0,>=0.4.0 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (4.64.0)\n",
      "Requirement already satisfied: importlib-metadata>=0.20 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.6.0)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2022.6.15)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.26.12)\n",
      "Requirement already satisfied: idna<4,>=2.5 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.8)\n",
      "Requirement already satisfied: charset-normalizer~=2.0.0 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.12)\n",
      "Requirement already satisfied: colorama in d:\\program files (x86)\\python3.7\\lib\\site-packages (from tqdm<5.0.0,>=4.38.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.3)\n",
      "Requirement already satisfied: zipp>=0.5 in d:\\program files (x86)\\python3.7\\lib\\site-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.1.0)\n",
      "[+] Download and installation successful\n",
      "You can now load the model via spacy.load('en_core_web_sm')\n"
     ]
    }
   ],
   "source": [
    "!python -m spacy download en_core_web_sm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fetching Data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "      <th>mark</th>\n",
       "      <th>label</th>\n",
       "      <th>offset1</th>\n",
       "      <th>offset2</th>\n",
       "      <th>span</th>\n",
       "      <th>code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>es-S0212-71992007000100007-1</td>\n",
       "      <td>T1</td>\n",
       "      <td>ENFERMEDAD</td>\n",
       "      <td>40</td>\n",
       "      <td>61</td>\n",
       "      <td>arterial hypertension</td>\n",
       "      <td>38341003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>es-S0212-71992007000100007-1</td>\n",
       "      <td>T2</td>\n",
       "      <td>ENFERMEDAD</td>\n",
       "      <td>66</td>\n",
       "      <td>79</td>\n",
       "      <td>polyarthrosis</td>\n",
       "      <td>36186002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>es-S0212-71992007000100007-1</td>\n",
       "      <td>T3</td>\n",
       "      <td>ENFERMEDAD</td>\n",
       "      <td>1682</td>\n",
       "      <td>1698</td>\n",
       "      <td>pleural effusion</td>\n",
       "      <td>60046008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>es-S0212-71992007000100007-1</td>\n",
       "      <td>T4</td>\n",
       "      <td>ENFERMEDAD</td>\n",
       "      <td>1859</td>\n",
       "      <td>1875</td>\n",
       "      <td>pleural effusion</td>\n",
       "      <td>60046008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>es-S0212-71992007000100007-1</td>\n",
       "      <td>T5</td>\n",
       "      <td>ENFERMEDAD</td>\n",
       "      <td>1626</td>\n",
       "      <td>1648</td>\n",
       "      <td>lower lobe atelectasis</td>\n",
       "      <td>46621007</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       filename mark       label  offset1  offset2  \\\n",
       "0  es-S0212-71992007000100007-1   T1  ENFERMEDAD       40       61   \n",
       "1  es-S0212-71992007000100007-1   T2  ENFERMEDAD       66       79   \n",
       "2  es-S0212-71992007000100007-1   T3  ENFERMEDAD     1682     1698   \n",
       "3  es-S0212-71992007000100007-1   T4  ENFERMEDAD     1859     1875   \n",
       "4  es-S0212-71992007000100007-1   T5  ENFERMEDAD     1626     1648   \n",
       "\n",
       "                     span      code  \n",
       "0   arterial hypertension  38341003  \n",
       "1           polyarthrosis  36186002  \n",
       "2        pleural effusion  60046008  \n",
       "3        pleural effusion  60046008  \n",
       "4  lower lobe atelectasis  46621007  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "entities = pd.read_csv('data/entities.tsv',sep = '\\t')\n",
    "entities.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_path = 'data/text/'\n",
    "text_files = list(Path(text_path).glob('*.txt'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split into Train, Test, Valid Sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_ratio = 0.75\n",
    "validation_ratio = 0.15\n",
    "test_ratio = 0.10"
   ]
  },
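  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The ratios are applied in two stages: the first `train_test_split` holds out `1 - train_ratio = 0.25` of the files, and the second divides that holdout so that `test_ratio / (test_ratio + validation_ratio) = 0.10 / 0.25 = 0.4` of it becomes the test set, recovering the intended 75/15/10 proportions overall."
   ]
  },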
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.DataFrame()\n",
    "test_df = pd.DataFrame()\n",
    "valid_df = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_text_files, test_text_files = train_test_split(text_files, test_size=1 - train_ratio, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "562"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_text_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "188"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(test_text_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_text_files, test_text_files = train_test_split(test_text_files, test_size=test_ratio/(test_ratio + validation_ratio), random_state=1) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "76"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(test_text_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "112"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(val_text_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_file_text = {}\n",
    "\n",
    "for file in train_text_files:\n",
    "    file = str(file)\n",
    "    with open(file, \"r\", encoding=\"UTF-8\") as f:\n",
    "        file_name = file[len(text_path):-4]\n",
    "        train_file_text.update({file_name: f.read()})\n",
    "\n",
    "train_text_files = [str(text_file)[len(text_path):-4] for text_file in train_text_files]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_file_text = {}\n",
    "\n",
    "for file in test_text_files:\n",
    "    file = str(file)\n",
    "    with open(file, \"r\", encoding=\"UTF-8\") as f:\n",
    "        file_name = file[len(text_path):-4]\n",
    "        test_file_text.update({file_name: f.read()})\n",
    "\n",
    "test_text_files = [str(text_file)[len(text_path):-4] for text_file in test_text_files]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_file_text = {}\n",
    "\n",
    "for file in val_text_files:\n",
    "    file = str(file)\n",
    "    with open(file, \"r\", encoding=\"UTF-8\") as f:\n",
    "        file_name = file[len(text_path):-4]\n",
    "        val_file_text.update({file_name: f.read()})\n",
    "\n",
    "val_text_files = [str(text_file)[len(text_path):-4] for text_file in val_text_files]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentence Splitting & Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp = spacy.load(\"en_core_web_sm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(file_name, file_text):\n",
    "    text = file_text[file_name]\n",
    "    doc = nlp(text)\n",
    "    sentence_ids = []\n",
    "    tokens = []\n",
    "    for i, sent in enumerate(doc.sents):\n",
    "        sentence_ids.append(i)\n",
    "        tokens.append([(tk.text, tk.idx) for tk in sent])\n",
    "    return sentence_ids, tokens"
   ]
  },
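  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check, `tokenize` can be run on a synthetic snippet (the `demo` key and sentence below are illustrative, not from the corpus):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "demo_text = {'demo': 'He had arterial hypertension. Surgery followed in 2002.'}\n",
    "demo_ids, demo_tokens = tokenize('demo', demo_text)\n",
    "demo_ids, demo_tokens[0]"
   ]
  },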
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Converting Spans to IOB Format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_bio_tags(tokens, entities):\n",
    "    tags = []\n",
    "    curr_entity = None\n",
    "    for token in tokens:\n",
    "        if len(entities):\n",
    "            nxt_entity = entities[0]\n",
    "            start, end, lbl = nxt_entity[0], nxt_entity[1], nxt_entity[2]\n",
    "            if token[1] >= start and (token[1] + len(token)) <= end:\n",
    "                if curr_entity:\n",
    "                    tags.append('I-' + lbl)\n",
    "                else:\n",
    "                    tags.append('B-' + lbl)\n",
    "                    curr_entity = nxt_entity\n",
    "                if (token[1] + len(token)) >= end:\n",
    "                    curr_entity = None\n",
    "                    entities.pop(0)\n",
    "            else:\n",
    "                if token[1] >= end:\n",
    "                    entities.pop(0)\n",
    "                tags.append('O')\n",
    "                curr_entity = None\n",
    "        else:\n",
    "            tags.append('O')\n",
    "    return tags"
   ]
  },
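  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the tagging on hand-built inputs (the offsets and entity span below are made up for illustration). The expected output is `['O', 'O', 'B-ENFERMEDAD', 'I-ENFERMEDAD', 'O']`; note that `get_bio_tags` consumes the entity list it is given, so pass it a copy:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tokens of 'He had arterial hypertension .' as (text, char_offset) pairs\n",
    "demo_tokens = [('He', 0), ('had', 3), ('arterial', 7), ('hypertension', 16), ('.', 29)]\n",
    "# one entity: [start_offset, end_offset, label, span_text]\n",
    "demo_entities = [[7, 28, 'ENFERMEDAD', 'arterial hypertension']]\n",
    "get_bio_tags(demo_tokens, demo_entities.copy())"
   ]
  },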
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_info = ['text', 'entities', 'tags', 'sentence_ids', 'tokens']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dump_jsons(file_info, text_files, entities, file_text):\n",
    "    res = {}\n",
    "    for info in file_info:\n",
    "        res[info] = []\n",
    "        for file_num in range(len(text_files)):\n",
    "            if info == 'text':\n",
    "                res[info].append(\"\")\n",
    "            else:\n",
    "                res[info].append([])\n",
    "\n",
    "    file_idx = {}\n",
    "    for idx, file in enumerate(text_files):\n",
    "        file_idx[file] = idx\n",
    "        \n",
    "    for file in text_files:\n",
    "        entities_file = entities.loc[entities['filename'] == file]\n",
    "        for entity in entities_file.itertuples():\n",
    "            file = entity[1]\n",
    "            idx = file_idx[file]\n",
    "            if res['text'][idx] == \"\":\n",
    "                res['text'][idx] = file_text[file]\n",
    "                res['sentence_ids'][idx], res['tokens'][idx] = tokenize(file, file_text)\n",
    "            # entity -> offset1, offset2, label, span\n",
    "            res['entities'][idx].append([entity[4], entity[5], entity[3], entity[6]])\n",
    "\n",
    "    for idx, file_name in enumerate(text_files):\n",
    "        text = file_text[file_name]\n",
    "        doc = nlp(text)\n",
    "        ents = res['entities'][idx].copy()\n",
    "        for i in res['sentence_ids'][idx]:\n",
    "            res['tags'][idx].append(get_bio_tags(res['tokens'][idx][i], ents))\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "res_train = dump_jsons(file_info, train_text_files, entities, train_file_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_no = 1\n",
    "sentence_no = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('He', 126),\n",
       " ('had', 129),\n",
       " ('a', 133),\n",
       " ('personal', 135),\n",
       " ('history', 144),\n",
       " ('of', 152),\n",
       " ('arterial', 155),\n",
       " ('hypertension', 164),\n",
       " ('treated', 177),\n",
       " ('with', 185),\n",
       " ('angiotensin', 190),\n",
       " ('converting', 202),\n",
       " ('enzyme', 213),\n",
       " ('inhibitors', 220),\n",
       " (',', 230),\n",
       " ('surgery', 232),\n",
       " ('for', 240),\n",
       " ('duodenal', 244),\n",
       " ('ulcus', 253),\n",
       " ('in', 259),\n",
       " ('1961', 262),\n",
       " ('and', 267),\n",
       " ('cholecystectomy', 271),\n",
       " ('in', 287),\n",
       " ('2002', 290),\n",
       " ('.', 294),\n",
       " ('\\n', 295)]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_train['tokens'][file_no][sentence_no]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'B-ENFERMEDAD',\n",
       " 'I-ENFERMEDAD',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'B-ENFERMEDAD',\n",
       " 'I-ENFERMEDAD',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O',\n",
       " 'O']"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_train['tags'][file_no][sentence_no]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_train['sentence_ids'][file_no]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[155, 176, 'ENFERMEDAD', 'arterial hypertension'],\n",
       " [244, 258, 'ENFERMEDAD', 'duodenal ulcus'],\n",
       " [299, 335, 'ENFERMEDAD', 'infrarenal abdominal aortic aneurysm'],\n",
       " [549, 566, 'ENFERMEDAD', 'horseshoe kidneys'],\n",
       " [327, 335, 'ENFERMEDAD', 'aneurysm'],\n",
       " [739, 747, 'ENFERMEDAD', 'aneurysm'],\n",
       " [1172, 1180, 'ENFERMEDAD', 'aneurysm'],\n",
       " [1376, 1385, 'ENFERMEDAD', 'aneurysms'],\n",
       " [1575, 1583, 'ENFERMEDAD', 'aneurysm'],\n",
       " [1557, 1583, 'ENFERMEDAD', 'infrarenal aortic aneurysm'],\n",
       " [2186, 2207, 'ENFERMEDAD', 'aneurysmal dilatation']]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_train['entities'][file_no]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'An 81-year-old man was referred from the outpatient clinic to our urology department for symptoms of the lower urinary tract. He had a personal history of arterial hypertension treated with angiotensin converting enzyme inhibitors, surgery for duodenal ulcus in 1961 and cholecystectomy in 2002.\\nAn infrarenal abdominal aortic aneurysm was diagnosed by chance during the abdomino-pelvic ultrasound scan. The CT scan revealed that it did not affect the iliac bifurcation. The examination also showed that the kidneys had a morphology compatible with horseshoe kidneys, with an isthmus located at the level of the infrarenal abdominal aorta. Our case reveals the most favourable situation, in principle, for the surgical approach, since the aneurysm originates distal, 4 cm from the exit of the main renal arteries, which are two, one for each renal half. There is no additional artery at the level of the isthmus.\\nIn successive controls, a progressive increase in the aortic diameter was observed, which, in a control CT scan, had reached a maximum diameter of 8 cm.\\n\\nIn view of these findings, the vascular surgery department decided to consider surgical treatment of the aneurysm.\\nOn physical examination, the patient preserved pulses and an expansive abdominal heartbeat was palpated. The rest of the examination was of no interest.\\nAs part of the preoperative protocol for aneurysms, a baseline echocardiogram and respiratory function tests were performed, all without significant alterations.\\nA retroperitoneal approach was chosen with resection of the infrarenal aortic aneurysm and aorto-aortic bypass with a 16 mm Hemashield prosthesis, without intraoperative complications. And no lumbar or polar arteries were visualised.\\nThe postoperative course was normal, with no complications arising from the procedure, and the patient maintained adequate renal function. At discharge, all pulses were present.\\nThe patient was asymptomatic in terms of vascular alteration; he maintained all pulses in the lower extremities, with preserved renal function parameters and a control CT angiography 6 months after the operation showed aorto-aortic bypass without leaks or evidence of areas of aneurysmal dilatation.\\n\\nAn intravenous urography was also performed, showing the kidneys with horseshoe morphology, pyeloureteral junction in the anterior plane without significant alterations, as well as in the collecting systems.\\n\\n\\n'"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_train['text'][file_no]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "dump_file = 'data/processed_inp_data_train.json'\n",
    "\n",
    "with open(dump_file, 'w') as f:\n",
    "     f.write(json.dumps(res_train))"
   ]
  },
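  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional round-trip check to confirm the dump is valid JSON and has one slot per training file:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dump_file) as f:\n",
    "    reloaded = json.load(f)\n",
    "assert len(reloaded['text']) == len(train_text_files)"
   ]
  },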
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "res_test = dump_jsons(file_info, test_text_files, entities, test_file_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "dump_file = 'data/processed_inp_data_test.json'\n",
    "\n",
    "with open(dump_file, 'w') as f:\n",
    "     f.write(json.dumps(res_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "res_val = dump_jsons(file_info, val_text_files, entities, val_file_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "dump_file = 'data/processed_inp_data_val.json'\n",
    "\n",
    "with open(dump_file, 'w') as f:\n",
    "     f.write(json.dumps(res_val))"
   ]
  },
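  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Saving Per-Split Text Files & Entity Tables"
   ]
  },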
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dir = \"data/text_files/train\"\n",
    "if not os.path.exists(train_dir):\n",
    "    os.makedirs(train_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in train_text_files:\n",
    "    shutil.copyfile(os.path.join(text_path, file + \".txt\"), os.path.join(train_dir, file + \".txt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_dir = \"data/text_files/val\"\n",
    "if not os.path.exists(val_dir):\n",
    "    os.makedirs(val_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in val_text_files:\n",
    "    shutil.copyfile(os.path.join(text_path, file + \".txt\"), os.path.join(val_dir, file + \".txt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_dir = \"data/text_files/test\"\n",
    "if not os.path.exists(test_dir):\n",
    "    os.makedirs(test_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in test_text_files:\n",
    "    shutil.copyfile(os.path.join(text_path, file + \".txt\"), os.path.join(test_dir, file + \".txt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_entities_file = pd.DataFrame()\n",
    "for file in train_text_files:\n",
    "        entities_file = entities.loc[entities['filename'] == file]\n",
    "        temp = pd.concat([train_entities_file, entities_file], ignore_index=True)\n",
    "\n",
    "        train_entities_file.drop(train_entities_file.index[0:], inplace=True)\n",
    "\n",
    "        train_entities_file[temp.columns] = temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_entities_file.to_csv(\"data/train_entities.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_entities_file = pd.DataFrame()\n",
    "for file in val_text_files:\n",
    "        entities_file = entities.loc[entities['filename'] == file]\n",
    "        temp = pd.concat([val_entities_file, entities_file], ignore_index=True)\n",
    "\n",
    "        val_entities_file.drop(val_entities_file.index[0:], inplace=True)\n",
    "\n",
    "        val_entities_file[temp.columns] = temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_entities_file.to_csv(\"data/val_entities.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_entities_file = pd.DataFrame()\n",
    "for file in test_text_files:\n",
    "        entities_file = entities.loc[entities['filename'] == file]\n",
    "        temp = pd.concat([test_entities_file, entities_file], ignore_index=True)\n",
    "\n",
    "        test_entities_file.drop(test_entities_file.index[0:], inplace=True)\n",
    "\n",
    "        test_entities_file[temp.columns] = temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_entities_file.to_csv(\"data/test_entities.csv\", index=False)"
   ]
  },
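  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a final sanity check, the three splits should be disjoint and every saved annotation row should reference a file in its own split:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_set, val_set, test_set = set(train_text_files), set(val_text_files), set(test_text_files)\n",
    "assert train_set.isdisjoint(val_set) and train_set.isdisjoint(test_set) and val_set.isdisjoint(test_set)\n",
    "assert set(train_entities_file['filename']) <= train_set\n",
    "assert set(val_entities_file['filename']) <= val_set\n",
    "assert set(test_entities_file['filename']) <= test_set"
   ]
  },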
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}