--- a +++ b/Entities_NER.ipynb @@ -0,0 +1,15614 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# HuggingFace Installations" + ], + "metadata": { + "id": "vawMUV8TT2dg" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install datasets\n", + "!pip install transformers\n", + "!pip install seqeval" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6yD-WetFMkwS", + "outputId": "5880d2ef-ce47-4a34-a522-5d433e689f8a" + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting datasets\n", + " Downloading datasets-2.6.1-py3-none-any.whl (441 kB)\n", + "\u001b[K |████████████████████████████████| 441 kB 4.9 MB/s \n", + "\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n", + "Collecting huggingface-hub<1.0.0,>=0.2.0\n", + " Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)\n", + "\u001b[K |████████████████████████████████| 163 kB 84.5 MB/s \n", + "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from datasets) (3.8.3)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n", + "Collecting responses<0.19\n", + " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", + "Requirement already satisfied: dill<0.3.6 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.5.1)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.3.5)\n", + "Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (2022.8.2)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.13.0)\n", + "Collecting multiprocess\n", + " Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)\n", + "\u001b[K |████████████████████████████████| 115 kB 93.2 MB/s \n", + "\u001b[?25hRequirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0.1)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.64.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.21.6)\n", + "Collecting xxhash\n", + " Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", + "\u001b[K |████████████████████████████████| 212 kB 87.9 MB/s \n", + "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (22.1.0)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.1.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (4.1.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (1.8.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (6.0.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (0.13.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (1.2.0)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (4.0.2)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.2.0->datasets) (3.8.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.9)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2022.9.24)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", + "Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1\n", + " Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n", + "\u001b[K |████████████████████████████████| 127 kB 91.7 MB/s \n", + "\u001b[?25hRequirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.9.0)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2022.4)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", + "Installing collected packages: urllib3, xxhash, responses, multiprocess, huggingface-hub, datasets\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 1.24.3\n", + " Uninstalling urllib3-1.24.3:\n", + " Successfully uninstalled urllib3-1.24.3\n", + "Successfully installed datasets-2.6.1 huggingface-hub-0.10.1 multiprocess-0.70.13 responses-0.18.0 urllib3-1.25.11 xxhash-3.1.0\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting transformers\n", + " Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)\n", + "\u001b[K |████████████████████████████████| 5.3 MB 5.0 MB/s \n", + "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.8.0)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.13.0)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.64.1)\n", + "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n", + " Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)\n", + "\u001b[K |████████████████████████████████| 7.6 MB 71.4 MB/s \n", + "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (6.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.21.6)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.3)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.10.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2022.6.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.10.0->transformers) (4.1.1)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (3.0.9)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.9.0)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.25.11)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2022.9.24)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", + "Installing collected packages: tokenizers, transformers\n", + "Successfully installed tokenizers-0.13.1 transformers-4.23.1\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting seqeval\n", + " Downloading seqeval-1.2.2.tar.gz (43 kB)\n", + "\u001b[K |████████████████████████████████| 43 kB 1.4 MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.7/dist-packages (from seqeval) (1.21.6)\n", + "Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.7/dist-packages (from seqeval) (1.0.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.21.3->seqeval) (3.1.0)\n", + "Requirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.7.3)\n", + "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.2.0)\n", + "Building wheels for collected packages: seqeval\n", + " Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=23a3a3ae907d4e838c06fc140623ceabf5f6a01792b4d0fc8b0bc856e17c3e53\n", + " Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7\n", + "Successfully built seqeval\n", + "Installing collected packages: seqeval\n", + "Successfully installed seqeval-1.2.2\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "id": "2LEFwSmbKpLP" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import spacy\n", + "import tqdm\n", + "import sys\n", + "from datasets import Dataset, DatasetDict\n", + "from transformers import Trainer\n", + "from transformers import AutoModelForTokenClassification\n", + "from transformers import AutoTokenizer\n", + "from transformers import TrainingArguments\n", + "from transformers import DataCollatorForTokenClassification\n", + "from datasets import load_metric\n", + "from transformers import pipeline\n", + "from transformers import EarlyStoppingCallback, IntervalStrategy" + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G7DLsSK319zN", + "outputId": "6d759f8e-521e-4fcb-c859-0c4d4f620e58" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "%cd /content/drive/MyDrive/IRE" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "czuvYXJp17x-", + "outputId": "c5fa02b9-9703-4ba6-8fe6-baa9c367f653" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/drive/MyDrive/IRE\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JP_EFEuNKpLT" + }, + "source": [ + "# Loading Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "V-_Zs-ZGKpLV", + "outputId": "aea55c01-65dd-4e58-916a-92e83b90ee95" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " filename mark label offset1 offset2 \\\n", + "0 es-S0212-71992007000100007-1 T1 ENFERMEDAD 40 61 \n", + "1 es-S0212-71992007000100007-1 T2 ENFERMEDAD 66 79 \n", + "2 es-S0212-71992007000100007-1 T3 ENFERMEDAD 1682 1698 \n", + "3 es-S0212-71992007000100007-1 T4 ENFERMEDAD 1859 1875 \n", + "4 es-S0212-71992007000100007-1 T5 ENFERMEDAD 1626 1648 \n", + "\n", + " span code \n", + "0 arterial hypertension 38341003 \n", + "1 polyarthrosis 36186002 \n", + "2 pleural effusion 60046008 \n", + "3 pleural effusion 60046008 \n", + "4 lower lobe atelectasis 46621007 " + ], + "text/html": [ + "\n", + " <div id=\"df-d46a4230-7e8d-49af-b801-224b9699bd0e\">\n", + " <div class=\"colab-df-container\">\n", + " <div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>filename</th>\n", + " <th>mark</th>\n", + " <th>label</th>\n", + " <th>offset1</th>\n", + " <th>offset2</th>\n", + " <th>span</th>\n", + " <th>code</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>es-S0212-71992007000100007-1</td>\n", + " <td>T1</td>\n", + " <td>ENFERMEDAD</td>\n", + " <td>40</td>\n", + " <td>61</td>\n", + " <td>arterial hypertension</td>\n", + " <td>38341003</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>es-S0212-71992007000100007-1</td>\n", + " <td>T2</td>\n", + " <td>ENFERMEDAD</td>\n", + " <td>66</td>\n", + " <td>79</td>\n", + " <td>polyarthrosis</td>\n", + " <td>36186002</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>es-S0212-71992007000100007-1</td>\n", + " <td>T3</td>\n", + " <td>ENFERMEDAD</td>\n", + " <td>1682</td>\n", + " <td>1698</td>\n", + " <td>pleural effusion</td>\n", + " <td>60046008</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>es-S0212-71992007000100007-1</td>\n", + " <td>T4</td>\n", + " <td>ENFERMEDAD</td>\n", + " <td>1859</td>\n", + " <td>1875</td>\n", + " <td>pleural effusion</td>\n", + " <td>60046008</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>es-S0212-71992007000100007-1</td>\n", + " <td>T5</td>\n", + " <td>ENFERMEDAD</td>\n", + " <td>1626</td>\n", + " <td>1648</td>\n", + " <td>lower lobe atelectasis</td>\n", + " <td>46621007</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>\n", + " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d46a4230-7e8d-49af-b801-224b9699bd0e')\"\n", + " title=\"Convert this dataframe to an interactive table.\"\n", + " style=\"display:none;\">\n", + " \n", + " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", + " width=\"24px\">\n", + " <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", + " <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", + " </svg>\n", + " </button>\n", + " \n", + " <style>\n", + " .colab-df-container {\n", + " display:flex;\n", + " flex-wrap:wrap;\n", + " gap: 12px;\n", + " }\n", + "\n", + " .colab-df-convert {\n", + " background-color: #E8F0FE;\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: #1967D2;\n", + " height: 32px;\n", + " padding: 0 0 0 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-convert:hover {\n", + " background-color: #E2EBFA;\n", + " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: #174EA6;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert {\n", + " background-color: #3B4455;\n", + " fill: #D2E3FC;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert:hover {\n", + " background-color: #434B5C;\n", + " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", + " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", + " fill: #FFFFFF;\n", + " }\n", + " </style>\n", + "\n", + " <script>\n", + " const buttonEl =\n", + " document.querySelector('#df-d46a4230-7e8d-49af-b801-224b9699bd0e button.colab-df-convert');\n", + " buttonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + "\n", + " async function convertToInteractive(key) {\n", + " const element = document.querySelector('#df-d46a4230-7e8d-49af-b801-224b9699bd0e');\n", + " const dataTable =\n", + " await google.colab.kernel.invokeFunction('convertToInteractive',\n", + " [key], {});\n", + " if (!dataTable) return;\n", + "\n", + " const docLinkHtml = 'Like what you see? Visit the ' +\n", + " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", + " + ' to learn more about interactive tables.';\n", + " element.innerHTML = '';\n", + " dataTable['output_type'] = 'display_data';\n", + " await google.colab.output.renderOutput(dataTable, element);\n", + " const docLink = document.createElement('div');\n", + " docLink.innerHTML = docLinkHtml;\n", + " element.appendChild(docLink);\n", + " }\n", + " </script>\n", + " </div>\n", + " </div>\n", + " " + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "entities = pd.read_csv(\"data/entities.tsv\", delimiter=\"\\t\")\n", + "entities.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "9ZrFWsjmKpLW" + }, + "outputs": [], + "source": [ + "list_off0 = list(entities['offset1'])\n", + "list_off1 = list(entities['offset2'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "Mafcxoi4KpLW" + }, + "outputs": [], + "source": [ + "text_files_path = \"data/text\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OFCPFj9SKpLX", + "outputId": "8946ced4-8b26-49fe-a6e4-6aa3e4c8e16d" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A 73-year-old patient with a history of arterial hypertension and polyarthrosis presented to the emergency department with abdominal distension and pain associated with constipation and febrile fever. The symptoms had started three weeks earlier and worsened during the four days prior to admission. During this period, an upper gastrointestinal fibroendoscopy (oesophagus, stomach and duodenum) and a colonoscopy (up to the splenic angle) were performed, but no abnormalities were found.\n", + "\n", + "Physical examination revealed a low-grade fever (37.6º C), a distended abdomen, diffusely painful on palpation, tympanised on percussion, with scant borborygmi but no evidence of peritonism, pulmonary auscultation with decreased ventilation in the lower half of the right hemithorax and the onset of intense pain on palpation and percussion of the last three dorsal spinous processes.\n", + "\n", + "Analyses showed 8.2 x 109 leukocytes / L, haemoglobin 136 g / L, platelets 186 x 109 / L. Except for glycaemia (123 mg/dl), the following laboratory parameters were normal or negative: urea, creatinine, bilirubin, transaminases, gamma-glutamyltranspeptidase, sodium, potassium, chlorine, calcium, phosphorus, creatine phosphokinase, amylase, lactate dehydrogenase (LDH), proteinogram, immunoglobulin dosage, alpha-fetoprotein, CA 19 antigens. 9 and CA 125 antigens, as well as general urinalysis. ESR and C-reactive protein were elevated, with values of 85 mm / 1 h and 133 mg / L (normal < 5 mg / L), respectively. Mantoux intradermal reaction (10 IU RT-23) was positive, with an induration of 25 mm. Chest X-ray showed an image compatible with right lower lobe atelectasis in the context of an ipsilateral pleural effusion. There were no signs suggestive of adenopathy or alterations in the cardiopericardial silhouette. A thoracoabdominal CT scan confirmed the existence of a right pleural effusion and identified prominent degenerative changes along the dorsolumbar spine but, above all, erosions in the vertebral plates adjacent to the D10-D11 disc space. A lumbar MRI showed hyposignal on T1-weighted sequences and hypersignal on T2-weighted sequences in these vertebrae and their corresponding disc, with morphological alterations typical of infectious spondylodiscitis D10-D11. Three serial blood cultures were negative. Samples obtained by aspiration of the D10-D11 space showed gram-positive cocci chains, which were subsequently recovered and typed as penicillin-sensitive Streptococcus pneumoniae. Pleural fluid analysis showed pH: 7.55; leucocytes: 8.4 x 109/L (58% neutrophils, 26% eosinophils, 16% lymphocytes), protein: 48 g/L (ratio to serum protein: 0.65), glucose: 125 mg/dl, ADA: 25.92 IU/ml, LDH: 362 U/L (pleural LDH/serum LDH ratio: 0.8). Both auramine-rhodamine staining and Löwenstein-Jensen medium culture of pleural fluid were negative and cytology showed no evidence of neoplastic cells.\n", + "\n", + "\n", + "\n", + "The patient was initially treated intravenously with amoxicillin + clavulanic acid (1 g / 200 mg, every 8 hours). After 21 days, she was switched to the oral route (875 / 125 mg, every 8 hours) for 6 weeks. The evolution was favourable and she was able to start walking with a dorsolumbar corset after the fourth week. One month after the end of antibiotic therapy, a control chest CT scan still showed a discrete pleural effusion, but the patient had only mild mechanical dorsalgia, her ESR had decreased to 21 mm / 1 h and her CRP was 2.4 mg/L. Outpatient follow-up continued for a further three years, during which time the evolution was favourable and a D10-D11 vertebral block was formed.\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "f = open(text_files_path + \"/\" + entities.iloc[1,0] + \".txt\", \"r\", encoding=\"UTF-8\")\n", + "for l in f:\n", + " print(l)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UeML6HJmKpLX", + "outputId": "de8c52d4-49f7-441b-89af-cce4183b18e9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 6650/6650 [01:51<00:00, 59.70it/s]\n" + ] + } + ], + "source": [ + "#Clinical cases\n", + "HCs = {}\n", + "for fid in tqdm.tqdm(range(len(entities[\"filename\"]))):\n", + " fname = entities[\"filename\"][fid]\n", + " with open(text_files_path + \"/\" + fname + \".txt\", \"r\", encoding=\"UTF-8\") as f:\n", + " HCs.update({fname: f.read()})" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "nivRpUrBKpLY" + }, + "outputs": [], + "source": [ + "#Diseases\n", + "ENF = {}\n", + "enfermedades = []\n", + "fn = entities[\"filename\"][0]\n", + "for fname, enf in zip(entities[\"filename\"], entities[\"span\"]):\n", + " if fname!=fn:\n", + " enfermedades = []\n", + " enfermedades.append(enf)\n", + " ENF.update({fname: enfermedades})\n", + " fn = fname" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nYI19JFtKpLY", + "outputId": "68cd794c-7b51-488f-ccf8-af9e574794cb" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "741" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "len(ENF)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Preprocessing" + ], + "metadata": { + "id": "qviw-SCgUDwK" + } + }, + { + "cell_type": "code", + "source": [ + "!python -m spacy download en_core_web_sm" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xtkwMJhYNP17", + "outputId": "4c9fdacb-c17f-4bef-909d-fe6b1117dc06" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting en-core-web-sm==3.4.1\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)\n", + "\u001b[K |████████████████████████████████| 12.8 MB 2.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: spacy<3.5.0,>=3.4.0 in /usr/local/lib/python3.7/dist-packages (from en-core-web-sm==3.4.1) (3.4.1)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.8)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (4.64.1)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.7)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.4.4)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (57.4.0)\n", + "Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.6.2)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.23.0)\n", + "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (8.1.4)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.9 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.10)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.10.1)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.8)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.11.3)\n", + "Requirement already satisfied: typing-extensions<4.2.0,>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (4.1.1)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.3.0)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (21.3)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.0.9)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.9.2)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.0.3)\n", + "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.4.2)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.21.6)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from catalogue<2.1.0,>=2.0.6->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.9.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.9)\n", + "Requirement already satisfied: smart-open<6.0.0,>=5.2.1 in /usr/local/lib/python3.7/dist-packages (from pathy>=0.3.5->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (5.2.1)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2022.9.24)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.25.11)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.4)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.7/dist-packages (from thinc<8.2.0,>=8.1.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.0.3)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.7/dist-packages (from thinc<8.2.0,>=8.1.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.7.8)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.7/dist-packages (from typer<0.5.0,>=0.3.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (7.1.2)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.1)\n", + "Installing collected packages: en-core-web-sm\n", + " Attempting uninstall: en-core-web-sm\n", + " Found existing installation: en-core-web-sm 3.4.0\n", + " Uninstalling en-core-web-sm-3.4.0:\n", + " Successfully uninstalled en-core-web-sm-3.4.0\n", + "Successfully installed en-core-web-sm-3.4.1\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('en_core_web_sm')\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "6at1DC6zKpLZ" + }, + "outputs": [], + "source": [ + "nlp = spacy.load(\"en_core_web_sm\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "r-OrghwTKpLZ" + }, + "outputs": [], + "source": [ + "HCs_tokenized = []\n", + "for hc in HCs:\n", + " hl = []\n", + " tokens = nlp(HCs[hc])\n", + " #tokens = HCs[hc].split(\" \") #The simplest option\n", + " for t in tokens:\n", + " hl.append(str(t))\n", + " HCs_tokenized.append(hl)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "c4Nh9SJ9KpLa", + "outputId": "4cf4dd3e-e57b-4f52-98d3-ba2fc55632ab" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "741" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ], + "source": [ + "len(HCs_tokenized)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "mJkuhsnTKpLa" + }, + "outputs": [], + "source": [ + "Ent_tokenized = []\n", + "for enf in ENF:\n", + " Tks = []\n", + " for e in ENF[enf]:\n", + " sl = []\n", + " tokens = nlp(e)\n", + " #tokens = e.split(\" \")\n", + " for t in tokens:\n", + " sl.append(str(t))\n", + " Tks.append(sl)\n", + " Ent_tokenized.append(Tks)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "195X74buKpLa", + "outputId": "ad159ed3-e174-4048-9d30-9a92e94dd653" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "741" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "len(Ent_tokenized)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mtLgqICDKpLb", + "outputId": "d6befaf5-1174-4e9e-8cb1-768df6a768ac" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[['arterial', 'hypertension'],\n", + " ['polyarthrosis'],\n", + " ['pleural', 'effusion'],\n", + " ['pleural', 'effusion'],\n", + " ['lower', 'lobe', 'atelectasis'],\n", + " ['infectious', 'spondylodiscitis', 'D10', '-', 'D11'],\n", + " ['pleural', 'effusion']]" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ], + "source": [ + "Ent_tokenized[0]" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Tagging Data with BIO scheme" + ], + "metadata": { + "id": "ZapndudTUQvP" + } + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "O94wXbu4KpLb" + }, + "outputs": [], + "source": [ + "def find_idx(list_to_check, item_to_find):\n", + " indices = []\n", + " for idx, value in enumerate(list_to_check):\n", + " if value == item_to_find:\n", + " indices.append(idx)\n", + " return indices" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xy-dmZodKpLb", + "outputId": "d34b67ea-5d01-4120-9cdb-4801dc7a8108" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['A', '43', '-', 'year', '-', 'old', 'man', 'was', 'admitted', 'to', 'the', 'emergency', 'department', 'due', 'to', 'sudden', 'left', 'lumbar', 'pain', ',', 'continuous', 'and', 'incapacitating', ',', 'without', 'antalgic', 'position', 'or', 'aggravating', 'factors', ',', 'without', 'irradiation', ',', 'with', 'approximately', '23', 'hours', 'of', 'evolution', '.', 'No', 'nausea', 'or', 'vomiting', ',', 'no', 'macroscopic', 'haematuria', 'or', 'lower', 'urinary', 'tract', 'discomfort', '.', 'Absence', 'of', 'precordial', 'pain', '.', 'Hypertension', 'controlled', 'with', 'verapamil', '.', 'He', 'reported', 'an', 'episode', ',', 'interpreted', 'as', 'a', 'transient', 'ischaemic', 'attack', ',', 'approximately', 'eight', 'weeks', 'earlier', '(', 'not', 'confirmed', ')', '.', 'No', 'history', 'of', 'cardiac', 'arrhythmia', 'or', 'valvular', 'heart', 'disease', '.', 'No', 'other', 'previous', 'thromboembolic', 'episodes', '.', 'No', 'known', 'history', 'of', 'urinary', 'lithiasis', '.', 'No', 'osteoarticular', 'or', 'respiratory', 'complaints', '.', 'No', 'cocaine', 'abuse', '.', 'No', 'history', 'of', 'hepatitis', 'B', 'or', 'C.', 'Medicated', 'with', '160', 'mg', '/', 'day', 'of', 'verapamil', '.', '\\n', 'Physical', 'examination', 'showed', 'diaphoretic', ',', 'BP', '150', '/', '110', 'mmHg', ',', '80', 'beats', 'per', 'minute', ',', 'rhythmic', 'and', 'wide', '.', 'Temperature', '37.8', 'ºC.', 'Abdomen', 'painful', 'on', 'deep', 'palpation', 'in', 'the', 'left', 'iliac', 'fossa', 'and', 'flank', ',', 'with', 'defence', ',', 'with', 'no', 'signs', 'of', 'peritoneal', 'irritation', '.', 'Decreased', 'RHA', '.', 'No', 'abdominal', 'murmurs', '.', 'Negative', 'bilateral', 'renal', 'Murphy', '.', 'Existence', 'of', 'symmetrical', 'arterial', 'pulses', '.', 'No', 'perfusion', 'deficit', 'in', 'the', 'extremities', '.', 'General', 'neurological', 'examination', 'without', 'alterations', '.', '\\n', 'Renal', 'ultrasound', 'showed', 'no', 'abnormalities', ',', 'especially', 'dilatation', 'of', 'the', 'urinary', 'tract', '.', 'Laboratory', 'tests', ':', 'Hb15.6', 'g', '/', 'dL', ',', 'Leuc', '13,800/µL', ',', 'Neut', '76.1', '%', ',', 'Creat', '1.4', 'mg', '/', 'dL', ',', 'TGO', '104', 'UI', '/', 'L', ',', 'TGP', '74', 'UI', '/', 'L', ',', 'LDH', '1,890', 'UI', '/', 'L.', 'Coagulation', 'parameters', 'showed', 'no', 'alterations', '.', 'The', 'ECG', 'showed', 'sinus', 'rhythm', ',', 'with', 'no', 'alterations', 'compatible', 'with', 'acute', 'myocardial', 'ischaemia', '.', 'An', 'abdominal', 'and', 'pelvic', 'CT', 'scan', 'was', 'requested', ',', 'which', 'showed', 'the', 'presence', 'of', 'multiple', 'areas', 'without', 'contrast', 'uptake', 'in', 'the', 'left', 'kidney', ',', 'without', 'morpho', '-', 'structural', 'alterations', ',', 'compatible', 'with', 'multifocal', 'areas', 'of', 'ischaemia', ',', 'with', 'multisegmental', 'distribution', ',', 'probably', 'of', 'embolic', 'aetiology', '.', 'No', 'aortic', 'dilatation', 'or', 'renal', 'artery', 'aneurysm', '.', 'No', 'intra', '-', 'peritoneal', 'alterations', '.', 'Taking', 'into', 'account', 'the', 'multi', '-', 'segmental', 'distribution', 'of', 'the', 'ischaemic', 'process', 'and', 'the', 'duration', 'of', 'discomfort', ',', 'we', 'decided', 'that', 'there', 'was', 'no', 'indication', 'for', 'invasive', 'manoeuvres', '.', 'The', 'patient', 'underwent', 'systemic', 'hypo', '-', 'coagulation', 'with', 'heparin', 'in', 'an', 'attempt', 'to', 'avoid', 'future', 'embolic', 'episodes', 'and', 'appropriate', 'analgesia', '.', '\\n\\n', 'He', 'then', 'underwent', 'multiple', 'examinations', 'in', 'an', 'attempt', 'to', 'identify', 'an', 'embolic', 'focus', 'and', 'the', 'aetiological', 'process', '.', 'Echocardiography', 'ruled', 'out', 'pathology', 'of', 'the', 'cardiac', 'valvular', 'apparatus', 'or', 'the', 'existence', 'of', 'valvular', 'vegetations', '.', 'Absence', 'of', 'areas', 'of', 'myocardial', 'dyskinesia', '.', 'Arteriography', 'showed', 'a', 'perfusion', 'deficit', 'of', 'the', 'lower', 'pole', 'of', 'the', 'left', 'kidney', 'with', 'multiple', 'other', 'less', 'prominent', 'areas', 'showing', 'perfusion', 'deficits', 'as', 'well', '.', 'No', 'other', 'alterations', 'such', 'as', 'macro', '/', 'microaneurysms', 'or', 'alterations', 'of', 'the', 'main', 'renal', 'artery', 'or', 'aorta', 'were', 'detected', '.', '\\n', 'The', 'study', 'to', 'rule', 'out', 'prothrombotic', 'and', 'vascular', 'disease', '(', 'lupus', 'anticoagulant', ',', 'anti', '-', 'cardiolipin', ',', 'ANCA', \"'s\", ',', 'detection', 'of', 'cryoglobulins', ',', 'ANA', \"'s\", ',', 'determination', 'of', 'immunoglobulins', 'and', 'complement', 'fractions', ')', 'was', 'negative', '.', '\\n', 'After', '15', 'months', 'of', 'follow', '-', 'up', ',', 'we', 'were', 'left', 'without', 'an', 'aetiological', 'diagnosis', '.', 'The', 'patient', 'remains', 'asymptomatic', ',', 'with', 'no', 'new', 'episodes', 'of', 'embolism', 'or', 'manifestations', 'of', 'systemic', 'disease', '.', 'Anti', '-', 'coagulation', 'therapy', 'has', 'been', 'discontinued', '.', 'He', 'maintains', 'controlled', 'hypertension', 'with', 'the', 'same', 'dose', 'of', 'verapamil', '.', 'The', 'last', 'analytical', 'control', 'had', 'a', 'serum', 'creatinine', 'of', '1.2', 'mg', '/', 'dL', ',', 'and', 'GFR', 'of', '93', 'ml', '/', 'min', '.', 'The', 'follow', '-', 'up', 'kinillogram', 'shows', 'a', 'functional', 'deficit', 'of', 'the', 'affected', 'renal', 'unit', '(', 'differential', 'function', '41', '%', ')', '.', '\\n\\n']\n", + "[['transient', 'ischaemic', 'attack'], ['cardiac', 'arrhythmia'], ['valvular', 'heart', 'disease'], ['thromboembolic', 'episodes'], ['urinary', 'lithiasis'], ['cocaine', 'abuse'], ['hepatitis', 'B', 'or', 'C'], ['Hypertension'], ['acute', 'myocardial', 'ischaemia'], ['aortic', 'dilatation'], ['renal', 'artery', 'aneurysm'], ['ischaemia'], ['ischaemia'], ['valvular', 'vegetations'], ['myocardial', 'dyskinesia'], ['macro', '/', 'microaneurysms', 'or', 'alterations', 'of', 'the', 'main', 'renal', 'artery', 'or', 'aorta'], ['prothrombotic', 'and', 'vascular', 'disease'], ['hypertension'], ['embolism'], ['systemic', 'disease']]\n", + "['hepatitis', 'B', 'or', 'C']\n", + "C\n", + "27\n", + "['A', 'four', '-', 'month', '-', 'old', 'boy', 'was', 'admitted', 'with', 'vomiting', ',', 'diarrhoea', 'and', 'severe', 'weight', 'and', 'body', 'weight', 'delay', '.', 'He', 'presented', 'hypotonia', ',', 'bilateral', 'convergent', 'strabismus', 'and', 'inverted', 'nipples', '.', 'Healthy', ',', 'non', '-', 'consanguineous', 'parents', ',', 'normal', 'pregnancy', 'and', 'delivery', '.', 'Since', 'the', 'age', 'of', 'one', 'month', 'she', 'has', 'had', 'poor', 'intake', ',', 'growth', 'failure', 'and', 'vomiting', ',', 'with', 'normal', 'abdominal', 'and', 'transfontanelar', 'ultrasound', ',', 'chest', 'X', '-', 'ray', ',', 'blood', 'tests', 'and', 'urine', 'culture', '.', '\\n\\n', 'On', 'admission', ',', 'rotavirus', 'was', 'detected', 'in', 'stool', '.', 'After', 'resolution', 'of', 'the', 'acute', 'process', ',', 'nutritional', 'support', 'was', 'started', 'with', 'artificial', 'protein', 'hydrolysate', 'formula', ';', 'later', ',', 'the', 'patient', 'was', 'switched', 'to', 'elemental', 'formula', 'as', 'he', 'continued', 'to', 'show', 'poor', 'weight', 'gain', '.', 'The', 'study', 'performed', 'detected', 'hypertransaminemia', ',', 'hypoalbuminemia', ',', 'decreased', 'transferrin', 'and', 'ceruloplasmin', ',', 'with', 'normal', 'hepatotropic', 'virus', 'serology', 'and', 'echocardiography', '.', 'Given', 'the', 'suspicion', 'of', 'a', 'possible', 'inborn', 'error', 'of', 'metabolism', '(', 'IEM', ')', ',', 'a', 'metabolic', 'study', 'was', 'requested', ',', 'while', 'the', 'patient', 'required', 'admission', 'to', 'the', 'Intensive', 'Care', 'Unit', 'due', 'to', 'the', 'onset', 'of', 'symptoms', 'compatible', 'with', 'sepsis', '(', 'fever', ',', 'hypoglycaemia', ',', 'poor', 'general', 'condition', 'and', 'poor', 'colouring', ')', '.', 'A', 'cranial', 'MRI', 'scan', 'was', 'performed', ',', 'which', 'showed', 'cerebellar', 'hypoplasia', '.', 'The', 'result', 'of', 'the', 'metabolic', 'study', 'confirms', 'a', 'congenital', 'protein', 'glycosylation', 'defect', '(', 'CGD', ')', 'type', 'Ia.', '\\n\\n\\n']\n", + "[['bilateral', 'convergent', 'strabismus'], ['inverted', 'nipples'], ['inborn', 'error', 'of', 'metabolism'], ['IEM'], ['sepsis'], ['cerebellar', 'hypoplasia'], ['congenital', 'protein', 'glycosylation', 'defect', '(', 'CGD', ')', 'type', 'Ia'], ['CGD'], ['hypoglycaemia']]\n", + "['congenital', 'protein', 'glycosylation', 'defect', '(', 'CGD', ')', 'type', 'Ia']\n", + "Ia\n", + "205\n", + "['Patient', 'aged', '53', 'years', 'at', 'the', 'time', 'of', 'diagnosis', 'with', 'a', 'personal', 'history', 'of', 'a', 'caesarean', 'section', ',', 'arterial', 'hypertension', 'and', 'tachycardia', 'treated', 'with', 'Atenolol', ',', 'who', 'came', 'to', 'the', 'Emergency', 'Department', 'with', 'a', '5', '-', 'month', 'history', 'of', 'progressive', 'dull', 'pain', 'in', 'the', 'left', 'flank', 'and', 'microhaematuria', '.', 'Ultrasound', 'and', 'then', 'abdominal', '-', 'pelvic', 'computerised', 'axial', 'tomography', '(', 'CAT', ')', 'scans', 'revealed', 'a', 'large', 'solid', 'mass', 'measuring', '20x16x13', 'cm', 'arising', 'from', 'the', 'left', 'renal', 'pole', ',', 'without', 'associated', 'venous', 'thrombosis', '.', 'Para', '-', 'aortic', 'adenopathies', 'were', 'also', 'observed', '.', 'All', 'this', 'was', 'compatible', 'with', 'a', 'left', 'renal', 'tumour', '.', '\\n\\n', 'In', 'view', 'of', 'this', 'finding', ',', 'an', 'extension', 'study', 'was', 'carried', 'out', 'with', 'a', 'general', 'analysis', 'showing', 'mild', 'iron', '-', 'deficiency', 'microcytic', 'anaemia', 'and', 'hyperuricaemia', ',', 'a', 'normal', 'chest', 'X', '-', 'ray', ',', 'a', 'bone', 'scan', 'showing', 'an', 'image', 'with', 'a', 'slight', 'increase', 'in', 'tracer', 'uptake', 'corresponding', 'to', 'soft', 'tissue', ',', 'with', 'a', 'rounded', 'morphology', ',', 'located', 'in', 'the', 'hypochondrium', 'and', 'in', 'the', 'hypochondrium', ',', 'located', 'in', 'the', 'left', 'hypochondrium', 'and', 'void', ',', 'exceeding', 'the', 'midline', ',', 'and', '3', 'foci', 'of', 'tracer', 'uptake', ',', 'one', 'in', 'the', 'left', 'iliac', 'blade', ',', 'the', 'second', 'in', 'the', 'left', 'hemivertebrae', 'L4', 'and', 'L5', 'and', 'the', 'third', 'at', 'the', 'level', 'of', 'the', 'right', 'hemivertebra', 'L2', ',', 'which', 'could', 'correspond', 'to', 'bone', 'involvement', 'due', 'to', 'contiguity', 'or', 'be', 'metastatic', ',', 'and', 'an', 'abdominal', 'MRI', 'scan', 'which', 'confirms', 'the', 'findings', 'of', 'the', 'CT', 'scan', '.', '\\n', 'With', 'the', 'presumptive', 'diagnosis', 'of', 'renal', 'cell', 'carcinoma', 'with', 'retroperitoneal', 'adenopathy', ',', 'the', 'patient', 'underwent', 'radical', 'left', 'nephrectomy', ',', 'removal', 'of', 'all', 'the', 'pararenal', 'fat', 'and', 'the', 'left', 'adrenal', 'gland', 'and', 'para', '-', 'aortic', 'and', 'interaortocaval', 'lymphadenectomy', 'from', 'the', 'renal', 'artery', 'to', '3', 'cm', 'from', 'the', 'common', 'iliac', 'artery', '.', 'Removal', 'of', 'the', 'mass', 'was', 'difficult', 'due', 'to', 'infiltration', 'of', 'the', 'transverse', 'mesocolon', 'and', 'tail', 'of', 'the', 'pancreas', ',', 'which', 'were', 'released', ',', 'leaving', 'the', 'mesocolon', 'untouched', '.', 'The', 'anatomo', '-', 'pathological', '(', 'A.P.', ')', 'result', 'was', 'as', 'follows', ':', 'Collision', 'renal', 'tumour', '(', 'Leiomyosarcoma', '(', '21', 'x', '15', 'cm', ')', 'and', 'renal', 'carcinoma', 'papillary', 'type', 'nuclear', 'grade', '3', '(', '7', 'x', '3.5', 'cm', ')', ')', '.', 'The', 'weight', 'of', 'the', 'whole', 'specimen', 'was', '2539', 'grams', '.', 'The', 'tumour', 'was', 'in', 'contact', 'with', 'the', 'surgical', 'edge', 'in', 'most', 'areas', '.', 'The', 'renal', 'parenchyma', 'was', 'microscopically', 'respected', 'and', 'no', 'tumour', 'infiltration', 'was', 'observed', '.', 'The', 'ureteral', 'fragment', 'and', 'renal', 'hilum', 'were', 'free', 'of', 'tumour', 'infiltration', '.', 'The', 'immunophenotypic', 'profile', 'of', 'the', 'tumour', 'was', 'as', 'follows', ':', 'Actin', ',', 'Desmin', ',', 'S-100', ',', 'Synaptofusin', 'and', 'CD', '56', 'and', 'c', '-', 'kit', 'negative', ';', 'Smooth', 'muscle', 'actin', 'positive', 'in', 'the', 'sarcomatous', 'zone', 'and', 'keratin', 'cocktail', '(', 'E1', ',', 'E3', ')', 'positive', 'in', 'the', 'carcinomatous', 'zone', '.', 'At', 'the', 'level', 'of', 'the', 'para', '-', 'aortic', 'chain', ',', '16', 'adenopathies', 'were', 'isolated', ',', 'the', 'largest', 'measuring', '2.5', 'cm', ',', 'with', 'metastases', 'in', '14', 'of', 'them', ',', '13', 'from', 'the', 'carcinoma', 'and', '1', 'with', 'mixed', 'metastases', '(', 'sarcoma+carcinoma', ')', '.', 'Six', 'adenopathies', 'were', 'isolated', 'in', 'the', 'interaortocaval', 'chain', ',', 'the', 'largest', 'measuring', '1.4', 'cm', ',', 'three', 'of', 'which', 'were', 'carcinoma', 'metastases', '.', 'The', 'mesocolic', 'bed', 'was', 'infiltrated', 'by', 'leiomyosarcoma', '.', 'In', 'the', 'perisuprarenal', 'adipose', 'tissue', '4', 'adenopathies', 'were', 'isolated', ',', '3', 'of', 'them', 'with', 'metastasis', 'of', 'the', 'carcinoma', 'and', 'another', 'with', 'mixed', 'metastasis', '(', 'carcinoma+sarcoma', ')', '.', 'The', 'left', 'adrenal', 'gland', ',', 'the', 'perirenal', 'fat', 'and', 'the', 'gall', 'bladder', 'showed', 'no', 'tumour', 'elements', '.', '\\n\\n', 'We', 'were', 'therefore', 'faced', 'with', 'a', 'renal', 'collision', 'tumour', 'consisting', 'of', 'a', 'stage', 'IV', 'papillary', 'type', 'renal', 'cancer', '(', 'pT3', '-', '4pN2', ')', 'according', 'to', 'the', 'TNM', 'classification', 'and', 'a', 'stage', 'IV', 'renal', 'leiomyosarcoma', '(', 'pT2bpN1', ')', 'according', 'to', 'the', 'AJCC', 'classification', ',', 'not', 'radically', 'resected', 'and', 'with', 'possible', 'bone', 'metastases', 'according', 'to', 'bone', 'scintigraphy', '.', '\\n', 'The', 'postoperative', 'period', 'was', 'uneventful', 'and', 'the', 'patient', 'was', 'referred', 'to', 'the', 'Medical', 'Oncology', 'Department', '.', 'It', 'was', 'decided', 'to', 'propose', 'complementary', 'chemotherapy', 'treatment', 'with', 'Ifosfamide', '5', 'g', '/', 'm2', 'in', 'a', 'continuous', 'infusion', 'of', '24h', 'x', '1', 'day', '+', 'Adriamycin', '60', 'mg', '/', 'm2', 'x', '1', 'day/21', 'days', 'against', 'the', 'sarcomatous', 'component', 'of', 'the', 'tumour', '.', 'A', 'CT', 'scan', 'was', 'previously', 'requested', 'in', 'which', 'a', 'small', 'soft', 'tissue', 'enlargement', 'was', 'observed', 'behind', 'the', 'pancreatic', 'tail', 'and', 'renal', 'bed', ',', 'which', 'could', 'be', 'compatible', 'with', 'present', 'disease', '.', '\\n', 'The', 'patient', 'started', 'treatment', 'according', 'to', 'the', 'planned', 'schedule', '3', 'weeks', 'after', 'surgery', '.', 'She', 'received', 'a', 'total', 'of', '6', 'cycles', 'with', 'good', 'clinical', 'tolerance', '.', 'After', 'the', '4th', 'cycle', ',', 'an', 'abdominal', 'CAT', 'scan', 'was', 'performed', ',', 'which', 'was', 'normal', ',', 'and', 'at', 'the', 'end', 'of', 'the', '6th', 'cycle', ',', 'a', 'bone', 'scan', 'was', 'performed', ',', 'which', 'showed', 'no', 'pathological', 'findings', '.', 'The', 'patient', 'underwent', 'regular', 'check', '-', 'ups', 'and', '3', 'months', 'later', 'a', 'chest', 'X', '-', 'ray', 'was', 'performed', 'showing', 'images', 'suggestive', 'of', 'bilateral', 'pulmonary', 'metastases', ',', 'which', 'were', 'confirmed', 'by', 'a', 'CT', 'scan', 'showing', 'multiple', 'bilateral', 'millimetric', 'pulmonary', 'metastases', '.', 'In', 'order', 'to', 'identify', 'the', 'origin', 'of', 'these', 'metastases', ',', 'Thoracic', 'Surgery', 'was', 'consulted', 'and', 'it', 'was', 'decided', 'to', 'perform', 'a', 'left', 'videothoracoscopy', 'with', 'biopsies', '.', 'The', 'P.A.', 'diagnosis', 'was', 'metastasis', 'of', 'poorly', 'differentiated', 'carcinoma', 'with', 'an', 'epithelial', 'component', '(', 'renal', ')', '.', 'In', 'view', 'of', 'this', 'diagnosis', ',', '6', 'months', 'after', 'finishing', 'the', 'first', 'chemotherapy', 'regimen', ',', 'it', 'was', 'decided', 'to', 'start', 'a', 'second', 'line', 'of', 'treatment', 'with', 'a', 'chemotherapy', 'regimen', 'with', 'Gemzitabine', 'and', 'Fluoropyrimidines9', 'that', 'had', 'proved', 'useful', 'in', 'stage', 'IV', 'renal', 'carcinoma', ':', 'Gemcitabine', '1000', 'mg', '/', 'm2', 'days', '1', 'and', '8', '+', 'Capecitabine', '1000', 'mg', '/', 'm2/12h', 'days', '1', '-', '14/21', 'days', ',', 'which', 'the', 'patient', 'accepted', '.', '\\n\\n', 'Treatment', 'was', 'started', 'with', 'a', '20', '%', 'dose', 'reduction', 'which', 'was', 'maintained', 'for', 'the', 'rest', 'of', 'the', 'treatment', 'given', 'the', 'patient', \"'s\", 'general', 'condition', '(', 'ECOG', ':', '1', '-', '2', ')', '.', 'After', 'the', 'second', 'cycle', ',', 'the', 'patient', 'suffered', 'a', 'complication', 'of', 'pulmonary', 'thromboembolism', 'from', 'which', 'she', 'recovered', 'but', 'which', 'caused', 'a', 'delay', 'of', '4', 'weeks', 'in', 'the', 'administration', 'of', 'the', 'third', 'cycle', '.', 'After', '6', 'cycles', 'of', 'treatment', ',', 'which', 'she', 'received', 'with', 'acceptable', 'tolerance', 'except', 'for', 'grade', '4', 'anaemia', ',', 'she', 'was', 're', '-', 'evaluated', 'with', 'a', 'body', 'CT', 'scan', 'which', 'showed', 'persistent', 'pulmonary', 'metastases', 'with', 'the', 'appearance', 'of', 'liver', 'and', 'spleen', 'metastases', 'and', 'local', 'relapse', '.', '\\n', 'In', 'view', 'of', 'this', 'progression', ',', 'treatment', 'with', 'IL-2', 'was', 'proposed', 'for', '6', 'weeks', '(', '1', 'week', 'of', 'induction', 'with', '18', 'Million', 'Units', '(', 'MU', ')', 'x', '5', 'days', 'and', '5', 'weeks', ':', '9', 'MU', 'days', '1', 'and', '2', 'and', '18', 'MU', 'days', '3', 'to', '5)10', '.', 'The', 'patient', 'accepted', 'the', 'treatment', 'with', 'moderate', 'toxicity', 'with', 'secondary', 'constitutional', 'symptoms', 'grade', '2', ',', 'anaemia', 'grade', '3', 'and', 'emesis', 'grade', '1', ',', 'maintaining', 'her', 'general', 'condition', '.', '\\n', 'At', 'the', 'end', 'of', 'treatment', ',', 'a', 'new', 're', '-', 'evaluation', 'was', 'performed', 'with', 'a', 'CT', 'scan', 'showing', 'progression', 'of', 'the', 'disease', 'with', 'a', 'large', 'mass', 'in', 'the', 'surgical', 'site', 'measuring', '19x10x5', 'cm', ',', 'which', 'had', 'grown', 'with', 'respect', 'to', 'the', 'previous', 'CT', 'scan', ',', 'and', 'persistent', 'metastases', 'in', 'the', 'rest', 'of', 'the', 'previous', 'sites', '.', 'The', 'patient', \"'s\", 'general', 'condition', 'worsened', ',', 'with', 'the', 'appearance', 'of', 'abdominal', 'and', 'lumbar', 'pain', ',', 'and', 'on', 'physical', 'examination', 'a', '5', 'cm', 'epigastric', 'mass', 'was', 'palpated', ',', 'corresponding', 'to', 'the', 'underlying', 'mass', '.', '\\n', 'Given', 'this', 'new', 'progression', ',', 'it', 'is', 'considered', 'that', 'the', 'tumour', 'is', 'resistant', 'to', 'chemotherapy', 'or', 'immunotherapy', 'schemes', 'against', 'renal', 'carcinoma', 'and', 'it', 'is', 'proposed', 'to', 'start', 'palliative', 'treatment', 'with', 'liposomal', 'Adriamycin', 'against', 'the', 'sarcomatous', 'component', 'of', 'the', 'tumour', '.', 'The', 'patient', 'accepted', 'the', 'proposal', 'and', 'received', 'a', 'first', 'cycle', '.', 'However', ',', 'a', 'week', 'later', 'she', 'went', 'to', 'the', 'emergency', 'department', 'for', 'hypovolemic', 'shock', 'with', 'metabolic', 'acidosis', 'and', 'pre', '-', 'renal', 'renal', 'failure', 'secondary', 'to', 'hyperemesis', 'of', '4', 'days', \"'\", 'evolution', 'and', 'grade', '4', 'anaemia', '.', 'The', 'patient', 'recovered', 'from', 'this', 'episode', 'but', 'a', 'week', 'later', 'she', 'began', 'with', 'faecal', 'vomiting', 'of', 'probable', 'obstructive', 'origin', 'due', 'to', 'compression', 'of', 'the', 'retroperitoneal', 'mass', ',', 'causing', 'progressive', 'deterioration', 'of', 'the', 'patient', 'and', 'the', 'patient', 'died', 'of', 'multi', '-', 'organ', 'failure', '19', 'months', 'after', 'diagnosis', '.', '\\n\\n\\n']\n", + "[['tumour'], ['arterial', 'hypertension'], ['venous', 'thrombosis'], ['Para', '-', 'aortic', 'adenopathies'], ['renal', 'tumour'], ['hyperuricaemia'], ['anaemia'], ['bone', 'involvement'], ['renal', 'cell', 'carcinoma', 'with', 'retroperitoneal', 'adenopathy'], ['tumour'], ['tumour'], ['tumour'], ['tumour'], ['tumour'], ['tumour'], ['renal', 'carcinoma'], ['metastases'], ['metastases'], ['metastases'], ['Leiomyosarcoma'], ['leiomyosarcoma'], ['renal', 'tumour'], ['tumour', 'infiltration'], ['tumour', 'infiltration'], ['sarcoma'], ['sarcoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma', 'metastases'], ['renal', 'collision', 'tumour'], ['tumour'], ['papillary', 'type', 'renal', 'cancer'], ['renal', 'leiomyosarcoma'], ['metastases'], ['bone', 'metastases'], ['leiomyosarcoma'], ['tumour'], ['sarcomatous', 'component', 'of', 'the', 'tumour'], ['renal', 'carcinoma'], ['metastases'], ['metastases'], ['metastases'], ['bilateral', 'pulmonary', 'metastases'], ['carcinoma'], ['carcinoma'], ['pulmonary', 'metastases'], ['pulmonary', 'metastases'], ['pulmonary', 'thromboembolism'], ['metastases'], ['metastases'], ['anaemia'], ['pulmonary', 'metastases'], ['anaemia'], ['emesis'], ['metastases'], ['tumour'], ['tumour'], ['renal', 'carcinoma'], ['anaemia'], ['hypovolemic', 'shock'], ['metabolic', 'acidosis'], ['hyperemesis'], ['multi', '-', 'organ', 'failure'], ['carcinoma'], ['sarcomatous', 'component', 'of', 'the', 'tumour']]\n", + "['sarcoma']\n", + "sarcoma\n", + "231\n", + "['Patient', 'aged', '53', 'years', 'at', 'the', 'time', 'of', 'diagnosis', 'with', 'a', 'personal', 'history', 'of', 'a', 'caesarean', 'section', ',', 'arterial', 'hypertension', 'and', 'tachycardia', 'treated', 'with', 'Atenolol', ',', 'who', 'came', 'to', 'the', 'Emergency', 'Department', 'with', 'a', '5', '-', 'month', 'history', 'of', 'progressive', 'dull', 'pain', 'in', 'the', 'left', 'flank', 'and', 'microhaematuria', '.', 'Ultrasound', 'and', 'then', 'abdominal', '-', 'pelvic', 'computerised', 'axial', 'tomography', '(', 'CAT', ')', 'scans', 'revealed', 'a', 'large', 'solid', 'mass', 'measuring', '20x16x13', 'cm', 'arising', 'from', 'the', 'left', 'renal', 'pole', ',', 'without', 'associated', 'venous', 'thrombosis', '.', 'Para', '-', 'aortic', 'adenopathies', 'were', 'also', 'observed', '.', 'All', 'this', 'was', 'compatible', 'with', 'a', 'left', 'renal', 'tumour', '.', '\\n\\n', 'In', 'view', 'of', 'this', 'finding', ',', 'an', 'extension', 'study', 'was', 'carried', 'out', 'with', 'a', 'general', 'analysis', 'showing', 'mild', 'iron', '-', 'deficiency', 'microcytic', 'anaemia', 'and', 'hyperuricaemia', ',', 'a', 'normal', 'chest', 'X', '-', 'ray', ',', 'a', 'bone', 'scan', 'showing', 'an', 'image', 'with', 'a', 'slight', 'increase', 'in', 'tracer', 'uptake', 'corresponding', 'to', 'soft', 'tissue', ',', 'with', 'a', 'rounded', 'morphology', ',', 'located', 'in', 'the', 'hypochondrium', 'and', 'in', 'the', 'hypochondrium', ',', 'located', 'in', 'the', 'left', 'hypochondrium', 'and', 'void', ',', 'exceeding', 'the', 'midline', ',', 'and', '3', 'foci', 'of', 'tracer', 'uptake', ',', 'one', 'in', 'the', 'left', 'iliac', 'blade', ',', 'the', 'second', 'in', 'the', 'left', 'hemivertebrae', 'L4', 'and', 'L5', 'and', 'the', 'third', 'at', 'the', 'level', 'of', 'the', 'right', 'hemivertebra', 'L2', ',', 'which', 'could', 'correspond', 'to', 'bone', 'involvement', 'due', 'to', 'contiguity', 'or', 'be', 'metastatic', ',', 'and', 'an', 'abdominal', 'MRI', 'scan', 'which', 'confirms', 'the', 'findings', 'of', 'the', 'CT', 'scan', '.', '\\n', 'With', 'the', 'presumptive', 'diagnosis', 'of', 'renal', 'cell', 'carcinoma', 'with', 'retroperitoneal', 'adenopathy', ',', 'the', 'patient', 'underwent', 'radical', 'left', 'nephrectomy', ',', 'removal', 'of', 'all', 'the', 'pararenal', 'fat', 'and', 'the', 'left', 'adrenal', 'gland', 'and', 'para', '-', 'aortic', 'and', 'interaortocaval', 'lymphadenectomy', 'from', 'the', 'renal', 'artery', 'to', '3', 'cm', 'from', 'the', 'common', 'iliac', 'artery', '.', 'Removal', 'of', 'the', 'mass', 'was', 'difficult', 'due', 'to', 'infiltration', 'of', 'the', 'transverse', 'mesocolon', 'and', 'tail', 'of', 'the', 'pancreas', ',', 'which', 'were', 'released', ',', 'leaving', 'the', 'mesocolon', 'untouched', '.', 'The', 'anatomo', '-', 'pathological', '(', 'A.P.', ')', 'result', 'was', 'as', 'follows', ':', 'Collision', 'renal', 'tumour', '(', 'Leiomyosarcoma', '(', '21', 'x', '15', 'cm', ')', 'and', 'renal', 'carcinoma', 'papillary', 'type', 'nuclear', 'grade', '3', '(', '7', 'x', '3.5', 'cm', ')', ')', '.', 'The', 'weight', 'of', 'the', 'whole', 'specimen', 'was', '2539', 'grams', '.', 'The', 'tumour', 'was', 'in', 'contact', 'with', 'the', 'surgical', 'edge', 'in', 'most', 'areas', '.', 'The', 'renal', 'parenchyma', 'was', 'microscopically', 'respected', 'and', 'no', 'tumour', 'infiltration', 'was', 'observed', '.', 'The', 'ureteral', 'fragment', 'and', 'renal', 'hilum', 'were', 'free', 'of', 'tumour', 'infiltration', '.', 'The', 'immunophenotypic', 'profile', 'of', 'the', 'tumour', 'was', 'as', 'follows', ':', 'Actin', ',', 'Desmin', ',', 'S-100', ',', 'Synaptofusin', 'and', 'CD', '56', 'and', 'c', '-', 'kit', 'negative', ';', 'Smooth', 'muscle', 'actin', 'positive', 'in', 'the', 'sarcomatous', 'zone', 'and', 'keratin', 'cocktail', '(', 'E1', ',', 'E3', ')', 'positive', 'in', 'the', 'carcinomatous', 'zone', '.', 'At', 'the', 'level', 'of', 'the', 'para', '-', 'aortic', 'chain', ',', '16', 'adenopathies', 'were', 'isolated', ',', 'the', 'largest', 'measuring', '2.5', 'cm', ',', 'with', 'metastases', 'in', '14', 'of', 'them', ',', '13', 'from', 'the', 'carcinoma', 'and', '1', 'with', 'mixed', 'metastases', '(', 'sarcoma+carcinoma', ')', '.', 'Six', 'adenopathies', 'were', 'isolated', 'in', 'the', 'interaortocaval', 'chain', ',', 'the', 'largest', 'measuring', '1.4', 'cm', ',', 'three', 'of', 'which', 'were', 'carcinoma', 'metastases', '.', 'The', 'mesocolic', 'bed', 'was', 'infiltrated', 'by', 'leiomyosarcoma', '.', 'In', 'the', 'perisuprarenal', 'adipose', 'tissue', '4', 'adenopathies', 'were', 'isolated', ',', '3', 'of', 'them', 'with', 'metastasis', 'of', 'the', 'carcinoma', 'and', 'another', 'with', 'mixed', 'metastasis', '(', 'carcinoma+sarcoma', ')', '.', 'The', 'left', 'adrenal', 'gland', ',', 'the', 'perirenal', 'fat', 'and', 'the', 'gall', 'bladder', 'showed', 'no', 'tumour', 'elements', '.', '\\n\\n', 'We', 'were', 'therefore', 'faced', 'with', 'a', 'renal', 'collision', 'tumour', 'consisting', 'of', 'a', 'stage', 'IV', 'papillary', 'type', 'renal', 'cancer', '(', 'pT3', '-', '4pN2', ')', 'according', 'to', 'the', 'TNM', 'classification', 'and', 'a', 'stage', 'IV', 'renal', 'leiomyosarcoma', '(', 'pT2bpN1', ')', 'according', 'to', 'the', 'AJCC', 'classification', ',', 'not', 'radically', 'resected', 'and', 'with', 'possible', 'bone', 'metastases', 'according', 'to', 'bone', 'scintigraphy', '.', '\\n', 'The', 'postoperative', 'period', 'was', 'uneventful', 'and', 'the', 'patient', 'was', 'referred', 'to', 'the', 'Medical', 'Oncology', 'Department', '.', 'It', 'was', 'decided', 'to', 'propose', 'complementary', 'chemotherapy', 'treatment', 'with', 'Ifosfamide', '5', 'g', '/', 'm2', 'in', 'a', 'continuous', 'infusion', 'of', '24h', 'x', '1', 'day', '+', 'Adriamycin', '60', 'mg', '/', 'm2', 'x', '1', 'day/21', 'days', 'against', 'the', 'sarcomatous', 'component', 'of', 'the', 'tumour', '.', 'A', 'CT', 'scan', 'was', 'previously', 'requested', 'in', 'which', 'a', 'small', 'soft', 'tissue', 'enlargement', 'was', 'observed', 'behind', 'the', 'pancreatic', 'tail', 'and', 'renal', 'bed', ',', 'which', 'could', 'be', 'compatible', 'with', 'present', 'disease', '.', '\\n', 'The', 'patient', 'started', 'treatment', 'according', 'to', 'the', 'planned', 'schedule', '3', 'weeks', 'after', 'surgery', '.', 'She', 'received', 'a', 'total', 'of', '6', 'cycles', 'with', 'good', 'clinical', 'tolerance', '.', 'After', 'the', '4th', 'cycle', ',', 'an', 'abdominal', 'CAT', 'scan', 'was', 'performed', ',', 'which', 'was', 'normal', ',', 'and', 'at', 'the', 'end', 'of', 'the', '6th', 'cycle', ',', 'a', 'bone', 'scan', 'was', 'performed', ',', 'which', 'showed', 'no', 'pathological', 'findings', '.', 'The', 'patient', 'underwent', 'regular', 'check', '-', 'ups', 'and', '3', 'months', 'later', 'a', 'chest', 'X', '-', 'ray', 'was', 'performed', 'showing', 'images', 'suggestive', 'of', 'bilateral', 'pulmonary', 'metastases', ',', 'which', 'were', 'confirmed', 'by', 'a', 'CT', 'scan', 'showing', 'multiple', 'bilateral', 'millimetric', 'pulmonary', 'metastases', '.', 'In', 'order', 'to', 'identify', 'the', 'origin', 'of', 'these', 'metastases', ',', 'Thoracic', 'Surgery', 'was', 'consulted', 'and', 'it', 'was', 'decided', 'to', 'perform', 'a', 'left', 'videothoracoscopy', 'with', 'biopsies', '.', 'The', 'P.A.', 'diagnosis', 'was', 'metastasis', 'of', 'poorly', 'differentiated', 'carcinoma', 'with', 'an', 'epithelial', 'component', '(', 'renal', ')', '.', 'In', 'view', 'of', 'this', 'diagnosis', ',', '6', 'months', 'after', 'finishing', 'the', 'first', 'chemotherapy', 'regimen', ',', 'it', 'was', 'decided', 'to', 'start', 'a', 'second', 'line', 'of', 'treatment', 'with', 'a', 'chemotherapy', 'regimen', 'with', 'Gemzitabine', 'and', 'Fluoropyrimidines9', 'that', 'had', 'proved', 'useful', 'in', 'stage', 'IV', 'renal', 'carcinoma', ':', 'Gemcitabine', '1000', 'mg', '/', 'm2', 'days', '1', 'and', '8', '+', 'Capecitabine', '1000', 'mg', '/', 'm2/12h', 'days', '1', '-', '14/21', 'days', ',', 'which', 'the', 'patient', 'accepted', '.', '\\n\\n', 'Treatment', 'was', 'started', 'with', 'a', '20', '%', 'dose', 'reduction', 'which', 'was', 'maintained', 'for', 'the', 'rest', 'of', 'the', 'treatment', 'given', 'the', 'patient', \"'s\", 'general', 'condition', '(', 'ECOG', ':', '1', '-', '2', ')', '.', 'After', 'the', 'second', 'cycle', ',', 'the', 'patient', 'suffered', 'a', 'complication', 'of', 'pulmonary', 'thromboembolism', 'from', 'which', 'she', 'recovered', 'but', 'which', 'caused', 'a', 'delay', 'of', '4', 'weeks', 'in', 'the', 'administration', 'of', 'the', 'third', 'cycle', '.', 'After', '6', 'cycles', 'of', 'treatment', ',', 'which', 'she', 'received', 'with', 'acceptable', 'tolerance', 'except', 'for', 'grade', '4', 'anaemia', ',', 'she', 'was', 're', '-', 'evaluated', 'with', 'a', 'body', 'CT', 'scan', 'which', 'showed', 'persistent', 'pulmonary', 'metastases', 'with', 'the', 'appearance', 'of', 'liver', 'and', 'spleen', 'metastases', 'and', 'local', 'relapse', '.', '\\n', 'In', 'view', 'of', 'this', 'progression', ',', 'treatment', 'with', 'IL-2', 'was', 'proposed', 'for', '6', 'weeks', '(', '1', 'week', 'of', 'induction', 'with', '18', 'Million', 'Units', '(', 'MU', ')', 'x', '5', 'days', 'and', '5', 'weeks', ':', '9', 'MU', 'days', '1', 'and', '2', 'and', '18', 'MU', 'days', '3', 'to', '5)10', '.', 'The', 'patient', 'accepted', 'the', 'treatment', 'with', 'moderate', 'toxicity', 'with', 'secondary', 'constitutional', 'symptoms', 'grade', '2', ',', 'anaemia', 'grade', '3', 'and', 'emesis', 'grade', '1', ',', 'maintaining', 'her', 'general', 'condition', '.', '\\n', 'At', 'the', 'end', 'of', 'treatment', ',', 'a', 'new', 're', '-', 'evaluation', 'was', 'performed', 'with', 'a', 'CT', 'scan', 'showing', 'progression', 'of', 'the', 'disease', 'with', 'a', 'large', 'mass', 'in', 'the', 'surgical', 'site', 'measuring', '19x10x5', 'cm', ',', 'which', 'had', 'grown', 'with', 'respect', 'to', 'the', 'previous', 'CT', 'scan', ',', 'and', 'persistent', 'metastases', 'in', 'the', 'rest', 'of', 'the', 'previous', 'sites', '.', 'The', 'patient', \"'s\", 'general', 'condition', 'worsened', ',', 'with', 'the', 'appearance', 'of', 'abdominal', 'and', 'lumbar', 'pain', ',', 'and', 'on', 'physical', 'examination', 'a', '5', 'cm', 'epigastric', 'mass', 'was', 'palpated', ',', 'corresponding', 'to', 'the', 'underlying', 'mass', '.', '\\n', 'Given', 'this', 'new', 'progression', ',', 'it', 'is', 'considered', 'that', 'the', 'tumour', 'is', 'resistant', 'to', 'chemotherapy', 'or', 'immunotherapy', 'schemes', 'against', 'renal', 'carcinoma', 'and', 'it', 'is', 'proposed', 'to', 'start', 'palliative', 'treatment', 'with', 'liposomal', 'Adriamycin', 'against', 'the', 'sarcomatous', 'component', 'of', 'the', 'tumour', '.', 'The', 'patient', 'accepted', 'the', 'proposal', 'and', 'received', 'a', 'first', 'cycle', '.', 'However', ',', 'a', 'week', 'later', 'she', 'went', 'to', 'the', 'emergency', 'department', 'for', 'hypovolemic', 'shock', 'with', 'metabolic', 'acidosis', 'and', 'pre', '-', 'renal', 'renal', 'failure', 'secondary', 'to', 'hyperemesis', 'of', '4', 'days', \"'\", 'evolution', 'and', 'grade', '4', 'anaemia', '.', 'The', 'patient', 'recovered', 'from', 'this', 'episode', 'but', 'a', 'week', 'later', 'she', 'began', 'with', 'faecal', 'vomiting', 'of', 'probable', 'obstructive', 'origin', 'due', 'to', 'compression', 'of', 'the', 'retroperitoneal', 'mass', ',', 'causing', 'progressive', 'deterioration', 'of', 'the', 'patient', 'and', 'the', 'patient', 'died', 'of', 'multi', '-', 'organ', 'failure', '19', 'months', 'after', 'diagnosis', '.', '\\n\\n\\n']\n", + "[['tumour'], ['arterial', 'hypertension'], ['venous', 'thrombosis'], ['Para', '-', 'aortic', 'adenopathies'], ['renal', 'tumour'], ['hyperuricaemia'], ['anaemia'], ['bone', 'involvement'], ['renal', 'cell', 'carcinoma', 'with', 'retroperitoneal', 'adenopathy'], ['tumour'], ['tumour'], ['tumour'], ['tumour'], ['tumour'], ['tumour'], ['renal', 'carcinoma'], ['metastases'], ['metastases'], ['metastases'], ['Leiomyosarcoma'], ['leiomyosarcoma'], ['renal', 'tumour'], ['tumour', 'infiltration'], ['tumour', 'infiltration'], ['sarcoma'], ['sarcoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma'], ['carcinoma', 'metastases'], ['renal', 'collision', 'tumour'], ['tumour'], ['papillary', 'type', 'renal', 'cancer'], ['renal', 'leiomyosarcoma'], ['metastases'], ['bone', 'metastases'], ['leiomyosarcoma'], ['tumour'], ['sarcomatous', 'component', 'of', 'the', 'tumour'], ['renal', 'carcinoma'], ['metastases'], ['metastases'], ['metastases'], ['bilateral', 'pulmonary', 'metastases'], ['carcinoma'], ['carcinoma'], ['pulmonary', 'metastases'], ['pulmonary', 'metastases'], ['pulmonary', 'thromboembolism'], ['metastases'], ['metastases'], ['anaemia'], ['pulmonary', 'metastases'], ['anaemia'], ['emesis'], ['metastases'], ['tumour'], ['tumour'], ['renal', 'carcinoma'], ['anaemia'], ['hypovolemic', 'shock'], ['metabolic', 'acidosis'], ['hyperemesis'], ['multi', '-', 'organ', 'failure'], ['carcinoma'], ['sarcomatous', 'component', 'of', 'the', 'tumour']]\n", + "['sarcoma']\n", + "sarcoma\n", + "231\n", + "['Male', 'patient', ',', 'black', ',', '21', 'years', 'of', 'age', ',', 'who', 'consulted', 'the', 'Maxillofacial', 'Surgery', 'Department', 'of', 'the', 'Hospital', 'Universitario', 'San', 'Vicente', 'Fundación', 'de', 'Medellín', 'attached', 'to', 'the', 'Universidad', 'de', 'Antioquia', ',', 'referred', 'by', 'a', 'dentist', 'from', 'a', 'public', 'health', 'institution', 'for', 'presenting', 'an', 'asymptomatic', 'increase', 'in', 'the', 'volume', 'of', 'the', 'mandible', ',', 'with', 'unknown', 'evolution', '.', '\\n', 'Intraorally', ',', 'excellent', 'dental', 'integrity', 'and', 'good', 'dental', 'occlusion', 'were', 'observed', ',', 'but', 'there', 'was', 'an', 'increase', 'in', 'volume', 'in', 'the', 'vestibular', 'region', 'involving', 'the', 'body', 'and', 'the', 'symphysis', ',', 'while', 'lingually', ',', 'the', 'expansion', 'of', 'the', 'table', 'was', 'only', 'in', 'the', 'area', 'of', 'the', 'lower', 'right', 'canine', 'and', 'bicuspids', '.', '\\n', 'The', 'initial', 'panoramic', 'X', '-', 'ray', 'showed', 'a', 'radiolucent', 'image', '10', 'cm', 'long', ',', 'multiloculated', ',', 'located', 'from', 'distal', 'tooth', '46', 'to', 'mesial', 'tooth', '33', '.', 'The', 'mesial', 'root', 'of', 'tooth', '46', 'and', 'the', 'root', 'of', 'tooth', '45', 'showed', 'rhizolysis', ',', 'with', 'possible', 'pulp', 'necrosis', '.', 'Teeth', '46', ',', '45', 'and', '44', 'had', 'minimal', 'mobility', ';', 'the', 'other', 'teeth', 'had', 'normal', 'vitality', 'and', 'the', 'lower', 'dental', 'canal', 'was', 'rejected', '.', 'There', 'is', 'no', 'alteration', 'in', 'the', 'sensitivity', 'of', 'the', 'mentonian', 'nerve', '.', '\\n\\n', 'The', 'patient', 'consented', 'to', 'the', 'entire', 'treatment', 'by', 'signing', 'the', 'informed', 'consent', 'form', '.', 'Prior', 'to', 'the', 'initial', 'biopsy', 'an', 'aspirate', 'was', 'taken', 'producing', 'a', 'citrine', 'fluid', 'and', 'the', 'first', 'histopathological', 'study', 'found', 'a', 'lax', 'connective', 'tissue', ',', 'some', 'multinucleated', 'giant', 'cells', 'with', 'few', 'nuclei', 'and', 'a', 'thin', 'band', 'of', 'keratin', ';', 'there', 'was', 'no', 'evidence', 'of', 'epithelial', 'tissue', 'but', 'it', 'was', 'considered', 'as', 'insufficient', 'sample', '.', 'Nevertheless', ',', 'a', 'diagnosis', 'of', 'keratocystic', 'odontogenic', 'tumour', 'was', 'made', ',', 'due', 'to', 'the', 'presence', 'of', 'a', 'keratin', 'band', '.', 'After', 'this', 'procedure', ',', 'the', 'lesion', 'became', 'superinfected', 'and', 'the', 'patient', 'had', 'to', 'be', 'hospitalised', 'due', 'to', 'the', 'severity', 'of', 'the', 'clinical', 'picture', ';', 'this', 'condition', 'was', 'used', 'to', 'perform', 'a', 'second', 'biopsy', 'under', 'general', 'anaesthesia', '8', 'days', 'later', ',', 'in', 'which', 'a', 'capsule', 'made', 'up', 'of', 'connective', 'tissue', 'with', 'an', 'intense', 'inflammatory', 'infiltrate', 'and', 'epithelial', 'tissue', 'with', 'several', 'layers', 'of', 'cells', 'was', 'found', ',', 'which', 'gave', 'the', 'diagnosis', ':', 'inflammatory', 'root', 'cyst', '.', 'Given', 'the', 'ambiguity', 'of', 'the', 'diagnosis', 'and', 'the', 'aggressiveness', 'of', 'the', 'lesion', ',', 'it', 'was', 'decided', 'to', 'perform', 'the', 'intervention', 'with', 'the', 'first', 'diagnosis', '(', 'keratocystic', 'odontogenic', 'tumour', '-exkeratocyst-', ')', '.', 'With', 'this', 'diagnosis', 'it', 'was', 'decided', 'to', 'plan', 'an', 'aggressive', 'surgical', 'treatment', 'and', 'it', 'was', 'decided', 'to', 'order', 'endodontic', 'treatment', 'from', '46', 'to', '33', '(', '9', 'teeth', ')', 'prior', 'to', 'the', 'surgical', 'procedure', ',', 'as', 'the', 'dental', 'apices', 'were', 'immersed', 'in', 'the', 'cavity', 'and', 'the', 'curettage', 'itself', 'would', 'cause', 'amputation', 'of', 'the', 'pulp', 'vascular', 'bundles', ',', 'in', 'addition', ',', 'according', 'to', 'the', 'diagnosis', ',', 'it', 'is', 'a', 'very', 'recurrent', 'lesion', '.', 'This', 'endodontic', 'treatment', 'lasted', '3', 'months', 'due', 'to', 'the', 'difficulty', 'of', 'sealing', 'the', 'canals', 'due', 'to', 'the', 'presence', 'of', 'an', 'amber', '-', 'coloured', 'liquid', 'draining', 'through', 'the', 'pulp', 'chambers', '.', 'After', 'this', 'time', ',', 'the', 'production', 'of', 'liquid', 'content', 'ceased', 'and', 'the', 'endodontic', 'treatment', 'was', 'completed', '.', '\\n\\n', 'The', 'surgical', 'phase', 'was', 'performed', 'under', 'general', 'anaesthesia', '.', 'A', 'trapezoidal', 'flap', 'was', 'raised', 'from', 'distal', '46', 'to', 'distal', '33', '.', 'When', 'the', 'flap', 'was', 'reflected', ',', 'it', 'was', 'found', 'to', 'be', 'expanded', 'in', 'its', 'entire', 'length', 'and', 'perforated', 'in', 'the', 'bicuspid', 'area', '(', 'teeth', '44', 'and', '45', ')', ',', 'where', 'the', 'biopsies', 'had', 'previously', 'been', 'taken', '.', 'We', 'proceeded', 'to', 'remove', 'all', 'the', 'expanded', 'vestibular', 'cortex', 'until', 'we', 'had', 'complete', 'access', 'to', 'the', 'cystic', 'cavity', '.', 'The', 'thick', 'fibrous', 'capsule', 'covering', 'the', 'osseous', 'defect', 'was', 'found', 'and', 'removed', '.', 'The', 'bone', 'cavity', 'is', 'reamed', 'with', 'rotary', 'cutting', 'instruments', 'and', 'the', 'entire', 'bone', 'defect', 'is', 'brushed', 'with', 'Carnoy', \"'s\", 'solution', '.', 'Some', 'perforations', 'of', 'the', 'lingual', 'cortex', 'are', 'visible', ',', 'which', 'are', 'cauterised', 'with', 'an', 'electroscalpel', 'due', 'to', 'the', 'risk', 'of', 'invasion', 'of', 'the', 'tumour', 'lesion', 'into', 'the', 'lingual', 'soft', 'tissues', '.', 'Due', 'to', 'the', 'weakening', 'of', 'the', 'mandibular', 'basilar', 'border', ',', 'a', 'reconstruction', 'plate', 'is', 'placed', 'to', 'avoid', 'intra-', 'or', 'postoperative', 'fracture', '.', 'Before', 'suturing', 'the', 'flap', ',', 'the', 'bone', 'defect', 'is', 'filled', 'with', 'fibrillar', 'collagen', 'and', 'medicated', 'with', 'analgesics', 'and', 'antibiotics', '.', 'The', 'tissue', 'obtained', 'is', 'sent', 'to', 'pathology', '.', 'The', 'histopathological', 'report', 'of', 'the', 'surgical', 'specimen', 'shows', 'fibroconnective', 'tissue', 'devoid', 'of', 'epithelium', 'and', 'a', 'post', '-', 'surgical', 'diagnosis', 'of', 'an', 'aneurysmal', 'bone', 'cyst', 'was', 'obtained', '.', '\\n', 'Because', 'the', 'second', 'biopsy', 'showed', 'an', 'epithelium', ',', 'the', 'diagnosis', 'was', 'misleading', 'but', ',', 'evaluating', 'the', 'three', 'histopathological', 'samples', ',', 'it', 'was', 'considered', 'that', 'the', 'epithelial', 'tissue', 'of', 'the', 'second', 'biopsy', 'was', 'the', 'product', 'of', 'the', 'inflammatory', 'reaction', ',', 'while', 'the', 'initial', 'biopsy', 'and', 'the', 'surgical', 'specimen', 'showed', 'no', 'epithelial', 'component', ';', 'it', 'was', 'decided', 'to', 'consider', 'it', 'an', 'aneurysmal', 'bone', 'cyst', '.', '\\n', 'The', 'patient', 'was', 'evaluated', 'at', '8', 'and', '15', 'days', 'and', 'then', 'at', '2', 'months', ',', '10', 'months', 'and', '2', 'years', '.', 'At', 'the', 'follow', '-', 'up', 'appointment', 'at', '2', 'months', 'pulp', 'necrosis', 'and', 'fistula', 'were', 'found', 'at', 'the', 'level', 'of', '47', ',', 'which', 'was', 'adjacent', 'to', 'the', 'lesion', 'and', 'the', 'mesial', 'root', 'apex', 'had', 'been', 'amputated', 'during', 'the', 'surgical', 'procedure', '.', 'Endodontics', 'was', 'performed', 'and', 'the', 'infection', 'resolved', '.', 'He', 'also', 'presented', 'with', 'right', 'mentonian', 'nerve', 'paraesthesia', 'and', 'was', 'prescribed', 'B', '-', 'complex', 'tablets', 'for', 'one', 'month', '.', 'A', 'new', 'post', '-', 'surgical', 'assessment', 'was', 'carried', 'out', '10', 'months', 'later', 'and', 'adequate', 'bone', 'healing', 'was', 'found', ',', 'but', 'there', 'was', 'an', 'occlusal', 'sequela', ',', 'as', 'the', 'teeth', 'that', 'were', 'left', 'without', 'bone', 'support', 'due', 'to', 'being', 'immersed', 'in', 'the', 'bone', 'defect', '(', 'teeth', '44', 'to', '33', ')', 'were', 'intruded', 'and', 'produced', 'an', 'open', 'bite', '.', 'Two', 'years', 'later', 'a', 'new', 'clinical', 'and', 'radiographic', 'check', '-', 'up', 'was', 'carried', 'out', 'where', 'it', 'was', 'found', 'that', ':', 'teeth', '44', 'to', '33', 'remained', 'in', 'open', 'bite', ',', 'tooth', '44', 'was', 'also', 'slightly', 'vestibularised', '.', 'The', 'panoramic', 'radiograph', 'shows', 'good', 'bone', 'filling', ',', 'but', 'the', 'three', '-', 'dimensional', 'tomography', 'shows', 'that', 'there', 'is', 'a', 'defect', 'of', 'about', '6', 'mm', 'in', 'diameter', 'that', 'compromises', 'the', 'apex', 'of', '44', 'and', '43', ',', 'and', 'that', 'the', 'bone', 'has', 'yet', 'to', 'regenerate', ',', 'and', 'the', 'paraesthesia', 'still', 'persists', 'two', 'years', 'later', '.', 'The', 'intraoral', 'image', 'shows', 'that', 'the', 'enlargement', 'produced', 'by', 'the', 'expansion', 'of', 'the', 'lesion', 'of', 'the', 'lingual', 'plate', 'at', 'the', 'level', 'of', 'teeth', '43', ',', '44', 'and', '45', 'still', 'persists', '.', '\\n\\n\\n']\n", + "[['pulp', 'necrosis'], ['keratocystic', 'odontogenic', 'tumour'], ['keratocystic', 'odontogenic', 'tumour'], ['inflammatory', 'root', 'cyst'], ['exkeratocyst'], ['lesion', 'became', 'superinfected'], ['lesion'], ['lesion'], ['lesion'], ['aneurysmal', 'bone', 'cyst'], ['tumour', 'lesion'], ['bone', 'defect'], ['bone', 'defect'], ['cystic', 'cavity'], ['lesion'], ['perforations', 'of', 'the', 'lingual', 'cortex'], ['aneurysmal', 'bone', 'cyst'], ['open', 'bite'], ['open', 'bite'], ['infection'], ['bone', 'defect'], ['pulp', 'necrosis'], ['lesion'], ['lesion']]\n", + "['exkeratocyst']\n", + "exkeratocyst\n", + "638\n" + ] + } + ], + "source": [ + "labels_tokenized = []\n", + "idx =-1\n", + "for hct, et in zip(HCs_tokenized, Ent_tokenized):\n", + " idx+=1\n", + " labels = []\n", + " for i in range(len(hct)):\n", + " #Labels: 0->'O'; 1->'B'; 2->'I'\n", + " #labels.append('O')\n", + " labels.append(0)\n", + "\n", + " #For Entities (Diseases|Enfermedades)\n", + " for enf in et:\n", + " first = True\n", + " for e in enf:\n", + " if first == True:\n", + " try:\n", + " #labels[hct.index(e)] = 'B'\n", + " #labels[posLab] = 'B'\n", + " indices = find_idx(hct, e)\n", + " if len(indices) > 1:\n", + " for id in indices:\n", + " labels[id] = 1\n", + " else:\n", + " labels[hct.index(e)] = 1\n", + " \n", + " first = False\n", + " except:\n", + " first = False\n", + " if e == \"sarcoma+carcinoma\" or e == \"carcinoma+sarcoma\":\n", + " continue\n", + " print(hct)\n", + " print(et)\n", + " print(enf)\n", + " print(e)\n", + " print(idx)\n", + " else:\n", + " try:\n", + " #labels[hct.index(e)] = 'I'\n", + " #labels[posLab] = 'I'\n", + " indices = find_idx(hct, e)\n", + " if len(indices) > 1:\n", + " for id in indices:\n", + " if labels[id-1] != 0:\n", + " labels[id] = 2\n", + " else:\n", + " labels[hct.index(e)] = 2\n", + " except:\n", + " if e == \"sarcoma+carcinoma\" or e == \"carcinoma+sarcoma\":\n", + " continue\n", + " print(hct)\n", + " print(et)\n", + " print(enf)\n", + " print(e)\n", + " print(idx)\n", + "\n", + " labels_tokenized.append(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jXKczS_fKpLc", + "outputId": "d385f71c-5507-4a33-8cce-f5d2195354f9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "A\t0\n", + "73\t0\n", + "-\t0\n", + "year\t0\n", + "-\t0\n", + "old\t0\n", + "patient\t0\n", + "with\t0\n", + "a\t0\n", + "history\t0\n", + "of\t0\n", + "arterial\t1\n", + "hypertension\t2\n", + "and\t0\n", + "polyarthrosis\t1\n", + "presented\t0\n", + "to\t0\n", + "the\t0\n", + "emergency\t0\n", + "department\t0\n", + "with\t0\n", + "abdominal\t0\n", + "distension\t0\n", + "and\t0\n", + "pain\t0\n", + "associated\t0\n", + "with\t0\n", + "constipation\t0\n", + "and\t0\n", + "febrile\t0\n", + "fever\t0\n", + ".\t0\n", + "The\t0\n", + "symptoms\t0\n", + "had\t0\n", + "started\t0\n", + "three\t0\n", + "weeks\t0\n", + "earlier\t0\n", + "and\t0\n", + "worsened\t0\n", + "during\t0\n", + "the\t0\n", + "four\t0\n", + "days\t0\n", + "prior\t0\n", + "to\t0\n", + "admission\t0\n", + ".\t0\n", + "During\t0\n", + "this\t0\n", + "period\t0\n", + ",\t0\n", + "an\t0\n", + "upper\t0\n", + "gastrointestinal\t0\n", + "fibroendoscopy\t0\n", + "(\t0\n", + "oesophagus\t0\n", + ",\t0\n", + "stomach\t0\n", + "and\t0\n", + "duodenum\t0\n", + ")\t0\n", + "and\t0\n", + "a\t0\n", + "colonoscopy\t0\n", + "(\t0\n", + "up\t0\n", + "to\t0\n", + "the\t0\n", + "splenic\t0\n", + "angle\t0\n", + ")\t0\n", + "were\t0\n", + "performed\t0\n", + ",\t0\n", + "but\t0\n", + "no\t0\n", + "abnormalities\t0\n", + "were\t0\n", + "found\t0\n", + ".\t0\n", + "\n", + "\t0\n", + "Physical\t0\n", + "examination\t0\n", + "revealed\t0\n", + "a\t0\n", + "low\t0\n", + "-\t0\n", + "grade\t0\n", + "fever\t0\n", + "(\t0\n", + "37.6º\t0\n", + "C\t0\n", + ")\t0\n", + ",\t0\n", + "a\t0\n", + "distended\t0\n", + "abdomen\t0\n", + ",\t0\n", + "diffusely\t0\n", + "painful\t0\n", + "on\t0\n", + "palpation\t0\n", + ",\t0\n", + "tympanised\t0\n", + "on\t0\n", + "percussion\t0\n", + ",\t0\n", + "with\t0\n", + "scant\t0\n", + "borborygmi\t0\n", + "but\t0\n", + "no\t0\n", + "evidence\t0\n", + "of\t0\n", + "peritonism\t0\n", + ",\t0\n", + "pulmonary\t0\n", + "auscultation\t0\n", + "with\t0\n", + "decreased\t0\n", + "ventilation\t0\n", + "in\t0\n", + "the\t0\n", + "lower\t1\n", + "half\t0\n", + "of\t0\n", + "the\t0\n", + "right\t0\n", + "hemithorax\t0\n", + "and\t0\n", + "the\t0\n", + "onset\t0\n", + "of\t0\n", + "intense\t0\n", + "pain\t0\n", + "on\t0\n", + "palpation\t0\n", + "and\t0\n", + "percussion\t0\n", + "of\t0\n", + "the\t0\n", + "last\t0\n", + "three\t0\n", + "dorsal\t0\n", + "spinous\t0\n", + "processes\t0\n", + ".\t0\n", + "\n", + "\t0\n", + "Analyses\t0\n", + "showed\t0\n", + "8.2\t0\n", + "x\t0\n", + "109\t0\n", + "leukocytes\t0\n", + "/\t0\n", + "L\t0\n", + ",\t0\n", + "haemoglobin\t0\n", + "136\t0\n", + "g\t0\n", + "/\t0\n", + "L\t0\n", + ",\t0\n", + "platelets\t0\n", + "186\t0\n", + "x\t0\n", + "109\t0\n", + "/\t0\n", + "L.\t0\n", + "Except\t0\n", + "for\t0\n", + "glycaemia\t0\n", + "(\t0\n", + "123\t0\n", + "mg\t0\n", + "/\t0\n", + "dl\t0\n", + ")\t0\n", + ",\t0\n", + "the\t0\n", + "following\t0\n", + "laboratory\t0\n", + "parameters\t0\n", + "were\t0\n", + "normal\t0\n", + "or\t0\n", + "negative\t0\n", + ":\t0\n", + "urea\t0\n", + ",\t0\n", + "creatinine\t0\n", + ",\t0\n", + "bilirubin\t0\n", + ",\t0\n", + "transaminases\t0\n", + ",\t0\n", + "gamma\t0\n", + "-\t0\n", + "glutamyltranspeptidase\t0\n", + ",\t0\n", + "sodium\t0\n", + ",\t0\n", + "potassium\t0\n", + ",\t0\n", + "chlorine\t0\n", + ",\t0\n", + "calcium\t0\n", + ",\t0\n", + "phosphorus\t0\n", + ",\t0\n", + "creatine\t0\n", + "phosphokinase\t0\n", + ",\t0\n", + "amylase\t0\n", + ",\t0\n", + "lactate\t0\n", + "dehydrogenase\t0\n", + "(\t0\n", + "LDH\t0\n", + ")\t0\n", + ",\t0\n", + "proteinogram\t0\n", + ",\t0\n", + "immunoglobulin\t0\n", + "dosage\t0\n", + ",\t0\n", + "alpha\t0\n", + "-\t0\n", + "fetoprotein\t0\n", + ",\t0\n", + "CA\t0\n", + "19\t0\n", + "antigens\t0\n", + ".\t0\n", + "9\t0\n", + "and\t0\n", + "CA\t0\n", + "125\t0\n", + "antigens\t0\n", + ",\t0\n", + "as\t0\n", + "well\t0\n", + "as\t0\n", + "general\t0\n", + "urinalysis\t0\n", + ".\t0\n", + "ESR\t0\n", + "and\t0\n", + "C\t0\n", + "-\t0\n", + "reactive\t0\n", + "protein\t0\n", + "were\t0\n", + "elevated\t0\n", + ",\t0\n", + "with\t0\n", + "values\t0\n", + "of\t0\n", + "85\t0\n", + "mm\t0\n", + "/\t0\n", + "1\t0\n", + "h\t0\n", + "and\t0\n", + "133\t0\n", + "mg\t0\n", + "/\t0\n", + "L\t0\n", + "(\t0\n", + "normal\t0\n", + "<\t0\n", + "5\t0\n", + "mg\t0\n", + "/\t0\n", + "L\t0\n", + ")\t0\n", + ",\t0\n", + "respectively\t0\n", + ".\t0\n", + "Mantoux\t0\n", + "intradermal\t0\n", + "reaction\t0\n", + "(\t0\n", + "10\t0\n", + "IU\t0\n", + "RT-23\t0\n", + ")\t0\n", + "was\t0\n", + "positive\t0\n", + ",\t0\n", + "with\t0\n", + "an\t0\n", + "induration\t0\n", + "of\t0\n", + "25\t0\n", + "mm\t0\n", + ".\t0\n", + "Chest\t0\n", + "X\t0\n", + "-\t0\n", + "ray\t0\n", + "showed\t0\n", + "an\t0\n", + "image\t0\n", + "compatible\t0\n", + "with\t0\n", + "right\t0\n", + "lower\t1\n", + "lobe\t2\n", + "atelectasis\t2\n", + "in\t0\n", + "the\t0\n", + "context\t0\n", + "of\t0\n", + "an\t0\n", + "ipsilateral\t0\n", + "pleural\t1\n", + "effusion\t2\n", + ".\t0\n", + "There\t0\n", + "were\t0\n", + "no\t0\n", + "signs\t0\n", + "suggestive\t0\n", + "of\t0\n", + "adenopathy\t0\n", + "or\t0\n", + "alterations\t0\n", + "in\t0\n", + "the\t0\n", + "cardiopericardial\t0\n", + "silhouette\t0\n", + ".\t0\n", + "A\t0\n", + "thoracoabdominal\t0\n", + "CT\t0\n", + "scan\t0\n", + "confirmed\t0\n", + "the\t0\n", + "existence\t0\n", + "of\t0\n", + "a\t0\n", + "right\t0\n", + "pleural\t1\n", + "effusion\t2\n", + "and\t0\n", + "identified\t0\n", + "prominent\t0\n", + "degenerative\t0\n", + "changes\t0\n", + "along\t0\n", + "the\t0\n", + "dorsolumbar\t0\n", + "spine\t0\n", + "but\t0\n", + ",\t0\n", + "above\t0\n", + "all\t0\n", + ",\t0\n", + "erosions\t0\n", + "in\t0\n", + "the\t0\n", + "vertebral\t0\n", + "plates\t0\n", + "adjacent\t0\n", + "to\t0\n", + "the\t0\n", + "D10\t0\n", + "-\t0\n", + "D11\t0\n", + "disc\t0\n", + "space\t0\n", + ".\t0\n", + "A\t0\n", + "lumbar\t0\n", + "MRI\t0\n", + "showed\t0\n", + "hyposignal\t0\n", + "on\t0\n", + "T1\t0\n", + "-\t0\n", + "weighted\t0\n", + "sequences\t0\n", + "and\t0\n", + "hypersignal\t0\n", + "on\t0\n", + "T2\t0\n", + "-\t0\n", + "weighted\t0\n", + "sequences\t0\n", + "in\t0\n", + "these\t0\n", + "vertebrae\t0\n", + "and\t0\n", + "their\t0\n", + "corresponding\t0\n", + "disc\t0\n", + ",\t0\n", + "with\t0\n", + "morphological\t0\n", + "alterations\t0\n", + "typical\t0\n", + "of\t0\n", + "infectious\t1\n", + "spondylodiscitis\t2\n", + "D10\t2\n", + "-\t2\n", + "D11\t2\n", + ".\t0\n", + "Three\t0\n", + "serial\t0\n", + "blood\t0\n", + "cultures\t0\n", + "were\t0\n", + "negative\t0\n", + ".\t0\n", + "Samples\t0\n", + "obtained\t0\n", + "by\t0\n", + "aspiration\t0\n", + "of\t0\n", + "the\t0\n", + "D10\t0\n", + "-\t0\n", + "D11\t0\n", + "space\t0\n", + "showed\t0\n", + "gram\t0\n", + "-\t0\n", + "positive\t0\n", + "cocci\t0\n", + "chains\t0\n", + ",\t0\n", + "which\t0\n", + "were\t0\n", + "subsequently\t0\n", + "recovered\t0\n", + "and\t0\n", + "typed\t0\n", + "as\t0\n", + "penicillin\t0\n", + "-\t0\n", + "sensitive\t0\n", + "Streptococcus\t0\n", + "pneumoniae\t0\n", + ".\t0\n", + "Pleural\t0\n", + "fluid\t0\n", + "analysis\t0\n", + "showed\t0\n", + "pH\t0\n", + ":\t0\n", + "7.55\t0\n", + ";\t0\n", + "leucocytes\t0\n", + ":\t0\n", + "8.4\t0\n", + "x\t0\n", + "109\t0\n", + "/\t0\n", + "L\t0\n", + "(\t0\n", + "58\t0\n", + "%\t0\n", + "neutrophils\t0\n", + ",\t0\n", + "26\t0\n", + "%\t0\n", + "eosinophils\t0\n", + ",\t0\n", + "16\t0\n", + "%\t0\n", + "lymphocytes\t0\n", + ")\t0\n", + ",\t0\n", + "protein\t0\n", + ":\t0\n", + "48\t0\n", + "g\t0\n", + "/\t0\n", + "L\t0\n", + "(\t0\n", + "ratio\t0\n", + "to\t0\n", + "serum\t0\n", + "protein\t0\n", + ":\t0\n", + "0.65\t0\n", + ")\t0\n", + ",\t0\n", + "glucose\t0\n", + ":\t0\n", + "125\t0\n", + "mg\t0\n", + "/\t0\n", + "dl\t0\n", + ",\t0\n", + "ADA\t0\n", + ":\t0\n", + "25.92\t0\n", + "IU\t0\n", + "/\t0\n", + "ml\t0\n", + ",\t0\n", + "LDH\t0\n", + ":\t0\n", + "362\t0\n", + "U\t0\n", + "/\t0\n", + "L\t0\n", + "(\t0\n", + "pleural\t1\n", + "LDH\t0\n", + "/\t0\n", + "serum\t0\n", + "LDH\t0\n", + "ratio\t0\n", + ":\t0\n", + "0.8\t0\n", + ")\t0\n", + ".\t0\n", + "Both\t0\n", + "auramine\t0\n", + "-\t0\n", + "rhodamine\t0\n", + "staining\t0\n", + "and\t0\n", + "Löwenstein\t0\n", + "-\t0\n", + "Jensen\t0\n", + "medium\t0\n", + "culture\t0\n", + "of\t0\n", + "pleural\t1\n", + "fluid\t0\n", + "were\t0\n", + "negative\t0\n", + "and\t0\n", + "cytology\t0\n", + "showed\t0\n", + "no\t0\n", + "evidence\t0\n", + "of\t0\n", + "neoplastic\t0\n", + "cells\t0\n", + ".\t0\n", + "\n", + "\n", + "\t0\n", + "The\t0\n", + "patient\t0\n", + "was\t0\n", + "initially\t0\n", + "treated\t0\n", + "intravenously\t0\n", + "with\t0\n", + "amoxicillin\t0\n", + "+\t0\n", + "clavulanic\t0\n", + "acid\t0\n", + "(\t0\n", + "1\t0\n", + "g\t0\n", + "/\t0\n", + "200\t0\n", + "mg\t0\n", + ",\t0\n", + "every\t0\n", + "8\t0\n", + "hours\t0\n", + ")\t0\n", + ".\t0\n", + "After\t0\n", + "21\t0\n", + "days\t0\n", + ",\t0\n", + "she\t0\n", + "was\t0\n", + "switched\t0\n", + "to\t0\n", + "the\t0\n", + "oral\t0\n", + "route\t0\n", + "(\t0\n", + "875\t0\n", + "/\t0\n", + "125\t0\n", + "mg\t0\n", + ",\t0\n", + "every\t0\n", + "8\t0\n", + "hours\t0\n", + ")\t0\n", + "for\t0\n", + "6\t0\n", + "weeks\t0\n", + ".\t0\n", + "The\t0\n", + "evolution\t0\n", + "was\t0\n", + "favourable\t0\n", + "and\t0\n", + "she\t0\n", + "was\t0\n", + "able\t0\n", + "to\t0\n", + "start\t0\n", + "walking\t0\n", + "with\t0\n", + "a\t0\n", + "dorsolumbar\t0\n", + "corset\t0\n", + "after\t0\n", + "the\t0\n", + "fourth\t0\n", + "week\t0\n", + ".\t0\n", + "One\t0\n", + "month\t0\n", + "after\t0\n", + "the\t0\n", + "end\t0\n", + "of\t0\n", + "antibiotic\t0\n", + "therapy\t0\n", + ",\t0\n", + "a\t0\n", + "control\t0\n", + "chest\t0\n", + "CT\t0\n", + "scan\t0\n", + "still\t0\n", + "showed\t0\n", + "a\t0\n", + "discrete\t0\n", + "pleural\t1\n", + "effusion\t2\n", + ",\t0\n", + "but\t0\n", + "the\t0\n", + "patient\t0\n", + "had\t0\n", + "only\t0\n", + "mild\t0\n", + "mechanical\t0\n", + "dorsalgia\t0\n", + ",\t0\n", + "her\t0\n", + "ESR\t0\n", + "had\t0\n", + "decreased\t0\n", + "to\t0\n", + "21\t0\n", + "mm\t0\n", + "/\t0\n", + "1\t0\n", + "h\t0\n", + "and\t0\n", + "her\t0\n", + "CRP\t0\n", + "was\t0\n", + "2.4\t0\n", + "mg\t0\n", + "/\t0\n", + "L.\t0\n", + "Outpatient\t0\n", + "follow\t0\n", + "-\t0\n", + "up\t0\n", + "continued\t0\n", + "for\t0\n", + "a\t0\n", + "further\t0\n", + "three\t0\n", + "years\t0\n", + ",\t0\n", + "during\t0\n", + "which\t0\n", + "time\t0\n", + "the\t0\n", + "evolution\t0\n", + "was\t0\n", + "favourable\t0\n", + "and\t0\n", + "a\t0\n", + "D10\t0\n", + "-\t0\n", + "D11\t0\n", + "vertebral\t0\n", + "block\t0\n", + "was\t0\n", + "formed\t0\n", + ".\t0\n", + "\n", + "\n", + "\n", + "\t0\n" + ] + } + ], + "source": [ + "j = 0\n", + "for i in range(len(HCs_tokenized[j])):\n", + " print(str(HCs_tokenized[j][i]) + \"\\t\" + str(labels_tokenized[j][i]))" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Validating tokenization and alignment with the BIO tags." + ], + "metadata": { + "id": "laU64q79UYZy" + } + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jcJBsudYKpLc", + "outputId": "13ea2860-19b7-4ee1-ddf4-ce607a60e80f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Everything is aligned!\n" + ] + } + ], + "source": [ + "flag = 0\n", + "for st, lt in zip(HCs_tokenized, labels_tokenized):\n", + " if len(st) != len(lt):\n", + " print(st)\n", + " print(lt)\n", + " flag = 1\n", + "if flag==0:\n", + " print(\"Everything is aligned!\")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Sentence tokenization" + ], + "metadata": { + "id": "lvR3V8qXUbvR" + } + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "id": "-nHHwKmUKpLd" + }, + "outputs": [], + "source": [ + "sent_tokenized = []\n", + "label_sent_tokenized = []\n", + "for ht, lht in zip(HCs_tokenized, labels_tokenized):\n", + " st = []; lbst = []\n", + " for h, l in zip(ht,lht):\n", + " if h != \".\":\n", + " st.append(h)\n", + " lbst.append(l)\n", + " else:\n", + " st.append(\".\")\n", + " lbst.append(0)\n", + " sent_tokenized.append(st)\n", + " label_sent_tokenized.append(lbst)\n", + " st = []; lbst = []" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8Lo6zzhGKpLd", + "outputId": "6d68e803-7ad4-4970-b4bf-b34be7fbdd37" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "11668" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ], + "source": [ + "len(sent_tokenized)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5Ei7TrATKpLd", + "outputId": "93b623a3-ac12-4500-b6f4-506972dcad9f" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['A',\n", + " '73',\n", + " '-',\n", + " 'year',\n", + " '-',\n", + " 'old',\n", + " 'patient',\n", + " 'with',\n", + " 'a',\n", + " 'history',\n", + " 'of',\n", + " 'arterial',\n", + " 'hypertension',\n", + " 'and',\n", + " 'polyarthrosis',\n", + " 'presented',\n", + " 'to',\n", + " 'the',\n", + " 'emergency',\n", + " 'department',\n", + " 'with',\n", + " 'abdominal',\n", + " 'distension',\n", + " 'and',\n", + " 'pain',\n", + " 'associated',\n", + " 'with',\n", + " 'constipation',\n", + " 'and',\n", + " 'febrile',\n", + " 'fever',\n", + " '.']" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ], + "source": [ + "sent_tokenized[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KistaXEmKpLe", + "outputId": "b07ce5d3-df79-474e-a7fb-983632bb38d3" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "11668" + ] + }, + "metadata": {}, + "execution_count": 26 + } + ], + "source": [ + "len(label_sent_tokenized)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "m9qR1AxbKpLe", + "outputId": "66600bb9-dcae-4717-9cc9-9422115b4e2e" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0]" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ], + "source": [ + "label_sent_tokenized[0]" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Disease mentions identification as a Token classification problem" + ], + "metadata": { + "id": "l5-_fyqPUh3b" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Building the Dataset" + ], + "metadata": { + "id": "098UDE8VUjXf" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Case as a whole is given as input" + ], + "metadata": { + "id": "8FT7wAx4VeaU" + } + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "id": "5qVPJ2LVKpLe" + }, + "outputs": [], + "source": [ + "dic = {\"tokens\": HCs_tokenized, \"ner_tags\": labels_tokenized} #For the whole clinical case. We used this option for our paper.\n", + "#dic = {\"tokens\": sent_tokenized, \"ner_tags\": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by \". \" b" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "UzXp6PO2KpLe" + }, + "outputs": [], + "source": [ + "dataset = Dataset.from_dict(dic)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NhqH78gAKpLf", + "outputId": "e3fe5c5f-764c-4bf8-c599-6bda653a4944" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 741\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 30 + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "jspsCq2OKpLf" + }, + "outputs": [], + "source": [ + "#For training, validation, and test partitions\n", + "\"\"\"\n", + "#Train, val, test partitions\n", + "train_test = dataset.train_test_split()\n", + "test_val = train_test['test'].train_test_split()\n", + "raw_datasets = DatasetDict({\n", + " 'train': train_test['train'],\n", + " 'validation': test_val['train'],\n", + " 'test': test_val['test']\n", + " })\n", + "\"\"\"\n", + "\n", + "#Just for training and validation partitions\n", + "train_test = dataset.train_test_split()\n", + "raw_datasets = DatasetDict({\n", + " 'train': train_test['train'],\n", + " 'validation': train_test['test']\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "isqyq9VIKpLf", + "outputId": "7539fb10-1ab0-43cd-d7a5-bdb30514b891" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 555\n", + " })\n", + " validation: Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 186\n", + " })\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 32 + } + ], + "source": [ + "raw_datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pEJhTKi8KpLf", + "outputId": "fe3f3c2b-226c-42eb-9674-f69fce8a61e9" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0]" + ] + }, + "metadata": {}, + "execution_count": 33 + } + ], + "source": [ + "raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "#raw_datasets[\"train\"][0][\"pos_tags\"]\n", + "#raw_datasets[\"train\"][0][\"chunk_tags\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gtrDybrCKpLf", + "outputId": "1217e93b-928c-4db1-bd22-8f1e3c2ebf2c" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 555\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 34 + } + ], + "source": [ + "raw_datasets['train']" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QjdG8yi0KpLf", + "outputId": "20bc5bed-36bf-4e05-f80b-f75638f5196b" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['O', 'B', 'I']" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ], + "source": [ + "label_names = ['O','B','I']\n", + "label_names" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9kGXdiCsKpLg", + "outputId": "754c7613-c7c3-46fc-a109-d8a67d6ab072" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "This is a 70 - year - old male patient , who was admitted to the emergency department of the Hospital Pablo Tobón Uribe , with symptoms of approximately one hour of evolution consisting of chest tightness , general malaise , asthenia and diaphoresis ; which began after having ingested 100 mg of sildenafil , denies ingestion of another sexual stimulant or cocaine and without sexual intercourse after its consumption . The patient 's only clinical history was arterial hypertension , pharmacologically controlled , and he denies previous episodes of angina or nitrate consumption . The clinical examination and vital signs were normal ; however , after the initial assessment he presented cardiorespiratory arrest secondary to ventricular fibrillation with response to a single defibrillation of 200 joules . \n", + " The initial electrocardiogram showed ST - segment elevation in the inferior ( II , III and aVF ) and anterior ( V2 - V4 ) leads with reciprocal changes in aVL , with no electrocardiographic extension to the right ventricle . \n", + "\n", + " Cardiac enzymes on admission revealed a creatine kinase ( CK ) of 170 and a creatine phosphokinase - MB fraction ( CK - MB ) of 6 . Electrolytes , coagulation tests and blood cell counts were normal . \n", + " Initial management was with aspirin 100 mg , lovastatin 40 mg daily , metoprolol 25 mg every 12 hours , enoxaparin 60 mg every 12 hours , oxygen at 3 lt / min and streptokinase 1'500,000 units administered over 30 minutes . No changes secondary to reperfusion were demonstrated . The patient was transferred to the intensive care unit , where episodes of complete A - V block with spontaneous resolution were documented during the first hours of evolution . \n", + " The EKG taken at 24 hours of evolution revealed QS in the inferior face and a late progression of the R wave in the anterior face . Enzyme monitoring showed increased CK and MB fraction at 6 hours ( 4476 and 165 ) and 12 hours ( 3839 and 136 ) . \n", + " The next day coronary angiography showed diffuse disease of the anterior descending artery with 50 % lesion in the distal third and 40 % lesion in the proximal third of the first diagonal branch . The circumflex artery had a 50 % lesion in the middle third and diffuse disease of its obtuse marginal branches . The right coronary artery had an irregular lesion suggestive of a partially resolved thrombus producing a maximum stenosis of 50 % ; distally the posterior descending artery had two 40 % lesions . \n", + "\n", + " The patient evolved satisfactorily without further complications and pain - free . He was discharged for outpatient follow - up . \n", + "\n", + "\n", + " \n", + "O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B I O O O O O O O O O O B O O O O O O O O O O O O O O O O O O O O O B I I I I I O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B I I I I O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B I I I I I I I I I I I I I I I I I I O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O \n" + ] + } + ], + "source": [ + "words = raw_datasets[\"train\"][0][\"tokens\"]\n", + "labels = [int(n) for n in raw_datasets[\"train\"][0][\"ner_tags\"]]\n", + "#labels = raw_datasets[\"train\"][0][\"pos_tags\"]\n", + "#labels = raw_datasets[\"train\"][0][\"chunk_tags\"]\n", + "line1 = \"\"\n", + "line2 = \"\"\n", + "for word, label in zip(words, labels):\n", + " full_label = label_names[label]\n", + " max_length = max(len(word), len(full_label))\n", + " line1 += word + \" \" * (max_length - len(word) + 1)\n", + " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", + "\n", + "print(line1)\n", + "print(line2)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Helper Functions" + ], + "metadata": { + "id": "0ABWC5YnTZzp" + } + }, + { + "cell_type": "code", + "source": [ + "def align_labels_with_tokens(labels, word_ids):\n", + " new_labels = []\n", + " current_word = None\n", + " for word_id in word_ids:\n", + " if word_id != current_word:\n", + " # Start of a new word!\n", + " current_word = word_id\n", + " label = -100 if word_id is None else labels[word_id]\n", + " new_labels.append(label)\n", + " elif word_id is None:\n", + " # Special token\n", + " new_labels.append(-100)\n", + " else:\n", + " # Same word as previous token\n", + " label = labels[word_id]\n", + " # If the label is B-XXX we change it to I-XXX\n", + " if label % 2 == 1:\n", + " label += 1\n", + " new_labels.append(label)\n", + "\n", + " return new_labels" + ], + "metadata": { + "id": "LQUKQDvxTNHn" + }, + "execution_count": 37, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def tokenize_and_align_labels(examples):\n", + " tokenized_inputs = tokenizer(\n", + " examples[\"tokens\"], truncation=True, is_split_into_words=True\n", + " )\n", + " all_labels = examples[\"ner_tags\"]\n", + " new_labels = []\n", + " for i, labels in enumerate(all_labels):\n", + " word_ids = tokenized_inputs.word_ids(i)\n", + " new_labels.append(align_labels_with_tokens(labels, word_ids))\n", + "\n", + " tokenized_inputs[\"labels\"] = new_labels\n", + " return tokenized_inputs" + ], + "metadata": { + "id": "qT-6oXUoTYah" + }, + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "metric = load_metric(\"seqeval\")\n", + "def compute_metrics(eval_preds):\n", + " logits, labels = eval_preds\n", + " predictions = np.argmax(logits, axis=-1)\n", + "\n", + " # Remove ignored index (special tokens) and convert to labels\n", + " true_labels = [[label_names[l] for l in label if l != -100] for label in labels]\n", + " try:\n", + " true_predictions = [\n", + " [label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + " except:\n", + " true_predictions = []\n", + " for prediction, label in zip(predictions, labels):\n", + " label_list = []\n", + " for (p, l) in zip(prediction, label):\n", + " if l != -100:\n", + " if p not in range(len(label_names)):\n", + " p = 0\n", + " \n", + " label_list.append(label_names[p])\n", + " true_predictions.append(label_list)\n", + "\n", + " all_metrics = metric.compute(predictions=true_predictions, references=true_labels)\n", + " return {\n", + " \"precision\": all_metrics[\"overall_precision\"],\n", + " \"recall\": all_metrics[\"overall_recall\"],\n", + " \"f1\": all_metrics[\"overall_f1\"],\n", + " \"accuracy\": all_metrics[\"overall_accuracy\"],\n", + " }" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 104, + "referenced_widgets": [ + "b3fe12ca95e84b198d16bdb4d20f9ad9", + "1b7f8f1786394c01bad4a8589ad16513", + "70e437b3ba294189b4799c6607532ebd", + "0fb47d91dbf9497cac1ffc1c5dfd4519", + "9cfec0f21c0a459f9f5888c389a6a479", + "ef66098fb5f748eabe11abc3fe4ad54d", + "563d8b35192240be960bc08909984119", + "b94385d1423e47f5a9e2351bf873c3e0", + "e1b6e7774bc94a87ad23fb53d6c9b985", + "a0b523772cf04a85b0ac000cc9a83c67", + "71a3f1b2112344ea81721e59cce14cec" + ] + }, + "id": "EZMRI9ATTzc9", + "outputId": "14e38b3a-86a7-49b1-817f-92a7304747a2" + }, + "execution_count": 39, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading builder script: 0%| | 0.00/2.47k [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "b3fe12ca95e84b198d16bdb4d20f9ad9" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Loading BERT as a pre-trained model" + ], + "metadata": { + "id": "6D0P7PztVnCP" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Model 1 - d4data/biomedical-ner-all\n", + "\n", + "Complete document wise tokenization" + ], + "metadata": { + "id": "mmPReh2mSux9" + } + }, + { + "cell_type": "code", + "source": [ + "model_checkpoint = \"d4data/biomedical-ner-all\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "a275c54cbefb4438a3015080e8b57999", + "a4696c167a3247bd8fd0727e0556463a", + "afa79c37c031491da9e229c637d80cc4", + "5ddf799b2fd94edc9949d36450a2d5e9", + "592af11564074af19e40bce6680ed7f1", + "125b61b8e80d4192a6f19d43ba4797dc", + "d9ff16ada2d94eb7a1adc70e5265ff2d", + "f9bd10de9e2845f08100a29293b92d1c", + "c28349dbeccc4124b583e0eeec004e6b", + "ff2cf349b07442bd9812dd8c7e82e59b", + "414d466fed0b42378d8b38f10c720eba", + "e432f1e3e5c54358a321a21e9c7aad1f", + "cf931d70dc1a4d2ba5f10dba7bf90ece", + "58f1edc459ef4f5bab25544474897db3", + "8894005504364c36964d283cf58bb223", + "78db41a453ce4ff4884960c615147331", + "fe8d877f0fc1417baad9838094045475", + "74bfdb85ed55436f8c12bf9b25375533", + "e813e2a1cb7248b7a8c404d55e4fb248", + "cb3c438fb3a6412d80b5ba673a6455cb", + "bb189f5bc189462cad4824a1c30335c0", + "e98b7218049f4310951a1608c52c14e0", + "3dd1f27ff0d24a1294534ff7e69a7abb", + "4128d82e19f14e9d9be5416ebc974d0d", + "6676a80dc293456ea7aed4ce3e281d83", + "55a7a4c336884f26a53292d559a06ff8", + "d27469698b1e4ad1ae74ced6f7c3942d", + "d201490a05c049d38b087008aac0a400", + "703d715a4ef64c4e93cc6496f5340451", + "41d861058e3e458e949f1f3d92623217", + "f4d9343bd31d47b1b3dcf0494825be2d", + "b75e2a47db2b47dd8740f77b337c308f", + "d8849516ccb44011a7f9e7e745b30c60", + "ae5928c8da4243fba06ae9bf5086ba31", + "e613455bcbb24e36a31666acd83d7b24", + "bac45a33f9b444a1985ef56a9be85c52", + "91ac0673e600400f904b1b10deb86cee", + "289f23dd30814993afde0f5e987fdd9e", + "8e72912c0e434060ac30517a98d07a9e", + "ede252ab2cee4ffbbc2f5519373d1e97", + "f2da21cc1007475ca0233a9e5d146d65", + "92fd7c43f87142d1bbd05f89ba3bfe39", + "b58c9bfa30b3421496adb52e082cb50a", + "6cbfa925d26e47139365d10b9b28d96a" + ] + }, + "id": "RfwVBXFAS1Dc", + "outputId": "3a7c9806-3828-4ab7-c73d-d0e8039e1417" + }, + "execution_count": 40, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/373 [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "a275c54cbefb4438a3015080e8b57999" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/232k [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "e432f1e3e5c54358a321a21e9c7aad1f" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/711k [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "3dd1f27ff0d24a1294534ff7e69a7abb" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/125 [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "ae5928c8da4243fba06ae9bf5086ba31" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "tokenizer.is_fast" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a9DegC0yS_Gp", + "outputId": "083e5505-007e-43c8-fe31-3b202c84cc76" + }, + "execution_count": 41, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 41 + } + ] + }, + { + "cell_type": "code", + "source": [ + "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", + "inputs.tokens()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "v-7VQWw5TB-c", + "outputId": "0922ac3e-61af-43ae-f3a8-29f291e7c19b" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). Running this sequence through the model will result in indexing errors\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['[CLS]',\n", + " 'this',\n", + " 'is',\n", + " 'a',\n", + " '70',\n", + " '-',\n", + " 'year',\n", + " '-',\n", + " 'old',\n", + " 'male',\n", + " 'patient',\n", + " ',',\n", + " 'who',\n", + " 'was',\n", + " 'admitted',\n", + " 'to',\n", + " 'the',\n", + " 'emergency',\n", + " 'department',\n", + " 'of',\n", + " 'the',\n", + " 'hospital',\n", + " 'pablo',\n", + " 'to',\n", + " '##bon',\n", + " 'ur',\n", + " '##ibe',\n", + " ',',\n", + " 'with',\n", + " 'symptoms',\n", + " 'of',\n", + " 'approximately',\n", + " 'one',\n", + " 'hour',\n", + " 'of',\n", + " 'evolution',\n", + " 'consisting',\n", + " 'of',\n", + " 'chest',\n", + " 'tight',\n", + " '##ness',\n", + " ',',\n", + " 'general',\n", + " 'mala',\n", + " '##ise',\n", + " ',',\n", + " 'as',\n", + " '##the',\n", + " '##nia',\n", + " 'and',\n", + " 'dia',\n", + " '##ph',\n", + " '##ores',\n", + " '##is',\n", + " ';',\n", + " 'which',\n", + " 'began',\n", + " 'after',\n", + " 'having',\n", + " 'ing',\n", + " '##ested',\n", + " '100',\n", + " 'mg',\n", + " 'of',\n", + " 'si',\n", + " '##lden',\n", + " '##af',\n", + " '##il',\n", + " ',',\n", + " 'denies',\n", + " 'ing',\n", + " '##est',\n", + " '##ion',\n", + " 'of',\n", + " 'another',\n", + " 'sexual',\n", + " 'st',\n", + " '##im',\n", + " '##ula',\n", + " '##nt',\n", + " 'or',\n", + " 'cocaine',\n", + " 'and',\n", + " 'without',\n", + " 'sexual',\n", + " 'intercourse',\n", + " 'after',\n", + " 'its',\n", + " 'consumption',\n", + " '.',\n", + " 'the',\n", + " 'patient',\n", + " \"'\",\n", + " 's',\n", + " 'only',\n", + " 'clinical',\n", + " 'history',\n", + " 'was',\n", + " 'arterial',\n", + " 'hyper',\n", + " '##tension',\n", + " ',',\n", + " 'ph',\n", + " '##arm',\n", + " '##aco',\n", + " '##logical',\n", + " '##ly',\n", + " 'controlled',\n", + " ',',\n", + " 'and',\n", + " 'he',\n", + " 'denies',\n", + " 'previous',\n", + " 'episodes',\n", + " 'of',\n", + " 'ang',\n", + " '##ina',\n", + " 'or',\n", + " 'nitrate',\n", + " 'consumption',\n", + " '.',\n", + " 'the',\n", + " 'clinical',\n", + " 'examination',\n", + " 'and',\n", + " 'vital',\n", + " 'signs',\n", + " 'were',\n", + " 'normal',\n", + " ';',\n", + " 'however',\n", + " ',',\n", + " 'after',\n", + " 'the',\n", + " 'initial',\n", + " 'assessment',\n", + " 'he',\n", + " 'presented',\n", + " 'card',\n", + " '##ior',\n", + " '##es',\n", + " '##pi',\n", + " '##rator',\n", + " '##y',\n", + " 'arrest',\n", + " 'secondary',\n", + " 'to',\n", + " 'vent',\n", + " '##ric',\n", + " '##ular',\n", + " 'fi',\n", + " '##bri',\n", + " '##llation',\n", + " 'with',\n", + " 'response',\n", + " 'to',\n", + " 'a',\n", + " 'single',\n", + " 'def',\n", + " '##ib',\n", + " '##rill',\n", + " '##ation',\n", + " 'of',\n", + " '200',\n", + " 'jo',\n", + " '##ules',\n", + " '.',\n", + " 'the',\n", + " 'initial',\n", + " 'electro',\n", + " '##card',\n", + " '##io',\n", + " '##gram',\n", + " 'showed',\n", + " 'st',\n", + " '-',\n", + " 'segment',\n", + " 'elevation',\n", + " 'in',\n", + " 'the',\n", + " 'inferior',\n", + " '(',\n", + " 'ii',\n", + " ',',\n", + " 'iii',\n", + " 'and',\n", + " 'av',\n", + " '##f',\n", + " ')',\n", + " 'and',\n", + " 'anterior',\n", + " '(',\n", + " 'v',\n", + " '##2',\n", + " '-',\n", + " 'v',\n", + " '##4',\n", + " ')',\n", + " 'leads',\n", + " 'with',\n", + " 'reciprocal',\n", + " 'changes',\n", + " 'in',\n", + " 'av',\n", + " '##l',\n", + " ',',\n", + " 'with',\n", + " 'no',\n", + " 'electro',\n", + " '##card',\n", + " '##io',\n", + " '##graphic',\n", + " 'extension',\n", + " 'to',\n", + " 'the',\n", + " 'right',\n", + " 'vent',\n", + " '##ric',\n", + " '##le',\n", + " '.',\n", + " 'cardiac',\n", + " 'enzymes',\n", + " 'on',\n", + " 'admission',\n", + " 'revealed',\n", + " 'a',\n", + " 'cr',\n", + " '##ea',\n", + " '##tine',\n", + " 'kinase',\n", + " '(',\n", + " 'ck',\n", + " ')',\n", + " 'of',\n", + " '170',\n", + " 'and',\n", + " 'a',\n", + " 'cr',\n", + " '##ea',\n", + " '##tine',\n", + " 'ph',\n", + " '##os',\n", + " '##ph',\n", + " '##oki',\n", + " '##nas',\n", + " '##e',\n", + " '-',\n", + " 'mb',\n", + " 'fraction',\n", + " '(',\n", + " 'ck',\n", + " '-',\n", + " 'mb',\n", + " ')',\n", + " 'of',\n", + " '6',\n", + " '.',\n", + " 'electro',\n", + " '##ly',\n", + " '##tes',\n", + " ',',\n", + " 'coa',\n", + " '##gul',\n", + " '##ation',\n", + " 'tests',\n", + " 'and',\n", + " 'blood',\n", + " 'cell',\n", + " 'counts',\n", + " 'were',\n", + " 'normal',\n", + " '.',\n", + " 'initial',\n", + " 'management',\n", + " 'was',\n", + " 'with',\n", + " 'as',\n", + " '##pi',\n", + " '##rin',\n", + " '100',\n", + " 'mg',\n", + " ',',\n", + " 'lo',\n", + " '##vas',\n", + " '##tat',\n", + " '##in',\n", + " '40',\n", + " 'mg',\n", + " 'daily',\n", + " ',',\n", + " 'met',\n", + " '##op',\n", + " '##rol',\n", + " '##ol',\n", + " '25',\n", + " 'mg',\n", + " 'every',\n", + " '12',\n", + " 'hours',\n", + " ',',\n", + " 'en',\n", + " '##ox',\n", + " '##apa',\n", + " '##rin',\n", + " '60',\n", + " 'mg',\n", + " 'every',\n", + " '12',\n", + " 'hours',\n", + " ',',\n", + " 'oxygen',\n", + " 'at',\n", + " '3',\n", + " 'lt',\n", + " '/',\n", + " 'min',\n", + " 'and',\n", + " 'st',\n", + " '##re',\n", + " '##pt',\n", + " '##oki',\n", + " '##nas',\n", + " '##e',\n", + " '1',\n", + " \"'\",\n", + " '500',\n", + " ',',\n", + " '000',\n", + " 'units',\n", + " 'administered',\n", + " 'over',\n", + " '30',\n", + " 'minutes',\n", + " '.',\n", + " 'no',\n", + " 'changes',\n", + " 'secondary',\n", + " 'to',\n", + " 'rep',\n", + " '##er',\n", + " '##fusion',\n", + " 'were',\n", + " 'demonstrated',\n", + " '.',\n", + " 'the',\n", + " 'patient',\n", + " 'was',\n", + " 'transferred',\n", + " 'to',\n", + " 'the',\n", + " 'intensive',\n", + " 'care',\n", + " 'unit',\n", + " ',',\n", + " 'where',\n", + " 'episodes',\n", + " 'of',\n", + " 'complete',\n", + " 'a',\n", + " '-',\n", + " 'v',\n", + " 'block',\n", + " 'with',\n", + " 'spontaneous',\n", + " 'resolution',\n", + " 'were',\n", + " 'documented',\n", + " 'during',\n", + " 'the',\n", + " 'first',\n", + " 'hours',\n", + " 'of',\n", + " 'evolution',\n", + " '.',\n", + " 'the',\n", + " 'ek',\n", + " '##g',\n", + " 'taken',\n", + " 'at',\n", + " '24',\n", + " 'hours',\n", + " 'of',\n", + " 'evolution',\n", + " 'revealed',\n", + " 'q',\n", + " '##s',\n", + " 'in',\n", + " 'the',\n", + " 'inferior',\n", + " 'face',\n", + " 'and',\n", + " 'a',\n", + " 'late',\n", + " 'progression',\n", + " 'of',\n", + " 'the',\n", + " 'r',\n", + " 'wave',\n", + " 'in',\n", + " 'the',\n", + " 'anterior',\n", + " 'face',\n", + " '.',\n", + " 'enzyme',\n", + " 'monitoring',\n", + " 'showed',\n", + " 'increased',\n", + " 'ck',\n", + " 'and',\n", + " 'mb',\n", + " 'fraction',\n", + " 'at',\n", + " '6',\n", + " 'hours',\n", + " '(',\n", + " '44',\n", + " '##7',\n", + " '##6',\n", + " 'and',\n", + " '165',\n", + " ')',\n", + " 'and',\n", + " '12',\n", + " 'hours',\n", + " '(',\n", + " '38',\n", + " '##39',\n", + " 'and',\n", + " '136',\n", + " ')',\n", + " '.',\n", + " 'the',\n", + " 'next',\n", + " 'day',\n", + " 'corona',\n", + " '##ry',\n", + " 'ang',\n", + " '##iography',\n", + " 'showed',\n", + " 'diffuse',\n", + " 'disease',\n", + " 'of',\n", + " 'the',\n", + " 'anterior',\n", + " 'descending',\n", + " 'artery',\n", + " 'with',\n", + " '50',\n", + " '%',\n", + " 'les',\n", + " '##ion',\n", + " 'in',\n", + " 'the',\n", + " 'distal',\n", + " 'third',\n", + " 'and',\n", + " '40',\n", + " '%',\n", + " 'les',\n", + " '##ion',\n", + " 'in',\n", + " 'the',\n", + " 'pro',\n", + " '##xi',\n", + " '##mal',\n", + " 'third',\n", + " 'of',\n", + " 'the',\n", + " 'first',\n", + " 'diagonal',\n", + " 'branch',\n", + " '.',\n", + " 'the',\n", + " 'ci',\n", + " '##rc',\n", + " '##um',\n", + " '##fle',\n", + " '##x',\n", + " 'artery',\n", + " 'had',\n", + " 'a',\n", + " '50',\n", + " '%',\n", + " 'les',\n", + " '##ion',\n", + " 'in',\n", + " 'the',\n", + " 'middle',\n", + " 'third',\n", + " 'and',\n", + " 'diffuse',\n", + " 'disease',\n", + " 'of',\n", + " 'its',\n", + " 'ob',\n", + " '##tus',\n", + " '##e',\n", + " 'marginal',\n", + " 'branches',\n", + " '.',\n", + " 'the',\n", + " 'right',\n", + " 'corona',\n", + " '##ry',\n", + " 'artery',\n", + " 'had',\n", + " 'an',\n", + " 'irregular',\n", + " 'les',\n", + " '##ion',\n", + " 'suggest',\n", + " '##ive',\n", + " 'of',\n", + " 'a',\n", + " 'partially',\n", + " 'resolved',\n", + " 'th',\n", + " '##rom',\n", + " '##bus',\n", + " 'producing',\n", + " 'a',\n", + " 'maximum',\n", + " 'ste',\n", + " '##nosis',\n", + " 'of',\n", + " '50',\n", + " '%',\n", + " ';',\n", + " 'distal',\n", + " '##ly',\n", + " 'the',\n", + " 'posterior',\n", + " 'descending',\n", + " 'artery',\n", + " 'had',\n", + " 'two',\n", + " '40',\n", + " '%',\n", + " 'lesions',\n", + " '.',\n", + " 'the',\n", + " 'patient',\n", + " 'evolved',\n", + " 'sat',\n", + " '##is',\n", + " '##fa',\n", + " '##ctor',\n", + " '##ily',\n", + " 'without',\n", + " 'further',\n", + " 'complications',\n", + " 'and',\n", + " 'pain',\n", + " '-',\n", + " 'free',\n", + " '.',\n", + " 'he',\n", + " 'was',\n", + " 'discharged',\n", + " 'for',\n", + " 'out',\n", + " '##patient',\n", + " 'follow',\n", + " '-',\n", + " 'up',\n", + " '.',\n", + " '[SEP]']" + ] + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "code", + "source": [ + "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "word_ids = inputs.word_ids()\n", + "print(labels)\n", + "print(align_labels_with_tokens(labels, word_ids))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "w77KW9-3TD4u", + "outputId": "eae68b43-ff6a-48cb-a380-930f156c24c6" + }, + "execution_count": 43, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "tokenized_datasets = raw_datasets.map(\n", + " tokenize_and_align_labels,\n", + " batched=True,\n", + " remove_columns=raw_datasets[\"train\"].column_names,\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81, + "referenced_widgets": [ + "d161016f9fea41e6b27eb537c12d0703", + "f315bcfdc76848cb8e851a2698e0248b", + "ef152955607540f2a7d38bf9e2207eec", + "456c36425ac94dc294f8402c07668a51", + "4d305d32efdf4b639e65e816a7132597", + "5c3a10b039c344509be9867ca40a8472", + "6663eac35b7a4043b97edb90a555e3d9", + "30bce58edba74043abc1a2625c492d4a", + "d4b6dbbad9c946ed99b6c6e587bfb6da", + "8976a59e4ea049088f92a37f7547e16e", + "6a689955d9b3463abaaaa03b62d3cf69", + "9d65a59161cd401aad05f4a52d51c724", + "6f244b91a3884eb5b0fbd577ed5d1710", + "2b6ad660dd1f4c78855433118b9fb61e", + "101fa9a9581a46d8b1e0951f03796740", + "7ffe4378bc7b410780780dd51d0705ea", + "77c422e831944566a6529da37645ef6d", + "9a88121d0138438980f1c7e4341f480a", + "52600cdbf4804b148e02724ae4902de5", + "11f16a0c34e64d6494ac1d2550d18f8f", + "645616ac236e479c8303a56100d26d51", + "859e35e323f0407fbdea9eb7ae953742" + ] + }, + "id": "5BPyKS51TWGK", + "outputId": "f7ada6c5-8860-40d3-f32b-4b79abe14ae8" + }, + "execution_count": 44, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/1 [00:00<?, ?ba/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "d161016f9fea41e6b27eb537c12d0703" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/1 [00:00<?, ?ba/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "9d65a59161cd401aad05f4a52d51c724" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" + ], + "metadata": { + "id": "_EZvP3kyTfms" + }, + "execution_count": 45, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(2)])\n", + "batch[\"labels\"]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IkQdm1ODTiWJ", + "outputId": "5d18c54e-c5aa-4d89-ed8a-e1b7664350f8" + }, + "execution_count": 46, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "tensor([[-100, 0, 0, ..., 0, 0, -100],\n", + " [-100, 0, 0, ..., 0, 0, -100]])" + ] + }, + "metadata": {}, + "execution_count": 46 + } + ] + }, + { + "cell_type": "code", + "source": [ + "id2label = {str(i): label for i, label in enumerate(label_names)}\n", + "label2id = {v: k for k, v in id2label.items()}" + ], + "metadata": { + "id": "QPom2dyaTkdf" + }, + "execution_count": 47, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model = AutoModelForTokenClassification.from_pretrained( \n", + " model_checkpoint\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LSRqpzXDTnpt", + "outputId": "f87c31de-e40d-49f8-81a1-53a2a9905083" + }, + "execution_count": 55, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"d4data/biomedical-ner-all\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForTokenClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-Activity\",\n", + " \"2\": \"B-Administration\",\n", + " \"3\": \"B-Age\",\n", + " \"4\": \"B-Area\",\n", + " \"5\": \"B-Biological_attribute\",\n", + " \"6\": \"B-Biological_structure\",\n", + " \"7\": \"B-Clinical_event\",\n", + " \"8\": \"B-Color\",\n", + " \"9\": \"B-Coreference\",\n", + " \"10\": \"B-Date\",\n", + " \"11\": \"B-Detailed_description\",\n", + " \"12\": \"B-Diagnostic_procedure\",\n", + " \"13\": \"B-Disease_disorder\",\n", + " \"14\": \"B-Distance\",\n", + " \"15\": \"B-Dosage\",\n", + " \"16\": \"B-Duration\",\n", + " \"17\": \"B-Family_history\",\n", + " \"18\": \"B-Frequency\",\n", + " \"19\": \"B-Height\",\n", + " \"20\": \"B-History\",\n", + " \"21\": \"B-Lab_value\",\n", + " \"22\": \"B-Mass\",\n", + " \"23\": \"B-Medication\",\n", + " \"24\": \"B-Non[biological](Detailed_description\",\n", + " \"25\": \"B-Nonbiological_location\",\n", + " \"26\": \"B-Occupation\",\n", + " \"27\": \"B-Other_entity\",\n", + " \"28\": \"B-Other_event\",\n", + " \"29\": \"B-Outcome\",\n", + " \"30\": \"B-Personal_[back](Biological_structure\",\n", + " \"31\": \"B-Personal_background\",\n", + " \"32\": \"B-Qualitative_concept\",\n", + " \"33\": \"B-Quantitative_concept\",\n", + " \"34\": \"B-Severity\",\n", + " \"35\": \"B-Sex\",\n", + " \"36\": \"B-Shape\",\n", + " \"37\": \"B-Sign_symptom\",\n", + " \"38\": \"B-Subject\",\n", + " \"39\": \"B-Texture\",\n", + " \"40\": \"B-Therapeutic_procedure\",\n", + " \"41\": \"B-Time\",\n", + " \"42\": \"B-Volume\",\n", + " \"43\": \"B-Weight\",\n", + " \"44\": \"I-Activity\",\n", + " \"45\": \"I-Administration\",\n", + " \"46\": \"I-Age\",\n", + " \"47\": \"I-Area\",\n", + " \"48\": \"I-Biological_attribute\",\n", + " \"49\": \"I-Biological_structure\",\n", + " \"50\": \"I-Clinical_event\",\n", + " \"51\": \"I-Color\",\n", + " \"52\": \"I-Coreference\",\n", + " \"53\": \"I-Date\",\n", + " \"54\": \"I-Detailed_description\",\n", + " \"55\": \"I-Diagnostic_procedure\",\n", + " \"56\": \"I-Disease_disorder\",\n", + " \"57\": \"I-Distance\",\n", + " \"58\": \"I-Dosage\",\n", + " \"59\": \"I-Duration\",\n", + " \"60\": \"I-Family_history\",\n", + " \"61\": \"I-Frequency\",\n", + " \"62\": \"I-Height\",\n", + " \"63\": \"I-History\",\n", + " \"64\": \"I-Lab_value\",\n", + " \"65\": \"I-Mass\",\n", + " \"66\": \"I-Medication\",\n", + " \"67\": \"I-Nonbiological_location\",\n", + " \"68\": \"I-Occupation\",\n", + " \"69\": \"I-Other_entity\",\n", + " \"70\": \"I-Other_event\",\n", + " \"71\": \"I-Outcome\",\n", + " \"72\": \"I-Personal_background\",\n", + " \"73\": \"I-Qualitative_concept\",\n", + " \"74\": \"I-Quantitative_concept\",\n", + " \"75\": \"I-Severity\",\n", + " \"76\": \"I-Shape\",\n", + " \"77\": \"I-Sign_symptom\",\n", + " \"78\": \"I-Subject\",\n", + " \"79\": \"I-Texture\",\n", + " \"80\": \"I-Therapeutic_procedure\",\n", + " \"81\": \"I-Time\",\n", + " \"82\": \"I-Volume\",\n", + " \"83\": \"I-Weight\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"label2id\": {\n", + " \"B-Activity\": 1,\n", + " \"B-Administration\": 2,\n", + " \"B-Age\": 3,\n", + " \"B-Area\": 4,\n", + " \"B-Biological_attribute\": 5,\n", + " \"B-Biological_structure\": 6,\n", + " \"B-Clinical_event\": 7,\n", + " \"B-Color\": 8,\n", + " \"B-Coreference\": 9,\n", + " \"B-Date\": 10,\n", + " \"B-Detailed_description\": 11,\n", + " \"B-Diagnostic_procedure\": 12,\n", + " \"B-Disease_disorder\": 13,\n", + " \"B-Distance\": 14,\n", + " \"B-Dosage\": 15,\n", + " \"B-Duration\": 16,\n", + " \"B-Family_history\": 17,\n", + " \"B-Frequency\": 18,\n", + " \"B-Height\": 19,\n", + " \"B-History\": 20,\n", + " \"B-Lab_value\": 21,\n", + " \"B-Mass\": 22,\n", + " \"B-Medication\": 23,\n", + " \"B-Non[biological](Detailed_description\": 24,\n", + " \"B-Nonbiological_location\": 25,\n", + " \"B-Occupation\": 26,\n", + " \"B-Other_entity\": 27,\n", + " \"B-Other_event\": 28,\n", + " \"B-Outcome\": 29,\n", + " \"B-Personal_[back](Biological_structure\": 30,\n", + " \"B-Personal_background\": 31,\n", + " \"B-Qualitative_concept\": 32,\n", + " \"B-Quantitative_concept\": 33,\n", + " \"B-Severity\": 34,\n", + " \"B-Sex\": 35,\n", + " \"B-Shape\": 36,\n", + " \"B-Sign_symptom\": 37,\n", + " \"B-Subject\": 38,\n", + " \"B-Texture\": 39,\n", + " \"B-Therapeutic_procedure\": 40,\n", + " \"B-Time\": 41,\n", + " \"B-Volume\": 42,\n", + " \"B-Weight\": 43,\n", + " \"I-Activity\": 44,\n", + " \"I-Administration\": 45,\n", + " \"I-Age\": 46,\n", + " \"I-Area\": 47,\n", + " \"I-Biological_attribute\": 48,\n", + " \"I-Biological_structure\": 49,\n", + " \"I-Clinical_event\": 50,\n", + " \"I-Color\": 51,\n", + " \"I-Coreference\": 52,\n", + " \"I-Date\": 53,\n", + " \"I-Detailed_description\": 54,\n", + " \"I-Diagnostic_procedure\": 55,\n", + " \"I-Disease_disorder\": 56,\n", + " \"I-Distance\": 57,\n", + " \"I-Dosage\": 58,\n", + " \"I-Duration\": 59,\n", + " \"I-Family_history\": 60,\n", + " \"I-Frequency\": 61,\n", + " \"I-Height\": 62,\n", + " \"I-History\": 63,\n", + " \"I-Lab_value\": 64,\n", + " \"I-Mass\": 65,\n", + " \"I-Medication\": 66,\n", + " \"I-Nonbiological_location\": 67,\n", + " \"I-Occupation\": 68,\n", + " \"I-Other_entity\": 69,\n", + " \"I-Other_event\": 70,\n", + " \"I-Outcome\": 71,\n", + " \"I-Personal_background\": 72,\n", + " \"I-Qualitative_concept\": 73,\n", + " \"I-Quantitative_concept\": 74,\n", + " \"I-Severity\": 75,\n", + " \"I-Shape\": 76,\n", + " \"I-Sign_symptom\": 77,\n", + " \"I-Subject\": 78,\n", + " \"I-Texture\": 79,\n", + " \"I-Therapeutic_procedure\": 80,\n", + " \"I-Time\": 81,\n", + " \"I-Volume\": 82,\n", + " \"I-Weight\": 83,\n", + " \"O\": 0\n", + " },\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/pytorch_model.bin\n", + "All model checkpoint weights were used when initializing DistilBertForTokenClassification.\n", + "\n", + "All the weights of DistilBertForTokenClassification were initialized from the model checkpoint at d4data/biomedical-ner-all.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForTokenClassification for predictions without further training.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "args = TrainingArguments(\n", + " \"NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased\",\n", + " evaluation_strategy = IntervalStrategy.STEPS,\n", + " eval_steps = 50,\n", + " learning_rate=5e-5,\n", + " num_train_epochs=50,\n", + " weight_decay=0.01,\n", + " metric_for_best_model = 'f1',\n", + " load_best_model_at_end=True\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Z4DTxHPoTsGs", + "outputId": "9b8c9d65-4d5a-4fb6-b91e-d41efb1ac68c" + }, + "execution_count": 56, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=args,\n", + " train_dataset=tokenized_datasets[\"train\"],\n", + " eval_dataset=tokenized_datasets[\"validation\"],\n", + " data_collator=data_collator,\n", + " compute_metrics=compute_metrics,\n", + " tokenizer=tokenizer,\n", + " callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]\n", + ")\n", + "trainer.train()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "rVfzrBZMTv6x", + "outputId": "3cdd94b4-ab62-40dc-e180-1f7a22a4ef1b" + }, + "execution_count": 57, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "***** Running training *****\n", + " Num examples = 555\n", + " Num Epochs = 50\n", + " Instantaneous batch size per device = 8\n", + " Total train batch size (w. parallel, distributed & accumulation) = 8\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 3500\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "<IPython.core.display.HTML object>" + ], + "text/html": [ + "\n", + " <div>\n", + " \n", + " <progress value='900' max='3500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", + " [ 900/3500 02:39 < 07:41, 5.63 it/s, Epoch 12/50]\n", + " </div>\n", + " <table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: left;\">\n", + " <th>Step</th>\n", + " <th>Training Loss</th>\n", + " <th>Validation Loss</th>\n", + " <th>Precision</th>\n", + " <th>Recall</th>\n", + " <th>F1</th>\n", + " <th>Accuracy</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <td>50</td>\n", + " <td>No log</td>\n", + " <td>0.162818</td>\n", + " <td>0.270997</td>\n", + " <td>0.309661</td>\n", + " <td>0.289042</td>\n", + " <td>0.937595</td>\n", + " </tr>\n", + " <tr>\n", + " <td>100</td>\n", + " <td>No log</td>\n", + " <td>0.166674</td>\n", + " <td>0.348107</td>\n", + " <td>0.517594</td>\n", + " <td>0.416259</td>\n", + " <td>0.938953</td>\n", + " </tr>\n", + " <tr>\n", + " <td>150</td>\n", + " <td>No log</td>\n", + " <td>0.165462</td>\n", + " <td>0.376270</td>\n", + " <td>0.521433</td>\n", + " <td>0.437115</td>\n", + " <td>0.938915</td>\n", + " </tr>\n", + " <tr>\n", + " <td>200</td>\n", + " <td>No log</td>\n", + " <td>0.146949</td>\n", + " <td>0.404802</td>\n", + " <td>0.463852</td>\n", + " <td>0.432320</td>\n", + " <td>0.945101</td>\n", + " </tr>\n", + " <tr>\n", + " <td>250</td>\n", + " <td>No log</td>\n", + " <td>0.165257</td>\n", + " <td>0.440503</td>\n", + " <td>0.492642</td>\n", + " <td>0.465116</td>\n", + " <td>0.944807</td>\n", + " </tr>\n", + " <tr>\n", + " <td>300</td>\n", + " <td>No log</td>\n", + " <td>0.187354</td>\n", + " <td>0.421611</td>\n", + " <td>0.431862</td>\n", + " <td>0.426675</td>\n", + " <td>0.945729</td>\n", + " </tr>\n", + " <tr>\n", + " <td>350</td>\n", + " <td>No log</td>\n", + " <td>0.203230</td>\n", + " <td>0.445104</td>\n", + " <td>0.479846</td>\n", + " <td>0.461823</td>\n", + " <td>0.945396</td>\n", + " </tr>\n", + " <tr>\n", + " <td>400</td>\n", + " <td>No log</td>\n", + " <td>0.215578</td>\n", + " <td>0.419140</td>\n", + " <td>0.523992</td>\n", + " <td>0.465738</td>\n", + " <td>0.944269</td>\n", + " </tr>\n", + " <tr>\n", + " <td>450</td>\n", + " <td>No log</td>\n", + " <td>0.228786</td>\n", + " <td>0.456535</td>\n", + " <td>0.467051</td>\n", + " <td>0.461733</td>\n", + " <td>0.945178</td>\n", + " </tr>\n", + " <tr>\n", + " <td>500</td>\n", + " <td>0.103800</td>\n", + " <td>0.261185</td>\n", + " <td>0.427002</td>\n", + " <td>0.522073</td>\n", + " <td>0.469775</td>\n", + " <td>0.943641</td>\n", + " </tr>\n", + " <tr>\n", + " <td>550</td>\n", + " <td>0.103800</td>\n", + " <td>0.265154</td>\n", + " <td>0.424972</td>\n", + " <td>0.485605</td>\n", + " <td>0.453270</td>\n", + " <td>0.945165</td>\n", + " </tr>\n", + " <tr>\n", + " <td>600</td>\n", + " <td>0.103800</td>\n", + " <td>0.269958</td>\n", + " <td>0.383793</td>\n", + " <td>0.551504</td>\n", + " <td>0.452612</td>\n", + " <td>0.941015</td>\n", + " </tr>\n", + " <tr>\n", + " <td>650</td>\n", + " <td>0.103800</td>\n", + " <td>0.283041</td>\n", + " <td>0.449687</td>\n", + " <td>0.506078</td>\n", + " <td>0.476219</td>\n", + " <td>0.945652</td>\n", + " </tr>\n", + " <tr>\n", + " <td>700</td>\n", + " <td>0.103800</td>\n", + " <td>0.296066</td>\n", + " <td>0.423529</td>\n", + " <td>0.552783</td>\n", + " <td>0.479600</td>\n", + " <td>0.941335</td>\n", + " </tr>\n", + " <tr>\n", + " <td>750</td>\n", + " <td>0.103800</td>\n", + " <td>0.282410</td>\n", + " <td>0.444382</td>\n", + " <td>0.503519</td>\n", + " <td>0.472106</td>\n", + " <td>0.944884</td>\n", + " </tr>\n", + " <tr>\n", + " <td>800</td>\n", + " <td>0.103800</td>\n", + " <td>0.295058</td>\n", + " <td>0.400769</td>\n", + " <td>0.533589</td>\n", + " <td>0.457739</td>\n", + " <td>0.943615</td>\n", + " </tr>\n", + " <tr>\n", + " <td>850</td>\n", + " <td>0.103800</td>\n", + " <td>0.294637</td>\n", + " <td>0.432018</td>\n", + " <td>0.504159</td>\n", + " <td>0.465309</td>\n", + " <td>0.944320</td>\n", + " </tr>\n", + " <tr>\n", + " <td>900</td>\n", + " <td>0.103800</td>\n", + " <td>0.296676</td>\n", + " <td>0.411402</td>\n", + " <td>0.512476</td>\n", + " <td>0.456410</td>\n", + " <td>0.942796</td>\n", + " </tr>\n", + " </tbody>\n", + "</table><p>" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "\n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "Loading best model from NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500 (score: 0.4697754749568221).\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=900, training_loss=0.06402364306979709, metrics={'train_runtime': 159.6722, 'train_samples_per_second': 173.794, 'train_steps_per_second': 21.92, 'total_flos': 932785215873192.0, 'train_loss': 0.06402364306979709, 'epoch': 12.86})" + ] + }, + "metadata": {}, + "execution_count": 57 + } + ] + }, + { + "cell_type": "code", + "source": [ + "trainer.save_model('model/distilbert-base-uncased-all-tokens')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "g_r4zDTuT6n0", + "outputId": "d9cf6e0f-4d72-4dc3-bdaf-fe162acfde8f" + }, + "execution_count": 58, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Saving model checkpoint to model/distilbert-base-uncased-all-tokens\n", + "Configuration saved in model/distilbert-base-uncased-all-tokens/config.json\n", + "Model weights saved in model/distilbert-base-uncased-all-tokens/pytorch_model.bin\n", + "tokenizer config file saved in model/distilbert-base-uncased-all-tokens/tokenizer_config.json\n", + "Special tokens file saved in model/distilbert-base-uncased-all-tokens/special_tokens_map.json\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Model 1 - d4data/biomedical-ner-all\n", + "\n", + "### Sentence Based Modelling" + ], + "metadata": { + "id": "U9LTC6H7Ut3-" + } + }, + { + "cell_type": "code", + "source": [ + "dic = {\"tokens\": sent_tokenized, \"ner_tags\": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by \". \" b" + ], + "metadata": { + "id": "YntZAcxIUpmM" + }, + "execution_count": 59, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "id": "c0p6sqVqVDhK" + }, + "outputs": [], + "source": [ + "dataset = Dataset.from_dict(dic)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "54fb3085-2261-496a-f6d0-60f93f654540", + "id": "dKokCRtaVDhK" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 11668\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 61 + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "id": "yaiKWzNRVDhK" + }, + "outputs": [], + "source": [ + "#For training, validation, and test partitions\n", + "\"\"\"\n", + "#Train, val, test partitions\n", + "train_test = dataset.train_test_split()\n", + "test_val = train_test['test'].train_test_split()\n", + "raw_datasets = DatasetDict({\n", + " 'train': train_test['train'],\n", + " 'validation': test_val['train'],\n", + " 'test': test_val['test']\n", + " })\n", + "\"\"\"\n", + "\n", + "#Just for training and validation partitions\n", + "train_test = dataset.train_test_split()\n", + "raw_datasets = DatasetDict({\n", + " 'train': train_test['train'],\n", + " 'validation': train_test['test']\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "3f04c836-fa6c-4f6b-9be8-1a3b77910f74", + "id": "bJryyZX2VDhL" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 8751\n", + " })\n", + " validation: Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 2917\n", + " })\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 63 + } + ], + "source": [ + "raw_datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7cb91446-41f5-4257-9d36-aac32e1a2d8b", + "id": "p9q9WmGpVDhL" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]" + ] + }, + "metadata": {}, + "execution_count": 64 + } + ], + "source": [ + "raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "#raw_datasets[\"train\"][0][\"pos_tags\"]\n", + "#raw_datasets[\"train\"][0][\"chunk_tags\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "0dd389dd-cb44-4235-c327-f58e5d8f2f1a", + "id": "K7Sip5njVDhL" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 8751\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 65 + } + ], + "source": [ + "raw_datasets['train']" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "fc667def-0a5f-481d-85c3-8ff5f80a5eb5", + "id": "BI420tEFVDhL" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['O', 'B', 'I']" + ] + }, + "metadata": {}, + "execution_count": 66 + } + ], + "source": [ + "label_names = ['O','B','I']\n", + "label_names" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d11245fb-f724-440f-954e-064b90d32579", + "id": "fvbDPubIVDhL" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Since 2006 she had tried several times to lose weight , without success . \n", + "O O O O O O O O O O O O O O \n" + ] + } + ], + "source": [ + "words = raw_datasets[\"train\"][0][\"tokens\"]\n", + "labels = [int(n) for n in raw_datasets[\"train\"][0][\"ner_tags\"]]\n", + "#labels = raw_datasets[\"train\"][0][\"pos_tags\"]\n", + "#labels = raw_datasets[\"train\"][0][\"chunk_tags\"]\n", + "line1 = \"\"\n", + "line2 = \"\"\n", + "for word, label in zip(words, labels):\n", + " full_label = label_names[label]\n", + " max_length = max(len(word), len(full_label))\n", + " line1 += word + \" \" * (max_length - len(word) + 1)\n", + " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", + "\n", + "print(line1)\n", + "print(line2)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "9d061e6c-4cd9-472d-b873-814d709efb63", + "id": "LXmlD43QVDhL" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/vocab.txt\n", + "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/tokenizer.json\n", + "loading file added_tokens.json from cache at None\n", + "loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/special_tokens_map.json\n", + "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/tokenizer_config.json\n" + ] + } + ], + "source": [ + "model_checkpoint = \"d4data/biomedical-ner-all\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e11cebbd-2fd8-4db3-f014-d52f80ae104c", + "id": "rUn3zUd9VDhM" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 69 + } + ], + "source": [ + "tokenizer.is_fast" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "0196fde7-250b-4486-f63f-2dcea8848e5d", + "id": "LXxYSaDLVDhM" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['[CLS]',\n", + " 'since',\n", + " '2006',\n", + " 'she',\n", + " 'had',\n", + " 'tried',\n", + " 'several',\n", + " 'times',\n", + " 'to',\n", + " 'lose',\n", + " 'weight',\n", + " ',',\n", + " 'without',\n", + " 'success',\n", + " '.',\n", + " '[SEP]']" + ] + }, + "metadata": {}, + "execution_count": 70 + } + ], + "source": [ + "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", + "inputs.tokens()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "48b11a85-7ca4-4589-dcba-21833e5b6c55", + "id": "-bG1VI2NVDhM" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]\n" + ] + } + ], + "source": [ + "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "word_ids = inputs.word_ids()\n", + "print(labels)\n", + "print(align_labels_with_tokens(labels, word_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81, + "referenced_widgets": [ + "40c2b37fa07f44648cecc9b7e406e7e2", + "69ed5fe8ed6046acb4202689c065f858", + "bbd9cf7a77aa48fda3a648583ed02b08", + "ebe8b2b35e884fd28bb42eacf01ff07c", + "ddb14bc1d5d4437a9ee4a895846e7d29", + "669c24c6309f46cbbdcd0c764143e74f", + "d1e4665beafa4bbeb25d0e9e8447a5a9", + "09217bdc1e2145eb84cc97207595e6f0", + "b294af01ac5f483dacbe2e1c40fdf223", + "c1e27e9184204d618ce59b97f7302335", + "2e63f2af443d448aaaddf81127def048", + "9239cc2fd1d94d86986b7f395de70fca", + "f1e8d31b67db4089ab1b036bda341617", + "33be40ebcab54ff68855f1145cf5e1d6", + "d96c111f09d74a0c9816328f88d9e45b", + "3907dc2aaa484877aee9beab8a6888d4", + "aaacfb0f3bd1427ea44ec84c28a2aaf7", + "cb92e843491142e8a2a4008223a90d02", + "52169f264141463e94a7761a4ffb3f7a", + "e509b790873740b59aa2f52875ca2038", + "a395318bce7348d78ca83a308552f042", + "a2bb171f700743559e1d2c472c8289ef" + ] + }, + "outputId": "aad10f99-e5fa-423c-f259-a12310b18d1d", + "id": "0OVZq1BtVDhM" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/9 [00:00<?, ?ba/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "40c2b37fa07f44648cecc9b7e406e7e2" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/3 [00:00<?, ?ba/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "9239cc2fd1d94d86986b7f395de70fca" + } + }, + "metadata": {} + } + ], + "source": [ + "tokenized_datasets = raw_datasets.map(\n", + " tokenize_and_align_labels,\n", + " batched=True,\n", + " remove_columns=raw_datasets[\"train\"].column_names,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "id": "rdl_EpWuVDhN" + }, + "outputs": [], + "source": [ + "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d2768d3a-b399-4236-a9c4-cfff18f31d1c", + "id": "gY7K46A2VDhN" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "tensor([[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n", + " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],\n", + " [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 1, 2, 2, 2, 2, 2, 2, 2, 0, -100]])" + ] + }, + "metadata": {}, + "execution_count": 74 + } + ], + "source": [ + "batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(2)])\n", + "batch[\"labels\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "id": "PNC5_yuKVDhN" + }, + "outputs": [], + "source": [ + "id2label = {str(i): label for i, label in enumerate(label_names)}\n", + "label2id = {v: k for k, v in id2label.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "ba1aa394-ff7a-471e-a8dc-d35755f501ed", + "id": "AbRkJHvbVDhN" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/config.json\n", + "Model config DistilBertConfig {\n", + " \"_name_or_path\": \"d4data/biomedical-ner-all\",\n", + " \"activation\": \"gelu\",\n", + " \"architectures\": [\n", + " \"DistilBertForTokenClassification\"\n", + " ],\n", + " \"attention_dropout\": 0.1,\n", + " \"dim\": 768,\n", + " \"dropout\": 0.1,\n", + " \"hidden_dim\": 3072,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-Activity\",\n", + " \"2\": \"B-Administration\",\n", + " \"3\": \"B-Age\",\n", + " \"4\": \"B-Area\",\n", + " \"5\": \"B-Biological_attribute\",\n", + " \"6\": \"B-Biological_structure\",\n", + " \"7\": \"B-Clinical_event\",\n", + " \"8\": \"B-Color\",\n", + " \"9\": \"B-Coreference\",\n", + " \"10\": \"B-Date\",\n", + " \"11\": \"B-Detailed_description\",\n", + " \"12\": \"B-Diagnostic_procedure\",\n", + " \"13\": \"B-Disease_disorder\",\n", + " \"14\": \"B-Distance\",\n", + " \"15\": \"B-Dosage\",\n", + " \"16\": \"B-Duration\",\n", + " \"17\": \"B-Family_history\",\n", + " \"18\": \"B-Frequency\",\n", + " \"19\": \"B-Height\",\n", + " \"20\": \"B-History\",\n", + " \"21\": \"B-Lab_value\",\n", + " \"22\": \"B-Mass\",\n", + " \"23\": \"B-Medication\",\n", + " \"24\": \"B-Non[biological](Detailed_description\",\n", + " \"25\": \"B-Nonbiological_location\",\n", + " \"26\": \"B-Occupation\",\n", + " \"27\": \"B-Other_entity\",\n", + " \"28\": \"B-Other_event\",\n", + " \"29\": \"B-Outcome\",\n", + " \"30\": \"B-Personal_[back](Biological_structure\",\n", + " \"31\": \"B-Personal_background\",\n", + " \"32\": \"B-Qualitative_concept\",\n", + " \"33\": \"B-Quantitative_concept\",\n", + " \"34\": \"B-Severity\",\n", + " \"35\": \"B-Sex\",\n", + " \"36\": \"B-Shape\",\n", + " \"37\": \"B-Sign_symptom\",\n", + " \"38\": \"B-Subject\",\n", + " \"39\": \"B-Texture\",\n", + " \"40\": \"B-Therapeutic_procedure\",\n", + " \"41\": \"B-Time\",\n", + " \"42\": \"B-Volume\",\n", + " \"43\": \"B-Weight\",\n", + " \"44\": \"I-Activity\",\n", + " \"45\": \"I-Administration\",\n", + " \"46\": \"I-Age\",\n", + " \"47\": \"I-Area\",\n", + " \"48\": \"I-Biological_attribute\",\n", + " \"49\": \"I-Biological_structure\",\n", + " \"50\": \"I-Clinical_event\",\n", + " \"51\": \"I-Color\",\n", + " \"52\": \"I-Coreference\",\n", + " \"53\": \"I-Date\",\n", + " \"54\": \"I-Detailed_description\",\n", + " \"55\": \"I-Diagnostic_procedure\",\n", + " \"56\": \"I-Disease_disorder\",\n", + " \"57\": \"I-Distance\",\n", + " \"58\": \"I-Dosage\",\n", + " \"59\": \"I-Duration\",\n", + " \"60\": \"I-Family_history\",\n", + " \"61\": \"I-Frequency\",\n", + " \"62\": \"I-Height\",\n", + " \"63\": \"I-History\",\n", + " \"64\": \"I-Lab_value\",\n", + " \"65\": \"I-Mass\",\n", + " \"66\": \"I-Medication\",\n", + " \"67\": \"I-Nonbiological_location\",\n", + " \"68\": \"I-Occupation\",\n", + " \"69\": \"I-Other_entity\",\n", + " \"70\": \"I-Other_event\",\n", + " \"71\": \"I-Outcome\",\n", + " \"72\": \"I-Personal_background\",\n", + " \"73\": \"I-Qualitative_concept\",\n", + " \"74\": \"I-Quantitative_concept\",\n", + " \"75\": \"I-Severity\",\n", + " \"76\": \"I-Shape\",\n", + " \"77\": \"I-Sign_symptom\",\n", + " \"78\": \"I-Subject\",\n", + " \"79\": \"I-Texture\",\n", + " \"80\": \"I-Therapeutic_procedure\",\n", + " \"81\": \"I-Time\",\n", + " \"82\": \"I-Volume\",\n", + " \"83\": \"I-Weight\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"label2id\": {\n", + " \"B-Activity\": 1,\n", + " \"B-Administration\": 2,\n", + " \"B-Age\": 3,\n", + " \"B-Area\": 4,\n", + " \"B-Biological_attribute\": 5,\n", + " \"B-Biological_structure\": 6,\n", + " \"B-Clinical_event\": 7,\n", + " \"B-Color\": 8,\n", + " \"B-Coreference\": 9,\n", + " \"B-Date\": 10,\n", + " \"B-Detailed_description\": 11,\n", + " \"B-Diagnostic_procedure\": 12,\n", + " \"B-Disease_disorder\": 13,\n", + " \"B-Distance\": 14,\n", + " \"B-Dosage\": 15,\n", + " \"B-Duration\": 16,\n", + " \"B-Family_history\": 17,\n", + " \"B-Frequency\": 18,\n", + " \"B-Height\": 19,\n", + " \"B-History\": 20,\n", + " \"B-Lab_value\": 21,\n", + " \"B-Mass\": 22,\n", + " \"B-Medication\": 23,\n", + " \"B-Non[biological](Detailed_description\": 24,\n", + " \"B-Nonbiological_location\": 25,\n", + " \"B-Occupation\": 26,\n", + " \"B-Other_entity\": 27,\n", + " \"B-Other_event\": 28,\n", + " \"B-Outcome\": 29,\n", + " \"B-Personal_[back](Biological_structure\": 30,\n", + " \"B-Personal_background\": 31,\n", + " \"B-Qualitative_concept\": 32,\n", + " \"B-Quantitative_concept\": 33,\n", + " \"B-Severity\": 34,\n", + " \"B-Sex\": 35,\n", + " \"B-Shape\": 36,\n", + " \"B-Sign_symptom\": 37,\n", + " \"B-Subject\": 38,\n", + " \"B-Texture\": 39,\n", + " \"B-Therapeutic_procedure\": 40,\n", + " \"B-Time\": 41,\n", + " \"B-Volume\": 42,\n", + " \"B-Weight\": 43,\n", + " \"I-Activity\": 44,\n", + " \"I-Administration\": 45,\n", + " \"I-Age\": 46,\n", + " \"I-Area\": 47,\n", + " \"I-Biological_attribute\": 48,\n", + " \"I-Biological_structure\": 49,\n", + " \"I-Clinical_event\": 50,\n", + " \"I-Color\": 51,\n", + " \"I-Coreference\": 52,\n", + " \"I-Date\": 53,\n", + " \"I-Detailed_description\": 54,\n", + " \"I-Diagnostic_procedure\": 55,\n", + " \"I-Disease_disorder\": 56,\n", + " \"I-Distance\": 57,\n", + " \"I-Dosage\": 58,\n", + " \"I-Duration\": 59,\n", + " \"I-Family_history\": 60,\n", + " \"I-Frequency\": 61,\n", + " \"I-Height\": 62,\n", + " \"I-History\": 63,\n", + " \"I-Lab_value\": 64,\n", + " \"I-Mass\": 65,\n", + " \"I-Medication\": 66,\n", + " \"I-Nonbiological_location\": 67,\n", + " \"I-Occupation\": 68,\n", + " \"I-Other_entity\": 69,\n", + " \"I-Other_event\": 70,\n", + " \"I-Outcome\": 71,\n", + " \"I-Personal_background\": 72,\n", + " \"I-Qualitative_concept\": 73,\n", + " \"I-Quantitative_concept\": 74,\n", + " \"I-Severity\": 75,\n", + " \"I-Shape\": 76,\n", + " \"I-Sign_symptom\": 77,\n", + " \"I-Subject\": 78,\n", + " \"I-Texture\": 79,\n", + " \"I-Therapeutic_procedure\": 80,\n", + " \"I-Time\": 81,\n", + " \"I-Volume\": 82,\n", + " \"I-Weight\": 83,\n", + " \"O\": 0\n", + " },\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"distilbert\",\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 6,\n", + " \"pad_token_id\": 0,\n", + " \"qa_dropout\": 0.1,\n", + " \"seq_classif_dropout\": 0.2,\n", + " \"sinusoidal_pos_embds\": false,\n", + " \"tie_weights_\": true,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/pytorch_model.bin\n", + "All model checkpoint weights were used when initializing DistilBertForTokenClassification.\n", + "\n", + "All the weights of DistilBertForTokenClassification were initialized from the model checkpoint at d4data/biomedical-ner-all.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForTokenClassification for predictions without further training.\n" + ] + } + ], + "source": [ + "model = AutoModelForTokenClassification.from_pretrained( \n", + " model_checkpoint\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MQILaROUVDhN", + "outputId": "a548d6da-abe4-417a-99eb-f65d2c5b3273" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n" + ] + } + ], + "source": [ + "args = TrainingArguments(\n", + " \"NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased\",\n", + " evaluation_strategy = IntervalStrategy.STEPS,\n", + " eval_steps = 50,\n", + " learning_rate=5e-5,\n", + " num_train_epochs=50,\n", + " weight_decay=0.01,\n", + " metric_for_best_model = 'f1',\n", + " load_best_model_at_end=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "cb0a12c3-0ce4-4d59-d773-f2f31ff05b68", + "id": "MRJatT86VDhN" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:310: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " FutureWarning,\n", + "***** Running training *****\n", + " Num examples = 8751\n", + " Num Epochs = 50\n", + " Instantaneous batch size per device = 8\n", + " Total train batch size (w. parallel, distributed & accumulation) = 8\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 54700\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "<IPython.core.display.HTML object>" + ], + "text/html": [ + "\n", + " <div>\n", + " \n", + " <progress value='1650' max='54700' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", + " [ 1650/54700 04:10 < 2:14:23, 6.58 it/s, Epoch 1/50]\n", + " </div>\n", + " <table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: left;\">\n", + " <th>Step</th>\n", + " <th>Training Loss</th>\n", + " <th>Validation Loss</th>\n", + " <th>Precision</th>\n", + " <th>Recall</th>\n", + " <th>F1</th>\n", + " <th>Accuracy</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <td>50</td>\n", + " <td>No log</td>\n", + " <td>0.220473</td>\n", + " <td>0.223507</td>\n", + " <td>0.240041</td>\n", + " <td>0.231479</td>\n", + " <td>0.923262</td>\n", + " </tr>\n", + " <tr>\n", + " <td>100</td>\n", + " <td>No log</td>\n", + " <td>0.178907</td>\n", + " <td>0.233188</td>\n", + " <td>0.249353</td>\n", + " <td>0.241000</td>\n", + " <td>0.932774</td>\n", + " </tr>\n", + " <tr>\n", + " <td>150</td>\n", + " <td>No log</td>\n", + " <td>0.170712</td>\n", + " <td>0.209794</td>\n", + " <td>0.210554</td>\n", + " <td>0.210173</td>\n", + " <td>0.934965</td>\n", + " </tr>\n", + " <tr>\n", + " <td>200</td>\n", + " <td>No log</td>\n", + " <td>0.161026</td>\n", + " <td>0.308015</td>\n", + " <td>0.341956</td>\n", + " <td>0.324099</td>\n", + " <td>0.937756</td>\n", + " </tr>\n", + " <tr>\n", + " <td>250</td>\n", + " <td>No log</td>\n", + " <td>0.167800</td>\n", + " <td>0.290117</td>\n", + " <td>0.282462</td>\n", + " <td>0.286239</td>\n", + " <td>0.940074</td>\n", + " </tr>\n", + " <tr>\n", + " <td>300</td>\n", + " <td>No log</td>\n", + " <td>0.179418</td>\n", + " <td>0.388704</td>\n", + " <td>0.302638</td>\n", + " <td>0.340314</td>\n", + " <td>0.939863</td>\n", + " </tr>\n", + " <tr>\n", + " <td>350</td>\n", + " <td>No log</td>\n", + " <td>0.160154</td>\n", + " <td>0.333333</td>\n", + " <td>0.443870</td>\n", + " <td>0.380741</td>\n", + " <td>0.938609</td>\n", + " </tr>\n", + " <tr>\n", + " <td>400</td>\n", + " <td>No log</td>\n", + " <td>0.199905</td>\n", + " <td>0.207132</td>\n", + " <td>0.204346</td>\n", + " <td>0.205729</td>\n", + " <td>0.933585</td>\n", + " </tr>\n", + " <tr>\n", + " <td>450</td>\n", + " <td>No log</td>\n", + " <td>0.154181</td>\n", + " <td>0.349614</td>\n", + " <td>0.422142</td>\n", + " <td>0.382470</td>\n", + " <td>0.940611</td>\n", + " </tr>\n", + " <tr>\n", + " <td>500</td>\n", + " <td>0.213900</td>\n", + " <td>0.154374</td>\n", + " <td>0.376731</td>\n", + " <td>0.422142</td>\n", + " <td>0.398146</td>\n", + " <td>0.941137</td>\n", + " </tr>\n", + " <tr>\n", + " <td>550</td>\n", + " <td>0.213900</td>\n", + " <td>0.154931</td>\n", + " <td>0.426748</td>\n", + " <td>0.432488</td>\n", + " <td>0.429599</td>\n", + " <td>0.943276</td>\n", + " </tr>\n", + " <tr>\n", + " <td>600</td>\n", + " <td>0.213900</td>\n", + " <td>0.147228</td>\n", + " <td>0.402466</td>\n", + " <td>0.472840</td>\n", + " <td>0.434824</td>\n", + " <td>0.942075</td>\n", + " </tr>\n", + " <tr>\n", + " <td>650</td>\n", + " <td>0.213900</td>\n", + " <td>0.154059</td>\n", + " <td>0.274133</td>\n", + " <td>0.265908</td>\n", + " <td>0.269958</td>\n", + " <td>0.939326</td>\n", + " </tr>\n", + " <tr>\n", + " <td>700</td>\n", + " <td>0.213900</td>\n", + " <td>0.158030</td>\n", + " <td>0.389341</td>\n", + " <td>0.544232</td>\n", + " <td>0.453937</td>\n", + " <td>0.940095</td>\n", + " </tr>\n", + " <tr>\n", + " <td>750</td>\n", + " <td>0.213900</td>\n", + " <td>0.137712</td>\n", + " <td>0.373972</td>\n", + " <td>0.399897</td>\n", + " <td>0.386500</td>\n", + " <td>0.946952</td>\n", + " </tr>\n", + " <tr>\n", + " <td>800</td>\n", + " <td>0.213900</td>\n", + " <td>0.145364</td>\n", + " <td>0.385280</td>\n", + " <td>0.530781</td>\n", + " <td>0.446475</td>\n", + " <td>0.941485</td>\n", + " </tr>\n", + " <tr>\n", + " <td>850</td>\n", + " <td>0.213900</td>\n", + " <td>0.148491</td>\n", + " <td>0.401239</td>\n", + " <td>0.535954</td>\n", + " <td>0.458915</td>\n", + " <td>0.940895</td>\n", + " </tr>\n", + " <tr>\n", + " <td>900</td>\n", + " <td>0.213900</td>\n", + " <td>0.149123</td>\n", + " <td>0.374374</td>\n", + " <td>0.464046</td>\n", + " <td>0.414414</td>\n", + " <td>0.946784</td>\n", + " </tr>\n", + " <tr>\n", + " <td>950</td>\n", + " <td>0.213900</td>\n", + " <td>0.142598</td>\n", + " <td>0.405747</td>\n", + " <td>0.365235</td>\n", + " <td>0.384427</td>\n", + " <td>0.946963</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1000</td>\n", + " <td>0.153200</td>\n", + " <td>0.143212</td>\n", + " <td>0.449354</td>\n", + " <td>0.341956</td>\n", + " <td>0.388367</td>\n", + " <td>0.945857</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1050</td>\n", + " <td>0.153200</td>\n", + " <td>0.146789</td>\n", + " <td>0.435653</td>\n", + " <td>0.495603</td>\n", + " <td>0.463698</td>\n", + " <td>0.947300</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1100</td>\n", + " <td>0.153200</td>\n", + " <td>0.137099</td>\n", + " <td>0.421190</td>\n", + " <td>0.501811</td>\n", + " <td>0.457979</td>\n", + " <td>0.948090</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1150</td>\n", + " <td>0.153200</td>\n", + " <td>0.144017</td>\n", + " <td>0.437966</td>\n", + " <td>0.485773</td>\n", + " <td>0.460633</td>\n", + " <td>0.948532</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1200</td>\n", + " <td>0.153200</td>\n", + " <td>0.142823</td>\n", + " <td>0.459889</td>\n", + " <td>0.344025</td>\n", + " <td>0.393608</td>\n", + " <td>0.947416</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1250</td>\n", + " <td>0.153200</td>\n", + " <td>0.163925</td>\n", + " <td>0.380834</td>\n", + " <td>0.476979</td>\n", + " <td>0.423519</td>\n", + " <td>0.947732</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1300</td>\n", + " <td>0.153200</td>\n", + " <td>0.153968</td>\n", + " <td>0.461087</td>\n", + " <td>0.447491</td>\n", + " <td>0.454187</td>\n", + " <td>0.948711</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1350</td>\n", + " <td>0.153200</td>\n", + " <td>0.157536</td>\n", + " <td>0.465220</td>\n", + " <td>0.432488</td>\n", + " <td>0.448257</td>\n", + " <td>0.949680</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1400</td>\n", + " <td>0.153200</td>\n", + " <td>0.149706</td>\n", + " <td>0.421693</td>\n", + " <td>0.394206</td>\n", + " <td>0.407487</td>\n", + " <td>0.947236</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1450</td>\n", + " <td>0.153200</td>\n", + " <td>0.178025</td>\n", + " <td>0.472966</td>\n", + " <td>0.448008</td>\n", + " <td>0.460149</td>\n", + " <td>0.947447</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1500</td>\n", + " <td>0.113500</td>\n", + " <td>0.146669</td>\n", + " <td>0.440809</td>\n", + " <td>0.529747</td>\n", + " <td>0.481203</td>\n", + " <td>0.947689</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1550</td>\n", + " <td>0.113500</td>\n", + " <td>0.144341</td>\n", + " <td>0.480186</td>\n", + " <td>0.426280</td>\n", + " <td>0.451631</td>\n", + " <td>0.950344</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1600</td>\n", + " <td>0.113500</td>\n", + " <td>0.140071</td>\n", + " <td>0.446570</td>\n", + " <td>0.495085</td>\n", + " <td>0.469578</td>\n", + " <td>0.948690</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1650</td>\n", + " <td>0.113500</td>\n", + " <td>0.175828</td>\n", + " <td>0.432393</td>\n", + " <td>0.309364</td>\n", + " <td>0.360676</td>\n", + " <td>0.945825</td>\n", + " </tr>\n", + " </tbody>\n", + "</table><p>" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "\n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "Loading best model from NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500 (score: 0.48120300751879697).\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=1650, training_loss=0.15572210947672527, metrics={'train_runtime': 250.5398, 'train_samples_per_second': 1746.429, 'train_steps_per_second': 218.329, 'total_flos': 244690187133744.0, 'train_loss': 0.15572210947672527, 'epoch': 1.51})" + ] + }, + "metadata": {}, + "execution_count": 78 + } + ], + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=args,\n", + " train_dataset=tokenized_datasets[\"train\"],\n", + " eval_dataset=tokenized_datasets[\"validation\"],\n", + " data_collator=data_collator,\n", + " compute_metrics=compute_metrics,\n", + " tokenizer=tokenizer,\n", + " callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]\n", + ")\n", + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "cb8d8bbc-fe8e-4926-d201-8a64dc0edcdb", + "id": "P2GfgJz0VDhO" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Saving model checkpoint to model/distilbert-base-uncased-sentence\n", + "Configuration saved in model/distilbert-base-uncased-sentence/config.json\n", + "Model weights saved in model/distilbert-base-uncased-sentence/pytorch_model.bin\n", + "tokenizer config file saved in model/distilbert-base-uncased-sentence/tokenizer_config.json\n", + "Special tokens file saved in model/distilbert-base-uncased-sentence/special_tokens_map.json\n" + ] + } + ], + "source": [ + "trainer.save_model('model/distilbert-base-uncased-sentence')" + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "lzL_QhxW7Dha" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Model 2 - pucpr/clinicalnerpt-medical\n", + "\n", + "Whole document based tokenization" + ], + "metadata": { + "id": "GgGucecD600w" + } + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "f0fc94c6df4c432f9e1edcfceaf44edd", + "03f4216f904d4bf6a8e461c50f40378f", + "76fecb01ad6441a08ad0ddb989a8ab80", + "783edab3150d40a3bf99ed910cdbaf88", + "e33897e8fa4841308c55b19352876ab1", + "909d7d349769402a8c20b02b35eafbd9", + "6139e3d551044671a79e15e8adf351ab", + "1c19ec799ccb4e788f34ed8ac37d495f", + "7a49ac5f18f64b41825378184f8c32ec", + "76ad2415389f41deb4f2810bca56b753", + "8c4bb02e55fa48429c8f86dac5cb369e", + "980374f604ec4970b0afa70d108c864b", + "baa9e8a9169a45efb8c117fdf4ea45b7", + "aa248b7d4eae4e5d965a7d04144adacc", + "311d8199627e4c4f83d02c167b5755f3", + "a299d926edbb4c51906b1be8f694d074", + "c76ebf5d6c6c46cd9d14591a47725ae1", + "178555439d854d30a01cac053adf9079", + "00c8d81110fe4f4bbcf77be4d20581c3", + "43c18ef010014cb395f045dd26497fba", + "3b83d1f429d34e8e8de6ddff555df02d", + "91a8348ac2194686a9ef075f7d49687d", + "42dcc74bff5440608a3e9f2fa580cd3c", + "616c54d0cd534047b93b215e7baf2ba3", + "846ce38d6fb84279a7419091f2d269b0", + "1e15050772b54e34b014a98b9710c783", + "951f476862ea49619100e202a6e742f6", + "f6e25547ba664cd59128536944a926fd", + "53185a2afedc41e0a680d5007656b90b", + "860f8204efae42d6ad2fa4eb9e661810", + "e61ec283a9c04cc696d17bbe24ccf460", + "15abc268777a4e3cb5c3c7f430745c1d", + "e6c1266e8b074bdfafa0db6208743a07", + "01a12f499b7942cc90f2032a8f3284e9", + "804bf9b8a2154399a05dd0860f4dfd89", + "66fca4d7946240c3b08ba51fac82f2ae", + "6cefb3be5be9488ca033ed9908c6a8f5", + "2d0f2804db004da0914e4733ce96b749", + "3d6b93cde5254ce99f19802b7c1146f4", + "838416bfbee0400299abff324c4825bc", + "f7584b77d97f4d48aa5b50bae2df49f8", + "0a5cd97f8a914ff89fd27aed7b38164d", + "ad4c308ebd574e909d31b161580b9064", + "74030c98fcb942ed9d9ffc43799113f0" + ] + }, + "id": "axyP0XOOKpLg", + "outputId": "da8d34ae-66c7-459a-99be-56d7a434ec80" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/151 [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "f0fc94c6df4c432f9e1edcfceaf44edd" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/1.05k [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "980374f604ec4970b0afa70d108c864b" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json\n", + "Model config BertConfig {\n", + " \"_name_or_path\": \"pucpr/clinicalnerpt-medical\",\n", + " \"_num_labels\": 3,\n", + " \"architectures\": [\n", + " \"BertForTokenClassification\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"directionality\": \"bidi\",\n", + " \"eos_token_ids\": null,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-MedicalDevice\",\n", + " \"2\": \"I-MedicalDevice\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"B-MedicalDevice\": 1,\n", + " \"I-MedicalDevice\": 2,\n", + " \"O\": 0\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": null,\n", + " \"pooler_fc_size\": 768,\n", + " \"pooler_num_attention_heads\": 12,\n", + " \"pooler_num_fc_layers\": 3,\n", + " \"pooler_size_per_head\": 128,\n", + " \"pooler_type\": \"first_token_transform\",\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 2,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/996k [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "42dcc74bff5440608a3e9f2fa580cd3c" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/112 [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "01a12f499b7942cc90f2032a8f3284e9" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/vocab.txt\n", + "loading file tokenizer.json from cache at None\n", + "loading file added_tokens.json from cache at None\n", + "loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/special_tokens_map.json\n", + "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/tokenizer_config.json\n", + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json\n", + "Model config BertConfig {\n", + " \"_name_or_path\": \"pucpr/clinicalnerpt-medical\",\n", + " \"_num_labels\": 3,\n", + " \"architectures\": [\n", + " \"BertForTokenClassification\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"directionality\": \"bidi\",\n", + " \"eos_token_ids\": null,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-MedicalDevice\",\n", + " \"2\": \"I-MedicalDevice\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"B-MedicalDevice\": 1,\n", + " \"I-MedicalDevice\": 2,\n", + " \"O\": 0\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": null,\n", + " \"pooler_fc_size\": 768,\n", + " \"pooler_num_attention_heads\": 12,\n", + " \"pooler_num_fc_layers\": 3,\n", + " \"pooler_size_per_head\": 128,\n", + " \"pooler_type\": \"first_token_transform\",\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 2,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n", + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json\n", + "Model config BertConfig {\n", + " \"_name_or_path\": \"pucpr/clinicalnerpt-medical\",\n", + " \"_num_labels\": 3,\n", + " \"architectures\": [\n", + " \"BertForTokenClassification\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"directionality\": \"bidi\",\n", + " \"eos_token_ids\": null,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-MedicalDevice\",\n", + " \"2\": \"I-MedicalDevice\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"B-MedicalDevice\": 1,\n", + " \"I-MedicalDevice\": 2,\n", + " \"O\": 0\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": null,\n", + " \"pooler_fc_size\": 768,\n", + " \"pooler_num_attention_heads\": 12,\n", + " \"pooler_num_fc_layers\": 3,\n", + " \"pooler_size_per_head\": 128,\n", + " \"pooler_type\": \"first_token_transform\",\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 2,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "model_checkpoint = \"pucpr/clinicalnerpt-medical\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JPTd6vUPKpLg", + "outputId": "84451d5e-eb3a-4763-dc29-5ffebafc4c3d" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 81 + } + ], + "source": [ + "tokenizer.is_fast" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "id": "q8o2kvS69K2h" + }, + "outputs": [], + "source": [ + "dic = {\"tokens\": HCs_tokenized, \"ner_tags\": labels_tokenized} #For the whole clinical case. We used this option for our paper.\n", + "#dic = {\"tokens\": sent_tokenized, \"ner_tags\": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by \". \" b" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "id": "6JXCzYF49K2h" + }, + "outputs": [], + "source": [ + "dataset = Dataset.from_dict(dic)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "f01f435d-ced1-4002-cc23-05e42329c8df", + "id": "AGI1Hf_E9K2h" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 741\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 84 + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "id": "CUH94L-29K2h" + }, + "outputs": [], + "source": [ + "#For training, validation, and test partitions\n", + "\"\"\"\n", + "#Train, val, test partitions\n", + "train_test = dataset.train_test_split()\n", + "test_val = train_test['test'].train_test_split()\n", + "raw_datasets = DatasetDict({\n", + " 'train': train_test['train'],\n", + " 'validation': test_val['train'],\n", + " 'test': test_val['test']\n", + " })\n", + "\"\"\"\n", + "\n", + "#Just for training and validation partitions\n", + "train_test = dataset.train_test_split()\n", + "raw_datasets = DatasetDict({\n", + " 'train': train_test['train'],\n", + " 'validation': train_test['test']\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "654a2798-e4f3-49ce-c59c-747c7538352b", + "id": "zRYC70NF9K2i" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 555\n", + " })\n", + " validation: Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 186\n", + " })\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 86 + } + ], + "source": [ + "raw_datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e1de590a-ea92-4b0b-9da3-c65152d08872", + "id": "dCZcYkCa9K2i" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 2,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0]" + ] + }, + "metadata": {}, + "execution_count": 87 + } + ], + "source": [ + "raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "#raw_datasets[\"train\"][0][\"pos_tags\"]\n", + "#raw_datasets[\"train\"][0][\"chunk_tags\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6b0070ab-8b15-4750-e848-a122ca47e050", + "id": "wWinDHdc9K2i" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 555\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 88 + } + ], + "source": [ + "raw_datasets['train']" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "ea900262-0ead-4843-da63-d1b736a2ad29", + "id": "NeEdChWM9K2i" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['O', 'B', 'I']" + ] + }, + "metadata": {}, + "execution_count": 89 + } + ], + "source": [ + "label_names = ['O','B','I']\n", + "label_names" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e264924f-3c29-4f64-d8f6-4441ca35c9b1", + "id": "FwWqjcpJ9K2i" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "This is a 79 - year - old woman with a history of hypertension , osteoporosis and hysterectomy for myomatosis at the age of 50 . She underwent transurethral resection of infiltrating bladder carcinoma in October 2006 . She subsequently received radiotherapy sessions up to a total of 50 Gy due to persistence of an external tumour mass in the right angle of the bladder , finishing this treatment in June 2007 . In August 2007 she began chemotherapy treatment due to persistence of the bladder lesion and metastases in the spine detected by follow - up CT scan and bone scintigraphy . Her digestive history began in February 2008 when she was admitted for episodes of rectorrhagia , initially scarce and distal , but which soon became more frequent and profuse , accompanied by symptoms of haemodynamic instability and severe anaemia with extensive transfusion requirements . \n", + " Total colonoscopy was performed , showing only changes typical of actinic proctitis with large friable and bleeding neovascular lesions ; treatment was carried out with argon plasma ( APC ) . The patient 's clinical course was unfavourable : she received consecutive treatment with steroid enemas , 5 - aminosalicylic acid and sucralfate ; in addition , three more therapeutic rectoscopies were performed , applying APC treatment , despite which the episodes of profuse rectorrhagia with haemodynamic instability persisted , requiring transfusion of a total of 21 red blood cell concentrates throughout the patient 's hospitalisation period , despite also receiving oral and intravenous ferrotherapy . \n", + " In the absence of response to these treatments , the case was discussed with the surgeon for topical treatment with formalin , who , using spinal anaesthesia and anal dilatation , treated the rectal ampulla for 10 minutes with a 10 % formalin solution 200 ml + 300 ml of water ; The patient 's tolerance to the procedure was excellent , it was carried out without any complications and from that moment onwards the patient was completely asymptomatic without new episodes of haemorrhagic externalisation , haemodynamic instability or new transfusion requirements ; she was discharged and a follow - up colonoscopy was proposed after the treatment , but she did not accept . \n", + " Four months later , she was admitted again for clinical signs of tumour progression , and symptomatic treatment was decided by the Oncology Department , and the patient died , but without recurrence of the rectorrhagia . \n", + "\n", + "\n", + " \n", + "O O O O O O O O O O O O O O O B I O O B O O O O O O O O O O O B B I O O O O O O O O O O O O O O O O O O O O O O B I O O O O O O B O O O O O O O O O O O O O O O O O O O O B I I O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B I O O O O O O O O O O O O O O O O B I O O B I I I I O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B I O O O O O O O O O O O O O O O O O O O O O O O O \n" + ] + } + ], + "source": [ + "words = raw_datasets[\"train\"][0][\"tokens\"]\n", + "labels = [int(n) for n in raw_datasets[\"train\"][0][\"ner_tags\"]]\n", + "#labels = raw_datasets[\"train\"][0][\"pos_tags\"]\n", + "#labels = raw_datasets[\"train\"][0][\"chunk_tags\"]\n", + "line1 = \"\"\n", + "line2 = \"\"\n", + "for word, label in zip(words, labels):\n", + " full_label = label_names[label]\n", + " max_length = max(len(word), len(full_label))\n", + " line1 += word + \" \" * (max_length - len(word) + 1)\n", + " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", + "\n", + "print(line1)\n", + "print(line2)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Bhx4OJLrKpLg", + "outputId": "08eb850e-d3a5-41e0-9dcf-89f7e7748b02" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (578 > 512). Running this sequence through the model will result in indexing errors\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['[CLS]',\n", + " 'this',\n", + " 'is',\n", + " 'a',\n", + " '79',\n", + " '-',\n", + " 'year',\n", + " '-',\n", + " 'old',\n", + " 'woman',\n", + " 'with',\n", + " 'a',\n", + " 'history',\n", + " 'of',\n", + " 'hy',\n", + " '##pert',\n", + " '##ension',\n", + " ',',\n", + " 'os',\n", + " '##te',\n", + " '##op',\n", + " '##oros',\n", + " '##is',\n", + " 'and',\n", + " 'hy',\n", + " '##ster',\n", + " '##ect',\n", + " '##omy',\n", + " 'for',\n", + " 'my',\n", + " '##oma',\n", + " '##tos',\n", + " '##is',\n", + " 'at',\n", + " 'the',\n", + " 'age',\n", + " 'of',\n", + " '50',\n", + " '.',\n", + " 'she',\n", + " 'underwent',\n", + " 'trans',\n", + " '##ure',\n", + " '##th',\n", + " '##ral',\n", + " 'res',\n", + " '##ection',\n", + " 'of',\n", + " 'in',\n", + " '##fil',\n", + " '##trat',\n", + " '##ing',\n", + " 'blad',\n", + " '##der',\n", + " 'car',\n", + " '##cino',\n", + " '##ma',\n", + " 'in',\n", + " 'o',\n", + " '##cto',\n", + " '##ber',\n", + " '2006',\n", + " '.',\n", + " 'she',\n", + " 'subsequently',\n", + " 'received',\n", + " 'radio',\n", + " '##ther',\n", + " '##ap',\n", + " '##y',\n", + " 'sessions',\n", + " 'up',\n", + " 'to',\n", + " 'a',\n", + " 'total',\n", + " 'of',\n", + " '50',\n", + " 'g',\n", + " '##y',\n", + " 'due',\n", + " 'to',\n", + " 'pers',\n", + " '##isten',\n", + " '##ce',\n", + " 'of',\n", + " 'an',\n", + " 'external',\n", + " 'tu',\n", + " '##mou',\n", + " '##r',\n", + " 'mass',\n", + " 'in',\n", + " 'the',\n", + " 'right',\n", + " 'angle',\n", + " 'of',\n", + " 'the',\n", + " 'blad',\n", + " '##der',\n", + " ',',\n", + " 'finishing',\n", + " 'this',\n", + " 'treatment',\n", + " 'in',\n", + " 'ju',\n", + " '##ne',\n", + " '2007',\n", + " '.',\n", + " 'in',\n", + " 'august',\n", + " '2007',\n", + " 'she',\n", + " 'began',\n", + " 'che',\n", + " '##mot',\n", + " '##hera',\n", + " '##py',\n", + " 'treatment',\n", + " 'due',\n", + " 'to',\n", + " 'pers',\n", + " '##isten',\n", + " '##ce',\n", + " 'of',\n", + " 'the',\n", + " 'blad',\n", + " '##der',\n", + " 'les',\n", + " '##ion',\n", + " 'and',\n", + " 'meta',\n", + " '##stas',\n", + " '##es',\n", + " 'in',\n", + " 'the',\n", + " 'spin',\n", + " '##e',\n", + " 'det',\n", + " '##ected',\n", + " 'by',\n", + " 'follow',\n", + " '-',\n", + " 'up',\n", + " 'c',\n", + " '##t',\n", + " 's',\n", + " '##can',\n", + " 'and',\n", + " 'bone',\n", + " 'sci',\n", + " '##nti',\n", + " '##graphy',\n", + " '.',\n", + " 'her',\n", + " 'dig',\n", + " '##esti',\n", + " '##ve',\n", + " 'history',\n", + " 'began',\n", + " 'in',\n", + " 'februar',\n", + " '##y',\n", + " '2008',\n", + " 'when',\n", + " 'she',\n", + " 'was',\n", + " 'admitted',\n", + " 'for',\n", + " 'episodes',\n", + " 'of',\n", + " 'rector',\n", + " '##r',\n", + " '##ha',\n", + " '##gia',\n", + " ',',\n", + " 'initially',\n", + " 's',\n", + " '##car',\n", + " '##ce',\n", + " 'and',\n", + " 'dis',\n", + " '##tal',\n", + " ',',\n", + " 'but',\n", + " 'which',\n", + " 'soon',\n", + " 'became',\n", + " 'more',\n", + " 'frequent',\n", + " 'and',\n", + " 'prof',\n", + " '##use',\n", + " ',',\n", + " 'accompanied',\n", + " 'by',\n", + " 'symptoms',\n", + " 'of',\n", + " 'hae',\n", + " '##mo',\n", + " '##dyn',\n", + " '##ami',\n", + " '##c',\n", + " 'ins',\n", + " '##tab',\n", + " '##ility',\n", + " 'and',\n", + " 'severe',\n", + " 'ana',\n", + " '##emia',\n", + " 'with',\n", + " 'extensive',\n", + " 'trans',\n", + " '##fus',\n", + " '##ion',\n", + " 'requirements',\n", + " '.',\n", + " 'total',\n", + " 'colonos',\n", + " '##co',\n", + " '##py',\n", + " 'was',\n", + " 'performed',\n", + " ',',\n", + " 'showing',\n", + " 'only',\n", + " 'changes',\n", + " 'typical',\n", + " 'of',\n", + " 'act',\n", + " '##ini',\n", + " '##c',\n", + " 'pro',\n", + " '##cti',\n", + " '##tis',\n", + " 'with',\n", + " 'large',\n", + " 'fri',\n", + " '##able',\n", + " 'and',\n", + " 'ble',\n", + " '##eding',\n", + " 'neo',\n", + " '##vas',\n", + " '##cular',\n", + " 'les',\n", + " '##ions',\n", + " ';',\n", + " 'treatment',\n", + " 'was',\n", + " 'carried',\n", + " 'out',\n", + " 'with',\n", + " 'ar',\n", + " '##gon',\n", + " 'plasma',\n", + " '(',\n", + " 'ap',\n", + " '##c',\n", + " ')',\n", + " '.',\n", + " 'the',\n", + " 'patient',\n", + " \"'\",\n", + " 's',\n", + " 'clinical',\n", + " 'course',\n", + " 'was',\n", + " 'un',\n", + " '##fa',\n", + " '##vour',\n", + " '##able',\n", + " ':',\n", + " 'she',\n", + " 'received',\n", + " 'consecutive',\n", + " 'treatment',\n", + " 'with',\n", + " 'ster',\n", + " '##oid',\n", + " 'ene',\n", + " '##mas',\n", + " ',',\n", + " '5',\n", + " '-',\n", + " 'amino',\n", + " '##sal',\n", + " '##icy',\n", + " '##lic',\n", + " 'acid',\n", + " 'and',\n", + " 'su',\n", + " '##cra',\n", + " '##lfa',\n", + " '##te',\n", + " ';',\n", + " 'in',\n", + " 'addition',\n", + " ',',\n", + " 'three',\n", + " 'more',\n", + " 'the',\n", + " '##rap',\n", + " '##eu',\n", + " '##tic',\n", + " 're',\n", + " '##ctos',\n", + " '##co',\n", + " '##pies',\n", + " 'were',\n", + " 'performed',\n", + " ',',\n", + " 'apply',\n", + " '##ing',\n", + " 'ap',\n", + " '##c',\n", + " 'treatment',\n", + " ',',\n", + " 'despite',\n", + " 'which',\n", + " 'the',\n", + " 'episodes',\n", + " 'of',\n", + " 'prof',\n", + " '##use',\n", + " 'rector',\n", + " '##r',\n", + " '##ha',\n", + " '##gia',\n", + " 'with',\n", + " 'hae',\n", + " '##mo',\n", + " '##dyn',\n", + " '##ami',\n", + " '##c',\n", + " 'ins',\n", + " '##tab',\n", + " '##ility',\n", + " 'pers',\n", + " '##isted',\n", + " ',',\n", + " 'requiring',\n", + " 'trans',\n", + " '##fus',\n", + " '##ion',\n", + " 'of',\n", + " 'a',\n", + " 'total',\n", + " 'of',\n", + " '21',\n", + " 'red',\n", + " 'blood',\n", + " 'cell',\n", + " 'con',\n", + " '##centra',\n", + " '##tes',\n", + " 'throughout',\n", + " 'the',\n", + " 'patient',\n", + " \"'\",\n", + " 's',\n", + " 'hospital',\n", + " '##isation',\n", + " 'period',\n", + " ',',\n", + " 'despite',\n", + " 'also',\n", + " 'receiving',\n", + " 'oral',\n", + " 'and',\n", + " 'intra',\n", + " '##veno',\n", + " '##us',\n", + " 'ferro',\n", + " '##ther',\n", + " '##ap',\n", + " '##y',\n", + " '.',\n", + " 'in',\n", + " 'the',\n", + " 'absence',\n", + " 'of',\n", + " 'response',\n", + " 'to',\n", + " 'these',\n", + " 'treatment',\n", + " '##s',\n", + " ',',\n", + " 'the',\n", + " 'case',\n", + " 'was',\n", + " 'discussed',\n", + " 'with',\n", + " 'the',\n", + " 'surgeon',\n", + " 'for',\n", + " 'topic',\n", + " '##al',\n", + " 'treatment',\n", + " 'with',\n", + " 'formal',\n", + " '##in',\n", + " ',',\n", + " 'who',\n", + " ',',\n", + " 'using',\n", + " 'spin',\n", + " '##al',\n", + " 'ana',\n", + " '##est',\n", + " '##hes',\n", + " '##ia',\n", + " 'and',\n", + " 'anal',\n", + " 'dil',\n", + " '##ata',\n", + " '##tion',\n", + " ',',\n", + " 'treated',\n", + " 'the',\n", + " 'recta',\n", + " '##l',\n", + " 'am',\n", + " '##pul',\n", + " '##la',\n", + " 'for',\n", + " '10',\n", + " 'minutes',\n", + " 'with',\n", + " 'a',\n", + " '10',\n", + " '%',\n", + " 'formal',\n", + " '##in',\n", + " 'solution',\n", + " '200',\n", + " 'ml',\n", + " '+',\n", + " '300',\n", + " 'ml',\n", + " 'of',\n", + " 'water',\n", + " ';',\n", + " 'the',\n", + " 'patient',\n", + " \"'\",\n", + " 's',\n", + " 'tol',\n", + " '##erance',\n", + " 'to',\n", + " 'the',\n", + " 'procedure',\n", + " 'was',\n", + " 'excellent',\n", + " ',',\n", + " 'it',\n", + " 'was',\n", + " 'carried',\n", + " 'out',\n", + " 'without',\n", + " 'any',\n", + " 'com',\n", + " '##plications',\n", + " 'and',\n", + " 'from',\n", + " 'that',\n", + " 'moment',\n", + " 'onwards',\n", + " 'the',\n", + " 'patient',\n", + " 'was',\n", + " 'completely',\n", + " 'as',\n", + " '##ym',\n", + " '##pt',\n", + " '##oma',\n", + " '##tic',\n", + " 'without',\n", + " 'new',\n", + " 'episodes',\n", + " 'of',\n", + " 'hae',\n", + " '##mor',\n", + " '##r',\n", + " '##ha',\n", + " '##gic',\n", + " 'external',\n", + " '##isation',\n", + " ',',\n", + " 'hae',\n", + " '##mo',\n", + " '##dyn',\n", + " '##ami',\n", + " '##c',\n", + " 'ins',\n", + " '##tab',\n", + " '##ility',\n", + " 'or',\n", + " 'new',\n", + " 'trans',\n", + " '##fus',\n", + " '##ion',\n", + " 'requirements',\n", + " ';',\n", + " 'she',\n", + " 'was',\n", + " 'disc',\n", + " '##harge',\n", + " '##d',\n", + " 'and',\n", + " 'a',\n", + " 'follow',\n", + " '-',\n", + " 'up',\n", + " 'colonos',\n", + " '##co',\n", + " '##py',\n", + " 'was',\n", + " 'proposed',\n", + " 'after',\n", + " 'the',\n", + " 'treatment',\n", + " ',',\n", + " 'but',\n", + " 'she',\n", + " 'did',\n", + " 'not',\n", + " 'accept',\n", + " '.',\n", + " 'four',\n", + " 'months',\n", + " 'later',\n", + " ',',\n", + " 'she',\n", + " 'was',\n", + " 'admitted',\n", + " 'again',\n", + " 'for',\n", + " 'clinical',\n", + " 'signs',\n", + " 'of',\n", + " 'tu',\n", + " '##mou',\n", + " '##r',\n", + " 'progression',\n", + " ',',\n", + " 'and',\n", + " 'sy',\n", + " '##mpt',\n", + " '##oma',\n", + " '##tic',\n", + " 'treatment',\n", + " 'was',\n", + " 'decided',\n", + " 'by',\n", + " 'the',\n", + " 'on',\n", + " '##cology',\n", + " 'department',\n", + " ',',\n", + " 'and',\n", + " 'the',\n", + " 'patient',\n", + " 'died',\n", + " ',',\n", + " 'but',\n", + " 'without',\n", + " 're',\n", + " '##cu',\n", + " '##rren',\n", + " '##ce',\n", + " 'of',\n", + " 'the',\n", + " 'rector',\n", + " '##r',\n", + " '##ha',\n", + " '##gia',\n", + " '.',\n", + " '[SEP]']" + ] + }, + "metadata": {}, + "execution_count": 91 + } + ], + "source": [ + "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", + "inputs.tokens()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EQskMV19KpLh", + "outputId": "9fbf1919-5853-4cd7-ea80-475edeb3b482" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]\n" + ] + } + ], + "source": [ + "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "word_ids = inputs.word_ids()\n", + "print(labels)\n", + "print(align_labels_with_tokens(labels, word_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81, + "referenced_widgets": [ + "84631a71d2ca4ae8a781019ea3ce6da9", + "9666c2ff4f32449ea3cbef076a166836", + "efc4af4547804d9daae235691942e73a", + "c8194cd34f554a789359eab6e7596291", + "54f649d297ec4456be5b5df14497fb93", + "e35bc6f834c4490e85ac2ae25d9e922f", + "b166c61328bb49ea803f0d3a7d515d81", + "a2aac740ef3b4f3c913b71c82b408c2c", + "d77e188e43dc4e01b82054f2a6a8e832", + "266b2c90bb4d41198784f016e996066a", + "cc62b20cee8c4a4b8e24576d1c854fbf", + "98bebe04cb254369bb3b6b991d4b2648", + "40080df663cc43749963657150cf632d", + "a04f157a98db4d47b75094b6ef1b0990", + "ba2967950f4c483ea399827046f52963", + "a9e0ad6a141a462fb9bea1c18d447332", + "31fbaf0ffb0845f5800e6fca0353b929", + "4a2c17e757d34547a4a68718ef064073", + "f74d219071ab49479194f1061bf343be", + "a92097360dba4d5c848b48e345b0028e", + "24a164b22a8f4e00944ef05bcec5d032", + "976ade0b37cd43e2aa5aa272dac2445b" + ] + }, + "id": "z_6q0eitKpLh", + "outputId": "4c9c0ce3-5855-49e3-c4da-bbe068361c63" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/1 [00:00<?, ?ba/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "84631a71d2ca4ae8a781019ea3ce6da9" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/1 [00:00<?, ?ba/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "98bebe04cb254369bb3b6b991d4b2648" + } + }, + "metadata": {} + } + ], + "source": [ + "tokenized_datasets = raw_datasets.map(\n", + " tokenize_and_align_labels,\n", + " batched=True,\n", + " remove_columns=raw_datasets[\"train\"].column_names,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "id": "K3GGBsIIKpLh" + }, + "outputs": [], + "source": [ + "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1M1bHT_pKpLi", + "outputId": "19fc29d4-cd88-4fe7-e963-71a59d0434ca" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "tensor([[-100, 0, 0, ..., 0, 0, -100],\n", + " [-100, 0, 0, ..., -100, -100, -100]])" + ] + }, + "metadata": {}, + "execution_count": 95 + } + ], + "source": [ + "batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(2)])\n", + "batch[\"labels\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "id": "maAO4nNXKpLi" + }, + "outputs": [], + "source": [ + "id2label = {str(i): label for i, label in enumerate(label_names)}\n", + "label2id = {v: k for k, v in id2label.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 954, + "referenced_widgets": [ + "21b02caa5dc146b8ac2bd1a282381c7f", + "0c01b39e34744f74ae47d0d4e70638ce", + "82e7357418144359abba3548449c0c08", + "2ff5e18b6d684b99a82676dbf3db6d32", + "cd75e771337843d9b55838502bed9a1b", + "b6a4250c705f4dd8b9f52731cce2a23d", + "4549eb0838864025ac6a0f3da9192818", + "b2d377844c1a4bc09433a94088f5213e", + "fe7e058b9a6944969d83f7e72b398bb1", + "47d5ccd1eafe4ea1a3476e06d998bd74", + "5a17027205cb4c2bbe140e1e96e4b495" + ] + }, + "id": "Q0T8WOGBKpLi", + "outputId": "d950423b-480b-495e-f02e-667ec124dcb4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json\n", + "Model config BertConfig {\n", + " \"_name_or_path\": \"pucpr/clinicalnerpt-medical\",\n", + " \"_num_labels\": 3,\n", + " \"architectures\": [\n", + " \"BertForTokenClassification\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"directionality\": \"bidi\",\n", + " \"eos_token_ids\": null,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-MedicalDevice\",\n", + " \"2\": \"I-MedicalDevice\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"B-MedicalDevice\": 1,\n", + " \"I-MedicalDevice\": 2,\n", + " \"O\": 0\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": null,\n", + " \"pooler_fc_size\": 768,\n", + " \"pooler_num_attention_heads\": 12,\n", + " \"pooler_num_fc_layers\": 3,\n", + " \"pooler_size_per_head\": 128,\n", + " \"pooler_type\": \"first_token_transform\",\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 2,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/709M [00:00<?, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "21b02caa5dc146b8ac2bd1a282381c7f" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/pytorch_model.bin\n", + "All model checkpoint weights were used when initializing BertForTokenClassification.\n", + "\n", + "All the weights of BertForTokenClassification were initialized from the model checkpoint at pucpr/clinicalnerpt-medical.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForTokenClassification for predictions without further training.\n" + ] + } + ], + "source": [ + "model = AutoModelForTokenClassification.from_pretrained( \n", + " model_checkpoint\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "id": "k6ExcF0UKpLi", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "68661940-56bd-4adc-ec64-37f1db50604c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n" + ] + } + ], + "source": [ + "args = TrainingArguments(\n", + " \"NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased\",\n", + " evaluation_strategy = IntervalStrategy.STEPS,\n", + " eval_steps = 50,\n", + " learning_rate=5e-5,\n", + " num_train_epochs=50,\n", + " weight_decay=0.01,\n", + " metric_for_best_model = 'f1',\n", + " load_best_model_at_end=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "gQSIB3FfKpLj", + "outputId": "72ede653-dc87-4230-94b2-805a52f1f5d2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:310: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " FutureWarning,\n", + "***** Running training *****\n", + " Num examples = 555\n", + " Num Epochs = 50\n", + " Instantaneous batch size per device = 8\n", + " Total train batch size (w. parallel, distributed & accumulation) = 8\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 3500\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "<IPython.core.display.HTML object>" + ], + "text/html": [ + "\n", + " <div>\n", + " \n", + " <progress value='1150' max='3500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", + " [1150/3500 05:32 < 11:20, 3.45 it/s, Epoch 16/50]\n", + " </div>\n", + " <table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: left;\">\n", + " <th>Step</th>\n", + " <th>Training Loss</th>\n", + " <th>Validation Loss</th>\n", + " <th>Precision</th>\n", + " <th>Recall</th>\n", + " <th>F1</th>\n", + " <th>Accuracy</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <td>50</td>\n", + " <td>No log</td>\n", + " <td>0.160077</td>\n", + " <td>0.281675</td>\n", + " <td>0.325786</td>\n", + " <td>0.302129</td>\n", + " <td>0.934798</td>\n", + " </tr>\n", + " <tr>\n", + " <td>100</td>\n", + " <td>No log</td>\n", + " <td>0.156098</td>\n", + " <td>0.440468</td>\n", + " <td>0.425786</td>\n", + " <td>0.433003</td>\n", + " <td>0.941619</td>\n", + " </tr>\n", + " <tr>\n", + " <td>150</td>\n", + " <td>No log</td>\n", + " <td>0.175201</td>\n", + " <td>0.430540</td>\n", + " <td>0.491195</td>\n", + " <td>0.458872</td>\n", + " <td>0.943305</td>\n", + " </tr>\n", + " <tr>\n", + " <td>200</td>\n", + " <td>No log</td>\n", + " <td>0.181505</td>\n", + " <td>0.465023</td>\n", + " <td>0.438994</td>\n", + " <td>0.451634</td>\n", + " <td>0.944750</td>\n", + " </tr>\n", + " <tr>\n", + " <td>250</td>\n", + " <td>No log</td>\n", + " <td>0.198468</td>\n", + " <td>0.437112</td>\n", + " <td>0.487421</td>\n", + " <td>0.460898</td>\n", + " <td>0.942113</td>\n", + " </tr>\n", + " <tr>\n", + " <td>300</td>\n", + " <td>No log</td>\n", + " <td>0.213199</td>\n", + " <td>0.454245</td>\n", + " <td>0.518239</td>\n", + " <td>0.484136</td>\n", + " <td>0.943191</td>\n", + " </tr>\n", + " <tr>\n", + " <td>350</td>\n", + " <td>No log</td>\n", + " <td>0.225870</td>\n", + " <td>0.439457</td>\n", + " <td>0.529560</td>\n", + " <td>0.480319</td>\n", + " <td>0.941974</td>\n", + " </tr>\n", + " <tr>\n", + " <td>400</td>\n", + " <td>No log</td>\n", + " <td>0.252911</td>\n", + " <td>0.509901</td>\n", + " <td>0.453459</td>\n", + " <td>0.480027</td>\n", + " <td>0.945802</td>\n", + " </tr>\n", + " <tr>\n", + " <td>450</td>\n", + " <td>No log</td>\n", + " <td>0.248783</td>\n", + " <td>0.475821</td>\n", + " <td>0.501258</td>\n", + " <td>0.488208</td>\n", + " <td>0.945092</td>\n", + " </tr>\n", + " <tr>\n", + " <td>500</td>\n", + " <td>0.078000</td>\n", + " <td>0.230288</td>\n", + " <td>0.441730</td>\n", + " <td>0.545912</td>\n", + " <td>0.488326</td>\n", + " <td>0.941631</td>\n", + " </tr>\n", + " <tr>\n", + " <td>550</td>\n", + " <td>0.078000</td>\n", + " <td>0.257168</td>\n", + " <td>0.470556</td>\n", + " <td>0.537736</td>\n", + " <td>0.501908</td>\n", + " <td>0.944002</td>\n", + " </tr>\n", + " <tr>\n", + " <td>600</td>\n", + " <td>0.078000</td>\n", + " <td>0.264445</td>\n", + " <td>0.509816</td>\n", + " <td>0.506289</td>\n", + " <td>0.508047</td>\n", + " <td>0.946145</td>\n", + " </tr>\n", + " <tr>\n", + " <td>650</td>\n", + " <td>0.078000</td>\n", + " <td>0.277278</td>\n", + " <td>0.448800</td>\n", + " <td>0.576101</td>\n", + " <td>0.504544</td>\n", + " <td>0.943115</td>\n", + " </tr>\n", + " <tr>\n", + " <td>700</td>\n", + " <td>0.078000</td>\n", + " <td>0.278563</td>\n", + " <td>0.504016</td>\n", + " <td>0.473585</td>\n", + " <td>0.488327</td>\n", + " <td>0.946145</td>\n", + " </tr>\n", + " <tr>\n", + " <td>750</td>\n", + " <td>0.078000</td>\n", + " <td>0.295773</td>\n", + " <td>0.501239</td>\n", + " <td>0.508805</td>\n", + " <td>0.504994</td>\n", + " <td>0.946436</td>\n", + " </tr>\n", + " <tr>\n", + " <td>800</td>\n", + " <td>0.078000</td>\n", + " <td>0.300880</td>\n", + " <td>0.491329</td>\n", + " <td>0.534591</td>\n", + " <td>0.512048</td>\n", + " <td>0.945333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>850</td>\n", + " <td>0.078000</td>\n", + " <td>0.315745</td>\n", + " <td>0.503695</td>\n", + " <td>0.514465</td>\n", + " <td>0.509023</td>\n", + " <td>0.944154</td>\n", + " </tr>\n", + " <tr>\n", + " <td>900</td>\n", + " <td>0.078000</td>\n", + " <td>0.303611</td>\n", + " <td>0.461154</td>\n", + " <td>0.522642</td>\n", + " <td>0.489976</td>\n", + " <td>0.944839</td>\n", + " </tr>\n", + " <tr>\n", + " <td>950</td>\n", + " <td>0.078000</td>\n", + " <td>0.309813</td>\n", + " <td>0.502151</td>\n", + " <td>0.513836</td>\n", + " <td>0.507927</td>\n", + " <td>0.947552</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1000</td>\n", + " <td>0.009800</td>\n", + " <td>0.335969</td>\n", + " <td>0.520309</td>\n", + " <td>0.507547</td>\n", + " <td>0.513849</td>\n", + " <td>0.946829</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1050</td>\n", + " <td>0.009800</td>\n", + " <td>0.313057</td>\n", + " <td>0.484645</td>\n", + " <td>0.545912</td>\n", + " <td>0.513458</td>\n", + " <td>0.946018</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1100</td>\n", + " <td>0.009800</td>\n", + " <td>0.297767</td>\n", + " <td>0.478187</td>\n", + " <td>0.530818</td>\n", + " <td>0.503130</td>\n", + " <td>0.947780</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1150</td>\n", + " <td>0.009800</td>\n", + " <td>0.317451</td>\n", + " <td>0.496936</td>\n", + " <td>0.510063</td>\n", + " <td>0.503414</td>\n", + " <td>0.946601</td>\n", + " </tr>\n", + " </tbody>\n", + "</table><p>" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 186\n", + " Batch size = 8\n", + "\n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "Loading best model from NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000 (score: 0.5138490926456541).\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=1150, training_loss=0.03881311048632083, metrics={'train_runtime': 332.8101, 'train_samples_per_second': 83.381, 'train_steps_per_second': 10.517, 'total_flos': 2382353890443360.0, 'train_loss': 0.03881311048632083, 'epoch': 16.43})" + ] + }, + "metadata": {}, + "execution_count": 99 + } + ], + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=args,\n", + " train_dataset=tokenized_datasets[\"train\"],\n", + " eval_dataset=tokenized_datasets[\"validation\"],\n", + " data_collator=data_collator,\n", + " compute_metrics=compute_metrics,\n", + " tokenizer=tokenizer,\n", + " callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]\n", + ")\n", + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "arJA0rVIKpLj", + "outputId": "12b3b4d0-2f59-4f46-9f20-e2aa3cd35398" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Saving model checkpoint to model/multilingual-BERT-all-tokens\n", + "Configuration saved in model/multilingual-BERT-all-tokens/config.json\n", + "Model weights saved in model/multilingual-BERT-all-tokens/pytorch_model.bin\n", + "tokenizer config file saved in model/multilingual-BERT-all-tokens/tokenizer_config.json\n", + "Special tokens file saved in model/multilingual-BERT-all-tokens/special_tokens_map.json\n" + ] + } + ], + "source": [ + "trainer.save_model('model/multilingual-BERT-all-tokens')" + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "D3Bdj1H9TxOp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "kCThHmWQWZ57" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Model 2 - pucpr/clinicalnerpt-medical\n", + "\n", + "### Sentence Based Modelling" + ], + "metadata": { + "id": "5fN4MRRx_fFZ" + } + }, + { + "cell_type": "code", + "source": [ + "dic = {\"tokens\": sent_tokenized, \"ner_tags\": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by \". \" b" + ], + "metadata": { + "id": "_E60UP1w_fFZ" + }, + "execution_count": 101, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "id": "DY7gaYbj_fFa" + }, + "outputs": [], + "source": [ + "dataset = Dataset.from_dict(dic)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "44a9f0cc-c1cf-4064-aa89-baffa435cf52", + "id": "FcLEVVcn_fFa" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 11668\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 103 + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "id": "vZqWyVo2_fFa" + }, + "outputs": [], + "source": [ + "#For training, validation, and test partitions\n", + "\"\"\"\n", + "#Train, val, test partitions\n", + "train_test = dataset.train_test_split()\n", + "test_val = train_test['test'].train_test_split()\n", + "raw_datasets = DatasetDict({\n", + " 'train': train_test['train'],\n", + " 'validation': test_val['train'],\n", + " 'test': test_val['test']\n", + " })\n", + "\"\"\"\n", + "\n", + "#Just for training and validation partitions\n", + "train_test = dataset.train_test_split()\n", + "raw_datasets = DatasetDict({\n", + " 'train': train_test['train'],\n", + " 'validation': train_test['test']\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "ddadc450-b61a-43f9-e5fd-90c8db32de6b", + "id": "GDv9hwpm_fFa" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 8751\n", + " })\n", + " validation: Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 2917\n", + " })\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 105 + } + ], + "source": [ + "raw_datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "54b152d5-edaa-482d-afb5-a209403c7dbc", + "id": "ZnB_fXql_fFa" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]" + ] + }, + "metadata": {}, + "execution_count": 106 + } + ], + "source": [ + "raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "#raw_datasets[\"train\"][0][\"pos_tags\"]\n", + "#raw_datasets[\"train\"][0][\"chunk_tags\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a7beafe3-27ce-4e03-904a-3b8a8e06d831", + "id": "egmgQAVt_fFa" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['tokens', 'ner_tags'],\n", + " num_rows: 8751\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 107 + } + ], + "source": [ + "raw_datasets['train']" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e0cd0121-e53b-4d8c-9b33-2d093d604378", + "id": "Hize89uK_fFa" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['O', 'B', 'I']" + ] + }, + "metadata": {}, + "execution_count": 108 + } + ], + "source": [ + "label_names = ['O','B','I']\n", + "label_names" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a9fe31c8-45c9-4873-fb2c-cf24bd7c64a9", + "id": "bQf54Sst_fFa" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Since 2006 she had tried several times to lose weight , without success . \n", + "O O O O O O O O O O O O O O \n" + ] + } + ], + "source": [ + "words = raw_datasets[\"train\"][0][\"tokens\"]\n", + "labels = [int(n) for n in raw_datasets[\"train\"][0][\"ner_tags\"]]\n", + "#labels = raw_datasets[\"train\"][0][\"pos_tags\"]\n", + "#labels = raw_datasets[\"train\"][0][\"chunk_tags\"]\n", + "line1 = \"\"\n", + "line2 = \"\"\n", + "for word, label in zip(words, labels):\n", + " full_label = label_names[label]\n", + " max_length = max(len(word), len(full_label))\n", + " line1 += word + \" \" * (max_length - len(word) + 1)\n", + " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", + "\n", + "print(line1)\n", + "print(line2)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "53863be6-1112-4c14-b34b-529492b236dc", + "id": "cNKxSKCT_fFb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json\n", + "Model config BertConfig {\n", + " \"_name_or_path\": \"pucpr/clinicalnerpt-medical\",\n", + " \"_num_labels\": 3,\n", + " \"architectures\": [\n", + " \"BertForTokenClassification\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"directionality\": \"bidi\",\n", + " \"eos_token_ids\": null,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-MedicalDevice\",\n", + " \"2\": \"I-MedicalDevice\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"B-MedicalDevice\": 1,\n", + " \"I-MedicalDevice\": 2,\n", + " \"O\": 0\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": null,\n", + " \"pooler_fc_size\": 768,\n", + " \"pooler_num_attention_heads\": 12,\n", + " \"pooler_num_fc_layers\": 3,\n", + " \"pooler_size_per_head\": 128,\n", + " \"pooler_type\": \"first_token_transform\",\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 2,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n", + "loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/vocab.txt\n", + "loading file tokenizer.json from cache at None\n", + "loading file added_tokens.json from cache at None\n", + "loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/special_tokens_map.json\n", + "loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/tokenizer_config.json\n", + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json\n", + "Model config BertConfig {\n", + " \"_name_or_path\": \"pucpr/clinicalnerpt-medical\",\n", + " \"_num_labels\": 3,\n", + " \"architectures\": [\n", + " \"BertForTokenClassification\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"directionality\": \"bidi\",\n", + " \"eos_token_ids\": null,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-MedicalDevice\",\n", + " \"2\": \"I-MedicalDevice\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"B-MedicalDevice\": 1,\n", + " \"I-MedicalDevice\": 2,\n", + " \"O\": 0\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": null,\n", + " \"pooler_fc_size\": 768,\n", + " \"pooler_num_attention_heads\": 12,\n", + " \"pooler_num_fc_layers\": 3,\n", + " \"pooler_size_per_head\": 128,\n", + " \"pooler_type\": \"first_token_transform\",\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 2,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n", + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json\n", + "Model config BertConfig {\n", + " \"_name_or_path\": \"pucpr/clinicalnerpt-medical\",\n", + " \"_num_labels\": 3,\n", + " \"architectures\": [\n", + " \"BertForTokenClassification\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"directionality\": \"bidi\",\n", + " \"eos_token_ids\": null,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-MedicalDevice\",\n", + " \"2\": \"I-MedicalDevice\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"B-MedicalDevice\": 1,\n", + " \"I-MedicalDevice\": 2,\n", + " \"O\": 0\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": null,\n", + " \"pooler_fc_size\": 768,\n", + " \"pooler_num_attention_heads\": 12,\n", + " \"pooler_num_fc_layers\": 3,\n", + " \"pooler_size_per_head\": 128,\n", + " \"pooler_type\": \"first_token_transform\",\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 2,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "model_checkpoint = \"pucpr/clinicalnerpt-medical\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1b408280-d763-4487-e0c0-06823ddc8daa", + "id": "Gm0lsFPN_fFb" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 111 + } + ], + "source": [ + "tokenizer.is_fast" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a3d8f036-bf64-4f4d-e793-9244e9bd1043", + "id": "5Awp28lM_fFb" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['[CLS]',\n", + " 'since',\n", + " '2006',\n", + " 'she',\n", + " 'had',\n", + " 'tried',\n", + " 'several',\n", + " 'times',\n", + " 'to',\n", + " 'lose',\n", + " 'weight',\n", + " ',',\n", + " 'without',\n", + " 'success',\n", + " '.',\n", + " '[SEP]']" + ] + }, + "metadata": {}, + "execution_count": 112 + } + ], + "source": [ + "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", + "inputs.tokens()" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "65b0481e-76fe-4f0a-e7e6-daa6cee662d1", + "id": "Oqi5M1ll_fFb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]\n" + ] + } + ], + "source": [ + "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "word_ids = inputs.word_ids()\n", + "print(labels)\n", + "print(align_labels_with_tokens(labels, word_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81, + "referenced_widgets": [ + "49642b493a2d4c3592ed010663ef789c", + "fcc50d75dfb04fccb26c9b93bf8f1efa", + "dc630459e6564d69833d46b63493a160", + "31d0648af26b4fe797f2cb2ff21336a8", + "90e567bcc88445f695a896af6d8da649", + "2e815baaae8940ffb90e2aaf5c6f7e2a", + "32250602bfc140d18859e6b48f9dbfbc", + "a85179eeb7d94bba8c79a10a734d8c6b", + "7346605d4df548afb179d489217990ff", + "6b1a06b7d6ea43da8b9af7f92a882455", + "a1b4b4cc26ff4c50b4ef5c99d8c7cb3e", + "7b3ce57b6b5e4253b8219adc2c6ff47e", + "ee7bc9576ea44b97a6ad9d2ae8adaec7", + "c012f774a21e44188365f5b0646b422e", + "5899086ed64c4d3185374a7f541e22fe", + "3aaf8990b0574a9183b9902eb33670a5", + "cc3d6c6b95ab4c80983b3ce2175acb8a", + "873c876ae8f64b9bba4f25efa3a3859a", + "d727d1396df2443e9e46ab7e0c7d5276", + "cda8e993793c4b949910e309a3f50a03", + "5f38c6b988b44678a7b7f06a99daa983", + "7b53cf7c82a6439eb94d5f0635afe2f3" + ] + }, + "outputId": "774134e7-c481-40c4-df12-902897c28ad6", + "id": "fVvvny2D_fFb" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/9 [00:00<?, ?ba/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "49642b493a2d4c3592ed010663ef789c" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/3 [00:00<?, ?ba/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "7b3ce57b6b5e4253b8219adc2c6ff47e" + } + }, + "metadata": {} + } + ], + "source": [ + "tokenized_datasets = raw_datasets.map(\n", + " tokenize_and_align_labels,\n", + " batched=True,\n", + " remove_columns=raw_datasets[\"train\"].column_names,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "id": "MKc1_p7W_fFb" + }, + "outputs": [], + "source": [ + "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e4e50bbf-a7bd-485b-834b-05949304f311", + "id": "-u-nUkGH_fFb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "tensor([[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n", + " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],\n", + " [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, -100]])" + ] + }, + "metadata": {}, + "execution_count": 116 + } + ], + "source": [ + "batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(2)])\n", + "batch[\"labels\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": { + "id": "_hW6-lz2_fFb" + }, + "outputs": [], + "source": [ + "id2label = {str(i): label for i, label in enumerate(label_names)}\n", + "label2id = {v: k for k, v in id2label.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "4b8291fc-97e1-4d18-9713-7350ec2f075e", + "id": "7zhu1mtB_fFb" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json\n", + "Model config BertConfig {\n", + " \"_name_or_path\": \"pucpr/clinicalnerpt-medical\",\n", + " \"_num_labels\": 3,\n", + " \"architectures\": [\n", + " \"BertForTokenClassification\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"directionality\": \"bidi\",\n", + " \"eos_token_ids\": null,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"O\",\n", + " \"1\": \"B-MedicalDevice\",\n", + " \"2\": \"I-MedicalDevice\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"B-MedicalDevice\": 1,\n", + " \"I-MedicalDevice\": 2,\n", + " \"O\": 0\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": null,\n", + " \"pooler_fc_size\": 768,\n", + " \"pooler_num_attention_heads\": 12,\n", + " \"pooler_num_fc_layers\": 3,\n", + " \"pooler_size_per_head\": 128,\n", + " \"pooler_type\": \"first_token_transform\",\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 2,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 119547\n", + "}\n", + "\n", + "loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/pytorch_model.bin\n", + "All model checkpoint weights were used when initializing BertForTokenClassification.\n", + "\n", + "All the weights of BertForTokenClassification were initialized from the model checkpoint at pucpr/clinicalnerpt-medical.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForTokenClassification for predictions without further training.\n" + ] + } + ], + "source": [ + "model = AutoModelForTokenClassification.from_pretrained( \n", + " model_checkpoint\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7f9279a0-4c0a-41ea-c39c-caf895886d2a", + "id": "-4pqS6QR_fFc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n" + ] + } + ], + "source": [ + "args = TrainingArguments(\n", + " \"NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased\",\n", + " evaluation_strategy = IntervalStrategy.STEPS,\n", + " eval_steps = 50,\n", + " learning_rate=5e-5,\n", + " num_train_epochs=50,\n", + " weight_decay=0.01,\n", + " metric_for_best_model = 'f1',\n", + " load_best_model_at_end=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "438ca67b-ebf6-49f3-f3ad-03827a51e196", + "id": "Xrf-cYnW_fFc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:310: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " FutureWarning,\n", + "***** Running training *****\n", + " Num examples = 8751\n", + " Num Epochs = 50\n", + " Instantaneous batch size per device = 8\n", + " Total train batch size (w. parallel, distributed & accumulation) = 8\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 54700\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "<IPython.core.display.HTML object>" + ], + "text/html": [ + "\n", + " <div>\n", + " \n", + " <progress value='2750' max='54700' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", + " [ 2750/54700 10:53 < 3:26:01, 4.20 it/s, Epoch 2/50]\n", + " </div>\n", + " <table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: left;\">\n", + " <th>Step</th>\n", + " <th>Training Loss</th>\n", + " <th>Validation Loss</th>\n", + " <th>Precision</th>\n", + " <th>Recall</th>\n", + " <th>F1</th>\n", + " <th>Accuracy</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <td>50</td>\n", + " <td>No log</td>\n", + " <td>0.213414</td>\n", + " <td>0.235529</td>\n", + " <td>0.183135</td>\n", + " <td>0.206054</td>\n", + " <td>0.927267</td>\n", + " </tr>\n", + " <tr>\n", + " <td>100</td>\n", + " <td>No log</td>\n", + " <td>0.185129</td>\n", + " <td>0.241594</td>\n", + " <td>0.200724</td>\n", + " <td>0.219271</td>\n", + " <td>0.927296</td>\n", + " </tr>\n", + " <tr>\n", + " <td>150</td>\n", + " <td>No log</td>\n", + " <td>0.175822</td>\n", + " <td>0.340069</td>\n", + " <td>0.304708</td>\n", + " <td>0.321419</td>\n", + " <td>0.933018</td>\n", + " </tr>\n", + " <tr>\n", + " <td>200</td>\n", + " <td>No log</td>\n", + " <td>0.175697</td>\n", + " <td>0.278355</td>\n", + " <td>0.332644</td>\n", + " <td>0.303087</td>\n", + " <td>0.932284</td>\n", + " </tr>\n", + " <tr>\n", + " <td>250</td>\n", + " <td>No log</td>\n", + " <td>0.183824</td>\n", + " <td>0.331652</td>\n", + " <td>0.442318</td>\n", + " <td>0.379073</td>\n", + " <td>0.931647</td>\n", + " </tr>\n", + " <tr>\n", + " <td>300</td>\n", + " <td>No log</td>\n", + " <td>0.197462</td>\n", + " <td>0.431220</td>\n", + " <td>0.228660</td>\n", + " <td>0.298851</td>\n", + " <td>0.932342</td>\n", + " </tr>\n", + " <tr>\n", + " <td>350</td>\n", + " <td>No log</td>\n", + " <td>0.167895</td>\n", + " <td>0.386070</td>\n", + " <td>0.433006</td>\n", + " <td>0.408193</td>\n", + " <td>0.937046</td>\n", + " </tr>\n", + " <tr>\n", + " <td>400</td>\n", + " <td>No log</td>\n", + " <td>0.190235</td>\n", + " <td>0.179594</td>\n", + " <td>0.201242</td>\n", + " <td>0.189802</td>\n", + " <td>0.930726</td>\n", + " </tr>\n", + " <tr>\n", + " <td>450</td>\n", + " <td>No log</td>\n", + " <td>0.182239</td>\n", + " <td>0.341333</td>\n", + " <td>0.463528</td>\n", + " <td>0.393155</td>\n", + " <td>0.936134</td>\n", + " </tr>\n", + " <tr>\n", + " <td>500</td>\n", + " <td>0.197700</td>\n", + " <td>0.161157</td>\n", + " <td>0.377800</td>\n", + " <td>0.383859</td>\n", + " <td>0.380806</td>\n", + " <td>0.938074</td>\n", + " </tr>\n", + " <tr>\n", + " <td>550</td>\n", + " <td>0.197700</td>\n", + " <td>0.158413</td>\n", + " <td>0.422494</td>\n", + " <td>0.375065</td>\n", + " <td>0.397369</td>\n", + " <td>0.940534</td>\n", + " </tr>\n", + " <tr>\n", + " <td>600</td>\n", + " <td>0.197700</td>\n", + " <td>0.155527</td>\n", + " <td>0.395566</td>\n", + " <td>0.350750</td>\n", + " <td>0.371812</td>\n", + " <td>0.939897</td>\n", + " </tr>\n", + " <tr>\n", + " <td>650</td>\n", + " <td>0.197700</td>\n", + " <td>0.159286</td>\n", + " <td>0.323269</td>\n", + " <td>0.321262</td>\n", + " <td>0.322263</td>\n", + " <td>0.938202</td>\n", + " </tr>\n", + " <tr>\n", + " <td>700</td>\n", + " <td>0.197700</td>\n", + " <td>0.171928</td>\n", + " <td>0.381213</td>\n", + " <td>0.503880</td>\n", + " <td>0.434046</td>\n", + " <td>0.933910</td>\n", + " </tr>\n", + " <tr>\n", + " <td>750</td>\n", + " <td>0.197700</td>\n", + " <td>0.147573</td>\n", + " <td>0.323820</td>\n", + " <td>0.305225</td>\n", + " <td>0.314248</td>\n", + " <td>0.942298</td>\n", + " </tr>\n", + " <tr>\n", + " <td>800</td>\n", + " <td>0.197700</td>\n", + " <td>0.148027</td>\n", + " <td>0.425638</td>\n", + " <td>0.439731</td>\n", + " <td>0.432570</td>\n", + " <td>0.943326</td>\n", + " </tr>\n", + " <tr>\n", + " <td>850</td>\n", + " <td>0.197700</td>\n", + " <td>0.147193</td>\n", + " <td>0.408436</td>\n", + " <td>0.395758</td>\n", + " <td>0.401997</td>\n", + " <td>0.943993</td>\n", + " </tr>\n", + " <tr>\n", + " <td>900</td>\n", + " <td>0.197700</td>\n", + " <td>0.164249</td>\n", + " <td>0.404822</td>\n", + " <td>0.495085</td>\n", + " <td>0.445427</td>\n", + " <td>0.943277</td>\n", + " </tr>\n", + " <tr>\n", + " <td>950</td>\n", + " <td>0.197700</td>\n", + " <td>0.155629</td>\n", + " <td>0.408925</td>\n", + " <td>0.289188</td>\n", + " <td>0.338788</td>\n", + " <td>0.940769</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1000</td>\n", + " <td>0.165700</td>\n", + " <td>0.185534</td>\n", + " <td>0.289720</td>\n", + " <td>0.080186</td>\n", + " <td>0.125608</td>\n", + " <td>0.925875</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1050</td>\n", + " <td>0.165700</td>\n", + " <td>0.163427</td>\n", + " <td>0.402687</td>\n", + " <td>0.511640</td>\n", + " <td>0.450672</td>\n", + " <td>0.936379</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1100</td>\n", + " <td>0.165700</td>\n", + " <td>0.148432</td>\n", + " <td>0.446659</td>\n", + " <td>0.428867</td>\n", + " <td>0.437582</td>\n", + " <td>0.944943</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1150</td>\n", + " <td>0.165700</td>\n", + " <td>0.160113</td>\n", + " <td>0.457825</td>\n", + " <td>0.426798</td>\n", + " <td>0.441767</td>\n", + " <td>0.943307</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1200</td>\n", + " <td>0.165700</td>\n", + " <td>0.167280</td>\n", + " <td>0.462996</td>\n", + " <td>0.265391</td>\n", + " <td>0.337389</td>\n", + " <td>0.939074</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1250</td>\n", + " <td>0.165700</td>\n", + " <td>0.147611</td>\n", + " <td>0.443478</td>\n", + " <td>0.448526</td>\n", + " <td>0.445988</td>\n", + " <td>0.946579</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1300</td>\n", + " <td>0.165700</td>\n", + " <td>0.147158</td>\n", + " <td>0.447046</td>\n", + " <td>0.434558</td>\n", + " <td>0.440714</td>\n", + " <td>0.945070</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1350</td>\n", + " <td>0.165700</td>\n", + " <td>0.184782</td>\n", + " <td>0.432701</td>\n", + " <td>0.495603</td>\n", + " <td>0.462021</td>\n", + " <td>0.942092</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1400</td>\n", + " <td>0.165700</td>\n", + " <td>0.164378</td>\n", + " <td>0.409772</td>\n", + " <td>0.381790</td>\n", + " <td>0.395287</td>\n", + " <td>0.944522</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1450</td>\n", + " <td>0.165700</td>\n", + " <td>0.161122</td>\n", + " <td>0.489035</td>\n", + " <td>0.461459</td>\n", + " <td>0.474847</td>\n", + " <td>0.946138</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1500</td>\n", + " <td>0.133400</td>\n", + " <td>0.151911</td>\n", + " <td>0.395260</td>\n", + " <td>0.483187</td>\n", + " <td>0.434823</td>\n", + " <td>0.944914</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1550</td>\n", + " <td>0.133400</td>\n", + " <td>0.160618</td>\n", + " <td>0.470175</td>\n", + " <td>0.485256</td>\n", + " <td>0.477597</td>\n", + " <td>0.945551</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1600</td>\n", + " <td>0.133400</td>\n", + " <td>0.153152</td>\n", + " <td>0.458456</td>\n", + " <td>0.485256</td>\n", + " <td>0.471475</td>\n", + " <td>0.944796</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1650</td>\n", + " <td>0.133400</td>\n", + " <td>0.159076</td>\n", + " <td>0.481323</td>\n", + " <td>0.406622</td>\n", + " <td>0.440830</td>\n", + " <td>0.946981</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1700</td>\n", + " <td>0.133400</td>\n", + " <td>0.147369</td>\n", + " <td>0.451056</td>\n", + " <td>0.486291</td>\n", + " <td>0.468011</td>\n", + " <td>0.948882</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1750</td>\n", + " <td>0.133400</td>\n", + " <td>0.157782</td>\n", + " <td>0.416149</td>\n", + " <td>0.554578</td>\n", + " <td>0.475493</td>\n", + " <td>0.942082</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1800</td>\n", + " <td>0.133400</td>\n", + " <td>0.139047</td>\n", + " <td>0.469262</td>\n", + " <td>0.473875</td>\n", + " <td>0.471557</td>\n", + " <td>0.948186</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1850</td>\n", + " <td>0.133400</td>\n", + " <td>0.169168</td>\n", + " <td>0.467036</td>\n", + " <td>0.392137</td>\n", + " <td>0.426322</td>\n", + " <td>0.947804</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1900</td>\n", + " <td>0.133400</td>\n", + " <td>0.190142</td>\n", + " <td>0.451332</td>\n", + " <td>0.482152</td>\n", + " <td>0.466233</td>\n", + " <td>0.946511</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1950</td>\n", + " <td>0.133400</td>\n", + " <td>0.142872</td>\n", + " <td>0.489979</td>\n", + " <td>0.480600</td>\n", + " <td>0.485244</td>\n", + " <td>0.948431</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2000</td>\n", + " <td>0.127700</td>\n", + " <td>0.141305</td>\n", + " <td>0.490028</td>\n", + " <td>0.444904</td>\n", + " <td>0.466377</td>\n", + " <td>0.948823</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2050</td>\n", + " <td>0.127700</td>\n", + " <td>0.154673</td>\n", + " <td>0.441645</td>\n", + " <td>0.516813</td>\n", + " <td>0.476281</td>\n", + " <td>0.944257</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2100</td>\n", + " <td>0.127700</td>\n", + " <td>0.161482</td>\n", + " <td>0.559639</td>\n", + " <td>0.417486</td>\n", + " <td>0.478222</td>\n", + " <td>0.948402</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2150</td>\n", + " <td>0.127700</td>\n", + " <td>0.155931</td>\n", + " <td>0.482567</td>\n", + " <td>0.494051</td>\n", + " <td>0.488241</td>\n", + " <td>0.946942</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2200</td>\n", + " <td>0.127700</td>\n", + " <td>0.180343</td>\n", + " <td>0.503902</td>\n", + " <td>0.467667</td>\n", + " <td>0.485109</td>\n", + " <td>0.947011</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2250</td>\n", + " <td>0.127700</td>\n", + " <td>0.173788</td>\n", + " <td>0.474372</td>\n", + " <td>0.469219</td>\n", + " <td>0.471782</td>\n", + " <td>0.945678</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2300</td>\n", + " <td>0.127700</td>\n", + " <td>0.172453</td>\n", + " <td>0.415626</td>\n", + " <td>0.533885</td>\n", + " <td>0.467391</td>\n", + " <td>0.942180</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2350</td>\n", + " <td>0.127700</td>\n", + " <td>0.177914</td>\n", + " <td>0.406309</td>\n", + " <td>0.553026</td>\n", + " <td>0.468449</td>\n", + " <td>0.935782</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2400</td>\n", + " <td>0.127700</td>\n", + " <td>0.177702</td>\n", + " <td>0.452438</td>\n", + " <td>0.484739</td>\n", + " <td>0.468032</td>\n", + " <td>0.947794</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2450</td>\n", + " <td>0.127700</td>\n", + " <td>0.149222</td>\n", + " <td>0.504098</td>\n", + " <td>0.445422</td>\n", + " <td>0.472947</td>\n", + " <td>0.947657</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2500</td>\n", + " <td>0.106000</td>\n", + " <td>0.176770</td>\n", + " <td>0.487315</td>\n", + " <td>0.476979</td>\n", + " <td>0.482092</td>\n", + " <td>0.947148</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2550</td>\n", + " <td>0.106000</td>\n", + " <td>0.159687</td>\n", + " <td>0.442256</td>\n", + " <td>0.511123</td>\n", + " <td>0.474202</td>\n", + " <td>0.944669</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2600</td>\n", + " <td>0.106000</td>\n", + " <td>0.185752</td>\n", + " <td>0.529092</td>\n", + " <td>0.503363</td>\n", + " <td>0.515907</td>\n", + " <td>0.949421</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2650</td>\n", + " <td>0.106000</td>\n", + " <td>0.166756</td>\n", + " <td>0.449671</td>\n", + " <td>0.494568</td>\n", + " <td>0.471052</td>\n", + " <td>0.945943</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2700</td>\n", + " <td>0.106000</td>\n", + " <td>0.174073</td>\n", + " <td>0.432521</td>\n", + " <td>0.477496</td>\n", + " <td>0.453897</td>\n", + " <td>0.946609</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2750</td>\n", + " <td>0.106000</td>\n", + " <td>0.186960</td>\n", + " <td>0.526675</td>\n", + " <td>0.439214</td>\n", + " <td>0.478984</td>\n", + " <td>0.947981</td>\n", + " </tr>\n", + " </tbody>\n", + "</table><p>" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1000/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-1500/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2000\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2000/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2000/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2000/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2000/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2500\n", + "Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2500/config.json\n", + "Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2500/pytorch_model.bin\n", + "tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2500/tokenizer_config.json\n", + "Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2500/special_tokens_map.json\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "***** Running Evaluation *****\n", + " Num examples = 2917\n", + " Batch size = 8\n", + "\n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "Loading best model from NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-2500 (score: 0.48209150326797384).\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=2750, training_loss=0.14159858218106355, metrics={'train_runtime': 653.9792, 'train_samples_per_second': 669.058, 'train_steps_per_second': 83.642, 'total_flos': 861581789561556.0, 'train_loss': 0.14159858218106355, 'epoch': 2.51})" + ] + }, + "metadata": {}, + "execution_count": 120 + } + ], + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=args,\n", + " train_dataset=tokenized_datasets[\"train\"],\n", + " eval_dataset=tokenized_datasets[\"validation\"],\n", + " data_collator=data_collator,\n", + " compute_metrics=compute_metrics,\n", + " tokenizer=tokenizer,\n", + " callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]\n", + ")\n", + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "977619c0-4fc0-423f-f47a-b308c9fb1ecd", + "id": "RdLDXL1K_fFc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Saving model checkpoint to model/multilingual-BERT-sentence\n", + "Configuration saved in model/multilingual-BERT-sentence/config.json\n", + "Model weights saved in model/multilingual-BERT-sentence/pytorch_model.bin\n", + "tokenizer config file saved in model/multilingual-BERT-sentence/tokenizer_config.json\n", + "Special tokens file saved in model/multilingual-BERT-sentence/special_tokens_map.json\n" + ] + } + ], + "source": [ + "trainer.save_model('model/multilingual-BERT-sentence')" + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "kFOvQa4MCQdW" + }, + "execution_count": 166, + "outputs": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.2" + }, + "colab": { + "provenance": [], + "collapsed_sections": [], + "machine_shape": "hm" + }, + "accelerator": "GPU", + "gpuClass": "premium", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "b3fe12ca95e84b198d16bdb4d20f9ad9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1b7f8f1786394c01bad4a8589ad16513", + "IPY_MODEL_70e437b3ba294189b4799c6607532ebd", + "IPY_MODEL_0fb47d91dbf9497cac1ffc1c5dfd4519" + ], + "layout": "IPY_MODEL_9cfec0f21c0a459f9f5888c389a6a479" + } + }, + "1b7f8f1786394c01bad4a8589ad16513": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ef66098fb5f748eabe11abc3fe4ad54d", + "placeholder": "", + "style": "IPY_MODEL_563d8b35192240be960bc08909984119", + "value": "Downloading builder script: " + } + }, + "70e437b3ba294189b4799c6607532ebd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b94385d1423e47f5a9e2351bf873c3e0", + "max": 2472, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e1b6e7774bc94a87ad23fb53d6c9b985", + "value": 2472 + } + }, + "0fb47d91dbf9497cac1ffc1c5dfd4519": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a0b523772cf04a85b0ac000cc9a83c67", + "placeholder": "", + "style": "IPY_MODEL_71a3f1b2112344ea81721e59cce14cec", + "value": " 6.33k/? [00:00<00:00, 198kB/s]" + } + }, + "9cfec0f21c0a459f9f5888c389a6a479": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ef66098fb5f748eabe11abc3fe4ad54d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "563d8b35192240be960bc08909984119": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b94385d1423e47f5a9e2351bf873c3e0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e1b6e7774bc94a87ad23fb53d6c9b985": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a0b523772cf04a85b0ac000cc9a83c67": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "71a3f1b2112344ea81721e59cce14cec": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a275c54cbefb4438a3015080e8b57999": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a4696c167a3247bd8fd0727e0556463a", + "IPY_MODEL_afa79c37c031491da9e229c637d80cc4", + "IPY_MODEL_5ddf799b2fd94edc9949d36450a2d5e9" + ], + "layout": "IPY_MODEL_592af11564074af19e40bce6680ed7f1" + } + }, + "a4696c167a3247bd8fd0727e0556463a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_125b61b8e80d4192a6f19d43ba4797dc", + "placeholder": "", + "style": "IPY_MODEL_d9ff16ada2d94eb7a1adc70e5265ff2d", + "value": "Downloading: 100%" + } + }, + "afa79c37c031491da9e229c637d80cc4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f9bd10de9e2845f08100a29293b92d1c", + "max": 373, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c28349dbeccc4124b583e0eeec004e6b", + "value": 373 + } + }, + "5ddf799b2fd94edc9949d36450a2d5e9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ff2cf349b07442bd9812dd8c7e82e59b", + "placeholder": "", + "style": "IPY_MODEL_414d466fed0b42378d8b38f10c720eba", + "value": " 373/373 [00:00<00:00, 10.0kB/s]" + } + }, + "592af11564074af19e40bce6680ed7f1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "125b61b8e80d4192a6f19d43ba4797dc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d9ff16ada2d94eb7a1adc70e5265ff2d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f9bd10de9e2845f08100a29293b92d1c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c28349dbeccc4124b583e0eeec004e6b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ff2cf349b07442bd9812dd8c7e82e59b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "414d466fed0b42378d8b38f10c720eba": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e432f1e3e5c54358a321a21e9c7aad1f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_cf931d70dc1a4d2ba5f10dba7bf90ece", + "IPY_MODEL_58f1edc459ef4f5bab25544474897db3", + "IPY_MODEL_8894005504364c36964d283cf58bb223" + ], + "layout": "IPY_MODEL_78db41a453ce4ff4884960c615147331" + } + }, + "cf931d70dc1a4d2ba5f10dba7bf90ece": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fe8d877f0fc1417baad9838094045475", + "placeholder": "", + "style": "IPY_MODEL_74bfdb85ed55436f8c12bf9b25375533", + "value": "Downloading: 100%" + } + }, + "58f1edc459ef4f5bab25544474897db3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e813e2a1cb7248b7a8c404d55e4fb248", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cb3c438fb3a6412d80b5ba673a6455cb", + "value": 231508 + } + }, + "8894005504364c36964d283cf58bb223": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bb189f5bc189462cad4824a1c30335c0", + "placeholder": "", + "style": "IPY_MODEL_e98b7218049f4310951a1608c52c14e0", + "value": " 232k/232k [00:00<00:00, 2.99MB/s]" + } + }, + "78db41a453ce4ff4884960c615147331": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fe8d877f0fc1417baad9838094045475": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74bfdb85ed55436f8c12bf9b25375533": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e813e2a1cb7248b7a8c404d55e4fb248": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cb3c438fb3a6412d80b5ba673a6455cb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bb189f5bc189462cad4824a1c30335c0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e98b7218049f4310951a1608c52c14e0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3dd1f27ff0d24a1294534ff7e69a7abb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4128d82e19f14e9d9be5416ebc974d0d", + "IPY_MODEL_6676a80dc293456ea7aed4ce3e281d83", + "IPY_MODEL_55a7a4c336884f26a53292d559a06ff8" + ], + "layout": "IPY_MODEL_d27469698b1e4ad1ae74ced6f7c3942d" + } + }, + "4128d82e19f14e9d9be5416ebc974d0d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d201490a05c049d38b087008aac0a400", + "placeholder": "", + "style": "IPY_MODEL_703d715a4ef64c4e93cc6496f5340451", + "value": "Downloading: 100%" + } + }, + "6676a80dc293456ea7aed4ce3e281d83": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_41d861058e3e458e949f1f3d92623217", + "max": 711494, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f4d9343bd31d47b1b3dcf0494825be2d", + "value": 711494 + } + }, + "55a7a4c336884f26a53292d559a06ff8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b75e2a47db2b47dd8740f77b337c308f", + "placeholder": "", + "style": "IPY_MODEL_d8849516ccb44011a7f9e7e745b30c60", + "value": " 711k/711k [00:00<00:00, 6.20MB/s]" + } + }, + "d27469698b1e4ad1ae74ced6f7c3942d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d201490a05c049d38b087008aac0a400": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "703d715a4ef64c4e93cc6496f5340451": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "41d861058e3e458e949f1f3d92623217": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f4d9343bd31d47b1b3dcf0494825be2d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b75e2a47db2b47dd8740f77b337c308f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d8849516ccb44011a7f9e7e745b30c60": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ae5928c8da4243fba06ae9bf5086ba31": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e613455bcbb24e36a31666acd83d7b24", + "IPY_MODEL_bac45a33f9b444a1985ef56a9be85c52", + "IPY_MODEL_91ac0673e600400f904b1b10deb86cee" + ], + "layout": "IPY_MODEL_289f23dd30814993afde0f5e987fdd9e" + } + }, + "e613455bcbb24e36a31666acd83d7b24": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8e72912c0e434060ac30517a98d07a9e", + "placeholder": "", + "style": "IPY_MODEL_ede252ab2cee4ffbbc2f5519373d1e97", + "value": "Downloading: 100%" + } + }, + "bac45a33f9b444a1985ef56a9be85c52": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f2da21cc1007475ca0233a9e5d146d65", + "max": 125, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_92fd7c43f87142d1bbd05f89ba3bfe39", + "value": 125 + } + }, + "91ac0673e600400f904b1b10deb86cee": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b58c9bfa30b3421496adb52e082cb50a", + "placeholder": "", + "style": "IPY_MODEL_6cbfa925d26e47139365d10b9b28d96a", + "value": " 125/125 [00:00<00:00, 5.13kB/s]" + } + }, + "289f23dd30814993afde0f5e987fdd9e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8e72912c0e434060ac30517a98d07a9e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ede252ab2cee4ffbbc2f5519373d1e97": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f2da21cc1007475ca0233a9e5d146d65": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "92fd7c43f87142d1bbd05f89ba3bfe39": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b58c9bfa30b3421496adb52e082cb50a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6cbfa925d26e47139365d10b9b28d96a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d161016f9fea41e6b27eb537c12d0703": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f315bcfdc76848cb8e851a2698e0248b", + "IPY_MODEL_ef152955607540f2a7d38bf9e2207eec", + "IPY_MODEL_456c36425ac94dc294f8402c07668a51" + ], + "layout": "IPY_MODEL_4d305d32efdf4b639e65e816a7132597" + } + }, + "f315bcfdc76848cb8e851a2698e0248b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5c3a10b039c344509be9867ca40a8472", + "placeholder": "", + "style": "IPY_MODEL_6663eac35b7a4043b97edb90a555e3d9", + "value": " 0%" + } + }, + "ef152955607540f2a7d38bf9e2207eec": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_30bce58edba74043abc1a2625c492d4a", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d4b6dbbad9c946ed99b6c6e587bfb6da", + "value": 0 + } + }, + "456c36425ac94dc294f8402c07668a51": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8976a59e4ea049088f92a37f7547e16e", + "placeholder": "", + "style": "IPY_MODEL_6a689955d9b3463abaaaa03b62d3cf69", + "value": " 0/1 [00:00<?, ?ba/s]" + } + }, + "4d305d32efdf4b639e65e816a7132597": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5c3a10b039c344509be9867ca40a8472": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6663eac35b7a4043b97edb90a555e3d9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "30bce58edba74043abc1a2625c492d4a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d4b6dbbad9c946ed99b6c6e587bfb6da": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8976a59e4ea049088f92a37f7547e16e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6a689955d9b3463abaaaa03b62d3cf69": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9d65a59161cd401aad05f4a52d51c724": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6f244b91a3884eb5b0fbd577ed5d1710", + "IPY_MODEL_2b6ad660dd1f4c78855433118b9fb61e", + "IPY_MODEL_101fa9a9581a46d8b1e0951f03796740" + ], + "layout": "IPY_MODEL_7ffe4378bc7b410780780dd51d0705ea" + } + }, + "6f244b91a3884eb5b0fbd577ed5d1710": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_77c422e831944566a6529da37645ef6d", + "placeholder": "", + "style": "IPY_MODEL_9a88121d0138438980f1c7e4341f480a", + "value": " 0%" + } + }, + "2b6ad660dd1f4c78855433118b9fb61e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_52600cdbf4804b148e02724ae4902de5", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_11f16a0c34e64d6494ac1d2550d18f8f", + "value": 0 + } + }, + "101fa9a9581a46d8b1e0951f03796740": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_645616ac236e479c8303a56100d26d51", + "placeholder": "", + "style": "IPY_MODEL_859e35e323f0407fbdea9eb7ae953742", + "value": " 0/1 [00:00<?, ?ba/s]" + } + }, + "7ffe4378bc7b410780780dd51d0705ea": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "77c422e831944566a6529da37645ef6d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9a88121d0138438980f1c7e4341f480a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "52600cdbf4804b148e02724ae4902de5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "11f16a0c34e64d6494ac1d2550d18f8f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "645616ac236e479c8303a56100d26d51": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "859e35e323f0407fbdea9eb7ae953742": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "40c2b37fa07f44648cecc9b7e406e7e2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_69ed5fe8ed6046acb4202689c065f858", + "IPY_MODEL_bbd9cf7a77aa48fda3a648583ed02b08", + "IPY_MODEL_ebe8b2b35e884fd28bb42eacf01ff07c" + ], + "layout": "IPY_MODEL_ddb14bc1d5d4437a9ee4a895846e7d29" + } + }, + "69ed5fe8ed6046acb4202689c065f858": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_669c24c6309f46cbbdcd0c764143e74f", + "placeholder": "", + "style": "IPY_MODEL_d1e4665beafa4bbeb25d0e9e8447a5a9", + "value": " 89%" + } + }, + "bbd9cf7a77aa48fda3a648583ed02b08": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_09217bdc1e2145eb84cc97207595e6f0", + "max": 9, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b294af01ac5f483dacbe2e1c40fdf223", + "value": 8 + } + }, + "ebe8b2b35e884fd28bb42eacf01ff07c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c1e27e9184204d618ce59b97f7302335", + "placeholder": "", + "style": "IPY_MODEL_2e63f2af443d448aaaddf81127def048", + "value": " 8/9 [00:00<00:00, 9.96ba/s]" + } + }, + "ddb14bc1d5d4437a9ee4a895846e7d29": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "669c24c6309f46cbbdcd0c764143e74f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d1e4665beafa4bbeb25d0e9e8447a5a9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "09217bdc1e2145eb84cc97207595e6f0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b294af01ac5f483dacbe2e1c40fdf223": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c1e27e9184204d618ce59b97f7302335": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e63f2af443d448aaaddf81127def048": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9239cc2fd1d94d86986b7f395de70fca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f1e8d31b67db4089ab1b036bda341617", + "IPY_MODEL_33be40ebcab54ff68855f1145cf5e1d6", + "IPY_MODEL_d96c111f09d74a0c9816328f88d9e45b" + ], + "layout": "IPY_MODEL_3907dc2aaa484877aee9beab8a6888d4" + } + }, + "f1e8d31b67db4089ab1b036bda341617": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aaacfb0f3bd1427ea44ec84c28a2aaf7", + "placeholder": "", + "style": "IPY_MODEL_cb92e843491142e8a2a4008223a90d02", + "value": " 67%" + } + }, + "33be40ebcab54ff68855f1145cf5e1d6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_52169f264141463e94a7761a4ffb3f7a", + "max": 3, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e509b790873740b59aa2f52875ca2038", + "value": 2 + } + }, + "d96c111f09d74a0c9816328f88d9e45b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a395318bce7348d78ca83a308552f042", + "placeholder": "", + "style": "IPY_MODEL_a2bb171f700743559e1d2c472c8289ef", + "value": " 2/3 [00:00<00:00, 8.13ba/s]" + } + }, + "3907dc2aaa484877aee9beab8a6888d4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aaacfb0f3bd1427ea44ec84c28a2aaf7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cb92e843491142e8a2a4008223a90d02": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "52169f264141463e94a7761a4ffb3f7a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e509b790873740b59aa2f52875ca2038": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a395318bce7348d78ca83a308552f042": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a2bb171f700743559e1d2c472c8289ef": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f0fc94c6df4c432f9e1edcfceaf44edd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_03f4216f904d4bf6a8e461c50f40378f", + "IPY_MODEL_76fecb01ad6441a08ad0ddb989a8ab80", + "IPY_MODEL_783edab3150d40a3bf99ed910cdbaf88" + ], + "layout": "IPY_MODEL_e33897e8fa4841308c55b19352876ab1" + } + }, + "03f4216f904d4bf6a8e461c50f40378f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_909d7d349769402a8c20b02b35eafbd9", + "placeholder": "", + "style": "IPY_MODEL_6139e3d551044671a79e15e8adf351ab", + "value": "Downloading: 100%" + } + }, + "76fecb01ad6441a08ad0ddb989a8ab80": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1c19ec799ccb4e788f34ed8ac37d495f", + "max": 151, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7a49ac5f18f64b41825378184f8c32ec", + "value": 151 + } + }, + "783edab3150d40a3bf99ed910cdbaf88": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_76ad2415389f41deb4f2810bca56b753", + "placeholder": "", + "style": "IPY_MODEL_8c4bb02e55fa48429c8f86dac5cb369e", + "value": " 151/151 [00:00<00:00, 5.83kB/s]" + } + }, + "e33897e8fa4841308c55b19352876ab1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "909d7d349769402a8c20b02b35eafbd9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6139e3d551044671a79e15e8adf351ab": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1c19ec799ccb4e788f34ed8ac37d495f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7a49ac5f18f64b41825378184f8c32ec": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "76ad2415389f41deb4f2810bca56b753": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8c4bb02e55fa48429c8f86dac5cb369e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "980374f604ec4970b0afa70d108c864b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_baa9e8a9169a45efb8c117fdf4ea45b7", + "IPY_MODEL_aa248b7d4eae4e5d965a7d04144adacc", + "IPY_MODEL_311d8199627e4c4f83d02c167b5755f3" + ], + "layout": "IPY_MODEL_a299d926edbb4c51906b1be8f694d074" + } + }, + "baa9e8a9169a45efb8c117fdf4ea45b7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c76ebf5d6c6c46cd9d14591a47725ae1", + "placeholder": "", + "style": "IPY_MODEL_178555439d854d30a01cac053adf9079", + "value": "Downloading: 100%" + } + }, + "aa248b7d4eae4e5d965a7d04144adacc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_00c8d81110fe4f4bbcf77be4d20581c3", + "max": 1055, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_43c18ef010014cb395f045dd26497fba", + "value": 1055 + } + }, + "311d8199627e4c4f83d02c167b5755f3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3b83d1f429d34e8e8de6ddff555df02d", + "placeholder": "", + "style": "IPY_MODEL_91a8348ac2194686a9ef075f7d49687d", + "value": " 1.05k/1.05k [00:00<00:00, 26.5kB/s]" + } + }, + "a299d926edbb4c51906b1be8f694d074": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c76ebf5d6c6c46cd9d14591a47725ae1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "178555439d854d30a01cac053adf9079": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "00c8d81110fe4f4bbcf77be4d20581c3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "43c18ef010014cb395f045dd26497fba": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3b83d1f429d34e8e8de6ddff555df02d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "91a8348ac2194686a9ef075f7d49687d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "42dcc74bff5440608a3e9f2fa580cd3c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_616c54d0cd534047b93b215e7baf2ba3", + "IPY_MODEL_846ce38d6fb84279a7419091f2d269b0", + "IPY_MODEL_1e15050772b54e34b014a98b9710c783" + ], + "layout": "IPY_MODEL_951f476862ea49619100e202a6e742f6" + } + }, + "616c54d0cd534047b93b215e7baf2ba3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f6e25547ba664cd59128536944a926fd", + "placeholder": "", + "style": "IPY_MODEL_53185a2afedc41e0a680d5007656b90b", + "value": "Downloading: 100%" + } + }, + "846ce38d6fb84279a7419091f2d269b0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_860f8204efae42d6ad2fa4eb9e661810", + "max": 995526, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e61ec283a9c04cc696d17bbe24ccf460", + "value": 995526 + } + }, + "1e15050772b54e34b014a98b9710c783": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_15abc268777a4e3cb5c3c7f430745c1d", + "placeholder": "", + "style": "IPY_MODEL_e6c1266e8b074bdfafa0db6208743a07", + "value": " 996k/996k [00:00<00:00, 4.17MB/s]" + } + }, + "951f476862ea49619100e202a6e742f6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f6e25547ba664cd59128536944a926fd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "53185a2afedc41e0a680d5007656b90b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "860f8204efae42d6ad2fa4eb9e661810": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e61ec283a9c04cc696d17bbe24ccf460": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "15abc268777a4e3cb5c3c7f430745c1d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e6c1266e8b074bdfafa0db6208743a07": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "01a12f499b7942cc90f2032a8f3284e9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_804bf9b8a2154399a05dd0860f4dfd89", + "IPY_MODEL_66fca4d7946240c3b08ba51fac82f2ae", + "IPY_MODEL_6cefb3be5be9488ca033ed9908c6a8f5" + ], + "layout": "IPY_MODEL_2d0f2804db004da0914e4733ce96b749" + } + }, + "804bf9b8a2154399a05dd0860f4dfd89": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3d6b93cde5254ce99f19802b7c1146f4", + "placeholder": "", + "style": "IPY_MODEL_838416bfbee0400299abff324c4825bc", + "value": "Downloading: 100%" + } + }, + "66fca4d7946240c3b08ba51fac82f2ae": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f7584b77d97f4d48aa5b50bae2df49f8", + "max": 112, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0a5cd97f8a914ff89fd27aed7b38164d", + "value": 112 + } + }, + "6cefb3be5be9488ca033ed9908c6a8f5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ad4c308ebd574e909d31b161580b9064", + "placeholder": "", + "style": "IPY_MODEL_74030c98fcb942ed9d9ffc43799113f0", + "value": " 112/112 [00:00<00:00, 4.51kB/s]" + } + }, + "2d0f2804db004da0914e4733ce96b749": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3d6b93cde5254ce99f19802b7c1146f4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "838416bfbee0400299abff324c4825bc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f7584b77d97f4d48aa5b50bae2df49f8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0a5cd97f8a914ff89fd27aed7b38164d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ad4c308ebd574e909d31b161580b9064": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74030c98fcb942ed9d9ffc43799113f0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "84631a71d2ca4ae8a781019ea3ce6da9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9666c2ff4f32449ea3cbef076a166836", + "IPY_MODEL_efc4af4547804d9daae235691942e73a", + "IPY_MODEL_c8194cd34f554a789359eab6e7596291" + ], + "layout": "IPY_MODEL_54f649d297ec4456be5b5df14497fb93" + } + }, + "9666c2ff4f32449ea3cbef076a166836": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e35bc6f834c4490e85ac2ae25d9e922f", + "placeholder": "", + "style": "IPY_MODEL_b166c61328bb49ea803f0d3a7d515d81", + "value": " 0%" + } + }, + "efc4af4547804d9daae235691942e73a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a2aac740ef3b4f3c913b71c82b408c2c", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d77e188e43dc4e01b82054f2a6a8e832", + "value": 0 + } + }, + "c8194cd34f554a789359eab6e7596291": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_266b2c90bb4d41198784f016e996066a", + "placeholder": "", + "style": "IPY_MODEL_cc62b20cee8c4a4b8e24576d1c854fbf", + "value": " 0/1 [00:00<?, ?ba/s]" + } + }, + "54f649d297ec4456be5b5df14497fb93": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e35bc6f834c4490e85ac2ae25d9e922f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b166c61328bb49ea803f0d3a7d515d81": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a2aac740ef3b4f3c913b71c82b408c2c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d77e188e43dc4e01b82054f2a6a8e832": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "266b2c90bb4d41198784f016e996066a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cc62b20cee8c4a4b8e24576d1c854fbf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "98bebe04cb254369bb3b6b991d4b2648": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_40080df663cc43749963657150cf632d", + "IPY_MODEL_a04f157a98db4d47b75094b6ef1b0990", + "IPY_MODEL_ba2967950f4c483ea399827046f52963" + ], + "layout": "IPY_MODEL_a9e0ad6a141a462fb9bea1c18d447332" + } + }, + "40080df663cc43749963657150cf632d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_31fbaf0ffb0845f5800e6fca0353b929", + "placeholder": "", + "style": "IPY_MODEL_4a2c17e757d34547a4a68718ef064073", + "value": " 0%" + } + }, + "a04f157a98db4d47b75094b6ef1b0990": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f74d219071ab49479194f1061bf343be", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a92097360dba4d5c848b48e345b0028e", + "value": 0 + } + }, + "ba2967950f4c483ea399827046f52963": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_24a164b22a8f4e00944ef05bcec5d032", + "placeholder": "", + "style": "IPY_MODEL_976ade0b37cd43e2aa5aa272dac2445b", + "value": " 0/1 [00:00<?, ?ba/s]" + } + }, + "a9e0ad6a141a462fb9bea1c18d447332": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "31fbaf0ffb0845f5800e6fca0353b929": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a2c17e757d34547a4a68718ef064073": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f74d219071ab49479194f1061bf343be": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a92097360dba4d5c848b48e345b0028e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "24a164b22a8f4e00944ef05bcec5d032": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "976ade0b37cd43e2aa5aa272dac2445b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "21b02caa5dc146b8ac2bd1a282381c7f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0c01b39e34744f74ae47d0d4e70638ce", + "IPY_MODEL_82e7357418144359abba3548449c0c08", + "IPY_MODEL_2ff5e18b6d684b99a82676dbf3db6d32" + ], + "layout": "IPY_MODEL_cd75e771337843d9b55838502bed9a1b" + } + }, + "0c01b39e34744f74ae47d0d4e70638ce": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b6a4250c705f4dd8b9f52731cce2a23d", + "placeholder": "", + "style": "IPY_MODEL_4549eb0838864025ac6a0f3da9192818", + "value": "Downloading: 100%" + } + }, + "82e7357418144359abba3548449c0c08": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b2d377844c1a4bc09433a94088f5213e", + "max": 709144049, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fe7e058b9a6944969d83f7e72b398bb1", + "value": 709144049 + } + }, + "2ff5e18b6d684b99a82676dbf3db6d32": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_47d5ccd1eafe4ea1a3476e06d998bd74", + "placeholder": "", + "style": "IPY_MODEL_5a17027205cb4c2bbe140e1e96e4b495", + "value": " 709M/709M [00:11<00:00, 62.7MB/s]" + } + }, + "cd75e771337843d9b55838502bed9a1b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b6a4250c705f4dd8b9f52731cce2a23d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4549eb0838864025ac6a0f3da9192818": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b2d377844c1a4bc09433a94088f5213e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fe7e058b9a6944969d83f7e72b398bb1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "47d5ccd1eafe4ea1a3476e06d998bd74": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5a17027205cb4c2bbe140e1e96e4b495": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "49642b493a2d4c3592ed010663ef789c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fcc50d75dfb04fccb26c9b93bf8f1efa", + "IPY_MODEL_dc630459e6564d69833d46b63493a160", + "IPY_MODEL_31d0648af26b4fe797f2cb2ff21336a8" + ], + "layout": "IPY_MODEL_90e567bcc88445f695a896af6d8da649" + } + }, + "fcc50d75dfb04fccb26c9b93bf8f1efa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2e815baaae8940ffb90e2aaf5c6f7e2a", + "placeholder": "", + "style": "IPY_MODEL_32250602bfc140d18859e6b48f9dbfbc", + "value": " 89%" + } + }, + "dc630459e6564d69833d46b63493a160": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a85179eeb7d94bba8c79a10a734d8c6b", + "max": 9, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7346605d4df548afb179d489217990ff", + "value": 8 + } + }, + "31d0648af26b4fe797f2cb2ff21336a8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6b1a06b7d6ea43da8b9af7f92a882455", + "placeholder": "", + "style": "IPY_MODEL_a1b4b4cc26ff4c50b4ef5c99d8c7cb3e", + "value": " 8/9 [00:00<00:00, 9.55ba/s]" + } + }, + "90e567bcc88445f695a896af6d8da649": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e815baaae8940ffb90e2aaf5c6f7e2a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "32250602bfc140d18859e6b48f9dbfbc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a85179eeb7d94bba8c79a10a734d8c6b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7346605d4df548afb179d489217990ff": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6b1a06b7d6ea43da8b9af7f92a882455": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a1b4b4cc26ff4c50b4ef5c99d8c7cb3e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7b3ce57b6b5e4253b8219adc2c6ff47e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ee7bc9576ea44b97a6ad9d2ae8adaec7", + "IPY_MODEL_c012f774a21e44188365f5b0646b422e", + "IPY_MODEL_5899086ed64c4d3185374a7f541e22fe" + ], + "layout": "IPY_MODEL_3aaf8990b0574a9183b9902eb33670a5" + } + }, + "ee7bc9576ea44b97a6ad9d2ae8adaec7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cc3d6c6b95ab4c80983b3ce2175acb8a", + "placeholder": "", + "style": "IPY_MODEL_873c876ae8f64b9bba4f25efa3a3859a", + "value": " 67%" + } + }, + "c012f774a21e44188365f5b0646b422e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d727d1396df2443e9e46ab7e0c7d5276", + "max": 3, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cda8e993793c4b949910e309a3f50a03", + "value": 2 + } + }, + "5899086ed64c4d3185374a7f541e22fe": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5f38c6b988b44678a7b7f06a99daa983", + "placeholder": "", + "style": "IPY_MODEL_7b53cf7c82a6439eb94d5f0635afe2f3", + "value": " 2/3 [00:00<00:00, 7.76ba/s]" + } + }, + "3aaf8990b0574a9183b9902eb33670a5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cc3d6c6b95ab4c80983b3ce2175acb8a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "873c876ae8f64b9bba4f25efa3a3859a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d727d1396df2443e9e46ab7e0c7d5276": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cda8e993793c4b949910e309a3f50a03": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5f38c6b988b44678a7b7f06a99daa983": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b53cf7c82a6439eb94d5f0635afe2f3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file