--- a
+++ b/Roberta+LLM/dataset_builder.ipynb
@@ -0,0 +1,618 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Generation of a dataset already split into train, val and test to standardize the evaluation of the different approaches for NER on clinical trial eligibility criteria**\n",
+    "\n",
+    "In this case, we do not consider the pregnancy consideration classes or the mood entity."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np \n",
+    "from datasets import Dataset, DatasetDict\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import os\n",
+    "import re\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from huggingface_hub import notebook_login"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# paths\n",
+    "root = '..'\n",
+    "# root = './drive/MyDrive/ProjectName' # if you are using google drive\n",
+    "data_path = f'{root}/data'\n",
+    "chia_bio = f'{data_path}/chia/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read all the sentences per file\n",
+    "# os.listdir returns the names in arbitrary order: sort them so that the row order of the\n",
+    "# dataframes (and therefore the seeded train/val split) is the same on every machine\n",
+    "files_train = sorted(os.listdir(f'{data_path}/chia/trains'))\n",
+    "files_test = sorted(os.listdir(f'{data_path}/chia/tests'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1800, 200)"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(files_train), len(files_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# entity types kept for the NER task; a tuple (not a set) so the iteration order is stable across runs\n",
+    "simple_ent = (\"Condition\", \"Value\", \"Drug\", \"Procedure\", \"Measurement\", \"Temporal\", \"Observation\", \"Person\", \"Device\")\n",
+    "\n",
+    "# dict for the entities (entity to int value): \"O\" is 0 and every type gets a B-/I- pair of\n",
+    "# consecutive ids following the order of simple_ent (B-Condition: 1, I-Condition: 2, ..., I-Device: 18)\n",
+    "sel_ent = {\"O\": 0}\n",
+    "for ent_type in simple_ent:\n",
+    "    sel_ent[f\"B-{ent_type}\"] = len(sel_ent)\n",
+    "    sel_ent[f\"I-{ent_type}\"] = len(sel_ent)\n",
+    "\n",
+    "# int value to entity lookups (list position and dict key are both the int value)\n",
+    "entities_list = list(sel_ent.keys())\n",
+    "sel_ent_inv = {v: k for k, v in sel_ent.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _split_punctuation(annotation, punct_re):\n",
+    "    \"\"\"\n",
+    "    Split the words of one sentence on punctuation signs keeping a valid BIO tagging\n",
+    "    inputs: annotation: list of (word, tag) tuples, punct_re: compiled regex matching one punctuation sign\n",
+    "    outputs: list of (token, tag) tuples\n",
+    "    \"\"\"\n",
+    "    new_annotation = []\n",
+    "    for idx, (word, tag) in enumerate(annotation):\n",
+    "        matches = list(punct_re.finditer(word))\n",
+    "        if not matches:\n",
+    "            new_annotation.append((word, tag))\n",
+    "            continue\n",
+    "        # tag used by the fragments that continue the entity of this word\n",
+    "        inside = \"O\" if tag == \"O\" else f'I-{tag.split(\"-\")[1]}'\n",
+    "        # the entity goes on after this word only if the next word carries that I- tag\n",
+    "        continues = idx < len(annotation) - 1 and annotation[idx + 1][1] == inside\n",
+    "        # (text, belongs to the entity) for every piece of the word, in order\n",
+    "        fragments = []\n",
+    "        last = 0\n",
+    "        for match in matches:\n",
+    "            beg, end = match.span()\n",
+    "            if beg > last:\n",
+    "                fragments.append((word[last:beg], True))\n",
+    "            # a sign that closes the word stays in the entity only if the entity goes on\n",
+    "            fragments.append((word[beg:end], end < len(word) or continues))\n",
+    "            last = end\n",
+    "        if last < len(word):\n",
+    "            fragments.append((word[last:], True))\n",
+    "        first = True\n",
+    "        for text, in_entity in fragments:\n",
+    "            if not in_entity:\n",
+    "                new_annotation.append((text, \"O\"))\n",
+    "                continue\n",
+    "            # only the first fragment keeps the original tag (B- or I-); the others get I-,\n",
+    "            # so a split word never loses its B- tag and never produces it twice\n",
+    "            new_annotation.append((text, tag if first else inside))\n",
+    "            first = False\n",
+    "    return new_annotation\n",
+    "\n",
+    "\n",
+    "def read_file(file_path):\n",
+    "    \"\"\"\n",
+    "    Read the file and return a list of sentences with annotations\n",
+    "    Sentences are separated by a blank line and every line holds a word followed by its tag.\n",
+    "    Tags that are not in sel_ent are replaced by 'O' and the words are split on punctuation signs.\n",
+    "    inputs: file_path: str\n",
+    "    outputs: list of sentences with annotations (each sentence is a list of (token, tag) tuples)\n",
+    "    \"\"\"\n",
+    "    # explicit encoding so that the result does not depend on the default of the platform\n",
+    "    with open(file_path, \"r\", encoding=\"utf-8\") as fd:\n",
+    "        sentences_ann = fd.read().split(\"\\n\\n\")\n",
+    "\n",
+    "    # compiled once and reused for every sentence of the file\n",
+    "    punct_re = re.compile(r'(\\.|\\,|\\:|\\;|\\!|\\?|\\-|\\(|\\)|\\[|\\]|\\{|\\}|\\\")')\n",
+    "\n",
+    "    ann_file = []\n",
+    "    for sentence in sentences_ann:\n",
+    "        annotation = []\n",
+    "        for line in sentence.split(\"\\n\"):\n",
+    "            spt_line = line.split()\n",
+    "            # skip empty and whitespace-only lines (they have no word and no tag)\n",
+    "            if not spt_line:\n",
+    "                continue\n",
+    "            label = spt_line[-1]\n",
+    "            if label not in sel_ent:\n",
+    "                label = 'O'\n",
+    "            annotation.append((spt_line[0], label))\n",
+    "        # blocks without any word (e.g. extra blank lines) are not sentences\n",
+    "        if annotation:\n",
+    "            ann_file.append(_split_punctuation(annotation, punct_re))\n",
+    "    return ann_file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of training files: 1800\n",
+      "Total number of sentences in training files: 11102\n"
+     ]
+    }
+   ],
+   "source": [
+    "# read the training files (file name -> annotated sentences of that file)\n",
+    "train_annotations = {name: read_file(f'{chia_bio}/trains/{name}') for name in files_train}\n",
+    "\n",
+    "n_train_sentences = sum(len(sentences) for sentences in train_annotations.values())\n",
+    "print(f'Number of training files: {len(train_annotations)}')\n",
+    "print(f'Total number of sentences in training files: {n_train_sentences}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of testing files: 200\n",
+      "Total number of sentences in testing files: 1307\n"
+     ]
+    }
+   ],
+   "source": [
+    "# read the testing files (file name -> annotated sentences of that file)\n",
+    "test_annotations = {name: read_file(f'{chia_bio}/tests/{name}') for name in files_test}\n",
+    "\n",
+    "n_test_sentences = sum(len(sentences) for sentences in test_annotations.values())\n",
+    "print(f'Number of testing files: {len(test_annotations)}')\n",
+    "print(f'Total number of sentences in testing files: {n_test_sentences}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_df(annotations):\n",
+    "    \"\"\"\n",
+    "    Turn the annotations of every file into a dataframe with one row per sentence\n",
+    "    inputs: annotations: dict (file name -> list of sentences, each one a list of (word, tag) tuples)\n",
+    "    outputs: dataframe with the columns tokens, ner_tags, file and index\n",
+    "    \"\"\"\n",
+    "    rows = []\n",
+    "    for file, sentences in annotations.items():\n",
+    "        for position, sentence in enumerate(sentences):\n",
+    "            rows.append({\n",
+    "                \"tokens\": [word for word, _ in sentence],\n",
+    "                # int representation of the entity of every token\n",
+    "                \"ner_tags\": [sel_ent[tag] for _, tag in sentence],\n",
+    "                \"file\": file,\n",
+    "                \"index\": position,\n",
+    "            })\n",
+    "    return pd.DataFrame(rows)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build the dataframes\n",
+    "train_df = build_df(train_annotations)\n",
+    "test_df = build_df(test_annotations)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>tokens</th>\n",
+       "      <th>ner_tags</th>\n",
+       "      <th>file</th>\n",
+       "      <th>index</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>[Patients, refuse, to, follow, the, research]</td>\n",
+       "      <td>[0, 13, 14, 14, 14, 14]</td>\n",
+       "      <td>NCT03134378_exc.bio.txt</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[Patient, has, had, previous, eradication, the...</td>\n",
+       "      <td>[0, 0, 0, 11, 7, 8, 0, 1, 2, 2, 0]</td>\n",
+       "      <td>NCT03134378_exc.bio.txt</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[The, patient, is, pregnant, or, breastfeeding]</td>\n",
+       "      <td>[0, 0, 0, 1, 0, 13]</td>\n",
+       "      <td>NCT03134378_exc.bio.txt</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[Patients, have, a, history, of, allergy, to, ...</td>\n",
+       "      <td>[0, 0, 0, 13, 0, 1, 0, 0, 5, 6, 6, 6, 6, 0, 5,...</td>\n",
+       "      <td>NCT03134378_exc.bio.txt</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>[Patients, are, known, to, have, impaired, liv...</td>\n",
+       "      <td>[0, 0, 0, 0, 0, 3, 9, 10, 0, 0, 0, 9, 10, 3, 4...</td>\n",
+       "      <td>NCT03134378_exc.bio.txt</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              tokens  \\\n",
+       "0      [Patients, refuse, to, follow, the, research]   \n",
+       "1  [Patient, has, had, previous, eradication, the...   \n",
+       "2    [The, patient, is, pregnant, or, breastfeeding]   \n",
+       "3  [Patients, have, a, history, of, allergy, to, ...   \n",
+       "4  [Patients, are, known, to, have, impaired, liv...   \n",
+       "\n",
+       "                                            ner_tags                     file  \\\n",
+       "0                            [0, 13, 14, 14, 14, 14]  NCT03134378_exc.bio.txt   \n",
+       "1                 [0, 0, 0, 11, 7, 8, 0, 1, 2, 2, 0]  NCT03134378_exc.bio.txt   \n",
+       "2                                [0, 0, 0, 1, 0, 13]  NCT03134378_exc.bio.txt   \n",
+       "3  [0, 0, 0, 13, 0, 1, 0, 0, 5, 6, 6, 6, 6, 0, 5,...  NCT03134378_exc.bio.txt   \n",
+       "4  [0, 0, 0, 0, 0, 3, 9, 10, 0, 0, 0, 9, 10, 3, 4...  NCT03134378_exc.bio.txt   \n",
+       "\n",
+       "   index  \n",
+       "0      0  \n",
+       "1      1  \n",
+       "2      2  \n",
+       "3      3  \n",
+       "4      4  "
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# split the training data into training and validation\n",
+    "train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(Index(['tokens', 'ner_tags', 'file', 'index'], dtype='object'),\n",
+       " Index(['tokens', 'ner_tags', 'file', 'index'], dtype='object'),\n",
+       " Index(['tokens', 'ner_tags', 'file', 'index'], dtype='object'))"
+      ]
+     },
+     "execution_count": 120,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_df.columns, val_df.columns, test_df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 127,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create dataset instance\n",
+    "train_dataset = Dataset.from_pandas(train_df)\n",
+    "val_dataset = Dataset.from_pandas(val_df)\n",
+    "test_dataset = Dataset.from_pandas(test_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 128,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset = train_dataset.remove_columns(['__index_level_0__'])\n",
+    "val_dataset = val_dataset.remove_columns(['__index_level_0__'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 129,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create the dataset with all the data splited in train, validation and test\n",
+    "dataset = DatasetDict({\n",
+    "    'train': train_dataset,\n",
+    "    'val': val_dataset,\n",
+    "    'test': test_dataset\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 130,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['tokens', 'ner_tags', 'file', 'index'],\n",
+       "        num_rows: 8881\n",
+       "    })\n",
+       "    val: Dataset({\n",
+       "        features: ['tokens', 'ner_tags', 'file', 'index'],\n",
+       "        num_rows: 2221\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['tokens', 'ner_tags', 'file', 'index'],\n",
+       "        num_rows: 1307\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 130,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Analysis of the entity distribution in the dataset**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_entities_tr = [ent for ents in train_df.ner_tags for ent in ents]\n",
+    "all_entities_val = [ent for ents in val_df.ner_tags for ent in ents]\n",
+    "all_entities_test = [ent for ents in test_df.ner_tags for ent in ents]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get just the main entity (not B- or I-)\n",
+    "all_entities_tr = [entities_list[ent][2:] for ent in all_entities_tr if entities_list[ent].startswith('B-')]\n",
+    "all_entities_val = [entities_list[ent][2:] for ent in all_entities_val if entities_list[ent].startswith('B-')]\n",
+    "all_entities_test = [entities_list[ent][2:] for ent in all_entities_test if entities_list[ent].startswith('B-')]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'Condition': 8031, 'Procedure': 2285, 'Person': 1136, 'Device': 264, 'Value': 2677, 'Drug': 2479, 'Temporal': 2129, 'Measurement': 2292, 'Observation': 1171}\n",
+      "train dict_values([8031, 2285, 1136, 264, 2677, 2479, 2129, 2292, 1171])\n",
+      "{'Condition': 1913, 'Procedure': 602, 'Person': 257, 'Device': 59, 'Value': 628, 'Drug': 593, 'Temporal': 510, 'Measurement': 534, 'Observation': 328}\n",
+      "val dict_values([1913, 602, 257, 59, 628, 593, 510, 534, 328])\n",
+      "{'Condition': 1104, 'Procedure': 311, 'Person': 135, 'Device': 23, 'Value': 345, 'Drug': 443, 'Temporal': 295, 'Measurement': 288, 'Observation': 166}\n",
+      "test dict_values([1104, 311, 135, 23, 345, 443, 295, 288, 166])\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# plot the number of entities (B- tags) of every type in each split\n",
+    "labels = sorted(simple_ent)  # fixed order for bars and ticks (the iteration order of a set can change between runs)\n",
+    "x = np.arange(len(labels))\n",
+    "width = 0.25  # the width of the bars\n",
+    "\n",
+    "fig, ax = plt.subplots(layout='constrained')\n",
+    "\n",
+    "entities_dict = {\n",
+    "    'train': all_entities_tr,\n",
+    "    'val': all_entities_val,\n",
+    "    'test': all_entities_test\n",
+    "}\n",
+    "\n",
+    "# one group of bars per entity type, one bar per split inside the group\n",
+    "for position, (split, split_entities) in enumerate(entities_dict.items()):\n",
+    "    counts = [split_entities.count(ent) for ent in labels]\n",
+    "    print(split, dict(zip(labels, counts)))\n",
+    "    ax.bar(x + width * position, counts, width, label=split)\n",
+    "\n",
+    "ax.set_ylabel('Number of examples')\n",
+    "ax.set_title('Distribution of entities in the dataset')\n",
+    "# the bars of a group start at x, x + width and x + 2 * width, so x + width is the middle one\n",
+    "ax.set_xticks(x + width, labels)\n",
+    "ax.legend(loc='upper right', ncols=2)\n",
+    "\n",
+    "plt.xticks(rotation=90)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Save the dataset to the Hugging Face Hub**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 137,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ec47345b20574ccab16428fa5266a7d9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Pushing split train to the Hub.\n",
+      "Creating parquet from Arrow format: 100%|██████████| 9/9 [00:00<00:00, 432.05ba/s]\n",
+      "Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]\n",
+      "Pushing split val to the Hub.\n",
+      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 404.70ba/s]\n",
+      "Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]\n",
+      "Pushing split test to the Hub.\n",
+      "Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 347.64ba/s]\n",
+      "Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.13it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "dataset.push_to_hub('JavierLopetegui/chia_v1')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "TER",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}