# HuggingFace Installations

In [1]:
!pip install datasets
!pip install transformers
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 4.9 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 84.5 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 93.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 87.9 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 91.7 MB/s 
Installing collected packag

In [49]:
import pandas as pd
import numpy as np
import spacy
import tqdm
import sys
from datasets import Dataset, DatasetDict
from transformers import Trainer
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
from transformers import pipeline
from transformers import EarlyStoppingCallback, IntervalStrategy

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/IRE

/content/drive/MyDrive/IRE


# Loading Data

In [5]:
entities = pd.read_csv("data/entities.tsv", delimiter="\t")
entities.head()

Unnamed: 0,filename,mark,label,offset1,offset2,span,code
0,es-S0212-71992007000100007-1,T1,ENFERMEDAD,40,61,arterial hypertension,38341003
1,es-S0212-71992007000100007-1,T2,ENFERMEDAD,66,79,polyarthrosis,36186002
2,es-S0212-71992007000100007-1,T3,ENFERMEDAD,1682,1698,pleural effusion,60046008
3,es-S0212-71992007000100007-1,T4,ENFERMEDAD,1859,1875,pleural effusion,60046008
4,es-S0212-71992007000100007-1,T5,ENFERMEDAD,1626,1648,lower lobe atelectasis,46621007


In [6]:
list_off0 = list(entities['offset1'])
list_off1 = list(entities['offset2'])

In [7]:
text_files_path = "data/text"

In [8]:
f = open(text_files_path + "/" + entities.iloc[1,0] + ".txt", "r", encoding="UTF-8")
for l in f:
  print(l)

A 73-year-old patient with a history of arterial hypertension and polyarthrosis presented to the emergency department with abdominal distension and pain associated with constipation and febrile fever. The symptoms had started three weeks earlier and worsened during the four days prior to admission. During this period, an upper gastrointestinal fibroendoscopy (oesophagus, stomach and duodenum) and a colonoscopy (up to the splenic angle) were performed, but no abnormalities were found.

Physical examination revealed a low-grade fever (37.6º C), a distended abdomen, diffusely painful on palpation, tympanised on percussion, with scant borborygmi but no evidence of peritonism, pulmonary auscultation with decreased ventilation in the lower half of the right hemithorax and the onset of intense pain on palpation and percussion of the last three dorsal spinous processes.

Analyses showed 8.2 x 109 leukocytes / L, haemoglobin 136 g / L, platelets 186 x 109 / L. Except for glycaemia (123 mg/dl), 

In [9]:
#Clinical cases
HCs = {}
for fid in tqdm.tqdm(range(len(entities["filename"]))):
  fname = entities["filename"][fid]
  with open(text_files_path + "/" + fname + ".txt", "r", encoding="UTF-8") as f:
    HCs.update({fname: f.read()})

100%|██████████| 6650/6650 [01:51<00:00, 59.70it/s]


In [10]:
#Diseases
ENF = {}
enfermedades = []
fn = entities["filename"][0]
for fname, enf in zip(entities["filename"], entities["span"]):
    if fname!=fn:
      enfermedades = []
    enfermedades.append(enf)
    ENF.update({fname: enfermedades})
    fn = fname

In [11]:
len(ENF)

741

# Preprocessing

In [12]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 2.6 MB/s 
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.4.0
    Uninstalling en-core-web-sm-3.4.0:
      Successfully uninstalled en-core-web-sm-3.4.0
Successfully installed en-core-web-sm-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
nlp = spacy.load("en_core_web_sm")

In [14]:
HCs_tokenized = []
for hc in HCs:
    hl = []
    tokens = nlp(HCs[hc])
    #tokens = HCs[hc].split(" ") #The simplest option
    for t in tokens:
        hl.append(str(t))
    HCs_tokenized.append(hl)

In [15]:
len(HCs_tokenized)

741

In [16]:
Ent_tokenized = []
for enf in ENF:
    Tks = []
    for e in ENF[enf]:
      sl = []
      tokens = nlp(e)
      #tokens = e.split(" ")
      for t in tokens:
          sl.append(str(t))
      Tks.append(sl)
    Ent_tokenized.append(Tks)

In [17]:
len(Ent_tokenized)

741

In [18]:
Ent_tokenized[0]

[['arterial', 'hypertension'],
 ['polyarthrosis'],
 ['pleural', 'effusion'],
 ['pleural', 'effusion'],
 ['lower', 'lobe', 'atelectasis'],
 ['infectious', 'spondylodiscitis', 'D10', '-', 'D11'],
 ['pleural', 'effusion']]

# Tagging Data with BIO scheme

In [19]:
def find_idx(list_to_check, item_to_find):
    indices = []
    for idx, value in enumerate(list_to_check):
        if value == item_to_find:
            indices.append(idx)
    return indices

In [20]:
labels_tokenized = []
idx =-1
for hct, et in zip(HCs_tokenized, Ent_tokenized):
    idx+=1
    labels = []
    for i in range(len(hct)):
        #Labels: 0->'O'; 1->'B'; 2->'I'
        #labels.append('O')
        labels.append(0)

    #For Entities (Diseases|Enfermedades)
    for enf in et:
      first = True
      for e in enf:
          if first == True:
              try:
                #labels[hct.index(e)] = 'B'
                #labels[posLab] = 'B'
                indices = find_idx(hct, e)
                if len(indices) > 1:
                  for id in indices:
                      labels[id] = 1
                else:
                  labels[hct.index(e)] = 1
                
                first = False
              except:
                first = False
                if e == "sarcoma+carcinoma" or e == "carcinoma+sarcoma":
                  continue
                print(hct)
                print(et)
                print(enf)
                print(e)
                print(idx)
          else:
              try:
                #labels[hct.index(e)] = 'I'
                #labels[posLab] = 'I'
                indices = find_idx(hct, e)
                if len(indices) > 1:
                  for id in indices:
                      if labels[id-1] != 0:
                        labels[id] = 2
                else:
                  labels[hct.index(e)] = 2
              except:
                if e == "sarcoma+carcinoma" or e == "carcinoma+sarcoma":
                  continue
                print(hct)
                print(et)
                print(enf)
                print(e)
                print(idx)

    labels_tokenized.append(labels)

['A', '43', '-', 'year', '-', 'old', 'man', 'was', 'admitted', 'to', 'the', 'emergency', 'department', 'due', 'to', 'sudden', 'left', 'lumbar', 'pain', ',', 'continuous', 'and', 'incapacitating', ',', 'without', 'antalgic', 'position', 'or', 'aggravating', 'factors', ',', 'without', 'irradiation', ',', 'with', 'approximately', '23', 'hours', 'of', 'evolution', '.', 'No', 'nausea', 'or', 'vomiting', ',', 'no', 'macroscopic', 'haematuria', 'or', 'lower', 'urinary', 'tract', 'discomfort', '.', 'Absence', 'of', 'precordial', 'pain', '.', 'Hypertension', 'controlled', 'with', 'verapamil', '.', 'He', 'reported', 'an', 'episode', ',', 'interpreted', 'as', 'a', 'transient', 'ischaemic', 'attack', ',', 'approximately', 'eight', 'weeks', 'earlier', '(', 'not', 'confirmed', ')', '.', 'No', 'history', 'of', 'cardiac', 'arrhythmia', 'or', 'valvular', 'heart', 'disease', '.', 'No', 'other', 'previous', 'thromboembolic', 'episodes', '.', 'No', 'known', 'history', 'of', 'urinary', 'lithiasis', '.', 'N

In [21]:
j = 0
for i in range(len(HCs_tokenized[j])):
  print(str(HCs_tokenized[j][i]) + "\t" + str(labels_tokenized[j][i]))

A	0
73	0
-	0
year	0
-	0
old	0
patient	0
with	0
a	0
history	0
of	0
arterial	1
hypertension	2
and	0
polyarthrosis	1
presented	0
to	0
the	0
emergency	0
department	0
with	0
abdominal	0
distension	0
and	0
pain	0
associated	0
with	0
constipation	0
and	0
febrile	0
fever	0
.	0
The	0
symptoms	0
had	0
started	0
three	0
weeks	0
earlier	0
and	0
worsened	0
during	0
the	0
four	0
days	0
prior	0
to	0
admission	0
.	0
During	0
this	0
period	0
,	0
an	0
upper	0
gastrointestinal	0
fibroendoscopy	0
(	0
oesophagus	0
,	0
stomach	0
and	0
duodenum	0
)	0
and	0
a	0
colonoscopy	0
(	0
up	0
to	0
the	0
splenic	0
angle	0
)	0
were	0
performed	0
,	0
but	0
no	0
abnormalities	0
were	0
found	0
.	0

	0
Physical	0
examination	0
revealed	0
a	0
low	0
-	0
grade	0
fever	0
(	0
37.6º	0
C	0
)	0
,	0
a	0
distended	0
abdomen	0
,	0
diffusely	0
painful	0
on	0
palpation	0
,	0
tympanised	0
on	0
percussion	0
,	0
with	0
scant	0
borborygmi	0
but	0
no	0
evidence	0
of	0
peritonism	0
,	0
pulmonary	0
auscultation	0
with	0
decreased	0
ventilation

# Validating tokenization and alignment with the BIO tags.

In [22]:
flag = 0
for st, lt in zip(HCs_tokenized, labels_tokenized):
    if len(st) != len(lt):
        print(st)
        print(lt)
        flag = 1
if flag==0:
    print("Everything is aligned!")

Everything is aligned!


# Sentence tokenization

In [23]:
sent_tokenized = []
label_sent_tokenized = []
for ht, lht in zip(HCs_tokenized, labels_tokenized):
  st = []; lbst = []
  for h, l in zip(ht,lht):
    if h != ".":
      st.append(h)
      lbst.append(l)
    else:
      st.append(".")
      lbst.append(0)
      sent_tokenized.append(st)
      label_sent_tokenized.append(lbst)
      st = []; lbst = []

In [24]:
len(sent_tokenized)

11668

In [25]:
sent_tokenized[0]

['A',
 '73',
 '-',
 'year',
 '-',
 'old',
 'patient',
 'with',
 'a',
 'history',
 'of',
 'arterial',
 'hypertension',
 'and',
 'polyarthrosis',
 'presented',
 'to',
 'the',
 'emergency',
 'department',
 'with',
 'abdominal',
 'distension',
 'and',
 'pain',
 'associated',
 'with',
 'constipation',
 'and',
 'febrile',
 'fever',
 '.']

In [26]:
len(label_sent_tokenized)

11668

In [27]:
label_sent_tokenized[0]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

# Disease mentions identification as a Token classification problem

# Building the Dataset

## Case as a whole is given as input

In [28]:
dic = {"tokens": HCs_tokenized, "ner_tags": labels_tokenized} #For the whole clinical case. We used this option for our paper.
#dic = {"tokens": sent_tokenized, "ner_tags": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by ". " b

In [29]:
dataset = Dataset.from_dict(dic)

In [30]:
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 741
})

In [31]:
#For training, validation, and test partitions
"""
#Train, val, test partitions
train_test = dataset.train_test_split()
test_val = train_test['test'].train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': test_val['train'],
    'test': test_val['test']
    })
"""

#Just for training and validation partitions
train_test = dataset.train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': train_test['test']
    })

In [32]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 555
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 186
    })
})

In [33]:
raw_datasets["train"][0]["ner_tags"]
#raw_datasets["train"][0]["pos_tags"]
#raw_datasets["train"][0]["chunk_tags"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [34]:
raw_datasets['train']

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 555
})

In [35]:
label_names = ['O','B','I']
label_names

['O', 'B', 'I']

In [36]:
words = raw_datasets["train"][0]["tokens"]
labels = [int(n) for n in raw_datasets["train"][0]["ner_tags"]]
#labels = raw_datasets["train"][0]["pos_tags"]
#labels = raw_datasets["train"][0]["chunk_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

This is a 70 - year - old male patient , who was admitted to the emergency department of the Hospital Pablo Tobón Uribe , with symptoms of approximately one hour of evolution consisting of chest tightness , general malaise , asthenia and diaphoresis ; which began after having ingested 100 mg of sildenafil , denies ingestion of another sexual stimulant or cocaine and without sexual intercourse after its consumption . The patient 's only clinical history was arterial hypertension , pharmacologically controlled , and he denies previous episodes of angina or nitrate consumption . The clinical examination and vital signs were normal ; however , after the initial assessment he presented cardiorespiratory arrest secondary to ventricular fibrillation with response to a single defibrillation of 200 joules . 
 The initial electrocardiogram showed ST - segment elevation in the inferior ( II , III and aVF ) and anterior ( V2 - V4 ) leads with reciprocal changes in aVL , with no electrocardiographi

## Helper Functions

In [37]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [38]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [39]:
metric = load_metric("seqeval")
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    try:
      true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    except:
      true_predictions = []
      for prediction, label in zip(predictions, labels):
        label_list = []
        for (p, l) in zip(prediction, label):
          if l != -100:
            if p not in range(len(label_names)):
              p = 0
          
            label_list.append(label_names[p])
        true_predictions.append(label_list)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

  """Entry point for launching an IPython kernel.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

# Loading BERT as a pre-trained model

## Model 1 - d4data/biomedical-ner-all

Complete document wise tokenization

In [40]:
model_checkpoint = "d4data/biomedical-ner-all"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Downloading:   0%|          | 0.00/373 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [41]:
tokenizer.is_fast

True

In [42]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). Running this sequence through the model will result in indexing errors


['[CLS]',
 'this',
 'is',
 'a',
 '70',
 '-',
 'year',
 '-',
 'old',
 'male',
 'patient',
 ',',
 'who',
 'was',
 'admitted',
 'to',
 'the',
 'emergency',
 'department',
 'of',
 'the',
 'hospital',
 'pablo',
 'to',
 '##bon',
 'ur',
 '##ibe',
 ',',
 'with',
 'symptoms',
 'of',
 'approximately',
 'one',
 'hour',
 'of',
 'evolution',
 'consisting',
 'of',
 'chest',
 'tight',
 '##ness',
 ',',
 'general',
 'mala',
 '##ise',
 ',',
 'as',
 '##the',
 '##nia',
 'and',
 'dia',
 '##ph',
 '##ores',
 '##is',
 ';',
 'which',
 'began',
 'after',
 'having',
 'ing',
 '##ested',
 '100',
 'mg',
 'of',
 'si',
 '##lden',
 '##af',
 '##il',
 ',',
 'denies',
 'ing',
 '##est',
 '##ion',
 'of',
 'another',
 'sexual',
 'st',
 '##im',
 '##ula',
 '##nt',
 'or',
 'cocaine',
 'and',
 'without',
 'sexual',
 'intercourse',
 'after',
 'its',
 'consumption',
 '.',
 'the',
 'patient',
 "'",
 's',
 'only',
 'clinical',
 'history',
 'was',
 'arterial',
 'hyper',
 '##tension',
 ',',
 'ph',
 '##arm',
 '##aco',
 '##logical',
 '

In [43]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [44]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [45]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [46]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ...,    0,    0, -100]])

In [47]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [55]:
model = AutoModelForTokenClassification.from_pretrained(    
    model_checkpoint
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/config.json
Model config DistilBertConfig {
  "_name_or_path": "d4data/biomedical-ner-all",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-Activity",
    "2": "B-Administration",
    "3": "B-Age",
    "4": "B-Area",
    "5": "B-Biological_attribute",
    "6": "B-Biological_structure",
    "7": "B-Clinical_event",
    "8": "B-Color",
    "9": "B-Coreference",
    "10": "B-Date",
    "11": "B-Detailed_description",
    "12": "B-Diagnostic_procedure",
    "13": "B-Disease_disorder",
    "14": "B-Distance",
    "15": "B-Dosage",
    "16": "B-Duration",
    "17": "B-Family_history",
    "18": "B-Frequency",
    "19": "B-Height",
    "20": "B-History",
    "21": "

In [56]:
args = TrainingArguments(
    "NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased",
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = 50,
    learning_rate=5e-5,
    num_train_epochs=50,
    weight_decay=0.01,
    metric_for_best_model = 'f1',
   load_best_model_at_end=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [57]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()

***** Running training *****
  Num examples = 555
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3500


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,No log,0.162818,0.270997,0.309661,0.289042,0.937595
100,No log,0.166674,0.348107,0.517594,0.416259,0.938953
150,No log,0.165462,0.37627,0.521433,0.437115,0.938915
200,No log,0.146949,0.404802,0.463852,0.43232,0.945101
250,No log,0.165257,0.440503,0.492642,0.465116,0.944807
300,No log,0.187354,0.421611,0.431862,0.426675,0.945729
350,No log,0.20323,0.445104,0.479846,0.461823,0.945396
400,No log,0.215578,0.41914,0.523992,0.465738,0.944269
450,No log,0.228786,0.456535,0.467051,0.461733,0.945178
500,0.103800,0.261185,0.427002,0.522073,0.469775,0.943641


***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/config.json
Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoin

TrainOutput(global_step=900, training_loss=0.06402364306979709, metrics={'train_runtime': 159.6722, 'train_samples_per_second': 173.794, 'train_steps_per_second': 21.92, 'total_flos': 932785215873192.0, 'train_loss': 0.06402364306979709, 'epoch': 12.86})

In [58]:
trainer.save_model('model/distilbert-base-uncased-all-tokens')

Saving model checkpoint to model/distilbert-base-uncased-all-tokens
Configuration saved in model/distilbert-base-uncased-all-tokens/config.json
Model weights saved in model/distilbert-base-uncased-all-tokens/pytorch_model.bin
tokenizer config file saved in model/distilbert-base-uncased-all-tokens/tokenizer_config.json
Special tokens file saved in model/distilbert-base-uncased-all-tokens/special_tokens_map.json


## Model 1 - d4data/biomedical-ner-all

### Sentence Based Modelling

In [59]:
dic = {"tokens": sent_tokenized, "ner_tags": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by ". " b

In [60]:
dataset = Dataset.from_dict(dic)

In [61]:
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 11668
})

In [62]:
#For training, validation, and test partitions
"""
#Train, val, test partitions
train_test = dataset.train_test_split()
test_val = train_test['test'].train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': test_val['train'],
    'test': test_val['test']
    })
"""

#Just for training and validation partitions
train_test = dataset.train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': train_test['test']
    })

In [63]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 8751
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2917
    })
})

In [64]:
raw_datasets["train"][0]["ner_tags"]
#raw_datasets["train"][0]["pos_tags"]
#raw_datasets["train"][0]["chunk_tags"]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [65]:
raw_datasets['train']

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 8751
})

In [66]:
label_names = ['O','B','I']
label_names

['O', 'B', 'I']

In [67]:
words = raw_datasets["train"][0]["tokens"]
labels = [int(n) for n in raw_datasets["train"][0]["ner_tags"]]
#labels = raw_datasets["train"][0]["pos_tags"]
#labels = raw_datasets["train"][0]["chunk_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Since 2006 she had tried several times to lose weight , without success . 
O     O    O   O   O     O       O     O  O    O      O O       O       O 


In [68]:
model_checkpoint = "d4data/biomedical-ner-all"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/tokenizer_config.json


In [69]:
tokenizer.is_fast

True

In [70]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'since',
 '2006',
 'she',
 'had',
 'tried',
 'several',
 'times',
 'to',
 'lose',
 'weight',
 ',',
 'without',
 'success',
 '.',
 '[SEP]']

In [71]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [72]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [73]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [74]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            1,    2,    2,    2,    2,    2,    2,    2,    0, -100]])

In [75]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [76]:
model = AutoModelForTokenClassification.from_pretrained(    
    model_checkpoint
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/config.json
Model config DistilBertConfig {
  "_name_or_path": "d4data/biomedical-ner-all",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-Activity",
    "2": "B-Administration",
    "3": "B-Age",
    "4": "B-Area",
    "5": "B-Biological_attribute",
    "6": "B-Biological_structure",
    "7": "B-Clinical_event",
    "8": "B-Color",
    "9": "B-Coreference",
    "10": "B-Date",
    "11": "B-Detailed_description",
    "12": "B-Diagnostic_procedure",
    "13": "B-Disease_disorder",
    "14": "B-Distance",
    "15": "B-Dosage",
    "16": "B-Duration",
    "17": "B-Family_history",
    "18": "B-Frequency",
    "19": "B-Height",
    "20": "B-History",
    "21": "

In [77]:
args = TrainingArguments(
    "NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased",
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = 50,
    learning_rate=5e-5,
    num_train_epochs=50,
    weight_decay=0.01,
    metric_for_best_model = 'f1',
   load_best_model_at_end=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [78]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()

***** Running training *****
  Num examples = 8751
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 54700


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,No log,0.220473,0.223507,0.240041,0.231479,0.923262
100,No log,0.178907,0.233188,0.249353,0.241,0.932774
150,No log,0.170712,0.209794,0.210554,0.210173,0.934965
200,No log,0.161026,0.308015,0.341956,0.324099,0.937756
250,No log,0.1678,0.290117,0.282462,0.286239,0.940074
300,No log,0.179418,0.388704,0.302638,0.340314,0.939863
350,No log,0.160154,0.333333,0.44387,0.380741,0.938609
400,No log,0.199905,0.207132,0.204346,0.205729,0.933585
450,No log,0.154181,0.349614,0.422142,0.38247,0.940611
500,0.213900,0.154374,0.376731,0.422142,0.398146,0.941137


***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/config.json
Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased

TrainOutput(global_step=1650, training_loss=0.15572210947672527, metrics={'train_runtime': 250.5398, 'train_samples_per_second': 1746.429, 'train_steps_per_second': 218.329, 'total_flos': 244690187133744.0, 'train_loss': 0.15572210947672527, 'epoch': 1.51})

In [79]:
trainer.save_model('model/distilbert-base-uncased-sentence')

Saving model checkpoint to model/distilbert-base-uncased-sentence
Configuration saved in model/distilbert-base-uncased-sentence/config.json
Model weights saved in model/distilbert-base-uncased-sentence/pytorch_model.bin
tokenizer config file saved in model/distilbert-base-uncased-sentence/tokenizer_config.json
Special tokens file saved in model/distilbert-base-uncased-sentence/special_tokens_map.json


## Model 2 - pucpr/clinicalnerpt-medical

Whole document based tokenization

In [80]:
model_checkpoint = "pucpr/clinicalnerpt-medical"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Downloading:   0%|          | 0.00/151 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json
Model config BertConfig {
  "_name_or_path": "pucpr/clinicalnerpt-medical",
  "_num_labels": 3,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_ids": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MedicalDevice",
    "2": "I-MedicalDevice"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-MedicalDevice": 1,
    "I-MedicalDevice": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": null,
  "pooler_

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/vocab.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json
Model config BertConfig {
  "_name_or_path": "pucpr/clinicalnerpt-medical",
  "_num_labels": 3,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dro

In [81]:
tokenizer.is_fast

True

In [82]:
dic = {"tokens": HCs_tokenized, "ner_tags": labels_tokenized} #For the whole clinical case. We used this option for our paper.
#dic = {"tokens": sent_tokenized, "ner_tags": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by ". " b

In [83]:
dataset = Dataset.from_dict(dic)

In [84]:
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 741
})

In [85]:
#For training, validation, and test partitions
"""
#Train, val, test partitions
train_test = dataset.train_test_split()
test_val = train_test['test'].train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': test_val['train'],
    'test': test_val['test']
    })
"""

#Just for training and validation partitions
train_test = dataset.train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': train_test['test']
    })

In [86]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 555
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 186
    })
})

In [87]:
raw_datasets["train"][0]["ner_tags"]
#raw_datasets["train"][0]["pos_tags"]
#raw_datasets["train"][0]["chunk_tags"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [88]:
raw_datasets['train']

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 555
})

In [89]:
label_names = ['O','B','I']
label_names

['O', 'B', 'I']

In [90]:
words = raw_datasets["train"][0]["tokens"]
labels = [int(n) for n in raw_datasets["train"][0]["ner_tags"]]
#labels = raw_datasets["train"][0]["pos_tags"]
#labels = raw_datasets["train"][0]["chunk_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

This is a 79 - year - old woman with a history of hypertension , osteoporosis and hysterectomy for myomatosis at the age of 50 . She underwent transurethral resection of infiltrating bladder carcinoma in October 2006 . She subsequently received radiotherapy sessions up to a total of 50 Gy due to persistence of an external tumour mass in the right angle of the bladder , finishing this treatment in June 2007 . In August 2007 she began chemotherapy treatment due to persistence of the bladder lesion and metastases in the spine detected by follow - up CT scan and bone scintigraphy . Her digestive history began in February 2008 when she was admitted for episodes of rectorrhagia , initially scarce and distal , but which soon became more frequent and profuse , accompanied by symptoms of haemodynamic instability and severe anaemia with extensive transfusion requirements . 
 Total colonoscopy was performed , showing only changes typical of actinic proctitis with large friable and bleeding neovas

In [91]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

Token indices sequence length is longer than the specified maximum sequence length for this model (578 > 512). Running this sequence through the model will result in indexing errors


['[CLS]',
 'this',
 'is',
 'a',
 '79',
 '-',
 'year',
 '-',
 'old',
 'woman',
 'with',
 'a',
 'history',
 'of',
 'hy',
 '##pert',
 '##ension',
 ',',
 'os',
 '##te',
 '##op',
 '##oros',
 '##is',
 'and',
 'hy',
 '##ster',
 '##ect',
 '##omy',
 'for',
 'my',
 '##oma',
 '##tos',
 '##is',
 'at',
 'the',
 'age',
 'of',
 '50',
 '.',
 'she',
 'underwent',
 'trans',
 '##ure',
 '##th',
 '##ral',
 'res',
 '##ection',
 'of',
 'in',
 '##fil',
 '##trat',
 '##ing',
 'blad',
 '##der',
 'car',
 '##cino',
 '##ma',
 'in',
 'o',
 '##cto',
 '##ber',
 '2006',
 '.',
 'she',
 'subsequently',
 'received',
 'radio',
 '##ther',
 '##ap',
 '##y',
 'sessions',
 'up',
 'to',
 'a',
 'total',
 'of',
 '50',
 'g',
 '##y',
 'due',
 'to',
 'pers',
 '##isten',
 '##ce',
 'of',
 'an',
 'external',
 'tu',
 '##mou',
 '##r',
 'mass',
 'in',
 'the',
 'right',
 'angle',
 'of',
 'the',
 'blad',
 '##der',
 ',',
 'finishing',
 'this',
 'treatment',
 'in',
 'ju',
 '##ne',
 '2007',
 '.',
 'in',
 'august',
 '2007',
 'she',
 'began',
 'c

In [92]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [93]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [94]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [95]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ..., -100, -100, -100]])

In [96]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [97]:
model = AutoModelForTokenClassification.from_pretrained(    
    model_checkpoint
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json
Model config BertConfig {
  "_name_or_path": "pucpr/clinicalnerpt-medical",
  "_num_labels": 3,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_ids": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MedicalDevice",
    "2": "I-MedicalDevice"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-MedicalDevice": 1,
    "I-MedicalDevice": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": null,
  "pooler_

Downloading:   0%|          | 0.00/709M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/pytorch_model.bin
All model checkpoint weights were used when initializing BertForTokenClassification.

All the weights of BertForTokenClassification were initialized from the model checkpoint at pucpr/clinicalnerpt-medical.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForTokenClassification for predictions without further training.


In [98]:
args = TrainingArguments(
    "NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased",
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = 50,
    learning_rate=5e-5,
    num_train_epochs=50,
    weight_decay=0.01,
    metric_for_best_model = 'f1',
   load_best_model_at_end=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [99]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()

***** Running training *****
  Num examples = 555
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3500


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,No log,0.160077,0.281675,0.325786,0.302129,0.934798
100,No log,0.156098,0.440468,0.425786,0.433003,0.941619
150,No log,0.175201,0.43054,0.491195,0.458872,0.943305
200,No log,0.181505,0.465023,0.438994,0.451634,0.94475
250,No log,0.198468,0.437112,0.487421,0.460898,0.942113
300,No log,0.213199,0.454245,0.518239,0.484136,0.943191
350,No log,0.22587,0.439457,0.52956,0.480319,0.941974
400,No log,0.252911,0.509901,0.453459,0.480027,0.945802
450,No log,0.248783,0.475821,0.501258,0.488208,0.945092
500,0.078000,0.230288,0.44173,0.545912,0.488326,0.941631


***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/config.json
Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoin

TrainOutput(global_step=1150, training_loss=0.03881311048632083, metrics={'train_runtime': 332.8101, 'train_samples_per_second': 83.381, 'train_steps_per_second': 10.517, 'total_flos': 2382353890443360.0, 'train_loss': 0.03881311048632083, 'epoch': 16.43})

In [100]:
trainer.save_model('model/multilingual-BERT-all-tokens')

Saving model checkpoint to model/multilingual-BERT-all-tokens
Configuration saved in model/multilingual-BERT-all-tokens/config.json
Model weights saved in model/multilingual-BERT-all-tokens/pytorch_model.bin
tokenizer config file saved in model/multilingual-BERT-all-tokens/tokenizer_config.json
Special tokens file saved in model/multilingual-BERT-all-tokens/special_tokens_map.json


## Model 2 - pucpr/clinicalnerpt-medical

### Sentence Based Modelling

In [101]:
dic = {"tokens": sent_tokenized, "ner_tags": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by ". " b

In [102]:
dataset = Dataset.from_dict(dic)

In [103]:
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 11668
})

In [104]:
#For training, validation, and test partitions
"""
#Train, val, test partitions
train_test = dataset.train_test_split()
test_val = train_test['test'].train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': test_val['train'],
    'test': test_val['test']
    })
"""

#Just for training and validation partitions
train_test = dataset.train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': train_test['test']
    })

In [105]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 8751
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2917
    })
})

In [106]:
raw_datasets["train"][0]["ner_tags"]
#raw_datasets["train"][0]["pos_tags"]
#raw_datasets["train"][0]["chunk_tags"]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [107]:
raw_datasets['train']

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 8751
})

In [108]:
label_names = ['O','B','I']
label_names

['O', 'B', 'I']

In [109]:
words = raw_datasets["train"][0]["tokens"]
labels = [int(n) for n in raw_datasets["train"][0]["ner_tags"]]
#labels = raw_datasets["train"][0]["pos_tags"]
#labels = raw_datasets["train"][0]["chunk_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Since 2006 she had tried several times to lose weight , without success . 
O     O    O   O   O     O       O     O  O    O      O O       O       O 


In [110]:
model_checkpoint = "pucpr/clinicalnerpt-medical"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json
Model config BertConfig {
  "_name_or_path": "pucpr/clinicalnerpt-medical",
  "_num_labels": 3,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_ids": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MedicalDevice",
    "2": "I-MedicalDevice"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-MedicalDevice": 1,
    "I-MedicalDevice": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": null,
  "pooler_

In [111]:
tokenizer.is_fast

True

In [112]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'since',
 '2006',
 'she',
 'had',
 'tried',
 'several',
 'times',
 'to',
 'lose',
 'weight',
 ',',
 'without',
 'success',
 '.',
 '[SEP]']

In [113]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [114]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [115]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [116]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    1,    2,    2,    2,    2,    2,    2,    2,    2,    0, -100]])

In [117]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [118]:
model = AutoModelForTokenClassification.from_pretrained(    
    model_checkpoint
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pucpr--clinicalnerpt-medical/snapshots/0d889f90b203734b0ba45904781a6779c8eac2b9/config.json
Model config BertConfig {
  "_name_or_path": "pucpr/clinicalnerpt-medical",
  "_num_labels": 3,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "eos_token_ids": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MedicalDevice",
    "2": "I-MedicalDevice"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-MedicalDevice": 1,
    "I-MedicalDevice": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": null,
  "pooler_

In [119]:
args = TrainingArguments(
    "NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased",
    evaluation_strategy = IntervalStrategy.STEPS,
    eval_steps = 50,
    learning_rate=5e-5,
    num_train_epochs=50,
    weight_decay=0.01,
    metric_for_best_model = 'f1',
   load_best_model_at_end=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [120]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()

***** Running training *****
  Num examples = 8751
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 54700


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,No log,0.213414,0.235529,0.183135,0.206054,0.927267
100,No log,0.185129,0.241594,0.200724,0.219271,0.927296
150,No log,0.175822,0.340069,0.304708,0.321419,0.933018
200,No log,0.175697,0.278355,0.332644,0.303087,0.932284
250,No log,0.183824,0.331652,0.442318,0.379073,0.931647
300,No log,0.197462,0.43122,0.22866,0.298851,0.932342
350,No log,0.167895,0.38607,0.433006,0.408193,0.937046
400,No log,0.190235,0.179594,0.201242,0.189802,0.930726
450,No log,0.182239,0.341333,0.463528,0.393155,0.936134
500,0.197700,0.161157,0.3778,0.383859,0.380806,0.938074


***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-500/config.json
Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased

TrainOutput(global_step=2750, training_loss=0.14159858218106355, metrics={'train_runtime': 653.9792, 'train_samples_per_second': 669.058, 'train_steps_per_second': 83.642, 'total_flos': 861581789561556.0, 'train_loss': 0.14159858218106355, 'epoch': 2.51})

In [121]:
trainer.save_model('model/multilingual-BERT-sentence')

Saving model checkpoint to model/multilingual-BERT-sentence
Configuration saved in model/multilingual-BERT-sentence/config.json
Model weights saved in model/multilingual-BERT-sentence/pytorch_model.bin
tokenizer config file saved in model/multilingual-BERT-sentence/tokenizer_config.json
Special tokens file saved in model/multilingual-BERT-sentence/special_tokens_map.json
