In [55]:
import pandas as pd
import numpy as np
from pathlib import Path
import spacy
from sklearn.model_selection import train_test_split
import json
import os
import shutil

In [2]:
# Download and install the small English spaCy model; it is loaded further down with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


# Fetching Data 

In [3]:
# Load the tab-separated annotation table; the preview below shows one row per
# annotated span (filename, mark, label, offset1, offset2, span, code).
entities = pd.read_csv('data/entities.tsv',sep = '\t')
entities.head()

Unnamed: 0,filename,mark,label,offset1,offset2,span,code
0,es-S0212-71992007000100007-1,T1,ENFERMEDAD,40,61,arterial hypertension,38341003
1,es-S0212-71992007000100007-1,T2,ENFERMEDAD,66,79,polyarthrosis,36186002
2,es-S0212-71992007000100007-1,T3,ENFERMEDAD,1682,1698,pleural effusion,60046008
3,es-S0212-71992007000100007-1,T4,ENFERMEDAD,1859,1875,pleural effusion,60046008
4,es-S0212-71992007000100007-1,T5,ENFERMEDAD,1626,1648,lower lobe atelectasis,46621007


In [4]:
# Directory holding one plain-text document per annotated file.
text_path = 'data/text/'
# Path.glob yields entries in arbitrary, filesystem-dependent order. Sorting
# makes the list (and therefore the seeded train/val/test split derived from
# it) identical on every machine and every run.
text_files = sorted(Path(text_path).glob('*.txt'))

# Split into Train, Validation, and Test Sets

In [5]:
# Target share of the files for each split (the three values sum to 1).
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

In [6]:
# Empty placeholder frames, one per split.
# NOTE(review): none of these is referenced again in the visible part of this notebook.
train_df = pd.DataFrame()
test_df = pd.DataFrame()
valid_df = pd.DataFrame()

In [10]:
# First cut: keep train_ratio of the files for training. The remaining
# 1 - train_ratio is temporarily called "test" and is divided into
# validation and test sets in a later cell. Fixed seed for repeatability.
train_text_files, test_text_files = train_test_split(text_files, test_size=1 - train_ratio, random_state=1)

In [11]:
# Sanity check: number of files assigned to the training split.
len(train_text_files)

562

In [12]:
# Sanity check: size of the hold-out that is about to be divided into validation and test.
len(test_text_files)

188

In [13]:
# Second cut: divide the hold-out so that test gets test_ratio / (test_ratio + validation_ratio)
# of it (about 40%) and validation gets the rest. Rebinds test_text_files to the final test split.
val_text_files, test_text_files = train_test_split(test_text_files, test_size=test_ratio/(test_ratio + validation_ratio), random_state=1) 

In [14]:
# Sanity check: number of files in the final test split.
len(test_text_files)

76

In [15]:
# Sanity check: number of files in the validation split.
len(val_text_files)

112

In [16]:
# Map each training document's base name (no directory, no ".txt") to its
# full text. Path.stem replaces the previous slicing by len(text_path) and a
# hard-coded -4, which silently produced wrong keys whenever text_path was
# written without its trailing separator.
train_file_text = {
    Path(file).stem: Path(file).read_text(encoding="UTF-8")
    for file in train_text_files
}

# From here on the split is identified by base names instead of paths.
# NOTE: because the name is rebound, re-running this cell without first
# re-running the split cell will fail to find the files.
train_text_files = [Path(text_file).stem for text_file in train_text_files]

In [17]:
# Map each test document's base name (no directory, no ".txt") to its full
# text. Path.stem replaces the previous slicing by len(text_path) and a
# hard-coded -4, which silently produced wrong keys whenever text_path was
# written without its trailing separator.
test_file_text = {
    Path(file).stem: Path(file).read_text(encoding="UTF-8")
    for file in test_text_files
}

# From here on the split is identified by base names instead of paths.
# NOTE: because the name is rebound, re-running this cell without first
# re-running the split cells will fail to find the files.
test_text_files = [Path(text_file).stem for text_file in test_text_files]

In [18]:
# Map each validation document's base name (no directory, no ".txt") to its
# full text. Path.stem replaces the previous slicing by len(text_path) and a
# hard-coded -4, which silently produced wrong keys whenever text_path was
# written without its trailing separator.
val_file_text = {
    Path(file).stem: Path(file).read_text(encoding="UTF-8")
    for file in val_text_files
}

# From here on the split is identified by base names instead of paths.
# NOTE: because the name is rebound, re-running this cell without first
# re-running the split cells will fail to find the files.
val_text_files = [Path(text_file).stem for text_file in val_text_files]

# Sentence Splitting & Tokenization

In [19]:
# Shared spaCy pipeline; the tokenize and dump_jsons functions below read this module-level name.
nlp = spacy.load("en_core_web_sm")

In [24]:
def tokenize(file_name, file_text):
    """Split one document into sentences and tokens with the shared `nlp` pipeline.

    Args:
        file_name: key of the document inside `file_text`.
        file_text: mapping from file name to the document's raw text.

    Returns:
        A pair ``(sentence_ids, tokens)``:
        ``sentence_ids`` is ``[0, 1, ..., n_sentences - 1]`` and ``tokens`` is
        a list with one entry per sentence, each a list of
        ``(token_text, token_idx)`` tuples taken from ``tk.text`` / ``tk.idx``.
    """
    parsed = nlp(file_text[file_name])
    # One inner list per sentence, in the order the pipeline yields them.
    tokens = [[(word.text, word.idx) for word in sentence] for sentence in parsed.sents]
    sentence_ids = list(range(len(tokens)))
    return sentence_ids, tokens

# Converting Spans to IOB Format

In [21]:
def get_bio_tags(tokens, entities):
    """Assign a BIO tag to every token of one sentence.

    A token is tagged as part of an entity only when it lies completely
    inside the entity's ``[start, end]`` character span. The first such token
    gets ``'B-<label>'``, following ones ``'I-<label>'``; everything else is
    ``'O'``.

    Args:
        tokens: sequence of ``(text, start_offset)`` pairs for one sentence,
            in document order.
        entities: list of ``[start, end, label, ...]`` spans that are still
            to be matched. **Mutated in place**: it is sorted by
            ``(start, end)`` and spans are popped from the front as soon as
            the tokens have moved past them, so the same list can be handed
            to consecutive sentences of one document.

    Returns:
        list of str with one tag per token.
    """
    # Matching walks the list front to back, so it must be in offset order;
    # otherwise an early span stuck behind a later one would never be reached.
    entities.sort(key=lambda ent: (ent[0], ent[1]))

    tags = []
    inside = False  # True while the previous token belonged to entities[0]
    for token in tokens:
        text, offset = token[0], token[1]
        # Length of the token *text*. The earlier code used len(token), which
        # is the tuple length and therefore always 2.
        token_end = offset + len(text)

        # Discard every span that ends before this token starts *before*
        # testing the token, so a token that immediately follows one entity
        # can still open the next one.
        while entities and entities[0][1] <= offset:
            entities.pop(0)
            inside = False

        if entities:
            start, end, lbl = entities[0][0], entities[0][1], entities[0][2]
            if offset >= start and token_end <= end:
                tags.append(('I-' if inside else 'B-') + lbl)
                # The span is finished once a token reaches its end offset.
                inside = token_end < end
                if not inside:
                    entities.pop(0)
                continue

        tags.append('O')
        inside = False
    return tags

In [22]:
# Field names of the per-file record built by dump_jsons; each becomes a key of its result dict.
file_info = ['text', 'entities', 'tags', 'sentence_ids', 'tokens']

In [38]:
def dump_jsons(file_info, text_files, entities, file_text):
    """Build the per-file records (text, entities, tokens, tags) for one split.

    Args:
        file_info: field names to create in the result. The function fills
            'text', 'entities', 'tags', 'sentence_ids' and 'tokens', so these
            must be present.
        text_files: file names (without directory or extension) in the split;
            position ``i`` of every result list belongs to ``text_files[i]``.
        entities: annotation DataFrame with at least the columns
            'filename', 'label', 'offset1', 'offset2' and 'span'.
        file_text: mapping from file name to raw document text.

    Returns:
        dict mapping each name in `file_info` to a list with one entry per
        file: the raw text, the ``[offset1, offset2, label, span]`` entity
        lists (in table order), the sentence ids, the per-sentence token
        lists and the per-sentence BIO tag lists.

    Every file is tokenized and tagged, including files that have no
    annotation rows (they get all-'O' tags); previously such files were left
    as empty records.
    """
    n_files = len(text_files)
    # 'text' slots start as empty strings, every other field as its own empty list.
    res = {
        info: ([""] * n_files if info == 'text' else [[] for _ in range(n_files)])
        for info in file_info
    }

    # Split the annotation table once instead of re-filtering it for every file.
    rows_by_file = {name: rows for name, rows in entities.groupby('filename', sort=False)}

    for idx, file_name in enumerate(text_files):
        res['text'][idx] = file_text[file_name]
        res['sentence_ids'][idx], res['tokens'][idx] = tokenize(file_name, file_text)

        file_rows = rows_by_file.get(file_name)
        if file_rows is not None:
            # Columns are read by name so that a reordered table cannot
            # silently swap offsets and labels.
            for row in file_rows.itertuples(index=False):
                res['entities'][idx].append([row.offset1, row.offset2, row.label, row.span])

        # get_bio_tags consumes the list it is given across sentences, so hand
        # it an offset-ordered copy and keep res['entities'] untouched.
        pending = sorted(res['entities'][idx], key=lambda ent: (ent[0], ent[1]))
        for sent_id in res['sentence_ids'][idx]:
            res['tags'][idx].append(get_bio_tags(res['tokens'][idx][sent_id], pending))
    return res

In [39]:
# Build the text / entity / token / tag records for the training split.
res_train = dump_jsons(file_info, train_text_files, entities, train_file_text)

In [40]:
# Which file (position in train_text_files) and which sentence of it the inspection cells below display.
file_no = 1
sentence_no = 1

In [42]:
# Tokens of the selected sentence as (text, character offset) pairs.
res_train['tokens'][file_no][sentence_no]

[('He', 126),
 ('had', 129),
 ('a', 133),
 ('personal', 135),
 ('history', 144),
 ('of', 152),
 ('arterial', 155),
 ('hypertension', 164),
 ('treated', 177),
 ('with', 185),
 ('angiotensin', 190),
 ('converting', 202),
 ('enzyme', 213),
 ('inhibitors', 220),
 (',', 230),
 ('surgery', 232),
 ('for', 240),
 ('duodenal', 244),
 ('ulcus', 253),
 ('in', 259),
 ('1961', 262),
 ('and', 267),
 ('cholecystectomy', 271),
 ('in', 287),
 ('2002', 290),
 ('.', 294),
 ('\n', 295)]

In [43]:
# BIO tags of the same sentence, one per token shown above.
res_train['tags'][file_no][sentence_no]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ENFERMEDAD',
 'I-ENFERMEDAD',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ENFERMEDAD',
 'I-ENFERMEDAD',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [44]:
# Sentence ids recorded for the selected file.
res_train['sentence_ids'][file_no]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [45]:
# Annotated spans of the selected file as [offset1, offset2, label, span], in table order.
res_train['entities'][file_no]

[[155, 176, 'ENFERMEDAD', 'arterial hypertension'],
 [244, 258, 'ENFERMEDAD', 'duodenal ulcus'],
 [299, 335, 'ENFERMEDAD', 'infrarenal abdominal aortic aneurysm'],
 [549, 566, 'ENFERMEDAD', 'horseshoe kidneys'],
 [327, 335, 'ENFERMEDAD', 'aneurysm'],
 [739, 747, 'ENFERMEDAD', 'aneurysm'],
 [1172, 1180, 'ENFERMEDAD', 'aneurysm'],
 [1376, 1385, 'ENFERMEDAD', 'aneurysms'],
 [1575, 1583, 'ENFERMEDAD', 'aneurysm'],
 [1557, 1583, 'ENFERMEDAD', 'infrarenal aortic aneurysm'],
 [2186, 2207, 'ENFERMEDAD', 'aneurysmal dilatation']]

In [46]:
# Raw text of the selected file, for checking the offsets above by eye.
res_train['text'][file_no]

'An 81-year-old man was referred from the outpatient clinic to our urology department for symptoms of the lower urinary tract. He had a personal history of arterial hypertension treated with angiotensin converting enzyme inhibitors, surgery for duodenal ulcus in 1961 and cholecystectomy in 2002.\nAn infrarenal abdominal aortic aneurysm was diagnosed by chance during the abdomino-pelvic ultrasound scan. The CT scan revealed that it did not affect the iliac bifurcation. The examination also showed that the kidneys had a morphology compatible with horseshoe kidneys, with an isthmus located at the level of the infrarenal abdominal aorta. Our case reveals the most favourable situation, in principle, for the surgical approach, since the aneurysm originates distal, 4 cm from the exit of the main renal arteries, which are two, one for each renal half. There is no additional artery at the level of the isthmus.\nIn successive controls, a progressive increase in the aortic diameter was observed, 

In [47]:
dump_file = 'data/processed_inp_data_train.json'

# Serialise the training records straight into the output file.
with open(dump_file, 'w') as out:
    json.dump(res_train, out)

In [48]:
# Build the text / entity / token / tag records for the test split.
res_test = dump_jsons(file_info, test_text_files, entities, test_file_text)

In [49]:
dump_file = 'data/processed_inp_data_test.json'

# Serialise the test records straight into the output file.
with open(dump_file, 'w') as out:
    json.dump(res_test, out)

In [50]:
# Build the text / entity / token / tag records for the validation split.
res_val = dump_jsons(file_info, val_text_files, entities, val_file_text)

In [51]:
dump_file = 'data/processed_inp_data_val.json'

# Serialise the validation records straight into the output file.
with open(dump_file, 'w') as out:
    json.dump(res_val, out)

In [54]:
# Destination for the training documents. exist_ok=True creates the directory
# tree if needed and is a no-op when it already exists, without the
# check-then-create gap of a separate os.path.exists test.
train_dir = "data/text_files/train"
os.makedirs(train_dir, exist_ok=True)

In [56]:
# Mirror every training document into the training directory.
for doc_name in train_text_files:
    txt_name = doc_name + ".txt"
    shutil.copyfile(
        os.path.join(text_path, txt_name),
        os.path.join(train_dir, txt_name),
    )

In [57]:
# Destination for the validation documents. exist_ok=True creates the
# directory tree if needed and is a no-op when it already exists, without the
# check-then-create gap of a separate os.path.exists test.
val_dir = "data/text_files/val"
os.makedirs(val_dir, exist_ok=True)

In [58]:
# Mirror every validation document into the validation directory.
for doc_name in val_text_files:
    txt_name = doc_name + ".txt"
    shutil.copyfile(
        os.path.join(text_path, txt_name),
        os.path.join(val_dir, txt_name),
    )

In [59]:
# Destination for the test documents. exist_ok=True creates the directory
# tree if needed and is a no-op when it already exists, without the
# check-then-create gap of a separate os.path.exists test.
test_dir = "data/text_files/test"
os.makedirs(test_dir, exist_ok=True)

In [60]:
# Mirror every test document into the test directory.
for doc_name in test_text_files:
    txt_name = doc_name + ".txt"
    shutil.copyfile(
        os.path.join(text_path, txt_name),
        os.path.join(test_dir, txt_name),
    )

In [61]:
# Annotation rows of every training file, in the order of train_text_files,
# with the index renumbered from 0. The per-file slices are concatenated once;
# the previous loop re-concatenated the growing frame, dropped all of its rows
# in place and re-assigned its columns on every iteration.
_train_frames = [entities.loc[entities['filename'] == file] for file in train_text_files]
# pd.concat rejects an empty list, so an empty split yields an empty frame.
train_entities_file = (
    pd.concat(_train_frames, ignore_index=True) if _train_frames else pd.DataFrame()
)

In [63]:
# Persist the training annotations as CSV without the row index.
train_entities_file.to_csv("data/train_entities.csv", index=False)

In [64]:
# Annotation rows of every validation file, in the order of val_text_files,
# with the index renumbered from 0. The per-file slices are concatenated once;
# the previous loop re-concatenated the growing frame, dropped all of its rows
# in place and re-assigned its columns on every iteration.
_val_frames = [entities.loc[entities['filename'] == file] for file in val_text_files]
# pd.concat rejects an empty list, so an empty split yields an empty frame.
val_entities_file = (
    pd.concat(_val_frames, ignore_index=True) if _val_frames else pd.DataFrame()
)

In [65]:
# Persist the validation annotations as CSV without the row index.
val_entities_file.to_csv("data/val_entities.csv", index=False)

In [66]:
# Annotation rows of every test file, in the order of test_text_files, with
# the index renumbered from 0. The per-file slices are concatenated once; the
# previous loop re-concatenated the growing frame, dropped all of its rows in
# place and re-assigned its columns on every iteration.
_test_frames = [entities.loc[entities['filename'] == file] for file in test_text_files]
# pd.concat rejects an empty list, so an empty split yields an empty frame.
test_entities_file = (
    pd.concat(_test_frames, ignore_index=True) if _test_frames else pd.DataFrame()
)

In [67]:
# Persist the test annotations as CSV without the row index.
test_entities_file.to_csv("data/test_entities.csv", index=False)