[036ed5]: / transform_data.py

Download this file

57 lines (44 with data), 2.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import json
#split_percent = round(len(train['annotations']) * 0.7)
corpus = "annotated_text.json"

# Number of leading examples written to the training set; every remaining
# example goes to the dev set.
TRAIN_SIZE = 300


def _build_docbin(nlp, examples):
    """Convert ``(text, annotation)`` pairs into a spaCy ``DocBin``.

    Args:
        nlp: spaCy pipeline used only to tokenize each text (``make_doc``).
        examples: iterable of ``(text, annot)`` pairs, where
            ``annot["entities"]`` is an iterable of ``(start, end, label)``
            character-offset triples.

    Returns:
        A ``DocBin`` holding one ``Doc`` per example, with ``doc.ents`` set
        to the entities that could be mapped onto tokens.
    """
    doc_bin = DocBin()
    for text, annot in tqdm(examples):
        doc = nlp.make_doc(text)  # tokenize only, no pipeline components run
        ents = []
        for start, end, label in annot["entities"]:
            # "contract" shrinks the character range to the tokens lying
            # completely inside it; char_span returns None when no such
            # token span exists.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                print(msg)
            else:
                ents.append(span)
        doc.ents = ents  # label the text with the ents
        doc_bin.add(doc)
    return doc_bin


# Load the annotated corpus
with open(corpus, encoding='utf8') as f:
    reports = json.load(f)

# Convert the tagged data to a format understood by SpaCy, keeping only
# texts that carry at least one entity annotation.
TRAIN_DATA = [
    [content, entities]
    for content, entities in reports["annotations"]
    if len(entities["entities"]) > 0
]

# A blank French pipeline is enough here: only its tokenizer is used.
nlp = spacy.blank("fr")

# Training split: the first TRAIN_SIZE annotated examples.
db = _build_docbin(nlp, TRAIN_DATA[:TRAIN_SIZE])
db.to_disk("./new_train.spacy")  # save the docbin object

# Dev split: everything after the training examples. The slice is open-ended
# so the final example is included (a "[300:-1]" slice would drop it).
db = _build_docbin(nlp, TRAIN_DATA[TRAIN_SIZE:])
db.to_disk("./new_dev.spacy")  # save the docbin object