transform_data.py

from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import json

# split_percent = round(len(train['annotations']) * 0.7)

corpus = "annotated_text.json"

# Load the annotated corpus
with open(corpus, encoding="utf8") as f:
    reports = json.load(f)

# Convert the tagged data to the (text, annotations) format understood by spaCy
TRAIN_DATA = []
for content, entities in reports["annotations"]:
    if len(entities["entities"]) > 0:
        TRAIN_DATA.append((content, entities))

# Build the training set from the first 300 annotated reports
nlp = spacy.blank("fr")  # blank French pipeline, used only for tokenization
db = DocBin()            # DocBin collects the Doc objects for serialization

for text, annot in tqdm(TRAIN_DATA[:300]):
    doc = nlp.make_doc(text)  # create a Doc object from the raw text
    ents = []
    for start, end, label in annot["entities"]:  # character offsets
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            msg = (
                f"Skipping entity [{start}, {end}, {label}] in the following text because "
                f"the character span '{doc.text[start:end]}' does not align with token "
                f"boundaries:\n\n{repr(text)}\n"
            )
            print(msg)
        else:
            ents.append(span)
    doc.ents = ents  # attach the entity spans to the Doc
    db.add(doc)

db.to_disk("./new_train.spacy")  # save the training DocBin

# Build the development set from the remaining annotated reports
nlp = spacy.blank("fr")
db = DocBin()

for text, annot in tqdm(TRAIN_DATA[300:]):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            msg = (
                f"Skipping entity [{start}, {end}, {label}] in the following text because "
                f"the character span '{doc.text[start:end]}' does not align with token "
                f"boundaries:\n\n{repr(text)}\n"
            )
            print(msg)
        else:
            ents.append(span)
    doc.ents = ents  # attach the entity spans to the Doc
    db.add(doc)

db.to_disk("./new_dev.spacy")  # save the development DocBin