# transform_data.py
1
from tqdm import tqdm
2
import spacy
3
from spacy.tokens import DocBin
4
import json
5
6
#split_percent = round(len(train['annotations']) * 0.7)
7
8
corpus = "annotated_text.json"

# Read the annotation export. The code below expects a top-level
# "annotations" list whose items unpack into (text, {"entities": [...]}).
with open(corpus, encoding='utf8') as fh:
    reports = json.load(fh)

# Keep only the examples that carry at least one entity annotation; each kept
# item is a two-element list [text, annotation_dict].
TRAIN_DATA = [
    [content, entities]
    for content, entities in reports["annotations"]
    if len(entities["entities"]) > 0
]
19
20
# --- Training split: the first 300 annotated examples ---
nlp = spacy.blank("fr")  # blank French pipeline, used here to tokenize
db = DocBin()  # container that serializes the Doc objects to disk

for text, annot in tqdm(TRAIN_DATA[:300]):
    doc = nlp.make_doc(text)  # tokenize the raw text into a Doc
    spans = []
    for start, end, label in annot["entities"]:  # character offsets + label
        candidate = doc.char_span(start, end, label=label, alignment_mode="contract")
        if candidate is not None:
            spans.append(candidate)
            continue
        # char_span returned None: no token span could be built for these
        # offsets, so report the entity and leave it out.
        print(f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n")
    doc.ents = spans  # attach the aligned entity spans to the Doc
    db.add(doc)

db.to_disk("./new_train.spacy")  # write the training DocBin
37
38
# --- Dev split: every annotated example after the first 300 ---
nlp = spacy.blank("fr")  # blank French pipeline, used here to tokenize

db = DocBin()  # container that serializes the Doc objects to disk

# Slice with [300:] rather than [300:-1]: a stop index of -1 excludes the
# final element, which silently dropped the last example from the dev set.
for text, annot in tqdm(TRAIN_DATA[300:]):
    doc = nlp.make_doc(text)  # tokenize the raw text into a Doc
    ents = []
    for start, end, label in annot["entities"]:  # character offsets + label
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # No token span could be built for these offsets; report the
            # entity and leave it out instead of failing the whole run.
            msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
            print(msg)
        else:
            ents.append(span)
    doc.ents = ents  # attach the aligned entity spans to the Doc
    db.add(doc)

db.to_disk("./new_dev.spacy")  # write the dev DocBin
56