transform_data.py

from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import json

# split_percent = round(len(train['annotations']) * 0.7)

corpus = "annotated_text.json"

# Load the annotated corpus
with open(corpus, encoding="utf8") as f:
    reports = json.load(f)

# Convert the tagged data to the (text, annotations) format understood by spaCy
TRAIN_DATA = []
for content, entities in reports["annotations"]:
    if len(entities["entities"]) > 0:
        TRAIN_DATA.append((content, entities))

# Build the training set from the first 300 annotated reports
nlp = spacy.blank("fr")  # blank French pipeline, used only for tokenization
db = DocBin()            # DocBin collects the Doc objects for serialization

for text, annot in tqdm(TRAIN_DATA[:300]):
    doc = nlp.make_doc(text)  # create a Doc object from the raw text
    ents = []
    for start, end, label in annot["entities"]:  # character offsets
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            msg = (
                f"Skipping entity [{start}, {end}, {label}] in the following text because "
                f"the character span '{doc.text[start:end]}' does not align with token "
                f"boundaries:\n\n{repr(text)}\n"
            )
            print(msg)
        else:
            ents.append(span)
    doc.ents = ents  # attach the entity spans to the Doc
    db.add(doc)

db.to_disk("./new_train.spacy")  # save the training DocBin

# Build the development set from the remaining annotated reports
nlp = spacy.blank("fr")
db = DocBin()

for text, annot in tqdm(TRAIN_DATA[300:]):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            msg = (
                f"Skipping entity [{start}, {end}, {label}] in the following text because "
                f"the character span '{doc.text[start:end]}' does not align with token "
                f"boundaries:\n\n{repr(text)}\n"
            )
            print(msg)
        else:
            ents.append(span)
    doc.ents = ents  # attach the entity spans to the Doc
    db.add(doc)

db.to_disk("./new_dev.spacy")  # save the development DocBin