jupyter:
jupytext:
formats: ipynb,md
text_representation:
extension: .md
format_name: markdown
format_version: "1.3"
jupytext_version: 1.11.4
kernelspec:
display_name: Python 3
language: python
name: python3
%reload_ext autoreload
%autoreload 2
import pandas as pd
import os
import context
from edsnlp.utils.brat import BratConnector
import spacy
This notebook builds on Ivan Lerner's work at EDS. Make sure you have cloned the corresponding repository before running the cells below.
# Relative path to the section dataset read by the connector below.
data_dir = '../../data/section_dataset/'
# Connector from edsnlp.utils.brat, pointed at that directory
# (presumably BRAT-format annotation files — confirm against the dataset).
brat = BratConnector(data_dir)
# get_brat() returns a pair, unpacked here as the texts and their annotations.
# `texts` is later copied and read through its `note_text` / `note_id` columns.
texts, annotations = brat.get_brat()
# Bare expression: shows `texts` when it is the last line of a notebook cell.
texts
# Start from an empty French pipeline and append the two components by name.
# NOTE(review): 'normaliser' and 'sections' are presumably registered as a
# side effect of importing edsnlp — confirm.
nlp = spacy.blank('fr')
for pipe_name in ('normaliser', 'sections'):
    nlp.add_pipe(pipe_name)
# Work on a copy so the `texts` frame loaded earlier is left untouched.
df = texts.copy()
# Run the pipeline on every note and keep the resulting doc next to its text.
df['doc'] = [nlp(note_text) for note_text in df['note_text']]
def assign_id(row):
    """Copy the row's ``note_id`` onto the ``note_id`` custom attribute of its doc.

    ``row`` is one row of ``df``; it must expose ``doc`` and ``note_id``.
    NOTE(review): assumes the ``note_id`` extension is already registered on
    the doc class — confirm where that happens.
    """
    row.doc._.note_id = row.note_id


# Called only for its side effect on each doc; an explicit loop makes that
# clear and produces no cell output to suppress.
for row_label, record in df.iterrows():
    assign_id(record)
def _section_title_records(doc):
    """Return one dict per entry of ``doc._.section_titles``.

    Each dict holds the entry's text, its label and its character offsets.
    """
    records = []
    for title in doc._.section_titles:
        records.append({
            'lexical_variant': title.text,
            'label': title.label_,
            'start': title.start_char,
            'end': title.end_char,
        })
    return records


# One list of match dicts per note (empty when the doc has no section titles).
df['matches'] = df['doc'].apply(_section_title_records)
# One row per detected match: explode() repeats note_text / note_id (and the
# original index label) for every element of the `matches` list.
df = df[['note_text', 'note_id', 'matches']].explode('matches')
# Notes whose `matches` list was empty explode to a single NaN row; drop them
# (this also drops rows with a missing note_text or note_id).
df = df.dropna()

# Expand each match dict into its own columns. The columns are built by key
# and assigned positionally, which avoids the per-row Series construction of
# `apply(pd.Series)`, does not depend on the key order inside the dicts, is
# unaffected by the duplicated index left by explode(), and still works when
# no match survived the dropna() above.
match_columns = ['lexical_variant', 'label', 'start', 'end']
for column in match_columns:
    df[column] = [match[column] for match in df['matches']]

# The dict column is redundant once expanded.
df = df.drop('matches', axis=1)
# Bare expression: previews the first 20 rows in the notebook.
df.head(20)
# Rename the match columns, then append three columns: a copy of the label
# and two empty modifier fields.
# NOTE(review): presumably this is the schema the labelling tool expects — confirm.
column_renames = {
    'start': 'offset_begin',
    'end': 'offset_end',
    'label': 'label_value',
}
df = df.rename(columns=column_renames)
df = df.assign(
    label_name=df['label_value'],
    modifier_type='',
    modifier_result='',
)
from ipywidgets import Output, Button, VBox, Layout, Text, HTML
from IPython.display import display
from labeltool.labelling import GlobalLabels, Labels, Labelling
# Output widget that is handed to the labelling tool further down.
out = Output()

# Register every distinct label value found in the data, all with the same
# colour and selection type.
labels = Labels()
shared_label_options = {'color': 'green', 'selection_type': 'button'}
for label in df['label_value'].unique():
    labels.add(name=label, **shared_label_options)
# Keyword arguments for the labelling session.
# NOTE(review): `from_save=True` presumably resumes from `save_path` when that
# file exists — confirm against labeltool.
labelling_options = {
    'save_path': 'testing.pickle',
    'labels_dict': labels.dict,
    'from_save': True,
    'out': out,
    'display': display,
}
labeller = Labelling(df, **labelling_options)
labeller.run()
# Bare expression: renders the output widget in the notebook.
out