[cad161]: / notebooks / sections / testing.md

Download this file

169 lines (131 with data), 2.4 kB


---
jupyter:
  jupytext:
    formats: ipynb,md
    text_representation:
      extension: .md
      format_name: markdown
      format_version: "1.3"
      jupytext_version: 1.11.4
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---


%reload_ext autoreload
%autoreload 2
import pandas as pd
import os
import context
from edsnlp.utils.brat import BratConnector

import spacy

Sections dataset

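This notebook uses the sections dataset from Ivan Lerner's work at EDS. Make sure you have cloned the repository that contains it, so that the data is available under `../../data/section_dataset/` before running the cells below.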

# Relative path to the section dataset directory handed to BratConnector.
data_dir = '../../data/section_dataset/'
brat = BratConnector(data_dir)
# get_brat() returns two objects, unpacked here as the texts and their annotations.
# NOTE(review): `texts` is used below as a DataFrame with `note_text` and
# `note_id` columns — confirm against BratConnector.
texts, annotations = brat.get_brat()
# Bare expression: shown as output only if it ends a notebook cell.
texts
# Blank French pipeline; both components are added by their registered names.
# NOTE(review): 'normaliser' and 'sections' are presumably registered by edsnlp — confirm.
nlp = spacy.blank('fr')
nlp.add_pipe('normaliser')
nlp.add_pipe('sections')
# Work on a copy so that `texts` itself is not given the extra column.
df = texts.copy()
# Run the pipeline on every note text and keep the result in a `doc` column.
df['doc'] = df.note_text.apply(nlp)
def assign_id(row):
    """Copy ``row.note_id`` onto the ``note_id`` attribute of ``row.doc._``.

    The object behind ``row.doc`` is modified in place; nothing is returned.
    """
    identifier = row.note_id
    setattr(row.doc._, 'note_id', identifier)
# Copy each row's note_id onto its document (side effect only); the trailing
# semicolon keeps the notebook from echoing the result of apply().
df.apply(assign_id, axis=1);

# For every document, build one dict per entry of `doc._.section_titles`
# holding its text, label and character offsets.
df['matches'] = df.doc.apply(lambda d: [dict(
    lexical_variant=s.text,
    label=s.label_,
    start=s.start_char,
    end=s.end_char
) for s in d._.section_titles])

# One row per match. Documents with no match explode to NaN and are dropped.
df = df[['note_text', 'note_id', 'matches']].explode('matches')
df = df.dropna()

# Expand the match dicts into columns with a single DataFrame construction.
# Unlike `df.matches.apply(pd.Series)`, this does not build one Series per row,
# and it still yields the four (empty) columns when there are no matches at all.
# The frame reuses df's index so the column assignment lines up row by row.
match_columns = ['lexical_variant', 'label', 'start', 'end']
df[match_columns] = pd.DataFrame(
    df.matches.tolist(),
    index=df.index,
    columns=match_columns,
)
df = df.drop('matches', axis=1)
df.head(20)

# Rename the columns and add the extra ones passed to the labelling tool below.
# NOTE(review): the exact schema expected by labeltool is not visible here — confirm.
df = df.rename(columns={'start': 'offset_begin', 'end': 'offset_end', 'label': 'label_value'})
df['label_name'] = df.label_value
df['modifier_type'] = ''
df['modifier_result'] = ''
from ipywidgets import Output, Button, VBox, Layout, Text, HTML
from IPython.display import display
from labeltool.labelling import GlobalLabels, Labels, Labelling

# Output widget handed to the labelling tool.
out = Output()
labels = Labels()

# Register one green, button-type label for each distinct label value in df.
for label in df.label_value.unique():
    labels.add(name=label, color='green', selection_type='button')

# NOTE(review): with from_save=True the tool presumably resumes from
# 'testing.pickle' when it exists — confirm against labeltool.
labeller = Labelling(
    df,
    save_path='testing.pickle',
    labels_dict=labels.dict,
    from_save=True,
    out=out,
    display=display,
)
labeller.run()
# Bare expression: shows the output widget when it ends a notebook cell.
out