Downloads: 1

[cad161]: / notebooks / sections / testing.md

169 lines (131 with data), 2.4 kB

---
jupyter:
  jupytext:
    formats: ipynb,md
    text_representation:
      extension: .md
      format_name: markdown
      format_version: "1.3"
      jupytext_version: 1.11.4
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---

%reload_ext autoreload
%autoreload 2

import pandas as pd

import os

import context

from edsnlp.utils.brat import BratConnector

import spacy

Sections dataset

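We are using the section dataset from Ivan Lerner's work at EDS. Make sure you have cloned the corresponding repository beforehand: the code below reads the data from `../../data/section_dataset/`.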

# Relative path to the section dataset directory handed to BratConnector.
data_dir = '../../data/section_dataset/'

# Connector over the dataset directory.
brat = BratConnector(data_dir)

# get_brat() yields two objects; only `texts` is used in the code shown here,
# as a DataFrame with at least `note_text` and `note_id` columns.
texts, annotations = brat.get_brat()

# Bare expression: displayed as output when it ends a notebook cell.
texts

# Blank French pipeline; the components it runs are added just below.
nlp = spacy.blank('fr')

# NOTE(review): 'normaliser' and 'sections' are not built-in spaCy components;
# presumably they are registered as a side effect of the imports above - confirm.
nlp.add_pipe('normaliser')
nlp.add_pipe('sections')

# Work on a copy so that `texts` itself is left unmodified.
df = texts.copy()

# Run the pipeline on every note text; each result is stored in the new `doc` column.
df['doc'] = df.note_text.apply(nlp)

def assign_id(row):
    """Copy a row's ``note_id`` onto the ``_`` namespace of its ``doc``.

    Args:
        row: Any object exposing ``note_id`` and ``doc`` attributes, such as
            the row handed over by ``df.apply(..., axis=1)``.

    Returns:
        None. The only effect is the in-place assignment on ``row.doc._``.
    """
    # Read the identifier first, then resolve the target namespace, matching
    # the evaluation order of a plain attribute assignment.
    identifier = row.note_id
    extensions = row.doc._
    extensions.note_id = identifier

# Copy each note's identifier onto its parsed document. `apply` is used only
# for its side effect; the trailing semicolon keeps the notebook from echoing
# the resulting column of `None` values.
df.apply(assign_id, axis=1);

# For every document, collect one plain dict per detected section title.
df['matches'] = df.doc.apply(lambda d: [dict(
    lexical_variant=s.text,
    label=s.label_,
    start=s.start_char,
    end=s.end_char
) for s in d._.section_titles])

# Reshape to one row per match. The index is repeated for notes that have
# several matches, and the `doc` column is dropped at this point.
df = df[['note_text', 'note_id', 'matches']].explode('matches')

# Drop rows with a missing value - in particular notes without any section
# title, whose empty match list explodes to a single NaN row.
df = df.dropna()

# Expand the match dicts into real columns. Building the frame in one go from
# the list of dicts avoids `Series.apply(pd.Series)`, which constructs one
# Series per row and gets very slow on large corpora. Reusing `df.index` keeps
# the (possibly duplicated, post-explode) index identical to `df`'s, and naming
# the columns explicitly keeps them defined even when nothing matched.
match_columns = ['lexical_variant', 'label', 'start', 'end']
df[match_columns] = pd.DataFrame(
    df.matches.tolist(),
    index=df.index,
    columns=match_columns,
)

df = df.drop('matches', axis=1)

# Bare expression: preview displayed when it ends a notebook cell.
df.head(20)

# NOTE(review): the column names below are presumably the schema expected by
# the labelling tool that consumes `df` afterwards - confirm against labeltool.
df = df.rename(columns={'start': 'offset_begin', 'end': 'offset_end', 'label': 'label_value'})

df['label_name'] = df.label_value

df['modifier_type'] = ''
df['modifier_result'] = ''

from ipywidgets import Output, Button, VBox, Layout, Text, HTML
from IPython.display import display
from labeltool.labelling import GlobalLabels, Labels, Labelling

# Output widget handed to the labelling tool and displayed at the end.
out = Output()

# Register one label per distinct `label_value` in `df`; every label gets the
# same colour and a button-style selector.
labels = Labels()

for label in df.label_value.unique():
    labels.add(name = label,
               color = 'green',
               selection_type = 'button')

# NOTE(review): `from_save=True` together with `save_path` presumably resumes a
# previous session from 'testing.pickle' - confirm against labeltool. If so,
# only load pickle files you created yourself: unpickling untrusted data can
# execute arbitrary code.
labeller = Labelling(
    df,
    save_path='testing.pickle',
    labels_dict=labels.dict,
    from_save=True,
    out=out,
    display=display,
)

# Start the tool, then show the output widget (bare expression ending the cell).
labeller.run()
out