[cad161]: / notebooks / sections / section-dataset.md

Download this file

159 lines (122 with data), 2.6 kB


---
jupyter:
  jupytext:
    formats: ipynb,md
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.3'
      jupytext_version: 1.11.4
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---


%reload_ext autoreload
%autoreload 2
import pandas as pd
import os
import context
from edsnlp.utils.brat import BratConnector

Sections dataset

Réutilisation du travail réalisé par Ivan Lerner à l'EDS.

# Load the BRAT corpus: `get_brat()` returns the texts and an annotations
# frame that exposes (at least) a `lexical_variant` column.
data_dir = '../../data/section_dataset/'
brat = BratConnector(data_dir)
texts, annotations = brat.get_brat()

# Export the distinct lexical variants with an empty `section` column
# (presumably a template to be labelled by hand -- confirm).
df = annotations[['lexical_variant']].drop_duplicates()
df['section'] = ''
df.to_csv('sections.tsv', sep='\t', index=False)

# Round-trip the template from TSV to CSV.
annotated = pd.read_csv('sections.tsv', sep='\t')
annotated.to_csv('annotated_sections.csv', index=False)

# Load the 'Annotation' sheet of the spreadsheet and normalise its headers.
annotated = pd.read_excel('sections.xlsx', sheet_name='Annotation', engine='openpyxl')
annotated.columns = ['lexical_variant', 'section', 'keep', 'comment']

# A blank `keep` cell counts as 'Oui'; anything other than 'Oui' drops the row.
annotated['keep'] = annotated['keep'].fillna('Oui') == 'Oui'

# Take an explicit copy of the filtered selection: columns are reassigned
# below, and writing to a derived selection can raise SettingWithCopyWarning.
annotated = annotated.query('keep')[['lexical_variant', 'section']].copy()

# Notebook output: number of corpus annotations per section.
annotated.merge(annotations, on='lexical_variant').section.value_counts()

# Normalise the variants: lower-case, then strip accents.
annotated['lexical_variant'] = annotated['lexical_variant'].str.lower()
annotated_unnaccented = annotated.copy()
from unidecode import unidecode
annotated_unnaccented['lexical_variant'] = annotated_unnaccented['lexical_variant'].apply(unidecode)

# Only the unaccented variants are kept; the accented originals are discarded.
annotated = annotated_unnaccented

# Deduplicate once (normalisation can merge variants), then sort for a
# stable, readable listing. Sorting cannot introduce new duplicates.
annotated = annotated.drop_duplicates()
annotated = annotated.sort_values(['lexical_variant', 'section'])

# Notebook output: the final (lexical_variant, section) table.
annotated
# Collect the lexical variants of every section in a single grouping pass
# instead of re-filtering the whole frame once per section. `sort=False`
# keeps the sections in order of first appearance, and rows keep their
# original order inside each group. Keys have spaces replaced by underscores.
sections = {
    section.replace(' ', '_'): list(variants)
    for section, variants in annotated.groupby('section', sort=False)['lexical_variant']
}

# Print one `identifier = [variants]` line per section, followed by a blank
# line. Keys are already underscored, so only the accents are stripped here.
for k, v in sections.items():
    print(unidecode(k), '=', v)
    print()
# Map every raw section label to an identifier: spaces are replaced by
# underscores, then the result is transliterated to ASCII with unidecode.
sections = dict(
    (label, unidecode(label.replace(' ', '_')))
    for label in annotated.section.unique()
)

# Print one `'label': identifier,` line per section (the label is shown
# with repr, the identifier bare).
for label, identifier in sections.items():
    print(f"{label!r}: {identifier},")