Downloads: 1

[cad161]: / notebooks / sections / section-dataset.md

159 lines (122 with data), 2.6 kB

---
jupyter:
  jupytext:
    formats: ipynb,md
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.3'
      jupytext_version: 1.11.4
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---

%reload_ext autoreload
%autoreload 2

import pandas as pd

import os

import context

from edsnlp.utils.brat import BratConnector

# Sections dataset

Réutilisation du travail réalisé par Ivan Lerner à l'EDS.

# Relative path to the BRAT-formatted section dataset.
data_dir = '../../data/section_dataset/'

# Read the dataset through the BRAT connector; `get_brat` returns a pair that
# is unpacked here into the texts and their annotations.
brat = BratConnector(data_dir)
texts, annotations = brat.get_brat()

# One row per distinct lexical variant, plus an empty `section` column.
# NOTE(review): the empty column is presumably filled in by hand afterwards -
# confirm with the annotation workflow.
df = (
    annotations[['lexical_variant']]
    .drop_duplicates()
    .assign(section='')
)
df.to_csv('sections.tsv', sep='\t', index=False)

# Read the tab-separated file back and write it out again as a
# comma-separated file.
annotated = pd.read_csv('sections.tsv', sep='\t')
annotated.to_csv('annotated_sections.csv', index=False)

# Load the 'Annotation' sheet of the workbook and give its four columns
# fixed names.
annotated = pd.read_excel(
    'sections.xlsx',
    sheet_name='Annotation',
    engine='openpyxl',
)
annotated.columns = ['lexical_variant', 'section', 'keep', 'comment']

# Turn `keep` into a boolean flag: a missing value is treated as 'Oui', and
# only 'Oui' maps to True.
annotated['keep'] = annotated['keep'].fillna('Oui').eq('Oui')

# Retain the flagged rows only, restricted to the two columns used afterwards.
annotated = annotated[annotated['keep']][['lexical_variant', 'section']]

# Number of annotations per section once the kept variants are joined with
# the `annotations` table on `lexical_variant`.
annotated.merge(annotations, on='lexical_variant').section.value_counts()

from unidecode import unidecode

# Normalise the lexical variants: lower-case them, then transliterate them to
# ASCII with `unidecode`. `assign` builds a new frame instead of setting a
# column by attribute on `annotated`, which at this point is the result of a
# row/column selection and could otherwise trigger pandas' chained-assignment
# warning.
annotated_unnaccented = annotated.assign(
    lexical_variant=annotated.lexical_variant.str.lower().apply(unidecode)
)

# Only the unaccented variants are kept. Identical rows are collapsed once
# (sorting afterwards cannot reintroduce duplicates), and the rows are ordered
# by variant then section.
annotated = (
    annotated_unnaccented
    .drop_duplicates()
    .sort_values(['lexical_variant', 'section'])
)

annotated

# Map each section name (spaces replaced by underscores) to the list of
# lexical variants annotated with that section.
sections = {
    name.replace(' ', '_'): annotated.loc[
        annotated.section == name, 'lexical_variant'
    ].tolist()
    for name in annotated.section.unique()
}

# Print one `identifier = [variants]` line per section, followed by a blank
# line. The keys already have their spaces replaced, so only the ASCII
# transliteration is applied here.
for key, variants in sections.items():
    print(unidecode(key), '=', variants)
    print()

# Map each original section name to its ASCII, underscore-separated form.
sections = {}
for label in annotated.section.unique():
    sections[label] = unidecode(label.replace(' ', '_'))

# Print one `'<section name>': <identifier>,` line per section.
for label, identifier in sections.items():
    print(f"{label!r}: {identifier},")