|
a |
|
b/tests/data/test_conll.py |
|
|
1 |
from itertools import islice |
|
|
2 |
from pathlib import Path |
|
|
3 |
|
|
|
4 |
import pytest |
|
|
5 |
from typing_extensions import Literal |
|
|
6 |
|
|
|
7 |
import edsnlp |
|
|
8 |
|
|
|
9 |
|
|
|
10 |
@pytest.mark.parametrize("num_cpu_workers", [0, 2])
@pytest.mark.parametrize("shuffle", ["dataset"])
def test_read_shuffle_loop(
    num_cpu_workers: int,
    shuffle: Literal["dataset", "fragment"],
):
    """Read the sample CoNLL-U file with shuffling and looping enabled.

    The reader is configured with ``shuffle``, a fixed ``seed=42`` and
    ``loop=True``, and processing is set up with ``num_cpu_workers``
    workers. Six documents are drawn from the resulting stream, then the
    token at index 31 of the first document is compared against the
    attributes expected from the corresponding CoNLL-U row.

    Parameters
    ----------
    num_cpu_workers : int
        Value forwarded to ``set_processing(num_cpu_workers=...)``.
    shuffle : Literal["dataset", "fragment"]
        Value forwarded to ``read_conll(shuffle=...)``; only ``"dataset"``
        is currently parametrized.
    """
    # The sample lives in the sibling "training" directory of the tests root.
    tests_root = Path(__file__).parent.parent.resolve()
    conll_path = tests_root / "training" / "rhapsodie_sample.conllu"

    reader = edsnlp.data.read_conll(conll_path, shuffle=shuffle, seed=42, loop=True)
    stream = reader.set_processing(num_cpu_workers=num_cpu_workers)

    # Take exactly six items from the stream instead of consuming it whole
    # (with loop=True it is presumably unbounded — confirm against the reader).
    docs = list(islice(stream, 6))
    assert len(docs) == 6

    # Reference CoNLL-U row (token id 32, hence index 31 below):
    # 32 ce ce PRON _ Gender=Masc|Number=Sing|Person=3|PronType=Dem 30 obl:arg _ _ # noqa: E501
    expected_attrs = {
        "text": "ce",
        "lemma_": "ce",
        "pos_": "PRON",
        "dep_": "obl:arg",
        "morph": "Gender=Masc|Number=Sing|Person=3|PronType=Dem",
        "head": "profité",
    }
    token = docs[0][31]
    # Attributes are compared through str() so that non-string values
    # (e.g. "morph", "head") are checked by their textual form.
    for attr_name, expected in expected_attrs.items():
        assert str(getattr(token, attr_name)) == expected