a b/tests/data/test_conll.py
1
from itertools import islice
2
from pathlib import Path
3
4
import pytest
5
from typing_extensions import Literal
6
7
import edsnlp
8
9
10
@pytest.mark.parametrize("num_cpu_workers", [0, 2])
@pytest.mark.parametrize("shuffle", ["dataset"])
def test_read_shuffle_loop(
    num_cpu_workers: int,
    shuffle: Literal["dataset", "fragment"],
):
    """Check reading a CoNLL-U file with shuffling and looping enabled.

    The sample file is read through ``edsnlp.data.read_conll`` with a fixed
    seed and ``loop=True``, the first six items of the resulting stream are
    collected, and the attributes of one token of the first item are compared
    (as strings) with the values of the reference row quoted in the body.

    Parameters
    ----------
    num_cpu_workers : int
        Forwarded to ``set_processing``; parametrized over 0 and 2.
    shuffle : Literal["dataset", "fragment"]
        Forwarded to ``read_conll``; only "dataset" is parametrized here.
    """
    # The sample lives in the sibling "training" directory, two levels above
    # this test module.
    tests_root = Path(__file__).parent.parent.resolve()
    conll_path = tests_root / "training" / "rhapsodie_sample.conllu"

    stream = edsnlp.data.read_conll(
        conll_path,
        shuffle=shuffle,
        seed=42,
        loop=True,
    )
    stream = stream.set_processing(num_cpu_workers=num_cpu_workers)

    # Only the first six items are consumed from the stream.
    docs = list(islice(stream, 6))
    assert len(docs) == 6

    # Reference row (ID column 32, hence 0-based index 31 below):
    # 32    ce  ce  PRON    _   Gender=Masc|Number=Sing|Person=3|PronType=Dem   30  obl:arg _   _  # noqa: E501
    expected = {
        "text": "ce",
        "lemma_": "ce",
        "pos_": "PRON",
        "dep_": "obl:arg",
        "morph": "Gender=Masc|Number=Sing|Person=3|PronType=Dem",
        "head": "profité",
    }
    token = docs[0][31]
    for name, want in expected.items():
        # Attributes are compared through str() so non-string values can be
        # matched against plain strings.
        assert str(getattr(token, name)) == want