Diff of /tests/data/test_pandas.py [000000] .. [cad161]

Switch to unified view

a b/tests/data/test_pandas.py
1
from itertools import islice
2
from pathlib import Path
3
4
import pytest
5
6
import edsnlp
7
8
9
def test_read_write(blank_nlp, text, df_notes_pandas):
    """Read notes from a pandas frame, run a pipeline, and write back to pandas.

    The first document produced by the reader must carry ``text``. After
    adding the ``eds.matcher`` and ``eds.negation`` pipes, the exported
    records are expected to contain 20 rows and 20 entities in total.
    """
    stream = edsnlp.data.from_pandas(
        df_notes_pandas,
        converter="omop",
        nlp=blank_nlp,
    )
    stream = stream.set_processing(backend="simple")

    # Materialise the whole stream and inspect the first document only.
    first_doc = list(stream)[0]
    assert first_doc.text == text

    # Extend the blank pipeline: term matching, then negation on the matches.
    matcher_config = {"terms": {"douleur": ["douleurs"]}}
    blank_nlp.add_pipe("eds.matcher", config=matcher_config)
    blank_nlp.add_pipe("eds.negation")
    processed = stream.map_pipeline(blank_nlp)

    # Export with the same "omop" converter used for reading.
    exported = processed.to_pandas(
        converter="omop",
        span_attributes=["negation"],
        span_getter=["ents"],
    )
    records = exported.to_dict(orient="records")
    assert len(records) == 20

    entity_total = 0
    for record in records:
        entity_total += len(record["entities"])
    assert entity_total == 20
30
31
32
@pytest.mark.parametrize("num_cpu_workers", [0, 2])
def test_read_shuffle_loop(num_cpu_workers: int):
    """Check the note order of a looping, dataset-shuffled pandas reader.

    The reader is built with ``shuffle="dataset"``, ``seed=42`` and
    ``loop=True``; the first six note ids it yields are compared against a
    fixed expected order, for both 0 and 2 CPU workers.
    """
    import pandas as pd

    # The fixture file lives in the "resources" directory two levels above
    # this test module.
    resources_dir = Path(__file__).parent.parent.resolve() / "resources"
    frame = pd.read_parquet(resources_dir / "docs.parquet")

    stream = edsnlp.data.from_pandas(
        frame,
        shuffle="dataset",
        seed=42,
        loop=True,
    )
    stream = stream.map(lambda x: x["note_id"])
    stream = stream.set_processing(num_cpu_workers=num_cpu_workers)

    # loop=True is passed to the reader, so only the first six items are
    # taken from the stream.
    first_six = list(islice(stream, 6))

    expected_ids = [
        "subfolder/doc-3",
        "subfolder/doc-2",
        "subfolder/doc-1",
        "subfolder/doc-3",
        "subfolder/doc-1",
        "subfolder/doc-2",
    ]
    assert first_six == expected_ids