Diff of /tests/data/test_polars.py [000000] .. [cad161]

Switch to unified view

a b/tests/data/test_polars.py
1
from itertools import islice
2
from pathlib import Path
3
4
import polars
5
import pytest
6
7
import edsnlp
8
9
10
def test_read_write(blank_nlp, text, df_notes_pandas):
    """Read notes from a polars frame, run a pipeline, and export to polars.

    The pandas notes fixture is converted to a polars DataFrame and read
    through ``edsnlp.data.from_polars`` with the "omop" converter. The first
    document's text must equal the ``text`` fixture. A matcher and a negation
    pipe are then added to ``blank_nlp``, the stream is mapped through it and
    exported back to polars; the export must hold 20 rows and 20 entities
    in total.
    """
    import polars

    notes_frame = polars.from_pandas(df_notes_pandas)
    stream = edsnlp.data.from_polars(
        notes_frame,
        converter="omop",
        nlp=blank_nlp,
    )
    stream = stream.set_processing(backend="simple")

    # Consume the stream once before any pipe is added to the pipeline.
    first_doc = list(stream)[0]
    assert first_doc.text == text

    blank_nlp.add_pipe("eds.matcher", config={"terms": {"douleur": ["douleurs"]}})
    blank_nlp.add_pipe("eds.negation")
    processed = stream.map_pipeline(blank_nlp)

    exported: polars.DataFrame = processed.to_polars(
        converter="omop",
        span_attributes=["negation"],
        span_getter=["ents"],
    )
    rows = exported.to_dicts()
    assert len(rows) == 20

    entity_total = sum(len(row["entities"]) for row in rows)
    assert entity_total == 20
34
35
36
@pytest.mark.parametrize("num_cpu_workers", [0, 2])
def test_read_shuffle_loop(num_cpu_workers: int):
    """Check the order of a shuffled, looping polars reader with a fixed seed.

    The parquet resource is loaded with polars and read with
    ``shuffle="dataset"``, ``seed=42`` and ``loop=True``. Each record is
    mapped to its ``note_id`` and the first six ids are compared with a
    fixed expected sequence, for each parametrized ``num_cpu_workers`` value.
    """
    resources_dir = Path(__file__).parent.parent.resolve() / "resources"
    frame = polars.read_parquet(resources_dir / "docs.parquet")

    stream = edsnlp.data.from_polars(
        frame,
        shuffle="dataset",
        seed=42,
        loop=True,
    )
    stream = stream.map(lambda record: record["note_id"])
    stream = stream.set_processing(num_cpu_workers=num_cpu_workers)

    # loop=True is set, so only a bounded prefix of the stream is taken.
    first_ids = list(islice(stream, 6))

    expected_ids = [
        "subfolder/doc-1",
        "subfolder/doc-3",
        "subfolder/doc-2",
        "subfolder/doc-1",
        "subfolder/doc-2",
        "subfolder/doc-3",
    ]
    assert first_ids == expected_ids