|
a |
|
b/tests/data/test_pandas.py |
|
|
1 |
from itertools import islice |
|
|
2 |
from pathlib import Path |
|
|
3 |
|
|
|
4 |
import pytest |
|
|
5 |
|
|
|
6 |
import edsnlp |
|
|
7 |
|
|
|
8 |
|
|
|
9 |
def test_read_write(blank_nlp, text, df_notes_pandas):
    """Round-trip notes through the pandas reader and the pandas writer.

    The notes dataframe is read with the "omop" converter and the first
    produced document is compared with ``text``. An ``eds.matcher`` and an
    ``eds.negation`` pipe are then added to ``blank_nlp``, the pipeline is
    mapped over the reader, and the output is written back to pandas with
    the "omop" converter. The test expects 20 records holding 20 entities
    in total.
    """
    stream = edsnlp.data.from_pandas(
        df_notes_pandas,
        converter="omop",
        nlp=blank_nlp,
    )
    reader = stream.set_processing(backend="simple")

    # Materialize the whole reader, then look at the first document only.
    all_docs = list(reader)
    doc = all_docs[0]
    assert doc.text == text

    # The pipes are registered only after the first read above.
    matcher_config = {"terms": {"douleur": ["douleurs"]}}
    blank_nlp.add_pipe("eds.matcher", config=matcher_config)
    blank_nlp.add_pipe("eds.negation")
    docs = reader.map_pipeline(blank_nlp)

    writer = docs.to_pandas(
        converter="omop",
        span_attributes=["negation"],
        span_getter=["ents"],
    )
    records = writer.to_dict(orient="records")
    entity_total = sum(len(record["entities"]) for record in records)

    assert len(records) == 20
    assert entity_total == 20
|
|
30 |
|
|
|
31 |
|
|
|
32 |
@pytest.mark.parametrize("num_cpu_workers", [0, 2])
def test_read_shuffle_loop(num_cpu_workers: int):
    """Check the note order of a seeded, dataset-shuffled, looping reader.

    ``docs.parquet`` is loaded from the ``resources`` directory located two
    levels above this file, read with ``shuffle="dataset"``, ``seed=42`` and
    ``loop=True``, and mapped to each record's ``note_id``. The first six
    ids are compared with a fixed expected sequence, for each parametrized
    ``num_cpu_workers`` value.
    """
    import pandas as pd

    resources_dir = Path(__file__).parent.parent.resolve() / "resources"
    data = pd.read_parquet(resources_dir / "docs.parquet")

    stream = edsnlp.data.from_pandas(
        data,
        shuffle="dataset",
        seed=42,
        loop=True,
    )
    stream = stream.map(lambda x: x["note_id"])
    stream = stream.set_processing(num_cpu_workers=num_cpu_workers)

    # The stream loops, so only a bounded prefix of it is consumed.
    notes = list(islice(stream, 6))

    expected = [
        "subfolder/doc-3",
        "subfolder/doc-2",
        "subfolder/doc-1",
        "subfolder/doc-3",
        "subfolder/doc-1",
        "subfolder/doc-2",
    ]
    assert notes == expected