[cad161]: / tests / data / test_spark.py

Download this file

30 lines (24 with data), 877 Bytes

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import edsnlp
def test_read_write(blank_nlp, text, df_notes_pyspark):
    """Round-trip notes through the Spark reader and writer of ``edsnlp.data``.

    The notes are read from ``df_notes_pyspark`` with the ``"omop"``
    converter, run through a matcher + negation pipeline, and written back
    to a Spark DataFrame whose rows are then inspected.

    Parameters
    ----------
    blank_nlp
        Pipeline fixture (defined outside this file). Its ``lang`` attribute
        selects the shuffle mode, and the ``eds.matcher`` and ``eds.negation``
        pipes are added to it by this test.
    text
        Fixture holding the text expected for the first document that is
        read back.
    df_notes_pyspark
        Fixture passed to ``edsnlp.data.from_spark`` as the input notes
        (presumably a PySpark DataFrame -- confirm against the fixture).
    """
    # line below is just to mix params to avoid running too many tests
    shuffle = "dataset" if blank_nlp.lang == "eds" else False

    reader = edsnlp.data.from_spark(
        df_notes_pyspark,
        converter="omop",
        nlp=blank_nlp,
        shuffle=shuffle,
    ).set_processing(backend="simple")

    # Reading: the first document produced by the reader carries the
    # expected note text.
    doc = list(reader)[0]
    assert doc.text == text

    # Processing: match the term "douleurs" under the "douleur" label and
    # qualify the resulting entities with the negation pipe.
    blank_nlp.add_pipe("eds.matcher", config={"terms": {"douleur": ["douleurs"]}})
    blank_nlp.add_pipe("eds.negation")
    docs = blank_nlp.pipe(reader)

    # Writing: export the entities (with their "negation" attribute) back
    # to Spark using the same "omop" converter.
    writer = edsnlp.data.to_spark(
        docs,
        converter="omop",
        span_attributes=["negation"],
        span_getter=["ents"],
    )

    # Pull the result to the driver as a list of plain dicts so it can be
    # checked without Spark-specific assertions.
    res = writer.toPandas().to_dict(orient="records")
    assert len(res) == 20
    assert sum(len(r["entities"]) for r in res) == 20