--- a +++ b/tests/data/test_spark.py @@ -0,0 +1,29 @@ +import edsnlp + + +def test_read_write(blank_nlp, text, df_notes_pyspark): + # line below is just to mix params to avoid running too many tests + shuffle = "dataset" if blank_nlp.lang == "eds" else False + + reader = edsnlp.data.from_spark( + df_notes_pyspark, + converter="omop", + nlp=blank_nlp, + shuffle=shuffle, + ).set_processing(backend="simple") + doc = list(reader)[0] + assert doc.text == text + + blank_nlp.add_pipe("eds.matcher", config={"terms": {"douleur": ["douleurs"]}}) + blank_nlp.add_pipe("eds.negation") + docs = blank_nlp.pipe(reader) + + writer = edsnlp.data.to_spark( + docs, + converter="omop", + span_attributes=["negation"], + span_getter=["ents"], + ) + res = writer.toPandas().to_dict(orient="records") + assert len(res) == 20 + assert sum(len(r["entities"]) for r in res) == 20