[cad161]: / tests / connectors / test_omop.py

Download this file

109 lines (71 with data), 2.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
import random
import re
from random import choice, randint
from random import random as randomf
from string import ascii_letters, ascii_lowercase
import pandas as pd
import pytest
from edsnlp.connectors.omop import OmopConnector
# Seed the module-level RNG so the randomly generated words, texts and
# entities below are the same on every test run.
random.seed(10)
def random_word():
    """Build a random alphabetic word.

    The word is one ASCII letter of either case followed by 1 to 20
    lowercase ASCII letters, so it is between 2 and 21 characters long.
    """
    tail_length = randint(1, 20)
    # The leading letter is drawn before the tail so the sequence of RNG
    # calls (and therefore the seeded output) stays the same.
    letters = [choice(ascii_letters)]
    letters.extend(choice(ascii_lowercase) for _ in range(tail_length))
    return "".join(letters)
def random_text():
    """Return a random text made of 30 to 60 random words.

    The words come from ``random_word`` and are separated by single spaces.
    """
    word_count = randint(30, 60)
    return " ".join(random_word() for _ in range(word_count))
def random_note_nlp(text):
    """Randomly pick words of ``text`` and describe them as entities.

    Each run of word characters in ``text`` is kept when a uniform random
    draw exceeds 0.8. A kept word becomes a dict holding its character
    offsets (``start_char``, ``end_char``), its surface form
    (``lexical_variant``), a random lowercase ``note_nlp_source_value`` and
    a random boolean ``negation``.

    Returns the list of entity dicts, in order of appearance in ``text``.
    """
    entities = []
    for word in re.finditer(r"\w+", text):
        # Skip the word unless the draw is above the 0.8 threshold.
        if randomf() <= 0.8:
            continue
        entities.append(
            {
                "start_char": word.start(),
                "end_char": word.end(),
                "lexical_variant": word.group(),
                "note_nlp_source_value": random_word().lower(),
                "negation": randomf() > 0.5,
            }
        )
    return entities
@pytest.fixture
def note():
    """Build a ``note`` table of 10 random texts.

    Columns: ``note_text`` (random text), ``note_id`` (0 to 9) and
    ``note_datetime`` (the constant string ``"2021-10-19"``).
    """
    texts = [random_text() for _ in range(10)]
    notes = pd.DataFrame({"note_text": texts})
    notes["note_id"] = range(len(notes))
    notes["note_datetime"] = "2021-10-19"
    return notes
@pytest.fixture
def note_nlp(note):
    """Build a ``note_nlp`` table of random entities for the ``note`` table.

    Entities are generated per note with ``random_note_nlp``, exploded to
    one row each, and their dict keys are expanded into columns. The
    note-level columns other than ``note_id`` are dropped, and a running
    ``note_nlp_id`` is added.
    """
    annotated = note.copy()
    annotated["ents"] = annotated.note_text.apply(random_note_nlp)
    # One row per entity dict; the note's index label is repeated.
    exploded = annotated.explode("ents")
    # Turn each entity dict into its own set of columns.
    entity_columns = exploded.ents.apply(pd.Series)
    table = pd.concat([exploded, entity_columns], axis=1)
    table = table.drop(columns=["note_text", "ents", "note_datetime"])
    table["note_nlp_id"] = range(len(table))
    return table
@pytest.fixture
def omop(blank_nlp) -> OmopConnector:
    """Return an ``OmopConnector`` wrapping a pipeline with negation.

    The ``"negation"`` pipe is added to ``blank_nlp`` (a fixture provided
    outside this file) before the connector is created.
    """
    blank_nlp.add_pipe("negation")
    connector = OmopConnector(blank_nlp)
    return connector
@pytest.fixture
def docs(omop: OmopConnector, note, note_nlp):
    """Convert the ``note`` and ``note_nlp`` tables into docs.

    The conversion goes through ``omop.omop2docs`` and requests the
    ``negation`` extension.
    """
    converted = omop.omop2docs(note, note_nlp, extensions=["negation"])
    return converted
def test_omop2docs(docs, note, note_nlp):
    """Check that docs built from the OMOP tables match those tables.

    Each doc must carry its note's text, and its entities must match, in
    order, the lexical variants recorded for that note in ``note_nlp``.
    """
    # One list of lexical variants per note, ordered by note_id.
    variants_by_note = note_nlp.groupby("note_id")["lexical_variant"].agg(list)
    for doc, expected_text, expected_variants in zip(
        docs, note.note_text, variants_by_note
    ):
        assert doc.text == expected_text
        entities = doc.ents
        assert len(entities) == len(expected_variants)
        for entity, variant in zip(entities, expected_variants):
            assert entity.text == variant
def test_docs2omop(omop: OmopConnector, docs):
    """Check that the OMOP tables produced from docs match those docs.

    The docs are converted with ``omop.docs2omop``. Each resulting note
    text must equal the doc's text, and the lexical variants grouped by
    ``note_id`` must match the doc's entities in order.
    """
    note, note_nlp = omop.docs2omop(docs, extensions=["negation"])
    # One list of lexical variants per note, ordered by note_id.
    variants_by_note = note_nlp.groupby("note_id")["lexical_variant"].agg(list)
    for doc, exported_text, exported_variants in zip(
        docs, note.note_text, variants_by_note
    ):
        assert doc.text == exported_text
        entities = doc.ents
        assert len(entities) == len(exported_variants)
        for entity, variant in zip(entities, exported_variants):
            assert entity.text == variant
def test_roundtrip(omop: OmopConnector, docs, note, note_nlp):
    """Check that tables -> docs -> tables gives back the original values.

    The comparison is restricted to the columns present in the tables
    returned by ``omop.docs2omop``.
    """
    note_back, note_nlp_back = omop.docs2omop(docs, extensions=["negation"])

    notes_equal = note_back == note[note_back.columns]
    assert notes_equal.all().all()

    entities_equal = note_nlp_back == note_nlp[note_nlp_back.columns]
    assert entities_equal.all().all()