# Page residue from a side-by-side diff export ("Switch to unified view").
# File: tests/connectors/test_omop.py
1
import random
2
import re
3
from random import choice, randint
4
from random import random as randomf
5
from string import ascii_letters, ascii_lowercase
6
7
import pandas as pd
8
import pytest
9
10
from edsnlp.connectors.omop import OmopConnector
11
12
# Fix the seed of the module-level RNG so that the randomly generated notes
# and entities below are identical from one run to the next.
random.seed(10)
13
14
15
def random_word():
    """Return a random word made of ASCII letters.

    The word starts with one letter of either case, followed by 1 to 20
    lower-case letters, so its length is always between 2 and 21.
    """
    n = randint(1, 20)
    return "".join(
        [choice(ascii_letters)] + [choice(ascii_lowercase) for _ in range(n)]
    )
20
21
22
def random_text():
    """Return a random text: 30 to 60 random words joined by single spaces."""
    n = randint(30, 60)
    return " ".join([random_word() for _ in range(n)])
25
26
27
def random_note_nlp(text):
    """Randomly pick words of ``text`` and describe them as entity records.

    Each regex word match in ``text`` is kept when a uniform draw exceeds
    0.8, i.e. roughly one word out of five.

    Parameters
    ----------
    text : str
        The text to draw entities from.

    Returns
    -------
    list of dict
        One dict per kept word, in order of appearance, with the keys
        ``start_char``, ``end_char``, ``lexical_variant`` (the matched
        word), ``note_nlp_source_value`` (a random lower-cased word) and
        ``negation`` (a random boolean).
    """
    ents = []

    for match in re.finditer(r"\w+", text):
        if randomf() > 0.8:
            # Offsets come straight from the match, so
            # text[start_char:end_char] == lexical_variant.
            ent = dict(
                start_char=match.start(),
                end_char=match.end(),
                lexical_variant=match.group(),
                note_nlp_source_value=random_word().lower(),
                negation=randomf() > 0.5,
            )
            ents.append(ent)

    return ents
43
44
45
@pytest.fixture
def note():
    """Build a ``note`` table holding 10 random documents.

    The returned DataFrame has three columns: ``note_text`` (a random text),
    ``note_id`` (0 to 9, in row order) and ``note_datetime`` (the constant
    string ``"2021-10-19"``).
    """
    df = pd.DataFrame(dict(note_text=[random_text() for _ in range(10)]))
    df["note_id"] = range(len(df))
    df["note_datetime"] = "2021-10-19"

    return df
53
54
55
@pytest.fixture
def note_nlp(note):
    """Build a ``note_nlp`` table of random entities drawn from ``note``.

    Entities are generated per note with ``random_note_nlp``, then the frame
    is exploded to one row per entity and the entity dicts are expanded into
    columns. The result keeps ``note_id``, the entity fields, and a running
    ``note_nlp_id``; ``note_text``, ``ents`` and ``note_datetime`` are
    dropped.
    """
    # Work on a copy so the `note` fixture is left untouched.
    df = note.copy()
    df["ents"] = df.note_text.apply(random_note_nlp)

    # One row per entity; rows from the same note share that note's index.
    df = df.explode("ents")

    # Turn each entity dict into columns, aligned on the (repeated) index.
    df = pd.concat([df, df.ents.apply(pd.Series)], axis=1)
    df = df.drop(columns=["note_text", "ents", "note_datetime"])
    df["note_nlp_id"] = range(len(df))

    return df
66
67
68
@pytest.fixture
def omop(blank_nlp) -> OmopConnector:
    """Return an ``OmopConnector`` built on ``blank_nlp`` plus a "negation" pipe.

    ``blank_nlp`` is a fixture defined outside this module; the pipe is added
    to it in place before the connector is created.
    """
    blank_nlp.add_pipe("negation")
    return OmopConnector(blank_nlp)
72
73
74
@pytest.fixture
def docs(omop: OmopConnector, note, note_nlp):
    """Convert the ``note`` / ``note_nlp`` tables to documents.

    Calls ``omop.omop2docs`` with ``extensions=["negation"]``.
    """
    return omop.omop2docs(note, note_nlp, extensions=["negation"])
77
78
79
def test_omop2docs(docs, note, note_nlp):
    """Check that documents built from the tables match the tables.

    Each document must carry the text of its note, and its entities must
    match the note's ``lexical_variant`` values in number, order and text.
    """
    # One list of lexical variants per note_id. The lists line up with the
    # rows of `note` only while every note has at least one entity.
    lexical_variants = note_nlp.groupby("note_id")["lexical_variant"].agg(list)

    for doc, text, lvs in zip(docs, note.note_text, lexical_variants):
        assert doc.text == text
        assert len(doc.ents) == len(lvs)

        for ent, lv in zip(doc.ents, lvs):
            assert ent.text == lv
89
90
91
def test_docs2omop(omop: OmopConnector, docs):
    """Check that tables exported from documents match the documents.

    The exported ``note_text`` must equal each document's text, and the
    exported ``lexical_variant`` values must match each document's entities
    in number, order and text.
    """
    note, note_nlp = omop.docs2omop(docs, extensions=["negation"])

    # One list of lexical variants per note_id, compared to docs in order.
    lexical_variants = note_nlp.groupby("note_id")["lexical_variant"].agg(list)

    for doc, text, lvs in zip(docs, note.note_text, lexical_variants):
        assert doc.text == text
        assert len(doc.ents) == len(lvs)

        for ent, lv in zip(doc.ents, lvs):
            assert ent.text == lv
102
103
104
def test_roundtrip(omop: OmopConnector, docs, note, note_nlp):
    """Check that tables -> documents -> tables gives back the same values."""
    note2, note_nlp2 = omop.docs2omop(docs, extensions=["negation"])

    # Compare only the columns present in the exported tables, in their
    # order; the double .all() reduces the cell-wise result to one boolean.
    assert (note2 == note[note2.columns]).all().all()
    assert (note_nlp2 == note_nlp[note_nlp2.columns]).all().all()