Diff of /scripts/adicap.py [000000] .. [cad161]

Switch to unified view

a b/scripts/adicap.py
1
"""
2
Process ADICAP codes
3
Thésaurus de la codification ADICAP - Index raisonné des lésions
4
source : https://smt.esante.gouv.fr/terminologie-adicap/
5
6
"""
7
8
import gzip
9
import json
10
import re
11
from pathlib import Path
12
13
import pandas as pd
14
import typer
15
16
17
def parse_each_dict(df, dictionaryCode: str):
    """
    Build the decoding table of a single ADICAP dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw ADICAP table. The columns ``dictionaryCode``, ``code``,
        ``label`` and ``anatomyCode`` are read; ``df`` is not modified.
    dictionaryCode : str
        Identifier of the dictionary to extract (e.g. ``"D1"``).

    Returns
    -------
    dict
        ``{"label": <label of the dictionary>, "codes": {<key>: <label>}}``
        where every key is ``str(anatomyCode) + str(code)`` and missing
        cells count as empty strings.

    Raises
    ------
    KeyError
        If no selected row produces a key equal to ``dictionaryCode``
        (the row expected to carry the label of the dictionary itself).
    """
    # A boolean mask avoids building a query expression out of the argument,
    # and the non-inplace ``fillna`` returns a fresh frame instead of
    # mutating a selection derived from the caller's frame.
    selected = df.loc[
        df["dictionaryCode"] == dictionaryCode,
        ["code", "label", "anatomyCode"],
    ].fillna("")

    # In dictionary "D5", entries whose code starts with four digits are
    # left out of the table.
    skip_four_digit_codes = dictionaryCode == "D5"
    four_digits = re.compile(r"[0-9]{4}")

    decode_d_spec = {}
    for code, label, anatomy_code in selected.itertuples(index=False, name=None):
        # Cells may come back from Excel as numbers: normalise to text before
        # both the pattern test and the key construction.
        code = str(code)
        if skip_four_digit_codes and four_digits.match(code) is not None:
            continue
        decode_d_spec[str(anatomy_code) + code] = label

    # The entry keyed by the dictionary code itself holds the dictionary
    # label; it is taken out of the code table.
    try:
        d_value = decode_d_spec.pop(dictionaryCode)
    except KeyError:
        raise KeyError(
            f"No label entry found for dictionary {dictionaryCode!r}"
        ) from None

    return dict(label=d_value, codes=decode_d_spec)
33
34
35
def get_decode_dict(df, dict_keys=("D1", "D2", "D3", "D4", "D5", "D6", "D7")):
    """
    Build the decoding tables of several ADICAP dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw ADICAP table, passed unchanged to ``parse_each_dict``.
    dict_keys : iterable of str
        Dictionary identifiers to process. The default is an immutable tuple
        so that it cannot be altered between calls.

    Returns
    -------
    dict
        Maps each identifier of ``dict_keys`` to the result of
        ``parse_each_dict`` for that identifier.
    """
    return {key: parse_each_dict(df, dictionaryCode=key) for key in dict_keys}
42
43
44
def run(
    raw: Path = typer.Argument(..., help="Path to the raw file"),
    output: Path = typer.Option(
        "edsnlp/resources/adicap.json.gz",
        help="Path to the output gzip-compressed JSON file.",
    ),
) -> None:
    """
    Convenience script to automatically process the ADICAP codes
    into a processable file.

    Reads the "rawdatas" sheet of the raw Excel file and saves the decoding
    tables as gzip-compressed JSON.
    """
    # The first row of the "rawdatas" sheet provides the column names.
    df = pd.read_excel(
        raw,
        sheet_name="rawdatas",
        header=0,
    )

    decode_dict = get_decode_dict(df)

    typer.echo(f"Saving to {output}")

    # Explicit binary mode: the JSON text is encoded to UTF-8 before writing.
    with gzip.open(output, "wb") as f:
        f.write(json.dumps(decode_dict).encode("utf-8"))

    typer.echo("Done !")
69
70
71
if __name__ == "__main__":
    # Executed as a script: let typer expose ``run`` as the command line.
    typer.run(run)