|
a |
|
b/resources/lang.conf |
|
|
1 |
#@meta {desc: 'linguistic parsing configuration', date: '2024-02-10'} |
|
|
2 |
|
|
|
3 |
|
|
|
4 |
## spaCy token normalizers |
|
|
5 |
# |
|
|
6 |
# base NLP |
|
|
7 |
[map_filter_token_normalizer] |
|
|
8 |
class_name = zensols.nlp.MapTokenNormalizer |
|
|
9 |
# add the split token mapper to have consistent token symmetry between the |
|
|
10 |
# vanilla langres and mednlp_langres parsers as the MedCAT parser splits |
|
|
11 |
# entities; see section `mednlp_map_filter_token_normalizer` |
|
|
12 |
mapper_class_list = list: filter_token_mapper, split_ent_token_mapper |
|
|
13 |
|
|
|
14 |
# medical nlp |
|
|
15 |
[mednlp_map_filter_token_normalizer] |
|
|
16 |
class_name = zensols.nlp.MapTokenNormalizer |
|
|
17 |
mapper_class_list = list: filter_token_mapper |
|
|
18 |
embed_entities = False |
|
|
19 |
|
|
|
20 |
|
|
|
21 |
## MedCAT resources |
|
|
22 |
# |
|
|
23 |
[medcat_resource] |
|
|
24 |
class_name = zensols.mednlp.MedCatResource |
|
|
25 |
installer = instance: medcat_installer |
|
|
26 |
vocab_resource = instance: medcat_vocab_resource |
|
|
27 |
cdb_resource = instance: medcat_cdb_resource |
|
|
28 |
mc_status_resource = instance: medcat_status_resource |
|
|
29 |
umls_tuis = instance: medcat_umls_tuis |
|
|
30 |
umls_groups = instance: medcal_uml_groups |
|
|
31 |
requirements_dir = resource(zensols.mednlp): resources/requirements |
|
|
32 |
cat_config = dict: |
|
|
33 |
{'general': |
|
|
34 |
{'spacy_model': '${mednlp_biomed_doc_parser:model_name}'}} |
|
|
35 |
|
|
|
36 |
[mednlp_library] |
|
|
37 |
class_name = zensols.mednlp.MedicalLibrary |
|
|
38 |
medcat_resource = instance: medcat_resource |
|
|
39 |
# entity_linker_resource is optionally added in entlink.conf |
|
|
40 |
|
|
|
41 |
|
|
|
42 |
## Base parser |
|
|
43 |
# |
|
|
44 |
# nlp parser override |
|
|
45 |
[doc_parser] |
|
|
46 |
# midsize model for standard NER |
|
|
47 |
model_name = ${lang}_core_web_md |
|
|
48 |
# install any missing spaCy models at runtime |
|
|
49 |
auto_install_model = True |
|
|
50 |
|
|
|
51 |
|
|
|
52 |
## Biomedical parsers |
|
|
53 |
# |
|
|
54 |
# ScispaCy parser |
|
|
55 |
[mednlp_biomed_doc_parser] |
|
|
56 |
class_name = zensols.nlp.sparser.SpacyFeatureDocumentParser |
|
|
57 |
lang = ${doc_parser:lang} |
|
|
58 |
# not very useful without using an entity recognizer model |
|
|
59 |
# model_name = en_ner_bionlp13cg_md |
|
|
60 |
# model_name = en_ner_jnlpba_md |
|
|
61 |
model_name = ${lang}_core_sci_md |
|
|
62 |
token_normalizer = instance: map_filter_token_normalizer |
|
|
63 |
token_feature_ids = eval({'import': ['zensols.nlp as n']}): |
|
|
64 |
n.FeatureToken.FEATURE_IDS |
|
|
65 |
# install any missing spaCy models at runtime |
|
|
66 |
auto_install_model = True |
|
|
67 |
|
|
|
68 |
# MedCAT features (CUIs, TUIs etc.) |
|
|
69 |
[mednlp_medcat_doc_parser] |
|
|
70 |
class_name = zensols.mednlp.MedCatFeatureDocumentParser |
|
|
71 |
lang = ${doc_parser:lang} |
|
|
72 |
model_name = ${doc_parser:model_name} |
|
|
73 |
token_normalizer = instance: mednlp_map_filter_token_normalizer |
|
|
74 |
medcat_resource = instance: medcat_resource |
|
|
75 |
# set all features (override in your own configuration if you do not want them all) |
|
|
76 |
token_feature_ids = eval({'import': ['zensols.nlp as n', 'zensols.mednlp as m']}): |
|
|
77 |
(n.FeatureToken.FEATURE_IDS | m.MedicalFeatureToken.FEATURE_IDS) |
|
|
78 |
|
|
|
79 |
|
|
|
80 |
## Combined parsers |
|
|
81 |
# |
|
|
82 |
# adds biomedical ScispaCy features (ent_) to the delegate (doc_parser) |
|
|
83 |
[mednlp_combine_biomed_doc_parser] |
|
|
84 |
class_name = zensols.nlp.combine.MappingCombinerFeatureDocumentParser |
|
|
85 |
delegate = instance: doc_parser |
|
|
86 |
source_parsers = instance: list: mednlp_biomed_doc_parser |
|
|
87 |
# add the FeatureToken attribute regardless |
|
|
88 |
#overwrite_nones = False |
|
|
89 |
# only entities are missing from the medical parser output |
|
|
90 |
yield_features = set: ent_, ent, ent_iob, ent_iob_ |
|
|
91 |
# only map token level instead of sentence |
|
|
92 |
merge_sentences = False |
|
|
93 |
token_feature_ids = ${mednlp_biomed_doc_parser:token_feature_ids} |
|
|
94 |
|
|
|
95 |
# adds MedCAT features (CUIs, TUIs etc.) to the delegate (doc_parser) |
|
|
96 |
[mednlp_combine_medcat_doc_parser] |
|
|
97 |
class_name = zensols.nlp.combine.MappingCombinerFeatureDocumentParser |
|
|
98 |
delegate = instance: doc_parser |
|
|
99 |
source_parsers = instance: list: mednlp_medcat_doc_parser |
|
|
100 |
# add the FeatureToken attribute regardless |
|
|
101 |
overwrite_nones = True |
|
|
102 |
# the medical (MedCAT) token features to overwrite on the delegate's tokens |
|
|
103 |
overwrite_features = eval({'import': ['zensols.mednlp as m']}): |
|
|
104 |
m.MedicalFeatureToken.FEATURE_IDS |
|
|
105 |
# only map token level instead of sentence |
|
|
106 |
merge_sentences = False |
|
|
107 |
token_feature_ids = ${mednlp_medcat_doc_parser:token_feature_ids} |
|
|
108 |
|
|
|
109 |
# adds both biomedical ScispaCy and MedCAT features |
|
|
110 |
[mednlp_combine_biomed_medcat_doc_parser] |
|
|
111 |
class_name = zensols.nlp.combine.MappingCombinerFeatureDocumentParser |
|
|
112 |
delegate = instance: doc_parser |
|
|
113 |
source_parsers = instance: list: mednlp_combine_biomed_doc_parser, mednlp_combine_medcat_doc_parser |