# 114 lines (98 with data), 4.1 kB
#@meta {desc: 'linguistic parsing configuration', date: '2024-02-10'}
## spaCy token normalizers
#
# base NLP
# token normalizer referenced by `mednlp_biomed_doc_parser:token_normalizer`
[map_filter_token_normalizer]
class_name = zensols.nlp.MapTokenNormalizer
# add the split token mapper to have consistent token symmetry between the
# vanilla langres and mednlp_langres parsers as the MedCAT parser splits
# entities; see section `mednlp_map_filter_token_normalizer`
# (the `filter_token_mapper` and `split_ent_token_mapper` sections are not
# defined in this file)
mapper_class_list = list: filter_token_mapper, split_ent_token_mapper
# medical nlp
# token normalizer referenced by `mednlp_medcat_doc_parser:token_normalizer`;
# unlike `map_filter_token_normalizer` it does not list the split entity mapper
[mednlp_map_filter_token_normalizer]
class_name = zensols.nlp.MapTokenNormalizer
mapper_class_list = list: filter_token_mapper
# NOTE(review): presumably disabled because the MedCAT parser splits entities
# into separate tokens -- confirm against zensols.nlp.MapTokenNormalizer
embed_entities = False
## MedCat resources
#
# MedCAT resource shared by `mednlp_library` and `mednlp_medcat_doc_parser`;
# the sections referenced with `instance:` below are not defined in this file
[medcat_resource]
class_name = zensols.mednlp.MedCatResource
installer = instance: medcat_installer
vocab_resource = instance: medcat_vocab_resource
cdb_resource = instance: medcat_cdb_resource
mc_status_resource = instance: medcat_status_resource
umls_tuis = instance: medcat_umls_tuis
# NOTE(review): `medcal_uml_groups` looks like a misspelling of
# `medcat_umls_groups`; it is left as is because it must match the name of the
# section that defines the resource -- confirm before renaming
umls_groups = instance: medcal_uml_groups
requirements_dir = resource(zensols.mednlp): resources/requirements
# MedCAT configuration override: use the same spaCy model as the ScispaCy
# parser; continuation lines must be indented to stay part of this value
cat_config = dict:
  {'general':
    {'spacy_model': '${mednlp_biomed_doc_parser:model_name}'}}
# medical library that is given the MedCAT resource configured in
# `medcat_resource`
[mednlp_library]
class_name = zensols.mednlp.MedicalLibrary
medcat_resource = instance: medcat_resource
# entity_linker_resource is optionally added in entlink.conf
## Base parser
#
# nlp parser override; this section is the delegate of every combined parser
# in this file and the source of `lang`/`model_name` for the medical parsers
[doc_parser]
# midsize model for standard NER
# (`lang` is not set here; it is expected to come from the section being
# overridden -- confirm)
model_name = ${lang}_core_web_md
# install any missing spaCy models at runtime
auto_install_model = True
## Biomedical parsers
#
# ScispaCy parser; used as the source parser of
# `mednlp_combine_biomed_doc_parser` and as the spaCy model name given to
# MedCAT in `medcat_resource:cat_config`
[mednlp_biomed_doc_parser]
class_name = zensols.nlp.sparser.SpacyFeatureDocumentParser
# same language as the base parser
lang = ${doc_parser:lang}
# alternative models, kept for reference:
# not very useful without using an entity recognizer model
# model_name = en_ner_bionlp13cg_md
# model_name = en_ner_jnlpba_md
model_name = ${lang}_core_sci_md
token_normalizer = instance: map_filter_token_normalizer
# keep every feature ID declared by FeatureToken; the continuation line must
# be indented to remain part of this value
token_feature_ids = eval({'import': ['zensols.nlp as n']}):
  n.FeatureToken.FEATURE_IDS
# install any missing spaCy models at runtime
auto_install_model = True
# MedCAT features (CUIs, TUIs etc.); used as the source parser of
# `mednlp_combine_medcat_doc_parser`
[mednlp_medcat_doc_parser]
class_name = zensols.mednlp.MedCatFeatureDocumentParser
# same language and spaCy model as the base parser
lang = ${doc_parser:lang}
model_name = ${doc_parser:model_name}
token_normalizer = instance: mednlp_map_filter_token_normalizer
medcat_resource = instance: medcat_resource
# keep all features: the union of the FeatureToken and MedicalFeatureToken
# feature IDs (override in your own configuration to keep only a subset); the
# continuation line must be indented to remain part of this value
token_feature_ids = eval({'import': ['zensols.nlp as n', 'zensols.mednlp as m']}):
  (n.FeatureToken.FEATURE_IDS | m.MedicalFeatureToken.FEATURE_IDS)
## Combined parsers
#
# adds biomedical ScispaCy features (ent_) to the delegate (doc_parser)
[mednlp_combine_biomed_doc_parser]
class_name = zensols.nlp.combine.MappingCombinerFeatureDocumentParser
# the base parser whose output receives the features
delegate = instance: doc_parser
# the parser(s) the features are taken from
source_parsers = instance: list: mednlp_biomed_doc_parser
# add the FeatureToken attribute regardless
# (left commented out so the class default applies)
#overwrite_nones = False
# only entities are missing from the medical parser output
yield_features = set: ent_, ent, ent_iob, ent_iob_
# only map token level instead of sentence
merge_sentences = False
# same token features as the ScispaCy source parser
token_feature_ids = ${mednlp_biomed_doc_parser:token_feature_ids}
# adds MedCAT features (CUIs, TUIs etc.) to the delegate (doc_parser)
[mednlp_combine_medcat_doc_parser]
class_name = zensols.nlp.combine.MappingCombinerFeatureDocumentParser
# the base parser whose output receives the features
delegate = instance: doc_parser
# the parser(s) the features are taken from
source_parsers = instance: list: mednlp_medcat_doc_parser
# add the FeatureToken attribute regardless
overwrite_nones = True
# limit the overwritten features to the medical feature IDs declared by
# MedicalFeatureToken; the continuation line must be indented to remain part
# of this value
overwrite_features = eval({'import': ['zensols.mednlp as m']}):
  m.MedicalFeatureToken.FEATURE_IDS
# only map token level instead of sentence
merge_sentences = False
# same token features as the MedCAT source parser
token_feature_ids = ${mednlp_medcat_doc_parser:token_feature_ids}
# adds both biomedical ScispaCy and MedCAT features
[mednlp_combine_biomed_medcat_doc_parser]
class_name = zensols.nlp.combine.MappingCombinerFeatureDocumentParser
# the base parser whose output receives the features
delegate = instance: doc_parser
# uses the two combiner sections (not the raw ScispaCy/MedCAT parsers) as
# sources
source_parsers = instance: list: mednlp_combine_biomed_doc_parser, mednlp_combine_medcat_doc_parser