Diff of /resources/lang.conf [000000] .. [ca4dac]

Switch to side-by-side view

--- a
+++ b/resources/lang.conf
@@ -0,0 +1,113 @@
+#@meta {desc: 'linguistic parsing configuration', date: '2024-02-10'}
+
+
+## Spacy token normalizers
+#
+# base NLP
+[map_filter_token_normalizer]
+class_name = zensols.nlp.MapTokenNormalizer
+# add the split token mapper to have consistent token symmetry between the
+# vanilla langres and mednlp_langres parsers as the MedCAT parser splits
+# entities; see section `mednlp_map_filter_token_normalizer`
+mapper_class_list = list: filter_token_mapper, split_ent_token_mapper
+
+# medical nlp
+[mednlp_map_filter_token_normalizer]
+class_name = zensols.nlp.MapTokenNormalizer
+mapper_class_list = list: filter_token_mapper
+embed_entities = False
+
+
+## MedCat resources
+#
+[medcat_resource]
+class_name = zensols.mednlp.MedCatResource
+installer = instance: medcat_installer
+vocab_resource = instance: medcat_vocab_resource
+cdb_resource = instance: medcat_cdb_resource
+mc_status_resource = instance: medcat_status_resource
+umls_tuis = instance: medcat_umls_tuis
+umls_groups = instance: medcal_uml_groups
+requirements_dir = resource(zensols.mednlp): resources/requirements
+cat_config = dict:
+  {'general':
+    {'spacy_model': '${mednlp_biomed_doc_parser:model_name}'}}
+
+[mednlp_library]
+class_name = zensols.mednlp.MedicalLibrary
+medcat_resource = instance: medcat_resource
+# entity_linker_resource is optionally added in entlink.conf
+
+
+## Base parser
+#
+# nlp parser override
+[doc_parser]
+# midsize model for standard NER
+model_name = ${lang}_core_web_md
+# install any missing spaCy models at runtime
+auto_install_model = True
+
+
+## Biomedical parsers
+#
+# ScispaCy parser
+[mednlp_biomed_doc_parser]
+class_name = zensols.nlp.sparser.SpacyFeatureDocumentParser
+lang = ${doc_parser:lang}
+# not very useful without using an entity recognizer model
+# model_name = en_ner_bionlp13cg_md
+# model_name = en_ner_jnlpba_md
+model_name = ${lang}_core_sci_md
+token_normalizer = instance: map_filter_token_normalizer
+token_feature_ids = eval({'import': ['zensols.nlp as n']}):
+  n.FeatureToken.FEATURE_IDS
+# install any missing spaCy models at runtime
+auto_install_model = True
+
+# MedCAT features (CUIs, TUIs etc.)
+[mednlp_medcat_doc_parser]
+class_name = zensols.mednlp.MedCatFeatureDocumentParser
+lang = ${doc_parser:lang}
+model_name = ${doc_parser:model_name}
+token_normalizer = instance: mednlp_map_filter_token_normalizer
+medcat_resource = instance: medcat_resource
+# set all features (override in your own configuration if you want them all)
+token_feature_ids = eval({'import': ['zensols.nlp as n', 'zensols.mednlp as m']}):
+  (n.FeatureToken.FEATURE_IDS | m.MedicalFeatureToken.FEATURE_IDS)
+
+
+## Combined parsers
+#
+# adds biomedical ScispaCy features (ent_) to the delegate (doc_parser)
+[mednlp_combine_biomed_doc_parser]
+class_name = zensols.nlp.combine.MappingCombinerFeatureDocumentParser
+delegate = instance: doc_parser
+source_parsers = instance: list: mednlp_biomed_doc_parser
+# add the FeatureToken attribute regardless
+#overwrite_nones = False
+# only entities are missing from the medical parser output
+yield_features = set: ent_, ent, ent_iob, ent_iob_
+# only map token level instead of sentence
+merge_sentences = False
+token_feature_ids = ${mednlp_biomed_doc_parser:token_feature_ids}
+
+# adds MedCAT features (CUIs, TUIs etc.) to the delegate (doc_parser)
+[mednlp_combine_medcat_doc_parser]
+class_name = zensols.nlp.combine.MappingCombinerFeatureDocumentParser
+delegate = instance: doc_parser
+source_parsers = instance: list: mednlp_medcat_doc_parser
+# add the FeatureToken attribute regardless
+overwrite_nones = True
+# only entities are missing from the medical parser output
+overwrite_features = eval({'import': ['zensols.mednlp as m']}):
+  m.MedicalFeatureToken.FEATURE_IDS
+# only map token level instead of sentence
+merge_sentences = False
+token_feature_ids = ${mednlp_medcat_doc_parser:token_feature_ids}
+
+# adds both biomedical ScispaCy and MedCAT features
+[mednlp_combine_biomed_medcat_doc_parser]
+class_name = zensols.nlp.combine.MappingCombinerFeatureDocumentParser
+delegate = instance: doc_parser
+source_parsers = instance: list: mednlp_combine_biomed_doc_parser, mednlp_combine_medcat_doc_parser