326 lines (326 with data), 19.7 kB
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"source": [
"# Query Extraction\n",
"\n",
"This notebook walks the user through a query extraction test. Using the `QueryExtraction` module of EHRKit, we can search for specific queries in medical text. This example loads data from the MIMIC-III dataset and uses several different keyword-extraction methods to search for a couple of given queries.\n",
"\n",
"The user must set the root directory for the EHRKit, and optionally the data directories."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"ROOT_EHR_DIR = '/home/lily/br384/EHRKit/' # set your root EHRKit directory here (with the '/' at the end)\n",
"import sys\n",
"import os\n",
"sys.path.append(os.path.dirname(ROOT_EHR_DIR))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Set your mimic path here\n",
"# Put all of the individual MIMIC csv files in MIMIC_PATH (the path must end with `/`). The file names should be in all caps, such as NOTEEVENTS.csv. Keep OUTPUT_DATA_PATH empty; the processed data will be deposited there.\n",
"OUTPUT_DATA_PATH = ROOT_EHR_DIR + 'data/output_data/'\n",
"MIMIC_PATH = ROOT_EHR_DIR + 'data/mimic_data/'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"In /home/lily/br384/anaconda3/envs/EHRKit/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
"The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
"In /home/lily/br384/anaconda3/envs/EHRKit/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
"The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
"In /home/lily/br384/anaconda3/envs/EHRKit/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.\n",
"In /home/lily/br384/anaconda3/envs/EHRKit/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
"The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
"In /home/lily/br384/anaconda3/envs/EHRKit/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
"The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
"In /home/lily/br384/anaconda3/envs/EHRKit/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
"The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
"In /home/lily/br384/anaconda3/envs/EHRKit/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
"The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
"In /home/lily/br384/anaconda3/envs/EHRKit/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
"The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n"
]
}
],
"source": [
"from mimic_icd9_coding.coding_pipeline import codingPipeline\n",
"from mimic_icd9_coding.utils.mimic_data_preparation import run_mimic_prep"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Run the mimic label preparation, examine the top n most common labels, and create a dataset of labels and text\n",
"# By default the dataset is saved to the output folder, but with return_df=True it can instead be kept in the notebook\n",
"run_mimic_prep(output_folder = OUTPUT_DATA_PATH, mimic_data_path= MIMIC_PATH)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"my_mimic_pipeline = codingPipeline(verbose=False, model=None, data_path = OUTPUT_DATA_PATH, run=False)\n",
"# Load the mimic data without running\n",
"\n"
]
},
{
"source": [
"## Let's test out the model on a specific note!"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Load the data into the pipeline. load_data() simply stores the data on the pipeline object; this is not done automatically because it uses more memory.\n",
"my_mimic_pipeline.load_data()\n",
"df = my_mimic_pipeline.data"
]
},
{
"source": [
"The following import relies on the EHRKit root directory being on `sys.path`; make sure `ROOT_EHR_DIR` (set at the top of this notebook) points to your EHRKit root, and change it accordingly.\n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from QueryExtraction.extraction import main_extraction"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Extraction:\n",
"SpaCy TextRank:\n",
"Captured 0\n",
"['mg tablet sig', 'discharge instructions', 'social history', 'history', 'sliding scale recommendations', 'medical history', 'discharge diagnosis', 'discharge', 'sliding scale', 'discharge condition', 'discharge disposition', 'family history', 'midodrine 2.5 mg tablet sig', '300 mg capsule sig', 'upper extremities', 'dr.[**last name', 'reduced sensation distal le', 'insulin srip', 'disp:*60 capsule', 'chronic kidney disease', 'following fluids', 'prior admissions', 'admission labs', 'chronic cough', '30 mg capsule', 'aggressive volume resuscitation', 'admission', 'baseline cr', 'long acting levemir', 'chronic renal insufficiency']\n",
"------------------------------\n",
"Gensim TextRank:\n",
"Captured 1\n",
"['discharge', 'discharged', 'diabetes', 'diabetic', 'disp', 'home', 'gastroparesis ckd', 'nausea vomiting', 'controlled', 'control', 'controls', 'date', 'medical', 'medications', 'power', 'extremities', 'vomit', 'insulin', 'sliding', 'admission', 'admissions', 'sugars', 'upper', 'daily', 'negative', 'nephropathy', 'patient', 'hypertension', 'hypertensive', 'history', 'status', 'sig', 'capsule', 'emesis', 'chronic', 'levemir', 'htn', 'glucose', 'hospitalizations', 'hospital', 'office', 'wretching', 'fluids', 'pump', 'remained', 'initially', 'setting', 'care', 'neck', 'regimen', 'blood cultures', 'given', 'edema', 'phos', 'retinopathy recently hospitalized', 'murmur', 'aggressive volume', 'heent', 'perrla', 'lad', 'cards', 'neuropathy', 'normal', 'normalized', 'metoprolol', 'units', 'unit', 'medicine', 'refills', 'allergies levaquin', 'alert', 'exam', 'iii', 'rbc', 'neuro', 'hematemesis', 'sodium', 'anion', 'awake', 'gen', 'fracture', 'pulm', 'culture', 'coffee', 'nad', 'reduced sensation', 'grounds', 'drug', 'stage', 'silhouette', 'managed', 'management', 'cxr', 'hgb', 'tenting', 'creat']\n",
"------------------------------\n",
"Rake:\n",
"01-Jun-21 18:08:05 - Initiated a keyword detector instance.\n",
"Captured 0\n",
"['[** doctor last name 515 **] sign extremities', '17 **] systolic ejection murmur social history', 'vi systolic ejection murmur heard best', 'poorly controlled type 1 diabetes mellitus w', 'please call dr .[** last name', '3 **] autonomic neuropathy [** date range', 'metoprolol tartrate 50 mg tablet sig', 'metoprolol tartrate 25 mg tablet sig', 'hypertension chronic renal insufficiency discharge condition', 'levaquin attending :[** first name3', 'gabapentin 300 mg capsule sig', 'type 1 diabetes mellitis w', '767 **] [** last name', 'sgot )- 16 alk phos', 'promethazine 25 mg tablet sig', 'citalopram 20 mg tablet sig', 'difficile negative brief hospital course', 'oxycodone 5 mg capsule sig', 'midodrine 5 mg tablet sig', '5 tablets po every four', '5 mg tablet sig', 'disp :* 270 tablet', 'received aggressive volume resuscitation', 'proteinuria l1 vertebral fracture', 'disp :* 60 capsule', '2195 **] chief complaint', 'md [** first name8', 'poorly controlled type', '5 tablet po q8h', 'capsule po every eight']\n",
"------------------------------\n",
"Rakun:\n",
"01-Jun-21 18:08:05 - Number of nodes reduced from 356 to 336\n",
"Captured 0\n",
"['insulin', 'insulin blood vomit', 'blood', 'discharge', 'initially', 'vomit', 'regimen', 'treated', 'insulin sliding scale', 'treated insulin units/hr', 'insulin units/hr', 'started insulin units/hr', 'regarding insulin units/hr', 'started', 'sliding', 'sliding scale', 'scale', 'sliding scale directed', 'please sliding scale', 'follow-up', 'negative', 'sliding scale recommendations', 'wretching', 'solution sliding scale', 'directed', 'primary', 'first', 'please', 'instructions', 'medications']\n",
"------------------------------\n",
"Yake:\n",
"01-Jun-21 18:08:05 - Load pretrained SentenceTransformer: distilbert-base-nli-mean-tokens\n",
"01-Jun-21 18:08:05 - Did not find folder distilbert-base-nli-mean-tokens\n",
"01-Jun-21 18:08:05 - Search model on server: http://sbert.net/models/distilbert-base-nli-mean-tokens.zip\n",
"01-Jun-21 18:08:05 - Load SentenceTransformer from folder: /home/lily/br384/.cache/torch/sentence_transformers/sbert.net_models_distilbert-base-nli-mean-tokens\n",
"01-Jun-21 18:08:07 - Use pytorch device: cuda\n",
"Captured 0\n",
"['tablet sig', 'sig', 'tablet', 'capsule', 'daily', 'capsule sig', 'solution sig', 'dka', 'blood', 'discharge', 'vomiting major surgical', 'medicine allergies', 'levaquin attending', 'home', 'invasive procedure', 'orthostatic hypotension', 'dyspnea on exertion', 'delayed release', 'major surgical', 'surgical or invasive', 'hospitalized for orthostatic', 'insulin', 'units', 'history', 'unit', 'delayed', 'release', 'units subcutaneous', 'htn', 'hours']\n",
"------------------------------\n",
"KeyBERT:\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "1c9606a30b5f4b8aa0c466ba693f15ce"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=38.0), HTML(value='')))",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "aa8400d0ef3c42d7a6d9e94395c62eb3"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Captured 0\n",
"['medicine allergies', '200s diabetic', 'evidence pneumonia', 'diabetes mellitis', 'neuropathy date', 'labs 2117', 'fracture 2117', 'medications citalopram', 'birth 2082', 'fax 85219', 'outpatient setting', 'ketoacidosis patient', 'vomiting hematocrit', 'radiology cxr', 'hsm doctor', 'lipase 22', '22am wbc', 'aspirin 325mg', 'hospitalizations 12', 'doctor 515', 'chronic renal', 'glucose 466', '118 urean', 'home insulin', 'major surgical', 'years gastroparesis', 'ckd retinopathy', '137 potassium', 'keotacidosis hematemesis', 'vomiting coffee']\n"
]
}
],
"source": [
"txt0 = df.iloc[0]['TEXT']\n",
"txt1 = df.iloc[1]['TEXT']\n",
"\n",
"word1 = ['heart', 'liver', 'stomach', 'hypertension']\n",
"\n",
"main_extraction(word1, txt0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Extraction:\n",
"SpaCy TextRank:\n",
"01-Jun-21 18:08:13 - Initiated a keyword detector instance.\n",
"Captured 0\n",
"['mg tablet sig', 'non-steroidal induced ulcer', 'non-steroidal induced gastritis', 'blood calcium-7.5', 'coffee ground emesis', 'blood wbc-17.9', 'blood alt-126', 'blood alt-113', 'blood pressure', 'grade ii esophageal varices', 'right ventricular chamber size', 'disp:*60 tablet', 'melena x2 days', 'abdominal pain', 'ventricular systolic function', 'folic acid 1 mg tablet sig', 'hcv cirrhosis', 'guaiac positive brown stool', 'medical history', 'moderate pulmonary artery systolic hypertension', 'discharge', 'social history', 'discharge disposition', 'tarry black stool', 'ef>75%', 'prophylactic medications', '40 mg tablet', 'family history', 'right atrium', 'back pain']\n",
"------------------------------\n",
"Gensim TextRank:\n",
"Captured 2\n",
"['daily', 'negative', 'discharge', 'blood', 'medication', 'medical', 'medications', 'days', 'day', 'pain', 'portal', 'times', 'sig', 'egd right', 'history', 'effusion', 'date', 'non', 'varices', 'liver', 'urine', 'hospital', 'positive', 'ventricular wall', 'somnolent', 'somnolence', 'normal', 'vein', 'veins', 'abdominal', 'taking naproxen', 'pulmonary', 'denies', 'hct', 'ascites', 'given', 'emesis', 'ulcer', 'heart', 'disp', 'cirrhosis', 'alkphos', 'systolic', 'strength', 'alcohol', 'hemodynamic', 'tablet', 'tablets', 'variceal bleed', 'left', 'vascular', 'tracing', 'exam', 'radiology', 'apartment', 'endoscopy', 'week', 'weeks', 'refills', 'remains hemodynamically', 'aortic valve', 'known', 'appear', 'appears', 'abd', 'intervals axes', 'interval', 'experiencing intermittent', 'mild', 'glucose', 'size', 'micu', 'spray', 'past', 'regurgitation', 'allergies patient', 'mitral', 'wbc', 'topical', 'results', 'inr', 'slightly', 'noted pertinent', 'increased', 'elevated', 'gram', 'induced', 'chills significant increases', 'rbc', 'fluid bolus running wide', 'warm', 'supple', 'instructions', 'admission', 'fever', 'rdw', 'eos', 'aspirin', 'murmur', 'complaint', 'bowel', 'concerning', 'bleeding', 'bleeds', 'cardiac', 'serum', 'tte', 'sinus rhythm', 'hcv', 'coarsened', 'skin', 'neck', 'membrane', 'medicine', 'mchc', 'recommended']\n",
"------------------------------\n",
"Rake:\n",
"Captured 0\n",
"['furosemide 20mg daily lisinopril 10 mg daily spironolactone 100 mg daily discharge medications', '59m w hcv cirrhosis w grade ii esophageal varices admitted w coffee', 'portal gastropathy grade ii esophageal varices htn social history', 'comparison intervals axes rate pr qrs qt', '17 **]): sinus rhythm prolonged qt interval', 'take naproxen take pantoprazole 40 mg twice daily', '7290 **], md phone :[** telephone', '2424 **], md phone :[** telephone', 'last egd [** 3 -/ 2150 **]),', 'thiamine hcl 100 mg tablet sig', '59m w hepc cirrhosis c', 'spironolactone 100 mg tablet sig', 'folic acid 1 mg tablet sig', 'lisinopril 10 mg tablet sig', 'furosemide 40 mg tablet sig', 'peptic ulcer gi bleed discharge condition', 'mr [** known lastname 52368 **]', 'fluid bolus running wide open', 'acetaminophen 325 mg tablet sig', 'moderate pulmonary artery systolic hypertension', '2 tablets po every [** 6', 'nadolol 20 mg tablet sig', 'take 40 mg daily', 'egd right ij cvl history', 'drugs attending :[** first name3', 'ii esophageal varices', '[** known lastname 52368 **]', 'coffee ground emesis major surgical', 'pantoprazole 40 mg tablet', '9 hr 71 bp 83']\n",
"------------------------------\n",
"Rakun:\n",
"01-Jun-21 18:08:13 - Number of nodes reduced from 454 to 425\n",
"Captured 1\n",
"['blood', 'nsaids', 'instead', 'daily', '05:00am blood', '2150-4-17', 'admission', 'naproxen', 'urine', 'admission 2150-4-17 11:01pm', 'admission 2150-4-17 01:30pm', '2150-4-17 11:01pm urine', 'normal', 'systolic', 'mitral', '2150-4-17 11:01pm', '2150-4-17 01:30pm', 'systolic function mitral', 'systolic function', 'ulcer', '11:01pm urine', 'right', 'aortic', 'mitral valve leaflets', 'ventricular systolic function', 'heart', 'valve', 'function', 'aortic valve leaflets', 'artery systolic function']\n",
"------------------------------\n",
"Yake:\n",
"01-Jun-21 18:08:13 - Load pretrained SentenceTransformer: distilbert-base-nli-mean-tokens\n",
"01-Jun-21 18:08:13 - Did not find folder distilbert-base-nli-mean-tokens\n",
"01-Jun-21 18:08:13 - Search model on server: http://sbert.net/models/distilbert-base-nli-mean-tokens.zip\n",
"01-Jun-21 18:08:13 - Load SentenceTransformer from folder: /home/lily/br384/.cache/torch/sentence_transformers/sbert.net_models_distilbert-base-nli-mean-tokens\n",
"Captured 0\n",
"['tablet sig', 'tablet', 'blood', 'sig', 'coffee ground emesis', 'daily', 'ground emesis major', 'emesis major surgical', 'medicine allergies', 'day', 'chief complaint', 'drugs attending', 'coffee ground', 'discharge', 'major surgical', 'surgical or invasive', 'emesis', 'allergies', 'admission', 'normal', 'naproxen', 'pain', 'date', 'allergies to drugs', 'urine', 'ground emesis', 'emesis major', 'portal', 'history', 'week']\n",
"------------------------------\n",
"KeyBERT:\n",
"01-Jun-21 18:08:15 - Use pytorch device: cuda\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "dd4fde3a772541dc8c570cccf0276cfc"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=49.0), HTML(value='')))",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "da1a48765297468a9dfbb905346207d6"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Captured 0\n",
"['medicine allergies', '2150 coffee', 'hcv cirrhosis', 'fax 2422', 'birth 2090', 'labs 2150', 'cardiology tte', 'naproxen pantoprazole', 'nsaids blood', 'endoscopy week', 'lastname 52368', 'pneumothorax pleural', 'duodenum radiology', '13pm blood', 'bacteri yeast', 'brief hospital', 'hemodynamic monitoring', 'octreotide drip', 'ultrasound showed', 'ct 186', 'menthol lotion', 'hospital1 times', 'instead nsaids', 'discharge hcv', '59m hcv', 'left ventricular', 'acetaminophen 325', 'past medical', 'blood wbc', 'cardiac tamponade']\n"
]
}
],
"source": [
"main_extraction(word1, txt1)"
]
}
]
}