2650 lines (2649 with data), 296.1 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# EDA\n",
"This notebook performs exploratory data analysis on the training notes and their concept, assertion and relation annotations.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/mnt/d/Google Drive/projects/medical_txt_parser/src/notebooks\n",
"/mnt/d/Google Drive/projects/medical_txt_parser/src\n",
"/mnt/d/Google Drive/projects/medical_txt_parser\n"
]
}
],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"# Silence deprecation noise before the heavier imports below are loaded.\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\",category=DeprecationWarning)\n",
"\n",
"import glob\n",
"import os\n",
"from pprint import pprint\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from tqdm.notebook import tqdm\n",
"\n",
"# Move up until no component of the working directory is named \"src\", so that\n",
"# the `src` package is importable and the relative `data/...` paths resolve.\n",
"# Whole path components are compared (not substrings): an ancestor directory\n",
"# whose name merely contains \"src\" must not make the loop climb past the project.\n",
"path = %pwd\n",
"while \"src\" in os.path.normpath(path).split(os.sep):\n",
"    %cd ..\n",
"    path = %pwd\n",
"\n",
"# Project-local import; it has to run after the working directory is fixed.\n",
"from src.utils.parse_data import parse_ast, parse_concept, parse_relation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train_data_path = \"data/train\"\n",
"val_data_path = \"data/val\"\n",
"ast_folder_name = \"ast\"\n",
"concept_folder_name = \"concept\"\n",
"rel_folder_name = \"rel\"\n",
"txt_folder_name = \"txt\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0a2609234fd24eb8a35993fca49a0fc2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/170 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>filename</th>\n",
" <th>concept</th>\n",
" <th>ast</th>\n",
" <th>rel</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>018636330 DH\\n5425710\\n123524\\n0144918\\n6/2/20...</td>\n",
" <td>018636330_DH</td>\n",
" <td>{'concept_text': ['a workup', 'pain', 'microsc...</td>\n",
" <td>{'concept_text': ['pain', 'hyperlipidemia', 'h...</td>\n",
" <td>{'concept_text_1': ['po pain medications', 'a ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>026350193 RWH\\n7093319\\n549304\\n8417371\\n6/5/2...</td>\n",
" <td>026350193_RWH</td>\n",
" <td>{'concept_text': ['flexeril', 'constipation', ...</td>\n",
" <td>{'concept_text': ['constipation', 'left should...</td>\n",
" <td>{'concept_text_1': [], 'start_line_1': [], 'st...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>037945397 RWH\\n2690633\\n194867\\n151887\\n10/17/...</td>\n",
" <td>037945397_RWH</td>\n",
" <td>{'concept_text': ['ivf', 'near syncope', 'recu...</td>\n",
" <td>{'concept_text': ['near syncope', 'recurrent d...</td>\n",
" <td>{'concept_text_1': [], 'start_line_1': [], 'st...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>044687343 ELMVH\\n01719921\\n1626859\\n3/13/2006 ...</td>\n",
" <td>044687343_ELMVH</td>\n",
" <td>{'concept_text': ['lisinopril pump', 'bipap', ...</td>\n",
" <td>{'concept_text': ['copd', 'nad', 'fatigue', 'g...</td>\n",
" <td>{'concept_text_1': ['bipap', 'fatigue', 'ekg',...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>060376519 DH\\n0649031\\n323495\\n3838556\\n4/5/20...</td>\n",
" <td>060376519_DH</td>\n",
" <td>{'concept_text': ['dizziness', 'benign positio...</td>\n",
" <td>{'concept_text': ['dizziness', 'benign positio...</td>\n",
" <td>{'concept_text_1': ['fever'], 'start_line_1': ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text filename \\\n",
"0 018636330 DH\\n5425710\\n123524\\n0144918\\n6/2/20... 018636330_DH \n",
"1 026350193 RWH\\n7093319\\n549304\\n8417371\\n6/5/2... 026350193_RWH \n",
"2 037945397 RWH\\n2690633\\n194867\\n151887\\n10/17/... 037945397_RWH \n",
"3 044687343 ELMVH\\n01719921\\n1626859\\n3/13/2006 ... 044687343_ELMVH \n",
"4 060376519 DH\\n0649031\\n323495\\n3838556\\n4/5/20... 060376519_DH \n",
"\n",
" concept \\\n",
"0 {'concept_text': ['a workup', 'pain', 'microsc... \n",
"1 {'concept_text': ['flexeril', 'constipation', ... \n",
"2 {'concept_text': ['ivf', 'near syncope', 'recu... \n",
"3 {'concept_text': ['lisinopril pump', 'bipap', ... \n",
"4 {'concept_text': ['dizziness', 'benign positio... \n",
"\n",
" ast \\\n",
"0 {'concept_text': ['pain', 'hyperlipidemia', 'h... \n",
"1 {'concept_text': ['constipation', 'left should... \n",
"2 {'concept_text': ['near syncope', 'recurrent d... \n",
"3 {'concept_text': ['copd', 'nad', 'fatigue', 'g... \n",
"4 {'concept_text': ['dizziness', 'benign positio... \n",
"\n",
" rel \n",
"0 {'concept_text_1': ['po pain medications', 'a ... \n",
"1 {'concept_text_1': [], 'start_line_1': [], 'st... \n",
"2 {'concept_text_1': [], 'start_line_1': [], 'st... \n",
"3 {'concept_text_1': ['bipap', 'fatigue', 'ekg',... \n",
"4 {'concept_text_1': ['fever'], 'start_line_1': ... "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load every training note together with its assertion, concept and relation\n",
"# annotations into one row per record.\n",
"# sorted() makes the row order reproducible; glob returns paths in arbitrary order.\n",
"text_files = sorted(glob.glob(os.path.join(train_data_path, txt_folder_name, \"*.txt\")))\n",
"filename = \"\"\n",
"records = []\n",
"for file in tqdm(text_files):\n",
"    with open(file, 'r') as f:\n",
"        text = f.read()\n",
"    # Use os.path helpers instead of split(\"/\"): the glob pattern is built with\n",
"    # os.sep, so the file name is not preceded by \"/\" on every platform.\n",
"    # splitext strips only the final \".txt\" extension.\n",
"    filename = os.path.splitext(os.path.basename(file))[0]\n",
"    # NOTE: `ast` and `concept` from the last iteration are read by later cells\n",
"    # (for their keys), so these names are kept as they are.\n",
"    ast = parse_ast(os.path.join(train_data_path, ast_folder_name, filename + \".ast\"))\n",
"    concept = parse_concept(os.path.join(train_data_path, concept_folder_name, filename + \".con\"))\n",
"    rel = parse_relation(os.path.join(train_data_path, rel_folder_name, filename + \".rel\"))\n",
"\n",
"    records.append({\"text\": text, \"filename\": filename, \"concept\": concept, \"ast\": ast, \"rel\": rel})\n",
"\n",
"# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied\n",
"# the whole frame on every iteration.\n",
"df = pd.DataFrame(records, columns=[\"text\", \"filename\", \"concept\", \"ast\", \"rel\"])\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Raw note text indexed by record name, for direct lookup of a single note.\n",
"a = df.set_index(\"filename\")[[\"text\"]]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'018636330 DH\\n5425710\\n123524\\n0144918\\n6/2/2005 12:00:00 AM\\nDischarge Summary\\nSigned\\nDIS\\nReport Status :\\nSigned\\nDISCHARGE SUMMARY\\nNAME :\\nKOTE , OA\\nUNIT NUMBER :\\n509-22-30\\nADMISSION DATE :\\n06/02/2005\\nDISCHARGE DATE :\\n06/05/2005\\nPRINCIPAL DIAGNOSIS :\\nC5-6 disc herniation with cord compression and myelopathy .\\nPRINCIPAL PROCEDURE :\\nMicroscopic anterior cervical diskectomy at C5-6 and fusion .\\nHISTORY OF PRESENT ILLNESS :\\nThe patient is a 63-year-old female with a three-year history of bilateral hand numbness and occasional weakness .\\nWithin the past year , these symptoms have progressively gotten worse , to encompass also her feet .\\nShe had a workup by her neurologist and an MRI revealed a C5-6 disc herniation with cord compression and a T2 signal change at that level .\\nPAST MEDICAL HISTORY :\\nSignificant for hypertension , hyperlipidemia .\\nMEDICATIONS ON ADMISSION :\\nLipitor , Flexeril , hydrochlorothiazide and Norvasc .\\nALLERGIES :\\nShe has no known drug allergy .\\nSOCIAL HISTORY :\\nShe smokes one pack per day x45 years .\\nShe occasionally drinks alcohol .\\nPHYSICAL EXAMINATION :\\nShe had 5/5 strength in bilateral upper and lower extremities .\\nShe had a Hoffman 's sign greater on the right than the left and she had 10 beats of clonus in the right foot and 3-5 beats in the left foot .\\nShe had hyperreflexia in both the bilateral upper and lower extremities .\\nHOSPITAL COURSE :\\nThe patient tolerated a C5-6 ACDF by Dr. Miezetri Gach quite well .\\nShe had a postoperative CT scan that revealed partial decompression of the spinal canal and good placement of her hardware .\\nImmediately postop , her exam only improved slightly in her hyperreflexia .\\nShe was ambulating by postoperative day number two .\\nShe tolerated a regular diet .\\nHer pain was under good control with PO pain medications and she was deemed suitable for discharge .\\nDISCHARGE ORDERS :\\nThe patient was asked to call Dr. 
Miezetri Gach 's office for a follow-up appointment and wound check .\\nShe is asked to call with any fevers , chills , increasing weakness or numbness or any bowel and bladder disruption .\\nDISCHARGE MEDICATIONS :\\nShe was discharged on the following medications .\\n1. Colace , 100 mg PO bid .\\n2. Zantac , 150 mg PO bid .\\n3. Percocet , 5/325 , 1-2 tabs PO q4-6h prn pain .\\n4. Lipitor , 10 mg PO daily .\\n5. Hydrochlorothiazide , 25 mg PO daily .\\n6. Norvasc , 5 mg PO daily .\\nLALIND KOTE , M.D.\\nDICTATING FOR :\\nElectronically Signed MIEZETRI NIMIRY POP , M.D. 08/01/2005 18:50\\n_____________________________ MIEZETRI NIMIRY POP , M.D.\\nTR :\\nqg\\nDD :\\n07/30/2005\\nTD :\\n07/31/2005 9:43 A 123524\\ncc :\\nMIEZETRI NIMIRY POP , M.D.\\n'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Full text of one record, selected by row label and column in a single .loc call.\n",
"a.loc[\"018636330_DH\", \"text\"]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['concept_text', 'start_line', 'start_word_number', 'end_line', 'end_word_number', 'concept_type'])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Field names of a parsed concept annotation (taken from the first record).\n",
"df.loc[0, \"concept\"].keys()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" <th>assertion_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>018636330_DH</td>\n",
" <td>pain</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>018636330_DH</td>\n",
" <td>hyperlipidemia</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>018636330_DH</td>\n",
" <td>her pain</td>\n",
" <td>47</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" <td>1</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>018636330_DH</td>\n",
" <td>cord compression</td>\n",
" <td>27</td>\n",
" <td>16</td>\n",
" <td>27</td>\n",
" <td>17</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>018636330_DH</td>\n",
" <td>chills</td>\n",
" <td>50</td>\n",
" <td>9</td>\n",
" <td>50</td>\n",
" <td>9</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line start_word_number end_line \\\n",
"0 018636330_DH pain 55 10 55 \n",
"1 018636330_DH hyperlipidemia 29 4 29 \n",
"2 018636330_DH her pain 47 0 47 \n",
"3 018636330_DH cord compression 27 16 27 \n",
"4 018636330_DH chills 50 9 50 \n",
"\n",
" end_word_number concept_type assertion_type \n",
"0 10 problem hypothetical \n",
"1 4 problem present \n",
"2 1 problem present \n",
"3 17 problem present \n",
"4 9 problem hypothetical "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per assertion annotation, tagged with the record it belongs to.\n",
"# `ast` is the dict parsed for the last file in the loading cell; only its keys\n",
"# are used here, to fix the column order (filename first).\n",
"ast_columns = [\"filename\"] + list(ast.keys())\n",
"ast_frames = [\n",
"    pd.DataFrame(record[\"ast\"]).assign(filename=record[\"filename\"])\n",
"    for _, record in df.iterrows()\n",
"]\n",
"# Concatenate once: DataFrame.append was removed in pandas 2.0 and copied the\n",
"# accumulated frame on every iteration.\n",
"ast_df = (\n",
"    pd.concat(ast_frames, ignore_index=True)[ast_columns]\n",
"    if ast_frames\n",
"    else pd.DataFrame(columns=ast_columns)\n",
")\n",
"ast_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>018636330_DH</td>\n",
" <td>a workup</td>\n",
" <td>27</td>\n",
" <td>2</td>\n",
" <td>27</td>\n",
" <td>3</td>\n",
" <td>test</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>018636330_DH</td>\n",
" <td>pain</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>problem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>018636330_DH</td>\n",
" <td>microscopic anterior cervical diskectomy at c5-6</td>\n",
" <td>23</td>\n",
" <td>0</td>\n",
" <td>23</td>\n",
" <td>5</td>\n",
" <td>treatment</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>018636330_DH</td>\n",
" <td>hyperlipidemia</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>problem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>018636330_DH</td>\n",
" <td>po pain medications</td>\n",
" <td>47</td>\n",
" <td>7</td>\n",
" <td>47</td>\n",
" <td>9</td>\n",
" <td>treatment</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line \\\n",
"0 018636330_DH a workup 27 \n",
"1 018636330_DH pain 55 \n",
"2 018636330_DH microscopic anterior cervical diskectomy at c5-6 23 \n",
"3 018636330_DH hyperlipidemia 29 \n",
"4 018636330_DH po pain medications 47 \n",
"\n",
" start_word_number end_line end_word_number concept_type \n",
"0 2 27 3 test \n",
"1 10 55 10 problem \n",
"2 0 23 5 treatment \n",
"3 4 29 4 problem \n",
"4 7 47 9 treatment "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per concept annotation, tagged with the record it belongs to.\n",
"# `concept` is the dict parsed for the last file in the loading cell; only its\n",
"# keys are used here, to fix the column order (filename first).\n",
"concept_columns = [\"filename\"] + list(concept.keys())\n",
"concept_frames = [\n",
"    pd.DataFrame(record[\"concept\"]).assign(filename=record[\"filename\"])\n",
"    for _, record in df.iterrows()\n",
"]\n",
"# Concatenate once: DataFrame.append was removed in pandas 2.0 and copied the\n",
"# accumulated frame on every iteration.\n",
"concept_df = (\n",
"    pd.concat(concept_frames, ignore_index=True)[concept_columns]\n",
"    if concept_frames\n",
"    else pd.DataFrame(columns=concept_columns)\n",
")\n",
"concept_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"elements in concept_prob_df: 7073\n",
"elements in ast_df: 7073\n"
]
}
],
"source": [
"# Keep only the concepts annotated as problems and compare the row count with\n",
"# the number of assertion annotations.\n",
"is_problem = concept_df[\"concept_type\"].eq(\"problem\")\n",
"concept_prob_df = concept_df.loc[is_problem]\n",
"print(\"elements in concept_prob_df: \", len(concept_prob_df))\n",
"print(\"elements in ast_df: \", len(ast_df))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Renumber the filtered rows from 0 so the comparison is positional, then check\n",
"# that the problem texts match the assertion texts element by element.\n",
"problem_texts = concept_prob_df[\"concept_text\"].reset_index(drop=True)\n",
"problem_texts.equals(ast_df[\"concept_text\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
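"This means that every `problem` concept also appears, in the same order, in the assertion (`ast`) data, so the `problem` annotations are fully covered by the assertion annotations."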
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Better data representation"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>ast_con_label</th>\n",
" <th>filename</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>16425</th>\n",
" <td>diaphoresis</td>\n",
" <td>14.0</td>\n",
" <td>15.0</td>\n",
" <td>14.0</td>\n",
" <td>15.0</td>\n",
" <td>present</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16426</th>\n",
" <td>ectopy</td>\n",
" <td>68.0</td>\n",
" <td>11.0</td>\n",
" <td>68.0</td>\n",
" <td>11.0</td>\n",
" <td>absent</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16427</th>\n",
" <td>further pain</td>\n",
" <td>24.0</td>\n",
" <td>12.0</td>\n",
" <td>24.0</td>\n",
" <td>13.0</td>\n",
" <td>absent</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16428</th>\n",
" <td>nontender</td>\n",
" <td>53.0</td>\n",
" <td>7.0</td>\n",
" <td>53.0</td>\n",
" <td>7.0</td>\n",
" <td>absent</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16429</th>\n",
" <td>jvd</td>\n",
" <td>47.0</td>\n",
" <td>1.0</td>\n",
" <td>47.0</td>\n",
" <td>1.0</td>\n",
" <td>absent</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16520</th>\n",
" <td>hydrochlorothiazide</td>\n",
" <td>99.0</td>\n",
" <td>30.0</td>\n",
" <td>99.0</td>\n",
" <td>30.0</td>\n",
" <td>treatment</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16521</th>\n",
" <td>his electrolytes</td>\n",
" <td>80.0</td>\n",
" <td>0.0</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>test</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16522</th>\n",
" <td>nitroglycerin</td>\n",
" <td>59.0</td>\n",
" <td>42.0</td>\n",
" <td>59.0</td>\n",
" <td>42.0</td>\n",
" <td>treatment</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16523</th>\n",
" <td>auscultation</td>\n",
" <td>52.0</td>\n",
" <td>5.0</td>\n",
" <td>52.0</td>\n",
" <td>5.0</td>\n",
" <td>test</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16524</th>\n",
" <td>nitroglycerin</td>\n",
" <td>20.0</td>\n",
" <td>12.0</td>\n",
" <td>20.0</td>\n",
" <td>12.0</td>\n",
" <td>treatment</td>\n",
" <td>record-84</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" concept_text start_line start_word_number end_line \\\n",
"16425 diaphoresis 14.0 15.0 14.0 \n",
"16426 ectopy 68.0 11.0 68.0 \n",
"16427 further pain 24.0 12.0 24.0 \n",
"16428 nontender 53.0 7.0 53.0 \n",
"16429 jvd 47.0 1.0 47.0 \n",
"... ... ... ... ... \n",
"16520 hydrochlorothiazide 99.0 30.0 99.0 \n",
"16521 his electrolytes 80.0 0.0 80.0 \n",
"16522 nitroglycerin 59.0 42.0 59.0 \n",
"16523 auscultation 52.0 5.0 52.0 \n",
"16524 nitroglycerin 20.0 12.0 20.0 \n",
"\n",
" end_word_number ast_con_label filename \n",
"16425 15.0 present record-84 \n",
"16426 11.0 absent record-84 \n",
"16427 13.0 absent record-84 \n",
"16428 7.0 absent record-84 \n",
"16429 1.0 absent record-84 \n",
"... ... ... ... \n",
"16520 30.0 treatment record-84 \n",
"16521 1.0 test record-84 \n",
"16522 42.0 treatment record-84 \n",
"16523 5.0 test record-84 \n",
"16524 12.0 treatment record-84 \n",
"\n",
"[100 rows x 7 columns]"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Merge both annotation sources into one table with a single `ast_con_label`\n",
"# column: problems are labelled with their assertion type, every other concept\n",
"# with its concept type.\n",
"ast_concept_frames = []\n",
"for _, record in df.iterrows():\n",
"    # Assertions: drop the concept type and use the assertion type as label.\n",
"    record_ast = (\n",
"        pd.DataFrame(record[\"ast\"])\n",
"        .drop(columns=[\"concept_type\"])\n",
"        .rename(columns={\"assertion_type\": \"ast_con_label\"})\n",
"        .assign(filename=record[\"filename\"])\n",
"    )\n",
"\n",
"    # Only concepts that are not \"problem\"; their concept type is the label.\n",
"    # assign() returns a new frame, so nothing is written into a filtered slice.\n",
"    record_concept = (\n",
"        pd.DataFrame(record[\"concept\"])\n",
"        .loc[lambda frame: frame[\"concept_type\"] != \"problem\"]\n",
"        .rename(columns={\"concept_type\": \"ast_con_label\"})\n",
"        .assign(filename=record[\"filename\"])\n",
"    )\n",
"\n",
"    ast_concept_frames.append(record_ast)\n",
"    ast_concept_frames.append(record_concept)\n",
"\n",
"# Concatenate once: DataFrame.append was removed in pandas 2.0 and copied the\n",
"# accumulated frame on every iteration.\n",
"ast_concept_df = (\n",
"    pd.concat(ast_concept_frames, ignore_index=True)\n",
"    if ast_concept_frames\n",
"    else pd.DataFrame()\n",
")\n",
"ast_concept_df.tail(100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Concept Analysis"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>018636330_DH</td>\n",
" <td>a workup</td>\n",
" <td>27</td>\n",
" <td>2</td>\n",
" <td>27</td>\n",
" <td>3</td>\n",
" <td>test</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>018636330_DH</td>\n",
" <td>pain</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>problem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>018636330_DH</td>\n",
" <td>microscopic anterior cervical diskectomy at c5-6</td>\n",
" <td>23</td>\n",
" <td>0</td>\n",
" <td>23</td>\n",
" <td>5</td>\n",
" <td>treatment</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>018636330_DH</td>\n",
" <td>hyperlipidemia</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>problem</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>018636330_DH</td>\n",
" <td>po pain medications</td>\n",
" <td>47</td>\n",
" <td>7</td>\n",
" <td>47</td>\n",
" <td>9</td>\n",
" <td>treatment</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line \\\n",
"0 018636330_DH a workup 27 \n",
"1 018636330_DH pain 55 \n",
"2 018636330_DH microscopic anterior cervical diskectomy at c5-6 23 \n",
"3 018636330_DH hyperlipidemia 29 \n",
"4 018636330_DH po pain medications 47 \n",
"\n",
" start_word_number end_line end_word_number concept_type \n",
"0 2 27 3 test \n",
"1 10 55 10 problem \n",
"2 0 23 5 treatment \n",
"3 4 29 4 problem \n",
"4 7 47 9 treatment "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rebuild the concept table for this section: one row per concept annotation,\n",
"# tagged with the record it belongs to.\n",
"# `concept` is the dict parsed for the last file in the loading cell; only its\n",
"# keys are used here, to fix the column order (filename first).\n",
"concept_columns = [\"filename\"] + list(concept.keys())\n",
"concept_frames = [\n",
"    pd.DataFrame(record[\"concept\"]).assign(filename=record[\"filename\"])\n",
"    for _, record in df.iterrows()\n",
"]\n",
"# Concatenate once: DataFrame.append was removed in pandas 2.0 and copied the\n",
"# accumulated frame on every iteration.\n",
"concept_df = (\n",
"    pd.concat(concept_frames, ignore_index=True)[concept_columns]\n",
"    if concept_frames\n",
"    else pd.DataFrame(columns=concept_columns)\n",
")\n",
"concept_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some annotations appear more than once in the data, so we need to drop the duplicates."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"filename concept_text start_line start_word_number end_line end_word_number concept_type\n",
"245096078 kayciel 131 70 131 70 treatment 2\n",
"627258104 cultures 83 0 83 0 test 2\n",
"555509347_PUMC multivitamins 62 0 62 0 treatment 2\n",
"523704694 proctofoam 36 35 36 35 treatment 2\n",
"641557794_WGH papillary carcinoma 50 0 50 1 problem 2\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count identical annotation rows; a count above 1 marks a duplicated annotation.\n",
"annotation_counts = concept_df.value_counts()\n",
"annotation_counts.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"filename concept_text start_line start_word_number end_line end_word_number concept_type\n",
"018636330_DH 10 beats of clonus 39 16 39 19 problem 1\n",
"record-25 placement of 8 caucasian nephrostomy catheter 21 9 21 14 treatment 1\n",
" initial work-up 87 8 87 9 test 1\n",
" inr 42 0 42 0 test 1\n",
" intravenous albumin 79 0 79 1 treatment 1\n",
"dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the first occurrence of every annotation row, then confirm that no row\n",
"# is counted more than once any more.\n",
"concept_df = concept_df.drop_duplicates(keep=\"first\")\n",
"deduplicated_counts = concept_df.value_counts()\n",
"deduplicated_counts.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Feature `concept_type`"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0.5, 1.0, 'Number of Concepts per File')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 720x2520 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Number of concept annotations per record, smallest first.\n",
"concepts_per_file = (\n",
"    concept_df[[\"concept_type\", \"filename\"]]\n",
"    .groupby(\"filename\")\n",
"    .count()\n",
"    .sort_values(by=\"concept_type\", ascending=True)\n",
")\n",
"# Size this figure only: assigning plt.rcParams['figure.figsize'] here would\n",
"# also stretch every figure drawn by later cells to 10x35.\n",
"ax = concepts_per_file.plot(kind=\"barh\", figsize=(10, 35))\n",
"# The trailing semicolon keeps the Text(...) repr out of the cell output.\n",
"ax.set_title(\"Number of Concepts per File\");"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" </tr>\n",
" <tr>\n",
" <th>concept_type</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>problem</th>\n",
" <td>7072</td>\n",
" </tr>\n",
" <tr>\n",
" <th>test</th>\n",
" <td>4607</td>\n",
" </tr>\n",
" <tr>\n",
" <th>treatment</th>\n",
" <td>4841</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filename\n",
"concept_type \n",
"problem 7072\n",
"test 4607\n",
"treatment 4841"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1800x504 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# number of annotations per concept type: plotted, then shown as a table\n",
"concept_type_counts = concept_df[[\"concept_type\", \"filename\"]].groupby(\"concept_type\").count()\n",
"concept_type_counts.plot(kind=\"barh\")\n",
"concept_type_counts"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Assertion Analysis"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" <th>assertion_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>018636330_DH</td>\n",
" <td>pain</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>018636330_DH</td>\n",
" <td>hyperlipidemia</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>018636330_DH</td>\n",
" <td>her pain</td>\n",
" <td>47</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" <td>1</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>018636330_DH</td>\n",
" <td>cord compression</td>\n",
" <td>27</td>\n",
" <td>16</td>\n",
" <td>27</td>\n",
" <td>17</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>018636330_DH</td>\n",
" <td>chills</td>\n",
" <td>50</td>\n",
" <td>9</td>\n",
" <td>50</td>\n",
" <td>9</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line start_word_number end_line \\\n",
"0 018636330_DH pain 55 10 55 \n",
"1 018636330_DH hyperlipidemia 29 4 29 \n",
"2 018636330_DH her pain 47 0 47 \n",
"3 018636330_DH cord compression 27 16 27 \n",
"4 018636330_DH chills 50 9 50 \n",
"\n",
" end_word_number concept_type assertion_type \n",
"0 10 problem hypothetical \n",
"1 4 problem present \n",
"2 1 problem present \n",
"3 17 problem present \n",
"4 9 problem hypothetical "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per assertion annotation, tagged with the record it belongs to.\n",
"# `ast` is the dict parsed for the last file in the loading cell; only its keys\n",
"# are used here, to fix the column order (filename first).\n",
"assertion_columns = [\"filename\"] + list(ast.keys())\n",
"assertion_frames = [\n",
"    pd.DataFrame(record[\"ast\"]).assign(filename=record[\"filename\"])\n",
"    for _, record in df.iterrows()\n",
"]\n",
"# Concatenate once: DataFrame.append was removed in pandas 2.0 and copied the\n",
"# accumulated frame on every iteration.\n",
"assertion_df = (\n",
"    pd.concat(assertion_frames, ignore_index=True)[assertion_columns]\n",
"    if assertion_frames\n",
"    else pd.DataFrame(columns=assertion_columns)\n",
")\n",
"assertion_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" </tr>\n",
" <tr>\n",
" <th>assertion_type</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>conditional</th>\n",
" <td>73</td>\n",
" </tr>\n",
" <tr>\n",
" <th>associated_with_someone_else</th>\n",
" <td>89</td>\n",
" </tr>\n",
" <tr>\n",
" <th>possible</th>\n",
" <td>309</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hypothetical</th>\n",
" <td>382</td>\n",
" </tr>\n",
" <tr>\n",
" <th>absent</th>\n",
" <td>1596</td>\n",
" </tr>\n",
" <tr>\n",
" <th>present</th>\n",
" <td>4624</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filename\n",
"assertion_type \n",
"conditional 73\n",
"associated_with_someone_else 89\n",
"possible 309\n",
"hypothetical 382\n",
"absent 1596\n",
"present 4624"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# number of annotations per assertion type, rarest first: plotted, then shown as a table\n",
"assertion_type_counts = (\n",
"    assertion_df[[\"assertion_type\", \"filename\"]]\n",
"    .groupby(\"assertion_type\")\n",
"    .count()\n",
"    .sort_values(by=\"filename\", ascending=True)\n",
")\n",
"assertion_type_counts.plot(kind=\"barh\")\n",
"assertion_type_counts"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Present\n",
"Problems associated with the patient can be present. This is the default category for medical problems, and it contains those that do not fit the definition of any of the other assertion categories."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" <th>assertion_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>018636330_DH</td>\n",
" <td>hyperlipidemia</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>018636330_DH</td>\n",
" <td>her pain</td>\n",
" <td>47</td>\n",
" <td>0</td>\n",
" <td>47</td>\n",
" <td>1</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>018636330_DH</td>\n",
" <td>cord compression</td>\n",
" <td>27</td>\n",
" <td>16</td>\n",
" <td>27</td>\n",
" <td>17</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>018636330_DH</td>\n",
" <td>hyperreflexia</td>\n",
" <td>40</td>\n",
" <td>2</td>\n",
" <td>40</td>\n",
" <td>2</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>018636330_DH</td>\n",
" <td>partial decompression of the spinal canal</td>\n",
" <td>43</td>\n",
" <td>8</td>\n",
" <td>43</td>\n",
" <td>13</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7066</th>\n",
" <td>record-84</td>\n",
" <td>minimal ooze</td>\n",
" <td>55</td>\n",
" <td>5</td>\n",
" <td>55</td>\n",
" <td>6</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7067</th>\n",
" <td>record-84</td>\n",
" <td>10/10 substernal chest pain</td>\n",
" <td>12</td>\n",
" <td>37</td>\n",
" <td>12</td>\n",
" <td>40</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7068</th>\n",
" <td>record-84</td>\n",
" <td>st elevations in v1-v3</td>\n",
" <td>15</td>\n",
" <td>14</td>\n",
" <td>15</td>\n",
" <td>17</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7069</th>\n",
" <td>record-84</td>\n",
" <td>very tense</td>\n",
" <td>43</td>\n",
" <td>5</td>\n",
" <td>43</td>\n",
" <td>6</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7072</th>\n",
" <td>record-84</td>\n",
" <td>hypertension</td>\n",
" <td>12</td>\n",
" <td>30</td>\n",
" <td>12</td>\n",
" <td>30</td>\n",
" <td>problem</td>\n",
" <td>present</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4624 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line \\\n",
"1 018636330_DH hyperlipidemia 29 \n",
"2 018636330_DH her pain 47 \n",
"3 018636330_DH cord compression 27 \n",
"5 018636330_DH hyperreflexia 40 \n",
"6 018636330_DH partial decompression of the spinal canal 43 \n",
"... ... ... ... \n",
"7066 record-84 minimal ooze 55 \n",
"7067 record-84 10/10 substernal chest pain 12 \n",
"7068 record-84 st elevations in v1-v3 15 \n",
"7069 record-84 very tense 43 \n",
"7072 record-84 hypertension 12 \n",
"\n",
" start_word_number end_line end_word_number concept_type assertion_type \n",
"1 4 29 4 problem present \n",
"2 0 47 1 problem present \n",
"3 16 27 17 problem present \n",
"5 2 40 2 problem present \n",
"6 8 43 13 problem present \n",
"... ... ... ... ... ... \n",
"7066 5 55 6 problem present \n",
"7067 37 12 40 problem present \n",
"7068 14 15 17 problem present \n",
"7069 5 43 6 problem present \n",
"7072 30 12 30 problem present \n",
"\n",
"[4624 rows x 8 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Rows whose assertion type is \"present\"\n",
"assertion_df.loc[assertion_df[\"assertion_type\"].eq(\"present\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Absent\n",
"The note asserts that the problem does not exist in the patient. This category also includes mentions where it is stated that the patient HAD a problem, but no longer does."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" <th>assertion_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>018636330_DH</td>\n",
" <td>known drug allergy</td>\n",
" <td>33</td>\n",
" <td>3</td>\n",
" <td>33</td>\n",
" <td>5</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>044687343_ELMVH</td>\n",
" <td>nad</td>\n",
" <td>80</td>\n",
" <td>0</td>\n",
" <td>80</td>\n",
" <td>0</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>044687343_ELMVH</td>\n",
" <td>st changes</td>\n",
" <td>81</td>\n",
" <td>6</td>\n",
" <td>81</td>\n",
" <td>7</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>044687343_ELMVH</td>\n",
" <td>wheezes</td>\n",
" <td>80</td>\n",
" <td>19</td>\n",
" <td>80</td>\n",
" <td>19</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>044687343_ELMVH</td>\n",
" <td>uti</td>\n",
" <td>92</td>\n",
" <td>14</td>\n",
" <td>92</td>\n",
" <td>14</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7061</th>\n",
" <td>record-84</td>\n",
" <td>edema</td>\n",
" <td>54</td>\n",
" <td>8</td>\n",
" <td>54</td>\n",
" <td>8</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7062</th>\n",
" <td>record-84</td>\n",
" <td>vomiting</td>\n",
" <td>24</td>\n",
" <td>21</td>\n",
" <td>24</td>\n",
" <td>21</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7064</th>\n",
" <td>record-84</td>\n",
" <td>any chest pain</td>\n",
" <td>86</td>\n",
" <td>4</td>\n",
" <td>86</td>\n",
" <td>6</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7070</th>\n",
" <td>record-84</td>\n",
" <td>pnd</td>\n",
" <td>25</td>\n",
" <td>18</td>\n",
" <td>25</td>\n",
" <td>18</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7071</th>\n",
" <td>record-84</td>\n",
" <td>gallops</td>\n",
" <td>51</td>\n",
" <td>10</td>\n",
" <td>51</td>\n",
" <td>10</td>\n",
" <td>problem</td>\n",
" <td>absent</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1596 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line start_word_number \\\n",
"22 018636330_DH known drug allergy 33 3 \n",
"39 044687343_ELMVH nad 80 0 \n",
"42 044687343_ELMVH st changes 81 6 \n",
"48 044687343_ELMVH wheezes 80 19 \n",
"49 044687343_ELMVH uti 92 14 \n",
"... ... ... ... ... \n",
"7061 record-84 edema 54 8 \n",
"7062 record-84 vomiting 24 21 \n",
"7064 record-84 any chest pain 86 4 \n",
"7070 record-84 pnd 25 18 \n",
"7071 record-84 gallops 51 10 \n",
"\n",
" end_line end_word_number concept_type assertion_type \n",
"22 33 5 problem absent \n",
"39 80 0 problem absent \n",
"42 81 7 problem absent \n",
"48 80 19 problem absent \n",
"49 92 14 problem absent \n",
"... ... ... ... ... \n",
"7061 54 8 problem absent \n",
"7062 24 21 problem absent \n",
"7064 86 6 problem absent \n",
"7070 25 18 problem absent \n",
"7071 51 10 problem absent \n",
"\n",
"[1596 rows x 8 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows whose assertion type is \"absent\"\n",
"assertion_df.loc[assertion_df[\"assertion_type\"].eq(\"absent\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Possible:\n",
"The note asserts that the patient may have a problem, but there is\n",
"uncertainty expressed in the note. Possible takes precedence over absent, so\n",
"terms like “probably not” or “unlikely” categorize problems as being possible\n",
"just as “probably” and “likely” do."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" <th>assertion_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>060376519_DH</td>\n",
" <td>benign positional vertigo</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>35</td>\n",
" <td>2</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>060376519_DH</td>\n",
" <td>labyrinthitis</td>\n",
" <td>35</td>\n",
" <td>4</td>\n",
" <td>35</td>\n",
" <td>4</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" <tr>\n",
" <th>150</th>\n",
" <td>101407944_PUMC</td>\n",
" <td>primary cns neoplasm</td>\n",
" <td>99</td>\n",
" <td>11</td>\n",
" <td>99</td>\n",
" <td>13</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" <tr>\n",
" <th>171</th>\n",
" <td>101407944_PUMC</td>\n",
" <td>glioblastoma</td>\n",
" <td>99</td>\n",
" <td>16</td>\n",
" <td>99</td>\n",
" <td>16</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" <tr>\n",
" <th>181</th>\n",
" <td>101407944_PUMC</td>\n",
" <td>ependymal spread</td>\n",
" <td>100</td>\n",
" <td>13</td>\n",
" <td>100</td>\n",
" <td>14</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6978</th>\n",
" <td>record-83</td>\n",
" <td>gastritis</td>\n",
" <td>41</td>\n",
" <td>33</td>\n",
" <td>41</td>\n",
" <td>33</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6979</th>\n",
" <td>record-83</td>\n",
" <td>gastric ulcer</td>\n",
" <td>41</td>\n",
" <td>35</td>\n",
" <td>41</td>\n",
" <td>36</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6980</th>\n",
" <td>record-83</td>\n",
" <td>lower gi pathology</td>\n",
" <td>41</td>\n",
" <td>40</td>\n",
" <td>41</td>\n",
" <td>42</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6986</th>\n",
" <td>record-83</td>\n",
" <td>gastroenteritis</td>\n",
" <td>37</td>\n",
" <td>0</td>\n",
" <td>37</td>\n",
" <td>0</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6988</th>\n",
" <td>record-83</td>\n",
" <td>infection</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" <td>problem</td>\n",
" <td>possible</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>309 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line start_word_number \\\n",
"93 060376519_DH benign positional vertigo 35 0 \n",
"98 060376519_DH labyrinthitis 35 4 \n",
"150 101407944_PUMC primary cns neoplasm 99 11 \n",
"171 101407944_PUMC glioblastoma 99 16 \n",
"181 101407944_PUMC ependymal spread 100 13 \n",
"... ... ... ... ... \n",
"6978 record-83 gastritis 41 33 \n",
"6979 record-83 gastric ulcer 41 35 \n",
"6980 record-83 lower gi pathology 41 40 \n",
"6986 record-83 gastroenteritis 37 0 \n",
"6988 record-83 infection 21 0 \n",
"\n",
" end_line end_word_number concept_type assertion_type \n",
"93 35 2 problem possible \n",
"98 35 4 problem possible \n",
"150 99 13 problem possible \n",
"171 99 16 problem possible \n",
"181 100 14 problem possible \n",
"... ... ... ... ... \n",
"6978 41 33 problem possible \n",
"6979 41 36 problem possible \n",
"6980 41 42 problem possible \n",
"6986 37 0 problem possible \n",
"6988 21 0 problem possible \n",
"\n",
"[309 rows x 8 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows whose assertion type is \"possible\"\n",
"assertion_df.loc[assertion_df[\"assertion_type\"].eq(\"possible\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Conditional:\n",
"The mention of the medical problem asserts that the patient\n",
"experiences the problem only under certain conditions. Allergies can fall into\n",
"this category."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" <th>assertion_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>345</th>\n",
" <td>143748600_SC</td>\n",
" <td>episodes of atypical cp x 1 week</td>\n",
" <td>52</td>\n",
" <td>23</td>\n",
" <td>52</td>\n",
" <td>29</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>442</th>\n",
" <td>176318078_a</td>\n",
" <td>bleeding from the mouth and nose</td>\n",
" <td>29</td>\n",
" <td>3</td>\n",
" <td>29</td>\n",
" <td>8</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>454</th>\n",
" <td>176318078_a</td>\n",
" <td>headache</td>\n",
" <td>43</td>\n",
" <td>3</td>\n",
" <td>43</td>\n",
" <td>3</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>457</th>\n",
" <td>176318078_a</td>\n",
" <td>marked hyperkalemia</td>\n",
" <td>88</td>\n",
" <td>10</td>\n",
" <td>88</td>\n",
" <td>11</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>461</th>\n",
" <td>176318078_a</td>\n",
" <td>epistaxis</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6756</th>\n",
" <td>record-80</td>\n",
" <td>dusky</td>\n",
" <td>25</td>\n",
" <td>14</td>\n",
" <td>25</td>\n",
" <td>14</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6953</th>\n",
" <td>record-83</td>\n",
" <td>only mild nausea</td>\n",
" <td>42</td>\n",
" <td>7</td>\n",
" <td>42</td>\n",
" <td>9</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6983</th>\n",
" <td>record-83</td>\n",
" <td>occ nausea</td>\n",
" <td>37</td>\n",
" <td>10</td>\n",
" <td>37</td>\n",
" <td>11</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6995</th>\n",
" <td>record-84</td>\n",
" <td>dyspnea</td>\n",
" <td>25</td>\n",
" <td>12</td>\n",
" <td>25</td>\n",
" <td>12</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7065</th>\n",
" <td>record-84</td>\n",
" <td>dyspnea</td>\n",
" <td>86</td>\n",
" <td>15</td>\n",
" <td>86</td>\n",
" <td>15</td>\n",
" <td>problem</td>\n",
" <td>conditional</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>73 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line \\\n",
"345 143748600_SC episodes of atypical cp x 1 week 52 \n",
"442 176318078_a bleeding from the mouth and nose 29 \n",
"454 176318078_a headache 43 \n",
"457 176318078_a marked hyperkalemia 88 \n",
"461 176318078_a epistaxis 21 \n",
"... ... ... ... \n",
"6756 record-80 dusky 25 \n",
"6953 record-83 only mild nausea 42 \n",
"6983 record-83 occ nausea 37 \n",
"6995 record-84 dyspnea 25 \n",
"7065 record-84 dyspnea 86 \n",
"\n",
" start_word_number end_line end_word_number concept_type assertion_type \n",
"345 23 52 29 problem conditional \n",
"442 3 29 8 problem conditional \n",
"454 3 43 3 problem conditional \n",
"457 10 88 11 problem conditional \n",
"461 0 21 0 problem conditional \n",
"... ... ... ... ... ... \n",
"6756 14 25 14 problem conditional \n",
"6953 7 42 9 problem conditional \n",
"6983 10 37 11 problem conditional \n",
"6995 12 25 12 problem conditional \n",
"7065 15 86 15 problem conditional \n",
"\n",
"[73 rows x 8 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows whose assertion type is \"conditional\"\n",
"assertion_df.loc[assertion_df[\"assertion_type\"].eq(\"conditional\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Hypothetical:\n",
"Medical problems that the note asserts the patient may\n",
"develop."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" <th>assertion_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>018636330_DH</td>\n",
" <td>pain</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>55</td>\n",
" <td>10</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>018636330_DH</td>\n",
" <td>chills</td>\n",
" <td>50</td>\n",
" <td>9</td>\n",
" <td>50</td>\n",
" <td>9</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>018636330_DH</td>\n",
" <td>fevers</td>\n",
" <td>50</td>\n",
" <td>7</td>\n",
" <td>50</td>\n",
" <td>7</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>018636330_DH</td>\n",
" <td>numbness</td>\n",
" <td>50</td>\n",
" <td>14</td>\n",
" <td>50</td>\n",
" <td>14</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>018636330_DH</td>\n",
" <td>bowel and bladder disruption</td>\n",
" <td>50</td>\n",
" <td>17</td>\n",
" <td>50</td>\n",
" <td>20</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7028</th>\n",
" <td>record-84</td>\n",
" <td>shortness of breath</td>\n",
" <td>99</td>\n",
" <td>25</td>\n",
" <td>99</td>\n",
" <td>27</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7029</th>\n",
" <td>record-84</td>\n",
" <td>any chest pain</td>\n",
" <td>87</td>\n",
" <td>12</td>\n",
" <td>87</td>\n",
" <td>14</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7040</th>\n",
" <td>record-84</td>\n",
" <td>difficulty breathing</td>\n",
" <td>87</td>\n",
" <td>18</td>\n",
" <td>87</td>\n",
" <td>19</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7041</th>\n",
" <td>record-84</td>\n",
" <td>pressure</td>\n",
" <td>87</td>\n",
" <td>16</td>\n",
" <td>87</td>\n",
" <td>16</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7045</th>\n",
" <td>record-84</td>\n",
" <td>light-headiness</td>\n",
" <td>87</td>\n",
" <td>23</td>\n",
" <td>87</td>\n",
" <td>23</td>\n",
" <td>problem</td>\n",
" <td>hypothetical</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>382 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line start_word_number \\\n",
"0 018636330_DH pain 55 10 \n",
"4 018636330_DH chills 50 9 \n",
"18 018636330_DH fevers 50 7 \n",
"19 018636330_DH numbness 50 14 \n",
"20 018636330_DH bowel and bladder disruption 50 17 \n",
"... ... ... ... ... \n",
"7028 record-84 shortness of breath 99 25 \n",
"7029 record-84 any chest pain 87 12 \n",
"7040 record-84 difficulty breathing 87 18 \n",
"7041 record-84 pressure 87 16 \n",
"7045 record-84 light-headiness 87 23 \n",
"\n",
" end_line end_word_number concept_type assertion_type \n",
"0 55 10 problem hypothetical \n",
"4 50 9 problem hypothetical \n",
"18 50 7 problem hypothetical \n",
"19 50 14 problem hypothetical \n",
"20 50 20 problem hypothetical \n",
"... ... ... ... ... \n",
"7028 99 27 problem hypothetical \n",
"7029 87 14 problem hypothetical \n",
"7040 87 19 problem hypothetical \n",
"7041 87 16 problem hypothetical \n",
"7045 87 23 problem hypothetical \n",
"\n",
"[382 rows x 8 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows whose assertion type is \"hypothetical\"\n",
"assertion_df.loc[assertion_df[\"assertion_type\"].eq(\"hypothetical\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Not associated with Patient:\n",
"The mention of the medical problem is associated\n",
"with someone who is not the patient (label `associated_with_someone_else`)."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>concept_text</th>\n",
" <th>start_line</th>\n",
" <th>start_word_number</th>\n",
" <th>end_line</th>\n",
" <th>end_word_number</th>\n",
" <th>concept_type</th>\n",
" <th>assertion_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>133</th>\n",
" <td>101407944_PUMC</td>\n",
" <td>cva</td>\n",
" <td>49</td>\n",
" <td>8</td>\n",
" <td>49</td>\n",
" <td>8</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" <tr>\n",
" <th>136</th>\n",
" <td>101407944_PUMC</td>\n",
" <td>diabetes</td>\n",
" <td>49</td>\n",
" <td>6</td>\n",
" <td>49</td>\n",
" <td>6</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>101407944_PUMC</td>\n",
" <td>hypercholesterolemia</td>\n",
" <td>49</td>\n",
" <td>10</td>\n",
" <td>49</td>\n",
" <td>10</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>101407944_PUMC</td>\n",
" <td>cad</td>\n",
" <td>49</td>\n",
" <td>2</td>\n",
" <td>49</td>\n",
" <td>2</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" <tr>\n",
" <th>245</th>\n",
" <td>130959255</td>\n",
" <td>heart disease</td>\n",
" <td>38</td>\n",
" <td>8</td>\n",
" <td>38</td>\n",
" <td>9</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6809</th>\n",
" <td>record-81</td>\n",
" <td>mentally handicapped</td>\n",
" <td>30</td>\n",
" <td>3</td>\n",
" <td>30</td>\n",
" <td>4</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6827</th>\n",
" <td>record-81</td>\n",
" <td>mentally handicapped</td>\n",
" <td>49</td>\n",
" <td>11</td>\n",
" <td>49</td>\n",
" <td>12</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6831</th>\n",
" <td>record-81</td>\n",
" <td>human immunodeficiency virus</td>\n",
" <td>47</td>\n",
" <td>20</td>\n",
" <td>47</td>\n",
" <td>22</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7031</th>\n",
" <td>record-84</td>\n",
" <td>an mi</td>\n",
" <td>33</td>\n",
" <td>6</td>\n",
" <td>33</td>\n",
" <td>7</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7060</th>\n",
" <td>record-84</td>\n",
" <td>coronary artery disease</td>\n",
" <td>12</td>\n",
" <td>19</td>\n",
" <td>12</td>\n",
" <td>21</td>\n",
" <td>problem</td>\n",
" <td>associated_with_someone_else</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>89 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" filename concept_text start_line \\\n",
"133 101407944_PUMC cva 49 \n",
"136 101407944_PUMC diabetes 49 \n",
"146 101407944_PUMC hypercholesterolemia 49 \n",
"199 101407944_PUMC cad 49 \n",
"245 130959255 heart disease 38 \n",
"... ... ... ... \n",
"6809 record-81 mentally handicapped 30 \n",
"6827 record-81 mentally handicapped 49 \n",
"6831 record-81 human immunodeficiency virus 47 \n",
"7031 record-84 an mi 33 \n",
"7060 record-84 coronary artery disease 12 \n",
"\n",
" start_word_number end_line end_word_number concept_type \\\n",
"133 8 49 8 problem \n",
"136 6 49 6 problem \n",
"146 10 49 10 problem \n",
"199 2 49 2 problem \n",
"245 8 38 9 problem \n",
"... ... ... ... ... \n",
"6809 3 30 4 problem \n",
"6827 11 49 12 problem \n",
"6831 20 47 22 problem \n",
"7031 6 33 7 problem \n",
"7060 19 12 21 problem \n",
"\n",
" assertion_type \n",
"133 associated_with_someone_else \n",
"136 associated_with_someone_else \n",
"146 associated_with_someone_else \n",
"199 associated_with_someone_else \n",
"245 associated_with_someone_else \n",
"... ... \n",
"6809 associated_with_someone_else \n",
"6827 associated_with_someone_else \n",
"6831 associated_with_someone_else \n",
"7031 associated_with_someone_else \n",
"7060 associated_with_someone_else \n",
"\n",
"[89 rows x 8 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows whose assertion type is \"associated_with_someone_else\"\n",
"assertion_df.loc[assertion_df[\"assertion_type\"].eq(\"associated_with_someone_else\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Relations Analysis"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'concept_text_1': ['po pain medications', 'a postoperative ct scan', 'percocet', 'c5-6 disc herniation', 'c5-6 disc herniation', 'a c5-6 disc herniation', 'an mri', 'an mri', 'an mri', 'her exam'], 'start_line_1': [47, 43, 55, 21, 21, 27, 27, 27, 27, 44], 'start_word_number_1': [7, 2, 1, 0, 0, 11, 8, 8, 8, 3], 'end_line_1': [47, 43, 55, 21, 21, 27, 27, 27, 27, 44], 'end_word_number_1': [9, 5, 1, 2, 2, 14, 9, 9, 9, 4], 'concept_text_2': ['her pain', 'partial decompression of the spinal canal', 'pain', 'cord compression', 'myelopathy', 'cord compression', 'a c5-6 disc herniation', 'cord compression', 'a t2 signal change', 'her hyperreflexia'], 'start_line_2': [47, 43, 55, 21, 21, 27, 27, 27, 27, 44], 'start_word_number_2': [0, 8, 10, 4, 7, 16, 11, 16, 19, 9], 'end_line_2': [47, 43, 55, 21, 21, 27, 27, 27, 27, 44], 'end_word_number_2': [1, 13, 10, 5, 7, 17, 14, 17, 22, 10], 'relation_type': ['TrIP', 'TeRP', 'TrAP', 'PIP', 'PIP', 'PIP', 'TeRP', 'TeRP', 'TeRP', 'TeCP']}\n"
]
}
],
"source": [
"# Print the raw relation record stored at df.rel[0]\n",
"first_rel = df.rel[0]\n",
"print(first_rel)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
},
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}