389 lines (388 with data), 13.1 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"import glob\n",
"import os\n",
"\n",
"# Step up one directory at a time while the working directory's path still\n",
"# contains \"src\"; presumably this lands on the project root so the relative\n",
"# data paths in the following cells resolve.\n",
"# NOTE(review): this is a substring test on the whole path, not a test on\n",
"# individual directory names - confirm that is what is wanted.\n",
"while \"src\" in os.getcwd():\n",
"    %cd ..\n",
"path = os.getcwd()\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Dataset locations, relative to the current working directory.\n",
"train_data_path = \"data/train\"\n",
"val_data_path = \"data/val\"\n",
"processed_data_path = \"data/processed\"\n",
"\n",
"# Names of the sub-folders; txt_folder_name is the one globbed under\n",
"# train_data_path to find the raw text files.\n",
"ast_folder_name = \"ast\"\n",
"concept_folder_name = \"concept\"\n",
"rel_folder_name = \"rel\"\n",
"txt_folder_name = \"txt\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Every .txt file of the training split. glob returns matches in arbitrary,\n",
"# platform-dependent order, so the list is sorted to keep row order (and the\n",
"# displayed table) reproducible from one run to the next.\n",
"text_files = sorted(glob.glob(os.path.join(train_data_path, txt_folder_name, \"*.txt\")))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting dateparser\n",
" Using cached dateparser-1.1.0-py2.py3-none-any.whl (288 kB)\n",
"Collecting tzlocal\n",
" Using cached tzlocal-4.1-py3-none-any.whl (19 kB)\n",
"Requirement already satisfied: python-dateutil in /home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages (from dateparser) (2.8.2)\n",
"Requirement already satisfied: pytz in /home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages (from dateparser) (2021.3)\n",
"Requirement already satisfied: regex!=2019.02.19,!=2021.8.27 in /home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages (from dateparser) (2021.11.10)\n",
"Requirement already satisfied: six>=1.5 in /home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages (from python-dateutil->dateparser) (1.16.0)\n",
"Collecting pytz-deprecation-shim\n",
" Using cached pytz_deprecation_shim-0.1.0.post0-py2.py3-none-any.whl (15 kB)\n",
"Collecting tzdata\n",
" Using cached tzdata-2021.5-py2.py3-none-any.whl (339 kB)\n",
"Installing collected packages: tzdata, pytz-deprecation-shim, tzlocal, dateparser\n",
"Successfully installed dateparser-1.1.0 pytz-deprecation-shim-0.1.0.post0 tzdata-2021.5 tzlocal-4.1\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Pinned to the release this notebook was last executed with, so that a fresh\n",
"# environment resolves the same date-parsing behaviour.\n",
"%pip install dateparser==1.1.0"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/170 [00:00<?, ?it/s]/home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages/dateparser/date_parser.py:35: PytzUsageWarning: The localize method is no longer necessary, as this time zone supports the fold attribute (PEP 495). For more details on migrating to a PEP 495-compliant implementation, see https://pytz-deprecation-shim.readthedocs.io/en/latest/migration.html\n",
" date_obj = stz.localize(date_obj)\n",
"100%|██████████| 170/170 [00:17<00:00, 9.72it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"filename 0\n",
"age 137\n",
"sexe 0\n",
"birthdate 102\n",
"admission_date 41\n",
"discharge_date 44\n",
"dtype: int64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>age</th>\n",
" <th>sexe</th>\n",
" <th>birthdate</th>\n",
" <th>admission_date</th>\n",
" <th>discharge_date</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>018636330_DH</td>\n",
" <td>None</td>\n",
" <td>N/A</td>\n",
" <td>None</td>\n",
" <td>2005/06/02</td>\n",
" <td>2005/06/05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>026350193_RWH</td>\n",
" <td>5</td>\n",
" <td>N/A</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>037945397_RWH</td>\n",
" <td>7</td>\n",
" <td>N/A</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>044687343_ELMVH</td>\n",
" <td>None</td>\n",
" <td>N/A</td>\n",
" <td>None</td>\n",
" <td>2006/03/13</td>\n",
" <td>2006/03/19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>060376519_DH</td>\n",
" <td>4</td>\n",
" <td>N/A</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>165</th>\n",
" <td>record-80</td>\n",
" <td>None</td>\n",
" <td>M</td>\n",
" <td>2017/09/22</td>\n",
" <td>2017/09/22</td>\n",
" <td>2017/09/27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>record-81</td>\n",
" <td>None</td>\n",
" <td>M</td>\n",
" <td>None</td>\n",
" <td>2011/02/06</td>\n",
" <td>2011/02/08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>167</th>\n",
" <td>record-82</td>\n",
" <td>None</td>\n",
" <td>N/A</td>\n",
" <td>None</td>\n",
" <td>2015/03/16</td>\n",
" <td>2015/03/19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>168</th>\n",
" <td>record-83</td>\n",
" <td>None</td>\n",
" <td>F</td>\n",
" <td>1930/12/19</td>\n",
" <td>2013/11/04</td>\n",
" <td>2013/11/16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>169</th>\n",
" <td>record-84</td>\n",
" <td>None</td>\n",
" <td>M</td>\n",
" <td>1959/12/09</td>\n",
" <td>2014/10/14</td>\n",
" <td>2014/10/17</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>170 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" filename age sexe birthdate admission_date discharge_date\n",
"0 018636330_DH None N/A None 2005/06/02 2005/06/05\n",
"1 026350193_RWH 5 N/A None None None\n",
"2 037945397_RWH 7 N/A None None None\n",
"3 044687343_ELMVH None N/A None 2006/03/13 2006/03/19\n",
"4 060376519_DH 4 N/A None None None\n",
".. ... ... ... ... ... ...\n",
"165 record-80 None M 2017/09/22 2017/09/22 2017/09/27\n",
"166 record-81 None M None 2011/02/06 2011/02/08\n",
"167 record-82 None N/A None 2015/03/16 2015/03/19\n",
"168 record-83 None F 1930/12/19 2013/11/04 2013/11/16\n",
"169 record-84 None M 1959/12/09 2014/10/14 2014/10/17\n",
"\n",
"[170 rows x 6 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# dateparser is imported in this cell, not the first one, because the %pip\n",
"# cell above is what installs it.\n",
"from dateparser import parse\n",
"\n",
"RESULT_COLUMNS = [\n",
"    \"filename\",\n",
"    \"age\",\n",
"    \"sexe\",\n",
"    \"birthdate\",\n",
"    \"admission_date\",\n",
"    \"discharge_date\",\n",
"]\n",
"DATE_FORMAT = \"%Y/%m/%d\"\n",
"\n",
"\n",
"def _value_after(lines, header):\n",
"    \"\"\"Return the line immediately following `header`, or None when the\n",
"    header is missing or is the last line.\"\"\"\n",
"    try:\n",
"        position = lines.index(header)\n",
"    except ValueError:\n",
"        return None\n",
"    # Slicing never raises, so a header on the very last line yields None.\n",
"    following = lines[position + 1 : position + 2]\n",
"    return following[0] if following else None\n",
"\n",
"\n",
"def _extract_age(lines):\n",
"    \"\"\"Return the leading digits of the line after \"age :\" as a string, or None.\n",
"\n",
"    The whole leading run of digits is kept; reading only the first character\n",
"    of the line would truncate an age such as \"57\" to \"5\".\n",
"    \"\"\"\n",
"    value = _value_after(lines, \"age :\")\n",
"    if value is None:\n",
"        return None\n",
"    digits = \"\"\n",
"    for char in value:\n",
"        if not char.isdigit():\n",
"            break\n",
"        digits += char\n",
"    return digits or None\n",
"\n",
"\n",
"def _extract_sex(lines):\n",
"    \"\"\"Return \"M\" or \"F\" from the first letter after \"sex :\", else \"N/A\".\"\"\"\n",
"    value = _value_after(lines, \"sex :\")\n",
"    if value is None:\n",
"        return \"N/A\"\n",
"    return {\"m\": \"M\", \"f\": \"F\"}.get(value[:1], \"N/A\")\n",
"\n",
"\n",
"def _extract_date(lines, header):\n",
"    \"\"\"Return the date found after `header` formatted as YYYY/MM/DD, or None.\n",
"\n",
"    Only values that start with a digit are handed to dateparser.\n",
"    \"\"\"\n",
"    value = _value_after(lines, header)\n",
"    if value is None or not value[:1].isdigit():\n",
"        return None\n",
"    try:\n",
"        parsed = parse(value)\n",
"        # parse() gives None when it cannot recognise a date.\n",
"        return None if parsed is None else parsed.strftime(DATE_FORMAT)\n",
"    except Exception:\n",
"        # Extraction is best effort: a date that cannot be parsed or formatted\n",
"        # becomes a missing value instead of aborting the whole run.\n",
"        return None\n",
"\n",
"\n",
"def _extract_record(path):\n",
"    \"\"\"Read one text file and return its metadata as a dict keyed by RESULT_COLUMNS.\"\"\"\n",
"    # Explicit encoding so the result does not depend on the platform default;\n",
"    # undecodable bytes are replaced rather than failing the whole file.\n",
"    with open(path, \"r\", encoding=\"utf-8\", errors=\"replace\") as handle:\n",
"        # Strip, drop blank lines and lower-case so the headers match exactly.\n",
"        lines = [line.strip().lower() for line in handle if line.strip()]\n",
"    return {\n",
"        \"filename\": os.path.splitext(os.path.basename(path))[0],\n",
"        \"age\": _extract_age(lines),\n",
"        \"sexe\": _extract_sex(lines),\n",
"        \"birthdate\": _extract_date(lines, \"date of birth :\"),\n",
"        \"admission_date\": _extract_date(lines, \"admission date :\"),\n",
"        \"discharge_date\": _extract_date(lines, \"discharge date :\"),\n",
"    }\n",
"\n",
"\n",
"# Build the frame once from a list of per-file records; `columns` keeps the\n",
"# schema even when no text file was found.\n",
"results = pd.DataFrame(\n",
"    [_extract_record(path) for path in tqdm(text_files)],\n",
"    columns=RESULT_COLUMNS,\n",
")\n",
"\n",
"# Count of missing values per column (\"N/A\" in sexe is a string, not a null).\n",
"print(results.isna().sum())\n",
"\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Write one <filename>.json file per row of `results` into the training\n",
"# split's metadata folder.\n",
"metadata_dir = os.path.join(train_data_path, \"metadata\")\n",
"os.makedirs(metadata_dir, exist_ok=True)\n",
"\n",
"# Clear what a previous run left behind so records that no longer exist do\n",
"# not linger. Sub-directories are skipped: os.remove cannot delete them.\n",
"for stale in glob.glob(os.path.join(metadata_dir, \"*\")):\n",
"    if not os.path.isdir(stale):\n",
"        os.remove(stale)\n",
"\n",
"# iterrows yields each row as a Series, independent of the index labels.\n",
"for _, record in results.iterrows():\n",
"    record.to_json(os.path.join(metadata_dir, f\"{record['filename']}.json\"))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "9915d85b8f08013a59ff2018edcb7a2314c147afd4a478815763a832134e3444"
},
"kernelspec": {
"display_name": "Python 3.9.10 64-bit (windows store)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}