medical_txt_parser / Git / [d69072] /src/notebooks/parse

Models:
philipB/
medical_txt_parser
Downloads: 1
[d69072]: / src / notebooks / parse_metadata.ipynb
History
Download this file
389 lines (388 with data), 13.1 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import glob\n",
    "import os\n",
    "\n",
    "path = %pwd\n",
    "while \"src\" in path:\n",
    "    %cd ..\n",
    "    path = %pwd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data_path = \"data/train\"\n",
    "val_data_path = \"data/val\"\n",
    "processed_data_path = \"data/processed\"\n",
    "ast_folder_name = \"ast\"\n",
    "concept_folder_name = \"concept\"\n",
    "rel_folder_name = \"rel\"\n",
    "txt_folder_name = \"txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_files = glob.glob(train_data_path + os.sep + txt_folder_name + os.sep +  \"*.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting dateparser\n",
      "  Using cached dateparser-1.1.0-py2.py3-none-any.whl (288 kB)\n",
      "Collecting tzlocal\n",
      "  Using cached tzlocal-4.1-py3-none-any.whl (19 kB)\n",
      "Requirement already satisfied: python-dateutil in /home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages (from dateparser) (2.8.2)\n",
      "Requirement already satisfied: pytz in /home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages (from dateparser) (2021.3)\n",
      "Requirement already satisfied: regex!=2019.02.19,!=2021.8.27 in /home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages (from dateparser) (2021.11.10)\n",
      "Requirement already satisfied: six>=1.5 in /home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages (from python-dateutil->dateparser) (1.16.0)\n",
      "Collecting pytz-deprecation-shim\n",
      "  Using cached pytz_deprecation_shim-0.1.0.post0-py2.py3-none-any.whl (15 kB)\n",
      "Collecting tzdata\n",
      "  Using cached tzdata-2021.5-py2.py3-none-any.whl (339 kB)\n",
      "Installing collected packages: tzdata, pytz-deprecation-shim, tzlocal, dateparser\n",
      "Successfully installed dateparser-1.1.0 pytz-deprecation-shim-0.1.0.post0 tzdata-2021.5 tzlocal-4.1\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install dateparser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/170 [00:00<?, ?it/s]/home/mus5900/anaconda3/envs/nlp/lib/python3.9/site-packages/dateparser/date_parser.py:35: PytzUsageWarning: The localize method is no longer necessary, as this time zone supports the fold attribute (PEP 495). For more details on migrating to a PEP 495-compliant implementation, see https://pytz-deprecation-shim.readthedocs.io/en/latest/migration.html\n",
      "  date_obj = stz.localize(date_obj)\n",
      "100%|██████████| 170/170 [00:17<00:00,  9.72it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filename            0\n",
      "age               137\n",
      "sexe                0\n",
      "birthdate         102\n",
      "admission_date     41\n",
      "discharge_date     44\n",
      "dtype: int64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "      <th>age</th>\n",
       "      <th>sexe</th>\n",
       "      <th>birthdate</th>\n",
       "      <th>admission_date</th>\n",
       "      <th>discharge_date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>018636330_DH</td>\n",
       "      <td>None</td>\n",
       "      <td>N/A</td>\n",
       "      <td>None</td>\n",
       "      <td>2005/06/02</td>\n",
       "      <td>2005/06/05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>026350193_RWH</td>\n",
       "      <td>5</td>\n",
       "      <td>N/A</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>037945397_RWH</td>\n",
       "      <td>7</td>\n",
       "      <td>N/A</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>044687343_ELMVH</td>\n",
       "      <td>None</td>\n",
       "      <td>N/A</td>\n",
       "      <td>None</td>\n",
       "      <td>2006/03/13</td>\n",
       "      <td>2006/03/19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>060376519_DH</td>\n",
       "      <td>4</td>\n",
       "      <td>N/A</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>165</th>\n",
       "      <td>record-80</td>\n",
       "      <td>None</td>\n",
       "      <td>M</td>\n",
       "      <td>2017/09/22</td>\n",
       "      <td>2017/09/22</td>\n",
       "      <td>2017/09/27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166</th>\n",
       "      <td>record-81</td>\n",
       "      <td>None</td>\n",
       "      <td>M</td>\n",
       "      <td>None</td>\n",
       "      <td>2011/02/06</td>\n",
       "      <td>2011/02/08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>167</th>\n",
       "      <td>record-82</td>\n",
       "      <td>None</td>\n",
       "      <td>N/A</td>\n",
       "      <td>None</td>\n",
       "      <td>2015/03/16</td>\n",
       "      <td>2015/03/19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>168</th>\n",
       "      <td>record-83</td>\n",
       "      <td>None</td>\n",
       "      <td>F</td>\n",
       "      <td>1930/12/19</td>\n",
       "      <td>2013/11/04</td>\n",
       "      <td>2013/11/16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>169</th>\n",
       "      <td>record-84</td>\n",
       "      <td>None</td>\n",
       "      <td>M</td>\n",
       "      <td>1959/12/09</td>\n",
       "      <td>2014/10/14</td>\n",
       "      <td>2014/10/17</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>170 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            filename   age sexe   birthdate admission_date discharge_date\n",
       "0       018636330_DH  None  N/A        None     2005/06/02     2005/06/05\n",
       "1      026350193_RWH     5  N/A        None           None           None\n",
       "2      037945397_RWH     7  N/A        None           None           None\n",
       "3    044687343_ELMVH  None  N/A        None     2006/03/13     2006/03/19\n",
       "4       060376519_DH     4  N/A        None           None           None\n",
       "..               ...   ...  ...         ...            ...            ...\n",
       "165        record-80  None    M  2017/09/22     2017/09/22     2017/09/27\n",
       "166        record-81  None    M        None     2011/02/06     2011/02/08\n",
       "167        record-82  None  N/A        None     2015/03/16     2015/03/19\n",
       "168        record-83  None    F  1930/12/19     2013/11/04     2013/11/16\n",
       "169        record-84  None    M  1959/12/09     2014/10/14     2014/10/17\n",
       "\n",
       "[170 rows x 6 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dateparser import parse\n",
    "\n",
    "results = pd.DataFrame()\n",
    "Patient_id = [] \n",
    "Age = []\n",
    "Sexe = []\n",
    "Birthdate = []\n",
    "Admission_date = []\n",
    "Discharge_date = []\n",
    "for file in tqdm(text_files):\n",
    "    with open(file, \"r\") as f:\n",
    "        lines = f.readlines()\n",
    "        # remove endlines\n",
    "        lines = [line.strip() for line in lines]\n",
    "        # remove empty lines\n",
    "        lines = [line.lower() for line in lines if line.strip()]\n",
    "\n",
    "        try :\n",
    "            age = lines[lines.index(\"age :\")+ 1][0]\n",
    "            if age[0].isdigit():\n",
    "                # keep only digits\n",
    "                age = \"\".join([c for c in age if c.isdigit()])\n",
    "                Age.append(age)\n",
    "            else:\n",
    "                Age.append(None)\n",
    "        except :\n",
    "            Age += [None]\n",
    "        \n",
    "        try :\n",
    "            sx = lines[lines.index(\"sex :\")+ 1]\n",
    "            if sx[0] in ['m', 'f']:\n",
    "                Sexe.append('M' if sx[0] == \"m\" else 'F')\n",
    "            else:\n",
    "                Sexe.append(\"N/A\")    \n",
    "        except :\n",
    "            Sexe += [\"N/A\"]\n",
    "        \n",
    "        \n",
    "        try :\n",
    "            if lines[lines.index(\"date of birth :\")+ 1][0].isdigit():\n",
    "                date= parse(lines[lines.index(\"date of birth :\")+ 1]).strftime(\"%Y/%m/%d\")\n",
    "                Birthdate.append(date)\n",
    "            else:\n",
    "                Birthdate.append(None)\n",
    "        except :\n",
    "            Birthdate.append(None)\n",
    "        \n",
    "        try :\n",
    "            if lines[lines.index(\"admission date :\")+ 1][0].isdigit():\n",
    "                date = parse(lines[lines.index(\"admission date :\")+ 1]).strftime(\"%Y/%m/%d\")\n",
    "                Admission_date.append(date)\n",
    "            else:\n",
    "                Admission_date.append(None)\n",
    "        except :\n",
    "            Admission_date.append(None)\n",
    "   \n",
    "        \n",
    "        try :\n",
    "            if lines[lines.index(\"discharge date :\")+ 1][0].isdigit():\n",
    "                date = parse(lines[lines.index(\"discharge date :\")+ 1]).strftime(\"%Y/%m/%d\")\n",
    "                Discharge_date.append(date)\n",
    "            else:\n",
    "                Discharge_date.append(None)\n",
    "        except :\n",
    "            Discharge_date.append(None)\n",
    "\n",
    "results[\"filename\"] = [f.split(os.sep)[-1][:-4] for f in text_files]\n",
    "results[\"age\"] = Age\n",
    "results[\"sexe\"] = Sexe\n",
    "results[\"birthdate\"] = Birthdate\n",
    "results[\"admission_date\"] = Admission_date\n",
    "results[\"discharge_date\"] = Discharge_date\n",
    "\n",
    "print(results.isna().sum())\n",
    "\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# results\n",
    "# save as jsonl\n",
    "os.makedirs(\"data/train/metadata\", exist_ok=True)\n",
    "files = glob.glob(\"data/train/metadata/*\")\n",
    "for file in files:\n",
    "    os.remove(file)\n",
    "    \n",
    "for i in range(len(results)):\n",
    "    filename = results.loc[i, \"filename\"]\n",
    "    results.iloc[i].to_json(f\"data/train/metadata/{filename}.json\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "9915d85b8f08013a59ff2018edcb7a2314c147afd4a478815763a832134e3444"
  },
  "kernelspec": {
   "display_name": "Python 3.9.10 64-bit (windows store)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}