[7620a5]: / clinicalTrial-xml_to_DataFrame.ipynb

Download this file

215 lines (214 with data), 7.5 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import json\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import xml.etree.ElementTree as et\n",
    "import io\n",
    "import glob\n",
    "from tqdm._tqdm_notebook import tqdm_notebook as tqdm\n",
    "from tqdm import tqdm\n",
    "\n",
    "#tqdm_notebook.pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 61781/61781 [23:36<00:00, 43.62it/s]\n"
     ]
    }
   ],
   "source": [
    "filePath = glob.glob(\"./new_data/combined_xml/*.xml\")\n",
    "col_names = ['nct_id', 'brief_title', 'official_title', 'overall_status', \n",
    "         'start_date','completion_date', 'phase', 'study_type', \n",
    "             'brief_summary', 'detailed_description', 'enrollment',\n",
    "             'condition', 'intervention_name', 'eligibility']\n",
    "\n",
    "name1 = ['nct_id','brief_title', 'official_title', 'overall_status', \n",
    "         'start_date','completion_date', 'phase', 'study_type']\n",
    "\n",
    "name2 = ['brief_summary', 'detailed_description']\n",
    "\n",
    "name3 = ['enrollment', 'condition']\n",
    "\n",
    "df = pd.DataFrame(columns = col_names)\n",
    "bad_file = []\n",
    "\n",
    "for files in tqdm(filePath):\n",
    "    try:\n",
    "        with open(files, 'r', encoding=\"utf-8\") as content:\n",
    "            tree = et.parse(content)\n",
    "            root = tree.getroot()\n",
    "\n",
    "            row = {name: ''for name in col_names} \n",
    "\n",
    "            for d in tree.iter():\n",
    "                for txt in d.iter():\n",
    "                    if d.tag in name1:\n",
    "                        row[d.tag] = d.text\n",
    "                    if d.tag in name2:\n",
    "                        row[d.tag] = d.getchildren()[0].text\n",
    "                    if d.tag in name3:\n",
    "                        row[d.tag] = txt.text\n",
    "                    if d.tag == \"intervention_name\":\n",
    "                        intSearch = root.find('intervention_browse')\n",
    "                        if intSearch:\n",
    "                            row[d.tag]= [i.text for i in intSearch.findall('mesh_term')]\n",
    "                        else:\n",
    "                            row[d.tag] = [i.text for i in root.iter('intervention_name')]\n",
    "                    if d.tag =='eligibility':\n",
    "                        row[d.tag] = root.find('eligibility').find('criteria').find('textblock').text\n",
    "           \n",
    "        df = df.append(pd.Series(row, index = col_names), ignore_index = True)\n",
    "    except:\n",
    "        bad_file.append(files)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-1-0a7eb4ab86eb>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'intervention_name'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'intervention_name'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m', '\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'df' is not defined"
     ]
    }
   ],
   "source": [
    "print(len(df))\n",
    "df['intervention_name'] = df['intervention_name'].apply(', '.join)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('cancerTrials.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./new_data/combined_xml\\\\NCT00039754.xml',\n",
       " './new_data/combined_xml\\\\NCT00076531.xml',\n",
       " './new_data/combined_xml\\\\NCT00275444.xml',\n",
       " './new_data/combined_xml\\\\NCT00442754.xml']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bad_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Single file processing\n",
    "# xml_file = \"./new_data/combined_xml/NCT01473303.xml\"\n",
    "# tree = et.parse(xml_file)\n",
    "# root = tree.getroot()\n",
    "# col_names = ['nct_id', 'brief_title', 'official_title', 'overall_status', \n",
    "#          'start_date','completion_date', 'phase', 'study_type', \n",
    "#              'brief_summary', 'detailed_description', \n",
    "#              'intervention','condition', 'enrollment']\n",
    "\n",
    "# name1 = ['nct_id','brief_title', 'official_title', 'overall_status', \n",
    "#          'start_date','completion_date', 'phase', 'study_type']\n",
    "\n",
    "# name2 = ['brief_summary', 'detailed_description']\n",
    "             \n",
    "# name3 = ['intervention','condition', 'enrollment']\n",
    "\n",
    "# df = pd.DataFrame(columns = col_names)\n",
    "# row = {name: ''for name in col_names} \n",
    "# for d in tree.iter():\n",
    "#     for txt in d.iter():\n",
    "#         if d.tag in name1:\n",
    "#             row[d.tag] = d.text\n",
    "#         if d.tag in name2:\n",
    "#             row[d.tag] = d.getchildren()[0].text\n",
    "#         if d.tag in name3:\n",
    "#             row[d.tag] = txt.text\n",
    "# df = df.append(pd.Series(row, index = col_names), ignore_index = True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#df.tail()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}