215 lines (214 with data), 7.5 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import json\n",
"import sys\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import xml.etree.ElementTree as et\n",
"import io\n",
"import glob\n",
"from tqdm._tqdm_notebook import tqdm_notebook as tqdm\n",
"from tqdm import tqdm\n",
"\n",
"#tqdm_notebook.pandas()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 61781/61781 [23:36<00:00, 43.62it/s]\n"
]
}
],
"source": [
"filePath = glob.glob(\"./new_data/combined_xml/*.xml\")\n",
"col_names = ['nct_id', 'brief_title', 'official_title', 'overall_status',\n",
"             'start_date', 'completion_date', 'phase', 'study_type',\n",
"             'brief_summary', 'detailed_description', 'enrollment',\n",
"             'condition', 'intervention_name', 'eligibility']\n",
"\n",
"# Tags whose own text is stored directly.\n",
"name1 = ['nct_id', 'brief_title', 'official_title', 'overall_status',\n",
"         'start_date', 'completion_date', 'phase', 'study_type']\n",
"\n",
"# Tags whose value is the text of their first child element.\n",
"name2 = ['brief_summary', 'detailed_description']\n",
"\n",
"# Tags whose value is the text of the last element in their subtree\n",
"# (the element itself when it has no children).\n",
"name3 = ['enrollment', 'condition']\n",
"\n",
"\n",
"def parse_trial(path):\n",
"    \"\"\"Extract one row of trial fields from a single XML file.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    path : str\n",
"        Path of the XML file; it is opened as UTF-8 text.\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        One entry per name in ``col_names``. Fields absent from the file stay ''.\n",
"        When a tag occurs more than once, the last occurrence in document order\n",
"        wins. ``intervention_name`` is a list: the ``mesh_term`` texts of a\n",
"        non-empty ``intervention_browse`` child of the root when there is one,\n",
"        otherwise the text of every ``intervention_name`` element.\n",
"\n",
"    Raises\n",
"    ------\n",
"    Exception\n",
"        Whatever opening or parsing the file raises, IndexError when a\n",
"        ``name2`` element has no children, and AttributeError when\n",
"        ``eligibility/criteria/textblock`` is missing under the root.\n",
"    \"\"\"\n",
"    with open(path, 'r', encoding=\"utf-8\") as content:\n",
"        tree = et.parse(content)\n",
"    root = tree.getroot()\n",
"\n",
"    row = {name: '' for name in col_names}\n",
"\n",
"    # Single pass over the tree; the previous nested loop only repeated the\n",
"    # same assignments once per descendant of each element.\n",
"    for d in tree.iter():\n",
"        tag = d.tag\n",
"        if tag in name1:\n",
"            row[tag] = d.text\n",
"        elif tag in name2:\n",
"            # d[0] replaces Element.getchildren(), removed in Python 3.9.\n",
"            row[tag] = d[0].text\n",
"        elif tag in name3:\n",
"            row[tag] = list(d.iter())[-1].text\n",
"        elif tag == \"intervention_name\":\n",
"            intSearch = root.find('intervention_browse')\n",
"            # Explicit None/length test: truth-testing an Element is deprecated.\n",
"            if intSearch is not None and len(intSearch) > 0:\n",
"                row[tag] = [i.text for i in intSearch.findall('mesh_term')]\n",
"            else:\n",
"                row[tag] = [i.text for i in root.iter('intervention_name')]\n",
"        elif tag == 'eligibility':\n",
"            row[tag] = root.find('eligibility').find('criteria').find('textblock').text\n",
"    return row\n",
"\n",
"\n",
"rows = []             # one dict per successfully parsed file\n",
"bad_file = []         # paths of files that raised while being processed\n",
"bad_file_errors = {}  # path -> repr of the exception, to make failures diagnosable\n",
"\n",
"for files in tqdm(filePath):\n",
"    try:\n",
"        rows.append(parse_trial(files))\n",
"    except Exception as exc:\n",
"        # Exception rather than a bare except, so that KeyboardInterrupt still\n",
"        # stops this long-running loop instead of being logged as a bad file.\n",
"        bad_file.append(files)\n",
"        bad_file_errors[files] = repr(exc)\n",
"\n",
"# Build the frame once: DataFrame.append in a loop copies the whole frame on\n",
"# every iteration and no longer exists in pandas 2.0.\n",
"df = pd.DataFrame(rows, columns=col_names)\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'df' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-1-0a7eb4ab86eb>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'intervention_name'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'intervention_name'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m', '\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'df' is not defined"
]
}
],
"source": [
"def join_names(value):\n",
"    \"\"\"Collapse a list/tuple of names into one ', '-separated string.\n",
"\n",
"    Entries that are None are skipped. Any other value (for example a string\n",
"    that was already joined) is returned unchanged, so re-running this cell\n",
"    does not re-join the characters of an existing string.\n",
"    \"\"\"\n",
"    if isinstance(value, (list, tuple)):\n",
"        return ', '.join(name for name in value if name is not None)\n",
"    return value\n",
"\n",
"\n",
"print(len(df))\n",
"df['intervention_name'] = df['intervention_name'].apply(join_names)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Write the trials table to CSV, leaving out the row index.\n",
"df.to_csv(path_or_buf='cancerTrials.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['./new_data/combined_xml\\\\NCT00039754.xml',\n",
" './new_data/combined_xml\\\\NCT00076531.xml',\n",
" './new_data/combined_xml\\\\NCT00275444.xml',\n",
" './new_data/combined_xml\\\\NCT00442754.xml']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bad_file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # Single file processing\n",
"# xml_file = \"./new_data/combined_xml/NCT01473303.xml\"\n",
"# tree = et.parse(xml_file)\n",
"# root = tree.getroot()\n",
"# col_names = ['nct_id', 'brief_title', 'official_title', 'overall_status', \n",
"# 'start_date','completion_date', 'phase', 'study_type', \n",
"# 'brief_summary', 'detailed_description', \n",
"# 'intervention','condition', 'enrollment']\n",
"\n",
"# name1 = ['nct_id','brief_title', 'official_title', 'overall_status', \n",
"# 'start_date','completion_date', 'phase', 'study_type']\n",
"\n",
"# name2 = ['brief_summary', 'detailed_description']\n",
" \n",
"# name3 = ['intervention','condition', 'enrollment']\n",
"\n",
"# df = pd.DataFrame(columns = col_names)\n",
"# row = {name: ''for name in col_names} \n",
"# for d in tree.iter():\n",
"# for txt in d.iter():\n",
"# if d.tag in name1:\n",
"# row[d.tag] = d.text\n",
"# if d.tag in name2:\n",
"# row[d.tag] = d.getchildren()[0].text\n",
"# if d.tag in name3:\n",
"# row[d.tag] = txt.text\n",
"# df = df.append(pd.Series(row, index = col_names), ignore_index = True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#df.tail()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}