NER-medical-text / Git / Diff of /notebooks/00_data

Models:
philipB/
NER-medical-text
Downloads: 1
Diff of /notebooks/00_data_download.ipynb [000000] .. [c0f169]
Switch to side-by-side view

--- a
+++ b/notebooks/00_data_download.ipynb
@@ -0,0 +1,162 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "308eca10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "586edd82",
+   "metadata": {},
+   "source": [
+    "## Downloading MACCROBAT_DATA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3ac872ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MACCROBAT_DATA_URL = \"https://figshare.com/ndownloader/files/17493650\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "03dca362",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "def download_zip_from_url(url, save_path, chunk_size=128):\n",
+    "    r = requests.get(url, stream=True)\n",
+    "    with open(save_path, 'wb') as fd:\n",
+    "        for chunk in r.iter_content(chunk_size=chunk_size):\n",
+    "            fd.write(chunk)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "f8dd4c16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_zipfile = \"../data/MACCROBAT2018.zip\"\n",
+    "MACCROBAT_data_dir = \"../data/MACCROBAT\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "29bfcbd2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Directory '../data/MACCROBAT' created.\n"
+     ]
+    }
+   ],
+   "source": [
+    "if not os.path.exists(MACCROBAT_data_dir):\n",
+    "    os.makedirs(MACCROBAT_data_dir)\n",
+    "    print(f\"Directory '{MACCROBAT_data_dir}' created.\")\n",
+    "else:\n",
+    "    print(f\"Directory '{MACCROBAT_data_dir}' already exists.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "d75e5ad8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "download_zip_from_url(MACCROBAT_DATA_URL, data_zipfile)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "7dcfb4b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import zipfile\n",
+    "with zipfile.ZipFile(data_zipfile, 'r') as zip_ref:\n",
+    "    zip_ref.extractall(MACCROBAT_data_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "880dd367",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.remove(data_zipfile)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "5baefc11",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of txt files corresponding with ann files is 200\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "print(f\"Number of txt files corresponding with ann files is\\\n",
+    " {len([file for file in os.listdir(MACCROBAT_data_dir) if file.endswith('.txt') and file.split('.')[0]+ '.ann' in os.listdir(MACCROBAT_data_dir)])}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6a58508",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}