--- a +++ b/notebooks/00_data_download.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "id": "308eca10", + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "markdown", + "id": "586edd82", + "metadata": {}, + "source": [ + "## Downloading MACCROBAT_DATA" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3ac872ca", + "metadata": {}, + "outputs": [], + "source": [ + "MACCROBAT_DATA_URL = \"https://figshare.com/ndownloader/files/17493650\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "03dca362", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "def download_zip_from_url(url, save_path, chunk_size=128):\n", + " r = requests.get(url, stream=True)\n", + " with open(save_path, 'wb') as fd:\n", + " for chunk in r.iter_content(chunk_size=chunk_size):\n", + " fd.write(chunk)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f8dd4c16", + "metadata": {}, + "outputs": [], + "source": [ + "data_zipfile = \"../data/MACCROBAT2018.zip\"\n", + "MACCROBAT_data_dir = \"../data/MACCROBAT\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "29bfcbd2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Directory '../data/MACCROBAT' created.\n" + ] + } + ], + "source": [ + "if not os.path.exists(MACCROBAT_data_dir):\n", + " os.makedirs(MACCROBAT_data_dir)\n", + " print(f\"Directory '{MACCROBAT_data_dir}' created.\")\n", + "else:\n", + " print(f\"Directory '{MACCROBAT_data_dir}' already exists.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d75e5ad8", + "metadata": {}, + "outputs": [], + "source": [ + "download_zip_from_url(MACCROBAT_DATA_URL, data_zipfile)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7dcfb4b5", + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "with zipfile.ZipFile(data_zipfile, 'r') as zip_ref:\n", + " zip_ref.extractall(MACCROBAT_data_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "880dd367", + "metadata": {}, + "outputs": [], + "source": [ + "os.remove(data_zipfile)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "5baefc11", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of txt files corresponding with ann files is 200\n" + ] + } + ], + "source": [ + "\n", + "print(f\"Number of txt files corresponding with ann files is\\\n", + " {len([file for file in os.listdir(MACCROBAT_data_dir) if file.endswith('.txt') and file.split('.')[0]+ '.ann' in os.listdir(MACCROBAT_data_dir)])}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6a58508", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}