--- a +++ b/scripts/Data-processing.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4c9d40b1", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import cv2\n", + "import matplotlib.pyplot as plt\n", + "import os.path, sys, re\n", + "import time\n", + "from PIL import Image" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "92e8df2f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/moise/Desktop/Data_Science/Erdos_Institute/ecg-proj/ecg-copy\n" + ] + } + ], + "source": [ + "cd ~/Desktop/Data_Science/Erdos_Institute/ecg-proj/ecg-copy/" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c5a0e302", + "metadata": {}, + "outputs": [], + "source": [ + "pathroot = \"data_v1/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3c9b391a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Note:\n", + "-----\n", + "\n", + "1)The value for the key 'ECGImagesofMyocardialInfarctionPatients' is a list of dictionaries \n", + "corresponding to the cases\n", + " i) MI <=37\n", + " ii) MI >=38\n", + "2) The value of the key 'ECGImagesofCOVID-19Patients' is broken down as follows:\n", + " i) Binder1_Page file name\n", + " ii) COVID file name\n", + "3) The value of the key 'ECGImagesofPatientthathaveHistoryofMI' corresponds to:\n", + " i) PMI<113\n", + " ii) 113<=PMI<=161 --- ATTENTION: only 12 Leads in the case!\n", + " iii) PMI>=162\n", + "\"\"\"\n", + "superDict = {'NormalPersonECGImages': { 'Lead1':(130,300,640,600), 'Lead2':(641,300,1125,600),'Lead3':(1130,300,1625,600), 'Lead4':(1630,300,2120,600),\\\n", + " 'Lead5':(130,600,640,900), 'Lead6':(641,600,1125,900),'Lead7':(1130,600,1625,900), 'Lead8':(1630,600,2120,900),\\\n", + " 'Lead9':(130,900,640,1200), 'Lead10':(641,900,1125,1200),'Lead11':(1130,900,1625,1200), 'Lead12':(1630,900,2120,1200),\\\n", + " 'Lead13':(130,1205,2120,1450) \n", + " },\n", + " 'ECGImagesofPatientthathaveabnormalheartbeats': { 'Lead1':(130,300,640,600), 'Lead2':(641,300,1125,600),'Lead3':(1130,300,1625,600), 'Lead4':(1630,300,2110,600),\\\n", + " 'Lead5':(130,600,640,900), 'Lead6':(641,600,1125,900),'Lead7':(1130,600,1625,900), 'Lead8':(1630,600,2110,900),\\\n", + " 'Lead9':(130,900,640,1200), 'Lead10':(641,900,1125,1200),'Lead11':(1130,900,1625,1200), 'Lead12':(1630,900,2110,1200),\\\n", + " 'Lead13':(130,1205,2110,1450) \n", + " },\n", + " \n", + " 'ECGImagesofMyocardialInfarctionPatients': [{ 'Lead1':(125,310,625,600), 'Lead2':(640,310,1125,600),'Lead3':(1140,310,1625,600), 'Lead4':(1640,310,2125,600),\\\n", + " 'Lead5':(125,605,625,900), 'Lead6':(640,605,1125,900),'Lead7':(1140,605,1625,900), 'Lead8':(1640,605,2125,900),\\\n", + " 'Lead9':(125,905,625,1200), 'Lead10':(640,905,1125,1200),'Lead11':(1140,905,1625,1200), 'Lead12':(1640,905,2125,1200),\\\n", + " 'Lead13':(125,1205,2125,1495)},\\\n", + " { 'Lead1':(125,300,1125,500), 'Lead2':(1125,300,2125,500),\\\n", + " 'Lead3':(125,500,1125,680), 'Lead4':(1125,500,2125,680),\\\n", + " 'Lead5':(125,680,1125,845), 'Lead6':(1125,680,2125,845),\\\n", + " 'Lead7':(125,845,1125,1000), 'Lead8':(1125,845,2125,1000),\\\n", + " 'Lead9':(125,1000,1125,1150), 'Lead10':(1125,1000,2125,1150),\\\n", + " 'Lead11':(125,1150,1125,1300), 'Lead12':(1125,1150,2125,1300),\\\n", + " 'Lead13':(125,1300,2125,1490)} ],\n", + " \n", + " 'ECGImagesofCOVID-19Patients': [{ 'Lead1':(100,110,300,240), 'Lead2':(300,110,500,240),'Lead3':(500,110,700,240), 'Lead4':(700,110,900,240),\\\n", + " 'Lead5':(100,241,300,350), 'Lead6':(300,241,500,350),'Lead7':(500,241,700,350), 'Lead8':(700,241,900,350),\\\n", + " 'Lead9':(100,351,300,450), 'Lead10':(300,351,500,450),'Lead11':(500,351,700,450), 'Lead12':(700,351,900,450),\\\n", + " 'Lead13':(100,451,900,545)},\\\n", + " { 'Lead1':(125,310,625,600), 'Lead2':(640,310,1125,600),'Lead3':(1140,310,1625,600), 'Lead4':(1640,310,2125,600),\\\n", + " 'Lead5':(125,605,625,900), 'Lead6':(640,605,1125,900),'Lead7':(1140,605,1625,900), 'Lead8':(1640,605,2125,900),\\\n", + " 'Lead9':(125,905,625,1200), 'Lead10':(640,905,1125,1200),'Lead11':(1140,905,1625,1200), 'Lead12':(1640,905,2125,1200),\\\n", + " 'Lead13':(125,1205,2125,1495)} ],\n", + " \n", + " 'ECGImagesofPatientthathaveHistoryofMI': [{ 'Lead1':(125,310,625,600), 'Lead2':(640,310,1125,600),'Lead3':(1140,310,1625,600), 'Lead4':(1640,310,2125,600),\\\n", + " 'Lead5':(125,605,625,900), 'Lead6':(640,605,1125,900),'Lead7':(1140,605,1625,900), 'Lead8':(1640,605,2125,900),\\\n", + " 'Lead9':(125,905,625,1200), 'Lead10':(640,905,1125,1200),'Lead11':(1140,905,1625,1200), 'Lead12':(1640,905,2125,1200),\\\n", + " 'Lead13':(125,1205,2125,1495)},\\\n", + " { 'Lead1':(125,410,1125,590), 'Lead2':(1125,410,2125,590),\\\n", + " 'Lead3':(125,600,1125,815), 'Lead4':(1125,600,2125,815),\\\n", + " 'Lead5':(125,815,1125,1035), 'Lead6':(1125,815,2125,1035),\\\n", + " 'Lead7':(125,1035,1125,1235), 'Lead8':(1125,1035,2125,1235),\\\n", + " 'Lead9':(125,1235,1125,1350), 'Lead10':(1125,1235,2125,1350),\\\n", + " 'Lead11':(125,1350,1125,1490), 'Lead12':(1125,1350,2125,1490),\\\n", + " },\\\n", + " { 'Lead1':(125,290,1125,490), 'Lead2':(1125,290,2125,490),\\\n", + " 'Lead3':(125,490,1125,650), 'Lead4':(1125,490,2125,650),\\\n", + " 'Lead5':(125,650,1125,810), 'Lead6':(1125,650,2125,810),\\\n", + " 'Lead7':(125,810,1125,1000), 'Lead8':(1125,810,2125,1000),\\\n", + " 'Lead9':(125,1000,1125,1190), 'Lead10':(1125,1000,2125,1190),\\\n", + " 'Lead11':(125,1190,1125,1315), 'Lead12':(1125,1190,2125,1315),\\\n", + " 'Lead13':(125,1315,2125,1550)\n", + " }] \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "310cafaf", + "metadata": {}, + "outputs": [], + "source": [ + "dirList=['NormalPersonECGImages','ECGImagesofPatientthathaveabnormalheartbeats',\n", + " 'ECGImagesofMyocardialInfarctionPatients','ECGImagesofCOVID-19Patients',\n", + " 'ECGImagesofPatientthathaveHistoryofMI']" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "76c9879b", + "metadata": {}, + "outputs": [], + "source": [ + "def processor(dirs,path2,im,of,crop_dict,numLeads=13,threshold_level=50):\n", + " for i in range(1,numLeads+1):\n", + " imCrop = im.crop(box=crop_dict['Lead'+str(i)])\n", + " target = of+'-Cropped_lead'+str(i)\n", + " imCrop.save(os.path.join(path2,target+'.png'), \"PNG\", quality=100)\n", + " img = cv2.imread(os.path.join(path2,target+'.png'))\n", + " gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n", + " coords = np.column_stack(np.where(gray < threshold_level))\n", + " coords[:,[0,1]]=coords[:,[1,0]]\n", + " csvFolder = os.path.join(targetFolder,dirs)\n", + " if not os.path.exists(csvFolder):\n", + " os.makedirs(csvFolder)\n", + " np.savetxt(os.path.join(csvFolder,target+'.csv'),coords) " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "15742532", + "metadata": {}, + "outputs": [], + "source": [ + "def crop2csv(dirs,path,path2,dirList,superDict,pattern='.jpg'):\n", + " count = 0\n", + " for item in os.listdir(path):\n", + " if os.path.isfile(os.path.join(path,item)):\n", + " of, oe = os.path.splitext(item)\n", + " if oe == pattern:\n", + " im = Image.open(os.path.join(path,item))\n", + " if dirs == dirList[2]:\n", + " if int(of[3:]) <= 37:\n", + " processor(dirs,path2,im,of,superDict[dirs][0])\n", + " else:\n", + " processor(dirs,path2,im,of,superDict[dirs][1])\n", + " elif dirs == dirList[3]:\n", + " if of[0:5] == 'COVID':\n", + " processor(dirs,path2,im,of,superDict[dirs][1])\n", + " else:\n", + " processor(dirs,path2,im,of,superDict[dirs][0])\n", + " elif dirs == dirList[4]:\n", + " if int(of[4:]) < 113:\n", + " processor(dirs,path2,im,of,superDict[dirs][0])\n", + " elif (int(of[4:]) >= 113) and (int(of[4:]) <= 161) :\n", + " processor(dirs,path2,im,of,superDict[dirs][1],numLeads=12)\n", + " else:\n", + " processor(dirs,path2,im,of,superDict[dirs][2])\n", + " else:\n", + " processor(dirs,path2,im,of,superDict[dirs])\n", + " count = count+1\n", + " return count" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "61da0528", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing ECGImagesofPatientthathaveabnormalheartbeats folder ...\n", + "546 files processed in this folder in 229 sec...\n", + "\n", + "Processing ECGImagesofPatientthathaveHistoryofMI folder ...\n", + "203 files processed in this folder in 81 sec...\n", + "\n", + "Processing ECGImagesofCOVID-19Patients folder ...\n", + "250 files processed in this folder in 27 sec...\n", + "\n", + "Processing NormalPersonECGImages folder ...\n", + "859 files processed in this folder in 348 sec...\n", + "\n", + "Processing ECGImagesofMyocardialInfarctionPatients folder ...\n", + "74 files processed in this folder in 29 sec...\n", + "\n" + ] + } + ], + "source": [ + "## Cropping\n", + "targetFolder = 'CSV_data_v1'\n", + "if not os.path.exists(targetFolder):\n", + " os.makedirs(targetFolder)\n", + "t0 = time.time()\n", + "for dirs in os.listdir(pathroot):\n", + " t = time.time()\n", + " print('Processing {0} folder ...'.format(dirs))\n", + " if not os.path.isfile(dirs):\n", + " path = os.path.join(pathroot,dirs)\n", + " path2 = os.path.join(path,\"Cropped_Images\")\n", + " if not os.path.exists(path2):\n", + " os.makedirs(path2)\n", + " fileCount = crop2csv(dirs,path,path2,dirList,superDict) \n", + " t = time.time()-t\n", + " print('{0} files processed in this folder in {1} sec...\\n'.format(fileCount,round(t)))\n", + "t0 = time.time()-t0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab0d386d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb1756ac", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ff6cfcd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ad56a7b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2b319f8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}