Switch to unified view

a b/scripts/Data-processing.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 1,
6
   "id": "4c9d40b1",
7
   "metadata": {},
8
   "outputs": [],
9
   "source": [
10
    "import pandas as pd\n",
11
    "import numpy as np\n",
12
    "import cv2\n",
13
    "import matplotlib.pyplot as plt\n",
14
    "import os.path, sys, re\n",
15
    "import time\n",
16
    "from PIL import Image"
17
   ]
18
  },
19
  {
20
   "cell_type": "code",
21
   "execution_count": 2,
22
   "id": "92e8df2f",
23
   "metadata": {},
24
   "outputs": [
25
    {
26
     "name": "stdout",
27
     "output_type": "stream",
28
     "text": [
29
      "/home/moise/Desktop/Data_Science/Erdos_Institute/ecg-proj/ecg-copy\n"
30
     ]
31
    }
32
   ],
33
   "source": [
34
    "cd ~/Desktop/Data_Science/Erdos_Institute/ecg-proj/ecg-copy/"
35
   ]
36
  },
37
  {
38
   "cell_type": "code",
39
   "execution_count": 3,
40
   "id": "c5a0e302",
41
   "metadata": {},
42
   "outputs": [],
43
   "source": [
44
    "pathroot = \"data_v1/\""
45
   ]
46
  },
47
  {
48
   "cell_type": "code",
49
   "execution_count": 4,
50
   "id": "3c9b391a",
51
   "metadata": {},
52
   "outputs": [],
53
   "source": [
54
    "\"\"\"\n",
55
    "Note:\n",
56
    "-----\n",
57
    "\n",
58
    "1)The value for the key 'ECGImagesofMyocardialInfarctionPatients' is a list of dictionaries \n",
59
    "corresponding to the cases\n",
60
    "    i) MI <=37\n",
61
    "    ii) MI >=38\n",
62
    "2) The value of the key 'ECGImagesofCOVID-19Patients' is broken down as follows:\n",
63
    "    i) Binder1_Page file name\n",
64
    "    ii) COVID file name\n",
65
    "3) The value of the key 'ECGImagesofPatientthathaveHistoryofMI' corresponds to:\n",
66
    "    i) PMI<113\n",
67
    "    ii) 113<=PMI<=161 --- ATTENTION: only 12 Leads in the case!\n",
68
    "    iii) PMI>=162\n",
69
    "\"\"\"\n",
70
    "superDict = {'NormalPersonECGImages': { 'Lead1':(130,300,640,600), 'Lead2':(641,300,1125,600),'Lead3':(1130,300,1625,600), 'Lead4':(1630,300,2120,600),\\\n",
71
    "           'Lead5':(130,600,640,900), 'Lead6':(641,600,1125,900),'Lead7':(1130,600,1625,900), 'Lead8':(1630,600,2120,900),\\\n",
72
    "           'Lead9':(130,900,640,1200), 'Lead10':(641,900,1125,1200),'Lead11':(1130,900,1625,1200), 'Lead12':(1630,900,2120,1200),\\\n",
73
    "           'Lead13':(130,1205,2120,1450) \n",
74
    "          },\n",
75
    "             'ECGImagesofPatientthathaveabnormalheartbeats': { 'Lead1':(130,300,640,600), 'Lead2':(641,300,1125,600),'Lead3':(1130,300,1625,600), 'Lead4':(1630,300,2110,600),\\\n",
76
    "           'Lead5':(130,600,640,900), 'Lead6':(641,600,1125,900),'Lead7':(1130,600,1625,900), 'Lead8':(1630,600,2110,900),\\\n",
77
    "           'Lead9':(130,900,640,1200), 'Lead10':(641,900,1125,1200),'Lead11':(1130,900,1625,1200), 'Lead12':(1630,900,2110,1200),\\\n",
78
    "           'Lead13':(130,1205,2110,1450) \n",
79
    "          },\n",
80
    "             \n",
81
    "            'ECGImagesofMyocardialInfarctionPatients': [{ 'Lead1':(125,310,625,600), 'Lead2':(640,310,1125,600),'Lead3':(1140,310,1625,600), 'Lead4':(1640,310,2125,600),\\\n",
82
    "           'Lead5':(125,605,625,900), 'Lead6':(640,605,1125,900),'Lead7':(1140,605,1625,900), 'Lead8':(1640,605,2125,900),\\\n",
83
    "           'Lead9':(125,905,625,1200), 'Lead10':(640,905,1125,1200),'Lead11':(1140,905,1625,1200), 'Lead12':(1640,905,2125,1200),\\\n",
84
    "            'Lead13':(125,1205,2125,1495)},\\\n",
85
    "                                                { 'Lead1':(125,300,1125,500), 'Lead2':(1125,300,2125,500),\\\n",
86
    "            'Lead3':(125,500,1125,680), 'Lead4':(1125,500,2125,680),\\\n",
87
    "            'Lead5':(125,680,1125,845), 'Lead6':(1125,680,2125,845),\\\n",
88
    "            'Lead7':(125,845,1125,1000), 'Lead8':(1125,845,2125,1000),\\\n",
89
    "            'Lead9':(125,1000,1125,1150), 'Lead10':(1125,1000,2125,1150),\\\n",
90
    "            'Lead11':(125,1150,1125,1300), 'Lead12':(1125,1150,2125,1300),\\\n",
91
    "            'Lead13':(125,1300,2125,1490)} ],\n",
92
    "             \n",
93
    "             'ECGImagesofCOVID-19Patients': [{ 'Lead1':(100,110,300,240), 'Lead2':(300,110,500,240),'Lead3':(500,110,700,240), 'Lead4':(700,110,900,240),\\\n",
94
    "           'Lead5':(100,241,300,350), 'Lead6':(300,241,500,350),'Lead7':(500,241,700,350), 'Lead8':(700,241,900,350),\\\n",
95
    "           'Lead9':(100,351,300,450), 'Lead10':(300,351,500,450),'Lead11':(500,351,700,450), 'Lead12':(700,351,900,450),\\\n",
96
    "            'Lead13':(100,451,900,545)},\\\n",
97
    "                                             { 'Lead1':(125,310,625,600), 'Lead2':(640,310,1125,600),'Lead3':(1140,310,1625,600), 'Lead4':(1640,310,2125,600),\\\n",
98
    "           'Lead5':(125,605,625,900), 'Lead6':(640,605,1125,900),'Lead7':(1140,605,1625,900), 'Lead8':(1640,605,2125,900),\\\n",
99
    "           'Lead9':(125,905,625,1200), 'Lead10':(640,905,1125,1200),'Lead11':(1140,905,1625,1200), 'Lead12':(1640,905,2125,1200),\\\n",
100
    "            'Lead13':(125,1205,2125,1495)} ],\n",
101
    "             \n",
102
    "             'ECGImagesofPatientthathaveHistoryofMI': [{ 'Lead1':(125,310,625,600), 'Lead2':(640,310,1125,600),'Lead3':(1140,310,1625,600), 'Lead4':(1640,310,2125,600),\\\n",
103
    "           'Lead5':(125,605,625,900), 'Lead6':(640,605,1125,900),'Lead7':(1140,605,1625,900), 'Lead8':(1640,605,2125,900),\\\n",
104
    "           'Lead9':(125,905,625,1200), 'Lead10':(640,905,1125,1200),'Lead11':(1140,905,1625,1200), 'Lead12':(1640,905,2125,1200),\\\n",
105
    "            'Lead13':(125,1205,2125,1495)},\\\n",
106
    "                                               { 'Lead1':(125,410,1125,590), 'Lead2':(1125,410,2125,590),\\\n",
107
    "            'Lead3':(125,600,1125,815), 'Lead4':(1125,600,2125,815),\\\n",
108
    "            'Lead5':(125,815,1125,1035), 'Lead6':(1125,815,2125,1035),\\\n",
109
    "            'Lead7':(125,1035,1125,1235), 'Lead8':(1125,1035,2125,1235),\\\n",
110
    "            'Lead9':(125,1235,1125,1350), 'Lead10':(1125,1235,2125,1350),\\\n",
111
    "            'Lead11':(125,1350,1125,1490), 'Lead12':(1125,1350,2125,1490),\\\n",
112
    "            },\\\n",
113
    "                                           { 'Lead1':(125,290,1125,490), 'Lead2':(1125,290,2125,490),\\\n",
114
    "            'Lead3':(125,490,1125,650), 'Lead4':(1125,490,2125,650),\\\n",
115
    "            'Lead5':(125,650,1125,810), 'Lead6':(1125,650,2125,810),\\\n",
116
    "            'Lead7':(125,810,1125,1000), 'Lead8':(1125,810,2125,1000),\\\n",
117
    "            'Lead9':(125,1000,1125,1190), 'Lead10':(1125,1000,2125,1190),\\\n",
118
    "            'Lead11':(125,1190,1125,1315), 'Lead12':(1125,1190,2125,1315),\\\n",
119
    "            'Lead13':(125,1315,2125,1550)\n",
120
    "            }]   \n",
121
    "}"
122
   ]
123
  },
124
  {
125
   "cell_type": "code",
126
   "execution_count": 5,
127
   "id": "310cafaf",
128
   "metadata": {},
129
   "outputs": [],
130
   "source": [
131
    "dirList=['NormalPersonECGImages','ECGImagesofPatientthathaveabnormalheartbeats',\n",
132
    "        'ECGImagesofMyocardialInfarctionPatients','ECGImagesofCOVID-19Patients',\n",
133
    "        'ECGImagesofPatientthathaveHistoryofMI']"
134
   ]
135
  },
136
  {
137
   "cell_type": "code",
138
   "execution_count": 6,
139
   "id": "76c9879b",
140
   "metadata": {},
141
   "outputs": [],
142
   "source": [
143
    "def processor(dirs,path2,im,of,crop_dict,numLeads=13,threshold_level=50):\n",
144
    "    for i in range(1,numLeads+1):\n",
145
    "        imCrop = im.crop(box=crop_dict['Lead'+str(i)])\n",
146
    "        target = of+'-Cropped_lead'+str(i)\n",
147
    "        imCrop.save(os.path.join(path2,target+'.png'), \"PNG\", quality=100)\n",
148
    "        img = cv2.imread(os.path.join(path2,target+'.png'))\n",
149
    "        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
150
    "        coords = np.column_stack(np.where(gray < threshold_level))\n",
151
    "        coords[:,[0,1]]=coords[:,[1,0]]\n",
152
    "        csvFolder = os.path.join(targetFolder,dirs)\n",
153
    "        if not os.path.exists(csvFolder):\n",
154
    "            os.makedirs(csvFolder)\n",
155
    "        np.savetxt(os.path.join(csvFolder,target+'.csv'),coords) "
156
   ]
157
  },
158
  {
159
   "cell_type": "code",
160
   "execution_count": 7,
161
   "id": "15742532",
162
   "metadata": {},
163
   "outputs": [],
164
   "source": [
165
    "def crop2csv(dirs,path,path2,dirList,superDict,pattern='.jpg'):\n",
166
    "    count = 0\n",
167
    "    for item in os.listdir(path):\n",
168
    "        if os.path.isfile(os.path.join(path,item)):\n",
169
    "            of, oe = os.path.splitext(item)\n",
170
    "            if oe == pattern:\n",
171
    "                im = Image.open(os.path.join(path,item))\n",
172
    "                if dirs == dirList[2]:\n",
173
    "                    if int(of[3:]) <= 37:\n",
174
    "                        processor(dirs,path2,im,of,superDict[dirs][0])\n",
175
    "                    else:\n",
176
    "                        processor(dirs,path2,im,of,superDict[dirs][1])\n",
177
    "                elif dirs == dirList[3]:\n",
178
    "                    if of[0:5] == 'COVID':\n",
179
    "                        processor(dirs,path2,im,of,superDict[dirs][1])\n",
180
    "                    else:\n",
181
    "                        processor(dirs,path2,im,of,superDict[dirs][0])\n",
182
    "                elif dirs == dirList[4]:\n",
183
    "                    if int(of[4:]) < 113:\n",
184
    "                        processor(dirs,path2,im,of,superDict[dirs][0])\n",
185
    "                    elif (int(of[4:]) >= 113) and (int(of[4:]) <= 161) :\n",
186
    "                        processor(dirs,path2,im,of,superDict[dirs][1],numLeads=12)\n",
187
    "                    else:\n",
188
    "                        processor(dirs,path2,im,of,superDict[dirs][2])\n",
189
    "                else:\n",
190
    "                    processor(dirs,path2,im,of,superDict[dirs])\n",
191
    "            count = count+1\n",
192
    "    return count"
193
   ]
194
  },
195
  {
196
   "cell_type": "code",
197
   "execution_count": 8,
198
   "id": "61da0528",
199
   "metadata": {
200
    "scrolled": true
201
   },
202
   "outputs": [
203
    {
204
     "name": "stdout",
205
     "output_type": "stream",
206
     "text": [
207
      "Processing ECGImagesofPatientthathaveabnormalheartbeats folder ...\n",
208
      "546 files processed in this folder in 229 sec...\n",
209
      "\n",
210
      "Processing ECGImagesofPatientthathaveHistoryofMI folder ...\n",
211
      "203 files processed in this folder in 81 sec...\n",
212
      "\n",
213
      "Processing ECGImagesofCOVID-19Patients folder ...\n",
214
      "250 files processed in this folder in 27 sec...\n",
215
      "\n",
216
      "Processing NormalPersonECGImages folder ...\n",
217
      "859 files processed in this folder in 348 sec...\n",
218
      "\n",
219
      "Processing ECGImagesofMyocardialInfarctionPatients folder ...\n",
220
      "74 files processed in this folder in 29 sec...\n",
221
      "\n"
222
     ]
223
    }
224
   ],
225
   "source": [
226
    "## Cropping\n",
227
    "targetFolder = 'CSV_data_v1'\n",
228
    "if not os.path.exists(targetFolder):\n",
229
    "    os.makedirs(targetFolder)\n",
230
    "t0 = time.time()\n",
231
    "for dirs in os.listdir(pathroot):\n",
232
    "    t = time.time()\n",
233
    "    print('Processing {0} folder ...'.format(dirs))\n",
234
    "    if not os.path.isfile(dirs):\n",
235
    "        path = os.path.join(pathroot,dirs)\n",
236
    "        path2 = os.path.join(path,\"Cropped_Images\")\n",
237
    "        if not os.path.exists(path2):\n",
238
    "            os.makedirs(path2)\n",
239
    "        fileCount = crop2csv(dirs,path,path2,dirList,superDict) \n",
240
    "    t = time.time()-t\n",
241
    "    print('{0} files processed in this folder in {1} sec...\\n'.format(fileCount,round(t)))\n",
242
    "t0 = time.time()-t0"
243
   ]
244
  },
245
  {
246
   "cell_type": "code",
247
   "execution_count": null,
248
   "id": "ab0d386d",
249
   "metadata": {},
250
   "outputs": [],
251
   "source": []
252
  },
253
  {
254
   "cell_type": "code",
255
   "execution_count": null,
256
   "id": "bb1756ac",
257
   "metadata": {},
258
   "outputs": [],
259
   "source": []
260
  },
261
  {
262
   "cell_type": "code",
263
   "execution_count": null,
264
   "id": "7ff6cfcd",
265
   "metadata": {},
266
   "outputs": [],
267
   "source": []
268
  },
269
  {
270
   "cell_type": "code",
271
   "execution_count": null,
272
   "id": "5ad56a7b",
273
   "metadata": {},
274
   "outputs": [],
275
   "source": []
276
  },
277
  {
278
   "cell_type": "code",
279
   "execution_count": null,
280
   "id": "f2b319f8",
281
   "metadata": {},
282
   "outputs": [],
283
   "source": []
284
  }
285
 ],
286
 "metadata": {
287
  "kernelspec": {
288
   "display_name": "Python 3",
289
   "language": "python",
290
   "name": "python3"
291
  },
292
  "language_info": {
293
   "codemirror_mode": {
294
    "name": "ipython",
295
    "version": 3
296
   },
297
   "file_extension": ".py",
298
   "mimetype": "text/x-python",
299
   "name": "python",
300
   "nbconvert_exporter": "python",
301
   "pygments_lexer": "ipython3",
302
   "version": "3.7.10"
303
  }
304
 },
305
 "nbformat": 4,
306
 "nbformat_minor": 5
307
}