Diff of /process_mimic.ipynb [000000] .. [bab239]

Switch to unified view

a b/process_mimic.ipynb
1
{
2
  "nbformat": 4,
3
  "nbformat_minor": 0,
4
  "metadata": {
5
    "colab": {
6
      "name": "Copy of process_mimic.ipynb",
7
      "version": "0.3.2",
8
      "provenance": [],
9
      "collapsed_sections": [],
10
      "include_colab_link": true
11
    },
12
    "kernelspec": {
13
      "name": "python3",
14
      "display_name": "Python 3"
15
    }
16
  },
17
  "cells": [
18
    {
19
      "cell_type": "markdown",
20
      "metadata": {
21
        "id": "view-in-github",
22
        "colab_type": "text"
23
      },
24
      "source": [
25
        "<a href=\"https://colab.research.google.com/github/BenM1215/medgan/blob/master/process_mimic.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
26
      ]
27
    },
28
    {
29
      "metadata": {
30
        "id": "dkIm2oJnyjVK",
31
        "colab_type": "code",
32
        "colab": {}
33
      },
34
      "cell_type": "code",
35
      "source": [
36
        "import sys\n",
37
        "import _pickle as pickle\n",
38
        "import numpy as np\n",
39
        "from datetime import datetime"
40
      ],
41
      "execution_count": 0,
42
      "outputs": []
43
    },
44
    {
45
      "metadata": {
46
        "id": "F3LE_9PVyBfI",
47
        "colab_type": "code",
48
        "outputId": "d6ad558d-3bd1-4da1-be4f-248213b11409",
49
        "colab": {
50
          "base_uri": "https://localhost:8080/",
51
          "height": 34
52
        }
53
      },
54
      "cell_type": "code",
55
      "source": [
56
        "# Mount Google Drive at /content/gdrive; the data directories used by this\n",
        "# notebook are all rooted under '/content/gdrive/My Drive/'.\n",
        "from google.colab import drive\n",
57
        "drive.mount('/content/gdrive', force_remount=True)"
58
      ],
59
      "execution_count": 0,
60
      "outputs": [
61
        {
62
          "output_type": "stream",
63
          "text": [
64
            "Mounted at /content/gdrive\n"
65
          ],
66
          "name": "stdout"
67
        }
68
      ]
69
    },
70
    {
71
      "metadata": {
72
        "id": "QbwB-8Fdylim",
73
        "colab_type": "code",
74
        "colab": {}
75
      },
76
      "cell_type": "code",
77
      "source": [
78
        "def convert_to_icd9(dxStr):\n",
        "    \"\"\"Insert the decimal point into a dot-less ICD-9 diagnosis code.\n",
        "\n",
        "    Codes beginning with 'E' keep four characters before the dot; every\n",
        "    other code keeps three. A code no longer than that prefix is returned\n",
        "    unchanged, e.g. '4019' -> '401.9', 'E8120' -> 'E812.0', '401' -> '401'.\n",
        "    \"\"\"\n",
        "    prefix_len = 4 if dxStr.startswith('E') else 3\n",
        "    if len(dxStr) > prefix_len:\n",
        "        return dxStr[:prefix_len] + '.' + dxStr[prefix_len:]\n",
        "    return dxStr"
85
      ],
86
      "execution_count": 0,
87
      "outputs": []
88
    },
89
    {
90
      "metadata": {
91
        "id": "DEHXM6G2yqLR",
92
        "colab_type": "code",
93
        "colab": {}
94
      },
95
      "cell_type": "code",
96
      "source": [
97
        "def convert_to_3digit_icd9(dxStr):\n",
        "    \"\"\"Truncate a dot-less ICD-9 diagnosis code to its leading category.\n",
        "\n",
        "    Codes beginning with 'E' are cut to four characters, every other code\n",
        "    to three; shorter codes come back unchanged, e.g. '4019' -> '401',\n",
        "    'E8120' -> 'E812', '40' -> '40'.\n",
        "    \"\"\"\n",
        "    prefix_len = 4 if dxStr.startswith('E') else 3\n",
        "    # Slicing past the end of a string simply returns the whole string.\n",
        "    return dxStr[:prefix_len]"
104
      ],
105
      "execution_count": 0,
106
      "outputs": []
107
    },
108
    {
109
      "metadata": {
110
        "id": "WTP8qLpe4GsQ",
111
        "colab_type": "code",
112
        "colab": {}
113
      },
114
      "cell_type": "code",
115
      "source": [
116
        "# Input argument: how diagnosis codes are encoded in the output matrix.\n",
        "#   'binary' -> a cell is set to 1 when the patient has the code at least once\n",
        "#   'count'  -> a cell is incremented once per occurrence of the code\n",
117
        "binary_count = 'binary'"
118
      ],
119
      "execution_count": 0,
120
      "outputs": []
121
    },
122
    {
123
      "metadata": {
124
        "id": "m40Qy9Ok4KB7",
125
        "colab_type": "code",
126
        "colab": {}
127
      },
128
      "cell_type": "code",
129
      "source": [
130
        "# Everything is read from and written to the mounted Google Drive.\n",
        "root_dir = \"/content/gdrive/My Drive/\"\n",
        "\n",
        "# 'count' runs get their own output tree; any other value of binary_count\n",
        "# (including 'binary') uses the binary tree.\n",
        "base_dir = root_dir + (\n",
        "    'GOSH/Synthetic Data/medgan/count/'\n",
        "    if binary_count == 'count'\n",
        "    else 'GOSH/Synthetic Data/medgan/binary/'\n",
        ")\n",
        "\n",
        "# The raw input directory is shared by both modes; the rest hang off base_dir.\n",
        "raw_data_dir = root_dir + 'GOSH/Synthetic Data/medgan/mimic/'\n",
        "processed_data_dir = base_dir + 'processed_mimic/'\n",
        "model_dir = base_dir + 'models/'\n",
        "gen_data_dir = base_dir + 'generated_data/'"
141
      ],
142
      "execution_count": 0,
143
      "outputs": []
144
    },
145
    {
146
      "metadata": {
147
        "id": "84a4GULe5NmG",
148
        "colab_type": "code",
149
        "colab": {}
150
      },
151
      "cell_type": "code",
152
      "source": [
153
        "# Input CSVs read by the processing cell.\n",
        "admissionFile, diagnosisFile = (\n",
        "    raw_data_dir + 'ADMISSIONS.csv',\n",
        "    raw_data_dir + 'DIAGNOSES_ICD.csv',\n",
        ")\n",
        "# Output path prefix; the processing cell appends '.pids', '.matrix' and '.types'.\n",
        "outFile = processed_data_dir + 'processed_mimic'"
156
      ],
157
      "execution_count": 0,
158
      "outputs": []
159
    },
160
    {
161
      "metadata": {
162
        "id": "0o1e-8_RyttW",
163
        "colab_type": "code",
164
        "colab": {}
165
      },
166
      "cell_type": "code",
167
      "source": [
168
        "# Fail fast on an unsupported mode. Merely printing a warning would let the\n",
        "# remaining cells run: an unrecognised value selects the 'binary' output\n",
        "# directory but is accumulated as counts when the matrix is filled.\n",
        "if binary_count not in ('binary', 'count'):\n",
        "    raise ValueError('You must choose either binary or count.')"
170
      ],
171
      "execution_count": 0,
172
      "outputs": []
173
    },
174
    {
175
      "metadata": {
176
        "id": "RYeXZcXe4-Zj",
177
        "colab_type": "code",
178
        "outputId": "0c259433-92d5-497b-da62-b9357d23d938",
179
        "colab": {
180
          "base_uri": "https://localhost:8080/",
181
          "height": 119
182
        }
183
      },
184
      "cell_type": "code",
185
      "source": [
186
        "# Step 1: read the admissions CSV.\n",
        "#   pidAdmMap:  patient id   -> list of admission ids\n",
        "#   admDateMap: admission id -> admission timestamp\n",
        "print('Building pid-admission mapping, admission-date mapping')\n",
        "pidAdmMap = {}\n",
        "admDateMap = {}\n",
        "# 'with' closes the file even when a row fails to parse.\n",
        "with open(admissionFile, 'r') as infd:\n",
        "    infd.readline()  # skip the header row\n",
        "    for line in infd:\n",
        "        # Plain comma split; only columns 1-3 are read from each row.\n",
        "        tokens = line.strip().split(',')\n",
        "        pid = int(tokens[1])\n",
        "        admId = int(tokens[2])\n",
        "        admTime = datetime.strptime(tokens[3], '%Y-%m-%d %H:%M:%S')\n",
        "        admDateMap[admId] = admTime\n",
        "        pidAdmMap.setdefault(pid, []).append(admId)\n",
        "\n",
        "# Step 2: read the diagnoses CSV.\n",
        "#   admDxMap: admission id -> list of 'D_'-prefixed diagnosis codes\n",
        "print('Building admission-dxList mapping')\n",
        "admDxMap = {}\n",
        "with open(diagnosisFile, 'r') as infd:\n",
        "    infd.readline()  # skip the header row\n",
        "    for line in infd:\n",
        "        tokens = line.strip().split(',')\n",
        "        admId = int(tokens[2])\n",
        "        # tokens[4][1:-1] drops the first and last character of the code field\n",
        "        # (presumably the surrounding quotes -- confirm against the CSV).\n",
        "        # Full ICD-9 codes are used here. To collapse codes to their 3-digit\n",
        "        # category instead, comment the next line and uncomment the one after it.\n",
        "        dxStr = 'D_' + convert_to_icd9(tokens[4][1:-1])\n",
        "        #dxStr = 'D_' + convert_to_3digit_icd9(tokens[4][1:-1])\n",
        "        admDxMap.setdefault(admId, []).append(dxStr)\n",
        "\n",
        "# Step 3: order every patient's visits by admission time.\n",
        "#   pidSeqMap: patient id -> sorted list of (timestamp, code list)\n",
        "# Patients with a single admission are kept (the 'len(admIdList) < 2' skip is\n",
        "# left disabled). An admission with no diagnosis rows raises KeyError here.\n",
        "print('Building pid-sortedVisits mapping')\n",
        "pidSeqMap = {}\n",
        "for pid, admIdList in pidAdmMap.items():\n",
        "    pidSeqMap[pid] = sorted((admDateMap[admId], admDxMap[admId]) for admId in admIdList)\n",
        "\n",
        "# Step 4: split the mapping into parallel lists; index i of pids, dates and\n",
        "# seqs all describe the same patient.\n",
        "print('Building pids, dates, strSeqs')\n",
        "pids = []\n",
        "dates = []\n",
        "seqs = []\n",
        "for pid, visits in pidSeqMap.items():\n",
        "    pids.append(pid)\n",
        "    dates.append([visit[0] for visit in visits])\n",
        "    seqs.append([visit[1] for visit in visits])\n",
        "\n",
        "# Step 5: replace code strings with integer ids, assigned in order of first\n",
        "# appearance. 'types' maps code string -> column index of the matrix.\n",
        "print('Converting strSeqs to intSeqs, and making types')\n",
        "types = {}\n",
        "newSeqs = []\n",
        "for patient in seqs:\n",
        "    newPatient = []\n",
        "    for visit in patient:\n",
        "        newVisit = []\n",
        "        for code in visit:\n",
        "            if code not in types:\n",
        "                types[code] = len(types)\n",
        "            newVisit.append(types[code])\n",
        "        newPatient.append(newVisit)\n",
        "    newSeqs.append(newPatient)\n",
        "\n",
        "# Step 6: aggregate all visits of a patient into one row. Row i corresponds\n",
        "# to pids[i]; column j corresponds to the code whose id in 'types' is j.\n",
        "print('Constructing the matrix')\n",
        "numPatients = len(newSeqs)\n",
        "numCodes = len(types)\n",
        "# Allocate as float32 directly instead of building float64 and converting.\n",
        "matrix = np.zeros((numPatients, numCodes), dtype='float32')\n",
        "for i, patient in enumerate(newSeqs):\n",
        "    for visit in patient:\n",
        "        for code in visit:\n",
        "            if binary_count == 'binary':\n",
        "                matrix[i, code] = 1.\n",
        "            else:\n",
        "                matrix[i, code] += 1.\n",
        "\n",
        "# Step 7: persist the results next to each other as outFile + suffix.\n",
        "# Protocol -1 selects the highest pickle protocol available; 'with' makes\n",
        "# sure each file is flushed and closed before the next one is written.\n",
        "for suffix, obj in (('.pids', pids), ('.matrix', matrix), ('.types', types)):\n",
        "    with open(outFile + suffix, 'wb') as outfd:\n",
        "        pickle.dump(obj, outfd, -1)"
267
      ],
268
      "execution_count": 0,
269
      "outputs": [
270
        {
271
          "output_type": "stream",
272
          "text": [
273
            "Building pid-admission mapping, admission-date mapping\n",
274
            "Building admission-dxList mapping\n",
275
            "Building pid-sortedVisits mapping\n",
276
            "Building pids, dates, strSeqs\n",
277
            "Converting strSeqs to intSeqs, and making types\n",
278
            "Constructing the matrix\n"
279
          ],
280
          "name": "stdout"
281
        }
282
      ]
283
    },
284
    {
285
      "metadata": {
286
        "id": "AaF-cC2t0pT6",
287
        "colab_type": "code",
288
        "colab": {}
289
      },
290
      "cell_type": "code",
291
      "source": [
292
        ""
293
      ],
294
      "execution_count": 0,
295
      "outputs": []
296
    }
297
  ]
298
}