Switch to unified view

a b/data_processing_TCGA.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": null,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "import os\n",
10
    "\n",
11
    "import numpy as np\n",
12
    "import pandas as pd"
13
   ]
14
  },
15
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cohort codes; each one is substituted into the 'gdac.broadinstitute.org_{}...' directory\n",
    "# names and the per-cohort file names that the cells below try to read.\n",
    "tumor_list = [\n",
    "    'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'COADREAD', 'DLBC',\n",
    "    'ESCA', 'FPPP', 'GBM', 'GBMLGG', 'HNSC', 'KICH', 'KIPAN', 'KIRC',\n",
    "    'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV',\n",
    "    'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'STES',\n",
    "    'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM',\n",
    "]"
   ]
  },
62
  {
63
   "cell_type": "markdown",
64
   "metadata": {},
65
   "source": [
66
    "# RPPA"
67
   ]
68
  },
69
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1. FIND SUPERSET OF RPPA FEATURES\n",
    "# After transposing, this column holds the (truncated) original column labels, one per sample column.\n",
    "RPPA_ID = 'Composite.Element.REF'\n",
    "\n",
    "\n",
    "def _load_rppa(tumor):\n",
    "    '''Return the RPPA table of one cohort with samples as rows, or None if its file is missing.'''\n",
    "    filepath = './RPPA/gdac.broadinstitute.org_{}.RPPA_AnnotateWithGene.Level_3.2016012800.0.0/'.format(tumor)\n",
    "    filename = '{}.rppa.txt'.format(tumor)\n",
    "    if not os.path.exists(filepath + filename):\n",
    "        return None\n",
    "\n",
    "    tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
    "    # Keep only the first 15 characters of every column label except the first one.\n",
    "    tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
    "    # Transpose; the former header ends up in row 0 and is promoted back to column names.\n",
    "    tmp         = tmp.T.reset_index()\n",
    "    tmp.columns = tmp.iloc[0, 0:]\n",
    "    return tmp.iloc[1:, :].reset_index(drop=True)\n",
    "\n",
    "\n",
    "# The accumulators are initialised before the loop, so the cell no longer depends on\n",
    "# the 'ACC' file being present (previously a missing ACC file caused a NameError).\n",
    "feat_list       = {}\n",
    "sup_feat_list   = []      # union of the features of every cohort that was found\n",
    "final_feat_list = None    # intersection of the features; informational, not used for the export\n",
    "for tumor in tumor_list:\n",
    "    tmp = _load_rppa(tumor)\n",
    "    if tmp is None:\n",
    "        continue\n",
    "\n",
    "    feat_list[tumor] = list(tmp)[1:]\n",
    "    sup_feat_list   += feat_list[tumor]\n",
    "    if final_feat_list is None:\n",
    "        final_feat_list = feat_list[tumor].copy()\n",
    "    else:\n",
    "        final_feat_list = np.intersect1d(final_feat_list, feat_list[tumor])\n",
    "\n",
    "if not feat_list:\n",
    "    raise FileNotFoundError('No RPPA file was found under ./RPPA/ for any entry of tumor_list')\n",
    "\n",
    "sup_feat_list = np.unique(sup_feat_list).tolist()\n",
    "\n",
    "## 2. ALIGN EVERY COHORT TO THE SUPERSET AND STACK THEM\n",
    "# Files are read a second time so that only the aligned frames are held in memory.\n",
    "frames = []\n",
    "for tumor in feat_list:\n",
    "    tmp = _load_rppa(tumor)\n",
    "    if tmp is None:\n",
    "        continue\n",
    "    # The explicit selection raises KeyError if the id column is absent; reindex then adds\n",
    "    # the features this cohort does not have as all-NaN columns.\n",
    "    tmp_ = tmp[[RPPA_ID] + feat_list[tumor]].reindex(columns=[RPPA_ID] + sup_feat_list)\n",
    "    frames.append(tmp_)\n",
    "\n",
    "# One concat over the whole list instead of growing final_df inside the loop.\n",
    "final_df = pd.concat(frames, axis=0)\n",
    "final_df = final_df.drop_duplicates(subset=[RPPA_ID]).reset_index(drop=True)\n",
    "\n",
    "os.makedirs('./FINAL', exist_ok=True)\n",
    "final_df.to_csv('./FINAL/RPPA.csv', index=False)"
   ]
  },
127
  {
128
   "cell_type": "markdown",
129
   "metadata": {},
130
   "source": [
131
    "# miRNA Seq"
132
   ]
133
  },
134
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1. FIND SUPERSET OF miRNASeq FEATURES\n",
    "# After transposing, this column holds the (truncated) original column labels, one per sample column.\n",
    "MIRNA_ID = 'gene'\n",
    "\n",
    "\n",
    "def _load_mirnaseq(tumor):\n",
    "    '''Return the miRNAseq table of one cohort with samples as rows, or None if its file is missing.'''\n",
    "    filepath = './miRNAseq/gdac.broadinstitute.org_{}.miRseq_Preprocess.Level_3.2016012800.0.0/'.format(tumor)\n",
    "    filename = '{}.miRseq_RPKM_log2.txt'.format(tumor)\n",
    "    if not os.path.exists(filepath + filename):\n",
    "        return None\n",
    "\n",
    "    tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
    "    # Keep only the first 15 characters of every column label except the first one.\n",
    "    tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
    "    # Transpose; the former header ends up in row 0 and is promoted back to column names.\n",
    "    tmp         = tmp.T.reset_index()\n",
    "    tmp.columns = tmp.iloc[0, 0:]\n",
    "    return tmp.iloc[1:, :].reset_index(drop=True)\n",
    "\n",
    "\n",
    "# The accumulators are initialised before the loop, so the cell no longer depends on\n",
    "# the 'ACC' file being present (previously a missing ACC file caused a NameError).\n",
    "feat_list       = {}\n",
    "sup_feat_list   = []      # union of the features of every cohort that was found\n",
    "final_feat_list = None    # intersection of the features; informational, not used for the export\n",
    "for tumor in tumor_list:\n",
    "    tmp = _load_mirnaseq(tumor)\n",
    "    if tmp is None:\n",
    "        continue\n",
    "\n",
    "    feat_list[tumor] = list(tmp)[1:]\n",
    "    sup_feat_list   += feat_list[tumor]\n",
    "    if final_feat_list is None:\n",
    "        final_feat_list = feat_list[tumor].copy()\n",
    "    else:\n",
    "        final_feat_list = np.intersect1d(final_feat_list, feat_list[tumor])\n",
    "\n",
    "if not feat_list:\n",
    "    raise FileNotFoundError('No miRNAseq file was found under ./miRNAseq/ for any entry of tumor_list')\n",
    "\n",
    "sup_feat_list = np.unique(sup_feat_list).tolist()\n",
    "\n",
    "## 2. ALIGN EVERY COHORT TO THE SUPERSET AND STACK THEM\n",
    "# Files are read a second time so that only the aligned frames are held in memory.\n",
    "frames = []\n",
    "for tumor in feat_list:\n",
    "    tmp = _load_mirnaseq(tumor)\n",
    "    if tmp is None:\n",
    "        continue\n",
    "    # The explicit selection raises KeyError if the id column is absent; reindex then adds\n",
    "    # the features this cohort does not have as all-NaN columns.\n",
    "    tmp_ = tmp[[MIRNA_ID] + feat_list[tumor]].reindex(columns=[MIRNA_ID] + sup_feat_list)\n",
    "    frames.append(tmp_)\n",
    "\n",
    "# One concat over the whole list instead of growing final_df inside the loop.\n",
    "final_df = pd.concat(frames, axis=0)\n",
    "final_df = final_df.drop_duplicates(subset=[MIRNA_ID]).reset_index(drop=True)\n",
    "\n",
    "os.makedirs('./FINAL', exist_ok=True)\n",
    "final_df.to_csv('./FINAL/miRNAseq_RPKM_log2.csv', index=False)"
   ]
  },
191
  {
192
   "cell_type": "markdown",
193
   "metadata": {},
194
   "source": [
195
    "# METHYLATION"
196
   ]
197
  },
198
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 1. FIND SUPERSET OF METHYLATION FEATURES\n",
    "# After transposing, this column holds the (truncated) original column labels, one per sample column.\n",
    "METH_ID = 'Hybridization REF'\n",
    "\n",
    "\n",
    "def _load_methylation(tumor):\n",
    "    '''Return the methylation table of one cohort with samples as rows, or None if its file is missing.'''\n",
    "    filepath = './methylation/gdac.broadinstitute.org_{}.Methylation_Preprocess.Level_3.2016012800.0.0/'.format(tumor)\n",
    "    filename = '{}.meth.by_mean.data.txt'.format(tumor)\n",
    "    if not os.path.exists(filepath + filename):\n",
    "        return None\n",
    "\n",
    "    tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
    "    # Drop the first data row before transposing (presumably a secondary header row - TODO confirm).\n",
    "    # The original cell did this only in its first pass; the extra column it left in the second\n",
    "    # pass was never selected, so dropping the row in both passes produces the same export.\n",
    "    tmp = tmp.iloc[1:, :].reset_index(drop=True)\n",
    "\n",
    "    # Keep only the first 15 characters of every column label except the first one.\n",
    "    tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
    "    # Transpose; the former header ends up in row 0 and is promoted back to column names.\n",
    "    tmp         = tmp.T.reset_index()\n",
    "    tmp.columns = tmp.iloc[0, 0:]\n",
    "    return tmp.iloc[1:, :].reset_index(drop=True)\n",
    "\n",
    "\n",
    "# The accumulators are initialised before the loop, so the cell no longer depends on\n",
    "# the 'ACC' file being present (previously a missing ACC file caused a NameError).\n",
    "feat_list       = {}\n",
    "sup_feat_list   = []      # union of the features of every cohort that was found\n",
    "final_feat_list = None    # intersection of the features; informational, not used for the export\n",
    "for tumor in tumor_list:\n",
    "    tmp = _load_methylation(tumor)\n",
    "    if tmp is None:\n",
    "        continue\n",
    "\n",
    "    feat_list[tumor] = list(tmp)[1:]\n",
    "    sup_feat_list   += feat_list[tumor]\n",
    "    if final_feat_list is None:\n",
    "        final_feat_list = feat_list[tumor].copy()\n",
    "    else:\n",
    "        final_feat_list = np.intersect1d(final_feat_list, feat_list[tumor])\n",
    "\n",
    "if not feat_list:\n",
    "    raise FileNotFoundError('No methylation file was found under ./methylation/ for any entry of tumor_list')\n",
    "\n",
    "sup_feat_list = np.unique(sup_feat_list).tolist()\n",
    "\n",
    "## 2. ALIGN EVERY COHORT TO THE SUPERSET AND STACK THEM\n",
    "# Files are read a second time so that only the aligned frames are held in memory.\n",
    "frames = []\n",
    "for tumor in feat_list:\n",
    "    tmp = _load_methylation(tumor)\n",
    "    if tmp is None:\n",
    "        continue\n",
    "    # The explicit selection raises KeyError if the id column is absent; reindex then adds\n",
    "    # the features this cohort does not have as all-NaN columns.\n",
    "    tmp_ = tmp[[METH_ID] + feat_list[tumor]].reindex(columns=[METH_ID] + sup_feat_list)\n",
    "    frames.append(tmp_)\n",
    "\n",
    "# One concat over the whole list instead of growing final_df inside the loop.\n",
    "final_df = pd.concat(frames, axis=0)\n",
    "final_df = final_df.drop_duplicates(subset=[METH_ID]).reset_index(drop=True)\n",
    "\n",
    "os.makedirs('./FINAL', exist_ok=True)\n",
    "final_df.to_csv('./FINAL/methylation.csv', index=False)"
   ]
  },
256
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MAKE MULTI-VIEW OBSERVATION FILE"
   ]
  },
263
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _to_patient_key(barcode):\n",
    "    # Lower-case the id and cut off its last three characters.\n",
    "    return barcode.lower()[:-3]\n",
    "\n",
    "\n",
    "# mRNAseq stores its id column as 'HYBRIDIZATION R'; rows whose id equals that header text\n",
    "# are removed before the column is renamed to the name shared by all views.\n",
    "mRNAseq = (\n",
    "    pd.read_csv('./FINAL/mRNAseq_RSEM.csv')\n",
    "    .drop_duplicates(subset=['HYBRIDIZATION R'])\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "mRNAseq = mRNAseq[mRNAseq['HYBRIDIZATION R'] != 'HYBRIDIZATION R'].reset_index(drop=True)\n",
    "mRNAseq = mRNAseq.rename(columns={'HYBRIDIZATION R': 'Hybridization REF'})\n",
    "\n",
    "# The other three views are the CSV files written by the cells above.\n",
    "RPPA        = pd.read_csv('./FINAL/RPPA.csv').rename(columns={'Composite.Element.REF': 'Hybridization REF'})\n",
    "methylation = pd.read_csv('./FINAL/methylation.csv')\n",
    "miRNAseq    = pd.read_csv('./FINAL/miRNAseq_RPKM_log2.csv').rename(columns={'gene': 'Hybridization REF'})\n",
    "\n",
    "# Bring the ids of every view into the same lower-case, shortened form.\n",
    "for omics in (mRNAseq, RPPA, methylation, miRNAseq):\n",
    "    omics['Hybridization REF'] = omics['Hybridization REF'].apply(_to_patient_key)"
   ]
  },
287
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _unique_ids_complete_columns(frame):\n",
    "    # Keep the first row per 'Hybridization REF', then keep only the columns\n",
    "    # in which no value is missing.\n",
    "    frame    = frame.drop_duplicates(subset=['Hybridization REF'])\n",
    "    complete = frame.isna().sum(axis=0) == 0\n",
    "    return frame.loc[:, complete]\n",
    "\n",
    "\n",
    "mRNAseq     = _unique_ids_complete_columns(mRNAseq)\n",
    "RPPA        = _unique_ids_complete_columns(RPPA)\n",
    "methylation = _unique_ids_complete_columns(methylation)\n",
    "miRNAseq    = _unique_ids_complete_columns(miRNAseq)"
   ]
  },
312
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# header=1: the second line of the CSV supplies the column names.\n",
    "label = pd.read_csv('./FINAL/clinical_label.csv', header=1)\n",
    "label = label.sort_values(by='Hybridization REF').reset_index(drop=True)\n",
    "\n",
    "# Keep the rows whose id contains 'tcga'; when an id occurs more than once,\n",
    "# the last occurrence in the sorted table is the one that is kept.\n",
    "is_tcga = label['Hybridization REF'].apply(lambda x: 'tcga' in x)\n",
    "label   = (\n",
    "    label[is_tcga]\n",
    "    .drop_duplicates(subset=['Hybridization REF'], keep ='last')\n",
    "    .reset_index(drop=True)\n",
    ")"
   ]
  },
323
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Some of the patients had shifted columns for some reason.\n",
    "# Manually corrected these errors.\n",
    "#\n",
    "# The affected rows are recognised by a text marker in 'days_to_last_followup'. The row mask\n",
    "# is computed ONCE per marker, before any column is overwritten: recomputing it after the\n",
    "# first assignment (as the previous version did) matches no rows any more, so\n",
    "# 'days_to_death' and 'vital_status' were never actually shifted.\n",
    "for marker in ['endometrial', 'other  specify']:\n",
    "    shifted = label['days_to_last_followup'] == marker\n",
    "    label.loc[shifted, 'days_to_last_followup'] = label.loc[shifted, 'days_to_death']\n",
    "    label.loc[shifted, 'days_to_death']         = label.loc[shifted, 'vital_status']\n",
    "    label.loc[shifted, 'vital_status']          = label.loc[shifted, 'years_to_birth']\n",
    "\n",
    "followup_days = label['days_to_last_followup'].astype(float)\n",
    "death_days    = label['days_to_death'].astype(float)\n",
    "\n",
    "# k-year mortality flag: -1 by default, 0 if the follow-up reaches k years,\n",
    "# 1 if death is recorded within k years (applied last, so it wins when both hold).\n",
    "for years in [1, 3, 5]:\n",
    "    column = '{}yr-mortality'.format(years)\n",
    "    label[column] = -1.\n",
    "    label.loc[followup_days >= years*365, column] = 0.\n",
    "    label.loc[death_days <= years*365, column] = 1."
   ]
  },
355
  {
356
   "cell_type": "markdown",
357
   "metadata": {},
358
   "source": [
359
    "# Kernel PCA Dimensionality Reduction"
360
   ]
361
  },
362
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA, SparsePCA, KernelPCA\n",
    "\n",
    "z_dim = 100   # number of components written per view\n",
    "\n",
    "# Views are processed in this order; the key is also used in the output file name.\n",
    "omics_by_view = {\n",
    "    'RPPA':        RPPA,\n",
    "    'miRNAseq':    miRNAseq,\n",
    "    'Methylation': methylation,\n",
    "    'mRNAseq':     mRNAseq,\n",
    "}\n",
    "\n",
    "for view, frame in omics_by_view.items():\n",
    "    print(view)\n",
    "    df = frame.copy(deep=True)\n",
    "\n",
    "    # Everything after the first column is passed to the polynomial-kernel PCA\n",
    "    # (assumes the first column is the 'Hybridization REF' id - TODO confirm).\n",
    "    pca = KernelPCA(kernel='poly', n_components=z_dim, random_state=1234)\n",
    "    z   = pca.fit_transform(np.asarray(df.iloc[:, 1:]))\n",
    "\n",
    "    # Re-attach the ids as the first column and store one file per view.\n",
    "    df_pca = pd.DataFrame(z, index=df['Hybridization REF']).reset_index()\n",
    "    df_pca.to_csv('./FINAL/cleaned/{}_kpca.csv'.format(view), index=False)\n",
    "\n",
    "# Two disabled variants of this loop were kept here as commented-out code:\n",
    "#   PCA(n_components=z_dim, random_state=1234)        -> './FINAL/cleaned/{}_pca.csv'\n",
    "#   SparsePCA(n_components=z_dim, random_state=1234)  -> './FINAL/cleaned/{}_spca.csv'\n",
    "# Swap the estimator and the file suffix above to reproduce them."
   ]
  },
432
  {
433
   "cell_type": "markdown",
434
   "metadata": {},
435
   "source": [
436
    "# CREATE MULTI-VIEW DATASET"
437
   ]
438
  },
439
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the per-view kernel-PCA files back in a fixed order:\n",
    "# df_pca1 = mRNAseq, df_pca2 = Methylation, df_pca3 = miRNAseq, df_pca4 = RPPA.\n",
    "kpca_frames = []\n",
    "for view in ['mRNAseq', 'Methylation', 'miRNAseq', 'RPPA']:\n",
    "    kpca_frames.append(pd.read_csv('./FINAL/cleaned/{}_kpca.csv'.format(view)))\n",
    "\n",
    "df_pca1, df_pca2, df_pca3, df_pca4 = kpca_frames"
   ]
  },
458
  {
459
   "cell_type": "markdown",
460
   "metadata": {},
461
   "source": [
462
    "### CREATE 1-Yr Mortality Dataset. (Censored samples are removed...)"
463
   ]
464
  },
465
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ids whose 1-year label is not the -1 placeholder.\n",
    "has_1yr_label = label['1yr-mortality'] != -1\n",
    "idx_list_y    = label.loc[has_1yr_label, 'Hybridization REF']\n",
    "\n",
    "# Id column of each view.\n",
    "idx_list1 = df_pca1['Hybridization REF']\n",
    "idx_list2 = df_pca2['Hybridization REF']\n",
    "idx_list3 = df_pca3['Hybridization REF']\n",
    "idx_list4 = df_pca4['Hybridization REF']\n",
    "\n",
    "# Sorted unique ids that occur in at least one of the four views.\n",
    "all_ids = []\n",
    "for ids in (idx_list1, idx_list2, idx_list3, idx_list4):\n",
    "    all_ids.extend(ids.tolist())\n",
    "idx_list_x = np.unique(all_ids)"
   ]
  },
481
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ids that have a 1-year label AND occur in at least one omics view\n",
    "# (superset of samples that has at least one omics available).\n",
    "idx_list = np.intersect1d(idx_list_x, idx_list_y)\n",
    "df       = pd.DataFrame({'Hybridization REF': idx_list})"
   ]
  },
491
  {
492
   "cell_type": "markdown",
493
   "metadata": {},
494
   "source": [
495
    "### FINAL DATASET"
496
   ]
497
  },
498
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Left-join every view onto the common id table: each id of df is kept, and a view\n",
    "# that lacks an id contributes NaN for it.\n",
    "df1, df2, df3, df4 = (\n",
    "    df.merge(df_view, how='left', on='Hybridization REF')\n",
    "    for df_view in (df_pca1, df_pca2, df_pca3, df_pca4)\n",
    ")\n",
    "\n",
    "# Same join for the 1-year mortality label.\n",
    "dfy = df.merge(label[['Hybridization REF','1yr-mortality']], how='left', on='Hybridization REF')"
   ]
  },
511
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store one array per view plus the label, all without the leading id column.\n",
    "# df1..df4 follow the order in which the kPCA files were read\n",
    "# (mRNAseq, Methylation, miRNAseq, RPPA); dfy holds the 1-year mortality label.\n",
    "# Previously every entry was taken from df1, so all five arrays were the mRNAseq matrix.\n",
    "np.savez(\n",
    "    './FINAL/multi_omics_1yr_mortality.npz',\n",
    "    mRNAseq     = np.asarray(df1.iloc[:, 1:]),\n",
    "    Methylation = np.asarray(df2.iloc[:, 1:]),\n",
    "    miRNAseq    = np.asarray(df3.iloc[:, 1:]),\n",
    "    RPPA        = np.asarray(df4.iloc[:, 1:]),\n",
    "    label       = np.asarray(dfy.iloc[:, 1:])\n",
    ")"
   ]
  }
527
 ],
528
 "metadata": {
529
  "kernelspec": {
530
   "display_name": "Python 3",
531
   "language": "python",
532
   "name": "python3"
533
  },
534
  "language_info": {
535
   "codemirror_mode": {
536
    "name": "ipython",
537
    "version": 3
538
   },
539
   "file_extension": ".py",
540
   "mimetype": "text/x-python",
541
   "name": "python",
542
   "nbconvert_exporter": "python",
543
   "pygments_lexer": "ipython3",
544
   "version": "3.7.9"
545
  }
546
 },
547
 "nbformat": 4,
548
 "nbformat_minor": 4
549
}