DeepIMV / Git / Diff of /data_processing

Models:
AlyssaS/
DeepIMV
Downloads: 1
Diff of /data_processing_TCGA.ipynb [000000] .. [0f2bcf]
Switch to side-by-side view

--- a
+++ b/data_processing_TCGA.ipynb
@@ -0,0 +1,549 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tumor-type codes substituted into the per-tumor input paths in the cells below.\n",
+    "tumor_list = [\n",
+    "    'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'COADREAD', 'DLBC',\n",
+    "    'ESCA', 'FPPP', 'GBM', 'GBMLGG', 'HNSC', 'KICH', 'KIPAN', 'KIRC',\n",
+    "    'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV',\n",
+    "    'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'STES',\n",
+    "    'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM',\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RPPA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## 1. FIND SUPERSET OF RPPA FEATURES\n",
+    "# First pass: collect each tumor type's feature names, their running\n",
+    "# intersection (final_feat_list) and their union (sup_feat_list).\n",
+    "id_col          = 'Composite.Element.REF'\n",
+    "feat_list       = {}\n",
+    "final_feat_list = None   # stays None until the first input file is found\n",
+    "sup_feat_list   = []\n",
+    "for tumor in tumor_list:\n",
+    "    filepath = './RPPA/gdac.broadinstitute.org_{}.RPPA_AnnotateWithGene.Level_3.2016012800.0.0/'.format(tumor)\n",
+    "    filename = '{}.rppa.txt'.format(tumor)\n",
+    "\n",
+    "    if os.path.exists(filepath + filename):\n",
+    "        tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
+    "\n",
+    "        # keep only the first 15 characters of every sample column name\n",
+    "        tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
+    "        # transpose so that each row is one sample; the first transposed row holds\n",
+    "        # the ID-column name followed by the feature names and becomes the header\n",
+    "        tmp         = tmp.T.reset_index()\n",
+    "        tmp.columns = tmp.iloc[0, 0:]\n",
+    "        tmp         = tmp.iloc[1:, :].reset_index(drop=True)\n",
+    "\n",
+    "        feat_list[tumor] = list(tmp)[1:]\n",
+    "\n",
+    "        # initialise from the first file that actually exists instead of assuming\n",
+    "        # that the 'ACC' file is present (and instead of reusing stale kernel state)\n",
+    "        if final_feat_list is None:\n",
+    "            final_feat_list = feat_list[tumor].copy()\n",
+    "        else:\n",
+    "            final_feat_list = np.intersect1d(final_feat_list, feat_list[tumor])\n",
+    "        sup_feat_list += feat_list[tumor]\n",
+    "\n",
+    "sup_feat_list = np.unique(sup_feat_list).tolist()\n",
+    "\n",
+    "# Second pass: put every tumor type on the superset of columns (features a tumor\n",
+    "# type does not have stay missing) and stack all samples.\n",
+    "frames = []\n",
+    "for tumor in tumor_list:\n",
+    "    filepath = './RPPA/gdac.broadinstitute.org_{}.RPPA_AnnotateWithGene.Level_3.2016012800.0.0/'.format(tumor)\n",
+    "    filename = '{}.rppa.txt'.format(tumor)\n",
+    "\n",
+    "    if os.path.exists(filepath + filename):\n",
+    "        tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
+    "\n",
+    "        tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
+    "        tmp         = tmp.T.reset_index()\n",
+    "        tmp.columns = tmp.iloc[0, 0:]\n",
+    "        tmp         = tmp.iloc[1:, :].reset_index(drop=True)\n",
+    "\n",
+    "        tmp_ = pd.DataFrame([], columns=[id_col] + sup_feat_list)\n",
+    "        tmp_[[id_col] + feat_list[tumor]] = tmp[[id_col] + feat_list[tumor]]\n",
+    "        frames.append(tmp_)\n",
+    "\n",
+    "if not frames:\n",
+    "    raise FileNotFoundError('No RPPA input file was found under ./RPPA/')\n",
+    "\n",
+    "# a single concat instead of growing the frame inside the loop\n",
+    "final_df = pd.concat(frames, axis=0)\n",
+    "final_df = final_df.drop_duplicates(subset=[id_col]).reset_index(drop=True)\n",
+    "final_df.to_csv('./FINAL/RPPA.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# miRNA Seq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## 1. FIND SUPERSET OF miRNASeq FEATURES\n",
+    "# First pass: collect each tumor type's feature names, their running\n",
+    "# intersection (final_feat_list) and their union (sup_feat_list).\n",
+    "id_col          = 'gene'\n",
+    "feat_list       = {}\n",
+    "final_feat_list = None   # stays None until the first input file is found\n",
+    "sup_feat_list   = []\n",
+    "for tumor in tumor_list:\n",
+    "    filepath = './miRNAseq/gdac.broadinstitute.org_{}.miRseq_Preprocess.Level_3.2016012800.0.0/'.format(tumor)\n",
+    "    filename = '{}.miRseq_RPKM_log2.txt'.format(tumor)\n",
+    "\n",
+    "    if os.path.exists(filepath + filename):\n",
+    "        tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
+    "\n",
+    "        # keep only the first 15 characters of every sample column name\n",
+    "        tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
+    "        # transpose so that each row is one sample; the first transposed row holds\n",
+    "        # the ID-column name followed by the feature names and becomes the header\n",
+    "        tmp         = tmp.T.reset_index()\n",
+    "        tmp.columns = tmp.iloc[0, 0:]\n",
+    "        tmp         = tmp.iloc[1:, :].reset_index(drop=True)\n",
+    "\n",
+    "        feat_list[tumor] = list(tmp)[1:]\n",
+    "\n",
+    "        # initialise from the first file that actually exists instead of assuming\n",
+    "        # that the 'ACC' file is present (and instead of reusing stale kernel state)\n",
+    "        if final_feat_list is None:\n",
+    "            final_feat_list = feat_list[tumor].copy()\n",
+    "        else:\n",
+    "            final_feat_list = np.intersect1d(final_feat_list, feat_list[tumor])\n",
+    "        sup_feat_list += feat_list[tumor]\n",
+    "\n",
+    "sup_feat_list = np.unique(sup_feat_list).tolist()\n",
+    "\n",
+    "# Second pass: put every tumor type on the superset of columns (features a tumor\n",
+    "# type does not have stay missing) and stack all samples.\n",
+    "frames = []\n",
+    "for tumor in tumor_list:\n",
+    "    filepath = './miRNAseq/gdac.broadinstitute.org_{}.miRseq_Preprocess.Level_3.2016012800.0.0/'.format(tumor)\n",
+    "    filename = '{}.miRseq_RPKM_log2.txt'.format(tumor)\n",
+    "\n",
+    "    if os.path.exists(filepath + filename):\n",
+    "        tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
+    "\n",
+    "        tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
+    "        tmp         = tmp.T.reset_index()\n",
+    "        tmp.columns = tmp.iloc[0, 0:]\n",
+    "        tmp         = tmp.iloc[1:, :].reset_index(drop=True)\n",
+    "\n",
+    "        tmp_ = pd.DataFrame([], columns=[id_col] + sup_feat_list)\n",
+    "        tmp_[[id_col] + feat_list[tumor]] = tmp[[id_col] + feat_list[tumor]]\n",
+    "        frames.append(tmp_)\n",
+    "\n",
+    "if not frames:\n",
+    "    raise FileNotFoundError('No miRNAseq input file was found under ./miRNAseq/')\n",
+    "\n",
+    "# a single concat instead of growing the frame inside the loop\n",
+    "final_df = pd.concat(frames, axis=0)\n",
+    "final_df = final_df.drop_duplicates(subset=[id_col]).reset_index(drop=True)\n",
+    "final_df.to_csv('./FINAL/miRNAseq_RPKM_log2.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# METHYLATION"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## 1. FIND SUPERSET OF METHYLATION FEATURES\n",
+    "# First pass: collect each tumor type's feature names, their running\n",
+    "# intersection (final_feat_list) and their union (sup_feat_list).\n",
+    "id_col          = 'Hybridization REF'\n",
+    "feat_list       = {}\n",
+    "final_feat_list = None   # stays None until the first input file is found\n",
+    "sup_feat_list   = []\n",
+    "for tumor in tumor_list:\n",
+    "    filepath = './methylation/gdac.broadinstitute.org_{}.Methylation_Preprocess.Level_3.2016012800.0.0/'.format(tumor)\n",
+    "    filename = '{}.meth.by_mean.data.txt'.format(tumor)\n",
+    "\n",
+    "    if os.path.exists(filepath + filename):\n",
+    "        tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
+    "        # drop the first data row so it does not end up in the feature list\n",
+    "        # (presumably a secondary header row - TODO confirm against the raw file)\n",
+    "        tmp = tmp.iloc[1:, :].reset_index(drop=True)\n",
+    "\n",
+    "        # keep only the first 15 characters of every sample column name\n",
+    "        tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
+    "        # transpose so that each row is one sample; the first transposed row holds\n",
+    "        # the ID-column name followed by the feature names and becomes the header\n",
+    "        tmp         = tmp.T.reset_index()\n",
+    "        tmp.columns = tmp.iloc[0, 0:]\n",
+    "        tmp         = tmp.iloc[1:, :].reset_index(drop=True)\n",
+    "\n",
+    "        feat_list[tumor] = list(tmp)[1:]\n",
+    "\n",
+    "        # initialise from the first file that actually exists instead of assuming\n",
+    "        # that the 'ACC' file is present (and instead of reusing stale kernel state)\n",
+    "        if final_feat_list is None:\n",
+    "            final_feat_list = feat_list[tumor].copy()\n",
+    "        else:\n",
+    "            final_feat_list = np.intersect1d(final_feat_list, feat_list[tumor])\n",
+    "        sup_feat_list += feat_list[tumor]\n",
+    "\n",
+    "sup_feat_list = np.unique(sup_feat_list).tolist()\n",
+    "\n",
+    "# Second pass: put every tumor type on the superset of columns (features a tumor\n",
+    "# type does not have stay missing) and stack all samples.\n",
+    "frames = []\n",
+    "for tumor in tumor_list:\n",
+    "    filepath = './methylation/gdac.broadinstitute.org_{}.Methylation_Preprocess.Level_3.2016012800.0.0/'.format(tumor)\n",
+    "    filename = '{}.meth.by_mean.data.txt'.format(tumor)\n",
+    "\n",
+    "    if os.path.exists(filepath + filename):\n",
+    "        # the row dropped in the first pass is kept here; it only yields one extra\n",
+    "        # column, and only the columns listed in feat_list[tumor] are copied below\n",
+    "        tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
+    "\n",
+    "        tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
+    "        tmp         = tmp.T.reset_index()\n",
+    "        tmp.columns = tmp.iloc[0, 0:]\n",
+    "        tmp         = tmp.iloc[1:, :].reset_index(drop=True)\n",
+    "\n",
+    "        tmp_ = pd.DataFrame([], columns=[id_col] + sup_feat_list)\n",
+    "        tmp_[[id_col] + feat_list[tumor]] = tmp[[id_col] + feat_list[tumor]]\n",
+    "        frames.append(tmp_)\n",
+    "\n",
+    "if not frames:\n",
+    "    raise FileNotFoundError('No methylation input file was found under ./methylation/')\n",
+    "\n",
+    "# a single concat instead of growing the frame inside the loop\n",
+    "final_df = pd.concat(frames, axis=0)\n",
+    "final_df = final_df.drop_duplicates(subset=[id_col]).reset_index(drop=True)\n",
+    "final_df.to_csv('./FINAL/methylation.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MAKE MULTI-VIEW OBSERVATION FILE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _to_patient_id(barcode):\n",
+    "    # lower-case the sample ID and cut off its last three characters\n",
+    "    return barcode.lower()[:-3]\n",
+    "\n",
+    "# mRNAseq: drop repeated IDs and rows that repeat the header, then rename the ID column\n",
+    "mRNAseq = pd.read_csv('./FINAL/mRNAseq_RSEM.csv')\n",
+    "mRNAseq = mRNAseq.drop_duplicates(subset=['HYBRIDIZATION R']).reset_index(drop=True)\n",
+    "is_header_row = mRNAseq['HYBRIDIZATION R'] == 'HYBRIDIZATION R'\n",
+    "mRNAseq = mRNAseq[~is_header_row].reset_index(drop=True)\n",
+    "mRNAseq = mRNAseq.rename(columns={'HYBRIDIZATION R':'Hybridization REF'})\n",
+    "mRNAseq['Hybridization REF'] = mRNAseq['Hybridization REF'].apply(_to_patient_id)\n",
+    "\n",
+    "# RPPA: its ID column is called 'Composite.Element.REF'\n",
+    "RPPA = pd.read_csv('./FINAL/RPPA.csv').rename(columns={'Composite.Element.REF':'Hybridization REF'})\n",
+    "RPPA['Hybridization REF'] = RPPA['Hybridization REF'].apply(_to_patient_id)\n",
+    "\n",
+    "# methylation: the ID column already has the common name\n",
+    "methylation = pd.read_csv('./FINAL/methylation.csv')\n",
+    "methylation['Hybridization REF'] = methylation['Hybridization REF'].apply(_to_patient_id)\n",
+    "\n",
+    "# miRNAseq: its ID column is called 'gene'\n",
+    "miRNAseq = pd.read_csv('./FINAL/miRNAseq_RPKM_log2.csv').rename(columns={'gene':'Hybridization REF'})\n",
+    "miRNAseq['Hybridization REF'] = miRNAseq['Hybridization REF'].apply(_to_patient_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _complete_columns(frame):\n",
+    "    # names of the columns of `frame` that contain no missing value\n",
+    "    names = np.asarray(list(frame))\n",
+    "    return names[frame.isna().sum(axis=0) == 0]\n",
+    "\n",
+    "# one row per (truncated) sample ID in every view\n",
+    "mRNAseq     = mRNAseq.drop_duplicates(subset=['Hybridization REF'])\n",
+    "RPPA        = RPPA.drop_duplicates(subset=['Hybridization REF'])\n",
+    "methylation = methylation.drop_duplicates(subset=['Hybridization REF'])\n",
+    "miRNAseq    = miRNAseq.drop_duplicates(subset=['Hybridization REF'])\n",
+    "\n",
+    "# keep only the fully observed columns of every view\n",
+    "mRNAseq     = mRNAseq[_complete_columns(mRNAseq)]\n",
+    "RPPA        = RPPA[_complete_columns(RPPA)]\n",
+    "methylation = methylation[_complete_columns(methylation)]\n",
+    "miRNAseq    = miRNAseq[_complete_columns(miRNAseq)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Clinical labels: the column names are read from the second line of the file (header=1).\n",
+    "label = pd.read_csv('./FINAL/clinical_label.csv', header=1)\n",
+    "label = label.sort_values(by='Hybridization REF').reset_index(drop=True)\n",
+    "\n",
+    "# keep the rows whose ID contains 'tcga'; for repeated IDs the last row wins\n",
+    "has_tcga_id = label['Hybridization REF'].apply(lambda x: 'tcga' in x)\n",
+    "label = label[has_tcga_id]\n",
+    "label = label.drop_duplicates(subset=['Hybridization REF'], keep ='last')\n",
+    "label = label.reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Some of the patients had shifted columns for some reason.\n",
+    "# Manually corrected these errors.\n",
+    "#\n",
+    "# The affected rows are selected ONCE per marker value, before any column is\n",
+    "# overwritten. Re-evaluating the condition after 'days_to_last_followup' has been\n",
+    "# replaced selects no rows, which would leave 'days_to_death' and 'vital_status'\n",
+    "# uncorrected.\n",
+    "for marker in ['endometrial', 'other  specify']:\n",
+    "    shifted = label['days_to_last_followup'] == marker\n",
+    "    label.loc[shifted, 'days_to_last_followup'] = label.loc[shifted, 'days_to_death']\n",
+    "    label.loc[shifted, 'days_to_death']         = label.loc[shifted, 'vital_status']\n",
+    "    label.loc[shifted, 'vital_status']          = label.loc[shifted, 'years_to_birth']\n",
+    "\n",
+    "followup_days = label['days_to_last_followup'].astype(float)\n",
+    "death_days    = label['days_to_death'].astype(float)\n",
+    "\n",
+    "# N-year mortality: -1 = unknown, 0 = followed up for at least N years,\n",
+    "# 1 = death recorded within N years (assigned last, so it takes precedence over 0).\n",
+    "for years in [1, 3, 5]:\n",
+    "    column = '{}yr-mortality'.format(years)\n",
+    "    label[column] = -1.\n",
+    "    label.loc[followup_days >= years*365, column] = 0.\n",
+    "    label.loc[death_days <= years*365, column] = 1."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Kernel PCA Dimensionality Reduction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.decomposition import PCA, SparsePCA, KernelPCA\n",
+    "\n",
+    "# Reduce every view to z_dim polynomial-kernel PCA components and write one CSV per view.\n",
+    "# Drop-in alternatives for the same loop:\n",
+    "#   PCA(n_components=z_dim, random_state=1234)       written to './FINAL/cleaned/{}_pca.csv'\n",
+    "#   SparsePCA(n_components=z_dim, random_state=1234) written to './FINAL/cleaned/{}_spca.csv'\n",
+    "z_dim = 100\n",
+    "view_frames = [('RPPA', RPPA), ('miRNAseq', miRNAseq), ('Methylation', methylation), ('mRNAseq', mRNAseq)]\n",
+    "\n",
+    "for view, frame in view_frames:\n",
+    "    print(view)\n",
+    "    df = frame.copy(deep=True)\n",
+    "\n",
+    "    # column 0 is the sample ID; every remaining column is used as a feature\n",
+    "    features = np.asarray(df.iloc[:, 1:])\n",
+    "    pca      = KernelPCA(kernel='poly', n_components=z_dim, random_state=1234)\n",
+    "    z        = pca.fit_transform(features)\n",
+    "\n",
+    "    df_pca = pd.DataFrame(z, index=df['Hybridization REF']).reset_index()\n",
+    "    df_pca.to_csv('./FINAL/cleaned/{}_kpca.csv'.format(view), index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# CREATE MULTI-VIEW DATASET"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the kernel-PCA output of every view back from disk;\n",
+    "# df_pca1..df_pca4 follow the order of the list below.\n",
+    "kpca_frames = []\n",
+    "for view in ['mRNAseq', 'Methylation', 'miRNAseq', 'RPPA']:\n",
+    "    kpca_frames.append(pd.read_csv('./FINAL/cleaned/{}_kpca.csv'.format(view)))\n",
+    "\n",
+    "df_pca1, df_pca2, df_pca3, df_pca4 = kpca_frames"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### CREATE 1-Yr Mortality Dataset. (Censored samples are removed...)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# IDs whose 1-year label is not the -1 placeholder\n",
+    "has_label  = label['1yr-mortality'] != -1\n",
+    "idx_list_y = label.loc[has_label, 'Hybridization REF']\n",
+    "\n",
+    "idx_list1, idx_list2, idx_list3, idx_list4 = [\n",
+    "    frame['Hybridization REF'] for frame in (df_pca1, df_pca2, df_pca3, df_pca4)\n",
+    "]\n",
+    "\n",
+    "# sorted unique IDs that occur in any of the four views\n",
+    "all_ids = []\n",
+    "for ids in (idx_list1, idx_list2, idx_list3, idx_list4):\n",
+    "    all_ids.extend(ids.tolist())\n",
+    "idx_list_x = np.unique(all_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# samples that have at least one omics view available AND a 1-year label\n",
+    "idx_list = np.intersect1d(idx_list_x, idx_list_y)\n",
+    "df       = pd.DataFrame(data=idx_list, columns=['Hybridization REF'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### FINAL DATASET"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Left-join every view and the label onto the common sample list, so all results\n",
+    "# share the row order of `df`; samples missing from a view get NaN features.\n",
+    "merge_key = 'Hybridization REF'\n",
+    "df1 = df.merge(df_pca1, how='left', on=merge_key)\n",
+    "df2 = df.merge(df_pca2, how='left', on=merge_key)\n",
+    "df3 = df.merge(df_pca3, how='left', on=merge_key)\n",
+    "df4 = df.merge(df_pca4, how='left', on=merge_key)\n",
+    "dfy = df.merge(label[[merge_key, '1yr-mortality']], how='left', on=merge_key)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save each view from its own merged frame (df1..df4) and the label from dfy.\n",
+    "# Column 0 of every frame is the sample ID and is left out of the arrays.\n",
+    "np.savez(\n",
+    "    './FINAL/multi_omics_1yr_mortality.npz',\n",
+    "    mRNAseq     = np.asarray(df1.iloc[:, 1:]),\n",
+    "    Methylation = np.asarray(df2.iloc[:, 1:]),\n",
+    "    miRNAseq    = np.asarray(df3.iloc[:, 1:]),\n",
+    "    RPPA        = np.asarray(df4.iloc[:, 1:]),\n",
+    "    label       = np.asarray(dfy.iloc[:, 1:])\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}