--- a +++ b/data_processing_TCGA.ipynb @@ -0,0 +1,549 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cohort codes substituted into the per-cohort directory / file names below.\n",
+    "# A cohort whose file does not exist on disk is skipped.\n",
+    "tumor_list = [\n",
+    "'ACC',\n",
+    "'BLCA',\n",
+    "'BRCA',\n",
+    "'CESC',\n",
+    "'CHOL',\n",
+    "'COAD',\n",
+    "'COADREAD',\n",
+    "'DLBC',\n",
+    "'ESCA',\n",
+    "'FPPP',\n",
+    "'GBM',\n",
+    "'GBMLGG',\n",
+    "'HNSC',\n",
+    "'KICH',\n",
+    "'KIPAN',\n",
+    "'KIRC',\n",
+    "'KIRP',\n",
+    "'LAML',\n",
+    "'LGG',\n",
+    "'LIHC',\n",
+    "'LUAD',\n",
+    "'LUSC',\n",
+    "'MESO',\n",
+    "'OV',\n",
+    "'PAAD',\n",
+    "'PCPG',\n",
+    "'PRAD',\n",
+    "'READ',\n",
+    "'SARC',\n",
+    "'SKCM',\n",
+    "'STAD',\n",
+    "'STES',\n",
+    "'TGCT',\n",
+    "'THCA',\n",
+    "'THYM',\n",
+    "'UCEC',\n",
+    "'UCS',\n",
+    "'UVM']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_cohort_table(path):\n",
+    "    '''\n",
+    "        Read one tab-separated cohort file and return it transposed.\n",
+    "\n",
+    "        Every header name after the first is cut to its first 15 characters,\n",
+    "        then the table is flipped so that each original column becomes a row.\n",
+    "        The column labels of the result are the header of the original first\n",
+    "        column followed by the values that column contained.\n",
+    "    '''\n",
+    "    raw = pd.read_csv(path, sep='\\t')\n",
+    "    raw.columns = [raw.columns[0]] + [name[:15] for name in raw.columns[1:]]\n",
+    "    flipped = raw.T.reset_index()\n",
+    "    flipped.columns = flipped.iloc[0, :]  # after the flip the first row carries the labels\n",
+    "    return flipped.iloc[1:, :].reset_index(drop=True)\n",
+    "\n",
+    "\n",
+    "def build_superset_table(dir_template, file_template, id_col, n_skipped_features=0):\n",
+    "    '''\n",
+    "        Stack every cohort file that exists into one frame over the union of features.\n",
+    "\n",
+    "        Parameters\n",
+    "        ----------\n",
+    "        dir_template, file_template : str\n",
+    "            Directory and file-name patterns; '{}' is replaced by the cohort code.\n",
+    "        id_col : str\n",
+    "            Label of the identifier column (header of the first column of the raw file).\n",
+    "        n_skipped_features : int\n",
+    "            Number of leading columns after id_col that are not counted as features.\n",
+    "\n",
+    "        Returns\n",
+    "        -------\n",
+    "        pandas.DataFrame\n",
+    "            Columns are [id_col] + the sorted union of all features. A feature that a\n",
+    "            cohort does not have is NaN for that cohort's rows. Only the first row of\n",
+    "            each id is kept.\n",
+    "\n",
+    "        Raises\n",
+    "        ------\n",
+    "        FileNotFoundError\n",
+    "            If no cohort in tumor_list has the requested file.\n",
+    "    '''\n",
+    "    frames = []\n",
+    "    sup_feat_list = []\n",
+    "    for tumor in tumor_list:\n",
+    "        path = dir_template.format(tumor) + file_template.format(tumor)\n",
+    "        if not os.path.exists(path):\n",
+    "            continue\n",
+    "        table = read_cohort_table(path)\n",
+    "        feats = list(table)[1 + n_skipped_features:]\n",
+    "        frames.append(table[[id_col] + feats])\n",
+    "        sup_feat_list += feats\n",
+    "\n",
+    "    # Fail loudly instead of depending on one particular cohort ('ACC') being present\n",
+    "    # or silently reusing a frame left over from a previously executed cell.\n",
+    "    if not frames:\n",
+    "        raise FileNotFoundError('no input file matches ' + dir_template + file_template)\n",
+    "\n",
+    "    columns = [id_col] + np.unique(sup_feat_list).tolist()\n",
+    "    # Each file is parsed once and all cohorts are concatenated in a single call.\n",
+    "    stacked = pd.concat([frame.reindex(columns=columns) for frame in frames], axis=0)\n",
+    "    return stacked.drop_duplicates(subset=[id_col]).reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RPPA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## 1. FIND SUPERSET OF RPPA FEATURES\n",
+    "rppa_all = build_superset_table(\n",
+    "    './RPPA/gdac.broadinstitute.org_{}.RPPA_AnnotateWithGene.Level_3.2016012800.0.0/',\n",
+    "    '{}.rppa.txt',\n",
+    "    id_col='Composite.Element.REF',\n",
+    ")\n",
+    "rppa_all.to_csv('./FINAL/RPPA.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# miRNA Seq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## 1. FIND SUPERSET OF miRNASeq FEATURES\n",
+    "mirna_all = build_superset_table(\n",
+    "    './miRNAseq/gdac.broadinstitute.org_{}.miRseq_Preprocess.Level_3.2016012800.0.0/',\n",
+    "    '{}.miRseq_RPKM_log2.txt',\n",
+    "    id_col='gene',\n",
+    ")\n",
+    "mirna_all.to_csv('./FINAL/miRNAseq_RPKM_log2.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# METHYLATION"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## 1. FIND SUPERSET OF METHYLATION FEATURES\n",
+    "# The first row under the header of each methylation file is not counted as a\n",
+    "# feature (presumably a secondary header row -- TODO confirm against the raw files).\n",
+    "methylation_all = build_superset_table(\n",
+    "    './methylation/gdac.broadinstitute.org_{}.Methylation_Preprocess.Level_3.2016012800.0.0/',\n",
+    "    '{}.meth.by_mean.data.txt',\n",
+    "    id_col='Hybridization REF',\n",
+    "    n_skipped_features=1,\n",
+    ")\n",
+    "methylation_all.to_csv('./FINAL/methylation.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MAKE MULTI-VIEW OBSERVATION FILE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mRNAseq = pd.read_csv('./FINAL/mRNAseq_RSEM.csv')\n",
+    "mRNAseq = mRNAseq.drop_duplicates(subset=['HYBRIDIZATION R']).reset_index(drop=True)\n",
+    "mRNAseq = mRNAseq[mRNAseq['HYBRIDIZATION R'] != 'HYBRIDIZATION R'].reset_index(drop=True)\n",
+    "mRNAseq = mRNAseq.rename(columns={'HYBRIDIZATION R':'Hybridization REF'})\n",
+    "mRNAseq['Hybridization REF'] = mRNAseq['Hybridization REF'].apply(lambda x: x.lower()[:-3])\n",
+    "\n",
+    "RPPA = pd.read_csv('./FINAL/RPPA.csv')\n",
+    "RPPA = RPPA.rename(columns={'Composite.Element.REF':'Hybridization REF'})\n",
+    "RPPA['Hybridization REF'] = RPPA['Hybridization REF'].apply(lambda x: x.lower()[:-3])\n",
+    "\n",
+    "methylation = pd.read_csv('./FINAL/methylation.csv')\n",
+    "methylation['Hybridization REF'] = methylation['Hybridization REF'].apply(lambda x: x.lower()[:-3])\n",
+    "\n",
+    "miRNAseq = pd.read_csv('./FINAL/miRNAseq_RPKM_log2.csv')\n",
+
"miRNAseq = miRNAseq.rename(columns={'gene':'Hybridization REF'})\n",
+    "miRNAseq['Hybridization REF'] = miRNAseq['Hybridization REF'].apply(lambda x: x.lower()[:-3])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def unique_complete(view_df):\n",
+    "    '''\n",
+    "        Keep the first row of every 'Hybridization REF' value, then drop each\n",
+    "        column that still contains at least one missing value.\n",
+    "    '''\n",
+    "    return view_df.drop_duplicates(subset=['Hybridization REF']).dropna(axis=1)\n",
+    "\n",
+    "\n",
+    "mRNAseq = unique_complete(mRNAseq)\n",
+    "RPPA = unique_complete(RPPA)\n",
+    "methylation = unique_complete(methylation)\n",
+    "miRNAseq = unique_complete(miRNAseq)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# header=1: the column names are taken from the second line of the file.\n",
+    "label = pd.read_csv('./FINAL/clinical_label.csv', header=1)\n",
+    "label = label.sort_values(by='Hybridization REF').reset_index(drop=True)\n",
+    "# Keep only the rows whose id contains 'tcga'; for a repeated id the last row (in sorted order) wins.\n",
+    "label = label[label['Hybridization REF'].apply(lambda x: 'tcga' in x)].drop_duplicates(subset=['Hybridization REF'], keep ='last').reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "'''\n",
+    "    Some of the patients had shifted columns for some reason.\n",
+    "    Manually corrected these errors.\n",
+    "'''\n",
+    "\n",
+    "# The affected rows are recognised by a text value in 'days_to_last_followup'.\n",
+    "# The row selection is computed once, BEFORE anything is overwritten: evaluating the\n",
+    "# comparison again after the first assignment matches no row any more, which would\n",
+    "# leave 'days_to_death' and 'vital_status' of those rows uncorrected.\n",
+    "for stray_value in ['endometrial', 'other specify']:\n",
+    "    shifted = label['days_to_last_followup'] == stray_value\n",
+    "    label.loc[shifted, 'days_to_last_followup'] = label.loc[shifted, 'days_to_death']\n",
+    "    label.loc[shifted, 'days_to_death'] = label.loc[shifted, 'vital_status']\n",
+    "    label.loc[shifted, 'vital_status'] = label.loc[shifted, 'years_to_birth']\n",
+    "\n",
+    "followup_days = label['days_to_last_followup'].astype(float)\n",
+    "death_days = label['days_to_death'].astype(float)\n",
+    "\n",
+    "# -1 = neither condition holds, 0 = last follow-up at or beyond the horizon,\n",
+    "#  1 = death recorded within the horizon (assigned last, so it overrides a 0).\n",
+    "for n_years in [1, 3, 5]:\n",
+    "    col = '{}yr-mortality'.format(n_years)\n",
+    "    label[col] = -1.\n",
+    "    label.loc[followup_days >= n_years * 365, col] = 0.\n",
+    "    label.loc[death_days <= n_years * 365, col] = 1."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Kernel PCA Dimensionality Reduction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.decomposition import PCA, SparsePCA, KernelPCA\n",
+    "\n",
+    "Z_DIM = 100\n",
+    "OUT_DIR = './FINAL/cleaned'\n",
+    "os.makedirs(OUT_DIR, exist_ok=True)  # to_csv does not create missing directories\n",
+    "\n",
+    "# The key is the name used in the output file; the order is the processing order.\n",
+    "view_frames = {\n",
+    "    'RPPA': RPPA,\n",
+    "    'miRNAseq': miRNAseq,\n",
+    "    'Methylation': methylation,\n",
+    "    'mRNAseq': mRNAseq,\n",
+    "}\n",
+    "\n",
+    "\n",
+    "def reduce_views(make_reducer, suffix):\n",
+    "    '''\n",
+    "        Fit a fresh reducer on every view and write '<OUT_DIR>/<view>_<suffix>.csv'.\n",
+    "\n",
+    "        make_reducer : zero-argument callable returning an object with fit_transform.\n",
+    "        suffix       : tag placed in the output file name.\n",
+    "\n",
+    "        The first column of each view frame is left out of the reducer input; the\n",
+    "        'Hybridization REF' values become the first column of the written file.\n",
+    "    '''\n",
+    "    for view, view_df in view_frames.items():\n",
+    "        print(view)\n",
+    "        z = make_reducer().fit_transform(np.asarray(view_df.iloc[:, 1:]))\n",
+    "        df_pca = pd.DataFrame(z, index=view_df['Hybridization REF']).reset_index()\n",
+    "        df_pca.to_csv('{}/{}_{}.csv'.format(OUT_DIR, view, suffix), index=False)\n",
+    "\n",
+    "\n",
+    "reduce_views(lambda: KernelPCA(kernel='poly', n_components=Z_DIM, random_state=1234), 'kpca')\n",
+    "\n",
+    "# Alternative reducers (disabled):\n",
+    "# reduce_views(lambda: PCA(n_components=Z_DIM, random_state=1234), 'pca')\n",
+    "# reduce_views(lambda: SparsePCA(n_components=Z_DIM, random_state=1234), 'spca')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# CREATE MULTI-VIEW DATASET"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_kpca(view):\n",
+    "    '''Read back the kernel-PCA table written for one view.'''\n",
+    "    return pd.read_csv('./FINAL/cleaned/{}_kpca.csv'.format(view))\n",
+    "\n",
+    "\n",
+    "df_pca1 = load_kpca('mRNAseq')\n",
+    "df_pca2 = load_kpca('Methylation')\n",
+    "df_pca3 = load_kpca('miRNAseq')\n",
+    "df_pca4 = load_kpca('RPPA')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### CREATE 1-Yr Mortality Dataset. 
(Censored samples are removed...)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ids whose 1-year label is not -1.\n",
+    "idx_list_y = label.loc[label['1yr-mortality'] != -1, 'Hybridization REF']\n",
+    "\n",
+    "# Ids that occur in at least one of the four reduced views.\n",
+    "all_view_ids = []\n",
+    "for view_df in (df_pca1, df_pca2, df_pca3, df_pca4):\n",
+    "    all_view_ids += view_df['Hybridization REF'].tolist()\n",
+    "idx_list_x = np.unique(all_view_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "idx_list = np.intersect1d(idx_list_x, idx_list_y)\n",
+    "df = pd.DataFrame(idx_list, columns=['Hybridization REF']) ##superset of samples that has at least one omics available."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### FINAL DATASET"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def align_to_samples(table):\n",
+    "    '''Left-join a table onto the selected ids: one row per id, NaN where the id is absent.'''\n",
+    "    return pd.merge(df, table, how='left', on='Hybridization REF')\n",
+    "\n",
+    "\n",
+    "df1 = align_to_samples(df_pca1)  # mRNAseq\n",
+    "df2 = align_to_samples(df_pca2)  # Methylation\n",
+    "df3 = align_to_samples(df_pca3)  # miRNAseq\n",
+    "df4 = align_to_samples(df_pca4)  # RPPA\n",
+    "dfy = align_to_samples(label[['Hybridization REF','1yr-mortality']])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Every array leaves out the first column (the id) and shares the row order of df.\n",
+    "# Each key must be fed from its own merged frame: df1..df4 for the views, dfy for the label.\n",
+    "np.savez(\n",
+    "    './FINAL/multi_omics_1yr_mortality.npz',\n",
+    "    mRNAseq     = np.asarray(df1.iloc[:, 1:]),\n",
+    "    Methylation = np.asarray(df2.iloc[:, 1:]),\n",
+    "    miRNAseq    = np.asarray(df3.iloc[:, 1:]),\n",
+    "    RPPA        = np.asarray(df4.iloc[:, 1:]),\n",
+    "    label       = np.asarray(dfy.iloc[:, 1:])\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+ "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}