550 lines (549 with data), 20.3 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cohort codes. Each code is substituted into the gdac.broadinstitute.org_<code>\n",
"# directory names and the <code>.* file names by the loading cells below.\n",
"tumor_list = [\n",
"    'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'COADREAD', 'DLBC',\n",
"    'ESCA', 'FPPP', 'GBM', 'GBMLGG', 'HNSC', 'KICH', 'KIPAN', 'KIRC',\n",
"    'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV',\n",
"    'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'STES',\n",
"    'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM',\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RPPA"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Merge the per-cohort RPPA tables into one samples-by-features table.\n",
"## The columns are the union (superset) of the features seen in any cohort;\n",
"## a feature that a cohort does not report is left as NaN for its samples.\n",
"id_col = 'Composite.Element.REF'\n",
"\n",
"# Read every cohort file once and keep the transposed table (one row per sample).\n",
"# Nothing here depends on a particular cohort being present or being first.\n",
"rppa_frames = {}\n",
"for tumor in tumor_list:\n",
"    filepath = './RPPA/gdac.broadinstitute.org_{}.RPPA_AnnotateWithGene.Level_3.2016012800.0.0/'.format(tumor)\n",
"    filename = '{}.rppa.txt'.format(tumor)\n",
"\n",
"    # Cohorts without a file on disk are skipped.\n",
"    if not os.path.exists(filepath + filename):\n",
"        continue\n",
"\n",
"    tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
"\n",
"    # Keep only the first 15 characters of every sample column name.\n",
"    tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
"    # Transpose; the former header row becomes the new column labels.\n",
"    tmp = tmp.T.reset_index()\n",
"    tmp.columns = tmp.iloc[0, 0:]\n",
"    rppa_frames[tumor] = tmp.iloc[1:, :].reset_index(drop=True)\n",
"\n",
"if not rppa_frames:\n",
"    raise FileNotFoundError('No RPPA files were found under ./RPPA/')\n",
"\n",
"# Per-cohort feature names and their sorted, de-duplicated union.\n",
"feat_list = {tumor: list(frame)[1:] for tumor, frame in rppa_frames.items()}\n",
"sup_feat_list = np.unique([f for feats in feat_list.values() for f in feats]).tolist()\n",
"\n",
"# Give every cohort the same column layout, then concatenate once.\n",
"final_df = pd.concat(\n",
"    [frame.reindex(columns=[id_col] + sup_feat_list) for frame in rppa_frames.values()],\n",
"    axis=0,\n",
")\n",
"final_df = final_df.drop_duplicates(subset=[id_col]).reset_index(drop=True)\n",
"final_df.to_csv('./FINAL/RPPA.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# miRNA Seq"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Merge the per-cohort miRNAseq tables into one samples-by-features table.\n",
"## The columns are the union (superset) of the features seen in any cohort;\n",
"## a feature that a cohort does not report is left as NaN for its samples.\n",
"id_col = 'gene'\n",
"\n",
"# Read every cohort file once and keep the transposed table (one row per sample).\n",
"# Nothing here depends on a particular cohort being present or being first.\n",
"mirna_frames = {}\n",
"for tumor in tumor_list:\n",
"    filepath = './miRNAseq/gdac.broadinstitute.org_{}.miRseq_Preprocess.Level_3.2016012800.0.0/'.format(tumor)\n",
"    filename = '{}.miRseq_RPKM_log2.txt'.format(tumor)\n",
"\n",
"    # Cohorts without a file on disk are skipped.\n",
"    if not os.path.exists(filepath + filename):\n",
"        continue\n",
"\n",
"    tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
"\n",
"    # Keep only the first 15 characters of every sample column name.\n",
"    tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
"    # Transpose; the former header row becomes the new column labels.\n",
"    tmp = tmp.T.reset_index()\n",
"    tmp.columns = tmp.iloc[0, 0:]\n",
"    mirna_frames[tumor] = tmp.iloc[1:, :].reset_index(drop=True)\n",
"\n",
"if not mirna_frames:\n",
"    raise FileNotFoundError('No miRNAseq files were found under ./miRNAseq/')\n",
"\n",
"# Per-cohort feature names and their sorted, de-duplicated union.\n",
"feat_list = {tumor: list(frame)[1:] for tumor, frame in mirna_frames.items()}\n",
"sup_feat_list = np.unique([f for feats in feat_list.values() for f in feats]).tolist()\n",
"\n",
"# Give every cohort the same column layout, then concatenate once.\n",
"final_df = pd.concat(\n",
"    [frame.reindex(columns=[id_col] + sup_feat_list) for frame in mirna_frames.values()],\n",
"    axis=0,\n",
")\n",
"final_df = final_df.drop_duplicates(subset=[id_col]).reset_index(drop=True)\n",
"final_df.to_csv('./FINAL/miRNAseq_RPKM_log2.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# METHYLATION"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Merge the per-cohort methylation tables into one samples-by-features table.\n",
"## The columns are the union (superset) of the features seen in any cohort;\n",
"## a feature that a cohort does not report is left as NaN for its samples.\n",
"id_col = 'Hybridization REF'\n",
"\n",
"# Read every cohort file once and keep the transposed table (one row per sample).\n",
"# Nothing here depends on a particular cohort being present or being first.\n",
"meth_frames = {}\n",
"for tumor in tumor_list:\n",
"    filepath = './methylation/gdac.broadinstitute.org_{}.Methylation_Preprocess.Level_3.2016012800.0.0/'.format(tumor)\n",
"    filename = '{}.meth.by_mean.data.txt'.format(tumor)\n",
"\n",
"    # Cohorts without a file on disk are skipped.\n",
"    if not os.path.exists(filepath + filename):\n",
"        continue\n",
"\n",
"    tmp = pd.read_csv(filepath + filename, sep='\\t')\n",
"    # The first data row is dropped before transposing, so its label never\n",
"    # becomes a feature column.\n",
"    # NOTE(review): presumably a secondary header row -- confirm against the file.\n",
"    tmp = tmp.iloc[1:, :].reset_index(drop=True)\n",
"\n",
"    # Keep only the first 15 characters of every sample column name.\n",
"    tmp.columns = [list(tmp)[0]] + [f[:15] for f in list(tmp)[1:]]\n",
"    # Transpose; the former header row becomes the new column labels.\n",
"    tmp = tmp.T.reset_index()\n",
"    tmp.columns = tmp.iloc[0, 0:]\n",
"    meth_frames[tumor] = tmp.iloc[1:, :].reset_index(drop=True)\n",
"\n",
"if not meth_frames:\n",
"    raise FileNotFoundError('No methylation files were found under ./methylation/')\n",
"\n",
"# Per-cohort feature names and their sorted, de-duplicated union.\n",
"feat_list = {tumor: list(frame)[1:] for tumor, frame in meth_frames.items()}\n",
"sup_feat_list = np.unique([f for feats in feat_list.values() for f in feats]).tolist()\n",
"\n",
"# Give every cohort the same column layout, then concatenate once.\n",
"final_df = pd.concat(\n",
"    [frame.reindex(columns=[id_col] + sup_feat_list) for frame in meth_frames.values()],\n",
"    axis=0,\n",
")\n",
"final_df = final_df.drop_duplicates(subset=[id_col]).reset_index(drop=True)\n",
"final_df.to_csv('./FINAL/methylation.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MAKE MULTI-VIEW OBSERVATION FILE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the four merged omics tables. In each one the sample-id column is renamed\n",
"# to 'Hybridization REF' (where needed), lower-cased, and stripped of its last\n",
"# three characters so that ids are comparable across tables.\n",
"mRNAseq = (\n",
"    pd.read_csv('./FINAL/mRNAseq_RSEM.csv')\n",
"    .drop_duplicates(subset=['HYBRIDIZATION R'])\n",
"    .reset_index(drop=True)\n",
")\n",
"# Discard rows whose id cell merely repeats the column header.\n",
"is_header_row = mRNAseq['HYBRIDIZATION R'] == 'HYBRIDIZATION R'\n",
"mRNAseq = mRNAseq[~is_header_row].reset_index(drop=True)\n",
"mRNAseq = mRNAseq.rename(columns={'HYBRIDIZATION R': 'Hybridization REF'})\n",
"mRNAseq['Hybridization REF'] = mRNAseq['Hybridization REF'].map(lambda ref: ref.lower()[:-3])\n",
"\n",
"RPPA = pd.read_csv('./FINAL/RPPA.csv').rename(\n",
"    columns={'Composite.Element.REF': 'Hybridization REF'}\n",
")\n",
"RPPA['Hybridization REF'] = RPPA['Hybridization REF'].map(lambda ref: ref.lower()[:-3])\n",
"\n",
"methylation = pd.read_csv('./FINAL/methylation.csv')\n",
"methylation['Hybridization REF'] = methylation['Hybridization REF'].map(lambda ref: ref.lower()[:-3])\n",
"\n",
"miRNAseq = pd.read_csv('./FINAL/miRNAseq_RPKM_log2.csv').rename(\n",
"    columns={'gene': 'Hybridization REF'}\n",
")\n",
"miRNAseq['Hybridization REF'] = miRNAseq['Hybridization REF'].map(lambda ref: ref.lower()[:-3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For every view: keep one row per sample id, then keep only the columns\n",
"# that contain no missing value in any remaining row.\n",
"mRNAseq = mRNAseq.drop_duplicates(subset=['Hybridization REF']).dropna(axis=1)\n",
"RPPA = RPPA.drop_duplicates(subset=['Hybridization REF']).dropna(axis=1)\n",
"methylation = methylation.drop_duplicates(subset=['Hybridization REF']).dropna(axis=1)\n",
"miRNAseq = miRNAseq.drop_duplicates(subset=['Hybridization REF']).dropna(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# header=1: the second line of the CSV supplies the column names.\n",
"label = pd.read_csv('./FINAL/clinical_label.csv', header=1)\n",
"label = label.sort_values(by='Hybridization REF').reset_index(drop=True)\n",
"\n",
"# Keep rows whose id contains 'tcga'; for repeated ids the last occurrence\n",
"# after sorting is the one retained.\n",
"has_tcga_id = label['Hybridization REF'].map(lambda ref: 'tcga' in ref)\n",
"label = (\n",
"    label[has_tcga_id]\n",
"    .drop_duplicates(subset=['Hybridization REF'], keep='last')\n",
"    .reset_index(drop=True)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Some patients have their clinical columns shifted by one position: their\n",
"# 'days_to_last_followup' cell holds text ('endometrial' / 'other specify').\n",
"# For those rows each value is pulled back from the next column in the chain\n",
"#   days_to_last_followup <- days_to_death <- vital_status <- years_to_birth.\n",
"#\n",
"# The row mask is computed once per marker, before any column is overwritten.\n",
"# Re-evaluating it after the first assignment would match nothing (the marker\n",
"# text has already been replaced), leaving the other two columns uncorrected.\n",
"for marker in ['endometrial', 'other specify']:\n",
"    shifted = label['days_to_last_followup'] == marker\n",
"    label.loc[shifted, 'days_to_last_followup'] = label.loc[shifted, 'days_to_death']\n",
"    label.loc[shifted, 'days_to_death'] = label.loc[shifted, 'vital_status']\n",
"    label.loc[shifted, 'vital_status'] = label.loc[shifted, 'years_to_birth']\n",
"\n",
"# NOTE(review): assumes the values moved into 'days_to_death' are numeric or\n",
"# missing, because the column is cast to float below -- confirm against the file.\n",
"followup_days = label['days_to_last_followup'].astype(float)\n",
"death_days = label['days_to_death'].astype(float)\n",
"\n",
"# k-year mortality label:\n",
"#   -1 : default, neither rule below applies\n",
"#    0 : last follow-up at k years or later\n",
"#    1 : death within k years (applied last, so it wins when both rules hold)\n",
"for years in [1, 3, 5]:\n",
"    col = '{}yr-mortality'.format(years)\n",
"    horizon = years * 365\n",
"    label[col] = -1.\n",
"    label.loc[followup_days >= horizon, col] = 0.\n",
"    label.loc[death_days <= horizon, col] = 1."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Kernel PCA Dimensionality Reduction"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA, SparsePCA, KernelPCA\n",
"\n",
"# Number of components kept for every view.\n",
"z_dim = 100\n",
"\n",
"# Create the output folder up front so that to_csv cannot fail on a missing directory.\n",
"out_dir = './FINAL/cleaned/'\n",
"os.makedirs(out_dir, exist_ok=True)\n",
"\n",
"# View name (used in the output file name) -> cleaned table for that view.\n",
"views = {\n",
"    'RPPA': RPPA,\n",
"    'miRNAseq': miRNAseq,\n",
"    'Methylation': methylation,\n",
"    'mRNAseq': mRNAseq,\n",
"}\n",
"\n",
"for view, df in views.items():\n",
"    print(view)\n",
"\n",
"    # iloc[:, 1:] skips the first column, which is expected to be the sample id.\n",
"    pca = KernelPCA(kernel='poly', n_components=z_dim, random_state=1234)\n",
"    z = pca.fit_transform(np.asarray(df.iloc[:, 1:]))\n",
"\n",
"    # Re-attach the sample ids as a regular 'Hybridization REF' column.\n",
"    df_pca = pd.DataFrame(z, index=df['Hybridization REF']).reset_index()\n",
"    df_pca.to_csv(out_dir + '{}_kpca.csv'.format(view), index=False)\n",
"\n",
"# Variants: for a linear or sparse projection, swap the estimator for\n",
"# PCA(n_components=z_dim, random_state=1234) or\n",
"# SparsePCA(n_components=z_dim, random_state=1234) and change the file suffix\n",
"# to '_pca' / '_spca'."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CREATE MULTI-VIEW DATASET"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reload the kernel-PCA outputs, one frame per view, in the order\n",
"# mRNAseq, Methylation, miRNAseq, RPPA.\n",
"df_pca1, df_pca2, df_pca3, df_pca4 = [\n",
"    pd.read_csv('./FINAL/cleaned/{}_kpca.csv'.format(view))\n",
"    for view in ['mRNAseq', 'Methylation', 'miRNAseq', 'RPPA']\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### CREATE 1-Yr Mortality Dataset. (Censored samples are removed...)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ids of the samples whose 1-year label is not the -1 placeholder.\n",
"idx_list_y = label.loc[label['1yr-mortality'] != -1, 'Hybridization REF']\n",
"\n",
"# Sample ids present in each reduced view.\n",
"idx_list1, idx_list2, idx_list3, idx_list4 = [\n",
"    reduced['Hybridization REF'] for reduced in (df_pca1, df_pca2, df_pca3, df_pca4)\n",
"]\n",
"\n",
"# Sorted, de-duplicated union of the ids over the four views.\n",
"idx_list_x = np.unique([*idx_list1, *idx_list2, *idx_list3, *idx_list4])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Labelled samples that have at least one omics view available.\n",
"idx_list = np.intersect1d(idx_list_x, idx_list_y)\n",
"df = pd.DataFrame({'Hybridization REF': idx_list})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### FINAL DATASET"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Left-merge every view (and the 1-year label) onto the shared sample list,\n",
"# so a sample that lacks a view gets NaN for that view's columns.\n",
"key = 'Hybridization REF'\n",
"df1 = df.merge(df_pca1, how='left', on=key)\n",
"df2 = df.merge(df_pca2, how='left', on=key)\n",
"df3 = df.merge(df_pca3, how='left', on=key)\n",
"df4 = df.merge(df_pca4, how='left', on=key)\n",
"dfy = df.merge(label[['Hybridization REF','1yr-mortality']], how='left', on=key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Store each view from its own merged frame (df1..df4) and the label from dfy.\n",
"# iloc[:, 1:] drops the first column (the 'Hybridization REF' merge key) from each.\n",
"np.savez(\n",
"    './FINAL/multi_omics_1yr_mortality.npz',\n",
"    mRNAseq=np.asarray(df1.iloc[:, 1:]),\n",
"    Methylation=np.asarray(df2.iloc[:, 1:]),\n",
"    miRNAseq=np.asarray(df3.iloc[:, 1:]),\n",
"    RPPA=np.asarray(df4.iloc[:, 1:]),\n",
"    label=np.asarray(dfy.iloc[:, 1:]),\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}