# %% [markdown]
# # EGFR inhibitors: evaluate pretrained multi-omics encoders + classifier
#
# Loads GDSC (cell line) and PDX expression / mutation / CNA tables, restricts
# them to a shared feature set, loads four previously saved PyTorch modules and
# prints ROC-AUC on GDSC and on the PDX Erlotinib and Cetuximab cohorts.
#
# Values printed by the run recorded in the original notebook:
#   GDSC           0.9440556750399118
#   PDX Erlotinib  0.7222222222222222
#   PDX Cetuximab  0.8

# %% Imports and seeds
import math
import random
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.preprocessing as sk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from torch.utils.data.sampler import WeightedRandomSampler

# Project-local modules (not part of this notebook).
from utils import AllTripletSelector, HardestNegativeTripletSelector, RandomNegativeTripletSelector, SemihardNegativeTripletSelector  # Strategies for selecting triplets within a minibatch
from metrics import AverageNonzeroTripletsMetric

# NOTE(review): machine-specific absolute path; not used by any cell in this
# notebook. Kept so that code relying on the name keeps working.
save_results_to = '/home/hnoghabi/EGFR/'
torch.manual_seed(42)
random.seed(42)


# %% Helpers for loading and aligning the omics tables
def read_omics_table(path, decimal, drop_duplicate_columns=False):
    """Read a tab-separated table, use its first column as index, and transpose it.

    Args:
        path: file to read.
        decimal: decimal separator handed to ``pd.read_csv``.
        drop_duplicate_columns: when True, keep only the first occurrence of
            each column label of the transposed table.

    Returns:
        The transposed DataFrame.
    """
    table = pd.read_csv(path, sep="\t", index_col=0, decimal=decimal).T
    if drop_duplicate_columns:
        table = table.loc[:, ~table.columns.duplicated()]
    return table


def common_labels(first, *others):
    """Intersect ``first`` with every index in ``others``, left to right."""
    shared = first
    for labels in others:
        shared = shared.intersection(labels)
    return shared


def rows_for_drug(table, samples, drug):
    """Select ``samples`` from ``table`` and suffix each row label with ``_<drug>``.

    Returns a new frame; ``table`` is left untouched (no in-place rename on a
    ``.loc`` slice).
    """
    return table.loc[samples, :].rename(index=lambda sample: str(sample) + "_" + drug)


def encode_pdx_response(table):
    """Return a copy of ``table`` whose 'response' column is 0 for 'R' and 1 for 'S'.

    The R/S status is read from the column at position 1, exactly as the
    original notebook did. The original assigned the scalar to *every* column
    of the matching rows; only 'response' is written here, which yields the
    same 'response' values without clobbering the other columns.
    """
    status = table.iloc[:, 1]
    is_resistant = status == 'R'
    is_sensitive = status == 'S'
    encoded = table.copy()
    encoded.loc[is_resistant, 'response'] = 0
    encoded.loc[is_sensitive, 'response'] = 1
    return encoded


# %% Load GDSC and PDX feature tables
# The original notebook additionally called `drop_duplicates(keep='last')` on
# the three CNA tables but discarded the result, so no rows were ever removed.
# Those no-op calls are omitted; actually dropping rows could change the
# feature set and therefore the input width the saved models expect.
GDSCE = read_omics_table("GDSC_exprs.z.EGFRi.tsv", decimal=",")
GDSCM = read_omics_table("GDSC_mutations.EGFRi.tsv", decimal=".", drop_duplicate_columns=True)
GDSCC = read_omics_table("GDSC_CNA.EGFRi.tsv", decimal=".", drop_duplicate_columns=True)

# The PDX mutation tables are read from the " - Copy" files and the PDX copy
# number tables from the "PDX_CNV.*" files (the original notebook carried the
# "PDX_mutations.<drug>.tsv" / "PDX_CNA.<drug>.tsv" reads only as commented-out code).
PDXEerlo = read_omics_table("PDX_exprs.Erlotinib.eb_with.GDSC_exprs.Erlotinib.tsv", decimal=",")
PDXMerlo = read_omics_table("PDX_mutations.Erlotinib - Copy.tsv", decimal=",")
PDXCerlo = read_omics_table("PDX_CNV.Erlotinib.tsv", decimal=",", drop_duplicate_columns=True)

PDXEcet = read_omics_table("PDX_exprs.Cetuximab.eb_with.GDSC_exprs.Cetuximab.tsv", decimal=",")
PDXMcet = read_omics_table("PDX_mutations.Cetuximab - Copy.tsv", decimal=",")
PDXCcet = read_omics_table("PDX_CNV.Cetuximab.tsv", decimal=",", drop_duplicate_columns=True)

# %% Feature filtering and alignment
# Keep only GDSC expression features whose variance exceeds the threshold.
# `fit` is enough: the transformed array was never used, only the support mask.
selector = VarianceThreshold(0.05)
selector.fit(GDSCE)
GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]]

# Binarise GDSC mutation and CNA tables: missing -> 0, any non-zero value -> 1.
GDSCM = GDSCM.fillna(0)
GDSCM[GDSCM != 0.0] = 1
GDSCC = GDSCC.fillna(0)
GDSCC[GDSCC != 0.0] = 1

# Features (columns) present in every one of the nine tables.
ls = common_labels(
    GDSCE.columns,
    GDSCM.columns,
    GDSCC.columns,
    PDXEerlo.columns,
    PDXMerlo.columns,
    PDXCerlo.columns,
    PDXEcet.columns,
    PDXMcet.columns,
    PDXCcet.columns,
)
ls = pd.unique(ls)

# Row labels present in all three modalities of each PDX cohort.
ls3 = common_labels(PDXEerlo.index, PDXMerlo.index, PDXCerlo.index)
ls4 = common_labels(PDXEcet.index, PDXMcet.index, PDXCcet.index)

PDXEerlo = PDXEerlo.loc[ls3, ls]
PDXMerlo = PDXMerlo.loc[ls3, ls]
PDXCerlo = PDXCerlo.loc[ls3, ls]
PDXEcet = PDXEcet.loc[ls4, ls]
PDXMcet = PDXMcet.loc[ls4, ls]
PDXCcet = PDXCcet.loc[ls4, ls]
GDSCE = GDSCE.loc[:, ls]
GDSCM = GDSCM.loc[:, ls]
GDSCC = GDSCC.loc[:, ls]

# %% GDSC responses: one row per (sample, drug) pair
GDSCR = pd.read_csv("GDSC_response.EGFRi.tsv",
                    sep="\t", index_col=0, decimal=",")
GDSCR = GDSCR.rename(mapper=str, axis='index')

d = {"R": 0, "S": 1}
# d[x] (rather than .map(d)) so an unexpected label raises instead of becoming NaN.
GDSCR["response"] = GDSCR.loc[:, "response"].apply(lambda x: d[x])

responses = GDSCR
drugs = set(responses["drug"].values)
exprs_z = GDSCE
cna = GDSCC
mut = GDSCM
expression_zscores = []
CNA = []
mutations = []
# Iterate in sorted order: set iteration order of strings differs between
# interpreter runs, which made the row order of the stacked tables vary.
for drug in sorted(drugs):
    samples = responses.loc[responses["drug"] == drug, :].index.values
    # Row labels become "<sample>_<drug>" so a sample screened with several
    # drugs yields several distinct rows.
    expression_zscores.append(rows_for_drug(exprs_z, samples, drug))
    CNA.append(rows_for_drug(cna, samples, drug))
    mutations.append(rows_for_drug(mut, samples, drug))
# Give the response table the same "<sample>_<drug>" labels.
# `responses` is the same object as GDSCR, so GDSCR's index changes too.
responses.index = responses.index.values + "_" + responses["drug"].values
GDSCEv2 = pd.concat(expression_zscores, axis=0)
GDSCCv2 = pd.concat(CNA, axis=0)
GDSCMv2 = pd.concat(mutations, axis=0)
GDSCRv2 = responses

ls2 = common_labels(GDSCEv2.index, GDSCMv2.index, GDSCCv2.index)
GDSCEv2 = GDSCEv2.loc[ls2, :]
GDSCMv2 = GDSCMv2.loc[ls2, :]
GDSCCv2 = GDSCCv2.loc[ls2, :]
GDSCRv2 = GDSCRv2.loc[ls2, :]

Y = GDSCRv2['response'].values

# %% PDX responses
PDXRcet = pd.read_csv("PDX_response.Cetuximab.tsv",
                      sep="\t", index_col=0, decimal=",")
PDXRcet = encode_pdx_response(PDXRcet).loc[ls4, :]
Ytscet = PDXRcet['response'].values

PDXRerlo = pd.read_csv("PDX_response.Erlotinib.tsv",
                       sep="\t", index_col=0, decimal=",")
PDXRerlo = encode_pdx_response(PDXRerlo).loc[ls3, :]
Ytserlo = PDXRerlo['response'].values

# %% Hyper-parameters used by the class definitions below
hdm1 = 32    # hidden width of the expression encoder
hdm2 = 16    # hidden width of the mutation encoder
hdm3 = 256   # hidden width of the CNA encoder
rate1 = 0.5  # dropout of the expression encoder
rate2 = 0.8  # dropout of the mutation encoder
rate3 = 0.5  # dropout of the CNA encoder
rate4 = 0.3  # dropout of the classifier

# %% Build model inputs
# The scaler is fit on GDSC expression only and then applied to both PDX cohorts.
scalerGDSC = sk.StandardScaler()
scalerGDSC.fit(GDSCEv2.values)
X_trainE = scalerGDSC.transform(GDSCEv2.values)
X_testEerlo = scalerGDSC.transform(PDXEerlo.values)
X_testEcet = scalerGDSC.transform(PDXEcet.values)

X_trainM = np.nan_to_num(GDSCMv2.values)
X_trainC = np.nan_to_num(GDSCCv2.values)
X_testMerlo = np.nan_to_num(PDXMerlo.values)
X_testCerlo = np.nan_to_num(PDXCerlo.values)
X_testMcet = np.nan_to_num(PDXMcet.values)
X_testCcet = np.nan_to_num(PDXCcet.values)

TX_testEerlo = torch.FloatTensor(X_testEerlo)
TX_testMerlo = torch.FloatTensor(X_testMerlo)
TX_testCerlo = torch.FloatTensor(X_testCerlo)
ty_testEerlo = torch.FloatTensor(Ytserlo.astype(int))

TX_testEcet = torch.FloatTensor(X_testEcet)
TX_testMcet = torch.FloatTensor(X_testMcet)
TX_testCcet = torch.FloatTensor(X_testCcet)
ty_testEcet = torch.FloatTensor(Ytscet.astype(int))

n_sampE, IE_dim = X_trainE.shape
n_sampM, IM_dim = X_trainM.shape
n_sampC, IC_dim = X_trainC.shape

h_dim1 = hdm1
h_dim2 = hdm2
h_dim3 = hdm3
Z_in = h_dim1 + h_dim2 + h_dim3

# Not filled by any cell in this notebook; kept so the names stay defined.
costtr = []
auctr = []
costts = []
aucts = []


# %% Model definitions
# These classes must be defined before the `torch.load` calls below: the
# checkpoints are loaded as whole module objects, and unpickling them needs the
# class names (and the attribute names EnE / EnM / EnC / FC used in `forward`).
class AEE(nn.Module):
    """Expression encoder: Linear(IE_dim -> h_dim1), BatchNorm, ReLU, Dropout(rate1)."""

    def __init__(self):
        super(AEE, self).__init__()
        self.EnE = torch.nn.Sequential(
            nn.Linear(IE_dim, h_dim1),
            nn.BatchNorm1d(h_dim1),
            nn.ReLU(),
            nn.Dropout(rate1))

    def forward(self, x):
        output = self.EnE(x)
        return output


class AEM(nn.Module):
    """Mutation encoder: Linear(IM_dim -> h_dim2), BatchNorm, ReLU, Dropout(rate2)."""

    def __init__(self):
        super(AEM, self).__init__()
        self.EnM = torch.nn.Sequential(
            nn.Linear(IM_dim, h_dim2),
            nn.BatchNorm1d(h_dim2),
            nn.ReLU(),
            nn.Dropout(rate2))

    def forward(self, x):
        output = self.EnM(x)
        return output


class AEC(nn.Module):
    """CNA encoder: Linear(IC_dim -> h_dim3), BatchNorm, ReLU, Dropout(rate3)."""

    def __init__(self):
        super(AEC, self).__init__()
        self.EnC = torch.nn.Sequential(
            # Was IM_dim (the mutation width). Both widths are equal here
            # because every table is restricted to the same columns `ls`,
            # but the CNA encoder should be sized from the CNA matrix.
            nn.Linear(IC_dim, h_dim3),
            nn.BatchNorm1d(h_dim3),
            nn.ReLU(),
            nn.Dropout(rate3))

    def forward(self, x):
        output = self.EnC(x)
        return output


class Classifier(nn.Module):
    """Response head: Linear(Z_in -> 1), Dropout(rate4), Sigmoid."""

    def __init__(self):
        super(Classifier, self).__init__()
        self.FC = torch.nn.Sequential(
            nn.Linear(Z_in, 1),
            nn.Dropout(rate4),
            nn.Sigmoid())

    def forward(self, x):
        return self.FC(x)


# %% Load the saved modules
torch.cuda.manual_seed_all(42)

# SECURITY: these files hold fully pickled modules; unpickling executes
# arbitrary code, so only load checkpoints from a trusted source.
# map_location='cpu' because every input tensor below lives on the CPU.
AutoencoderE = torch.load('EGFRv2Exprs.pt', map_location='cpu')
AutoencoderM = torch.load('EGFRv2Mut.pt', map_location='cpu')
AutoencoderC = torch.load('EGFRv2CNA.pt', map_location='cpu')

Clas = torch.load('EGFRv2Class.pt', map_location='cpu')

# Inference mode: disables dropout and makes BatchNorm use its running statistics.
AutoencoderE.eval()
AutoencoderM.eval()
AutoencoderC.eval()
Clas.eval()


# %% Evaluation
def encode_and_predict(x_expr, x_mut, x_cna):
    """Encode the three modalities, concatenate, normalise and classify.

    Args:
        x_expr, x_mut, x_cna: float tensors for the expression, mutation and
            CNA inputs of the same samples (rows must be aligned).

    Returns:
        ``(latent, prediction)``: the normalised concatenated latent matrix and
        the classifier output for it.

    NOTE(review): ``F.normalize(..., dim=0)`` divides each latent feature by its
    L2 norm taken over the rows of the batch, so a sample's prediction depends
    on which other samples are passed with it. Kept unchanged; presumably the
    saved classifier was trained with this normalisation. Confirm before changing.
    """
    with torch.no_grad():  # pure inference, no autograd graph needed
        latent = torch.cat(
            (AutoencoderE(x_expr), AutoencoderM(x_mut), AutoencoderC(x_cna)), 1)
        latent = F.normalize(latent, p=2, dim=0)
        return latent, Clas(latent)


# GDSC (the data the scaler was fit on)
ZTX, PredX = encode_and_predict(
    torch.FloatTensor(X_trainE),
    torch.FloatTensor(X_trainM),
    torch.FloatTensor(X_trainC))
AUCt = roc_auc_score(Y, PredX.detach().numpy())
print(AUCt)

# PDX Erlotinib
ZTTerlo, PredTerlo = encode_and_predict(TX_testEerlo, TX_testMerlo, TX_testCerlo)
AUCterlo = roc_auc_score(Ytserlo, PredTerlo.detach().numpy())
print(AUCterlo)

# PDX Cetuximab
ZTTcet, PredTcet = encode_and_predict(TX_testEcet, TX_testMcet, TX_testCcet)
AUCtcet = roc_auc_score(Ytscet, PredTcet.detach().numpy())
print(AUCtcet)