378 lines (377 with data), 14.1 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import torch \n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import math\n",
"import sklearn.preprocessing as sk\n",
"import seaborn as sns\n",
"from sklearn import metrics\n",
"from sklearn.feature_selection import VarianceThreshold\n",
"from sklearn.model_selection import train_test_split\n",
"from utils import AllTripletSelector,HardestNegativeTripletSelector, RandomNegativeTripletSelector, SemihardNegativeTripletSelector # Strategies for selecting triplets within a minibatch\n",
"from metrics import AverageNonzeroTripletsMetric\n",
"from torch.utils.data.sampler import WeightedRandomSampler\n",
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.metrics import average_precision_score\n",
"import random\n",
"from random import randint\n",
"from sklearn.model_selection import StratifiedKFold\n",
"\n",
"# NOTE(review): absolute, user-specific output directory; nothing else in the\n",
"# visible notebook reads it - confirm it is still needed.\n",
"save_results_to = '/home/hnoghabi/EGFR/'\n",
"torch.manual_seed(42)\n",
"random.seed(42)\n",
"\n",
"\n",
"def read_omics_table(path, decimal, dedup_columns=False):\n",
"    \"\"\"Read a tab-separated table and return it transposed.\n",
"\n",
"    path: file to read; its first column becomes the index before transposing.\n",
"    decimal: decimal separator handed to pandas.read_csv.\n",
"    dedup_columns: if True, keep only the first occurrence of every column\n",
"        label of the transposed frame.\n",
"    \"\"\"\n",
"    table = pd.read_csv(path, sep=\"\\t\", index_col=0, decimal=decimal).transpose()\n",
"    if dedup_columns:\n",
"        table = table.loc[:, ~table.columns.duplicated()]\n",
"    return table\n",
"\n",
"\n",
"# The former `<frame>.drop_duplicates(keep='last')` statements were removed:\n",
"# their return value was discarded, so they never modified the tables.\n",
"# NOTE(review): the decimal separator differs between files (',' vs '.'), and\n",
"# the saved stderr of the evaluation cell shows StandardScaler receiving\n",
"# object-dtype input, so some expression table may not parse as numeric - confirm.\n",
"\n",
"# GDSC tables.\n",
"GDSCE = read_omics_table(\"GDSC_exprs.z.EGFRi.tsv\", decimal=\",\")\n",
"GDSCM = read_omics_table(\"GDSC_mutations.EGFRi.tsv\", decimal=\".\", dedup_columns=True)\n",
"GDSCC = read_omics_table(\"GDSC_CNA.EGFRi.tsv\", decimal=\".\", dedup_columns=True)\n",
"\n",
"# PDX tables for Erlotinib. Mutations come from the '- Copy' file and copy\n",
"# number from the 'PDX_CNV' file; the earlier 'PDX_mutations.Erlotinib.tsv' /\n",
"# 'PDX_CNA.Erlotinib.tsv' reads (decimal='.') had been commented out.\n",
"PDXEerlo = read_omics_table(\"PDX_exprs.Erlotinib.eb_with.GDSC_exprs.Erlotinib.tsv\", decimal=\",\")\n",
"PDXMerlo = read_omics_table(\"PDX_mutations.Erlotinib - Copy.tsv\", decimal=\",\")\n",
"PDXCerlo = read_omics_table(\"PDX_CNV.Erlotinib.tsv\", decimal=\",\", dedup_columns=True)\n",
"\n",
"# PDX tables for Cetuximab, using the same file variants as for Erlotinib.\n",
"PDXEcet = read_omics_table(\"PDX_exprs.Cetuximab.eb_with.GDSC_exprs.Cetuximab.tsv\", decimal=\",\")\n",
"PDXMcet = read_omics_table(\"PDX_mutations.Cetuximab - Copy.tsv\", decimal=\",\")\n",
"PDXCcet = read_omics_table(\"PDX_CNV.Cetuximab.tsv\", decimal=\",\", dedup_columns=True)\n",
"\n",
"# Drop low-variance expression features (threshold 0.05). Only the fitted\n",
"# support mask is used, so fit() replaces the former fit_transform() call\n",
"# whose transformed output was thrown away.\n",
"selector = VarianceThreshold(0.05)\n",
"selector.fit(GDSCE)\n",
"GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]]\n",
"\n",
"# Binarise the GDSC mutation and CNA tables: NaN becomes 0, any non-zero value becomes 1.\n",
"GDSCM = GDSCM.fillna(0)\n",
"GDSCM[GDSCM != 0.0] = 1\n",
"GDSCC = GDSCC.fillna(0)\n",
"GDSCC[GDSCC != 0.0] = 1\n",
"\n",
"# Column labels present in every GDSC and PDX table.\n",
"ls = GDSCE.columns\n",
"for other in (GDSCM, GDSCC, PDXEerlo, PDXMerlo, PDXCerlo, PDXEcet, PDXMcet, PDXCcet):\n",
"    ls = ls.intersection(other.columns)\n",
"ls = pd.unique(ls)\n",
"\n",
"# Row labels shared by the three tables of each cohort\n",
"# (ls2: GDSC, ls3: PDX Erlotinib, ls4: PDX Cetuximab).\n",
"ls2 = GDSCE.index.intersection(GDSCM.index).intersection(GDSCC.index)\n",
"ls3 = PDXEerlo.index.intersection(PDXMerlo.index).intersection(PDXCerlo.index)\n",
"ls4 = PDXEcet.index.intersection(PDXMcet.index).intersection(PDXCcet.index)\n",
"\n",
"# Restrict every table to the shared columns; PDX tables are also restricted\n",
"# to their cohort's shared rows, while GDSC rows are left untouched here.\n",
"PDXEerlo = PDXEerlo.loc[ls3, ls]\n",
"PDXMerlo = PDXMerlo.loc[ls3, ls]\n",
"PDXCerlo = PDXCerlo.loc[ls3, ls]\n",
"PDXEcet = PDXEcet.loc[ls4, ls]\n",
"PDXMcet = PDXMcet.loc[ls4, ls]\n",
"PDXCcet = PDXCcet.loc[ls4, ls]\n",
"GDSCE = GDSCE.loc[:, ls]\n",
"GDSCM = GDSCM.loc[:, ls]\n",
"GDSCC = GDSCC.loc[:, ls]\n",
"\n",
"# GDSC drug-response table; index labels are cast to str so they can be\n",
"# concatenated with the drug name further down.\n",
"GDSCR = pd.read_csv(\"GDSC_response.EGFRi.tsv\",\n",
"                    sep=\"\\t\", index_col=0, decimal=\",\")\n",
"GDSCR = GDSCR.rename(mapper=str, axis='index')\n",
"\n",
"# Encode the response label: R becomes 0, S becomes 1 (any other value raises KeyError).\n",
"d = {\"R\": 0, \"S\": 1}\n",
"GDSCR[\"response\"] = GDSCR.loc[:, \"response\"].apply(lambda x: d[x])\n",
"\n",
"responses = GDSCR  # alias: the index rewrite below also changes GDSCR\n",
"# sorted() makes the drug order, and therefore the row order of the\n",
"# concatenated frames, independent of set iteration order.\n",
"drugs = sorted(set(responses[\"drug\"].values))\n",
"exprs_z = GDSCE\n",
"cna = GDSCC\n",
"mut = GDSCM\n",
"expression_zscores = []\n",
"CNA = []\n",
"mutations = []\n",
"for drug in drugs:\n",
"    samples = responses.loc[responses[\"drug\"] == drug, :].index.values\n",
"    # Suffix every row label with the drug name so that a sample screened\n",
"    # with several drugs gets one distinct row per drug. rename() returns a\n",
"    # new frame, so the source tables are never modified.\n",
"    e_z = exprs_z.loc[samples, :].rename(lambda x: str(x) + \"_\" + drug, axis=\"index\")\n",
"    c = cna.loc[samples, :].rename(lambda x: str(x) + \"_\" + drug, axis=\"index\")\n",
"    m = mut.loc[samples, :].rename(lambda x: str(x) + \"_\" + drug, axis=\"index\")\n",
"    expression_zscores.append(e_z)\n",
"    CNA.append(c)\n",
"    mutations.append(m)\n",
"# Give the response rows the same '<sample>_<drug>' labels.\n",
"responses.index = responses.index.values + \"_\" + responses[\"drug\"].values\n",
"GDSCEv2 = pd.concat(expression_zscores, axis=0)\n",
"GDSCCv2 = pd.concat(CNA, axis=0)\n",
"GDSCMv2 = pd.concat(mutations, axis=0)\n",
"GDSCRv2 = responses\n",
"\n",
"# Keep only rows present in all three feature tables and align the responses to them.\n",
"ls2 = GDSCEv2.index.intersection(GDSCMv2.index)\n",
"ls2 = ls2.intersection(GDSCCv2.index)\n",
"GDSCEv2 = GDSCEv2.loc[ls2, :]\n",
"GDSCMv2 = GDSCMv2.loc[ls2, :]\n",
"GDSCCv2 = GDSCCv2.loc[ls2, :]\n",
"GDSCRv2 = GDSCRv2.loc[ls2, :]\n",
"\n",
"Y = GDSCRv2['response'].values\n",
"\n",
"def load_pdx_response(path, samples):\n",
"    \"\"\"Read a PDX response table and return (table, labels).\n",
"\n",
"    path: tab-separated response file; its first column becomes the index.\n",
"    samples: row labels to keep, in the order the labels should come back.\n",
"\n",
"    Rows whose second data column equals 'R' get response 0 and rows where\n",
"    it equals 'S' get response 1. Only the 'response' column is written;\n",
"    the previous `.loc[mask] = value` form overwrote every column of the\n",
"    matching rows with 0 or 1.\n",
"    \"\"\"\n",
"    table = pd.read_csv(path, sep=\"\\t\", index_col=0, decimal=\",\")\n",
"    # Copy the flag column first so both masks use the untouched values.\n",
"    flag = table.iloc[:, 1].copy()\n",
"    table.loc[flag == 'R', 'response'] = 0\n",
"    table.loc[flag == 'S', 'response'] = 1\n",
"    table = table.loc[samples, :]\n",
"    return table, table['response'].values\n",
"\n",
"\n",
"# ls4 / ls3 are the Cetuximab / Erlotinib rows shared by that cohort's feature tables.\n",
"PDXRcet, Ytscet = load_pdx_response(\"PDX_response.Cetuximab.tsv\", ls4)\n",
"PDXRerlo, Ytserlo = load_pdx_response(\"PDX_response.Erlotinib.tsv\", ls3)\n",
"\n",
"# Hidden sizes copied into h_dim1 / h_dim2 / h_dim3 by the evaluation cell\n",
"# (output widths of the AEE, AEM and AEC encoders respectively).\n",
"hdm1, hdm2, hdm3 = 32, 16, 256\n",
"# Dropout probabilities: rate1-rate3 are used by AEE / AEM / AEC, rate4 by Classifier.\n",
"rate1, rate2, rate3, rate4 = 0.5, 0.8, 0.5, 0.3\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/hnoghabi/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:475: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.\n",
" warnings.warn(msg, DataConversionWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9440556750399118\n",
"0.7222222222222222\n",
"0.8\n"
]
}
],
"source": [
"# Standardise expression features with statistics fitted on the GDSC rows\n",
"# only, then apply that same fitted scaler to both PDX cohorts.\n",
"scalerGDSC = sk.StandardScaler().fit(GDSCEv2.values)\n",
"X_trainE = scalerGDSC.transform(GDSCEv2.values)\n",
"X_testEerlo, X_testEcet = (\n",
"    scalerGDSC.transform(frame.values) for frame in (PDXEerlo, PDXEcet)\n",
")\n",
"\n",
"# Mutation and CNA matrices are used as they are, with NaN replaced by 0.\n",
"X_trainM, X_trainC = np.nan_to_num(GDSCMv2.values), np.nan_to_num(GDSCCv2.values)\n",
"X_testMerlo, X_testCerlo = np.nan_to_num(PDXMerlo.values), np.nan_to_num(PDXCerlo.values)\n",
"X_testMcet, X_testCcet = np.nan_to_num(PDXMcet.values), np.nan_to_num(PDXCcet.values)\n",
"\n",
"# Float tensors for the Erlotinib PDX cohort.\n",
"TX_testEerlo, TX_testMerlo, TX_testCerlo = (\n",
"    torch.FloatTensor(arr) for arr in (X_testEerlo, X_testMerlo, X_testCerlo)\n",
")\n",
"ty_testEerlo = torch.FloatTensor(Ytserlo.astype(int))\n",
"\n",
"# Float tensors for the Cetuximab PDX cohort.\n",
"TX_testEcet, TX_testMcet, TX_testCcet = (\n",
"    torch.FloatTensor(arr) for arr in (X_testEcet, X_testMcet, X_testCcet)\n",
")\n",
"ty_testEcet = torch.FloatTensor(Ytscet.astype(int))\n",
"\n",
"# Row count and feature width of each training matrix.\n",
"n_sampE, IE_dim = X_trainE.shape\n",
"n_sampM, IM_dim = X_trainM.shape\n",
"n_sampC, IC_dim = X_trainC.shape\n",
"\n",
"# Encoder output widths; the classifier input is their concatenation.\n",
"h_dim1, h_dim2, h_dim3 = hdm1, hdm2, hdm3\n",
"Z_in = h_dim1 + h_dim2 + h_dim3\n",
"\n",
"# Empty history lists; nothing in this cell appends to them.\n",
"costtr, auctr, costts, aucts = [], [], [], []\n",
"\n",
"class AEE(nn.Module):\n",
"    \"\"\"Encoder for the expression matrix: Linear, BatchNorm1d, ReLU, Dropout.\n",
"\n",
"    Maps IE_dim input features to h_dim1 outputs, with dropout probability rate1.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        super().__init__()\n",
"        layers = [\n",
"            nn.Linear(IE_dim, h_dim1),\n",
"            nn.BatchNorm1d(h_dim1),\n",
"            nn.ReLU(),\n",
"            nn.Dropout(rate1),\n",
"        ]\n",
"        # The attribute name EnE is kept because forward() and the saved model refer to it.\n",
"        self.EnE = nn.Sequential(*layers)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Return the encoded representation of x.\"\"\"\n",
"        return self.EnE(x)\n",
"\n",
"class AEM(nn.Module):\n",
"    \"\"\"Encoder for the mutation matrix: Linear, BatchNorm1d, ReLU, Dropout.\n",
"\n",
"    Maps IM_dim input features to h_dim2 outputs, with dropout probability rate2.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        super().__init__()\n",
"        layers = [\n",
"            nn.Linear(IM_dim, h_dim2),\n",
"            nn.BatchNorm1d(h_dim2),\n",
"            nn.ReLU(),\n",
"            nn.Dropout(rate2),\n",
"        ]\n",
"        # The attribute name EnM is kept because forward() and the saved model refer to it.\n",
"        self.EnM = nn.Sequential(*layers)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Return the encoded representation of x.\"\"\"\n",
"        return self.EnM(x)\n",
"\n",
"\n",
"\n",
"class AEC(nn.Module):\n",
"    \"\"\"Encoder for the CNA matrix: Linear, BatchNorm1d, ReLU, Dropout.\n",
"\n",
"    Maps IC_dim input features to h_dim3 outputs, with dropout probability rate3.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        super(AEC, self).__init__()\n",
"        self.EnC = torch.nn.Sequential(\n",
"            # Input width is the CNA feature count. This layer previously used\n",
"            # IM_dim, which only worked because the mutation and CNA tables are\n",
"            # restricted to the same column list and so have equal widths.\n",
"            nn.Linear(IC_dim, h_dim3),\n",
"            nn.BatchNorm1d(h_dim3),\n",
"            nn.ReLU(),\n",
"            nn.Dropout(rate3))\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Return the encoded representation of x.\"\"\"\n",
"        output = self.EnC(x)\n",
"        return output\n",
"\n",
"class Classifier(nn.Module):\n",
"    \"\"\"Single-output head applied to the concatenated encoder outputs.\n",
"\n",
"    Applies Linear(Z_in, 1), then Dropout(rate4), then Sigmoid, in that order.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        super().__init__()\n",
"        layers = [\n",
"            nn.Linear(Z_in, 1),\n",
"            nn.Dropout(rate4),  # acts on the linear output, before the sigmoid\n",
"            nn.Sigmoid(),\n",
"        ]\n",
"        # The attribute name FC is kept because forward() and the saved model refer to it.\n",
"        self.FC = nn.Sequential(*layers)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Return the sigmoid output for each row of x.\"\"\"\n",
"        scores = self.FC(x)\n",
"        return scores\n",
"\n",
"torch.cuda.manual_seed_all(42)\n",
"\n",
"# torch.load unpickles complete module objects here, so the AEE / AEM / AEC /\n",
"# Classifier classes must already be defined, and the files must come from a\n",
"# trusted source (unpickling can execute arbitrary code).\n",
"# map_location='cpu' keeps the loaded modules on the CPU, matching the CPU\n",
"# tensors and the .numpy() calls below.\n",
"AutoencoderE = torch.load('EGFRv2Exprs.pt', map_location='cpu')\n",
"AutoencoderM = torch.load('EGFRv2Mut.pt', map_location='cpu')\n",
"AutoencoderC = torch.load('EGFRv2CNA.pt', map_location='cpu')\n",
"\n",
"Clas = torch.load('EGFRv2Class.pt', map_location='cpu')\n",
"\n",
"# Switch to evaluation mode: dropout is disabled and batch norm uses its running statistics.\n",
"AutoencoderE.eval()\n",
"AutoencoderM.eval()\n",
"AutoencoderC.eval()\n",
"Clas.eval()\n",
"\n",
"\n",
"def predict_response(x_expr, x_mut, x_cna):\n",
"    \"\"\"Run the three encoders and the classifier on one cohort.\n",
"\n",
"    x_expr, x_mut, x_cna: float tensors with one row per sample, for the\n",
"        expression, mutation and CNA features respectively.\n",
"    Returns the classifier output tensor, one row per sample.\n",
"    \"\"\"\n",
"    with torch.no_grad():  # inference only, no autograd graph needed\n",
"        z = torch.cat((AutoencoderE(x_expr), AutoencoderM(x_mut), AutoencoderC(x_cna)), 1)\n",
"        # NOTE(review): dim=0 normalises each latent feature across the rows of\n",
"        # the batch, so a prediction depends on which other samples are passed\n",
"        # in. Kept unchanged because the saved models were presumably trained\n",
"        # the same way - confirm.\n",
"        z = F.normalize(z, p=2, dim=0)\n",
"        return Clas(z)\n",
"\n",
"\n",
"# AUC on the GDSC data.\n",
"PredX = predict_response(torch.FloatTensor(X_trainE),\n",
"                         torch.FloatTensor(X_trainM),\n",
"                         torch.FloatTensor(X_trainC))\n",
"AUCt = roc_auc_score(Y, PredX.detach().numpy())\n",
"print(AUCt)\n",
"\n",
"# AUC on the PDX Erlotinib cohort.\n",
"PredTerlo = predict_response(TX_testEerlo, TX_testMerlo, TX_testCerlo)\n",
"AUCterlo = roc_auc_score(Ytserlo, PredTerlo.detach().numpy())\n",
"print(AUCterlo)\n",
"\n",
"# AUC on the PDX Cetuximab cohort.\n",
"PredTcet = predict_response(TX_testEcet, TX_testMcet, TX_testCcet)\n",
"AUCtcet = roc_auc_score(Ytscet, PredTcet.detach().numpy())\n",
"print(AUCtcet)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}