a b/EGFR/EGFRv8.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 21,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "import torch \n",
10
    "import torch.nn as nn\n",
11
    "import torch.nn.functional as F\n",
12
    "import torch.optim as optim\n",
13
    "import numpy as np\n",
14
    "import matplotlib.pyplot as plt\n",
15
    "import pandas as pd\n",
16
    "import math\n",
17
    "import sklearn.preprocessing as sk\n",
18
    "import seaborn as sns\n",
19
    "from sklearn import metrics\n",
20
    "from sklearn.feature_selection import VarianceThreshold\n",
21
    "from sklearn.model_selection import train_test_split\n",
22
    "from utils import AllTripletSelector,HardestNegativeTripletSelector, RandomNegativeTripletSelector, SemihardNegativeTripletSelector # Strategies for selecting triplets within a minibatch\n",
23
    "from metrics import AverageNonzeroTripletsMetric\n",
24
    "from torch.utils.data.sampler import WeightedRandomSampler\n",
25
    "from sklearn.metrics import roc_auc_score\n",
26
    "from sklearn.metrics import average_precision_score\n",
27
    "import random\n",
28
    "from random import randint\n",
29
    "from sklearn.model_selection import StratifiedKFold\n",
30
    "\n",
31
    "save_results_to = '/home/hnoghabi/EGFR/'\n",
32
    "torch.manual_seed(42)\n",
33
    "random.seed(42)\n",
34
    "\n",
35
    "GDSCE = pd.read_csv(\"GDSC_exprs.z.EGFRi.tsv\", \n",
36
    "                    sep = \"\\t\", index_col=0, decimal = \",\")\n",
37
    "GDSCE = pd.DataFrame.transpose(GDSCE)\n",
38
    "\n",
39
    "GDSCM = pd.read_csv(\"GDSC_mutations.EGFRi.tsv\", \n",
40
    "                    sep = \"\\t\", index_col=0, decimal = \".\")\n",
41
    "GDSCM = pd.DataFrame.transpose(GDSCM)\n",
42
    "GDSCM = GDSCM.loc[:,~GDSCM.columns.duplicated()]\n",
43
    "\n",
44
    "GDSCC = pd.read_csv(\"GDSC_CNA.EGFRi.tsv\", \n",
45
    "                    sep = \"\\t\", index_col=0, decimal = \".\")\n",
46
    "GDSCC.drop_duplicates(keep='last')\n",
47
    "GDSCC = pd.DataFrame.transpose(GDSCC)\n",
48
    "GDSCC = GDSCC.loc[:,~GDSCC.columns.duplicated()]\n",
49
    "\n",
50
    "PDXEerlo = pd.read_csv(\"PDX_exprs.Erlotinib.eb_with.GDSC_exprs.Erlotinib.tsv\", \n",
51
    "                   sep = \"\\t\", index_col=0, decimal = \",\")\n",
52
    "PDXEerlo = pd.DataFrame.transpose(PDXEerlo)\n",
53
    "# PDXMerlo = pd.read_csv(\"PDX_mutations.Erlotinib.tsv\", \n",
54
    "#                    sep = \"\\t\", index_col=0, decimal = \".\")\n",
55
    "PDXMerlo = pd.read_csv(\"PDX_mutations.Erlotinib - Copy.tsv\", \n",
56
    "                   sep = \"\\t\", index_col=0, decimal = \",\")\n",
57
    "PDXMerlo = pd.DataFrame.transpose(PDXMerlo)\n",
58
    "# PDXCerlo = pd.read_csv(\"PDX_CNA.Erlotinib.tsv\", \n",
59
    "#                    sep = \"\\t\", index_col=0, decimal = \".\")\n",
60
    "PDXCerlo = pd.read_csv(\"PDX_CNV.Erlotinib.tsv\", \n",
61
    "                   sep = \"\\t\", index_col=0, decimal = \",\")\n",
62
    "PDXCerlo.drop_duplicates(keep='last')\n",
63
    "PDXCerlo = pd.DataFrame.transpose(PDXCerlo)\n",
64
    "PDXCerlo = PDXCerlo.loc[:,~PDXCerlo.columns.duplicated()]\n",
65
    "\n",
66
    "PDXEcet = pd.read_csv(\"PDX_exprs.Cetuximab.eb_with.GDSC_exprs.Cetuximab.tsv\", \n",
67
    "                   sep = \"\\t\", index_col=0, decimal = \",\")\n",
68
    "PDXEcet = pd.DataFrame.transpose(PDXEcet)\n",
69
    "# PDXMcet = pd.read_csv(\"PDX_mutations.Cetuximab.tsv\", \n",
70
    "#                    sep = \"\\t\", index_col=0, decimal = \".\")\n",
71
    "PDXMcet = pd.read_csv(\"PDX_mutations.Cetuximab - Copy.tsv\", \n",
72
    "                   sep = \"\\t\", index_col=0, decimal = \",\")\n",
73
    "PDXMcet = pd.DataFrame.transpose(PDXMcet)\n",
74
    "# PDXCcet = pd.read_csv(\"PDX_CNA.Cetuximab.tsv\", \n",
75
    "#                    sep = \"\\t\", index_col=0, decimal = \".\")\n",
76
    "PDXCcet = pd.read_csv(\"PDX_CNV.Cetuximab.tsv\", \n",
77
    "                   sep = \"\\t\", index_col=0, decimal = \",\")\n",
78
    "PDXCcet.drop_duplicates(keep='last')\n",
79
    "PDXCcet = pd.DataFrame.transpose(PDXCcet)\n",
80
    "PDXCcet = PDXCcet.loc[:,~PDXCcet.columns.duplicated()]\n",
81
    "\n",
82
    "selector = VarianceThreshold(0.05)\n",
83
    "selector.fit_transform(GDSCE)\n",
84
    "GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]]\n",
85
    "\n",
86
    "GDSCM = GDSCM.fillna(0)\n",
87
    "GDSCM[GDSCM != 0.0] = 1\n",
88
    "GDSCC = GDSCC.fillna(0)\n",
89
    "GDSCC[GDSCC != 0.0] = 1\n",
90
    "\n",
91
    "ls = GDSCE.columns.intersection(GDSCM.columns)\n",
92
    "ls = ls.intersection(GDSCC.columns)\n",
93
    "ls = ls.intersection(PDXEerlo.columns)\n",
94
    "ls = ls.intersection(PDXMerlo.columns)\n",
95
    "ls = ls.intersection(PDXCerlo.columns)\n",
96
    "ls = ls.intersection(PDXEcet.columns)\n",
97
    "ls = ls.intersection(PDXMcet.columns)\n",
98
    "ls = ls.intersection(PDXCcet.columns)\n",
99
    "ls2 = GDSCE.index.intersection(GDSCM.index)\n",
100
    "ls2 = ls2.intersection(GDSCC.index)\n",
101
    "ls3 = PDXEerlo.index.intersection(PDXMerlo.index)\n",
102
    "ls3 = ls3.intersection(PDXCerlo.index)\n",
103
    "ls4 = PDXEcet.index.intersection(PDXMcet.index)\n",
104
    "ls4 = ls4.intersection(PDXCcet.index)\n",
105
    "ls = pd.unique(ls)\n",
106
    "\n",
107
    "PDXEerlo = PDXEerlo.loc[ls3,ls]\n",
108
    "PDXMerlo = PDXMerlo.loc[ls3,ls]\n",
109
    "PDXCerlo = PDXCerlo.loc[ls3,ls]\n",
110
    "PDXEcet = PDXEcet.loc[ls4,ls]\n",
111
    "PDXMcet = PDXMcet.loc[ls4,ls]\n",
112
    "PDXCcet = PDXCcet.loc[ls4,ls]\n",
113
    "GDSCE = GDSCE.loc[:,ls]\n",
114
    "GDSCM = GDSCM.loc[:,ls]\n",
115
    "GDSCC = GDSCC.loc[:,ls]\n",
116
    "\n",
117
    "GDSCR = pd.read_csv(\"GDSC_response.EGFRi.tsv\", \n",
118
    "                    sep = \"\\t\", index_col=0, decimal = \",\")\n",
119
    "\n",
120
    "GDSCR.rename(mapper = str, axis = 'index', inplace = True)\n",
121
    "\n",
122
    "d = {\"R\":0,\"S\":1}\n",
123
    "GDSCR[\"response\"] = GDSCR.loc[:,\"response\"].apply(lambda x: d[x])\n",
124
    "\n",
125
    "responses = GDSCR\n",
126
    "drugs = set(responses[\"drug\"].values)\n",
127
    "exprs_z = GDSCE\n",
128
    "cna = GDSCC\n",
129
    "mut = GDSCM\n",
130
    "expression_zscores = []\n",
131
    "CNA=[]\n",
132
    "mutations = []\n",
133
    "for drug in drugs:\n",
134
    "    samples = responses.loc[responses[\"drug\"]==drug,:].index.values\n",
135
    "    e_z = exprs_z.loc[samples,:]\n",
136
    "    c = cna.loc[samples,:]\n",
137
    "    m = mut.loc[samples,:]\n",
138
    "    m = mut.loc[samples,:]\n",
139
    "    # next 3 rows if you want non-unique sample names\n",
140
    "    e_z.rename(lambda x : str(x)+\"_\"+drug, axis = \"index\", inplace=True)\n",
141
    "    c.rename(lambda x : str(x)+\"_\"+drug, axis = \"index\", inplace=True)\n",
142
    "    m.rename(lambda x : str(x)+\"_\"+drug, axis = \"index\", inplace=True)\n",
143
    "    expression_zscores.append(e_z)\n",
144
    "    CNA.append(c)\n",
145
    "    mutations.append(m)\n",
146
    "responses.index = responses.index.values +\"_\"+responses[\"drug\"].values\n",
147
    "GDSCEv2 = pd.concat(expression_zscores, axis =0 )\n",
148
    "GDSCCv2 = pd.concat(CNA, axis =0 )\n",
149
    "GDSCMv2 = pd.concat(mutations, axis =0 )\n",
150
    "GDSCRv2 = responses\n",
151
    "\n",
152
    "ls2 = GDSCEv2.index.intersection(GDSCMv2.index)\n",
153
    "ls2 = ls2.intersection(GDSCCv2.index)\n",
154
    "GDSCEv2 = GDSCEv2.loc[ls2,:]\n",
155
    "GDSCMv2 = GDSCMv2.loc[ls2,:]\n",
156
    "GDSCCv2 = GDSCCv2.loc[ls2,:]\n",
157
    "GDSCRv2 = GDSCRv2.loc[ls2,:]\n",
158
    "\n",
159
    "Y = GDSCRv2['response'].values\n",
160
    "\n",
161
    "PDXRcet = pd.read_csv(\"PDX_response.Cetuximab.tsv\", \n",
162
    "                       sep = \"\\t\", index_col=0, decimal = \",\")\n",
163
    "PDXRcet.loc[PDXRcet.iloc[:,1] == 'R'] = 0\n",
164
    "PDXRcet.loc[PDXRcet.iloc[:,1] == 'S'] = 1\n",
165
    "PDXRcet = PDXRcet.loc[ls4,:]\n",
166
    "Ytscet = PDXRcet['response'].values    \n",
167
    "\n",
168
    "PDXRerlo = pd.read_csv(\"PDX_response.Erlotinib.tsv\", \n",
169
    "                       sep = \"\\t\", index_col=0, decimal = \",\")\n",
170
    "PDXRerlo.loc[PDXRerlo.iloc[:,1] == 'R'] = 0\n",
171
    "PDXRerlo.loc[PDXRerlo.iloc[:,1] == 'S'] = 1\n",
172
    "PDXRerlo = PDXRerlo.loc[ls3,:]\n",
173
    "Ytserlo = PDXRerlo['response'].values  \n",
174
    "\n",
175
    "hdm1 = 32\n",
176
    "hdm2 = 16\n",
177
    "hdm3 = 256\n",
178
    "rate1 = 0.5\n",
179
    "rate2 = 0.8\n",
180
    "rate3 = 0.5\n",
181
    "rate4 = 0.3\n",
182
    "\n"
183
   ]
184
  },
185
  {
186
   "cell_type": "code",
187
   "execution_count": null,
188
   "metadata": {},
189
   "outputs": [],
190
   "source": []
191
  },
192
  {
193
   "cell_type": "code",
194
   "execution_count": 22,
195
   "metadata": {},
196
   "outputs": [
197
    {
198
     "name": "stderr",
199
     "output_type": "stream",
200
     "text": [
201
      "/home/hnoghabi/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:475: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.\n",
202
      "  warnings.warn(msg, DataConversionWarning)\n"
203
     ]
204
    },
205
    {
206
     "name": "stdout",
207
     "output_type": "stream",
208
     "text": [
209
      "0.9440556750399118\n",
210
      "0.7222222222222222\n",
211
      "0.8\n"
212
     ]
213
    }
214
   ],
215
   "source": [
216
    "scalerGDSC = sk.StandardScaler()\n",
217
    "scalerGDSC.fit(GDSCEv2.values)\n",
218
    "X_trainE = scalerGDSC.transform(GDSCEv2.values)\n",
219
    "X_testEerlo = scalerGDSC.transform(PDXEerlo.values)    \n",
220
    "X_testEcet = scalerGDSC.transform(PDXEcet.values)    \n",
221
    "\n",
222
    "X_trainM = np.nan_to_num(GDSCMv2.values)\n",
223
    "X_trainC = np.nan_to_num(GDSCCv2.values)\n",
224
    "X_testMerlo = np.nan_to_num(PDXMerlo.values)\n",
225
    "X_testCerlo = np.nan_to_num(PDXCerlo.values)\n",
226
    "X_testMcet = np.nan_to_num(PDXMcet.values)\n",
227
    "X_testCcet = np.nan_to_num(PDXCcet.values)\n",
228
    "\n",
229
    "TX_testEerlo = torch.FloatTensor(X_testEerlo)\n",
230
    "TX_testMerlo = torch.FloatTensor(X_testMerlo)\n",
231
    "TX_testCerlo = torch.FloatTensor(X_testCerlo)\n",
232
    "ty_testEerlo = torch.FloatTensor(Ytserlo.astype(int))\n",
233
    "\n",
234
    "TX_testEcet = torch.FloatTensor(X_testEcet)\n",
235
    "TX_testMcet = torch.FloatTensor(X_testMcet)\n",
236
    "TX_testCcet = torch.FloatTensor(X_testCcet)\n",
237
    "ty_testEcet = torch.FloatTensor(Ytscet.astype(int))\n",
238
    "\n",
239
    "n_sampE, IE_dim = X_trainE.shape\n",
240
    "n_sampM, IM_dim = X_trainM.shape\n",
241
    "n_sampC, IC_dim = X_trainC.shape\n",
242
    "\n",
243
    "h_dim1 = hdm1\n",
244
    "h_dim2 = hdm2\n",
245
    "h_dim3 = hdm3        \n",
246
    "Z_in = h_dim1 + h_dim2 + h_dim3\n",
247
    "\n",
248
    "costtr = []\n",
249
    "auctr = []\n",
250
    "costts = []\n",
251
    "aucts = []\n",
252
    "\n",
253
    "class AEE(nn.Module):\n",
254
    "    def __init__(self):\n",
255
    "        super(AEE, self).__init__()\n",
256
    "        self.EnE = torch.nn.Sequential(\n",
257
    "            nn.Linear(IE_dim, h_dim1),\n",
258
    "            nn.BatchNorm1d(h_dim1),\n",
259
    "            nn.ReLU(),\n",
260
    "            nn.Dropout(rate1))\n",
261
    "    def forward(self, x):\n",
262
    "        output = self.EnE(x)\n",
263
    "        return output\n",
264
    "\n",
265
    "class AEM(nn.Module):\n",
266
    "    def __init__(self):\n",
267
    "        super(AEM, self).__init__()\n",
268
    "        self.EnM = torch.nn.Sequential(\n",
269
    "            nn.Linear(IM_dim, h_dim2),\n",
270
    "            nn.BatchNorm1d(h_dim2),\n",
271
    "            nn.ReLU(),\n",
272
    "            nn.Dropout(rate2))\n",
273
    "    def forward(self, x):\n",
274
    "        output = self.EnM(x)\n",
275
    "        return output    \n",
276
    "\n",
277
    "\n",
278
    "class AEC(nn.Module):\n",
279
    "    def __init__(self):\n",
280
    "        super(AEC, self).__init__()\n",
281
    "        self.EnC = torch.nn.Sequential(\n",
282
    "            nn.Linear(IM_dim, h_dim3),\n",
283
    "            nn.BatchNorm1d(h_dim3),\n",
284
    "            nn.ReLU(),\n",
285
    "            nn.Dropout(rate3))\n",
286
    "    def forward(self, x):\n",
287
    "        output = self.EnC(x)\n",
288
    "        return output       \n",
289
    "\n",
290
    "class Classifier(nn.Module):\n",
291
    "    def __init__(self):\n",
292
    "        super(Classifier, self).__init__()\n",
293
    "        self.FC = torch.nn.Sequential(\n",
294
    "            nn.Linear(Z_in, 1),\n",
295
    "            nn.Dropout(rate4),\n",
296
    "            nn.Sigmoid())\n",
297
    "    def forward(self, x):\n",
298
    "        return self.FC(x)\n",
299
    "\n",
300
    "torch.cuda.manual_seed_all(42)\n",
301
    "\n",
302
    "AutoencoderE = torch.load('EGFRv2Exprs.pt')\n",
303
    "AutoencoderM = torch.load('EGFRv2Mut.pt')\n",
304
    "AutoencoderC = torch.load('EGFRv2CNA.pt')\n",
305
    "\n",
306
    "Clas = torch.load('EGFRv2Class.pt')\n",
307
    "\n",
308
    "AutoencoderE.eval()\n",
309
    "AutoencoderM.eval()\n",
310
    "AutoencoderC.eval()\n",
311
    "Clas.eval()\n",
312
    "\n",
313
    "ZEX = AutoencoderE(torch.FloatTensor(X_trainE))\n",
314
    "ZMX = AutoencoderM(torch.FloatTensor(X_trainM))\n",
315
    "ZCX = AutoencoderC(torch.FloatTensor(X_trainC))\n",
316
    "ZTX = torch.cat((ZEX, ZMX, ZCX), 1)\n",
317
    "ZTX = F.normalize(ZTX, p=2, dim=0)\n",
318
    "PredX = Clas(ZTX)\n",
319
    "AUCt = roc_auc_score(Y, PredX.detach().numpy())\n",
320
    "print(AUCt)\n",
321
    "\n",
322
    "ZETerlo = AutoencoderE(TX_testEerlo)\n",
323
    "ZMTerlo = AutoencoderM(TX_testMerlo)\n",
324
    "ZCTerlo = AutoencoderC(TX_testCerlo)\n",
325
    "ZTTerlo = torch.cat((ZETerlo, ZMTerlo, ZCTerlo), 1)\n",
326
    "ZTTerlo = F.normalize(ZTTerlo, p=2, dim=0)\n",
327
    "PredTerlo = Clas(ZTTerlo)\n",
328
    "AUCterlo = roc_auc_score(Ytserlo, PredTerlo.detach().numpy())\n",
329
    "print(AUCterlo)\n",
330
    "\n",
331
    "ZETcet = AutoencoderE(TX_testEcet)\n",
332
    "ZMTcet = AutoencoderM(TX_testMcet)\n",
333
    "ZCTcet = AutoencoderC(TX_testCcet)\n",
334
    "ZTTcet = torch.cat((ZETcet, ZMTcet, ZCTcet), 1)\n",
335
    "ZTTcet = F.normalize(ZTTcet, p=2, dim=0)\n",
336
    "PredTcet = Clas(ZTTcet)\n",
337
    "AUCtcet = roc_auc_score(Ytscet, PredTcet.detach().numpy())\n",
338
    "print(AUCtcet)"
339
   ]
340
  },
341
  {
342
   "cell_type": "code",
343
   "execution_count": null,
344
   "metadata": {},
345
   "outputs": [],
346
   "source": []
347
  },
348
  {
349
   "cell_type": "code",
350
   "execution_count": null,
351
   "metadata": {},
352
   "outputs": [],
353
   "source": []
354
  }
355
 ],
356
 "metadata": {
357
  "kernelspec": {
358
   "display_name": "Python 3",
359
   "language": "python",
360
   "name": "python3"
361
  },
362
  "language_info": {
363
   "codemirror_mode": {
364
    "name": "ipython",
365
    "version": 3
366
   },
367
   "file_extension": ".py",
368
   "mimetype": "text/x-python",
369
   "name": "python",
370
   "nbconvert_exporter": "python",
371
   "pygments_lexer": "ipython3",
372
   "version": "3.6.7"
373
  }
374
 },
375
 "nbformat": 4,
376
 "nbformat_minor": 2
377
}