Switch to unified view

a b/synthetics/05_compare_associations.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "markdown",
5
   "id": "3a93d4c0",
6
   "metadata": {},
7
   "source": [
8
    "# Compare Associations\n",
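9
    "* This notebook compares the final GWAS p-values for the full synthetic genome/phenome datasets with those for the original genome/phenome datasets. SNPs with p <= 1e-8 are flagged as significant in each dataset, and the precision, recall and F1 values of the synthetic flags are computed against the real ones."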
10
   ]
11
  },
12
  {
13
   "cell_type": "code",
14
   "execution_count": 1,
15
   "id": "97ac91d7",
16
   "metadata": {},
17
   "outputs": [],
18
   "source": [
19
    "import os\n",
20
    "import pathlib\n",
21
    "import pandas as pd\n",
22
    "\n",
23
    "base_path = pathlib.Path.cwd().parent if pathlib.Path.cwd().name == 'synthetics' else pathlib.Path.cwd()  # drop only a trailing 'synthetics' dir\n",
24
    "data_path = base_path / 'mice_data_set' / 'data'\n",
25
    "real_gwas_path = base_path / 'mice_data_set' / 'out'  # read below for the real-data lm results\n",
26
    "synthetic_gwas_path = base_path / 'mice_data_set' / 'out_synth'  # read below for the synthetic lm results\n"
27
   ]
28
  },
29
  {
30
   "cell_type": "code",
31
   "execution_count": 6,
32
   "id": "0198fdaf",
33
   "metadata": {},
34
   "outputs": [
35
    {
36
     "data": {
37
      "text/html": [
38
       "<div>\n",
39
       "<style scoped>\n",
40
       "    .dataframe tbody tr th:only-of-type {\n",
41
       "        vertical-align: middle;\n",
42
       "    }\n",
43
       "\n",
44
       "    .dataframe tbody tr th {\n",
45
       "        vertical-align: top;\n",
46
       "    }\n",
47
       "\n",
48
       "    .dataframe thead th {\n",
49
       "        text-align: right;\n",
50
       "    }\n",
51
       "</style>\n",
52
       "<table border=\"1\" class=\"dataframe\">\n",
53
       "  <thead>\n",
54
       "    <tr style=\"text-align: right;\">\n",
55
       "      <th></th>\n",
56
       "      <th>index</th>\n",
57
       "      <th>snp</th>\n",
58
       "      <th>p</th>\n",
59
       "      <th>interest</th>\n",
60
       "    </tr>\n",
61
       "  </thead>\n",
62
       "  <tbody>\n",
63
       "    <tr>\n",
64
       "      <th>0</th>\n",
65
       "      <td>1</td>\n",
66
       "      <td>rs29477109</td>\n",
67
       "      <td>5.052317e-14</td>\n",
68
       "      <td>True</td>\n",
69
       "    </tr>\n",
70
       "    <tr>\n",
71
       "      <th>1</th>\n",
72
       "      <td>2</td>\n",
73
       "      <td>rs27071351</td>\n",
74
       "      <td>7.074181e-14</td>\n",
75
       "      <td>True</td>\n",
76
       "    </tr>\n",
77
       "    <tr>\n",
78
       "      <th>2</th>\n",
79
       "      <td>3</td>\n",
80
       "      <td>rs27024162</td>\n",
81
       "      <td>7.170582e-14</td>\n",
82
       "      <td>True</td>\n",
83
       "    </tr>\n",
84
       "    <tr>\n",
85
       "      <th>3</th>\n",
86
       "      <td>4</td>\n",
87
       "      <td>rs49423067</td>\n",
88
       "      <td>7.198661e-14</td>\n",
89
       "      <td>True</td>\n",
90
       "    </tr>\n",
91
       "    <tr>\n",
92
       "      <th>4</th>\n",
93
       "      <td>5</td>\n",
94
       "      <td>rs29470802</td>\n",
95
       "      <td>8.049849e-14</td>\n",
96
       "      <td>True</td>\n",
97
       "    </tr>\n",
98
       "    <tr>\n",
99
       "      <th>...</th>\n",
100
       "      <td>...</td>\n",
101
       "      <td>...</td>\n",
102
       "      <td>...</td>\n",
103
       "      <td>...</td>\n",
104
       "    </tr>\n",
105
       "    <tr>\n",
106
       "      <th>79640</th>\n",
107
       "      <td>79641</td>\n",
108
       "      <td>rs3162358</td>\n",
109
       "      <td>9.998911e-01</td>\n",
110
       "      <td>False</td>\n",
111
       "    </tr>\n",
112
       "    <tr>\n",
113
       "      <th>79641</th>\n",
114
       "      <td>79642</td>\n",
115
       "      <td>rs50509099</td>\n",
116
       "      <td>9.999012e-01</td>\n",
117
       "      <td>False</td>\n",
118
       "    </tr>\n",
119
       "    <tr>\n",
120
       "      <th>79642</th>\n",
121
       "      <td>79643</td>\n",
122
       "      <td>rs47505090</td>\n",
123
       "      <td>9.999041e-01</td>\n",
124
       "      <td>False</td>\n",
125
       "    </tr>\n",
126
       "    <tr>\n",
127
       "      <th>79643</th>\n",
128
       "      <td>79644</td>\n",
129
       "      <td>rs232293770</td>\n",
130
       "      <td>9.999351e-01</td>\n",
131
       "      <td>False</td>\n",
132
       "    </tr>\n",
133
       "    <tr>\n",
134
       "      <th>79644</th>\n",
135
       "      <td>79645</td>\n",
136
       "      <td>rs247449322</td>\n",
137
       "      <td>9.999861e-01</td>\n",
138
       "      <td>False</td>\n",
139
       "    </tr>\n",
140
       "  </tbody>\n",
141
       "</table>\n",
142
       "<p>79645 rows × 4 columns</p>\n",
143
       "</div>"
144
      ],
145
      "text/plain": [
146
       "       index          snp             p  interest\n",
147
       "0          1   rs29477109  5.052317e-14      True\n",
148
       "1          2   rs27071351  7.074181e-14      True\n",
149
       "2          3   rs27024162  7.170582e-14      True\n",
150
       "3          4   rs49423067  7.198661e-14      True\n",
151
       "4          5   rs29470802  8.049849e-14      True\n",
152
       "...      ...          ...           ...       ...\n",
153
       "79640  79641    rs3162358  9.998911e-01     False\n",
154
       "79641  79642   rs50509099  9.999012e-01     False\n",
155
       "79642  79643   rs47505090  9.999041e-01     False\n",
156
       "79643  79644  rs232293770  9.999351e-01     False\n",
157
       "79644  79645  rs247449322  9.999861e-01     False\n",
158
       "\n",
159
       "[79645 rows x 4 columns]"
160
      ]
161
     },
162
     "execution_count": 6,
163
     "metadata": {},
164
     "output_type": "execute_result"
165
    }
166
   ],
167
   "source": [
168
    "PHENOTYPE = 'abBMD'\n",
169
    "\n",
170
    "real_snps = pd.read_csv(real_gwas_path / f'lm_{PHENOTYPE}_1_79646.csv')\n",
171
    "real_snps = real_snps.rename(columns={real_snps.columns[0]: 'index'})  # first CSV column becomes 'index'\n",
172
    "real_snps = real_snps[['index', 'snp', 'p']].copy()  # explicit copy before adding a column\n",
173
    "real_snps['interest'] = real_snps['p'] <= 1e-8  # vectorized flag: True where p is at or below the cutoff\n",
174
    "real_snps"
175
   ]
176
  },
177
  {
178
   "cell_type": "code",
179
   "execution_count": 7,
180
   "id": "34958109",
181
   "metadata": {},
182
   "outputs": [
183
    {
184
     "data": {
185
      "text/html": [
186
       "<div>\n",
187
       "<style scoped>\n",
188
       "    .dataframe tbody tr th:only-of-type {\n",
189
       "        vertical-align: middle;\n",
190
       "    }\n",
191
       "\n",
192
       "    .dataframe tbody tr th {\n",
193
       "        vertical-align: top;\n",
194
       "    }\n",
195
       "\n",
196
       "    .dataframe thead th {\n",
197
       "        text-align: right;\n",
198
       "    }\n",
199
       "</style>\n",
200
       "<table border=\"1\" class=\"dataframe\">\n",
201
       "  <thead>\n",
202
       "    <tr style=\"text-align: right;\">\n",
203
       "      <th></th>\n",
204
       "      <th>index</th>\n",
205
       "      <th>snp</th>\n",
206
       "      <th>p</th>\n",
207
       "      <th>interest</th>\n",
208
       "    </tr>\n",
209
       "  </thead>\n",
210
       "  <tbody>\n",
211
       "    <tr>\n",
212
       "      <th>0</th>\n",
213
       "      <td>1</td>\n",
214
       "      <td>rs36353660</td>\n",
215
       "      <td>0.000000e+00</td>\n",
216
       "      <td>True</td>\n",
217
       "    </tr>\n",
218
       "    <tr>\n",
219
       "      <th>1</th>\n",
220
       "      <td>2</td>\n",
221
       "      <td>rs29220747</td>\n",
222
       "      <td>1.398388e-86</td>\n",
223
       "      <td>True</td>\n",
224
       "    </tr>\n",
225
       "    <tr>\n",
226
       "      <th>2</th>\n",
227
       "      <td>3</td>\n",
228
       "      <td>rs29470086</td>\n",
229
       "      <td>5.929727e-86</td>\n",
230
       "      <td>True</td>\n",
231
       "    </tr>\n",
232
       "    <tr>\n",
233
       "      <th>3</th>\n",
234
       "      <td>4</td>\n",
235
       "      <td>rs33102275</td>\n",
236
       "      <td>2.838852e-85</td>\n",
237
       "      <td>True</td>\n",
238
       "    </tr>\n",
239
       "    <tr>\n",
240
       "      <th>4</th>\n",
241
       "      <td>5</td>\n",
242
       "      <td>rs252502314</td>\n",
243
       "      <td>9.043721e-85</td>\n",
244
       "      <td>True</td>\n",
245
       "    </tr>\n",
246
       "    <tr>\n",
247
       "      <th>...</th>\n",
248
       "      <td>...</td>\n",
249
       "      <td>...</td>\n",
250
       "      <td>...</td>\n",
251
       "      <td>...</td>\n",
252
       "    </tr>\n",
253
       "    <tr>\n",
254
       "      <th>71310</th>\n",
255
       "      <td>71311</td>\n",
256
       "      <td>cfw-17-49864534</td>\n",
257
       "      <td>9.999609e-01</td>\n",
258
       "      <td>False</td>\n",
259
       "    </tr>\n",
260
       "    <tr>\n",
261
       "      <th>71311</th>\n",
262
       "      <td>71312</td>\n",
263
       "      <td>rs30856414</td>\n",
264
       "      <td>9.999711e-01</td>\n",
265
       "      <td>False</td>\n",
266
       "    </tr>\n",
267
       "    <tr>\n",
268
       "      <th>71312</th>\n",
269
       "      <td>71313</td>\n",
270
       "      <td>rs108433568</td>\n",
271
       "      <td>9.999735e-01</td>\n",
272
       "      <td>False</td>\n",
273
       "    </tr>\n",
274
       "    <tr>\n",
275
       "      <th>71313</th>\n",
276
       "      <td>71314</td>\n",
277
       "      <td>rs237834328</td>\n",
278
       "      <td>9.999895e-01</td>\n",
279
       "      <td>False</td>\n",
280
       "    </tr>\n",
281
       "    <tr>\n",
282
       "      <th>71314</th>\n",
283
       "      <td>71315</td>\n",
284
       "      <td>rs52090420</td>\n",
285
       "      <td>9.999899e-01</td>\n",
286
       "      <td>False</td>\n",
287
       "    </tr>\n",
288
       "  </tbody>\n",
289
       "</table>\n",
290
       "<p>71315 rows × 4 columns</p>\n",
291
       "</div>"
292
      ],
293
      "text/plain": [
294
       "       index              snp             p  interest\n",
295
       "0          1       rs36353660  0.000000e+00      True\n",
296
       "1          2       rs29220747  1.398388e-86      True\n",
297
       "2          3       rs29470086  5.929727e-86      True\n",
298
       "3          4       rs33102275  2.838852e-85      True\n",
299
       "4          5      rs252502314  9.043721e-85      True\n",
300
       "...      ...              ...           ...       ...\n",
301
       "71310  71311  cfw-17-49864534  9.999609e-01     False\n",
302
       "71311  71312       rs30856414  9.999711e-01     False\n",
303
       "71312  71313      rs108433568  9.999735e-01     False\n",
304
       "71313  71314      rs237834328  9.999895e-01     False\n",
305
       "71314  71315       rs52090420  9.999899e-01     False\n",
306
       "\n",
307
       "[71315 rows x 4 columns]"
308
      ]
309
     },
310
     "execution_count": 7,
311
     "metadata": {},
312
     "output_type": "execute_result"
313
    }
314
   ],
315
   "source": [
316
    "# Update the lm file name below to match the output of your own synthetic GWAS run\n",
317
    "\n",
318
    "synthetic_snps = pd.read_csv(synthetic_gwas_path / f'lm_batchall_{PHENOTYPE}_1_71316.csv')\n",
319
    "synthetic_snps = synthetic_snps.rename(columns={synthetic_snps.columns[0]: 'index'})  # first CSV column becomes 'index'\n",
320
    "synthetic_snps = synthetic_snps[['index', 'snp', 'p']].copy()  # explicit copy before adding a column\n",
321
    "synthetic_snps['interest'] = synthetic_snps['p'] <= 1e-8  # vectorized flag, same cutoff as the real-data cell\n",
322
    "synthetic_snps"
323
   ]
324
  },
325
  {
326
   "cell_type": "code",
327
   "execution_count": 8,
328
   "id": "e1514a22",
329
   "metadata": {},
330
   "outputs": [
331
    {
332
     "data": {
333
      "text/html": [
334
       "<div>\n",
335
       "<style scoped>\n",
336
       "    .dataframe tbody tr th:only-of-type {\n",
337
       "        vertical-align: middle;\n",
338
       "    }\n",
339
       "\n",
340
       "    .dataframe tbody tr th {\n",
341
       "        vertical-align: top;\n",
342
       "    }\n",
343
       "\n",
344
       "    .dataframe thead th {\n",
345
       "        text-align: right;\n",
346
       "    }\n",
347
       "</style>\n",
348
       "<table border=\"1\" class=\"dataframe\">\n",
349
       "  <thead>\n",
350
       "    <tr style=\"text-align: right;\">\n",
351
       "      <th></th>\n",
352
       "      <th>index_synthetic</th>\n",
353
       "      <th>snp</th>\n",
354
       "      <th>p_synthetic</th>\n",
355
       "      <th>interest_synthetic</th>\n",
356
       "      <th>index_real</th>\n",
357
       "      <th>p_real</th>\n",
358
       "      <th>interest_real</th>\n",
359
       "    </tr>\n",
360
       "  </thead>\n",
361
       "  <tbody>\n",
362
       "    <tr>\n",
363
       "      <th>0</th>\n",
364
       "      <td>1</td>\n",
365
       "      <td>rs36353660</td>\n",
366
       "      <td>0.000000e+00</td>\n",
367
       "      <td>True</td>\n",
368
       "      <td>77608</td>\n",
369
       "      <td>9.734443e-01</td>\n",
370
       "      <td>False</td>\n",
371
       "    </tr>\n",
372
       "    <tr>\n",
373
       "      <th>1</th>\n",
374
       "      <td>2</td>\n",
375
       "      <td>rs29220747</td>\n",
376
       "      <td>1.398388e-86</td>\n",
377
       "      <td>True</td>\n",
378
       "      <td>75217</td>\n",
379
       "      <td>9.426825e-01</td>\n",
380
       "      <td>False</td>\n",
381
       "    </tr>\n",
382
       "    <tr>\n",
383
       "      <th>2</th>\n",
384
       "      <td>3</td>\n",
385
       "      <td>rs29470086</td>\n",
386
       "      <td>5.929727e-86</td>\n",
387
       "      <td>True</td>\n",
388
       "      <td>77</td>\n",
389
       "      <td>1.346918e-12</td>\n",
390
       "      <td>True</td>\n",
391
       "    </tr>\n",
392
       "    <tr>\n",
393
       "      <th>3</th>\n",
394
       "      <td>4</td>\n",
395
       "      <td>rs33102275</td>\n",
396
       "      <td>2.838852e-85</td>\n",
397
       "      <td>True</td>\n",
398
       "      <td>70949</td>\n",
399
       "      <td>8.872558e-01</td>\n",
400
       "      <td>False</td>\n",
401
       "    </tr>\n",
402
       "    <tr>\n",
403
       "      <th>4</th>\n",
404
       "      <td>5</td>\n",
405
       "      <td>rs252502314</td>\n",
406
       "      <td>9.043721e-85</td>\n",
407
       "      <td>True</td>\n",
408
       "      <td>74884</td>\n",
409
       "      <td>9.383166e-01</td>\n",
410
       "      <td>False</td>\n",
411
       "    </tr>\n",
412
       "    <tr>\n",
413
       "      <th>...</th>\n",
414
       "      <td>...</td>\n",
415
       "      <td>...</td>\n",
416
       "      <td>...</td>\n",
417
       "      <td>...</td>\n",
418
       "      <td>...</td>\n",
419
       "      <td>...</td>\n",
420
       "      <td>...</td>\n",
421
       "    </tr>\n",
422
       "    <tr>\n",
423
       "      <th>71310</th>\n",
424
       "      <td>71311</td>\n",
425
       "      <td>cfw-17-49864534</td>\n",
426
       "      <td>9.999609e-01</td>\n",
427
       "      <td>False</td>\n",
428
       "      <td>58306</td>\n",
429
       "      <td>7.228062e-01</td>\n",
430
       "      <td>False</td>\n",
431
       "    </tr>\n",
432
       "    <tr>\n",
433
       "      <th>71311</th>\n",
434
       "      <td>71312</td>\n",
435
       "      <td>rs30856414</td>\n",
436
       "      <td>9.999711e-01</td>\n",
437
       "      <td>False</td>\n",
438
       "      <td>31335</td>\n",
439
       "      <td>3.776636e-01</td>\n",
440
       "      <td>False</td>\n",
441
       "    </tr>\n",
442
       "    <tr>\n",
443
       "      <th>71312</th>\n",
444
       "      <td>71313</td>\n",
445
       "      <td>rs108433568</td>\n",
446
       "      <td>9.999735e-01</td>\n",
447
       "      <td>False</td>\n",
448
       "      <td>21151</td>\n",
449
       "      <td>2.508090e-01</td>\n",
450
       "      <td>False</td>\n",
451
       "    </tr>\n",
452
       "    <tr>\n",
453
       "      <th>71313</th>\n",
454
       "      <td>71314</td>\n",
455
       "      <td>rs237834328</td>\n",
456
       "      <td>9.999895e-01</td>\n",
457
       "      <td>False</td>\n",
458
       "      <td>13645</td>\n",
459
       "      <td>1.552650e-01</td>\n",
460
       "      <td>False</td>\n",
461
       "    </tr>\n",
462
       "    <tr>\n",
463
       "      <th>71314</th>\n",
464
       "      <td>71315</td>\n",
465
       "      <td>rs52090420</td>\n",
466
       "      <td>9.999899e-01</td>\n",
467
       "      <td>False</td>\n",
468
       "      <td>66119</td>\n",
469
       "      <td>8.233664e-01</td>\n",
470
       "      <td>False</td>\n",
471
       "    </tr>\n",
472
       "  </tbody>\n",
473
       "</table>\n",
474
       "<p>71315 rows × 7 columns</p>\n",
475
       "</div>"
476
      ],
477
      "text/plain": [
478
       "       index_synthetic              snp   p_synthetic  interest_synthetic  \\\n",
479
       "0                    1       rs36353660  0.000000e+00                True   \n",
480
       "1                    2       rs29220747  1.398388e-86                True   \n",
481
       "2                    3       rs29470086  5.929727e-86                True   \n",
482
       "3                    4       rs33102275  2.838852e-85                True   \n",
483
       "4                    5      rs252502314  9.043721e-85                True   \n",
484
       "...                ...              ...           ...                 ...   \n",
485
       "71310            71311  cfw-17-49864534  9.999609e-01               False   \n",
486
       "71311            71312       rs30856414  9.999711e-01               False   \n",
487
       "71312            71313      rs108433568  9.999735e-01               False   \n",
488
       "71313            71314      rs237834328  9.999895e-01               False   \n",
489
       "71314            71315       rs52090420  9.999899e-01               False   \n",
490
       "\n",
491
       "       index_real        p_real  interest_real  \n",
492
       "0           77608  9.734443e-01          False  \n",
493
       "1           75217  9.426825e-01          False  \n",
494
       "2              77  1.346918e-12           True  \n",
495
       "3           70949  8.872558e-01          False  \n",
496
       "4           74884  9.383166e-01          False  \n",
497
       "...           ...           ...            ...  \n",
498
       "71310       58306  7.228062e-01          False  \n",
499
       "71311       31335  3.776636e-01          False  \n",
500
       "71312       21151  2.508090e-01          False  \n",
501
       "71313       13645  1.552650e-01          False  \n",
502
       "71314       66119  8.233664e-01          False  \n",
503
       "\n",
504
       "[71315 rows x 7 columns]"
505
      ]
506
     },
507
     "execution_count": 8,
508
     "metadata": {},
509
     "output_type": "execute_result"
510
    }
511
   ],
512
   "source": [
513
    "combined = synthetic_snps.merge(\n",
514
    "    real_snps,\n",
515
    "    on='snp',  # join key shared by both result tables\n",
516
    "    how='inner',  # keep only SNPs present in both\n",
517
    "    suffixes=('_synthetic', '_real'))\n",
518
    "combined"
519
   ]
520
  },
521
  {
522
   "cell_type": "code",
523
   "execution_count": 9,
524
   "id": "86ba40f3",
525
   "metadata": {},
526
   "outputs": [
527
    {
528
     "name": "stdout",
529
     "output_type": "stream",
530
     "text": [
531
      "              precision    recall  f1-score   support\n",
532
      "\n",
533
      "       False       1.00      0.99      1.00     71122\n",
534
      "        True       0.32      0.92      0.47       193\n",
535
      "\n",
536
      "    accuracy                           0.99     71315\n",
537
      "   macro avg       0.66      0.96      0.74     71315\n",
538
      "weighted avg       1.00      0.99      1.00     71315\n"
539
     ]
540
    }
541
   ],
542
   "source": [
543
    "from sklearn.metrics import f1_score, classification_report, confusion_matrix\n",
544
    "\n",
545
    "print(classification_report(y_true=combined['interest_real'], y_pred=combined['interest_synthetic']))  # real flags are the reference\n"
546
   ]
547
  },
548
  {
549
   "cell_type": "code",
550
   "execution_count": null,
551
   "id": "b50257ac",
552
   "metadata": {},
553
   "outputs": [],
554
   "source": []
555
  }
556
 ],
557
 "metadata": {
558
  "kernelspec": {
559
   "display_name": "Python 3 (ipykernel)",
560
   "language": "python",
561
   "name": "python3"
562
  },
563
  "language_info": {
564
   "codemirror_mode": {
565
    "name": "ipython",
566
    "version": 3
567
   },
568
   "file_extension": ".py",
569
   "mimetype": "text/x-python",
570
   "name": "python",
571
   "nbconvert_exporter": "python",
572
   "pygments_lexer": "ipython3",
573
   "version": "3.7.5"
574
  }
575
 },
576
 "nbformat": 4,
577
 "nbformat_minor": 5
578
}