{ "cells": [ { "cell_type": "markdown", "id": "3a93d4c0", "metadata": {}, "source": [ "# Compare Associations\n", "* This notebook compares the final GWAS p-values for the full synthetic genome/phenome datasets to those in the original genome/phenome datasets and computes the precision, recall and F1 values." ] }, { "cell_type": "code", "execution_count": 1, "id": "97ac91d7", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pathlib\n", "import pandas as pd\n", "\n", "# Locate the directory that contains `mice_data_set`.\n", "# When the working directory is itself named `synthetics`, that directory is its\n", "# parent; otherwise the working directory is used unchanged. Only the final path\n", "# component is compared, so directories whose names merely contain 'synthetics'\n", "# are left untouched and the check does not depend on the path separator.\n", "cwd = pathlib.Path.cwd()\n", "base_path = cwd.parent if cwd.name == 'synthetics' else cwd\n", "\n", "# Sub-directories of `mice_data_set` used by this notebook.\n", "data_path = base_path / 'mice_data_set' / 'data'\n", "real_gwas_path = base_path / 'mice_data_set' / 'out'\n", "synthetic_gwas_path = base_path / 'mice_data_set' / 'out_synth'\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "0198fdaf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexsnppinterest
01rs294771095.052317e-14True
12rs270713517.074181e-14True
23rs270241627.170582e-14True
34rs494230677.198661e-14True
45rs294708028.049849e-14True
...............
7964079641rs31623589.998911e-01False
7964179642rs505090999.999012e-01False
7964279643rs475050909.999041e-01False
7964379644rs2322937709.999351e-01False
7964479645rs2474493229.999861e-01False
\n", "

79645 rows × 4 columns

\n", "
" ], "text/plain": [ " index snp p interest\n", "0 1 rs29477109 5.052317e-14 True\n", "1 2 rs27071351 7.074181e-14 True\n", "2 3 rs27024162 7.170582e-14 True\n", "3 4 rs49423067 7.198661e-14 True\n", "4 5 rs29470802 8.049849e-14 True\n", "... ... ... ... ...\n", "79640 79641 rs3162358 9.998911e-01 False\n", "79641 79642 rs50509099 9.999012e-01 False\n", "79642 79643 rs47505090 9.999041e-01 False\n", "79643 79644 rs232293770 9.999351e-01 False\n", "79644 79645 rs247449322 9.999861e-01 False\n", "\n", "[79645 rows x 4 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PHENOTYPE = 'abBMD'\n", "# p-value cutoff: a SNP is flagged as an association of interest when p <= this value.\n", "SIGNIFICANCE_THRESHOLD = 1e-8\n", "\n", "# GWAS results computed on the real data for the chosen phenotype.\n", "real_snps = pd.read_csv(real_gwas_path / f'lm_{PHENOTYPE}_1_79646.csv')\n", "# The first CSV column is renamed by position, whatever its header is in the file.\n", "real_snps = real_snps.rename(columns={real_snps.columns[0]: 'index'})\n", "# Keep only the columns used for the comparison; .copy() gives an independent\n", "# frame so the column added below is never written to a view of the raw table.\n", "real_snps = real_snps[['index', 'snp', 'p']].copy()\n", "# Vectorized flag; a missing p-value compares False and is therefore not flagged.\n", "real_snps['interest'] = real_snps['p'] <= SIGNIFICANCE_THRESHOLD\n", "real_snps" ] }, { "cell_type": "code", "execution_count": 7, "id": "34958109", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexsnppinterest
01rs363536600.000000e+00True
12rs292207471.398388e-86True
23rs294700865.929727e-86True
34rs331022752.838852e-85True
45rs2525023149.043721e-85True
...............
7131071311cfw-17-498645349.999609e-01False
7131171312rs308564149.999711e-01False
7131271313rs1084335689.999735e-01False
7131371314rs2378343289.999895e-01False
7131471315rs520904209.999899e-01False
\n", "

71315 rows × 4 columns

\n", "
" ], "text/plain": [ " index snp p interest\n", "0 1 rs36353660 0.000000e+00 True\n", "1 2 rs29220747 1.398388e-86 True\n", "2 3 rs29470086 5.929727e-86 True\n", "3 4 rs33102275 2.838852e-85 True\n", "4 5 rs252502314 9.043721e-85 True\n", "... ... ... ... ...\n", "71310 71311 cfw-17-49864534 9.999609e-01 False\n", "71311 71312 rs30856414 9.999711e-01 False\n", "71312 71313 rs108433568 9.999735e-01 False\n", "71313 71314 rs237834328 9.999895e-01 False\n", "71314 71315 rs52090420 9.999899e-01 False\n", "\n", "[71315 rows x 4 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Be sure to update the name of your lm file appropriately\n", "# (the phenotype part follows PHENOTYPE so both result sets describe the same trait).\n", "\n", "synthetic_snps = pd.read_csv(synthetic_gwas_path / f'lm_batchall_{PHENOTYPE}_1_71316.csv')\n", "# The first CSV column is renamed by position, whatever its header is in the file.\n", "synthetic_snps = synthetic_snps.rename(columns={synthetic_snps.columns[0]: 'index'})\n", "# Keep only the columns used for the comparison; .copy() gives an independent\n", "# frame so the column added below is never written to a view of the raw table.\n", "synthetic_snps = synthetic_snps[['index', 'snp', 'p']].copy()\n", "# Same 1e-8 cutoff as applied to the real GWAS results; a missing p-value\n", "# compares False and is therefore not flagged.\n", "synthetic_snps['interest'] = synthetic_snps['p'] <= 1e-8\n", "synthetic_snps" ] }, { "cell_type": "code", "execution_count": 8, "id": "e1514a22", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
index_syntheticsnpp_syntheticinterest_syntheticindex_realp_realinterest_real
01rs363536600.000000e+00True776089.734443e-01False
12rs292207471.398388e-86True752179.426825e-01False
23rs294700865.929727e-86True771.346918e-12True
34rs331022752.838852e-85True709498.872558e-01False
45rs2525023149.043721e-85True748849.383166e-01False
........................
7131071311cfw-17-498645349.999609e-01False583067.228062e-01False
7131171312rs308564149.999711e-01False313353.776636e-01False
7131271313rs1084335689.999735e-01False211512.508090e-01False
7131371314rs2378343289.999895e-01False136451.552650e-01False
7131471315rs520904209.999899e-01False661198.233664e-01False
\n", "

71315 rows × 7 columns

\n", "
" ], "text/plain": [ " index_synthetic snp p_synthetic interest_synthetic \\\n", "0 1 rs36353660 0.000000e+00 True \n", "1 2 rs29220747 1.398388e-86 True \n", "2 3 rs29470086 5.929727e-86 True \n", "3 4 rs33102275 2.838852e-85 True \n", "4 5 rs252502314 9.043721e-85 True \n", "... ... ... ... ... \n", "71310 71311 cfw-17-49864534 9.999609e-01 False \n", "71311 71312 rs30856414 9.999711e-01 False \n", "71312 71313 rs108433568 9.999735e-01 False \n", "71313 71314 rs237834328 9.999895e-01 False \n", "71314 71315 rs52090420 9.999899e-01 False \n", "\n", " index_real p_real interest_real \n", "0 77608 9.734443e-01 False \n", "1 75217 9.426825e-01 False \n", "2 77 1.346918e-12 True \n", "3 70949 8.872558e-01 False \n", "4 74884 9.383166e-01 False \n", "... ... ... ... \n", "71310 58306 7.228062e-01 False \n", "71311 31335 3.776636e-01 False \n", "71312 21151 2.508090e-01 False \n", "71313 13645 1.552650e-01 False \n", "71314 66119 8.233664e-01 False \n", "\n", "[71315 rows x 7 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inner join on the SNP id: only SNPs present in BOTH result sets are kept, so a\n", "# real SNP that is absent from the synthetic results is excluded from the report\n", "# below rather than counted as a miss.\n", "# validate='one_to_one' makes the merge raise pandas.errors.MergeError if either\n", "# table repeats a SNP id, instead of silently multiplying rows and skewing the\n", "# precision/recall figures.\n", "combined = pd.merge(\n", "    synthetic_snps,\n", "    real_snps,\n", "    how='inner',\n", "    on=['snp'],\n", "    suffixes=['_synthetic', '_real'],\n", "    validate='one_to_one',\n", ")\n", "combined" ] }, { "cell_type": "code", "execution_count": 9, "id": "86ba40f3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " False 1.00 0.99 1.00 71122\n", " True 0.32 0.92 0.47 193\n", "\n", " accuracy 0.99 71315\n", " macro avg 0.66 0.96 0.74 71315\n", "weighted avg 1.00 0.99 1.00 71315\n" ] } ], "source": [ "from sklearn.metrics import f1_score, classification_report, confusion_matrix\n", "\n", "# Argument order is (y_true, y_pred): the real-data flags are the ground truth and\n", "# the synthetic-data flags are the predictions being scored.\n", "print(classification_report(combined['interest_real'], combined['interest_synthetic']))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b50257ac", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", 
"language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 5 }