579 lines (578 with data), 18.6 kB
{
"cells": [
{
"cell_type": "markdown",
"id": "3a93d4c0",
"metadata": {},
"source": [
"# Compare Associations\n",
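"* This notebook compares the final GWAS results for the full synthetic genome/phenome dataset with those for the original genome/phenome dataset. Each SNP is flagged as being of interest when its p-value is at most 1e-8, and precision, recall and F1 are then computed for the synthetic flags against the real ones."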
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "97ac91d7",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pathlib\n",
"import pandas as pd\n",
"\n",
"# Resolve the repository root. When the notebook is launched from a\n",
"# directory named `synthetics`, the root is its parent; otherwise the\n",
"# current working directory is used unchanged.\n",
"# Only the final path component is compared, so an unrelated segment that\n",
"# merely contains the text 'synthetics' is never altered, and the check\n",
"# does not depend on the platform's path separator.\n",
"cwd = pathlib.Path.cwd()\n",
"base_path = cwd.parent if cwd.name == 'synthetics' else cwd\n",
"\n",
"# Input data and the GWAS result folders for the real and synthetic runs.\n",
"data_path = base_path / 'mice_data_set' / 'data'\n",
"real_gwas_path = base_path / 'mice_data_set' / 'out'\n",
"synthetic_gwas_path = base_path / 'mice_data_set' / 'out_synth'\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0198fdaf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>snp</th>\n",
" <th>p</th>\n",
" <th>interest</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>rs29477109</td>\n",
" <td>5.052317e-14</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>rs27071351</td>\n",
" <td>7.074181e-14</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>rs27024162</td>\n",
" <td>7.170582e-14</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>rs49423067</td>\n",
" <td>7.198661e-14</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>rs29470802</td>\n",
" <td>8.049849e-14</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79640</th>\n",
" <td>79641</td>\n",
" <td>rs3162358</td>\n",
" <td>9.998911e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79641</th>\n",
" <td>79642</td>\n",
" <td>rs50509099</td>\n",
" <td>9.999012e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79642</th>\n",
" <td>79643</td>\n",
" <td>rs47505090</td>\n",
" <td>9.999041e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79643</th>\n",
" <td>79644</td>\n",
" <td>rs232293770</td>\n",
" <td>9.999351e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79644</th>\n",
" <td>79645</td>\n",
" <td>rs247449322</td>\n",
" <td>9.999861e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>79645 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" index snp p interest\n",
"0 1 rs29477109 5.052317e-14 True\n",
"1 2 rs27071351 7.074181e-14 True\n",
"2 3 rs27024162 7.170582e-14 True\n",
"3 4 rs49423067 7.198661e-14 True\n",
"4 5 rs29470802 8.049849e-14 True\n",
"... ... ... ... ...\n",
"79640 79641 rs3162358 9.998911e-01 False\n",
"79641 79642 rs50509099 9.999012e-01 False\n",
"79642 79643 rs47505090 9.999041e-01 False\n",
"79643 79644 rs232293770 9.999351e-01 False\n",
"79644 79645 rs247449322 9.999861e-01 False\n",
"\n",
"[79645 rows x 4 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PHENOTYPE = 'abBMD'\n",
"\n",
"# SNPs whose p-value is at or below this cutoff are flagged as being of interest.\n",
"P_VALUE_THRESHOLD = 1e-8\n",
"\n",
"# Load the GWAS results computed on the real data for the chosen phenotype.\n",
"real_raw = pd.read_csv(real_gwas_path / f'lm_{PHENOTYPE}_1_79646.csv')\n",
"\n",
"# The first CSV column is renamed to 'index'; only the columns needed for\n",
"# the comparison are kept. The flag is computed with a vectorized\n",
"# comparison, which yields a boolean column directly.\n",
"real_snps = (\n",
"    real_raw\n",
"    .rename(columns={real_raw.columns[0]: 'index'})\n",
"    .loc[:, ['index', 'snp', 'p']]\n",
"    .assign(interest=lambda df: df['p'] <= P_VALUE_THRESHOLD)\n",
")\n",
"real_snps"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "34958109",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>snp</th>\n",
" <th>p</th>\n",
" <th>interest</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>rs36353660</td>\n",
" <td>0.000000e+00</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>rs29220747</td>\n",
" <td>1.398388e-86</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>rs29470086</td>\n",
" <td>5.929727e-86</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>rs33102275</td>\n",
" <td>2.838852e-85</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>rs252502314</td>\n",
" <td>9.043721e-85</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71310</th>\n",
" <td>71311</td>\n",
" <td>cfw-17-49864534</td>\n",
" <td>9.999609e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71311</th>\n",
" <td>71312</td>\n",
" <td>rs30856414</td>\n",
" <td>9.999711e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71312</th>\n",
" <td>71313</td>\n",
" <td>rs108433568</td>\n",
" <td>9.999735e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71313</th>\n",
" <td>71314</td>\n",
" <td>rs237834328</td>\n",
" <td>9.999895e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71314</th>\n",
" <td>71315</td>\n",
" <td>rs52090420</td>\n",
" <td>9.999899e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>71315 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" index snp p interest\n",
"0 1 rs36353660 0.000000e+00 True\n",
"1 2 rs29220747 1.398388e-86 True\n",
"2 3 rs29470086 5.929727e-86 True\n",
"3 4 rs33102275 2.838852e-85 True\n",
"4 5 rs252502314 9.043721e-85 True\n",
"... ... ... ... ...\n",
"71310 71311 cfw-17-49864534 9.999609e-01 False\n",
"71311 71312 rs30856414 9.999711e-01 False\n",
"71312 71313 rs108433568 9.999735e-01 False\n",
"71313 71314 rs237834328 9.999895e-01 False\n",
"71314 71315 rs52090420 9.999899e-01 False\n",
"\n",
"[71315 rows x 4 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Be sure to update the name of your lm file appropriately\n",
"\n",
"# The file name is built from PHENOTYPE so that the synthetic results always\n",
"# refer to the same phenotype as the real results loaded above.\n",
"synthetic_raw = pd.read_csv(synthetic_gwas_path / f'lm_batchall_{PHENOTYPE}_1_71316.csv')\n",
"\n",
"# The first CSV column is renamed to 'index'; only the columns needed for\n",
"# the comparison are kept. The 1e-8 cutoff is the same one applied to\n",
"# real_snps, so the two 'interest' flags are directly comparable.\n",
"synthetic_snps = (\n",
"    synthetic_raw\n",
"    .rename(columns={synthetic_raw.columns[0]: 'index'})\n",
"    .loc[:, ['index', 'snp', 'p']]\n",
"    .assign(interest=lambda df: df['p'] <= 1e-8)\n",
")\n",
"synthetic_snps"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e1514a22",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index_synthetic</th>\n",
" <th>snp</th>\n",
" <th>p_synthetic</th>\n",
" <th>interest_synthetic</th>\n",
" <th>index_real</th>\n",
" <th>p_real</th>\n",
" <th>interest_real</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>rs36353660</td>\n",
" <td>0.000000e+00</td>\n",
" <td>True</td>\n",
" <td>77608</td>\n",
" <td>9.734443e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>rs29220747</td>\n",
" <td>1.398388e-86</td>\n",
" <td>True</td>\n",
" <td>75217</td>\n",
" <td>9.426825e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>rs29470086</td>\n",
" <td>5.929727e-86</td>\n",
" <td>True</td>\n",
" <td>77</td>\n",
" <td>1.346918e-12</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>rs33102275</td>\n",
" <td>2.838852e-85</td>\n",
" <td>True</td>\n",
" <td>70949</td>\n",
" <td>8.872558e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>rs252502314</td>\n",
" <td>9.043721e-85</td>\n",
" <td>True</td>\n",
" <td>74884</td>\n",
" <td>9.383166e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71310</th>\n",
" <td>71311</td>\n",
" <td>cfw-17-49864534</td>\n",
" <td>9.999609e-01</td>\n",
" <td>False</td>\n",
" <td>58306</td>\n",
" <td>7.228062e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71311</th>\n",
" <td>71312</td>\n",
" <td>rs30856414</td>\n",
" <td>9.999711e-01</td>\n",
" <td>False</td>\n",
" <td>31335</td>\n",
" <td>3.776636e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71312</th>\n",
" <td>71313</td>\n",
" <td>rs108433568</td>\n",
" <td>9.999735e-01</td>\n",
" <td>False</td>\n",
" <td>21151</td>\n",
" <td>2.508090e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71313</th>\n",
" <td>71314</td>\n",
" <td>rs237834328</td>\n",
" <td>9.999895e-01</td>\n",
" <td>False</td>\n",
" <td>13645</td>\n",
" <td>1.552650e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71314</th>\n",
" <td>71315</td>\n",
" <td>rs52090420</td>\n",
" <td>9.999899e-01</td>\n",
" <td>False</td>\n",
" <td>66119</td>\n",
" <td>8.233664e-01</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>71315 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" index_synthetic snp p_synthetic interest_synthetic \\\n",
"0 1 rs36353660 0.000000e+00 True \n",
"1 2 rs29220747 1.398388e-86 True \n",
"2 3 rs29470086 5.929727e-86 True \n",
"3 4 rs33102275 2.838852e-85 True \n",
"4 5 rs252502314 9.043721e-85 True \n",
"... ... ... ... ... \n",
"71310 71311 cfw-17-49864534 9.999609e-01 False \n",
"71311 71312 rs30856414 9.999711e-01 False \n",
"71312 71313 rs108433568 9.999735e-01 False \n",
"71313 71314 rs237834328 9.999895e-01 False \n",
"71314 71315 rs52090420 9.999899e-01 False \n",
"\n",
" index_real p_real interest_real \n",
"0 77608 9.734443e-01 False \n",
"1 75217 9.426825e-01 False \n",
"2 77 1.346918e-12 True \n",
"3 70949 8.872558e-01 False \n",
"4 74884 9.383166e-01 False \n",
"... ... ... ... \n",
"71310 58306 7.228062e-01 False \n",
"71311 31335 3.776636e-01 False \n",
"71312 21151 2.508090e-01 False \n",
"71313 13645 1.552650e-01 False \n",
"71314 66119 8.233664e-01 False \n",
"\n",
"[71315 rows x 7 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pair every synthetic SNP with its counterpart from the real GWAS run.\n",
"# The inner join keeps only SNP ids that occur in both result sets, and the\n",
"# suffixes tell the synthetic and real copies of each shared column apart.\n",
"combined = synthetic_snps.merge(\n",
"    real_snps,\n",
"    on='snp',\n",
"    how='inner',\n",
"    suffixes=('_synthetic', '_real'),\n",
")\n",
"combined"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "86ba40f3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" False 1.00 0.99 1.00 71122\n",
" True 0.32 0.92 0.47 193\n",
"\n",
" accuracy 0.99 71315\n",
" macro avg 0.66 0.96 0.74 71315\n",
"weighted avg 1.00 0.99 1.00 71315\n"
]
}
],
"source": [
"from sklearn.metrics import f1_score, classification_report, confusion_matrix\n",
"\n",
"# Treat the flags from the real GWAS as ground truth and the flags from the\n",
"# synthetic GWAS as predictions, then print per-class precision, recall and F1.\n",
"report = classification_report(\n",
"    y_true=combined['interest_real'],\n",
"    y_pred=combined['interest_synthetic'],\n",
")\n",
"print(report)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b50257ac",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}