[befbfc]: / synthetics / 05_compare_associations.ipynb

Download this file

579 lines (578 with data), 18.6 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3a93d4c0",
   "metadata": {},
   "source": [
    "# Compare Associations\n",
    "* This notebook compares the final GWAS p-values for the full synthetic genome/phenome datasets to those in the original genome/phenome datasets and computes the precision, recall and F1 values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "97ac91d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pathlib\n",
    "import pandas as pd\n",
    "\n",
    "# Locate the repository root. This notebook presumably runs from the\n",
    "# `synthetics/` folder, so step up one level only when that is the current\n",
    "# directory; otherwise the working directory is used unchanged. Checking the\n",
    "# final path component (rather than deleting the text '/synthetics' anywhere\n",
    "# in the path) leaves similarly named parent folders intact and does not\n",
    "# depend on the path separator.\n",
    "working_dir = pathlib.Path(os.getcwd())\n",
    "base_path = working_dir.parent if working_dir.name == 'synthetics' else working_dir\n",
    "\n",
    "# Folders under `mice_data_set`. The later cells read the real-data GWAS CSV\n",
    "# from `out` and the synthetic-data GWAS CSV from `out_synth`; `data_path`\n",
    "# is not referenced by the cells below.\n",
    "data_path = base_path / 'mice_data_set' / 'data'\n",
    "real_gwas_path = base_path / 'mice_data_set' / 'out'\n",
    "synthetic_gwas_path = base_path / 'mice_data_set' / 'out_synth'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0198fdaf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>snp</th>\n",
       "      <th>p</th>\n",
       "      <th>interest</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>rs29477109</td>\n",
       "      <td>5.052317e-14</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>rs27071351</td>\n",
       "      <td>7.074181e-14</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>rs27024162</td>\n",
       "      <td>7.170582e-14</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>rs49423067</td>\n",
       "      <td>7.198661e-14</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>rs29470802</td>\n",
       "      <td>8.049849e-14</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79640</th>\n",
       "      <td>79641</td>\n",
       "      <td>rs3162358</td>\n",
       "      <td>9.998911e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79641</th>\n",
       "      <td>79642</td>\n",
       "      <td>rs50509099</td>\n",
       "      <td>9.999012e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79642</th>\n",
       "      <td>79643</td>\n",
       "      <td>rs47505090</td>\n",
       "      <td>9.999041e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79643</th>\n",
       "      <td>79644</td>\n",
       "      <td>rs232293770</td>\n",
       "      <td>9.999351e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79644</th>\n",
       "      <td>79645</td>\n",
       "      <td>rs247449322</td>\n",
       "      <td>9.999861e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>79645 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       index          snp             p  interest\n",
       "0          1   rs29477109  5.052317e-14      True\n",
       "1          2   rs27071351  7.074181e-14      True\n",
       "2          3   rs27024162  7.170582e-14      True\n",
       "3          4   rs49423067  7.198661e-14      True\n",
       "4          5   rs29470802  8.049849e-14      True\n",
       "...      ...          ...           ...       ...\n",
       "79640  79641    rs3162358  9.998911e-01     False\n",
       "79641  79642   rs50509099  9.999012e-01     False\n",
       "79642  79643   rs47505090  9.999041e-01     False\n",
       "79643  79644  rs232293770  9.999351e-01     False\n",
       "79644  79645  rs247449322  9.999861e-01     False\n",
       "\n",
       "[79645 rows x 4 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PHENOTYPE = 'abBMD'\n",
    "\n",
    "real_snps = pd.read_csv(real_gwas_path / f'lm_{PHENOTYPE}_1_79646.csv')\n",
    "real_snps = real_snps.rename(columns={real_snps.columns[0]: 'index'})\n",
    "real_snps = real_snps[['index', 'snp', 'p']]\n",
    "real_snps['interest'] = real_snps['p'].apply(lambda x: True if x <= 1e-8 else False)\n",
    "real_snps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "34958109",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>snp</th>\n",
       "      <th>p</th>\n",
       "      <th>interest</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>rs36353660</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>rs29220747</td>\n",
       "      <td>1.398388e-86</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>rs29470086</td>\n",
       "      <td>5.929727e-86</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>rs33102275</td>\n",
       "      <td>2.838852e-85</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>rs252502314</td>\n",
       "      <td>9.043721e-85</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71310</th>\n",
       "      <td>71311</td>\n",
       "      <td>cfw-17-49864534</td>\n",
       "      <td>9.999609e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71311</th>\n",
       "      <td>71312</td>\n",
       "      <td>rs30856414</td>\n",
       "      <td>9.999711e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71312</th>\n",
       "      <td>71313</td>\n",
       "      <td>rs108433568</td>\n",
       "      <td>9.999735e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71313</th>\n",
       "      <td>71314</td>\n",
       "      <td>rs237834328</td>\n",
       "      <td>9.999895e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71314</th>\n",
       "      <td>71315</td>\n",
       "      <td>rs52090420</td>\n",
       "      <td>9.999899e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>71315 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       index              snp             p  interest\n",
       "0          1       rs36353660  0.000000e+00      True\n",
       "1          2       rs29220747  1.398388e-86      True\n",
       "2          3       rs29470086  5.929727e-86      True\n",
       "3          4       rs33102275  2.838852e-85      True\n",
       "4          5      rs252502314  9.043721e-85      True\n",
       "...      ...              ...           ...       ...\n",
       "71310  71311  cfw-17-49864534  9.999609e-01     False\n",
       "71311  71312       rs30856414  9.999711e-01     False\n",
       "71312  71313      rs108433568  9.999735e-01     False\n",
       "71313  71314      rs237834328  9.999895e-01     False\n",
       "71314  71315       rs52090420  9.999899e-01     False\n",
       "\n",
       "[71315 rows x 4 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Be sure to update the name of your lm file appropriately\n",
    "\n",
    "synthetic_snps = pd.read_csv(synthetic_gwas_path / 'lm_batchall_abBMD_1_71316.csv') \n",
    "synthetic_snps = synthetic_snps.rename(columns={synthetic_snps.columns[0]: 'index'})\n",
    "synthetic_snps = synthetic_snps[['index', 'snp', 'p']]\n",
    "synthetic_snps['interest'] = synthetic_snps['p'].apply(lambda x: True if x <= 1e-8 else False)\n",
    "synthetic_snps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e1514a22",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index_synthetic</th>\n",
       "      <th>snp</th>\n",
       "      <th>p_synthetic</th>\n",
       "      <th>interest_synthetic</th>\n",
       "      <th>index_real</th>\n",
       "      <th>p_real</th>\n",
       "      <th>interest_real</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>rs36353660</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>True</td>\n",
       "      <td>77608</td>\n",
       "      <td>9.734443e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>rs29220747</td>\n",
       "      <td>1.398388e-86</td>\n",
       "      <td>True</td>\n",
       "      <td>75217</td>\n",
       "      <td>9.426825e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>rs29470086</td>\n",
       "      <td>5.929727e-86</td>\n",
       "      <td>True</td>\n",
       "      <td>77</td>\n",
       "      <td>1.346918e-12</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>rs33102275</td>\n",
       "      <td>2.838852e-85</td>\n",
       "      <td>True</td>\n",
       "      <td>70949</td>\n",
       "      <td>8.872558e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>rs252502314</td>\n",
       "      <td>9.043721e-85</td>\n",
       "      <td>True</td>\n",
       "      <td>74884</td>\n",
       "      <td>9.383166e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71310</th>\n",
       "      <td>71311</td>\n",
       "      <td>cfw-17-49864534</td>\n",
       "      <td>9.999609e-01</td>\n",
       "      <td>False</td>\n",
       "      <td>58306</td>\n",
       "      <td>7.228062e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71311</th>\n",
       "      <td>71312</td>\n",
       "      <td>rs30856414</td>\n",
       "      <td>9.999711e-01</td>\n",
       "      <td>False</td>\n",
       "      <td>31335</td>\n",
       "      <td>3.776636e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71312</th>\n",
       "      <td>71313</td>\n",
       "      <td>rs108433568</td>\n",
       "      <td>9.999735e-01</td>\n",
       "      <td>False</td>\n",
       "      <td>21151</td>\n",
       "      <td>2.508090e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71313</th>\n",
       "      <td>71314</td>\n",
       "      <td>rs237834328</td>\n",
       "      <td>9.999895e-01</td>\n",
       "      <td>False</td>\n",
       "      <td>13645</td>\n",
       "      <td>1.552650e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71314</th>\n",
       "      <td>71315</td>\n",
       "      <td>rs52090420</td>\n",
       "      <td>9.999899e-01</td>\n",
       "      <td>False</td>\n",
       "      <td>66119</td>\n",
       "      <td>8.233664e-01</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>71315 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       index_synthetic              snp   p_synthetic  interest_synthetic  \\\n",
       "0                    1       rs36353660  0.000000e+00                True   \n",
       "1                    2       rs29220747  1.398388e-86                True   \n",
       "2                    3       rs29470086  5.929727e-86                True   \n",
       "3                    4       rs33102275  2.838852e-85                True   \n",
       "4                    5      rs252502314  9.043721e-85                True   \n",
       "...                ...              ...           ...                 ...   \n",
       "71310            71311  cfw-17-49864534  9.999609e-01               False   \n",
       "71311            71312       rs30856414  9.999711e-01               False   \n",
       "71312            71313      rs108433568  9.999735e-01               False   \n",
       "71313            71314      rs237834328  9.999895e-01               False   \n",
       "71314            71315       rs52090420  9.999899e-01               False   \n",
       "\n",
       "       index_real        p_real  interest_real  \n",
       "0           77608  9.734443e-01          False  \n",
       "1           75217  9.426825e-01          False  \n",
       "2              77  1.346918e-12           True  \n",
       "3           70949  8.872558e-01          False  \n",
       "4           74884  9.383166e-01          False  \n",
       "...           ...           ...            ...  \n",
       "71310       58306  7.228062e-01          False  \n",
       "71311       31335  3.776636e-01          False  \n",
       "71312       21151  2.508090e-01          False  \n",
       "71313       13645  1.552650e-01          False  \n",
       "71314       66119  8.233664e-01          False  \n",
       "\n",
       "[71315 rows x 7 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combined = pd.merge(synthetic_snps, \n",
    "         real_snps, \n",
    "         how='inner', \n",
    "         on=['snp'],\n",
    "         suffixes=['_synthetic', '_real'])\n",
    "combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "86ba40f3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "       False       1.00      0.99      1.00     71122\n",
      "        True       0.32      0.92      0.47       193\n",
      "\n",
      "    accuracy                           0.99     71315\n",
      "   macro avg       0.66      0.96      0.74     71315\n",
      "weighted avg       1.00      0.99      1.00     71315\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score, classification_report, confusion_matrix\n",
    "\n",
    "print(classification_report(combined['interest_real'], combined['interest_synthetic']))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b50257ac",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}