--- a
+++ b/synthetics/05_compare_associations.ipynb
@@ -0,0 +1,578 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "3a93d4c0",
+   "metadata": {},
+   "source": [
+    "# Compare Associations\n",
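+    "* This notebook compares the final GWAS p-values from the full synthetic genome/phenome dataset with those from the original genome/phenome dataset. A SNP is flagged as significant when its p-value is at most 1e-8, and precision, recall, and F1 are computed by treating the flags from the original data as ground truth and the flags from the synthetic data as predictions."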
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "97ac91d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pathlib\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Locate the repository root. The notebook is expected to run from <repo>/synthetics,\n",
+    "# so step up one level only when the working directory is that folder. Comparing the\n",
+    "# final path component avoids stripping '/synthetics' out of unrelated parent folders\n",
+    "# and does not depend on the OS path separator.\n",
+    "base_path = pathlib.Path.cwd()\n",
+    "if base_path.name == 'synthetics':\n",
+    "    base_path = base_path.parent\n",
+    "\n",
+    "# Input data and GWAS result folders, all relative to the repository root.\n",
+    "data_path = base_path / 'mice_data_set' / 'data'\n",
+    "real_gwas_path = base_path / 'mice_data_set' / 'out'\n",
+    "synthetic_gwas_path = base_path / 'mice_data_set' / 'out_synth'\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "0198fdaf",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>index</th>\n",
+       "      <th>snp</th>\n",
+       "      <th>p</th>\n",
+       "      <th>interest</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>rs29477109</td>\n",
+       "      <td>5.052317e-14</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>rs27071351</td>\n",
+       "      <td>7.074181e-14</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>rs27024162</td>\n",
+       "      <td>7.170582e-14</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>rs49423067</td>\n",
+       "      <td>7.198661e-14</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>rs29470802</td>\n",
+       "      <td>8.049849e-14</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>79640</th>\n",
+       "      <td>79641</td>\n",
+       "      <td>rs3162358</td>\n",
+       "      <td>9.998911e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>79641</th>\n",
+       "      <td>79642</td>\n",
+       "      <td>rs50509099</td>\n",
+       "      <td>9.999012e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>79642</th>\n",
+       "      <td>79643</td>\n",
+       "      <td>rs47505090</td>\n",
+       "      <td>9.999041e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>79643</th>\n",
+       "      <td>79644</td>\n",
+       "      <td>rs232293770</td>\n",
+       "      <td>9.999351e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>79644</th>\n",
+       "      <td>79645</td>\n",
+       "      <td>rs247449322</td>\n",
+       "      <td>9.999861e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>79645 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       index          snp             p  interest\n",
+       "0          1   rs29477109  5.052317e-14      True\n",
+       "1          2   rs27071351  7.074181e-14      True\n",
+       "2          3   rs27024162  7.170582e-14      True\n",
+       "3          4   rs49423067  7.198661e-14      True\n",
+       "4          5   rs29470802  8.049849e-14      True\n",
+       "...      ...          ...           ...       ...\n",
+       "79640  79641    rs3162358  9.998911e-01     False\n",
+       "79641  79642   rs50509099  9.999012e-01     False\n",
+       "79642  79643   rs47505090  9.999041e-01     False\n",
+       "79643  79644  rs232293770  9.999351e-01     False\n",
+       "79644  79645  rs247449322  9.999861e-01     False\n",
+       "\n",
+       "[79645 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "PHENOTYPE = 'abBMD'\n",
+    "\n",
+    "# Load the GWAS results for the chosen phenotype from the real-data output folder.\n",
+    "real_snps = pd.read_csv(real_gwas_path / f'lm_{PHENOTYPE}_1_79646.csv')\n",
+    "real_snps = (\n",
+    "    real_snps\n",
+    "    # The first CSV column is renamed to 'index' so it can be selected by name.\n",
+    "    .rename(columns={real_snps.columns[0]: 'index'})\n",
+    "    .loc[:, ['index', 'snp', 'p']]\n",
+    "    # Flag SNPs whose p-value is at or below the 1e-8 cutoff (keep this in sync with the\n",
+    "    # cutoff applied to the synthetic results). The comparison is vectorized, and .assign\n",
+    "    # returns a new frame, so no column is written into a slice of the loaded data.\n",
+    "    .assign(interest=lambda df: df['p'] <= 1e-8)\n",
+    ")\n",
+    "real_snps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "34958109",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>index</th>\n",
+       "      <th>snp</th>\n",
+       "      <th>p</th>\n",
+       "      <th>interest</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>rs36353660</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>rs29220747</td>\n",
+       "      <td>1.398388e-86</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>rs29470086</td>\n",
+       "      <td>5.929727e-86</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>rs33102275</td>\n",
+       "      <td>2.838852e-85</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>rs252502314</td>\n",
+       "      <td>9.043721e-85</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71310</th>\n",
+       "      <td>71311</td>\n",
+       "      <td>cfw-17-49864534</td>\n",
+       "      <td>9.999609e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71311</th>\n",
+       "      <td>71312</td>\n",
+       "      <td>rs30856414</td>\n",
+       "      <td>9.999711e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71312</th>\n",
+       "      <td>71313</td>\n",
+       "      <td>rs108433568</td>\n",
+       "      <td>9.999735e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71313</th>\n",
+       "      <td>71314</td>\n",
+       "      <td>rs237834328</td>\n",
+       "      <td>9.999895e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71314</th>\n",
+       "      <td>71315</td>\n",
+       "      <td>rs52090420</td>\n",
+       "      <td>9.999899e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>71315 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       index              snp             p  interest\n",
+       "0          1       rs36353660  0.000000e+00      True\n",
+       "1          2       rs29220747  1.398388e-86      True\n",
+       "2          3       rs29470086  5.929727e-86      True\n",
+       "3          4       rs33102275  2.838852e-85      True\n",
+       "4          5      rs252502314  9.043721e-85      True\n",
+       "...      ...              ...           ...       ...\n",
+       "71310  71311  cfw-17-49864534  9.999609e-01     False\n",
+       "71311  71312       rs30856414  9.999711e-01     False\n",
+       "71312  71313      rs108433568  9.999735e-01     False\n",
+       "71313  71314      rs237834328  9.999895e-01     False\n",
+       "71314  71315       rs52090420  9.999899e-01     False\n",
+       "\n",
+       "[71315 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Be sure to update the name of your lm file appropriately.\n",
+    "# The phenotype part of the name comes from PHENOTYPE (set when the real results are\n",
+    "# loaded) so both tables always describe the same trait; only the batch label and the\n",
+    "# SNP range in the file name may need editing.\n",
+    "\n",
+    "synthetic_snps = pd.read_csv(synthetic_gwas_path / f'lm_batchall_{PHENOTYPE}_1_71316.csv')\n",
+    "synthetic_snps = (\n",
+    "    synthetic_snps\n",
+    "    # The first CSV column is renamed to 'index' so it can be selected by name.\n",
+    "    .rename(columns={synthetic_snps.columns[0]: 'index'})\n",
+    "    .loc[:, ['index', 'snp', 'p']]\n",
+    "    # Flag SNPs whose p-value is at or below the 1e-8 cutoff (keep this in sync with the\n",
+    "    # cutoff applied to the real results). The comparison is vectorized, and .assign\n",
+    "    # returns a new frame, so no column is written into a slice of the loaded data.\n",
+    "    .assign(interest=lambda df: df['p'] <= 1e-8)\n",
+    ")\n",
+    "synthetic_snps"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "e1514a22",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>index_synthetic</th>\n",
+       "      <th>snp</th>\n",
+       "      <th>p_synthetic</th>\n",
+       "      <th>interest_synthetic</th>\n",
+       "      <th>index_real</th>\n",
+       "      <th>p_real</th>\n",
+       "      <th>interest_real</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>rs36353660</td>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>True</td>\n",
+       "      <td>77608</td>\n",
+       "      <td>9.734443e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>rs29220747</td>\n",
+       "      <td>1.398388e-86</td>\n",
+       "      <td>True</td>\n",
+       "      <td>75217</td>\n",
+       "      <td>9.426825e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>rs29470086</td>\n",
+       "      <td>5.929727e-86</td>\n",
+       "      <td>True</td>\n",
+       "      <td>77</td>\n",
+       "      <td>1.346918e-12</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>rs33102275</td>\n",
+       "      <td>2.838852e-85</td>\n",
+       "      <td>True</td>\n",
+       "      <td>70949</td>\n",
+       "      <td>8.872558e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>rs252502314</td>\n",
+       "      <td>9.043721e-85</td>\n",
+       "      <td>True</td>\n",
+       "      <td>74884</td>\n",
+       "      <td>9.383166e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71310</th>\n",
+       "      <td>71311</td>\n",
+       "      <td>cfw-17-49864534</td>\n",
+       "      <td>9.999609e-01</td>\n",
+       "      <td>False</td>\n",
+       "      <td>58306</td>\n",
+       "      <td>7.228062e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71311</th>\n",
+       "      <td>71312</td>\n",
+       "      <td>rs30856414</td>\n",
+       "      <td>9.999711e-01</td>\n",
+       "      <td>False</td>\n",
+       "      <td>31335</td>\n",
+       "      <td>3.776636e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71312</th>\n",
+       "      <td>71313</td>\n",
+       "      <td>rs108433568</td>\n",
+       "      <td>9.999735e-01</td>\n",
+       "      <td>False</td>\n",
+       "      <td>21151</td>\n",
+       "      <td>2.508090e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71313</th>\n",
+       "      <td>71314</td>\n",
+       "      <td>rs237834328</td>\n",
+       "      <td>9.999895e-01</td>\n",
+       "      <td>False</td>\n",
+       "      <td>13645</td>\n",
+       "      <td>1.552650e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71314</th>\n",
+       "      <td>71315</td>\n",
+       "      <td>rs52090420</td>\n",
+       "      <td>9.999899e-01</td>\n",
+       "      <td>False</td>\n",
+       "      <td>66119</td>\n",
+       "      <td>8.233664e-01</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>71315 rows × 7 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       index_synthetic              snp   p_synthetic  interest_synthetic  \\\n",
+       "0                    1       rs36353660  0.000000e+00                True   \n",
+       "1                    2       rs29220747  1.398388e-86                True   \n",
+       "2                    3       rs29470086  5.929727e-86                True   \n",
+       "3                    4       rs33102275  2.838852e-85                True   \n",
+       "4                    5      rs252502314  9.043721e-85                True   \n",
+       "...                ...              ...           ...                 ...   \n",
+       "71310            71311  cfw-17-49864534  9.999609e-01               False   \n",
+       "71311            71312       rs30856414  9.999711e-01               False   \n",
+       "71312            71313      rs108433568  9.999735e-01               False   \n",
+       "71313            71314      rs237834328  9.999895e-01               False   \n",
+       "71314            71315       rs52090420  9.999899e-01               False   \n",
+       "\n",
+       "       index_real        p_real  interest_real  \n",
+       "0           77608  9.734443e-01          False  \n",
+       "1           75217  9.426825e-01          False  \n",
+       "2              77  1.346918e-12           True  \n",
+       "3           70949  8.872558e-01          False  \n",
+       "4           74884  9.383166e-01          False  \n",
+       "...           ...           ...            ...  \n",
+       "71310       58306  7.228062e-01          False  \n",
+       "71311       31335  3.776636e-01          False  \n",
+       "71312       21151  2.508090e-01          False  \n",
+       "71313       13645  1.552650e-01          False  \n",
+       "71314       66119  8.233664e-01          False  \n",
+       "\n",
+       "[71315 rows x 7 columns]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Inner join on the SNP identifier: only SNPs present in both result tables are kept.\n",
+    "# Overlapping column names get a '_synthetic' (left table) or '_real' (right table) suffix.\n",
+    "combined = synthetic_snps.merge(\n",
+    "    real_snps,\n",
+    "    how='inner',\n",
+    "    on='snp',\n",
+    "    suffixes=('_synthetic', '_real'),\n",
+    ")\n",
+    "combined"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "86ba40f3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "       False       1.00      0.99      1.00     71122\n",
+      "        True       0.32      0.92      0.47       193\n",
+      "\n",
+      "    accuracy                           0.99     71315\n",
+      "   macro avg       0.66      0.96      0.74     71315\n",
+      "weighted avg       1.00      0.99      1.00     71315\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import f1_score, classification_report, confusion_matrix\n",
+    "\n",
+    "# The flags from the real GWAS are the ground truth; the flags from the synthetic GWAS\n",
+    "# are scored against them as predictions.\n",
+    "print(\n",
+    "    classification_report(\n",
+    "        y_true=combined['interest_real'],\n",
+    "        y_pred=combined['interest_synthetic'],\n",
+    "    )\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b50257ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}