# Compare Associations
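* This notebook compares the final GWAS p-values for the full synthetic genome/phenome datasets to those in the original genome/phenome datasets. Each SNP is flagged as significant when its p-value is at most 1e-8; treating the flags from the original data as the reference, the notebook then computes the precision, recall and F1 values of the flags from the synthetic data.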

In [1]:
import os
import pathlib
import pandas as pd

# Derive the project root from the current working directory by dropping every
# path component named exactly "synthetics". Filtering whole components (rather
# than str.replace("/synthetics", "")) leaves similarly named directories such
# as "synthetics_v2" intact and does not depend on "/" being the separator.
base_path = pathlib.Path(*(part for part in pathlib.Path.cwd().parts if part != 'synthetics'))
data_path = base_path / 'mice_data_set' / 'data'
# Directory the GWAS results for the real data are read from.
real_gwas_path = base_path / 'mice_data_set' / 'out'
# Directory the GWAS results for the synthetic data are read from.
synthetic_gwas_path = base_path / 'mice_data_set' / 'out_synth'


In [6]:
PHENOTYPE = 'abBMD'

# Load the GWAS results computed on the real data for the chosen phenotype.
real_snps = pd.read_csv(real_gwas_path / f'lm_{PHENOTYPE}_1_79646.csv')
# The first CSV column is renamed to 'index' (presumably a rank or row number
# written by the GWAS step — TODO confirm).
real_snps = real_snps.rename(columns={real_snps.columns[0]: 'index'})
# Keep only the columns used downstream; copy so the new column below is added
# to an independent frame rather than to a slice of the loaded one.
real_snps = real_snps[['index', 'snp', 'p']].copy()
# Flag SNPs whose p-value is at or below 1e-8. The vectorized comparison yields
# the same booleans as a row-wise apply (NaN p-values compare as False).
real_snps['interest'] = real_snps['p'] <= 1e-8
real_snps

Unnamed: 0,index,snp,p,interest
0,1,rs29477109,5.052317e-14,True
1,2,rs27071351,7.074181e-14,True
2,3,rs27024162,7.170582e-14,True
3,4,rs49423067,7.198661e-14,True
4,5,rs29470802,8.049849e-14,True
...,...,...,...,...
79640,79641,rs3162358,9.998911e-01,False
79641,79642,rs50509099,9.999012e-01,False
79642,79643,rs47505090,9.999041e-01,False
79643,79644,rs232293770,9.999351e-01,False


In [7]:
# Be sure to update the name of your lm file appropriately

# Load the GWAS results computed on the synthetic data. The file name is built
# from PHENOTYPE so that it always refers to the same phenotype as the real
# results, instead of a separately hard-coded phenotype name.
synthetic_snps = pd.read_csv(synthetic_gwas_path / f'lm_batchall_{PHENOTYPE}_1_71316.csv')
# The first CSV column is renamed to 'index' (presumably a rank or row number
# written by the GWAS step — TODO confirm).
synthetic_snps = synthetic_snps.rename(columns={synthetic_snps.columns[0]: 'index'})
# Keep only the columns used downstream; copy so the new column below is added
# to an independent frame rather than to a slice of the loaded one.
synthetic_snps = synthetic_snps[['index', 'snp', 'p']].copy()
# Flag SNPs whose p-value is at or below 1e-8. The vectorized comparison yields
# the same booleans as a row-wise apply (NaN p-values compare as False).
synthetic_snps['interest'] = synthetic_snps['p'] <= 1e-8
synthetic_snps

Unnamed: 0,index,snp,p,interest
0,1,rs36353660,0.000000e+00,True
1,2,rs29220747,1.398388e-86,True
2,3,rs29470086,5.929727e-86,True
3,4,rs33102275,2.838852e-85,True
4,5,rs252502314,9.043721e-85,True
...,...,...,...,...
71310,71311,cfw-17-49864534,9.999609e-01,False
71311,71312,rs30856414,9.999711e-01,False
71312,71313,rs108433568,9.999735e-01,False
71313,71314,rs237834328,9.999895e-01,False


In [8]:
# Pair each SNP's synthetic-data result with its real-data result.
# This is an inner join on the SNP identifier, so SNPs that appear in only one
# of the two result sets are dropped and do not contribute to later metrics.
combined = synthetic_snps.merge(
    real_snps,
    on='snp',
    how='inner',
    suffixes=('_synthetic', '_real'),
)
combined

Unnamed: 0,index_synthetic,snp,p_synthetic,interest_synthetic,index_real,p_real,interest_real
0,1,rs36353660,0.000000e+00,True,77608,9.734443e-01,False
1,2,rs29220747,1.398388e-86,True,75217,9.426825e-01,False
2,3,rs29470086,5.929727e-86,True,77,1.346918e-12,True
3,4,rs33102275,2.838852e-85,True,70949,8.872558e-01,False
4,5,rs252502314,9.043721e-85,True,74884,9.383166e-01,False
...,...,...,...,...,...,...,...
71310,71311,cfw-17-49864534,9.999609e-01,False,58306,7.228062e-01,False
71311,71312,rs30856414,9.999711e-01,False,31335,3.776636e-01,False
71312,71313,rs108433568,9.999735e-01,False,21151,2.508090e-01,False
71313,71314,rs237834328,9.999895e-01,False,13645,1.552650e-01,False


In [9]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# The real-data significance flags are the reference labels; the synthetic-data
# flags are scored against them.
print(
    classification_report(
        y_true=combined['interest_real'],
        y_pred=combined['interest_synthetic'],
    )
)


              precision    recall  f1-score   support

       False       1.00      0.99      1.00     71122
        True       0.32      0.92      0.47       193

    accuracy                           0.99     71315
   macro avg       0.66      0.96      0.74     71315
weighted avg       1.00      0.99      1.00     71315
