interpret_ehr / Git / Diff of /analysis/ml/UnivariateTest.py

Models:

philipB/

interpret_ehr

Downloads: 1

Diff of /analysis/ml/UnivariateTest.py [000000] .. [23d48c]

Switch to unified view

 b/analysis/ml/UnivariateTest.py
+import multiprocessing
+if __name__ == '__main__':
+    multiprocessing.set_start_method('forkserver')
+    import sys
+    from sklearn.feature_selection import f_classif
+    # from sklearn.linear_model import LogisticRegression
+    from p_values_for_logreg import LogisticReg
+    from sklearn.preprocessing import StandardScaler
+    import pdb
+    import numpy as np
+    from read_file import read_file
+    dataset = sys.argv[1]
+    save_file = sys.argv[2]
+    random_state = int(sys.argv[3])
+    rare = eval(sys.argv[4])
+    longitudinal=False
+    features, labels, pt_ids, feature_names, zfile = read_file(dataset,longitudinal,rare)
+    scores=[]
+    # scale the data
+    features = StandardScaler().fit_transform(features)
+    # create the classifier
+    for i in np.arange(features.shape[1]):
+        print('fitting feature',feature_names[i],i,'of',features.shape[1])
+        est = LogisticReg(solver='saga',
+                                 C = 1000,
+                                 random_state=random_state)
+        est.fit(features[:,i].reshape(-1,1),labels)
+        print('pvalue:',est.p_values[0],
+              'weight:',np.abs(est.model.coef_.flatten()[0]))
+        if est.p_values[0] < 0.05:
+            scores.append(np.abs(est.model.coef_.flatten()[0]))
+        else:
+            scores.append(0)
+    # save file
+    out_text=''
+    param_string = ','.join(['{}={}'.format(p, v)
+                             for p,v in est.model.get_params().items()])
+    # algorithm seed    feature score
+    for i,c in enumerate(scores):
+        out_text += '\t'.join(['Univariate LR',
+                               param_string,
+                               str(random_state),
+                               feature_names[i],
+                               str(c)])+'\n'
+    import os
+    if os.path.exists(save_file.split('.csv')[0]+'.imp_score'):
+        os.remove(save_file.split('.csv')[0]+'.imp_score')
+    if os.path.exists(save_file.split('.csv')[0]+'.roc'):
+        os.remove(save_file.split('.csv')[0]+'.roc')
+    ext = '.univariate_score'
+    with open(save_file.split('.csv')[0] + ext,'w') as out:
+        out.write('algorithm\talg-parameters\tseed\tfeature\tscore\n')
+        out.write(out_text)