# File: analysis/ml/UnivariateTest.py
"""Univariate logistic-regression feature scoring.

Usage: UnivariateTest.py DATASET SAVE_FILE RANDOM_STATE RARE

Fits one single-feature logistic regression per column of the dataset and
writes a tab-separated ``<save_file stem>.univariate_score`` file with one row
per feature.  A feature's score is the absolute value of its fitted
coefficient when the fit's p-value is below 0.05, and 0 otherwise.
"""
import contextlib
import multiprocessing
import os
import sys


def output_stem(save_file):
    """Return *save_file* truncated at the first '.csv' in its file name.

    Only the final path component is inspected, so a directory whose name
    happens to contain '.csv' no longer truncates the whole path.  The
    directory part is returned exactly as given.
    """
    file_name = os.path.basename(save_file)
    directory = save_file[:len(save_file) - len(file_name)]
    return directory + file_name.split('.csv')[0]


def significance_score(p_value, weight, alpha=0.05):
    """Return *weight* if *p_value* is strictly below *alpha*, else 0.

    A NaN p-value compares false and therefore scores 0.
    """
    return weight if p_value < alpha else 0


def format_score_rows(scores, feature_names, param_string, random_state):
    """Build the tab-separated body of the score file, one line per feature.

    Columns: algorithm, alg-parameters, seed, feature, score.  Returns an
    empty string when there are no scores.
    """
    seed = str(random_state)
    return ''.join(
        '\t'.join(['Univariate LR', param_string, seed,
                   str(name), str(score)]) + '\n'
        for name, score in zip(feature_names, scores))


def main(argv):
    """Score every feature of the dataset named in *argv* and save the result.

    argv[1] dataset (passed to read_file), argv[2] save-file path,
    argv[3] integer random seed, argv[4] Python expression for ``rare``.
    """
    if len(argv) < 5:
        sys.exit('usage: {} DATASET SAVE_FILE RANDOM_STATE RARE'.format(
            argv[0] if argv else 'UnivariateTest.py'))

    # Imported here, not at module level, to keep the original ordering:
    # they run only in the main process, after the start method is set.
    # f_classif and pdb are not referenced below; kept as in the original.
    from sklearn.feature_selection import f_classif
    # LogisticReg is used in place of sklearn's LogisticRegression because
    # the significance filter below reads its p_values attribute.
    from p_values_for_logreg import LogisticReg
    from sklearn.preprocessing import StandardScaler
    import pdb
    import numpy as np
    from read_file import read_file

    dataset = argv[1]
    save_file = argv[2]
    random_state = int(argv[3])
    # NOTE(security): eval executes arbitrary code from the command line.
    # If callers only ever pass literals such as True/False, switch to
    # ast.literal_eval -- confirm against the scripts that launch this.
    rare = eval(argv[4])
    longitudinal = False

    features, labels, _pt_ids, feature_names, _zfile = read_file(
        dataset, longitudinal, rare)

    # scale the data
    features = StandardScaler().fit_transform(features)

    n_features = features.shape[1]
    scores = []
    est = None  # stays None when the dataset has no feature columns
    for i in range(n_features):
        print('fitting feature', feature_names[i], i, 'of', n_features)
        # C is presumably forwarded to sklearn's LogisticRegression, where a
        # large value means weak regularization -- confirm in
        # p_values_for_logreg.
        est = LogisticReg(solver='saga',
                          C=1000,
                          random_state=random_state)
        est.fit(features[:, i].reshape(-1, 1), labels)
        p_value = est.p_values[0]
        weight = np.abs(est.model.coef_.flatten()[0])
        print('pvalue:', p_value, 'weight:', weight)
        scores.append(significance_score(p_value, weight))

    # The estimator settings are the same for every fit, so the last
    # estimator's parameters describe the whole run.
    if est is None:
        param_string = ''
    else:
        param_string = ','.join('{}={}'.format(p, v)
                                for p, v in est.model.get_params().items())

    # algorithm seed    feature score
    out_text = format_score_rows(scores, feature_names, param_string,
                                 random_state)

    stem = output_stem(save_file)
    # Remove any existing .imp_score / .roc files that share this stem.
    for stale_ext in ('.imp_score', '.roc'):
        with contextlib.suppress(FileNotFoundError):
            os.remove(stem + stale_ext)

    # save file
    ext = '.univariate_score'
    with open(stem + ext, 'w') as out:
        out.write('algorithm\talg-parameters\tseed\tfeature\tscore\n')
        out.write(out_text)


if __name__ == '__main__':
    # Must be called before anything else touches multiprocessing.
    multiprocessing.set_start_method('forkserver')
    main(sys.argv)