"""Univariate logistic-regression feature scoring (analysis/ml/UnivariateTest.py).

Usage::

    python UnivariateTest.py DATASET SAVE_FILE RANDOM_STATE RARE

The dataset is loaded through the project's ``read_file`` helper, every
feature column is standardized, and a separate single-feature logistic
regression is fitted per column.  A feature's score is the absolute value of
its fitted coefficient when its p-value is below 0.05, and 0 otherwise.  The
scores are written as a tab-separated table to ``<stem>.univariate_score``,
where ``<stem>`` is SAVE_FILE truncated at the first ``.csv``.
"""
import contextlib
import multiprocessing
import os
import pdb  # noqa: F401  (not used by the script; kept for interactive debugging)
import sys

# Features whose p-value is not below this threshold are scored 0.
P_VALUE_THRESHOLD = 0.05

# Extension of the score table written by this script.
SCORE_EXTENSION = '.univariate_score'

# Sibling files (same stem) that are deleted before the score table is
# written.  Presumably stale outputs of other analyses that share the same
# SAVE_FILE -- TODO confirm against the calling pipeline.
STALE_EXTENSIONS = ('.imp_score', '.roc')

SCORE_HEADER = 'algorithm\talg-parameters\tseed\tfeature\tscore\n'


def output_stem(save_file):
    """Return *save_file* truncated at its first ``.csv``.

    This is ``save_file.split('.csv')[0]``: the path is cut at the FIRST
    occurrence of ``.csv`` wherever it appears, and is returned unchanged
    when it contains no ``.csv``.
    """
    return save_file.split('.csv')[0]


def remove_if_exists(path):
    """Delete *path*, doing nothing when the file is already absent."""
    # Attempt the removal directly instead of testing os.path.exists() first,
    # so there is no window between the check and the delete.
    with contextlib.suppress(FileNotFoundError):
        os.remove(path)


def format_score_rows(param_string, random_state, feature_names, scores):
    """Build the tab-separated body of the score table.

    One newline-terminated row is produced per (feature name, score) pair
    with the columns: algorithm, alg-parameters, seed, feature, score.
    Returns an empty string when there are no scores.
    """
    seed = str(random_state)
    return ''.join(
        '\t'.join(['Univariate LR', param_string, seed, name, str(score)]) + '\n'
        for name, score in zip(feature_names, scores)
    )


def main(argv):
    """Fit one logistic regression per feature and write the score table.

    *argv* follows the ``sys.argv`` layout: ``argv[1]`` is the dataset,
    ``argv[2]`` the save file, ``argv[3]`` the integer random seed and
    ``argv[4]`` a Python expression for the ``rare`` argument of
    ``read_file``.
    """
    # Third-party and project imports stay inside main() so that they run
    # only after the multiprocessing start method has been set, as in the
    # original statement order.
    from sklearn.feature_selection import f_classif  # noqa: F401  (unused; kept)
    # from sklearn.linear_model import LogisticRegression
    from p_values_for_logreg import LogisticReg
    from sklearn.preprocessing import StandardScaler
    import numpy as np
    from read_file import read_file

    dataset = argv[1]
    save_file = argv[2]
    random_state = int(argv[3])
    # NOTE(review): eval() executes arbitrary code taken from the command
    # line.  If RARE is always a plain literal (e.g. True/False),
    # ast.literal_eval would be a safer drop-in -- confirm with the callers
    # before changing it.
    rare = eval(argv[4])
    longitudinal = False

    features, labels, _pt_ids, feature_names, _zfile = read_file(
        dataset, longitudinal, rare)

    # scale the data
    features = StandardScaler().fit_transform(features)

    n_features = features.shape[1]
    scores = []
    est = None  # last fitted estimator; stays None if there are no features
    for i in range(n_features):
        print('fitting feature', feature_names[i], i, 'of', n_features)
        # create the classifier
        est = LogisticReg(solver='saga',
                          C=1000,
                          random_state=random_state)
        # Fit on this single column, reshaped to a one-column 2-D array.
        est.fit(features[:, i].reshape(-1, 1), labels)
        p_value = est.p_values[0]
        weight = np.abs(est.model.coef_.flatten()[0])
        print('pvalue:', p_value,
              'weight:', weight)
        # Keep the coefficient magnitude only when the p-value passes.
        scores.append(weight if p_value < P_VALUE_THRESHOLD else 0)

    # The parameter column is taken from the last fitted model; every model
    # is constructed with the same arguments.
    if est is None:
        param_string = ''
    else:
        param_string = ','.join('{}={}'.format(p, v)
                                for p, v in est.model.get_params().items())

    # algorithm seed feature score
    out_text = format_score_rows(param_string, random_state,
                                 feature_names, scores)

    # save file
    stem = output_stem(save_file)
    for stale_ext in STALE_EXTENSIONS:
        remove_if_exists(stem + stale_ext)

    with open(stem + SCORE_EXTENSION, 'w') as out:
        out.write(SCORE_HEADER)
        out.write(out_text)


# NOTE(review): the indentation of this script was lost in the copy under
# review; all statements following the guard are assumed to belong to it.
if __name__ == '__main__':
    multiprocessing.set_start_method('forkserver')
    main(sys.argv)