|
a |
|
b/utils.py |
|
|
1 |
''' |
|
|
2 |
Contains the utility functions |
|
|
3 |
''' |
|
|
4 |
|
|
|
5 |
import numpy as np |
|
|
6 |
import config as config |
|
|
7 |
|
|
|
8 |
import sys |
|
|
9 |
sys.path.append('../') |
|
|
10 |
|
|
|
11 |
from sksurv.linear_model import CoxnetSurvivalAnalysis |
|
|
12 |
from loader import Dataset |
|
|
13 |
|
|
|
14 |
# Names of the models this project knows about, in their original order.
available_models = [
    'genomics',
    'pyradiomics',
    'densenet',
    'intermediate_gp',
    'intermediate_gd',
    'late_gp',
    'late_gd',
]
|
|
18 |
|
|
|
19 |
def run_coxnet(l1_ratio, n_alphas, x_train, y_train, x_test, y_test):
    '''
    Fit a CoxnetSurvivalAnalysis model on the training data and evaluate it
    on the test data.

    l1_ratio and n_alphas are forwarded unchanged to the estimator's
    constructor.

    Returns a tuple (outputs, score): the result of predict(x_test) and the
    result of score(x_test, y_test) of the fitted estimator.
    '''
    model = CoxnetSurvivalAnalysis(l1_ratio=l1_ratio, n_alphas=n_alphas)
    model.fit(x_train, y_train)

    # Tuple elements are evaluated left to right: predict first, then score.
    return model.predict(x_test), model.score(x_test, y_test)
|
|
26 |
|
|
|
27 |
def _read_split_file(path):
    '''
    Parse one tab-separated split file into three parallel lists.

    Every non-blank line must contain exactly three tab-separated fields.
    The first field is kept as a stripped string, the second and third are
    parsed as ints. A malformed line raises ValueError.
    '''
    names, second_column, third_column = [], [], []
    with open(path, 'r') as curr_file:
        for row in curr_file:
            # A blank line (e.g. an extra newline at the end of the file)
            # carries no record; skip it instead of failing on the unpack.
            if not row.strip():
                continue
            a, b, c = row.split('\t')
            names.append(a.strip())
            second_column.append(int(b.strip()))
            third_column.append(int(c.strip()))
    return names, second_column, third_column


def get_data(split=0, location=config.csv_location, mode='valid'):
    '''
    Read the train file and one evaluation file of a given split.

    The files read are  location + 'train_<split>.csv'  and
    location + '<mode>_<split>.csv'. `location` is used as a plain string
    prefix, so it must already end with a path separator if it is a
    directory.

    use mode = 'test' for testing

    Returns (X_train, X_test, y_train, y_test, y_train2, y_test2) where the
    X_* lists hold the first column as strings, the y_* lists the second
    column as ints and the y_*2 lists the third column as ints. The
    "test" entries come from the file selected by `mode`.
    '''
    print('Loading data for mode ' + mode + ' from location ' + location)

    X_train, y_train, y_train2 = _read_split_file(
        location + 'train_' + str(split) + '.csv')
    X_test, y_test, y_test2 = _read_split_file(
        location + mode + '_' + str(split) + '.csv')

    return X_train, X_test, y_train, y_test, y_train2, y_test2
|
|
50 |
|
|
|
51 |
def get_structured_array(data_bool, data_value):
    '''
    Pair up two equally indexed sequences into a numpy structured array.

    Element i of the result is (data_bool[i], data_value[i]), stored with
    dtype 'bool, i8' (fields 'f0' and 'f1'). The length of the result is
    len(data_bool).
    '''
    pairs = [
        (data_bool[position], data_value[position])
        for position in range(len(data_bool))
    ]
    return np.array(pairs, dtype='bool, i8')
|
|
61 |
|
|
|
62 |
|
|
|
63 |
class DataLoader(object):
    '''
    Loads the labels and the genomics, pyradiomics and densenet features of
    one fold and stores them as attributes.

    After construction the instance exposes, for each split in
    train / valid / test:
        y_<split>_bool, y_<split>_value : label columns returned by get_data
                                          (third and second file column)
        gen_<split>   : genomics features restricted to selected genes
        pyrad_<split> : pyradiomics features
        dense_<split> : densenet features
    together with train_num, valid_num and test_num (number of samples).

    mode = 'cpu' stores numpy data, mode = 'gpu' stores float torch
    Variables built with torch.from_numpy (no device transfer happens
    here). Any other mode raises NotImplementedError.
    '''

    def __init__(self, fold=0, num_genes=500, mode='cpu'):
        self.fold = fold
        self.num_genes = num_genes
        self.load_data(mode)

    @staticmethod
    def _to_mode(array, mode):
        '''Return `array` as is for 'cpu', or as a float Variable for 'gpu'.'''
        if mode == 'gpu':
            # Imported lazily so that cpu-only use does not require torch.
            # `torch` itself must be imported too: torch.from_numpy is used.
            import torch
            from torch.autograd import Variable
            return Variable(torch.from_numpy(array)).float()
        return array

    @staticmethod
    def _scale_by_max(train, valid, test):
        '''
        Divide all three splits by the single largest value found in the
        train and valid splits and return the scaled triple.
        '''
        # Only train and valid define the scale; test is rescaled with it
        # but never influences it.
        # NOTE(review): this used to concatenate (train, valid, train), which
        # yields the same maximum. If test was meant to be included, change
        # it deliberately - that would alter the normalisation.
        scale = np.max(np.concatenate((train, valid), axis=0))
        return train / scale, valid / scale, test / scale

    def load_data(self, mode):
        '''
        Read the fold's csv files and feature tables and populate the
        attributes described in the class docstring.

        Raises NotImplementedError if mode is neither 'cpu' nor 'gpu'.
        '''
        # Reject an unsupported mode before doing any file I/O.
        if mode not in ('cpu', 'gpu'):
            raise NotImplementedError(
                "mode must be 'cpu' or 'gpu', got %r" % (mode,))

        X_train_list, X_valid_list, y_value_train, y_value_valid, \
            y_train, y_valid = get_data(self.fold, config.csv_location, 'valid')
        _, X_test_list, _, y_value_test, _, y_test = \
            get_data(self.fold, config.csv_location, 'test')

        self.train_num = len(X_train_list)
        self.valid_num = len(X_valid_list)
        self.test_num = len(X_test_list)

        # labels
        self.y_train_bool = self._to_mode(np.array(y_train), mode)
        self.y_valid_bool = self._to_mode(np.array(y_valid), mode)
        self.y_test_bool = self._to_mode(np.array(y_test), mode)
        self.y_train_value = self._to_mode(np.array(y_value_train), mode)
        self.y_valid_value = self._to_mode(np.array(y_value_valid), mode)
        self.y_test_value = self._to_mode(np.array(y_value_test), mode)

        NRG = Dataset(config)

        # genomics: get_genomics returns (features, gene list); only the
        # features are used here.
        X_gen_train, _ = NRG.get_genomics(X_train_list)
        X_gen_valid, _ = NRG.get_genomics(X_valid_list)
        X_gen_test, _ = NRG.get_genomics(X_test_list)

        # Keep the num_genes columns with the highest standard deviation on
        # the training split; the same columns are used for every split.
        # Note: num_genes == 0 keeps every column, because [-0:] is [0:].
        X_gen_train = np.array(X_gen_train)
        all_std = np.std(X_gen_train, axis=0)
        selected = np.argsort(all_std)[-self.num_genes:]
        X_gen_train = X_gen_train[:, selected]
        X_gen_valid = np.array(X_gen_valid)[:, selected]
        X_gen_test = np.array(X_gen_test)[:, selected]

        X_gen_train, X_gen_valid, X_gen_test = self._scale_by_max(
            X_gen_train, X_gen_valid, X_gen_test)
        self.gen_train = self._to_mode(X_gen_train, mode)
        self.gen_valid = self._to_mode(X_gen_valid, mode)
        self.gen_test = self._to_mode(X_gen_test, mode)

        # pyradiomics
        X_pyrad_train = NRG.get_pyradiomics(X_train_list)
        X_pyrad_valid = NRG.get_pyradiomics(X_valid_list)
        X_pyrad_test = NRG.get_pyradiomics(X_test_list)

        X_pyrad_train, X_pyrad_valid, X_pyrad_test = self._scale_by_max(
            X_pyrad_train, X_pyrad_valid, X_pyrad_test)
        self.pyrad_train = self._to_mode(X_pyrad_train, mode)
        self.pyrad_valid = self._to_mode(X_pyrad_valid, mode)
        self.pyrad_test = self._to_mode(X_pyrad_test, mode)

        # densenet
        X_dense_train = NRG.get_densenet_features(X_train_list)
        X_dense_valid = NRG.get_densenet_features(X_valid_list)
        X_dense_test = NRG.get_densenet_features(X_test_list)

        X_dense_train, X_dense_valid, X_dense_test = self._scale_by_max(
            X_dense_train, X_dense_valid, X_dense_test)
        self.dense_train = self._to_mode(X_dense_train, mode)
        self.dense_valid = self._to_mode(X_dense_valid, mode)
        self.dense_test = self._to_mode(X_dense_test, mode)
|
|
174 |
|
|
|
175 |
|
|
|
176 |
|