main.lua
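
-- Train a CNN, RNN, or CNN-RNN model on a transcription factor (TF) binding
-- dataset, tracking train/test AUROC each epoch and saving the best model.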

require 'torch'
require 'nn'
require 'optim'
require 'xlua' -- for xlua.progress (used in the train/test loops below)
require 'model'
include('util/auRoc.lua')
require 'lfs'

local cmd = torch.CmdLine()

-- Model options
cmd:option('-init_from', '')         -- resume from a previously saved model
cmd:option('-dropout', 0.5)          -- dropout probability
cmd:option('-cnn', false)            -- use a CNN model?
cmd:option('-rnn', false)            -- use an RNN model?
cmd:option('-rnn_size', 32)          -- RNN embedding size
cmd:option('-rnn_layers', 1)         -- number of RNN layers
cmd:option('-unidirectional', false) -- use a unidirectional RNN
cmd:option('-cnn_filters', '9-5-3')  -- convolution filter sizes at each CNN layer; also sets the number of CNN layers
cmd:option('-cnn_pool', 2)           -- convolution layer pool size
cmd:option('-cnn_size', 128)         -- number of conv feature maps at each layer for CNN and CNN-RNN

-- Optimization options
cmd:option('-max_epochs', 50)        -- number of epochs to train for
cmd:option('-learning_rate', 1e-2)
cmd:option('-grad_clip', 5)          -- gradient clip magnitude (0 disables clipping)
cmd:option('-lr_decay_every', 10)    -- decay the learning rate every this many epochs
cmd:option('-lr_decay_factor', 0.5)  -- multiplicative learning rate decay factor

-- GPU
cmd:option('-gpu', 1)                -- GPU id; set to 0 to run on CPU

-- Dataset options
cmd:option('-data_root', 'data')     -- data root directory
cmd:option('-dataset', 'deepbind')   -- dataset name
cmd:option('-seq_length', 101)       -- length of the DNA sequences
cmd:option('-TF', 'ATF1_K562_ATF1_-06-325-_Harvard') -- change for a different TF
cmd:option('-alphabet', 'ACGT')
cmd:option('-size', 0)               -- how much of each dataset to load; 0 = full
cmd:option('-batch_size', 256)
cmd:option('-class_labels', '1,0')   -- class labels, positive label first

-- Other
cmd:option('-noprogressbar', false)  -- disable the progress bar
cmd:option('-name', '')              -- optional extra name appended to the model directory
cmd:option('-checkpoint_every', 0)   -- save a model checkpoint every X epochs (0 disables)

-- Directory to save models to
cmd:option('-save_dir', 'models/')
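
-- Example invocations (illustrative; any TF name must match a directory
-- under data_root/dataset):
--   th main.lua -cnn -gpu 1
--   th main.lua -cnn -rnn -rnn_size 64 -TF <some_TF_dataset>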

local opt = cmd:parse(arg)

-- Derived options (computed from the raw flags)
opt.class_labels_table = opt.class_labels:split(',')
opt.num_classes = #opt.class_labels_table
opt.alphabet_size = #opt.alphabet

local data_dir = opt.data_root..'/'..opt.dataset..'/'

-- Name of the directory to save the models to; it encodes the architecture
-- and key hyperparameters so different configurations do not collide.
if opt.cnn and (not opt.rnn) then -- CNN
  model_name = 'model=CNN,cnn_size='..opt.cnn_size..',cnn_filters='..opt.cnn_filters
elseif opt.cnn and opt.rnn then -- CNN-RNN
  model_name = 'model=CNN-RNN,cnn_size='..opt.cnn_size..',cnn_filter='..opt.cnn_filters:split('-')[1]..',rnn_size='..opt.rnn_size..',rnn_layers='..opt.rnn_layers
  if opt.unidirectional then model_name = model_name..',unidirectional' end
elseif (not opt.cnn) and opt.rnn then -- RNN
  model_name = 'model=RNN,rnn_size='..opt.rnn_size..',rnn_layers='..opt.rnn_layers
  if opt.unidirectional then model_name = model_name..',unidirectional' end
else
  print('Need either the -cnn or -rnn flag! Exiting')
  os.exit()
end
model_name = model_name..',dropout='..opt.dropout..',learning_rate='..opt.learning_rate..',batch_size='..opt.batch_size
if opt.name ~= '' then model_name = model_name..','..opt.name end
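
-- e.g. with -cnn -rnn and the defaults above, model_name is:
-- model=CNN-RNN,cnn_size=128,cnn_filter=9,rnn_size=32,rnn_layers=1,dropout=0.5,learning_rate=0.01,batch_size=256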

-- Set up GPU stuff
local dtype = 'torch.FloatTensor'
if opt.gpu > 0 then
  collectgarbage()
  require 'cutorch'
  require 'cunn'
  cutorch.setDevice(opt.gpu)
  dtype = 'torch.CudaTensor'
  print(string.format('Running with CUDA on GPU %d', opt.gpu))
else
  print('Running in CPU mode')
end

-- Check whether a file exists
local function file_exists(name)
  local f = io.open(name, 'r')
  if f ~= nil then io.close(f) return true else return false end
end

-- Uncomment the for loop and if statement below (and the matching ends at the
-- bottom of the file) to loop through all TFs and run train/test on each.
-- for TF in lfs.dir(data_dir) do
-- if TF:sub(1,1) ~= '.' and TF:sub(1,3) ~= 'ALL' then

flag = true -- set to false if training diverges; guards the final model save
opt.TF = TF or opt.TF -- TF is only set when the loop above is uncommented
opt.data_dir = data_dir..opt.TF
print('------TF DATASET-------\n'..opt.data_dir..'\n')
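
-- Skip this TF if a finished run already wrote a log (unless we are
-- explicitly resuming from a saved model via -init_from).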
local save_file = opt.save_dir..model_name..'/'..opt.TF..'/log.t7'
print('------MODEL LOCATION-------\n'..opt.save_dir..model_name..'\n')
if file_exists(save_file) and opt.init_from == '' then
  print('Already trained! Exiting')
else
  local log = {}
  log['train'] = {}      -- per-epoch train AUROC
  log['test'] = {}       -- per-epoch test AUROC
  log['best_epoch'] = 0
  log['best_auc'] = 0

  -- ============ Load Data =========== --
  require('data')
  data = {}
  data.train = createDatasetOneHot('train', opt)
  data.test = createDatasetOneHot('test', opt)
  train_size = data.train.inputs:size()
  test_size = data.test.inputs:size()

  -- ====== Initialize the model and criterion ======= --
  local model = nil
  if opt.init_from ~= '' then -- load a saved model
    print('Initializing from ', opt.init_from)
    model = torch.load(opt.init_from):type(dtype)
  else -- create a new model
    local opt_clone = torch.deserialize(torch.serialize(opt)) -- deep-copy opt
    print(opt_clone)
    model = nn.Model(opt_clone):type(dtype)
  end

  local params, grad_params = model:getParameters()
  local crit = nn.ClassNLLCriterion():type(dtype)
  local optim_config = {learningRate = opt.learning_rate, momentum = 0.9}
  -- note: optim.adam (used below) reads learningRate but ignores the momentum field
  local AUC = auRoc:new()
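
  -- Each epoch: run minibatch gradient descent (Adam) over the training set,
  -- then evaluate on the test set. AUROC is accumulated one prediction at a
  -- time and reset after each pass.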

  -- ========== Run Train/Test ============ --
  local best_trainAUROC = 0
  for epoch = 1,opt.max_epochs do
    -- ========== Training ============ --
    print('======> Training epoch '..epoch)

    model:resetStates()
    model:training()
    for t = 1,train_size,opt.batch_size do
      if not opt.noprogressbar then
        xlua.progress(t, train_size)
      end
      model:resetStates()

      -- Loss function that we pass to an optim method
      local function f(w)
        assert(w == params)
        grad_params:zero()

        -- Get a minibatch
        x = data.train.inputs[t]:type(dtype)
        y = data.train.labels[t]:type(dtype)

        -- Forward pass through the model
        local scores = model:forward(x)

        -- Accumulate scores and labels for AUROC: take the log-probability of
        -- the positive class (column 1) and exponentiate it into a probability
        auc_in = scores[{{1,scores:size(1)},{1,1}}]:reshape(scores:size(1))
        for i = 1,auc_in:size(1) do AUC:add(math.exp(auc_in[i]), y[i]) end

        -- Forward/backward through the criterion
        local loss = crit:forward(scores, y)
        local grad_scores = crit:backward(scores, y):view(x:size(1), 2, -1):reshape(x:size(1), 2)

        -- Backward pass through the model
        model:backward(x, grad_scores)

        -- clip gradients
        if opt.grad_clip > 0 then
          grad_params:clamp(-opt.grad_clip, opt.grad_clip)
        end

        return loss, grad_params
      end
      local _, loss = optim.adam(f, params, optim_config)
    end
    local trainAUROC = AUC:calculateAuc()
    print('\nTrain AUC: '..trainAUROC)
    AUC:zero() -- reset the accumulator for the test pass

    -- =========== Testing ============ --
    print('======> Testing epoch '..epoch)
    model:resetStates()
    model:evaluate()
    for t = 1,test_size,opt.batch_size do
      -- progress bar
      if not opt.noprogressbar then
        xlua.progress(t, test_size)
      end

      -- get data
      x = data.test.inputs[t]:type(dtype)
      y = data.test.labels[t]:type(dtype)

      -- forward model
      model:resetStates()
      local scores = model:forward(x)

      -- add scores and labels to AUC
      auc_in = scores[{{1,scores:size(1)},{1,1}}]:reshape(scores:size(1))
      for i = 1,auc_in:size(1) do AUC:add(math.exp(auc_in[i]), y[i]) end
    end
    local testAUROC = AUC:calculateAuc()
    AUC:zero()
    print('\nTest AUC: '..testAUROC)

    -- ======== Checkpoint and Log ========== --

    -- Check for a training failure (an AUROC this low means something went wrong)
    if testAUROC < 0.1 then
      flag = false
      print('error in training, break from current TF')
      break
    end

    -- Log the best AUC results and remember the model
    -- (note: clone('weight','bias') shares parameters with the live model, so
    -- best_model tracks subsequent updates rather than freezing a snapshot;
    -- the final save below writes the current model)
    if epoch > 1 and (trainAUROC > best_trainAUROC) then
      best_trainAUROC = trainAUROC
      best_model = model:clone('weight','bias')
      log['best_epoch'] = epoch
      log['train_auc'] = trainAUROC
      log['test_auc'] = testAUROC
    end

    -- Save periodic checkpoints (if enabled); make sure the directories exist
    if opt.checkpoint_every > 0 and epoch % opt.checkpoint_every == 0 then
      lfs.mkdir(opt.save_dir..model_name)
      lfs.mkdir(opt.save_dir..model_name..'/'..opt.TF)
      torch.save(lfs.currentdir()..'/'..opt.save_dir..model_name..'/'..opt.TF..'/epoch'..epoch..'_model.t7', model)
    end

    -- decay learning rate
    if epoch % opt.lr_decay_every == 0 then
      local old_lr = optim_config.learningRate
      optim_config.learningRate = old_lr * opt.lr_decay_factor
    end

    -- Log every epoch (we don't save a model every epoch because of space)
    table.insert(log['test'], testAUROC)
    table.insert(log['train'], trainAUROC)
    collectgarbage()
  end

  -- ======== Save the Best Model and Log ========== --
  if flag then
    lfs.mkdir(opt.save_dir..model_name)
    lfs.mkdir(opt.save_dir..model_name..'/'..opt.TF)
    torch.save(lfs.currentdir()..'/'..opt.save_dir..model_name..'/'..opt.TF..'/best_model.t7', model)
    torch.save(opt.save_dir..model_name..'/'..opt.TF..'/log.t7', log)
  end
end

-- end -- if directory
-- end -- loop through TFs
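
-- To inspect a finished run (sketch; substitute the actual model_name and TF):
--   local log = torch.load('models/<model_name>/<TF>/log.t7')
--   print(log.best_epoch, log.train_auc, log.test_auc)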