require 'torch'
require 'nn'
require 'optim'
require 'model'
dofile('util/auRoc.lua') -- AUC helper ('include' is not a standard Lua global)
require 'lfs'
require 'xlua' -- progress bar (used below but was never required)
local cmd = torch.CmdLine()
-- model options
cmd:option('-init_from', '') -- resume previous model
cmd:option('-dropout', 0.5) -- dropout probability
cmd:option('-cnn', false) -- cnn model?
cmd:option('-rnn', false) -- rnn model?
cmd:option('-rnn_size', 32) -- rnn embedding size
cmd:option('-rnn_layers', 1) -- number of rnn layers
cmd:option('-unidirectional', false) -- unidirectional RNN
cmd:option('-cnn_filters', '9-5-3') -- convolution filter widths per CNN layer; the number of entries also sets the number of CNN layers
cmd:option('-cnn_pool', 2) -- convolution layer pool size
cmd:option('-cnn_size', 128) -- number of conv feature maps at each layer for CNN and CNN-RNN
-- Optimization options
cmd:option('-max_epochs', 50) -- number of epochs to train for
cmd:option('-learning_rate', 1e-2)
cmd:option('-grad_clip', 5) -- gradient clip value magnitude
cmd:option('-lr_decay_every', 10) -- decay the learning rate every this many epochs
cmd:option('-lr_decay_factor', 0.5) -- multiplicative learning rate decay factor
-- GPU
cmd:option('-gpu', 1) -- set to 0 if no GPU
-- Dataset options
cmd:option('-data_root', 'data') -- data root directory
cmd:option('-dataset', 'deepbind') -- dataset
cmd:option('-seq_length', 101) --length of DNA sequences
cmd:option('-TF', 'ATF1_K562_ATF1_-06-325-_Harvard') -- change for different TF
cmd:option('-alphabet', 'ACGT')
cmd:option('-size', 0) -- how much of each dataset to load. 0 = full
cmd:option('-batch_size', 256)
cmd:option('-class_labels','1,0') --specify positive label first
-- Other
cmd:option('-noprogressbar', false) -- disable the xlua progress bar
cmd:option('-name', '') --special name for model (optional)
cmd:option('-checkpoint_every', 0) -- save a model checkpoint every X epochs (0 = disabled)
-- Directory to save models to
cmd:option('-save_dir', 'models/')
local opt = cmd:parse(arg)
opt.class_labels_table = opt.class_labels:split(',')
opt.num_classes = #opt.class_labels_table
opt.alphabet_size = #opt.alphabet
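-- data_dir points at the dataset root; each TF gets its own subdirectory beneath it (see opt.data_dir below)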
local data_dir = opt.data_root..'/'..opt.dataset..'/'
-- Name of directory to save the models to
if opt.cnn and (not opt.rnn) then -- CNN
model_name ='model=CNN,cnn_size='..opt.cnn_size..',cnn_filters='..opt.cnn_filters
elseif opt.cnn and opt.rnn then -- CNN-RNN
model_name = 'model=CNN-RNN,cnn_size='..opt.cnn_size..',cnn_filter='..opt.cnn_filters:split('-')[1]..',rnn_size='..opt.rnn_size..',rnn_layers='..opt.rnn_layers
if opt.unidirectional then model_name = model_name..',unidirectional' end
elseif (not opt.cnn) and opt.rnn then -- RNN
model_name = 'model=RNN,rnn_size='..opt.rnn_size..',rnn_layers='..opt.rnn_layers
if opt.unidirectional then model_name = model_name..',unidirectional' end
else
print('Need either -cnn or -rnn flag! Exiting')
os.exit()
end
model_name = model_name..',dropout='..opt.dropout..',learning_rate='..opt.learning_rate..',batch_size='..opt.batch_size
if (opt.name ~= '') then model_name = model_name..','..opt.name end
-- Set up GPU stuff
local dtype = 'torch.FloatTensor'
if opt.gpu > 0 then
collectgarbage()
require 'cutorch'
require 'cunn'
cutorch.setDevice(opt.gpu)
dtype = 'torch.CudaTensor'
print(string.format('Running with CUDA on GPU %d', opt.gpu))
else
print 'Running in CPU mode'
end
-- check if a file exists
local function file_exists(name)
local f = io.open(name, "r")
if f ~= nil then io.close(f) return true else return false end
end
-- *Uncomment the for loop and if statement below to loop over all TFs and run train/test*
-- for TF in lfs.dir(data_dir) do
-- if TF:sub(1,1) ~= '.' and TF:sub(1,3) ~= 'ALL' then
flag = true -- tracks whether training completed without error
opt.TF = TF or opt.TF -- TF is set by the (commented-out) loop above; otherwise the -TF option is used
opt.data_dir = data_dir..opt.TF
print('------TF DATASET-------\n'..opt.data_dir..'\n')
local save_file = opt.save_dir..model_name..'/'..opt.TF..'/log.t7'
print('------MODEL LOCATION-------\n'..opt.save_dir..model_name..'\n')
if file_exists(save_file) and opt.init_from == '' then
print('Already trained! Skipping')
else
local log = {}
log['train'] = {}
log['test'] = {}
log['best_epoch'] = 0
log['train_auc'] = 0 -- train/test AUC at the best epoch (these are the keys written during training below)
log['test_auc'] = 0
-- ============ Load Data =========== --
require('data')
local data = {}
data.train = createDatasetOneHot("train", opt)
data.test = createDatasetOneHot("test", opt)
local train_size = data.train.inputs:size()
local test_size = data.test.inputs:size()
-- ====== Initialize the model and criterion ======= --
local model = nil
if opt.init_from ~= '' then -- load a model
print('Initializing from ', opt.init_from)
model = torch.load(opt.init_from):type(dtype)
else --create new model
local opt_clone = torch.deserialize(torch.serialize(opt))
print(opt_clone)
model = nn.Model(opt_clone):type(dtype)
end
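-- getParameters flattens all learnable weights and their gradients into two
-- single tensors; it must be called once, after the model is fully built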
local params, grad_params = model:getParameters()
local crit = nn.ClassNLLCriterion():type(dtype)
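-- Note: optim.adam ignores the 'momentum' field below (it uses its own beta1/beta2 defaults); it only matters if you switch to optim.sgd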
local optim_config = {learningRate = opt.learning_rate, momentum = 0.9}
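-- the AUC object accumulates (probability, label) pairs across batches; AUC:zero() resets it after each train/test pass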
local AUC = auRoc:new()
-- ========== Run Train/Test ============ --
local best_trainAUROC = 0
for epoch = 1,opt.max_epochs do
-- ========== Training ============ --
print('======> Training epoch '..epoch)
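-- resetStates clears any recurrent hidden state carried in the model (presumably a no-op for pure CNN configurations)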
model:resetStates()
model:training()
for t = 1,train_size,opt.batch_size do
if not opt.noprogressbar then
xlua.progress(t, train_size)
end
model:resetStates()
-- Loss function that we pass to an optim method
local function f(w)
assert(w == params)
grad_params:zero()
-- Get a minibatch
local x = data.train.inputs[t]:type(dtype)
local y = data.train.labels[t]:type(dtype)
-- forward model
local scores = model:forward(x)
-- add scores and labels to AUC: column 1 holds the log-probability of the
-- positive class (opt.class_labels lists the positive label first);
-- math.exp maps log-probabilities back to probabilities for the AUC
local auc_in = scores:select(2, 1)
for i = 1,auc_in:size(1) do AUC:add(math.exp(auc_in[i]), y[i]) end
-- forward/backward criterion
local loss = crit:forward(scores, y)
local grad_scores = crit:backward(scores, y) -- already (batch x 2); the old view/reshape round-trip was a no-op
-- backward model
model:backward(x, grad_scores)
-- clip gradients
if opt.grad_clip > 0 then
grad_params:clamp(-opt.grad_clip, opt.grad_clip)
end
return loss, grad_params
end
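-- optim.adam updates params in place and returns (params, {loss})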
local _, loss = optim.adam(f, params, optim_config)
end
local trainAUROC = AUC:calculateAuc()
print('\nTrain AUC: '..trainAUROC)
AUC:zero()
-- =========== Testing ============ --
print('======> Testing epoch '..epoch)
model:resetStates()
model:evaluate()
for t = 1,test_size,opt.batch_size do
--progress bar
if not opt.noprogressbar then
xlua.progress(t, test_size)
end
-- get data
local x = data.test.inputs[t]:type(dtype)
local y = data.test.labels[t]:type(dtype)
-- forward model
model:resetStates()
local scores = model:forward(x)
-- add scores and labels to AUC (same positive-class extraction as in training)
local auc_in = scores:select(2, 1)
for i = 1,auc_in:size(1) do AUC:add(math.exp(auc_in[i]), y[i]) end
end
local testAUROC = AUC:calculateAuc()
AUC:zero()
print('\nTest AUC: '..testAUROC)
-- ======== Checkpoint and Log ========== --
-- abort if training has clearly failed (an AUC this far below chance indicates a bug or divergence)
if testAUROC < 0.1 then
flag = false
print('error in training, break from current TF')
break
end
-- log the best AUC results and remember the model
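-- note: the "best" epoch is selected by train AUC (epoch 1 is never kept), and the test AUC at that epoch is logged alongside it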
if epoch > 1 and (trainAUROC > best_trainAUROC) then
best_trainAUROC = trainAUROC
best_model = model:clone() -- deep copy; clone('weight','bias') would share parameters with the live model rather than snapshot them
log['best_epoch'] = epoch
log['train_auc'] = trainAUROC
log['test_auc'] = testAUROC
end
-- Save periodic checkpoints (-checkpoint_every 0 disables this; the old 'epoch % 0' test could never fire)
if opt.checkpoint_every > 0 and epoch % opt.checkpoint_every == 0 then
lfs.mkdir(opt.save_dir..model_name)
lfs.mkdir(opt.save_dir..model_name..'/'..opt.TF)
torch.save(lfs.currentdir()..'/'..opt.save_dir..model_name..'/'..opt.TF..'/epoch'..epoch..'_model.t7', model)
end
-- decay learning rate
if epoch % opt.lr_decay_every == 0 then
local old_lr = optim_config.learningRate
optim_config.learningRate = old_lr * opt.lr_decay_factor
end
-- Log train/test AUC every epoch (we don't save a model every epoch because of space)
table.insert(log['test'],testAUROC)
table.insert(log['train'],trainAUROC)
collectgarbage()
end
-- ======== Save the Best model and Log ========== --
if flag then
lfs.mkdir(opt.save_dir..model_name)
lfs.mkdir(opt.save_dir..model_name..'/'..opt.TF)
torch.save(lfs.currentdir()..'/'..opt.save_dir..model_name..'/'..opt.TF..'/best_model.t7', best_model or model) -- fall back to the final model if no best epoch was recorded
torch.save(opt.save_dir..model_name..'/'..opt.TF..'/log.t7', log)
end
end
-- end -- if directory
-- end -- loop through TFs