[214c6e]: / 01_jund_prediction / dnase_dna_model.py

Download this file

135 lines (104 with data), 4.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import argparse
import os
import numpy as np
import pandas as pd
# Default to the first GPU unless the caller already selected devices via
# CUDA_VISIBLE_DEVICES; an existing value is left untouched.
# NOTE(review): this deliberately sits ahead of the keras imports below --
# presumably the backend reads the variable when it is first imported, so
# keep it above them.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
from keras.layers import Concatenate
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import average_precision_score
from janggu import Janggu
from janggu import model_from_json
from data_utils import get_data
from model_utils import get_opt
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(1234)

# Name templates of the previously trained single-input models; the `{}`
# placeholder is filled with the run number inside the training loop.
dnasemodelname = 'dnase_run_zscorelog_orient_7_{}'
dnamodelname = 'dna_o3_d0.2_run_5_{}'

# Command-line interface. Both options are optional and default to
# directories relative to the current working directory.
PARSER = argparse.ArgumentParser(description='Command description.')
PARSER.add_argument(
    '-inputpath',
    help='Location of input files',
    default='../data',
    dest='inpath',
)
PARSER.add_argument(
    '-path',
    help="Output directory for the examples.",
    default='../jund_results',
    dest='path',
)
args = PARSER.parse_args()

inpath = args.inpath
# Publish the output directory through JANGGU_OUTPUT; the same variable is
# read back at the end of the script when the result table is written.
os.environ['JANGGU_OUTPUT'] = args.path

# Two-line banner: a rule of 20 '#' characters followed by the status text.
print("#" * 20, "Fitting combined models ...", sep="\n")
# Settings handed to get_data() below. The grouping comments are inferred
# from the key names only -- TODO confirm against data_utils / model_utils.
shared_space = dict(
    type='dnase_dna',
    # -- keys that look DNA-sequence related --
    seq_dropout=True,
    dnaflank=150,
    order=3,
    stranded='double',
    nmotifs1=10,
    motiflen=11,
    pool1=30,
    nmotifs2=8,
    hypermotiflen=3,
    # -- keys that look DNase-coverage related --
    dnaseflank=450,
    normalize='tpm',
    augment='orient',
    nkernel1=10,
    kernel1len=5,
    kernel1pool=2,
    nkernel2=5,
    kernel2len=3,
    # -- remaining keys --
    binsize=200,
    concat='flatten',
    inception=True,
    epochs=100,  # upper bound on training epochs used by the fit() call
    opt='sgd',
)
space = dict(modeltype=shared_space)

# get_data() returns an indexable result whose first three entries are used
# as the training, validation and test sets (in that order).
DATA = get_data(shared_space)
train_data, val_data, test_data = DATA[0], DATA[1], DATA[2]

# Per-run AUPRC scores. Only the "rand" lists are filled by the training
# loop in this script; the "pre" lists stay empty here.
auprc_pre_val, auprc_pre_test = [], []
auprc_rand_val, auprc_rand_test = [], []
# Next, we concatenate the individual pre-trained models into one joint
# DNase+DNA network, once per run.
# NOTE: in this script the joint model built from the pre-trained parts is
# never compiled or fitted -- it only supplies the architecture. That
# architecture is re-created from its JSON description (i.e. with newly
# initialised weights), trained from scratch, and evaluated.
for dnarun, dnaserun in zip([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]):
    # load pre-trained models
    dnaname = dnamodelname.format(dnarun)
    dnasename = dnasemodelname.format(dnaserun)

    dnamodel = Janggu.create_by_name(dnaname)
    dnasemodel = Janggu.create_by_name(dnasename)

    # remove output layer, concatenate the top-hidden layers, append output
    # (layers[-2] is the layer directly below each model's last layer)
    hidden_dna = dnamodel.kerasmodel.layers[-2].output
    hidden_dnase = dnasemodel.kerasmodel.layers[-2].output

    joint_hidden = Concatenate(name='concat')([hidden_dna, hidden_dnase])
    output = Dense(1, activation='sigmoid', name='peaks')(joint_hidden)

    # joint model on top of the pre-trained sub-models; used below only as
    # the architecture template
    jointmodel = Janggu(dnamodel.kerasmodel.inputs + dnasemodel.kerasmodel.inputs,
                        output,
                        name='pretrained_dnase_dna_joint_model_{}_{}'.format(dnasename, dnaname))

    # reload the same model architecture from JSON; this randomly
    # re-initializes the weights
    newjointmodel = model_from_json(jointmodel.kerasmodel.to_json())

    newjointmodel = Janggu(newjointmodel.inputs,
                           newjointmodel.outputs,
                           name='randominit_dnase_dna_joint_model_{}_{}'.format(dnasename, dnaname))

    newjointmodel.compile(optimizer=get_opt('amsgrad'), loss='binary_crossentropy',
                          metrics=['acc'])

    # train with early stopping on the validation set; the best weights seen
    # during training are restored when training stops
    hist = newjointmodel.fit(train_data[0], train_data[1],
                             epochs=shared_space['epochs'], batch_size=64,
                             validation_data=val_data,
                             callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])

    pred_test = newjointmodel.predict(test_data[0])
    pred_val = newjointmodel.predict(val_data[0])

    # area under the precision-recall curve on validation and test labels
    auprc_val = average_precision_score(val_data[1][:], pred_val)
    auprc_test = average_precision_score(test_data[1][:], pred_test)
    print('auprc_val: {:.2%}'.format(auprc_val))
    print('auprc_test: {:.2%}'.format(auprc_test))

    auprc_rand_val.append(auprc_val)
    auprc_rand_test.append(auprc_test)
# Gather the per-run scores of the randomly initialised joint models into a
# table (one row per run) and store it as a tab-separated file inside the
# directory named by JANGGU_OUTPUT.
df = pd.DataFrame(dict(auprc_val=auprc_rand_val, auprc_test=auprc_rand_test))
df.to_csv(
    os.path.join(os.environ['JANGGU_OUTPUT'], "dnase_dna_use_randominit_submodels.tsv"),
    sep='\t',
)