534 lines (533 with data), 22.5 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Predict regulatory regions from the DNA sequence using keras"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we illustrate several variants how to predict regulatory regions (of a toy example) from the DNA sequence.\n",
"The reference genome is made up of a concatenation of Oct4 and Mafk binding sites and we shall use all regions on chromosome 'pseudo1' as training\n",
"and 'pseudo2' as test chromosomes."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/wkopp/anaconda3/envs/jdev/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n",
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"from keras import Model\n",
"from keras import backend as K\n",
"from keras.layers import Conv2D\n",
"\n",
"from keras.layers import Dense\n",
"from keras.layers import GlobalAveragePooling2D\n",
"from keras.layers import Input\n",
"\n",
"from pkg_resources import resource_filename\n",
"\n",
"from janggu.data import Bioseq\n",
"from janggu.data import Cover\n",
"from janggu.data import ReduceDim\n",
"from janggu.layers import DnaConv2D\n",
"\n",
"from sklearn.metrics import roc_auc_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Image\n",
"\n",
"np.random.seed(1234)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, we need to specify the output directory in which the results are stored."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"os.environ['JANGGU_OUTPUT'] = '/home/wkopp/janggu_examples'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Specify the DNA sequence feature order. Order 1, 2 and 3 correspond to mono-, di- and tri-nucleotide based features (see Tutorial)."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"order = 3"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# load the dataset\n",
"# The pseudo genome represents just a concatenation of all sequences\n",
"# in sample.fa and sample2.fa. Therefore, the results should be almost\n",
"# identically to the models obtained from classify_fasta.py.\n",
"REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')\n",
"# ROI contains regions spanning positive and negative examples\n",
"ROI_TRAIN_FILE = resource_filename('janggu', 'resources/roi_train.bed')\n",
"ROI_TEST_FILE = resource_filename('janggu', 'resources/roi_test.bed')\n",
"# PEAK_FILE only contains positive examples\n",
"PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the datasets for training and testing"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading from lazy loader\n",
"reload /home/wkopp/janggu_examples/datasets/dna/09dc9f2602f1bea4bf22fd52b8adfa21c7500a31f70c0ef7a506f79a4f92b43a.npz\n",
"loading from bed lazy loader\n",
"reload /home/wkopp/janggu_examples/datasets/peaks/fd9826cf7fb9cc044a6c1354a14688c1be0f0bd9c593fdba2e9af3284a2be099.npz\n",
"loading from lazy loader\n",
"loading from bed lazy loader\n"
]
}
],
"source": [
"# Training input and labels are purely defined genomic coordinates\n",
"DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,\n",
" roi=ROI_TRAIN_FILE,\n",
" binsize=200,\n",
" order=order,\n",
" cache=True)\n",
"\n",
"# The ReduceDim wrapper transforms the dataset from a 4D object to a 2D table-like representation\n",
"LABELS = ReduceDim(Cover.create_from_bed('peaks', roi=ROI_TRAIN_FILE,\n",
" bedfiles=PEAK_FILE,\n",
" binsize=200,\n",
" resolution=200,\n",
" cache=True,\n",
" storage='sparse'))\n",
"\n",
"\n",
"DNA_TEST = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,\n",
" roi=ROI_TEST_FILE,\n",
" binsize=200,\n",
" order=order)\n",
"\n",
"LABELS_TEST = ReduceDim(Cover.create_from_bed('peaks',\n",
" bedfiles=PEAK_FILE,\n",
" roi=ROI_TEST_FILE,\n",
" binsize=200,\n",
" resolution=200,\n",
" storage='sparse'))\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"training set: (7797, 198, 1, 64) (7797, 1)\n",
"test set: (200, 198, 1, 64) (200, 1)\n"
]
}
],
"source": [
"print('training set:', DNA.shape, LABELS.shape)\n",
"print('test set:', DNA_TEST.shape, LABELS_TEST.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Specify a keras model with compatible dimesions input and output dimensions for the example."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"input_1 (InputLayer) (None, 198, 1, 64) 0 \n",
"_________________________________________________________________\n",
"dna_conv2d_1 (DnaConv2D) (None, 178, 1, 30) 40350 \n",
"_________________________________________________________________\n",
"global_average_pooling2d_1 ( (None, 30) 0 \n",
"_________________________________________________________________\n",
"dense_1 (Dense) (None, 1) 31 \n",
"=================================================================\n",
"Total params: 40,381\n",
"Trainable params: 40,381\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
]
}
],
"source": [
"xin = Input((200 - order + 1, 1, pow(4, order)))\n",
"layer = DnaConv2D(Conv2D(30, (21, 1),\n",
" activation='relu'))(xin)\n",
"layer = GlobalAveragePooling2D()(layer)\n",
"output = Dense(1, activation='sigmoid')(layer)\n",
"\n",
"\n",
"\n",
"model = Model(xin, output)\n",
"\n",
"model.compile(optimizer='adadelta', loss='binary_crossentropy',\n",
" metrics=['acc'])\n",
"model.summary()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/100\n",
"7797/7797 [==============================] - 5s 594us/step - loss: 0.5611 - acc: 0.7354\n",
"Epoch 2/100\n",
"7797/7797 [==============================] - 4s 453us/step - loss: 0.4607 - acc: 0.7980\n",
"Epoch 3/100\n",
"7797/7797 [==============================] - 3s 429us/step - loss: 0.4115 - acc: 0.8222\n",
"Epoch 4/100\n",
"7797/7797 [==============================] - 3s 409us/step - loss: 0.3785 - acc: 0.8424\n",
"Epoch 5/100\n",
"7797/7797 [==============================] - 3s 424us/step - loss: 0.3513 - acc: 0.8551\n",
"Epoch 6/100\n",
"7797/7797 [==============================] - 3s 434us/step - loss: 0.3256 - acc: 0.8657\n",
"Epoch 7/100\n",
"7797/7797 [==============================] - 3s 400us/step - loss: 0.3037 - acc: 0.8821\n",
"Epoch 8/100\n",
"7797/7797 [==============================] - 3s 396us/step - loss: 0.2801 - acc: 0.8916\n",
"Epoch 9/100\n",
"7797/7797 [==============================] - 3s 406us/step - loss: 0.2602 - acc: 0.9006\n",
"Epoch 10/100\n",
"7797/7797 [==============================] - 3s 408us/step - loss: 0.2412 - acc: 0.9127\n",
"Epoch 11/100\n",
"7797/7797 [==============================] - 3s 394us/step - loss: 0.2240 - acc: 0.9209\n",
"Epoch 12/100\n",
"7797/7797 [==============================] - 3s 441us/step - loss: 0.2083 - acc: 0.9278\n",
"Epoch 13/100\n",
"7797/7797 [==============================] - 3s 443us/step - loss: 0.1951 - acc: 0.9318\n",
"Epoch 14/100\n",
"7797/7797 [==============================] - 3s 404us/step - loss: 0.1822 - acc: 0.9396\n",
"Epoch 15/100\n",
"7797/7797 [==============================] - 3s 401us/step - loss: 0.1712 - acc: 0.9425\n",
"Epoch 16/100\n",
"7797/7797 [==============================] - 3s 415us/step - loss: 0.1621 - acc: 0.9450\n",
"Epoch 17/100\n",
"7797/7797 [==============================] - 3s 403us/step - loss: 0.1535 - acc: 0.9484\n",
"Epoch 18/100\n",
"7797/7797 [==============================] - 3s 408us/step - loss: 0.1448 - acc: 0.9511\n",
"Epoch 19/100\n",
"7797/7797 [==============================] - 3s 420us/step - loss: 0.1387 - acc: 0.9534\n",
"Epoch 20/100\n",
"7797/7797 [==============================] - 3s 433us/step - loss: 0.1312 - acc: 0.9595\n",
"Epoch 21/100\n",
"7797/7797 [==============================] - 3s 410us/step - loss: 0.1249 - acc: 0.9593\n",
"Epoch 22/100\n",
"7797/7797 [==============================] - 3s 409us/step - loss: 0.1196 - acc: 0.9614\n",
"Epoch 23/100\n",
"7797/7797 [==============================] - 3s 415us/step - loss: 0.1136 - acc: 0.9647\n",
"Epoch 24/100\n",
"7797/7797 [==============================] - 3s 431us/step - loss: 0.1091 - acc: 0.9663\n",
"Epoch 25/100\n",
"7797/7797 [==============================] - 3s 409us/step - loss: 0.1039 - acc: 0.9668\n",
"Epoch 26/100\n",
"7797/7797 [==============================] - 3s 405us/step - loss: 0.0991 - acc: 0.9708\n",
"Epoch 27/100\n",
"7797/7797 [==============================] - 3s 398us/step - loss: 0.0965 - acc: 0.9701\n",
"Epoch 28/100\n",
"7797/7797 [==============================] - 3s 417us/step - loss: 0.0921 - acc: 0.9727\n",
"Epoch 29/100\n",
"7797/7797 [==============================] - 3s 406us/step - loss: 0.0887 - acc: 0.9751\n",
"Epoch 30/100\n",
"7797/7797 [==============================] - 3s 406us/step - loss: 0.0860 - acc: 0.9760\n",
"Epoch 31/100\n",
"7797/7797 [==============================] - 3s 414us/step - loss: 0.0826 - acc: 0.9764\n",
"Epoch 32/100\n",
"7797/7797 [==============================] - 3s 398us/step - loss: 0.0795 - acc: 0.9777\n",
"Epoch 33/100\n",
"7797/7797 [==============================] - 3s 412us/step - loss: 0.0768 - acc: 0.9808\n",
"Epoch 34/100\n",
"7797/7797 [==============================] - 3s 394us/step - loss: 0.0743 - acc: 0.9797\n",
"Epoch 35/100\n",
"7797/7797 [==============================] - 3s 414us/step - loss: 0.0714 - acc: 0.9810\n",
"Epoch 36/100\n",
"7797/7797 [==============================] - 3s 415us/step - loss: 0.0695 - acc: 0.9822\n",
"Epoch 37/100\n",
"7797/7797 [==============================] - 3s 393us/step - loss: 0.0669 - acc: 0.9833\n",
"Epoch 38/100\n",
"7797/7797 [==============================] - 3s 408us/step - loss: 0.0642 - acc: 0.9844\n",
"Epoch 39/100\n",
"7797/7797 [==============================] - 3s 420us/step - loss: 0.0620 - acc: 0.9849\n",
"Epoch 40/100\n",
"7797/7797 [==============================] - 3s 413us/step - loss: 0.0605 - acc: 0.9858\n",
"Epoch 41/100\n",
"7797/7797 [==============================] - 3s 400us/step - loss: 0.0584 - acc: 0.9860\n",
"Epoch 42/100\n",
"7797/7797 [==============================] - 3s 413us/step - loss: 0.0565 - acc: 0.9863\n",
"Epoch 43/100\n",
"7797/7797 [==============================] - 3s 403us/step - loss: 0.0545 - acc: 0.9878\n",
"Epoch 44/100\n",
"7797/7797 [==============================] - 3s 415us/step - loss: 0.0537 - acc: 0.9885\n",
"Epoch 45/100\n",
"7797/7797 [==============================] - 3s 406us/step - loss: 0.0517 - acc: 0.9895\n",
"Epoch 46/100\n",
"7797/7797 [==============================] - 3s 409us/step - loss: 0.0503 - acc: 0.9887\n",
"Epoch 47/100\n",
"7797/7797 [==============================] - 3s 401us/step - loss: 0.0489 - acc: 0.9890\n",
"Epoch 48/100\n",
"7797/7797 [==============================] - 3s 402us/step - loss: 0.0472 - acc: 0.9901\n",
"Epoch 49/100\n",
"7797/7797 [==============================] - 3s 410us/step - loss: 0.0459 - acc: 0.9905\n",
"Epoch 50/100\n",
"7797/7797 [==============================] - 3s 404us/step - loss: 0.0450 - acc: 0.9895\n",
"Epoch 51/100\n",
"7797/7797 [==============================] - 3s 415us/step - loss: 0.0432 - acc: 0.9897\n",
"Epoch 52/100\n",
"7797/7797 [==============================] - 3s 399us/step - loss: 0.0420 - acc: 0.9915\n",
"Epoch 53/100\n",
"7797/7797 [==============================] - 3s 416us/step - loss: 0.0402 - acc: 0.9928\n",
"Epoch 54/100\n",
"7797/7797 [==============================] - 3s 408us/step - loss: 0.0394 - acc: 0.9919\n",
"Epoch 55/100\n",
"7797/7797 [==============================] - 3s 428us/step - loss: 0.0383 - acc: 0.9920\n",
"Epoch 56/100\n",
"7797/7797 [==============================] - 3s 404us/step - loss: 0.0368 - acc: 0.9932\n",
"Epoch 57/100\n",
"7797/7797 [==============================] - 3s 407us/step - loss: 0.0361 - acc: 0.9932\n",
"Epoch 58/100\n",
"7797/7797 [==============================] - 3s 403us/step - loss: 0.0353 - acc: 0.9931\n",
"Epoch 59/100\n",
"7797/7797 [==============================] - 3s 412us/step - loss: 0.0343 - acc: 0.9937\n",
"Epoch 60/100\n",
"7797/7797 [==============================] - 3s 419us/step - loss: 0.0332 - acc: 0.9938\n",
"Epoch 61/100\n",
"7797/7797 [==============================] - 3s 389us/step - loss: 0.0322 - acc: 0.9942\n",
"Epoch 62/100\n",
"7797/7797 [==============================] - 3s 421us/step - loss: 0.0315 - acc: 0.9946\n",
"Epoch 63/100\n",
"7797/7797 [==============================] - 3s 415us/step - loss: 0.0306 - acc: 0.9944\n",
"Epoch 64/100\n",
"7797/7797 [==============================] - 3s 410us/step - loss: 0.0297 - acc: 0.9956\n",
"Epoch 65/100\n",
"7797/7797 [==============================] - 3s 423us/step - loss: 0.0291 - acc: 0.9959\n",
"Epoch 66/100\n",
"7797/7797 [==============================] - 3s 440us/step - loss: 0.0285 - acc: 0.9955\n",
"Epoch 67/100\n",
"7797/7797 [==============================] - 3s 423us/step - loss: 0.0277 - acc: 0.9947\n",
"Epoch 68/100\n",
"7797/7797 [==============================] - 3s 439us/step - loss: 0.0265 - acc: 0.9955\n",
"Epoch 69/100\n",
"7797/7797 [==============================] - 3s 417us/step - loss: 0.0260 - acc: 0.9963\n",
"Epoch 70/100\n",
"7797/7797 [==============================] - 3s 435us/step - loss: 0.0253 - acc: 0.9968\n",
"Epoch 71/100\n",
"7797/7797 [==============================] - 4s 462us/step - loss: 0.0246 - acc: 0.9967\n",
"Epoch 72/100\n",
"7797/7797 [==============================] - 4s 454us/step - loss: 0.0239 - acc: 0.9964\n",
"Epoch 73/100\n",
"7797/7797 [==============================] - 4s 481us/step - loss: 0.0231 - acc: 0.9977\n",
"Epoch 74/100\n",
"7797/7797 [==============================] - 3s 427us/step - loss: 0.0229 - acc: 0.9967\n",
"Epoch 75/100\n",
"7797/7797 [==============================] - 3s 441us/step - loss: 0.0219 - acc: 0.9977\n",
"Epoch 76/100\n",
"7797/7797 [==============================] - 3s 414us/step - loss: 0.0216 - acc: 0.9974\n",
"Epoch 77/100\n",
"7797/7797 [==============================] - 3s 409us/step - loss: 0.0209 - acc: 0.9976\n",
"Epoch 78/100\n",
"7797/7797 [==============================] - 3s 422us/step - loss: 0.0202 - acc: 0.9982\n",
"Epoch 79/100\n",
"7797/7797 [==============================] - 3s 406us/step - loss: 0.0197 - acc: 0.9979\n",
"Epoch 80/100\n",
"7797/7797 [==============================] - 3s 419us/step - loss: 0.0192 - acc: 0.9986\n",
"Epoch 81/100\n",
"7797/7797 [==============================] - 3s 399us/step - loss: 0.0187 - acc: 0.9981\n",
"Epoch 82/100\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"7797/7797 [==============================] - 3s 408us/step - loss: 0.0184 - acc: 0.9985\n",
"Epoch 83/100\n",
"7797/7797 [==============================] - 3s 387us/step - loss: 0.0177 - acc: 0.9986\n",
"Epoch 84/100\n",
"7797/7797 [==============================] - 3s 412us/step - loss: 0.0172 - acc: 0.9987\n",
"Epoch 85/100\n",
"7797/7797 [==============================] - 3s 412us/step - loss: 0.0168 - acc: 0.9987\n",
"Epoch 86/100\n",
"7797/7797 [==============================] - 3s 406us/step - loss: 0.0165 - acc: 0.9988\n",
"Epoch 87/100\n",
"7797/7797 [==============================] - 3s 413us/step - loss: 0.0160 - acc: 0.9992\n",
"Epoch 88/100\n",
"7797/7797 [==============================] - 3s 412us/step - loss: 0.0156 - acc: 0.9991\n",
"Epoch 89/100\n",
"7797/7797 [==============================] - 3s 425us/step - loss: 0.0153 - acc: 0.9990\n",
"Epoch 90/100\n",
"7797/7797 [==============================] - 4s 449us/step - loss: 0.0149 - acc: 0.9988\n",
"Epoch 91/100\n",
"7797/7797 [==============================] - 3s 407us/step - loss: 0.0145 - acc: 0.9992\n",
"Epoch 92/100\n",
"7797/7797 [==============================] - 3s 447us/step - loss: 0.0144 - acc: 0.9991\n",
"Epoch 93/100\n",
"7797/7797 [==============================] - 4s 469us/step - loss: 0.0138 - acc: 0.9991\n",
"Epoch 94/100\n",
"7797/7797 [==============================] - 3s 422us/step - loss: 0.0133 - acc: 0.9995\n",
"Epoch 95/100\n",
"7797/7797 [==============================] - 3s 403us/step - loss: 0.0130 - acc: 0.9992\n",
"Epoch 96/100\n",
"7797/7797 [==============================] - 3s 436us/step - loss: 0.0125 - acc: 0.9995\n",
"Epoch 97/100\n",
"7797/7797 [==============================] - 3s 433us/step - loss: 0.0125 - acc: 0.9988\n",
"Epoch 98/100\n",
"7797/7797 [==============================] - 3s 407us/step - loss: 0.0123 - acc: 0.9994\n",
"Epoch 99/100\n",
"7797/7797 [==============================] - 3s 420us/step - loss: 0.0118 - acc: 0.9992\n",
"Epoch 100/100\n",
"7797/7797 [==============================] - 3s 406us/step - loss: 0.0117 - acc: 0.9995\n",
"########################################\n",
"loss: 0.011705336400447587, acc: 0.9994869821726305\n",
"########################################\n"
]
}
],
"source": [
"hist = model.fit(DNA, LABELS, epochs=100)\n",
"\n",
"print('#' * 40)\n",
"print('loss: {}, acc: {}'.format(hist.history['loss'][-1],\n",
" hist.history['acc'][-1]))\n",
"print('#' * 40)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The predictions may be converted back to Cover object and subsequently exported as bigwig in order to inspect the plausibility of the results in the genome browser."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# convert the prediction to a cover object\n",
"pred = model.predict(DNA_TEST)\n",
"cov_pred = Cover.create_from_array('BindingProba', pred, LABELS_TEST.gindexer)\n",
"\n",
"# predictions (or feature activities) can finally be exported to bigwig\n",
"cov_pred.export_to_bigwig(output_dir=os.environ['JANGGU_OUTPUT'])\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC: 0.9955999999999999\n"
]
}
],
"source": [
"print(\"AUC:\", roc_auc_score(LABELS_TEST[:], pred))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}