Diff of /tests/test_convertion.py [000000] .. [d7cf27]

Switch to side-by-side view

--- a
+++ b/tests/test_convertion.py
@@ -0,0 +1,206 @@
+import os
+
+import matplotlib
+matplotlib.use('AGG')
+
+import numpy as np
+import pytest
+from keras import backend as K
+from keras.layers import Conv2D
+from pkg_resources import resource_filename
+
+from janggu import Janggu
+from janggu import inputlayer
+from janggu import outputconv
+from janggu.data import Bioseq
+from janggu.data import Cover
+from janggu.layers import DnaConv2D
+from janggu.layers import LocalAveragePooling2D
+
+@pytest.mark.filterwarnings("ignore:inspect")
+@pytest.mark.filterwarnings("ignore:The binary")
+def test_create_from_array_whole_genome_true_from_pred(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    # load the dataset
+    # The pseudo genome represents just a concatenation of all sequences
+    # in sample.fa and sample2.fa. Therefore, the results should be almost
+    # identically to the models obtained from classify_fasta.py.
+    REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')
+    # ROI contains regions spanning positive and negative examples
+    ROI_FILE = resource_filename('janggu', 'resources/roi_train.bed')
+    # PEAK_FILE only contains positive examples
+    PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')
+
+    DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
+                                       roi=ROI_FILE,
+                                       binsize=200, stepsize=200,
+                                       order=1,
+                                       store_whole_genome=True)
+
+    LABELS = Cover.create_from_bed('peaks', roi=ROI_FILE,
+                                   bedfiles=PEAK_FILE,
+                                   binsize=200, stepsize=200,
+                                   resolution=200,
+                                   store_whole_genome=True)
+
+    @inputlayer
+    @outputconv('sigmoid')
+    def double_stranded_model_dnaconv(inputs, inp, oup, params):
+        with inputs.use('dna') as layer:
+            layer = DnaConv2D(Conv2D(params[0], (params[1], 1),
+                                     activation=params[2]))(layer)
+        output = LocalAveragePooling2D(window_size=layer.shape.as_list()[1],
+                                       name='motif')(layer)
+        return inputs, output
+
+    modeltemplate = double_stranded_model_dnaconv
+
+    K.clear_session()
+
+    # create a new model object
+    model = Janggu.create(template=modeltemplate,
+                          modelparams=(30, 21, 'relu'),
+                          inputs=DNA,
+                          outputs=LABELS)
+
+    model.compile(optimizer='adadelta', loss='binary_crossentropy',
+                  metrics=['acc'])
+
+    pred = model.predict(DNA)
+
+    cov_out = Cover.create_from_array('BindingProba', pred, LABELS.gindexer,
+                                      store_whole_genome=True)
+
+    assert pred.shape == cov_out.shape
+
+    np.testing.assert_equal(pred, cov_out[:])
+
+    assert len(cov_out.gindexer) == len(pred)
+    assert len(cov_out.garray.handle) == 1
+
+
+@pytest.mark.filterwarnings("ignore:inspect")
+@pytest.mark.filterwarnings("ignore:The binary")
+def test_create_from_array_whole_genome_true(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    
+    # load the dataset
+    # The pseudo genome represents just a concatenation of all sequences
+    # in sample.fa and sample2.fa. Therefore, the results should be almost
+    # identically to the models obtained from classify_fasta.py.
+    # ROI contains regions spanning positive and negative examples
+    ROI_FILE = resource_filename('janggu', 'resources/roi_train.bed')
+    # PEAK_FILE only contains positive examples
+    PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')
+
+    LABELS = Cover.create_from_bed('peaks', roi=ROI_FILE,
+                                   bedfiles=[PEAK_FILE]*5,
+                                   binsize=200, stepsize=200,
+                                   resolution=200,
+                                   store_whole_genome=True)
+
+    pred = LABELS[:]
+
+    for storage in ['ndarray', 'sparse', 'hdf5']:
+        print(storage)
+        cov_out = Cover.create_from_array('BindingProba', pred,
+                                          LABELS.gindexer,
+                                          cache=True,
+                                          storage=storage,
+                                          store_whole_genome=True)
+
+        np.testing.assert_equal(cov_out[:], LABELS[:])
+        np.testing.assert_equal(cov_out.shape, LABELS.shape)
+
+@pytest.mark.filterwarnings("ignore:The binary")
+def test_create_from_array_whole_genome_false_pred(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    # load the dataset
+    # The pseudo genome represents just a concatenation of all sequences
+    # in sample.fa and sample2.fa. Therefore, the results should be almost
+    # identically to the models obtained from classify_fasta.py.
+    REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')
+    # ROI contains regions spanning positive and negative examples
+    ROI_FILE = resource_filename('janggu', 'resources/roi_train.bed')
+    # PEAK_FILE only contains positive examples
+    PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')
+
+    DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
+                                       roi=ROI_FILE,
+                                       binsize=200, stepsize=200,
+                                       order=1,
+                                       store_whole_genome=False)
+
+    LABELS = Cover.create_from_bed('peaks', roi=ROI_FILE,
+                                   bedfiles=PEAK_FILE,
+                                   binsize=200, stepsize=200,
+                                   resolution=200,
+                                   store_whole_genome=False)
+
+    @inputlayer
+    @outputconv('sigmoid')
+    def double_stranded_model_dnaconv(inputs, inp, oup, params):
+        with inputs.use('dna') as layer:
+            layer = DnaConv2D(Conv2D(params[0], (params[1], 1),
+                                     activation=params[2]))(layer)
+        output = LocalAveragePooling2D(window_size=layer.shape.as_list()[1],
+                                       name='motif')(layer)
+        return inputs, output
+
+    modeltemplate = double_stranded_model_dnaconv
+
+    K.clear_session()
+
+    # create a new model object
+    model = Janggu.create(template=modeltemplate,
+                          modelparams=(30, 21, 'relu'),
+                          inputs=DNA,
+                          outputs=LABELS)
+
+    model.compile(optimizer='adadelta', loss='binary_crossentropy',
+                  metrics=['acc'])
+
+    pred = model.predict(DNA)
+
+    cov_out = Cover.create_from_array('BindingProba', pred, LABELS.gindexer,
+                                      store_whole_genome=False)
+
+    assert pred.shape == cov_out.shape
+
+    np.testing.assert_equal(pred, cov_out[:])
+
+    assert len(cov_out.gindexer) == len(pred)
+    assert len(cov_out.garray.handle['data']) == len(pred)
+
+@pytest.mark.filterwarnings("ignore:inspect")
+@pytest.mark.filterwarnings("ignore:The binary")
+def test_create_from_array_whole_genome_false(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    # load the dataset
+    # The pseudo genome represents just a concatenation of all sequences
+    # in sample.fa and sample2.fa. Therefore, the results should be almost
+    # identically to the models obtained from classify_fasta.py.
+    # ROI contains regions spanning positive and negative examples
+    ROI_FILE = resource_filename('janggu', 'resources/roi_train.bed')
+    # PEAK_FILE only contains positive examples
+    PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')
+
+    LABELS = Cover.create_from_bed('peaks', roi=ROI_FILE,
+                                   bedfiles=[PEAK_FILE]*5,
+                                   binsize=200, stepsize=200,
+                                   resolution=200,
+                                   store_whole_genome=False)
+
+    pred = LABELS[:]
+
+    for storage in ['ndarray', 'sparse', 'hdf5']:
+        print(storage)
+        cov_out = Cover.create_from_array('BindingProba', pred,
+                                          LABELS.gindexer,
+                                          cache=True,
+                                          storage=storage,
+                                          store_whole_genome=False)
+
+        np.testing.assert_equal(cov_out[:], LABELS[:])
+        np.testing.assert_equal(cov_out.shape, LABELS.shape)
+