janggu / Git / Diff of /tests/test_dna

Models:
MarcoTheBlack/
janggu
Downloads: 1
Diff of /tests/test_dna_dataset.py [000000] .. [d7cf27]
Switch to side-by-side view

--- a
+++ b/tests/test_dna_dataset.py
@@ -0,0 +1,1314 @@
+import glob
+import os
+
+import matplotlib
+import numpy as np
+import pandas
+import pkg_resources
+import pytest
+from keras.layers import Input
+from keras.models import Model
+from pybedtools import BedTool
+from pybedtools import Interval
+
+from janggu.data import Bioseq
+from janggu.data import GenomicIndexer
+from janggu.data import VariantStreamer
+from janggu.layers import Complement
+from janggu.layers import Reverse
+from janggu.utils import complement_permmatrix
+from janggu.utils import sequences_from_fasta
+
+matplotlib.use('AGG')
+
+binsize = 200
+flank = 150
+stepsize = 50
+
+
+def datalen(bed_file):
+    binsizes = 0
+    reader = BedTool(bed_file)
+    for reg in reader:
+        binsizes += (reg.iv.end - reg.iv.start - binsize + stepsize)//stepsize
+    return binsizes
+
+
+def test_dna_loading_from_seqrecord(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    order = 2
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_merged = os.path.join(data_path, 'sample.gtf')
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+    seqs = sequences_from_fasta(refgenome)
+
+    data = Bioseq.create_from_refgenome('train', refgenome=seqs,
+                                     roi=bed_merged,
+                                     storage='ndarray',
+                                     store_whole_genome=True,
+                                     order=order)
+
+    np.testing.assert_equal(data[0], data[data.gindexer[0]])
+    chrom = data.gindexer[0].chrom
+    start = data.gindexer[0].start
+    end = data.gindexer[0].end
+    np.testing.assert_equal(data[0], data[(chrom, start, end)])
+    np.testing.assert_equal(data[0], data[chrom, start, end])
+
+
+def test_dna_genomic_interval_access(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    order = 2
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_merged = os.path.join(data_path, 'sample.gtf')
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=bed_merged,
+                                     storage='ndarray',
+                                     order=order)
+
+    with pytest.raises(Exception):
+        # due to store_whole_genome = False
+        data[data.gindexer[0]]
+
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=bed_merged,
+                                     storage='ndarray',
+                                     order=order,
+                                     store_whole_genome=True)
+
+    np.testing.assert_equal(data[0], data[data.gindexer[0]])
+    chrom = data.gindexer[0].chrom
+    start = data.gindexer[0].start
+    end = data.gindexer[0].end
+    np.testing.assert_equal(data[0], data[(chrom, start, end)])
+    np.testing.assert_equal(data[0], data[chrom, start, end])
+
+def test_dna_dims_order_1_from_subset_dataframe(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    order = 1
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_merged = os.path.join(data_path, 'sample.gtf')
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+
+    roi = pandas.read_csv(bed_merged,
+                          sep='\t', header=None, usecols=[0, 2, 3, 4, 5,  6], skiprows=2,
+                          names=['chrom', 'name', 'start', 'end', 'score', 'strand'])
+    roi.start -= 1
+    print(roi)
+
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=roi,
+                                     storage='ndarray',
+                                     store_whole_genome=True,
+                                     order=order)
+
+    np.testing.assert_equal(data[0], data[data.gindexer[0]])
+    assert len(data.garray.handle) == 2
+
+    # for order 1
+    assert len(data) == 2
+    assert data.shape == (2, 10000, 1, 4)
+    assert data[:].sum() == 20000
+
+    roi = BedTool(bed_merged)
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=roi,
+                                     storage='ndarray',
+                                     store_whole_genome=True,
+                                     order=order)
+
+    np.testing.assert_equal(data[0], data[data.gindexer[0]])
+    assert len(data.garray.handle) == 2
+
+    # for order 1
+    assert len(data) == 2
+    assert data.shape == (2, 10000, 1, 4)
+    assert data[:].sum() == 20000
+
+    roi = [iv for iv in BedTool(bed_merged)]
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=roi,
+                                     storage='ndarray',
+                                     store_whole_genome=True,
+                                     order=order)
+
+    np.testing.assert_equal(data[0], data[data.gindexer[0]])
+    assert len(data.garray.handle) == 2
+
+    # for order 1
+    assert len(data) == 2
+    assert data.shape == (2, 10000, 1, 4)
+    assert data[:].sum() == 20000
+
+
+def test_dna_dims_order_1_from_subset(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    order = 1
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_merged = os.path.join(data_path, 'sample.gtf')
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=bed_merged,
+                                     storage='ndarray',
+                                     order=order)
+
+    np.testing.assert_equal(data[0], data[data.gindexer[0]])
+    assert len(data.garray.handle) == 2
+
+    # for order 1
+    assert len(data) == 2
+    assert data.shape == (2, 10000, 1, 4)
+    assert data[:].sum() == 20000
+
+
+def test_dna_dims_order_1_from_subset(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    order = 1
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_merged = os.path.join(data_path, 'sample.gtf')
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=bed_merged,
+                                     binsize=200, stepsize=200,
+                                     storage='ndarray',
+                                     order=order)
+
+    assert len(data.garray.handle['data']) == 100
+
+    # for order 1
+    assert len(data) == 100
+    assert data.shape == (100, 200, 1, 4)
+    # the correctness of the sequence extraction was also
+    # validated using:
+    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
+    # >chr1:15000-25000
+    # ATTGTGGTGA...
+    # this sequence is read from the forward strand
+    np.testing.assert_equal(data[0][0, :10, 0, :],
+                            np.asarray([[1, 0, 0, 0],  # A
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 1, 0],  # C
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 1, 0],  # G
+                                        [0, 0, 1, 0],  # G
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 1, 0],  # G
+                                        [1, 0, 0, 0]],  # A
+                            dtype='int8'))
+
+    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
+    # >chr2:15000-25000
+    # ggggaagcaa...
+    # this sequence is read from the reverse strand
+    # so we have ...ttgcttcccc
+    np.testing.assert_equal(data[50][0, -10:, 0, :],
+                            np.asarray([[0, 0, 0, 1],  # T
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 1, 0],  # G
+                                        [0, 1, 0, 0],  # C
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 0, 1],  # T
+                                        [0, 1, 0, 0],  # C
+                                        [0, 1, 0, 0],  # C
+                                        [0, 1, 0, 0],  # C
+                                        [0, 1, 0, 0]],  # C
+                            dtype='int8'))
+
+
+def test_dna_dims_order_1_from_reference(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    order = 1
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_merged = os.path.join(data_path, 'sample.gtf')
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+
+    gindexer = GenomicIndexer.create_from_file(bed_merged, 200, 200)
+
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                        storage='ndarray',
+                                        order=order,
+                                        store_whole_genome=True)
+    data.gindexer = gindexer
+    assert len(data.garray.handle) == 2
+    assert 'chr1' in data.garray.handle
+    assert 'chr2' in data.garray.handle
+
+    # for order 1
+    assert len(data) == 100
+    assert data.shape == (100, 200, 1, 4)
+    # the correctness of the sequence extraction was also
+    # validated using:
+    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
+    # >chr1:15000-25000
+    # ATTGTGGTGA...
+    # this sequence is read from the forward strand
+    np.testing.assert_equal(data[0][0, :10, 0, :],
+                            np.asarray([[1, 0, 0, 0],  # A
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 1, 0],  # C
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 1, 0],  # G
+                                        [0, 0, 1, 0],  # G
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 1, 0],  # G
+                                        [1, 0, 0, 0]],  # A
+                            dtype='int8'))
+
+    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
+    # >chr2:15000-25000
+    # ggggaagcaa...
+    # this sequence is read from the reverse strand
+    # so we have ...ttgcttcccc
+    np.testing.assert_equal(data[50][0, -10:, 0, :],
+                            np.asarray([[0, 0, 0, 1],  # T
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 1, 0],  # G
+                                        [0, 1, 0, 0],  # C
+                                        [0, 0, 0, 1],  # T
+                                        [0, 0, 0, 1],  # T
+                                        [0, 1, 0, 0],  # C
+                                        [0, 1, 0, 0],  # C
+                                        [0, 1, 0, 0],  # C
+                                        [0, 1, 0, 0]],  # C
+                            dtype='int8'))
+
+
+def test_dna_dims_order_2(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    order = 2
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_merged = os.path.join(data_path, 'sample.bed')
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=bed_merged,
+                                     binsize=200,
+                                     storage='ndarray',
+                                     order=order)
+
+    # for order 1
+    assert len(data) == 100
+    assert data.shape == (100, 199, 1, 16)
+    # the correctness of the sequence extraction was also
+    # validated using:
+    # >bedtools getfasta -fi sample_genome.fa -bed sample.bed
+    # >chr1:15000-25000
+    # ATTGTGGTGAC...
+    np.testing.assert_equal(
+        data[0][0, :10, 0, :],
+        np.asarray([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # AT
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # TT
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],  # TG
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # GT
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],  # TG
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # GG
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # GT
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],  # TG
+                    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],  # GA
+                    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # AC
+                   dtype='int8'))
+
+    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
+    # >chr2:15000-25000
+    # ggggaagcaag...
+    # this sequence is read from the reverse strand
+    # so we have ...cttgcttcccc
+    np.testing.assert_equal(
+        data[50][0, -10:, 0, :],
+        np.asarray([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],  # CT
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # TT
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],  # TG
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],  # GC
+                    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],  # CT
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # TT
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],  # TC
+                    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # CC
+                    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # CC
+                    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # CC
+                   dtype='int8'))
+
+
+def reverse_layer(order):
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    bed_file = os.path.join(data_path, 'sample.bed')
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=bed_file,
+                                     storage='ndarray',
+                                     binsize=binsize,
+                                     flank=flank,
+                                     order=order)
+
+    dna_in = Input(shape=data.shape[1:], name='dna')
+    rdna_layer = Reverse()(dna_in)
+
+    rmod = Model(dna_in, rdna_layer)
+
+    # actual shape of DNA
+    dna = data[0]
+    np.testing.assert_equal(dna[:, ::-1, :, :], rmod.predict(dna))
+
+
+def complement_layer(order):
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    bed_file = os.path.join(data_path, 'sample.bed')
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+
+    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=bed_file,
+                                     storage='ndarray',
+                                     binsize=binsize,
+                                     flank=flank,
+                                     order=order)
+
+    dna_in = Input(shape=data.shape[1:], name='dna')
+    cdna_layer = Complement()(dna_in)
+    cmod = Model(dna_in, cdna_layer)
+
+    # actual shape of DNA
+    dna = data[0]
+
+    cdna = cmod.predict(dna)
+    ccdna = cmod.predict(cdna)
+
+    with pytest.raises(Exception):
+        np.testing.assert_equal(dna, cdna)
+    np.testing.assert_equal(dna, ccdna)
+
+
+def test_reverse_order_1(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    reverse_layer(1)
+
+
+def test_reverse_order_2(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    reverse_layer(2)
+
+
+def test_complement_order_1(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    complement_layer(1)
+
+
+def test_complement_order_2(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    complement_layer(2)
+
+
+def test_revcomp_rcmatrix(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+
+    rcmatrix = complement_permmatrix(1)
+
+    np.testing.assert_equal(rcmatrix,
+                            np.array([[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0],
+                                      [1, 0, 0, 0]]))
+
+    rcmatrix = complement_permmatrix(2)
+
+    np.testing.assert_equal(rcmatrix[0],
+                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 1]))
+    np.testing.assert_equal(rcmatrix[4],
+                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 1, 0]))
+    np.testing.assert_equal(rcmatrix[8],
+                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 1, 0, 0]))
+    np.testing.assert_equal(rcmatrix[12],
+                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 0, 0, 0]))
+
+    np.testing.assert_equal(rcmatrix[1],
+                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+                                      0, 0, 0, 0]))
+    np.testing.assert_equal(rcmatrix[5],
+                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+                                      0, 0, 0, 0]))
+    np.testing.assert_equal(rcmatrix[9],
+                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+                                      0, 0, 0, 0]))
+    np.testing.assert_equal(rcmatrix[13],
+                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+                                      0, 0, 0, 0]))
+
+
+def test_rcmatrix_identity():
+
+    for order in range(1, 4):
+
+        rcmatrix = complement_permmatrix(order)
+
+        np.testing.assert_equal(np.eye(pow(4, order)),
+                                np.matmul(rcmatrix, rcmatrix))
+
+
+def test_dna_dataset_sanity(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_file = os.path.join(data_path, 'sample.bed')
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+
+    with pytest.raises(Exception):
+        # name must be a string
+        Bioseq.create_from_refgenome(1.23, refgenome='',
+                                  storage='ndarray',
+                                  roi=bed_file, order=1)
+    with pytest.raises(Exception):
+        Bioseq.create_from_refgenome('train', refgenome='',
+                                  storage='ndarray',
+                                  roi=bed_file, order=1)
+    with pytest.raises(Exception):
+        Bioseq.create_from_refgenome('train', refgenome='test',
+                                  storage='ndarray',
+                                  roi=bed_file, order=1)
+
+    with pytest.raises(Exception):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                  storage='ndarray',
+                                  roi=bed_file, order=0)
+    with pytest.raises(Exception):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                  storage='ndarray',
+                                  roi=bed_file, flank=-1)
+    with pytest.raises(Exception):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                  storage='ndarray',
+                                  roi=bed_file, binsize=0)
+    with pytest.raises(Exception):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                  storage='ndarray',
+                                  roi=bed_file, stepsize=0)
+
+    with pytest.warns(FutureWarning):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                  storage='ndarray',
+                                  roi=bed_file,
+                                  datatags=['help'])
+
+    with pytest.warns(FutureWarning):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                  storage='ndarray',
+                                  roi=bed_file,
+                                  overwrite=True)
+    with pytest.raises(Exception):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                  storage='step',
+                                  roi=bed_file, order=1)
+
+    assert not os.path.exists(os.path.join(tmpdir.strpath, 'train',
+                                           'storage.h5'))
+    with pytest.raises(ValueError):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                  storage='sparse',
+                                  roi=None, order=1,
+                                  store_whole_genome=True)
+    with pytest.raises(ValueError):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                  roi=bed_file, order=0,
+                                  store_whole_genome=True)
+    with pytest.raises(ValueError):
+        Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                                     roi=None, store_whole_genome=False)
+
+    Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                              storage='ndarray',
+                              roi=None, order=1,
+                              store_whole_genome=True)
+    file_ = glob.glob(os.path.join(tmpdir.strpath, 'datasets', 'train', '*.h5'))
+    assert len(file_) == 0
+    print(refgenome)
+    print(bed_file)
+    Bioseq.create_from_refgenome('train', refgenome=refgenome,
+        storage='ndarray',
+        roi=bed_file, order=1, cache=True)
+    Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                              storage='hdf5',
+                              roi=bed_file, order=1, cache=True)
+    # a cache file must exist now
+    file_ = glob.glob(os.path.join(tmpdir.strpath, 'datasets', 'train', '*.h5'))
+    assert len(file_) == 1
+
+    # reload the cached file
+    Bioseq.create_from_refgenome('train', refgenome=refgenome,
+                              storage='hdf5',
+                              roi=bed_file, order=1, cache=True)
+
+
+
+def test_read_dna_from_biostring_order_1():
+
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    order = 1
+    filename = os.path.join(data_path, 'sample.fa')
+    seqs = sequences_from_fasta(filename)
+    with pytest.raises(ValueError):
+        data = Bioseq.create_from_seq('train', fastafile=seqs, storage='sparse',
+                                     order=order, cache=False)
+
+    data = Bioseq.create_from_seq('train', fastafile=seqs,
+                                 order=order, cache=False)
+
+    np.testing.assert_equal(len(data), 3897)
+    np.testing.assert_equal(data.shape, (3897, 200, 1, 4))
+    np.testing.assert_equal(
+        data[0][0, :10, 0, :],
+        np.asarray([[0, 1, 0, 0],
+                    [1, 0, 0, 0],
+                    [0, 1, 0, 0],
+                    [1, 0, 0, 0],
+                    [0, 0, 1, 0],
+                    [0, 1, 0, 0],
+                    [1, 0, 0, 0],
+                    [0, 0, 1, 0],
+                    [1, 0, 0, 0],
+                    [0, 0, 1, 0]], dtype='int8'))
+
+
+def test_read_dna_from_fasta_order_1():
+
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    order = 1
+    filename = os.path.join(data_path, 'sample.fa')
+    data = Bioseq.create_from_seq('train', fastafile=filename,
+                                 order=order, cache=False)
+
+    np.testing.assert_equal(len(data), 3897)
+    np.testing.assert_equal(data.shape, (3897, 200, 1, 4))
+    np.testing.assert_equal(
+        data[0][0, :10, 0, :],
+        np.asarray([[0, 1, 0, 0],
+                    [1, 0, 0, 0],
+                    [0, 1, 0, 0],
+                    [1, 0, 0, 0],
+                    [0, 0, 1, 0],
+                    [0, 1, 0, 0],
+                    [1, 0, 0, 0],
+                    [0, 0, 1, 0],
+                    [1, 0, 0, 0],
+                    [0, 0, 1, 0]], dtype='int8'))
+
+
+def test_read_dna_from_fasta_order_2():
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    order = 2
+    filename = os.path.join(data_path, 'sample.fa')
+    for store_genome in [True, False]:
+        data = Bioseq.create_from_seq('train', fastafile=filename,
+                                      order=order, cache=False)
+
+        np.testing.assert_equal(len(data), 3897)
+        np.testing.assert_equal(data.shape, (3897, 199, 1, 16))
+        np.testing.assert_equal(
+            data[0][0, :10, 0, :],
+            np.asarray([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype='int8'))
+
+def test_read_protein_sequences():
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    order = 1
+    filename = os.path.join(data_path, 'sample_protein.fa')
+    data = Bioseq.create_from_seq('train', fastafile=filename,
+                                 order=order, seqtype='protein', fixedlen=1000)
+    np.testing.assert_equal(len(data), 3)
+    np.testing.assert_equal(data.shape, (3, 1000, 1, 20))
+    np.testing.assert_equal(
+        data[0][0, :4, 0, :],
+        np.asarray([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype='int8'))
+    np.testing.assert_equal(
+        data[0][0, -2:, 0, :], np.zeros((2, 20), dtype='int8'))
+
+    data = Bioseq.create_from_seq('train', fastafile=filename,
+                                 order=order, seqtype='protein', fixedlen=5)
+    np.testing.assert_equal(len(data), 3)
+    np.testing.assert_equal(data.shape, (3, 5, 1, 20))
+    np.testing.assert_equal(
+        data[0][0, :4, 0, :],
+        np.asarray([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype='int8'))
+
+
+def test_dnabed_overreaching_ends_whole_genome():
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_file = os.path.join(data_path, "bed_test.bed")
+    filename = os.path.join(data_path, 'sample_genome.fa')
+
+    bioseq = Bioseq.create_from_refgenome(
+        'test',
+        refgenome=filename,
+        roi=bed_file,
+        binsize=2,
+        flank=20,
+        store_whole_genome=True,
+        storage='ndarray', cache=False)
+    assert len(bioseq) == 9
+    assert bioseq.shape == (9, 2+2*20, 1, 4)
+    # test if beginning is correctly padded
+    np.testing.assert_equal(bioseq[0].sum(), 22)
+    # test if end is correctly padded
+    np.testing.assert_equal(bioseq['chr1', 29990, 30010].sum(), 10)
+
+
+def test_dnabed_overreaching_ends_partial_genome():
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+    bed_file = os.path.join(data_path, "bed_test.bed")
+    filename = os.path.join(data_path, 'sample_genome.fa')
+
+    bioseq = Bioseq.create_from_refgenome(
+        'test',
+        refgenome=filename,
+        roi=bed_file,
+        binsize=2,
+        flank=20,
+        store_whole_genome=False,
+        storage='ndarray')
+    assert len(bioseq) == 9
+    assert bioseq.shape == (9, 2+2*20, 1, 4)
+    np.testing.assert_equal(bioseq[0].sum(), 22)
+    np.testing.assert_equal(bioseq[-1].sum(), 42 - 4)
+
+
+@pytest.mark.filterwarnings("ignore:The truth value")
+def test_janggu_variant_streamer_order_1(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    """Test Janggu creation by shape and name. """
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    order = 1
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+    vcffile = os.path.join(data_path, 'sample.vcf')
+
+    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
+                                       storage='ndarray',
+                                       binsize=50,
+                                       store_whole_genome=True,
+                                       order=order)
+
+    # even binsize
+    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
+    it_vcf = iter(vcf.flow())
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # C to T
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    assert names[0] == 'refmismatch'
+    np.testing.assert_equal(reference, alternative)
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # C to T
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # T to C
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # A to G
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,4,0,:], np.array([1,0,0,0]))
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,1,0]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # G to A
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,1,0]))
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([1,0,0,0]))
+
+    # odd binsize
+    vcf = VariantStreamer(dna, vcffile, binsize=3, batch_size=1)
+    it_vcf = iter(vcf.flow())
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # C to T
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    assert names[0] == 'refmismatch'
+    np.testing.assert_equal(reference, alternative)
+    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # C to T
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,1,0,:], np.array([0,1,0,0]))
+    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,0,0,1]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # T to C
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,1,0,:], np.array([0,0,0,1]))
+    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))
+
+
+@pytest.mark.filterwarnings("ignore:The truth value")
+def test_janggu_variant_streamer_order_1_w_refgenome(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    """Test Janggu creation by shape and name. """
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    order = 1
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+    vcffile = os.path.join(data_path, 'sample.vcf')
+
+    #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
+    #                                   storage='ndarray',
+    #                                   binsize=50,
+    #                                   store_whole_genome=True,
+    #                                   order=order)
+
+    # even binsize
+    vcf = VariantStreamer(refgenome, vcffile,
+                          binsize=10, batch_size=1,
+                          order=order)
+    it_vcf = iter(vcf.flow())
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # C to T
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    assert names[0] == 'refmismatch'
+    np.testing.assert_equal(reference, alternative)
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # C to T
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # T to C
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # A to G
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,4,0,:], np.array([1,0,0,0]))
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,1,0]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # G to A
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,1,0]))
+    np.testing.assert_equal(alternative[0,4,0,:], np.array([1,0,0,0]))
+
+    # odd binsize
+    vcf = VariantStreamer(refgenome, vcffile, binsize=3, batch_size=1)
+    it_vcf = iter(vcf.flow())
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # C to T
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    assert names[0] == 'refmismatch'
+    np.testing.assert_equal(reference, alternative)
+    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # C to T
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,1,0,:], np.array([0,1,0,0]))
+    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,0,0,1]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # T to C
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,1,0,:], np.array([0,0,0,1]))
+    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))
+
+
+@pytest.mark.filterwarnings("ignore:The truth value")
+def test_janggu_variant_streamer_order_12_ignore_ref_match(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    """Test Janggu creation by shape and name. """
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+    vcffile = os.path.join(data_path, 'sample.vcf')
+
+    for order in [1, 2]:
+
+        dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
+                                           storage='ndarray',
+                                           binsize=50,
+                                           store_whole_genome=True,
+                                           order=order)
+
+        # even binsize
+        vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1,
+                              ignore_reference_match=True)
+        it_vcf = iter(vcf.flow())
+        names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+        # C to T
+        print(names, chroms, poss, ra, aa)
+        print(reference)
+        print(alternative)
+
+        assert names[0] == 'refmismatch'
+        #np.testing.assert_equal(reference, alternative)
+        np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order)
+        #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+        # odd binsize
+        vcf = VariantStreamer(dna, vcffile, binsize=3, batch_size=1,
+                              ignore_reference_match=True)
+        it_vcf = iter(vcf.flow())
+
+        names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+        # C to T
+        print(names, chroms, poss, ra, aa)
+        print(reference)
+        print(alternative)
+        assert names[0] == 'refmismatch'
+        np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order)
+        #np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))
+
+
+@pytest.mark.filterwarnings("ignore:The truth value")
+def test_janggu_variant_streamer_order_12_ignore_ref_match_w_refgenome(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    """Test Janggu creation by shape and name. """
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+    vcffile = os.path.join(data_path, 'sample.vcf')
+
+    for order in [1, 2]:
+
+        #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
+        #                                   storage='ndarray',
+        #                                   binsize=50,
+        #                                   store_whole_genome=True,
+        #                                   order=order)
+
+        # even binsize
+        vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1,
+                              ignore_reference_match=True, order=order)
+        it_vcf = iter(vcf.flow())
+        names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+        # C to T
+        print(names, chroms, poss, ra, aa)
+        print(reference)
+        print(alternative)
+
+        assert names[0] == 'refmismatch'
+        #np.testing.assert_equal(reference, alternative)
+        np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order)
+        #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+        # odd binsize
+        vcf = VariantStreamer(refgenome, vcffile, binsize=3, batch_size=1,
+                              ignore_reference_match=True, order=order)
+        it_vcf = iter(vcf.flow())
+
+        names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+        # C to T
+        print(names, chroms, poss, ra, aa)
+        print(reference)
+        print(alternative)
+        assert names[0] == 'refmismatch'
+        np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order)
+        #np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))
+
+@pytest.mark.filterwarnings("ignore:The truth value")
+def test_janggu_variant_streamer_order_1_revcomp(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    """Test Janggu creation by shape and name. """
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    order = 1
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+    vcffile = os.path.join(data_path, 'sample.vcf')
+
+    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
+                                       storage='ndarray',
+                                       binsize=50,
+                                       store_whole_genome=True,
+                                       order=order)
+
+    annot = BedTool([Interval('chr2', 110, 130, '-')])
+
+    # even binsize
+    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
+    it_vcf = iter(vcf.flow())
+    next(it_vcf)
+    # C to T
+    #print(names, chroms, poss, ra, aa)
+    #print(reference)
+    #print(alternative)
+    #assert names[0] == 'refmismatch'
+    #np.testing.assert_equal(reference, alternative)
+    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+    next(it_vcf)
+    # C to T
+    #print(names, chroms, poss, ra, aa)
+    #print(reference)
+    #print(alternative)
+    #np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
+    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # T to C
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+#    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
+#    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+    # even binsize
+    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1,
+                          annotation=annot)
+    it_vcf = iter(vcf.flow())
+    next(it_vcf)
+    # C to T
+
+
+    next(it_vcf)
+    # C to T
+
+    names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf)
+    # T to C
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference, reference2[:,::-1, :, ::-1])
+    np.testing.assert_equal(alternative, alternative2[:,::-1, :, ::-1])
+
+
+@pytest.mark.filterwarnings("ignore:The truth value")
+def test_janggu_variant_streamer_order_1_revcomp_w_refgenome(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    """Test Janggu creation by shape and name. """
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    order = 1
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+    vcffile = os.path.join(data_path, 'sample.vcf')
+
+    #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
+    #                                   storage='ndarray',
+    #                                   binsize=50,
+    #                                   store_whole_genome=True,
+    #                                   order=order)
+
+    annot = BedTool([Interval('chr2', 110, 130, '-')])
+
+    # even binsize
+    vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1, order=order)
+    it_vcf = iter(vcf.flow())
+    next(it_vcf)
+    # C to T
+    #print(names, chroms, poss, ra, aa)
+    #print(reference)
+    #print(alternative)
+    #assert names[0] == 'refmismatch'
+    #np.testing.assert_equal(reference, alternative)
+    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+    next(it_vcf)
+    # C to T
+    #print(names, chroms, poss, ra, aa)
+    #print(reference)
+    #print(alternative)
+    #np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
+    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))
+
+    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
+    # T to C
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+#    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
+#    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
+
+    # even binsize
+    vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1,
+                          annotation=annot, order=order)
+    it_vcf = iter(vcf.flow())
+    next(it_vcf)
+    # C to T
+
+
+    next(it_vcf)
+    # C to T
+
+    names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf)
+    # T to C
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference, reference2[:,::-1, :, ::-1])
+    np.testing.assert_equal(alternative, alternative2[:,::-1, :, ::-1])
+
+
+@pytest.mark.filterwarnings("ignore:The truth value")
+def test_janggu_variant_streamer_order_2(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    """Test Janggu creation by shape and name. """
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    order = 2
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+    vcffile = os.path.join(data_path, 'sample.vcf')
+
+    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
+                                       storage='ndarray',
+                                       binsize=50,
+                                       store_whole_genome=True,
+                                       order=order)
+
+    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
+    it_vcf = iter(vcf.flow())
+
+    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
+    # ACT -> ATT
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    assert names[0] == 'refmismatch'
+    np.testing.assert_equal(reference, alternative)
+
+    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
+    # ACT -> ATT
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,3,0,1], 1)
+    np.testing.assert_equal(reference[0,4,0,7], 1)
+    np.testing.assert_equal(alternative[0,3,0,3], 1)
+    np.testing.assert_equal(alternative[0,4,0,15], 1)
+
+
+    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    # CTC -> CCC
+    np.testing.assert_equal(reference[0,3,0,7], 1)
+    np.testing.assert_equal(reference[0,4,0,13], 1)
+    np.testing.assert_equal(alternative[0,3,0,5], 1)
+    np.testing.assert_equal(alternative[0,4,0,5], 1)
+
+    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    # GAC -> GGC
+    np.testing.assert_equal(reference[0,3,0,8], 1)
+    np.testing.assert_equal(reference[0,4,0,1], 1)
+    np.testing.assert_equal(alternative[0,3,0,10], 1)
+    np.testing.assert_equal(alternative[0,4,0,9], 1)
+
+    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    # CGG -> CAG
+    np.testing.assert_equal(reference[0,3,0,6], 1)
+    np.testing.assert_equal(reference[0,4,0,10], 1)
+    np.testing.assert_equal(alternative[0,3,0,4], 1)
+    np.testing.assert_equal(alternative[0,4,0,2], 1)
+
+    vcf = VariantStreamer(dna, vcffile, binsize=5, batch_size=1)
+    it_vcf = iter(vcf.flow())
+
+    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
+    # ACT -> ATT
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    assert names[0] == 'refmismatch'
+    np.testing.assert_equal(reference, alternative)
+
+    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
+    # ACT -> ATT
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,1,0,1], 1)
+    np.testing.assert_equal(reference[0,2,0,7], 1)
+    np.testing.assert_equal(alternative[0,1,0,3], 1)
+    np.testing.assert_equal(alternative[0,2,0,15], 1)
+
+
+    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    # CTC -> CCC
+    np.testing.assert_equal(reference[0,1,0,7], 1)
+    np.testing.assert_equal(reference[0,2,0,13], 1)
+    np.testing.assert_equal(alternative[0,1,0,5], 1)
+    np.testing.assert_equal(alternative[0,2,0,5], 1)
+
+
+@pytest.mark.filterwarnings("ignore:The truth value")
+def test_janggu_variant_streamer_order_2_w_refgenome(tmpdir):
+    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
+    """Test Janggu creation by shape and name. """
+    data_path = pkg_resources.resource_filename('janggu', 'resources/')
+
+    order = 2
+
+    refgenome = os.path.join(data_path, 'sample_genome.fa')
+    vcffile = os.path.join(data_path, 'sample.vcf')
+
+    #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
+    #                                   storage='ndarray',
+    #                                   binsize=50,
+    #                                   store_whole_genome=True,
+    #                                   order=order)
+
+    vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1,
+                          order=order)
+    it_vcf = iter(vcf.flow())
+
+    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
+    # ACT -> ATT
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    assert names[0] == 'refmismatch'
+    np.testing.assert_equal(reference, alternative)
+
+    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
+    # ACT -> ATT
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,3,0,1], 1)
+    np.testing.assert_equal(reference[0,4,0,7], 1)
+    np.testing.assert_equal(alternative[0,3,0,3], 1)
+    np.testing.assert_equal(alternative[0,4,0,15], 1)
+
+
+    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    # CTC -> CCC
+    np.testing.assert_equal(reference[0,3,0,7], 1)
+    np.testing.assert_equal(reference[0,4,0,13], 1)
+    np.testing.assert_equal(alternative[0,3,0,5], 1)
+    np.testing.assert_equal(alternative[0,4,0,5], 1)
+
+    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    # GAC -> GGC
+    np.testing.assert_equal(reference[0,3,0,8], 1)
+    np.testing.assert_equal(reference[0,4,0,1], 1)
+    np.testing.assert_equal(alternative[0,3,0,10], 1)
+    np.testing.assert_equal(alternative[0,4,0,9], 1)
+
+    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    # CGG -> CAG
+    np.testing.assert_equal(reference[0,3,0,6], 1)
+    np.testing.assert_equal(reference[0,4,0,10], 1)
+    np.testing.assert_equal(alternative[0,3,0,4], 1)
+    np.testing.assert_equal(alternative[0,4,0,2], 1)
+
+    vcf = VariantStreamer(refgenome, vcffile, binsize=5, batch_size=1,
+                          order=order)
+    it_vcf = iter(vcf.flow())
+
+    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
+    # ACT -> ATT
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    assert names[0] == 'refmismatch'
+    np.testing.assert_equal(reference, alternative)
+
+    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
+    # ACT -> ATT
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    np.testing.assert_equal(reference[0,1,0,1], 1)
+    np.testing.assert_equal(reference[0,2,0,7], 1)
+    np.testing.assert_equal(alternative[0,1,0,3], 1)
+    np.testing.assert_equal(alternative[0,2,0,15], 1)
+
+
+    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
+    print(names, chroms, poss, ra, aa)
+    print(reference)
+    print(alternative)
+    # CTC -> CCC
+    np.testing.assert_equal(reference[0,1,0,7], 1)
+    np.testing.assert_equal(reference[0,2,0,13], 1)
+    np.testing.assert_equal(alternative[0,1,0,5], 1)
+    np.testing.assert_equal(alternative[0,2,0,5], 1)