janggu / Git / [0473b3] /tests/test_dna

Models:
MarcoTheBlack/
janggu
Downloads: 1
[0473b3]: / tests / test_dna_dataset.py
History
Download this file
1315 lines (1089 with data), 51.2 kB

import glob
import os

import matplotlib
import numpy as np
import pandas
import pkg_resources
import pytest
from keras.layers import Input
from keras.models import Model
from pybedtools import BedTool
from pybedtools import Interval

from janggu.data import Bioseq
from janggu.data import GenomicIndexer
from janggu.data import VariantStreamer
from janggu.layers import Complement
from janggu.layers import Reverse
from janggu.utils import complement_permmatrix
from janggu.utils import sequences_from_fasta

matplotlib.use('AGG')

binsize = 200
flank = 150
stepsize = 50


def datalen(bed_file):
    binsizes = 0
    reader = BedTool(bed_file)
    for reg in reader:
        binsizes += (reg.iv.end - reg.iv.start - binsize + stepsize)//stepsize
    return binsizes


def test_dna_loading_from_seqrecord(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 2
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    seqs = sequences_from_fasta(refgenome)

    data = Bioseq.create_from_refgenome('train', refgenome=seqs,
                                     roi=bed_merged,
                                     storage='ndarray',
                                     store_whole_genome=True,
                                     order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    chrom = data.gindexer[0].chrom
    start = data.gindexer[0].start
    end = data.gindexer[0].end
    np.testing.assert_equal(data[0], data[(chrom, start, end)])
    np.testing.assert_equal(data[0], data[chrom, start, end])


def test_dna_genomic_interval_access(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 2
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=bed_merged,
                                     storage='ndarray',
                                     order=order)

    with pytest.raises(Exception):
        # due to store_whole_genome = False
        data[data.gindexer[0]]

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=bed_merged,
                                     storage='ndarray',
                                     order=order,
                                     store_whole_genome=True)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    chrom = data.gindexer[0].chrom
    start = data.gindexer[0].start
    end = data.gindexer[0].end
    np.testing.assert_equal(data[0], data[(chrom, start, end)])
    np.testing.assert_equal(data[0], data[chrom, start, end])

def test_dna_dims_order_1_from_subset_dataframe(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 1
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    roi = pandas.read_csv(bed_merged,
                          sep='\t', header=None, usecols=[0, 2, 3, 4, 5,  6], skiprows=2,
                          names=['chrom', 'name', 'start', 'end', 'score', 'strand'])
    roi.start -= 1
    print(roi)

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=roi,
                                     storage='ndarray',
                                     store_whole_genome=True,
                                     order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000

    roi = BedTool(bed_merged)
    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=roi,
                                     storage='ndarray',
                                     store_whole_genome=True,
                                     order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000

    roi = [iv for iv in BedTool(bed_merged)]
    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=roi,
                                     storage='ndarray',
                                     store_whole_genome=True,
                                     order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000


def test_dna_dims_order_1_from_subset(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 1
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=bed_merged,
                                     storage='ndarray',
                                     order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000


def test_dna_dims_order_1_from_subset(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 1
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=bed_merged,
                                     binsize=200, stepsize=200,
                                     storage='ndarray',
                                     order=order)

    assert len(data.garray.handle['data']) == 100

    # for order 1
    assert len(data) == 100
    assert data.shape == (100, 200, 1, 4)
    # the correctness of the sequence extraction was also
    # validated using:
    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr1:15000-25000
    # ATTGTGGTGA...
    # this sequence is read from the forward strand
    np.testing.assert_equal(data[0][0, :10, 0, :],
                            np.asarray([[1, 0, 0, 0],  # A
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # C
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [0, 0, 1, 0],  # G
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [1, 0, 0, 0]],  # A
                            dtype='int8'))

    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr2:15000-25000
    # ggggaagcaa...
    # this sequence is read from the reverse strand
    # so we have ...ttgcttcccc
    np.testing.assert_equal(data[50][0, -10:, 0, :],
                            np.asarray([[0, 0, 0, 1],  # T
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [0, 1, 0, 0],  # C
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 0, 1],  # T
                                        [0, 1, 0, 0],  # C
                                        [0, 1, 0, 0],  # C
                                        [0, 1, 0, 0],  # C
                                        [0, 1, 0, 0]],  # C
                            dtype='int8'))


def test_dna_dims_order_1_from_reference(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 1
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    gindexer = GenomicIndexer.create_from_file(bed_merged, 200, 200)

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                        storage='ndarray',
                                        order=order,
                                        store_whole_genome=True)
    data.gindexer = gindexer
    assert len(data.garray.handle) == 2
    assert 'chr1' in data.garray.handle
    assert 'chr2' in data.garray.handle

    # for order 1
    assert len(data) == 100
    assert data.shape == (100, 200, 1, 4)
    # the correctness of the sequence extraction was also
    # validated using:
    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr1:15000-25000
    # ATTGTGGTGA...
    # this sequence is read from the forward strand
    np.testing.assert_equal(data[0][0, :10, 0, :],
                            np.asarray([[1, 0, 0, 0],  # A
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # C
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [0, 0, 1, 0],  # G
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [1, 0, 0, 0]],  # A
                            dtype='int8'))

    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr2:15000-25000
    # ggggaagcaa...
    # this sequence is read from the reverse strand
    # so we have ...ttgcttcccc
    np.testing.assert_equal(data[50][0, -10:, 0, :],
                            np.asarray([[0, 0, 0, 1],  # T
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 1, 0],  # G
                                        [0, 1, 0, 0],  # C
                                        [0, 0, 0, 1],  # T
                                        [0, 0, 0, 1],  # T
                                        [0, 1, 0, 0],  # C
                                        [0, 1, 0, 0],  # C
                                        [0, 1, 0, 0],  # C
                                        [0, 1, 0, 0]],  # C
                            dtype='int8'))


def test_dna_dims_order_2(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 2
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.bed')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=bed_merged,
                                     binsize=200,
                                     storage='ndarray',
                                     order=order)

    # for order 1
    assert len(data) == 100
    assert data.shape == (100, 199, 1, 16)
    # the correctness of the sequence extraction was also
    # validated using:
    # >bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr1:15000-25000
    # ATTGTGGTGAC...
    np.testing.assert_equal(
        data[0][0, :10, 0, :],
        np.asarray([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # AT
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # TT
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],  # TG
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # GT
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],  # TG
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],  # GG
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],  # GT
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],  # TG
                    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],  # GA
                    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # AC
                   dtype='int8'))

    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr2:15000-25000
    # ggggaagcaag...
    # this sequence is read from the reverse strand
    # so we have ...cttgcttcccc
    np.testing.assert_equal(
        data[50][0, -10:, 0, :],
        np.asarray([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],  # CT
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # TT
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],  # TG
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],  # GC
                    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],  # CT
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],  # TT
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],  # TC
                    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # CC
                    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # CC
                    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # CC
                   dtype='int8'))


def reverse_layer(order):
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    bed_file = os.path.join(data_path, 'sample.bed')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=bed_file,
                                     storage='ndarray',
                                     binsize=binsize,
                                     flank=flank,
                                     order=order)

    dna_in = Input(shape=data.shape[1:], name='dna')
    rdna_layer = Reverse()(dna_in)

    rmod = Model(dna_in, rdna_layer)

    # actual shape of DNA
    dna = data[0]
    np.testing.assert_equal(dna[:, ::-1, :, :], rmod.predict(dna))


def complement_layer(order):
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    bed_file = os.path.join(data_path, 'sample.bed')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=bed_file,
                                     storage='ndarray',
                                     binsize=binsize,
                                     flank=flank,
                                     order=order)

    dna_in = Input(shape=data.shape[1:], name='dna')
    cdna_layer = Complement()(dna_in)
    cmod = Model(dna_in, cdna_layer)

    # actual shape of DNA
    dna = data[0]

    cdna = cmod.predict(dna)
    ccdna = cmod.predict(cdna)

    with pytest.raises(Exception):
        np.testing.assert_equal(dna, cdna)
    np.testing.assert_equal(dna, ccdna)


def test_reverse_order_1(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    reverse_layer(1)


def test_reverse_order_2(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    reverse_layer(2)


def test_complement_order_1(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    complement_layer(1)


def test_complement_order_2(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    complement_layer(2)


def test_revcomp_rcmatrix(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    rcmatrix = complement_permmatrix(1)

    np.testing.assert_equal(rcmatrix,
                            np.array([[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0],
                                      [1, 0, 0, 0]]))

    rcmatrix = complement_permmatrix(2)

    np.testing.assert_equal(rcmatrix[0],
                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                      0, 0, 0, 1]))
    np.testing.assert_equal(rcmatrix[4],
                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                      0, 0, 1, 0]))
    np.testing.assert_equal(rcmatrix[8],
                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                      0, 1, 0, 0]))
    np.testing.assert_equal(rcmatrix[12],
                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                      1, 0, 0, 0]))

    np.testing.assert_equal(rcmatrix[1],
                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
                                      0, 0, 0, 0]))
    np.testing.assert_equal(rcmatrix[5],
                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
                                      0, 0, 0, 0]))
    np.testing.assert_equal(rcmatrix[9],
                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
                                      0, 0, 0, 0]))
    np.testing.assert_equal(rcmatrix[13],
                            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
                                      0, 0, 0, 0]))


def test_rcmatrix_identity():

    for order in range(1, 4):

        rcmatrix = complement_permmatrix(order)

        np.testing.assert_equal(np.eye(pow(4, order)),
                                np.matmul(rcmatrix, rcmatrix))


def test_dna_dataset_sanity(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    with pytest.raises(Exception):
        # name must be a string
        Bioseq.create_from_refgenome(1.23, refgenome='',
                                  storage='ndarray',
                                  roi=bed_file, order=1)
    with pytest.raises(Exception):
        Bioseq.create_from_refgenome('train', refgenome='',
                                  storage='ndarray',
                                  roi=bed_file, order=1)
    with pytest.raises(Exception):
        Bioseq.create_from_refgenome('train', refgenome='test',
                                  storage='ndarray',
                                  roi=bed_file, order=1)

    with pytest.raises(Exception):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                  storage='ndarray',
                                  roi=bed_file, order=0)
    with pytest.raises(Exception):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                  storage='ndarray',
                                  roi=bed_file, flank=-1)
    with pytest.raises(Exception):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                  storage='ndarray',
                                  roi=bed_file, binsize=0)
    with pytest.raises(Exception):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                  storage='ndarray',
                                  roi=bed_file, stepsize=0)

    with pytest.warns(FutureWarning):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                  storage='ndarray',
                                  roi=bed_file,
                                  datatags=['help'])

    with pytest.warns(FutureWarning):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                  storage='ndarray',
                                  roi=bed_file,
                                  overwrite=True)
    with pytest.raises(Exception):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                  storage='step',
                                  roi=bed_file, order=1)

    assert not os.path.exists(os.path.join(tmpdir.strpath, 'train',
                                           'storage.h5'))
    with pytest.raises(ValueError):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                  storage='sparse',
                                  roi=None, order=1,
                                  store_whole_genome=True)
    with pytest.raises(ValueError):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                  roi=bed_file, order=0,
                                  store_whole_genome=True)
    with pytest.raises(ValueError):
        Bioseq.create_from_refgenome('train', refgenome=refgenome,
                                     roi=None, store_whole_genome=False)

    Bioseq.create_from_refgenome('train', refgenome=refgenome,
                              storage='ndarray',
                              roi=None, order=1,
                              store_whole_genome=True)
    file_ = glob.glob(os.path.join(tmpdir.strpath, 'datasets', 'train', '*.h5'))
    assert len(file_) == 0
    print(refgenome)
    print(bed_file)
    Bioseq.create_from_refgenome('train', refgenome=refgenome,
        storage='ndarray',
        roi=bed_file, order=1, cache=True)
    Bioseq.create_from_refgenome('train', refgenome=refgenome,
                              storage='hdf5',
                              roi=bed_file, order=1, cache=True)
    # a cache file must exist now
    file_ = glob.glob(os.path.join(tmpdir.strpath, 'datasets', 'train', '*.h5'))
    assert len(file_) == 1

    # reload the cached file
    Bioseq.create_from_refgenome('train', refgenome=refgenome,
                              storage='hdf5',
                              roi=bed_file, order=1, cache=True)



def test_read_dna_from_biostring_order_1():

    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1
    filename = os.path.join(data_path, 'sample.fa')
    seqs = sequences_from_fasta(filename)
    with pytest.raises(ValueError):
        data = Bioseq.create_from_seq('train', fastafile=seqs, storage='sparse',
                                     order=order, cache=False)

    data = Bioseq.create_from_seq('train', fastafile=seqs,
                                 order=order, cache=False)

    np.testing.assert_equal(len(data), 3897)
    np.testing.assert_equal(data.shape, (3897, 200, 1, 4))
    np.testing.assert_equal(
        data[0][0, :10, 0, :],
        np.asarray([[0, 1, 0, 0],
                    [1, 0, 0, 0],
                    [0, 1, 0, 0],
                    [1, 0, 0, 0],
                    [0, 0, 1, 0],
                    [0, 1, 0, 0],
                    [1, 0, 0, 0],
                    [0, 0, 1, 0],
                    [1, 0, 0, 0],
                    [0, 0, 1, 0]], dtype='int8'))


def test_read_dna_from_fasta_order_1():

    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1
    filename = os.path.join(data_path, 'sample.fa')
    data = Bioseq.create_from_seq('train', fastafile=filename,
                                 order=order, cache=False)

    np.testing.assert_equal(len(data), 3897)
    np.testing.assert_equal(data.shape, (3897, 200, 1, 4))
    np.testing.assert_equal(
        data[0][0, :10, 0, :],
        np.asarray([[0, 1, 0, 0],
                    [1, 0, 0, 0],
                    [0, 1, 0, 0],
                    [1, 0, 0, 0],
                    [0, 0, 1, 0],
                    [0, 1, 0, 0],
                    [1, 0, 0, 0],
                    [0, 0, 1, 0],
                    [1, 0, 0, 0],
                    [0, 0, 1, 0]], dtype='int8'))


def test_read_dna_from_fasta_order_2():
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 2
    filename = os.path.join(data_path, 'sample.fa')
    for store_genome in [True, False]:
        data = Bioseq.create_from_seq('train', fastafile=filename,
                                      order=order, cache=False)

        np.testing.assert_equal(len(data), 3897)
        np.testing.assert_equal(data.shape, (3897, 199, 1, 16))
        np.testing.assert_equal(
            data[0][0, :10, 0, :],
            np.asarray([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype='int8'))

def test_read_protein_sequences():
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    order = 1
    filename = os.path.join(data_path, 'sample_protein.fa')
    data = Bioseq.create_from_seq('train', fastafile=filename,
                                 order=order, seqtype='protein', fixedlen=1000)
    np.testing.assert_equal(len(data), 3)
    np.testing.assert_equal(data.shape, (3, 1000, 1, 20))
    np.testing.assert_equal(
        data[0][0, :4, 0, :],
        np.asarray([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype='int8'))
    np.testing.assert_equal(
        data[0][0, -2:, 0, :], np.zeros((2, 20), dtype='int8'))

    data = Bioseq.create_from_seq('train', fastafile=filename,
                                 order=order, seqtype='protein', fixedlen=5)
    np.testing.assert_equal(len(data), 3)
    np.testing.assert_equal(data.shape, (3, 5, 1, 20))
    np.testing.assert_equal(
        data[0][0, :4, 0, :],
        np.asarray([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype='int8'))


def test_dnabed_overreaching_ends_whole_genome():
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, "bed_test.bed")
    filename = os.path.join(data_path, 'sample_genome.fa')

    bioseq = Bioseq.create_from_refgenome(
        'test',
        refgenome=filename,
        roi=bed_file,
        binsize=2,
        flank=20,
        store_whole_genome=True,
        storage='ndarray', cache=False)
    assert len(bioseq) == 9
    assert bioseq.shape == (9, 2+2*20, 1, 4)
    # test if beginning is correctly padded
    np.testing.assert_equal(bioseq[0].sum(), 22)
    # test if end is correctly padded
    np.testing.assert_equal(bioseq['chr1', 29990, 30010].sum(), 10)


def test_dnabed_overreaching_ends_partial_genome():
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, "bed_test.bed")
    filename = os.path.join(data_path, 'sample_genome.fa')

    bioseq = Bioseq.create_from_refgenome(
        'test',
        refgenome=filename,
        roi=bed_file,
        binsize=2,
        flank=20,
        store_whole_genome=False,
        storage='ndarray')
    assert len(bioseq) == 9
    assert bioseq.shape == (9, 2+2*20, 1, 4)
    np.testing.assert_equal(bioseq[0].sum(), 22)
    np.testing.assert_equal(bioseq[-1].sum(), 42 - 4)


@pytest.mark.filterwarnings("ignore:The truth value")
def test_janggu_variant_streamer_order_1(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    """Test Janggu creation by shape and name. """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       store_whole_genome=True,
                                       order=order)

    # even binsize
    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
    it_vcf = iter(vcf.flow())
    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # C to T
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)
    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # C to T
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # A to G
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,4,0,:], np.array([1,0,0,0]))
    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,1,0]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # G to A
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,1,0]))
    np.testing.assert_equal(alternative[0,4,0,:], np.array([1,0,0,0]))

    # odd binsize
    vcf = VariantStreamer(dna, vcffile, binsize=3, batch_size=1)
    it_vcf = iter(vcf.flow())

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # C to T
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)
    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # C to T
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,1,0,:], np.array([0,1,0,0]))
    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,0,0,1]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,1,0,:], np.array([0,0,0,1]))
    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))


@pytest.mark.filterwarnings("ignore:The truth value")
def test_janggu_variant_streamer_order_1_w_refgenome(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    """Test Janggu creation by shape and name. """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
    #                                   storage='ndarray',
    #                                   binsize=50,
    #                                   store_whole_genome=True,
    #                                   order=order)

    # even binsize
    vcf = VariantStreamer(refgenome, vcffile,
                          binsize=10, batch_size=1,
                          order=order)
    it_vcf = iter(vcf.flow())
    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # C to T
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)
    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # C to T
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # A to G
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,4,0,:], np.array([1,0,0,0]))
    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,1,0]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # G to A
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,1,0]))
    np.testing.assert_equal(alternative[0,4,0,:], np.array([1,0,0,0]))

    # odd binsize
    vcf = VariantStreamer(refgenome, vcffile, binsize=3, batch_size=1)
    it_vcf = iter(vcf.flow())

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # C to T
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)
    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # C to T
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,1,0,:], np.array([0,1,0,0]))
    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,0,0,1]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,1,0,:], np.array([0,0,0,1]))
    np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))


@pytest.mark.filterwarnings("ignore:The truth value")
def test_janggu_variant_streamer_order_12_ignore_ref_match(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    """Test Janggu creation by shape and name. """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    for order in [1, 2]:

        dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        # even binsize
        vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1,
                              ignore_reference_match=True)
        it_vcf = iter(vcf.flow())
        names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
        # C to T
        print(names, chroms, poss, ra, aa)
        print(reference)
        print(alternative)

        assert names[0] == 'refmismatch'
        #np.testing.assert_equal(reference, alternative)
        np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order)
        #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

        # odd binsize
        vcf = VariantStreamer(dna, vcffile, binsize=3, batch_size=1,
                              ignore_reference_match=True)
        it_vcf = iter(vcf.flow())

        names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
        # C to T
        print(names, chroms, poss, ra, aa)
        print(reference)
        print(alternative)
        assert names[0] == 'refmismatch'
        np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order)
        #np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))


@pytest.mark.filterwarnings("ignore:The truth value")
def test_janggu_variant_streamer_order_12_ignore_ref_match_w_refgenome(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    """Test Janggu creation by shape and name. """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    for order in [1, 2]:

        #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
        #                                   storage='ndarray',
        #                                   binsize=50,
        #                                   store_whole_genome=True,
        #                                   order=order)

        # even binsize
        vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1,
                              ignore_reference_match=True, order=order)
        it_vcf = iter(vcf.flow())
        names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
        # C to T
        print(names, chroms, poss, ra, aa)
        print(reference)
        print(alternative)

        assert names[0] == 'refmismatch'
        #np.testing.assert_equal(reference, alternative)
        np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order)
        #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

        # odd binsize
        vcf = VariantStreamer(refgenome, vcffile, binsize=3, batch_size=1,
                              ignore_reference_match=True, order=order)
        it_vcf = iter(vcf.flow())

        names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
        # C to T
        print(names, chroms, poss, ra, aa)
        print(reference)
        print(alternative)
        assert names[0] == 'refmismatch'
        np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order)
        #np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0]))

@pytest.mark.filterwarnings("ignore:The truth value")
def test_janggu_variant_streamer_order_1_revcomp(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    """Test Janggu creation by shape and name. """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       store_whole_genome=True,
                                       order=order)

    annot = BedTool([Interval('chr2', 110, 130, '-')])

    # even binsize
    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
    it_vcf = iter(vcf.flow())
    next(it_vcf)
    # C to T
    #print(names, chroms, poss, ra, aa)
    #print(reference)
    #print(alternative)
    #assert names[0] == 'refmismatch'
    #np.testing.assert_equal(reference, alternative)
    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    next(it_vcf)
    # C to T
    #print(names, chroms, poss, ra, aa)
    #print(reference)
    #print(alternative)
    #np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
#    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
#    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    # even binsize
    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1,
                          annotation=annot)
    it_vcf = iter(vcf.flow())
    next(it_vcf)
    # C to T


    next(it_vcf)
    # C to T

    names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference, reference2[:,::-1, :, ::-1])
    np.testing.assert_equal(alternative, alternative2[:,::-1, :, ::-1])


@pytest.mark.filterwarnings("ignore:The truth value")
def test_janggu_variant_streamer_order_1_revcomp_w_refgenome(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    """Test Janggu creation by shape and name. """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
    #                                   storage='ndarray',
    #                                   binsize=50,
    #                                   store_whole_genome=True,
    #                                   order=order)

    annot = BedTool([Interval('chr2', 110, 130, '-')])

    # even binsize
    vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1, order=order)
    it_vcf = iter(vcf.flow())
    next(it_vcf)
    # C to T
    #print(names, chroms, poss, ra, aa)
    #print(reference)
    #print(alternative)
    #assert names[0] == 'refmismatch'
    #np.testing.assert_equal(reference, alternative)
    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    next(it_vcf)
    # C to T
    #print(names, chroms, poss, ra, aa)
    #print(reference)
    #print(alternative)
    #np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
#    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
#    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    # even binsize
    vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1,
                          annotation=annot, order=order)
    it_vcf = iter(vcf.flow())
    next(it_vcf)
    # C to T


    next(it_vcf)
    # C to T

    names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference, reference2[:,::-1, :, ::-1])
    np.testing.assert_equal(alternative, alternative2[:,::-1, :, ::-1])


@pytest.mark.filterwarnings("ignore:The truth value")
def test_janggu_variant_streamer_order_2(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    """Test Janggu creation by shape and name. """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 2

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       store_whole_genome=True,
                                       order=order)

    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
    it_vcf = iter(vcf.flow())

    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)

    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,3,0,1], 1)
    np.testing.assert_equal(reference[0,4,0,7], 1)
    np.testing.assert_equal(alternative[0,3,0,3], 1)
    np.testing.assert_equal(alternative[0,4,0,15], 1)


    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # CTC -> CCC
    np.testing.assert_equal(reference[0,3,0,7], 1)
    np.testing.assert_equal(reference[0,4,0,13], 1)
    np.testing.assert_equal(alternative[0,3,0,5], 1)
    np.testing.assert_equal(alternative[0,4,0,5], 1)

    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # GAC -> GGC
    np.testing.assert_equal(reference[0,3,0,8], 1)
    np.testing.assert_equal(reference[0,4,0,1], 1)
    np.testing.assert_equal(alternative[0,3,0,10], 1)
    np.testing.assert_equal(alternative[0,4,0,9], 1)

    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # CGG -> CAG
    np.testing.assert_equal(reference[0,3,0,6], 1)
    np.testing.assert_equal(reference[0,4,0,10], 1)
    np.testing.assert_equal(alternative[0,3,0,4], 1)
    np.testing.assert_equal(alternative[0,4,0,2], 1)

    vcf = VariantStreamer(dna, vcffile, binsize=5, batch_size=1)
    it_vcf = iter(vcf.flow())

    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)

    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,1,0,1], 1)
    np.testing.assert_equal(reference[0,2,0,7], 1)
    np.testing.assert_equal(alternative[0,1,0,3], 1)
    np.testing.assert_equal(alternative[0,2,0,15], 1)


    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # CTC -> CCC
    np.testing.assert_equal(reference[0,1,0,7], 1)
    np.testing.assert_equal(reference[0,2,0,13], 1)
    np.testing.assert_equal(alternative[0,1,0,5], 1)
    np.testing.assert_equal(alternative[0,2,0,5], 1)


@pytest.mark.filterwarnings("ignore:The truth value")
def test_janggu_variant_streamer_order_2_w_refgenome(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    """Test Janggu creation by shape and name. """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 2

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
    #                                   storage='ndarray',
    #                                   binsize=50,
    #                                   store_whole_genome=True,
    #                                   order=order)

    vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1,
                          order=order)
    it_vcf = iter(vcf.flow())

    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)

    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,3,0,1], 1)
    np.testing.assert_equal(reference[0,4,0,7], 1)
    np.testing.assert_equal(alternative[0,3,0,3], 1)
    np.testing.assert_equal(alternative[0,4,0,15], 1)


    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # CTC -> CCC
    np.testing.assert_equal(reference[0,3,0,7], 1)
    np.testing.assert_equal(reference[0,4,0,13], 1)
    np.testing.assert_equal(alternative[0,3,0,5], 1)
    np.testing.assert_equal(alternative[0,4,0,5], 1)

    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # GAC -> GGC
    np.testing.assert_equal(reference[0,3,0,8], 1)
    np.testing.assert_equal(reference[0,4,0,1], 1)
    np.testing.assert_equal(alternative[0,3,0,10], 1)
    np.testing.assert_equal(alternative[0,4,0,9], 1)

    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # CGG -> CAG
    np.testing.assert_equal(reference[0,3,0,6], 1)
    np.testing.assert_equal(reference[0,4,0,10], 1)
    np.testing.assert_equal(alternative[0,3,0,4], 1)
    np.testing.assert_equal(alternative[0,4,0,2], 1)

    vcf = VariantStreamer(refgenome, vcffile, binsize=5, batch_size=1,
                          order=order)
    it_vcf = iter(vcf.flow())

    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)

    names, chroms, poss,  ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0,1,0,1], 1)
    np.testing.assert_equal(reference[0,2,0,7], 1)
    np.testing.assert_equal(alternative[0,1,0,3], 1)
    np.testing.assert_equal(alternative[0,2,0,15], 1)


    names, chroms, poss, ra, aa,  reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # CTC -> CCC
    np.testing.assert_equal(reference[0,1,0,7], 1)
    np.testing.assert_equal(reference[0,2,0,13], 1)
    np.testing.assert_equal(alternative[0,1,0,5], 1)
    np.testing.assert_equal(alternative[0,2,0,5], 1)