--- a +++ b/tests/test_dna_dataset.py @@ -0,0 +1,1314 @@ +import glob +import os + +import matplotlib +import numpy as np +import pandas +import pkg_resources +import pytest +from keras.layers import Input +from keras.models import Model +from pybedtools import BedTool +from pybedtools import Interval + +from janggu.data import Bioseq +from janggu.data import GenomicIndexer +from janggu.data import VariantStreamer +from janggu.layers import Complement +from janggu.layers import Reverse +from janggu.utils import complement_permmatrix +from janggu.utils import sequences_from_fasta + +matplotlib.use('AGG') + +binsize = 200 +flank = 150 +stepsize = 50 + + +def datalen(bed_file): + binsizes = 0 + reader = BedTool(bed_file) + for reg in reader: + binsizes += (reg.iv.end - reg.iv.start - binsize + stepsize)//stepsize + return binsizes + + +def test_dna_loading_from_seqrecord(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + order = 2 + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_merged = os.path.join(data_path, 'sample.gtf') + refgenome = os.path.join(data_path, 'sample_genome.fa') + seqs = sequences_from_fasta(refgenome) + + data = Bioseq.create_from_refgenome('train', refgenome=seqs, + roi=bed_merged, + storage='ndarray', + store_whole_genome=True, + order=order) + + np.testing.assert_equal(data[0], data[data.gindexer[0]]) + chrom = data.gindexer[0].chrom + start = data.gindexer[0].start + end = data.gindexer[0].end + np.testing.assert_equal(data[0], data[(chrom, start, end)]) + np.testing.assert_equal(data[0], data[chrom, start, end]) + + +def test_dna_genomic_interval_access(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + order = 2 + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_merged = os.path.join(data_path, 'sample.gtf') + refgenome = os.path.join(data_path, 'sample_genome.fa') + + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=bed_merged, + storage='ndarray', + order=order) + + with pytest.raises(Exception): + # due to store_whole_genome = False + data[data.gindexer[0]] + + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=bed_merged, + storage='ndarray', + order=order, + store_whole_genome=True) + + np.testing.assert_equal(data[0], data[data.gindexer[0]]) + chrom = data.gindexer[0].chrom + start = data.gindexer[0].start + end = data.gindexer[0].end + np.testing.assert_equal(data[0], data[(chrom, start, end)]) + np.testing.assert_equal(data[0], data[chrom, start, end]) + +def test_dna_dims_order_1_from_subset_dataframe(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + order = 1 + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_merged = os.path.join(data_path, 'sample.gtf') + refgenome = os.path.join(data_path, 'sample_genome.fa') + + roi = pandas.read_csv(bed_merged, + sep='\t', header=None, usecols=[0, 2, 3, 4, 5, 6], skiprows=2, + names=['chrom', 'name', 'start', 'end', 'score', 'strand']) + roi.start -= 1 + print(roi) + + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=roi, + storage='ndarray', + store_whole_genome=True, + order=order) + + np.testing.assert_equal(data[0], data[data.gindexer[0]]) + assert len(data.garray.handle) == 2 + + # for order 1 + assert len(data) == 2 + assert data.shape == (2, 10000, 1, 4) + assert data[:].sum() == 20000 + + roi = BedTool(bed_merged) + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=roi, + storage='ndarray', + store_whole_genome=True, + order=order) + + np.testing.assert_equal(data[0], data[data.gindexer[0]]) + assert len(data.garray.handle) == 2 + + # for order 1 + assert len(data) == 2 + assert data.shape == (2, 10000, 1, 4) + assert data[:].sum() == 20000 + + roi = [iv for iv in BedTool(bed_merged)] + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=roi, + storage='ndarray', + store_whole_genome=True, + order=order) + + np.testing.assert_equal(data[0], data[data.gindexer[0]]) + assert len(data.garray.handle) == 2 + + # for order 1 + assert len(data) == 2 + assert data.shape == (2, 10000, 1, 4) + assert data[:].sum() == 20000 + + +def test_dna_dims_order_1_from_subset(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + order = 1 + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_merged = os.path.join(data_path, 'sample.gtf') + refgenome = os.path.join(data_path, 'sample_genome.fa') + + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=bed_merged, + storage='ndarray', + order=order) + + np.testing.assert_equal(data[0], data[data.gindexer[0]]) + assert len(data.garray.handle) == 2 + + # for order 1 + assert len(data) == 2 + assert data.shape == (2, 10000, 1, 4) + assert data[:].sum() == 20000 + + +def test_dna_dims_order_1_from_subset(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + order = 1 + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_merged = os.path.join(data_path, 'sample.gtf') + refgenome = os.path.join(data_path, 'sample_genome.fa') + + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=bed_merged, + binsize=200, stepsize=200, + storage='ndarray', + order=order) + + assert len(data.garray.handle['data']) == 100 + + # for order 1 + assert len(data) == 100 + assert data.shape == (100, 200, 1, 4) + # the correctness of the sequence extraction was also + # validated using: + # bedtools getfasta -fi sample_genome.fa -bed sample.bed + # >chr1:15000-25000 + # ATTGTGGTGA... + # this sequence is read from the forward strand + np.testing.assert_equal(data[0][0, :10, 0, :], + np.asarray([[1, 0, 0, 0], # A + [0, 0, 0, 1], # T + [0, 0, 0, 1], # T + [0, 0, 1, 0], # C + [0, 0, 0, 1], # T + [0, 0, 1, 0], # G + [0, 0, 1, 0], # G + [0, 0, 0, 1], # T + [0, 0, 1, 0], # G + [1, 0, 0, 0]], # A + dtype='int8')) + + # bedtools getfasta -fi sample_genome.fa -bed sample.bed + # >chr2:15000-25000 + # ggggaagcaa... + # this sequence is read from the reverse strand + # so we have ...ttgcttcccc + np.testing.assert_equal(data[50][0, -10:, 0, :], + np.asarray([[0, 0, 0, 1], # T + [0, 0, 0, 1], # T + [0, 0, 1, 0], # G + [0, 1, 0, 0], # C + [0, 0, 0, 1], # T + [0, 0, 0, 1], # T + [0, 1, 0, 0], # C + [0, 1, 0, 0], # C + [0, 1, 0, 0], # C + [0, 1, 0, 0]], # C + dtype='int8')) + + +def test_dna_dims_order_1_from_reference(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + order = 1 + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_merged = os.path.join(data_path, 'sample.gtf') + refgenome = os.path.join(data_path, 'sample_genome.fa') + + gindexer = GenomicIndexer.create_from_file(bed_merged, 200, 200) + + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='ndarray', + order=order, + store_whole_genome=True) + data.gindexer = gindexer + assert len(data.garray.handle) == 2 + assert 'chr1' in data.garray.handle + assert 'chr2' in data.garray.handle + + # for order 1 + assert len(data) == 100 + assert data.shape == (100, 200, 1, 4) + # the correctness of the sequence extraction was also + # validated using: + # bedtools getfasta -fi sample_genome.fa -bed sample.bed + # >chr1:15000-25000 + # ATTGTGGTGA... + # this sequence is read from the forward strand + np.testing.assert_equal(data[0][0, :10, 0, :], + np.asarray([[1, 0, 0, 0], # A + [0, 0, 0, 1], # T + [0, 0, 0, 1], # T + [0, 0, 1, 0], # C + [0, 0, 0, 1], # T + [0, 0, 1, 0], # G + [0, 0, 1, 0], # G + [0, 0, 0, 1], # T + [0, 0, 1, 0], # G + [1, 0, 0, 0]], # A + dtype='int8')) + + # bedtools getfasta -fi sample_genome.fa -bed sample.bed + # >chr2:15000-25000 + # ggggaagcaa... + # this sequence is read from the reverse strand + # so we have ...ttgcttcccc + np.testing.assert_equal(data[50][0, -10:, 0, :], + np.asarray([[0, 0, 0, 1], # T + [0, 0, 0, 1], # T + [0, 0, 1, 0], # G + [0, 1, 0, 0], # C + [0, 0, 0, 1], # T + [0, 0, 0, 1], # T + [0, 1, 0, 0], # C + [0, 1, 0, 0], # C + [0, 1, 0, 0], # C + [0, 1, 0, 0]], # C + dtype='int8')) + + +def test_dna_dims_order_2(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + order = 2 + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_merged = os.path.join(data_path, 'sample.bed') + refgenome = os.path.join(data_path, 'sample_genome.fa') + + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=bed_merged, + binsize=200, + storage='ndarray', + order=order) + + # for order 1 + assert len(data) == 100 + assert data.shape == (100, 199, 1, 16) + # the correctness of the sequence extraction was also + # validated using: + # >bedtools getfasta -fi sample_genome.fa -bed sample.bed + # >chr1:15000-25000 + # ATTGTGGTGAC... + np.testing.assert_equal( + data[0][0, :10, 0, :], + np.asarray([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # AT + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # TT + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], # TG + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], # GT + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], # TG + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # GG + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], # GT + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], # TG + [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # GA + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # AC + dtype='int8')) + + # bedtools getfasta -fi sample_genome.fa -bed sample.bed + # >chr2:15000-25000 + # ggggaagcaag... + # this sequence is read from the reverse strand + # so we have ...cttgcttcccc + np.testing.assert_equal( + data[50][0, -10:, 0, :], + np.asarray([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], # CT + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # TT + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], # TG + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], # GC + [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], # CT + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # TT + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], # TC + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # CC + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # CC + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # CC + dtype='int8')) + + +def reverse_layer(order): + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + bed_file = os.path.join(data_path, 'sample.bed') + + refgenome = os.path.join(data_path, 'sample_genome.fa') + + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=bed_file, + storage='ndarray', + binsize=binsize, + flank=flank, + order=order) + + dna_in = Input(shape=data.shape[1:], name='dna') + rdna_layer = Reverse()(dna_in) + + rmod = Model(dna_in, rdna_layer) + + # actual shape of DNA + dna = data[0] + np.testing.assert_equal(dna[:, ::-1, :, :], rmod.predict(dna)) + + +def complement_layer(order): + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + bed_file = os.path.join(data_path, 'sample.bed') + + refgenome = os.path.join(data_path, 'sample_genome.fa') + + data = Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=bed_file, + storage='ndarray', + binsize=binsize, + flank=flank, + order=order) + + dna_in = Input(shape=data.shape[1:], name='dna') + cdna_layer = Complement()(dna_in) + cmod = Model(dna_in, cdna_layer) + + # actual shape of DNA + dna = data[0] + + cdna = cmod.predict(dna) + ccdna = cmod.predict(cdna) + + with pytest.raises(Exception): + np.testing.assert_equal(dna, cdna) + np.testing.assert_equal(dna, ccdna) + + +def test_reverse_order_1(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + reverse_layer(1) + + +def test_reverse_order_2(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + reverse_layer(2) + + +def test_complement_order_1(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + complement_layer(1) + + +def test_complement_order_2(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + complement_layer(2) + + +def test_revcomp_rcmatrix(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + + rcmatrix = complement_permmatrix(1) + + np.testing.assert_equal(rcmatrix, + np.array([[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], + [1, 0, 0, 0]])) + + rcmatrix = complement_permmatrix(2) + + np.testing.assert_equal(rcmatrix[0], + np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1])) + np.testing.assert_equal(rcmatrix[4], + np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0])) + np.testing.assert_equal(rcmatrix[8], + np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0])) + np.testing.assert_equal(rcmatrix[12], + np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0])) + + np.testing.assert_equal(rcmatrix[1], + np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0])) + np.testing.assert_equal(rcmatrix[5], + np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0])) + np.testing.assert_equal(rcmatrix[9], + np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0])) + np.testing.assert_equal(rcmatrix[13], + np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0])) + + +def test_rcmatrix_identity(): + + for order in range(1, 4): + + rcmatrix = complement_permmatrix(order) + + np.testing.assert_equal(np.eye(pow(4, order)), + np.matmul(rcmatrix, rcmatrix)) + + +def test_dna_dataset_sanity(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_file = os.path.join(data_path, 'sample.bed') + + refgenome = os.path.join(data_path, 'sample_genome.fa') + + with pytest.raises(Exception): + # name must be a string + Bioseq.create_from_refgenome(1.23, refgenome='', + storage='ndarray', + roi=bed_file, order=1) + with pytest.raises(Exception): + Bioseq.create_from_refgenome('train', refgenome='', + storage='ndarray', + roi=bed_file, order=1) + with pytest.raises(Exception): + Bioseq.create_from_refgenome('train', refgenome='test', + storage='ndarray', + roi=bed_file, order=1) + + with pytest.raises(Exception): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='ndarray', + roi=bed_file, order=0) + with pytest.raises(Exception): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='ndarray', + roi=bed_file, flank=-1) + with pytest.raises(Exception): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='ndarray', + roi=bed_file, binsize=0) + with pytest.raises(Exception): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='ndarray', + roi=bed_file, stepsize=0) + + with pytest.warns(FutureWarning): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='ndarray', + roi=bed_file, + datatags=['help']) + + with pytest.warns(FutureWarning): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='ndarray', + roi=bed_file, + overwrite=True) + with pytest.raises(Exception): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='step', + roi=bed_file, order=1) + + assert not os.path.exists(os.path.join(tmpdir.strpath, 'train', + 'storage.h5')) + with pytest.raises(ValueError): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='sparse', + roi=None, order=1, + store_whole_genome=True) + with pytest.raises(ValueError): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=bed_file, order=0, + store_whole_genome=True) + with pytest.raises(ValueError): + Bioseq.create_from_refgenome('train', refgenome=refgenome, + roi=None, store_whole_genome=False) + + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='ndarray', + roi=None, order=1, + store_whole_genome=True) + file_ = glob.glob(os.path.join(tmpdir.strpath, 'datasets', 'train', '*.h5')) + assert len(file_) == 0 + print(refgenome) + print(bed_file) + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='ndarray', + roi=bed_file, order=1, cache=True) + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='hdf5', + roi=bed_file, order=1, cache=True) + # a cache file must exist now + file_ = glob.glob(os.path.join(tmpdir.strpath, 'datasets', 'train', '*.h5')) + assert len(file_) == 1 + + # reload the cached file + Bioseq.create_from_refgenome('train', refgenome=refgenome, + storage='hdf5', + roi=bed_file, order=1, cache=True) + + + +def test_read_dna_from_biostring_order_1(): + + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + order = 1 + filename = os.path.join(data_path, 'sample.fa') + seqs = sequences_from_fasta(filename) + with pytest.raises(ValueError): + data = Bioseq.create_from_seq('train', fastafile=seqs, storage='sparse', + order=order, cache=False) + + data = Bioseq.create_from_seq('train', fastafile=seqs, + order=order, cache=False) + + np.testing.assert_equal(len(data), 3897) + np.testing.assert_equal(data.shape, (3897, 200, 1, 4)) + np.testing.assert_equal( + data[0][0, :10, 0, :], + np.asarray([[0, 1, 0, 0], + [1, 0, 0, 0], + [0, 1, 0, 0], + [1, 0, 0, 0], + [0, 0, 1, 0], + [0, 1, 0, 0], + [1, 0, 0, 0], + [0, 0, 1, 0], + [1, 0, 0, 0], + [0, 0, 1, 0]], dtype='int8')) + + +def test_read_dna_from_fasta_order_1(): + + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + order = 1 + filename = os.path.join(data_path, 'sample.fa') + data = Bioseq.create_from_seq('train', fastafile=filename, + order=order, cache=False) + + np.testing.assert_equal(len(data), 3897) + np.testing.assert_equal(data.shape, (3897, 200, 1, 4)) + np.testing.assert_equal( + data[0][0, :10, 0, :], + np.asarray([[0, 1, 0, 0], + [1, 0, 0, 0], + [0, 1, 0, 0], + [1, 0, 0, 0], + [0, 0, 1, 0], + [0, 1, 0, 0], + [1, 0, 0, 0], + [0, 0, 1, 0], + [1, 0, 0, 0], + [0, 0, 1, 0]], dtype='int8')) + + +def test_read_dna_from_fasta_order_2(): + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + order = 2 + filename = os.path.join(data_path, 'sample.fa') + for store_genome in [True, False]: + data = Bioseq.create_from_seq('train', fastafile=filename, + order=order, cache=False) + + np.testing.assert_equal(len(data), 3897) + np.testing.assert_equal(data.shape, (3897, 199, 1, 16)) + np.testing.assert_equal( + data[0][0, :10, 0, :], + np.asarray([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype='int8')) + +def test_read_protein_sequences(): + data_path = pkg_resources.resource_filename('janggu', 'resources/') + order = 1 + filename = os.path.join(data_path, 'sample_protein.fa') + data = Bioseq.create_from_seq('train', fastafile=filename, + order=order, seqtype='protein', fixedlen=1000) + np.testing.assert_equal(len(data), 3) + np.testing.assert_equal(data.shape, (3, 1000, 1, 20)) + np.testing.assert_equal( + data[0][0, :4, 0, :], + np.asarray([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype='int8')) + np.testing.assert_equal( + data[0][0, -2:, 0, :], np.zeros((2, 20), dtype='int8')) + + data = Bioseq.create_from_seq('train', fastafile=filename, + order=order, seqtype='protein', fixedlen=5) + np.testing.assert_equal(len(data), 3) + np.testing.assert_equal(data.shape, (3, 5, 1, 20)) + np.testing.assert_equal( + data[0][0, :4, 0, :], + np.asarray([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype='int8')) + + +def test_dnabed_overreaching_ends_whole_genome(): + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_file = os.path.join(data_path, "bed_test.bed") + filename = os.path.join(data_path, 'sample_genome.fa') + + bioseq = Bioseq.create_from_refgenome( + 'test', + refgenome=filename, + roi=bed_file, + binsize=2, + flank=20, + store_whole_genome=True, + storage='ndarray', cache=False) + assert len(bioseq) == 9 + assert bioseq.shape == (9, 2+2*20, 1, 4) + # test if beginning is correctly padded + np.testing.assert_equal(bioseq[0].sum(), 22) + # test if end is correctly padded + np.testing.assert_equal(bioseq['chr1', 29990, 30010].sum(), 10) + + +def test_dnabed_overreaching_ends_partial_genome(): + data_path = pkg_resources.resource_filename('janggu', 'resources/') + bed_file = os.path.join(data_path, "bed_test.bed") + filename = os.path.join(data_path, 'sample_genome.fa') + + bioseq = Bioseq.create_from_refgenome( + 'test', + refgenome=filename, + roi=bed_file, + binsize=2, + flank=20, + store_whole_genome=False, + storage='ndarray') + assert len(bioseq) == 9 + assert bioseq.shape == (9, 2+2*20, 1, 4) + np.testing.assert_equal(bioseq[0].sum(), 22) + np.testing.assert_equal(bioseq[-1].sum(), 42 - 4) + + +@pytest.mark.filterwarnings("ignore:The truth value") +def test_janggu_variant_streamer_order_1(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + """Test Janggu creation by shape and name. """ + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + order = 1 + + refgenome = os.path.join(data_path, 'sample_genome.fa') + vcffile = os.path.join(data_path, 'sample.vcf') + + dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome, + storage='ndarray', + binsize=50, + store_whole_genome=True, + order=order) + + # even binsize + vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1) + it_vcf = iter(vcf.flow()) + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(reference, alternative) + np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0])) + np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # T to C + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1])) + np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # A to G + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,4,0,:], np.array([1,0,0,0])) + np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,1,0])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # G to A + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,1,0])) + np.testing.assert_equal(alternative[0,4,0,:], np.array([1,0,0,0])) + + # odd binsize + vcf = VariantStreamer(dna, vcffile, binsize=3, batch_size=1) + it_vcf = iter(vcf.flow()) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(reference, alternative) + np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,1,0,:], np.array([0,1,0,0])) + np.testing.assert_equal(alternative[0,1,0,:], np.array([0,0,0,1])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # T to C + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,1,0,:], np.array([0,0,0,1])) + np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0])) + + +@pytest.mark.filterwarnings("ignore:The truth value") +def test_janggu_variant_streamer_order_1_w_refgenome(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + """Test Janggu creation by shape and name. """ + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + order = 1 + + refgenome = os.path.join(data_path, 'sample_genome.fa') + vcffile = os.path.join(data_path, 'sample.vcf') + + #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome, + # storage='ndarray', + # binsize=50, + # store_whole_genome=True, + # order=order) + + # even binsize + vcf = VariantStreamer(refgenome, vcffile, + binsize=10, batch_size=1, + order=order) + it_vcf = iter(vcf.flow()) + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(reference, alternative) + np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0])) + np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # T to C + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1])) + np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # A to G + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,4,0,:], np.array([1,0,0,0])) + np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,1,0])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # G to A + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,1,0])) + np.testing.assert_equal(alternative[0,4,0,:], np.array([1,0,0,0])) + + # odd binsize + vcf = VariantStreamer(refgenome, vcffile, binsize=3, batch_size=1) + it_vcf = iter(vcf.flow()) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(reference, alternative) + np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,1,0,:], np.array([0,1,0,0])) + np.testing.assert_equal(alternative[0,1,0,:], np.array([0,0,0,1])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # T to C + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,1,0,:], np.array([0,0,0,1])) + np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0])) + + +@pytest.mark.filterwarnings("ignore:The truth value") +def test_janggu_variant_streamer_order_12_ignore_ref_match(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + """Test Janggu creation by shape and name. """ + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + refgenome = os.path.join(data_path, 'sample_genome.fa') + vcffile = os.path.join(data_path, 'sample.vcf') + + for order in [1, 2]: + + dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome, + storage='ndarray', + binsize=50, + store_whole_genome=True, + order=order) + + # even binsize + vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1, + ignore_reference_match=True) + it_vcf = iter(vcf.flow()) + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + + assert names[0] == 'refmismatch' + #np.testing.assert_equal(reference, alternative) + np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order) + #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + # odd binsize + vcf = VariantStreamer(dna, vcffile, binsize=3, batch_size=1, + ignore_reference_match=True) + it_vcf = iter(vcf.flow()) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order) + #np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0])) + + +@pytest.mark.filterwarnings("ignore:The truth value") +def test_janggu_variant_streamer_order_12_ignore_ref_match_w_refgenome(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + """Test Janggu creation by shape and name. """ + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + refgenome = os.path.join(data_path, 'sample_genome.fa') + vcffile = os.path.join(data_path, 'sample.vcf') + + for order in [1, 2]: + + #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome, + # storage='ndarray', + # binsize=50, + # store_whole_genome=True, + # order=order) + + # even binsize + vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1, + ignore_reference_match=True, order=order) + it_vcf = iter(vcf.flow()) + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + + assert names[0] == 'refmismatch' + #np.testing.assert_equal(reference, alternative) + np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order) + #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + # odd binsize + vcf = VariantStreamer(refgenome, vcffile, binsize=3, batch_size=1, + ignore_reference_match=True, order=order) + it_vcf = iter(vcf.flow()) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # C to T + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(np.abs(reference-alternative).sum(), 2*order) + #np.testing.assert_equal(alternative[0,1,0,:], np.array([0,1,0,0])) + +@pytest.mark.filterwarnings("ignore:The truth value") +def test_janggu_variant_streamer_order_1_revcomp(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + """Test Janggu creation by shape and name. """ + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + order = 1 + + refgenome = os.path.join(data_path, 'sample_genome.fa') + vcffile = os.path.join(data_path, 'sample.vcf') + + dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome, + storage='ndarray', + binsize=50, + store_whole_genome=True, + order=order) + + annot = BedTool([Interval('chr2', 110, 130, '-')]) + + # even binsize + vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1) + it_vcf = iter(vcf.flow()) + next(it_vcf) + # C to T + #print(names, chroms, poss, ra, aa) + #print(reference) + #print(alternative) + #assert names[0] == 'refmismatch' + #np.testing.assert_equal(reference, alternative) + #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + next(it_vcf) + # C to T + #print(names, chroms, poss, ra, aa) + #print(reference) + #print(alternative) + #np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0])) + #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # T to C + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) +# np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1])) +# np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + # even binsize + vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1, + annotation=annot) + it_vcf = iter(vcf.flow()) + next(it_vcf) + # C to T + + + next(it_vcf) + # C to T + + names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf) + # T to C + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference, reference2[:,::-1, :, ::-1]) + np.testing.assert_equal(alternative, alternative2[:,::-1, :, ::-1]) + + +@pytest.mark.filterwarnings("ignore:The truth value") +def test_janggu_variant_streamer_order_1_revcomp_w_refgenome(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + """Test Janggu creation by shape and name. """ + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + order = 1 + + refgenome = os.path.join(data_path, 'sample_genome.fa') + vcffile = os.path.join(data_path, 'sample.vcf') + + #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome, + # storage='ndarray', + # binsize=50, + # store_whole_genome=True, + # order=order) + + annot = BedTool([Interval('chr2', 110, 130, '-')]) + + # even binsize + vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1, order=order) + it_vcf = iter(vcf.flow()) + next(it_vcf) + # C to T + #print(names, chroms, poss, ra, aa) + #print(reference) + #print(alternative) + #assert names[0] == 'refmismatch' + #np.testing.assert_equal(reference, alternative) + #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + next(it_vcf) + # C to T + #print(names, chroms, poss, ra, aa) + #print(reference) + #print(alternative) + #np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0])) + #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1])) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # T to C + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) +# np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1])) +# np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0])) + + # even binsize + vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1, + annotation=annot, order=order) + it_vcf = iter(vcf.flow()) + next(it_vcf) + # C to T + + + next(it_vcf) + # C to T + + names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf) + # T to C + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference, reference2[:,::-1, :, ::-1]) + np.testing.assert_equal(alternative, alternative2[:,::-1, :, ::-1]) + + +@pytest.mark.filterwarnings("ignore:The truth value") +def test_janggu_variant_streamer_order_2(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + """Test Janggu creation by shape and name. """ + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + order = 2 + + refgenome = os.path.join(data_path, 'sample_genome.fa') + vcffile = os.path.join(data_path, 'sample.vcf') + + dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome, + storage='ndarray', + binsize=50, + store_whole_genome=True, + order=order) + + vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1) + it_vcf = iter(vcf.flow()) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # ACT -> ATT + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(reference, alternative) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # ACT -> ATT + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,3,0,1], 1) + np.testing.assert_equal(reference[0,4,0,7], 1) + np.testing.assert_equal(alternative[0,3,0,3], 1) + np.testing.assert_equal(alternative[0,4,0,15], 1) + + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + # CTC -> CCC + np.testing.assert_equal(reference[0,3,0,7], 1) + np.testing.assert_equal(reference[0,4,0,13], 1) + np.testing.assert_equal(alternative[0,3,0,5], 1) + np.testing.assert_equal(alternative[0,4,0,5], 1) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + # GAC -> GGC + np.testing.assert_equal(reference[0,3,0,8], 1) + np.testing.assert_equal(reference[0,4,0,1], 1) + np.testing.assert_equal(alternative[0,3,0,10], 1) + np.testing.assert_equal(alternative[0,4,0,9], 1) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + # CGG -> CAG + np.testing.assert_equal(reference[0,3,0,6], 1) + np.testing.assert_equal(reference[0,4,0,10], 1) + np.testing.assert_equal(alternative[0,3,0,4], 1) + np.testing.assert_equal(alternative[0,4,0,2], 1) + + vcf = VariantStreamer(dna, vcffile, binsize=5, batch_size=1) + it_vcf = iter(vcf.flow()) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # ACT -> ATT + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(reference, alternative) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # ACT -> ATT + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,1,0,1], 1) + np.testing.assert_equal(reference[0,2,0,7], 1) + np.testing.assert_equal(alternative[0,1,0,3], 1) + np.testing.assert_equal(alternative[0,2,0,15], 1) + + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + # CTC -> CCC + np.testing.assert_equal(reference[0,1,0,7], 1) + np.testing.assert_equal(reference[0,2,0,13], 1) + np.testing.assert_equal(alternative[0,1,0,5], 1) + np.testing.assert_equal(alternative[0,2,0,5], 1) + + +@pytest.mark.filterwarnings("ignore:The truth value") +def test_janggu_variant_streamer_order_2_w_refgenome(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + """Test Janggu creation by shape and name. """ + data_path = pkg_resources.resource_filename('janggu', 'resources/') + + order = 2 + + refgenome = os.path.join(data_path, 'sample_genome.fa') + vcffile = os.path.join(data_path, 'sample.vcf') + + #dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome, + # storage='ndarray', + # binsize=50, + # store_whole_genome=True, + # order=order) + + vcf = VariantStreamer(refgenome, vcffile, binsize=10, batch_size=1, + order=order) + it_vcf = iter(vcf.flow()) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # ACT -> ATT + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(reference, alternative) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # ACT -> ATT + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,3,0,1], 1) + np.testing.assert_equal(reference[0,4,0,7], 1) + np.testing.assert_equal(alternative[0,3,0,3], 1) + np.testing.assert_equal(alternative[0,4,0,15], 1) + + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + # CTC -> CCC + np.testing.assert_equal(reference[0,3,0,7], 1) + np.testing.assert_equal(reference[0,4,0,13], 1) + np.testing.assert_equal(alternative[0,3,0,5], 1) + np.testing.assert_equal(alternative[0,4,0,5], 1) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + # GAC -> GGC + np.testing.assert_equal(reference[0,3,0,8], 1) + np.testing.assert_equal(reference[0,4,0,1], 1) + np.testing.assert_equal(alternative[0,3,0,10], 1) + np.testing.assert_equal(alternative[0,4,0,9], 1) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + # CGG -> CAG + np.testing.assert_equal(reference[0,3,0,6], 1) + np.testing.assert_equal(reference[0,4,0,10], 1) + np.testing.assert_equal(alternative[0,3,0,4], 1) + np.testing.assert_equal(alternative[0,4,0,2], 1) + + vcf = VariantStreamer(refgenome, vcffile, binsize=5, batch_size=1, + order=order) + it_vcf = iter(vcf.flow()) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # ACT -> ATT + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + assert names[0] == 'refmismatch' + np.testing.assert_equal(reference, alternative) + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + # ACT -> ATT + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + np.testing.assert_equal(reference[0,1,0,1], 1) + np.testing.assert_equal(reference[0,2,0,7], 1) + np.testing.assert_equal(alternative[0,1,0,3], 1) + np.testing.assert_equal(alternative[0,2,0,15], 1) + + + names, chroms, poss, ra, aa, reference, alternative = next(it_vcf) + print(names, chroms, poss, ra, aa) + print(reference) + print(alternative) + # CTC -> CCC + np.testing.assert_equal(reference[0,1,0,7], 1) + np.testing.assert_equal(reference[0,2,0,13], 1) + np.testing.assert_equal(alternative[0,1,0,5], 1) + np.testing.assert_equal(alternative[0,2,0,5], 1)