|
a |
|
b/tests/test_data.py |
|
|
1 |
import os |
|
|
2 |
|
|
|
3 |
import matplotlib |
|
|
4 |
import pkg_resources |
|
|
5 |
import pytest |
|
|
6 |
|
|
|
7 |
from janggu.data import Bioseq |
|
|
8 |
from janggu.data import split_train_test |
|
|
9 |
from janggu.data import subset |
|
|
10 |
from janggu.data import view |
|
|
11 |
from janggu.data.data import _data_props |
|
|
12 |
|
|
|
13 |
matplotlib.use('AGG') |
|
|
14 |
|
|
|
15 |
|
|
|
16 |
def test_dna_props_extraction(tmpdir):
    """Check the properties ``_data_props`` extracts from a DNA dataset.

    A ``Bioseq`` named ``'dna'`` is built from the packaged sample genome
    and sample ROI file; the extracted properties must contain a ``'dna'``
    entry whose ``'shape'`` is ``(200, 1, 4)``.  Passing a plain tuple to
    ``_data_props`` must raise.
    """
    # NOTE(review): this environment change is not undone when the test ends.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resource_dir, 'sample.bed')
    genome_path = os.path.join(resource_dir, 'sample_genome.fa')

    bioseq = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        storage='ndarray',
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1)

    extracted = _data_props(bioseq)
    assert 'dna' in extracted
    assert extracted['dna']['shape'] == (200, 1, 4)

    # A tuple is not an accepted input for the property extraction.
    with pytest.raises(Exception):
        _data_props((0,))
|
|
35 |
|
|
|
36 |
|
|
|
37 |
def test_split_train_test():
    """Check ``split_train_test`` when ``'chr2'`` is held out.

    The split is exercised once with a single dataset and once with a list
    of datasets.  In both cases the train and test parts must each have
    length 50 and together account for the full dataset length.
    """
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resource_dir, 'sample.bed')
    genome_path = os.path.join(resource_dir, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        storage='ndarray',
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=True)

    # Single dataset in, single train/test pair out.
    train_part, test_part = split_train_test(dna, holdout_chroms='chr2')
    assert len(train_part) == 50
    assert len(test_part) == 50
    assert len(dna) == len(train_part) + len(test_part)

    # List of datasets in, indexable train/test collections out.
    train_parts, test_parts = split_train_test([dna, dna],
                                               holdout_chroms='chr2')
    assert len(train_parts[0]) == 50
    assert len(test_parts[0]) == 50
    assert len(dna) == len(train_parts[0]) + len(test_parts[0])
|
|
60 |
|
|
|
61 |
|
|
|
62 |
def test_subset_include_chrname_test():
    """Check that ``subset`` with ``include_regions='chr2'`` has length 50."""
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resource_dir, 'sample.bed')
    genome_path = os.path.join(resource_dir, 'sample_genome.fa')

    full_dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        storage='ndarray',
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=True)

    included = subset(full_dna, include_regions='chr2')

    assert len(included) == 50
|
|
77 |
|
|
|
78 |
|
|
|
79 |
def test_subset_exclude_chrname_test():
    """Check that ``subset`` with ``exclude_regions='chr2'`` has length 50."""
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resource_dir, 'sample.bed')
    genome_path = os.path.join(resource_dir, 'sample_genome.fa')

    full_dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        storage='ndarray',
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=True)

    remaining = subset(full_dna, exclude_regions='chr2')

    assert len(remaining) == 50
|
|
94 |
|
|
|
95 |
|
|
|
96 |
def test_view_bed_test():
    """Check that ``view`` with a second BED file yields a length of 4.

    The dataset is created with ``sample.bed`` as ROI, then viewed with
    ``use_regions`` pointing at ``scored_sample.bed``.
    """
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resource_dir, 'sample.bed')
    view_roi_path = os.path.join(resource_dir, 'scored_sample.bed')
    genome_path = os.path.join(resource_dir, 'sample_genome.fa')

    full_dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        storage='ndarray',
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=True)

    viewed = view(full_dna, use_regions=view_roi_path)

    assert len(viewed) == 4