Diff of /tests/test_convertion.py [000000] .. [d7cf27]

Switch to unified view

a b/tests/test_convertion.py
1
import os
2
3
import matplotlib
4
matplotlib.use('AGG')
5
6
import numpy as np
7
import pytest
8
from keras import backend as K
9
from keras.layers import Conv2D
10
from pkg_resources import resource_filename
11
12
from janggu import Janggu
13
from janggu import inputlayer
14
from janggu import outputconv
15
from janggu.data import Bioseq
16
from janggu.data import Cover
17
from janggu.layers import DnaConv2D
18
from janggu.layers import LocalAveragePooling2D
19
20
@pytest.mark.filterwarnings("ignore:inspect")
@pytest.mark.filterwarnings("ignore:The binary")
def test_create_from_array_whole_genome_true_from_pred(tmpdir):
    """Wrap model predictions in a whole-genome Cover and compare them.

    A small DnaConv2D model is created on datasets loaded with
    ``store_whole_genome=True``.  Its predictions are handed to
    ``Cover.create_from_array`` and the resulting Cover is expected to
    return exactly those predictions.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # The pseudo genome is just a concatenation of all sequences in
    # sample.fa and sample2.fa.
    genome_fasta = resource_filename('janggu', 'resources/pseudo_genome.fa')
    # Regions of interest spanning positive and negative examples.
    roi_bed = resource_filename('janggu', 'resources/roi_train.bed')
    # Peak file that only contains positive examples.
    peak_bed = resource_filename('janggu', 'resources/scores.bed')

    # Sequence and label datasets use the same binsize and stepsize.
    binning = {'binsize': 200, 'stepsize': 200}

    dna = Bioseq.create_from_refgenome('dna', refgenome=genome_fasta,
                                       roi=roi_bed, order=1,
                                       store_whole_genome=True, **binning)

    labels = Cover.create_from_bed('peaks', roi=roi_bed, bedfiles=peak_bed,
                                   resolution=200,
                                   store_whole_genome=True, **binning)

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        # params is (number of filters, filter length, activation).
        n_filters, filter_len, activation = params[0], params[1], params[2]
        with inputs.use('dna') as layer:
            conv = Conv2D(n_filters, (filter_len, 1), activation=activation)
            layer = DnaConv2D(conv)(layer)
        # Average over the full length of the second axis of the conv output.
        pooled = LocalAveragePooling2D(window_size=layer.shape.as_list()[1],
                                       name='motif')(layer)
        return inputs, pooled

    K.clear_session()

    # Create and compile a fresh model from the template above.
    model = Janggu.create(template=double_stranded_model_dnaconv,
                          modelparams=(30, 21, 'relu'),
                          inputs=dna,
                          outputs=labels)
    model.compile(optimizer='adadelta', loss='binary_crossentropy',
                  metrics=['acc'])

    predictions = model.predict(dna)

    cover = Cover.create_from_array('BindingProba', predictions,
                                    labels.gindexer,
                                    store_whole_genome=True)

    # The Cover must reproduce the predictions in shape and in content.
    assert predictions.shape == cover.shape
    np.testing.assert_equal(predictions, cover[:])

    assert len(cover.gindexer) == len(predictions)
    # The underlying handle is expected to contain exactly one entry.
    assert len(cover.garray.handle) == 1
80
81
82
@pytest.mark.filterwarnings("ignore:inspect")
@pytest.mark.filterwarnings("ignore:The binary")
def test_create_from_array_whole_genome_true(tmpdir):
    """Rebuild a whole-genome Cover from its own array for each storage.

    A label Cover is loaded from five copies of the same BED file with
    ``store_whole_genome=True``.  Its array is fed back through
    ``Cover.create_from_array`` for the 'ndarray', 'sparse' and 'hdf5'
    storage options, and each result must match the original Cover in
    content and shape.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # Regions of interest spanning positive and negative examples.
    roi_bed = resource_filename('janggu', 'resources/roi_train.bed')
    # Peak file that only contains positive examples.
    peak_bed = resource_filename('janggu', 'resources/scores.bed')

    labels = Cover.create_from_bed('peaks',
                                   roi=roi_bed,
                                   bedfiles=[peak_bed] * 5,
                                   binsize=200,
                                   stepsize=200,
                                   resolution=200,
                                   store_whole_genome=True)

    # Stand-in for model predictions: the label array itself.
    label_array = labels[:]

    for storage_mode in ('ndarray', 'sparse', 'hdf5'):
        print(storage_mode)
        rebuilt = Cover.create_from_array('BindingProba', label_array,
                                          labels.gindexer,
                                          cache=True,
                                          storage=storage_mode,
                                          store_whole_genome=True)

        np.testing.assert_equal(rebuilt[:], labels[:])
        np.testing.assert_equal(rebuilt.shape, labels.shape)
114
115
@pytest.mark.filterwarnings("ignore:inspect")
@pytest.mark.filterwarnings("ignore:The binary")
def test_create_from_array_whole_genome_false_pred(tmpdir):
    """Wrap model predictions in a region-wise Cover and compare them.

    A small DnaConv2D model is created on datasets loaded with
    ``store_whole_genome=False``.  Its predictions are handed to
    ``Cover.create_from_array`` and the resulting Cover is expected to
    return exactly those predictions.

    The "ignore:inspect" warning filter is applied here as well, matching
    the other model-building test in this module.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # The pseudo genome is just a concatenation of all sequences in
    # sample.fa and sample2.fa.
    REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')
    # ROI contains regions spanning positive and negative examples.
    ROI_FILE = resource_filename('janggu', 'resources/roi_train.bed')
    # PEAK_FILE only contains positive examples.
    PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')

    DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                       roi=ROI_FILE,
                                       binsize=200, stepsize=200,
                                       order=1,
                                       store_whole_genome=False)

    LABELS = Cover.create_from_bed('peaks', roi=ROI_FILE,
                                   bedfiles=PEAK_FILE,
                                   binsize=200, stepsize=200,
                                   resolution=200,
                                   store_whole_genome=False)

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        # params is (number of filters, filter length, activation).
        with inputs.use('dna') as layer:
            layer = DnaConv2D(Conv2D(params[0], (params[1], 1),
                                     activation=params[2]))(layer)
        # Average over the full length of the second axis of the conv output.
        output = LocalAveragePooling2D(window_size=layer.shape.as_list()[1],
                                       name='motif')(layer)
        return inputs, output

    K.clear_session()

    # Create and compile a fresh model from the template above.
    model = Janggu.create(template=double_stranded_model_dnaconv,
                          modelparams=(30, 21, 'relu'),
                          inputs=DNA,
                          outputs=LABELS)

    model.compile(optimizer='adadelta', loss='binary_crossentropy',
                  metrics=['acc'])

    pred = model.predict(DNA)

    cov_out = Cover.create_from_array('BindingProba', pred, LABELS.gindexer,
                                      store_whole_genome=False)

    # The Cover must reproduce the predictions in shape and in content.
    assert pred.shape == cov_out.shape
    np.testing.assert_equal(pred, cov_out[:])

    assert len(cov_out.gindexer) == len(pred)
    # The 'data' entry of the handle is expected to have one row per
    # prediction.
    assert len(cov_out.garray.handle['data']) == len(pred)
174
175
@pytest.mark.filterwarnings("ignore:inspect")
@pytest.mark.filterwarnings("ignore:The binary")
def test_create_from_array_whole_genome_false(tmpdir):
    """Rebuild a region-wise Cover from its own array for each storage.

    A label Cover is loaded from five copies of the same BED file with
    ``store_whole_genome=False``.  Its array is fed back through
    ``Cover.create_from_array`` for the 'ndarray', 'sparse' and 'hdf5'
    storage options, and each result must match the original Cover in
    content and shape.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # Regions of interest spanning positive and negative examples.
    roi_bed = resource_filename('janggu', 'resources/roi_train.bed')
    # Peak file that only contains positive examples.
    peak_bed = resource_filename('janggu', 'resources/scores.bed')

    labels = Cover.create_from_bed('peaks',
                                   roi=roi_bed,
                                   bedfiles=[peak_bed] * 5,
                                   binsize=200,
                                   stepsize=200,
                                   resolution=200,
                                   store_whole_genome=False)

    # Stand-in for model predictions: the label array itself.
    label_array = labels[:]

    for storage_mode in ('ndarray', 'sparse', 'hdf5'):
        print(storage_mode)
        rebuilt = Cover.create_from_array('BindingProba', label_array,
                                          labels.gindexer,
                                          cache=True,
                                          storage=storage_mode,
                                          store_whole_genome=False)

        np.testing.assert_equal(rebuilt[:], labels[:])
        np.testing.assert_equal(rebuilt.shape, labels.shape)
206