--- a +++ b/tests/test_evaluation.py @@ -0,0 +1,676 @@ +import json +import os + +import numpy +import pandas +import pkg_resources +import pyBigWig +import pytest +from keras import Input +from keras import Model +from keras.layers import Dense +from keras.layers import Flatten +from pybedtools import BedTool + +from janggu import Janggu +from janggu import inputlayer +from janggu import outputconv +from janggu import outputdense +from janggu.data import Array +from janggu.data import GenomicIndexer +from janggu.evaluation import Scorer +from janggu.evaluation import _dimension_match +from janggu.utils import ExportBed +from janggu.utils import ExportBigwig +from janggu.utils import ExportClustermap +from janggu.utils import ExportScorePlot +from janggu.utils import ExportTsne +from janggu.utils import ExportTsv + + +def test_input_dims(): + data = Array('testa', numpy.zeros((10, 10, 1))) + xin = Input((10, 1), name='testy') + out = Dense(1)(xin) + m = Model(xin, out) + + # False due to mismatch of names + assert not _dimension_match(m, data, 'input_layers') + + xin = Input((20, 10, 1), name='testa') + out = Dense(1)(xin) + m = Model(xin, out) + + # False due to mismatch of dims + assert not _dimension_match(m, data, 'input_layers') + # more input datasets supplied than inputs to models + assert not _dimension_match(m, [data, data], 'input_layers') + + xin = Input((10, 1), name='testa') + out = Dense(1)(xin) + m = Model(xin, out) + + # False due to mismatch of dims + assert _dimension_match(m, data, 'input_layers') + + +def test_output_dims(): + data = Array('testa', numpy.zeros((10, 10, 1))) + label = Array('testy', numpy.zeros((10, 1))) + xin = Input(data.shape, name='asdf') + out = Flatten()(xin) + out = Dense(1)(out) + m = Model(xin, out) + + # False due to mismatch of names + assert not _dimension_match(m, label, 'output_layers') + + xin = Input(data.shape, name='testa') + out = Flatten()(xin) + out = Dense(2, name='testy')(out) + m = Model(xin, out) + + # False due to mismatch of dims + assert not _dimension_match(m, label, 'output_layers') + + xin = Input(data.shape, name='testa') + out = Flatten()(xin) + out = Dense(1, name='testy')(out) + m = Model(xin, out) + + # False due to mismatch of dims + assert _dimension_match(m, label, 'output_layers') + + assert _dimension_match(m, None, 'output_layers') + + +def get_janggu(inputs, outputs): + @inputlayer + @outputdense('sigmoid') + def _model(inputs, inp, oup, params): + return inputs, inputs[0] + bwm = Janggu.create(_model, + inputs=inputs, + outputs=outputs, + name='nptest') + bwm.compile(optimizer='adadelta', loss='binary_crossentropy') + storage = bwm._storage_path(bwm.name, outputdir=bwm.outputdir) + assert not os.path.exists(storage) + return bwm + + +def get_janggu_conv(inputs, outputs): + @inputlayer + @outputconv('sigmoid') + def _model(inputs, inp, oup, params): + return inputs, inputs[0] + + bwm = Janggu.create(_model, + inputs=inputs, + outputs=outputs, + name='nptest') + + bwm.compile(optimizer='adadelta', loss='binary_crossentropy') + + storage = bwm._storage_path(bwm.name, outputdir=bwm.outputdir) + assert not os.path.exists(storage) + return bwm + + +def test_output_score_by_name(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + + inputs = Array("x", numpy.random.random((100, 10))) + outputs = Array('y', numpy.random.randint(2, size=(100, 1)), + conditions=['random']) + + bwm = get_janggu(inputs, outputs) + + dummy_eval = Scorer('score', lambda y_true, y_pred: 0.15, immediate_export=False) + + bwm.evaluate(inputs, outputs, callbacks=['auc', 'roc', 'prc', + 'auprc', 'auroc', + 'cor', 'mae', 'mse', + 'var_explained', dummy_eval]) + + assert os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "auc.tsv")) + assert os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "prc.png")) + assert os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "roc.png")) + assert os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "cor.tsv")) + assert os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "mae.tsv")) + assert os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "mse.tsv")) + assert os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "var_explained.tsv")) + assert os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "auprc.tsv")) + assert not os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "score.json")) + + dummy_eval.export(os.path.join(tmpdir.strpath, dummy_eval.subdir), bwm.name) + assert os.path.exists(os.path.join(tmpdir.strpath, "evaluation", bwm.name, "score.json")) + + with pytest.raises(ValueError): + bwm.evaluate(inputs, outputs, callbacks=['adsf']) + + +def test_output_json_score(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + + inputs = Array("x", numpy.random.random((100, 10))) + outputs = Array('y', numpy.random.randint(2, size=(100, 1)), + conditions=['random']) + + bwm = get_janggu(inputs, outputs) + + # check exception if no scoring function is provided + dummy_eval = Scorer('score') + + with pytest.raises(ValueError): + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval]) + + dummy_eval = Scorer('score', lambda y_true, y_pred: 0.15) + + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval]) + + # check correctness of json + with open(os.path.join(tmpdir.strpath, "evaluation", bwm.name, + "score.json"), 'r') as f: + content = json.load(f) + # now nptest was evaluated + assert 'random' in content + + +def test_output_tsv_score(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + inputs = Array("x", numpy.random.random((100, 10))) + outputs = Array('y', numpy.random.randint(2, size=(100, 1)), + conditions=['random']) + + bwm = get_janggu(inputs, outputs) + + dummy_eval = Scorer('score', lambda y_true, y_pred: 0.15, exporter=ExportTsv()) + + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval]) + + assert pandas.read_csv(os.path.join(tmpdir.strpath, "evaluation", bwm.name, + "score.tsv"), + sep='\t', header=[0]).iloc[0, 0] == 0.15 + + +def test_output_export_score_plot(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + inputs = Array("x", numpy.random.random((100, 10))) + outputs = Array('y', numpy.random.randint(2, size=(100, 1)), + conditions=['random']) + + bwm = get_janggu(inputs, outputs) + + dummy_eval = Scorer('score', + lambda y_true, y_pred: + ([0., 0.5, 0.5, 1.], + [0.5, 0.5, 1., 1.], + [0.8, 0.4, 0.35, 0.1]), + exporter=ExportScorePlot()) + + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval]) + + dummy_eval = Scorer('score', + lambda y_true, y_pred: + ([0., 0.5, 0.5, 1.], + [0.5, 0.5, 1., 1.], + [0.8, 0.4, 0.35, 0.1]), + exporter=ExportScorePlot(figsize=(10,12), + xlabel='FPR', + ylabel='TPR', + fform='eps')) + + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval]) + + # check if plot was produced + assert os.path.exists(os.path.join(tmpdir.strpath, + "evaluation", bwm.name, "score.png")) + assert os.path.exists(os.path.join(tmpdir.strpath, + "evaluation", bwm.name, "score.eps")) + + +def test_output_export_clustermap(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + inputs = Array("x", numpy.random.random((100, 10))) + outputs = Array('y', numpy.random.randint(2, size=(100, 1)), + conditions=['random']) + + @inputlayer + @outputdense('sigmoid') + def _model(inputs, inp, oup, params): + with inputs.use('x') as layer: + outputs = Dense(3, name='hidden')(layer) + return inputs, outputs + + bwm = Janggu.create(_model, + inputs=inputs, + outputs=outputs, + name='nptest') + + bwm.compile(optimizer='adadelta', loss='binary_crossentropy') + + dummy_eval = Scorer('cluster', exporter=ExportClustermap()) + + bwm.predict(inputs, layername='hidden', + callbacks=[dummy_eval]) + + dummy_eval = Scorer('cluster', exporter=ExportClustermap(fform='eps', + annot={'annot':[1]*50 + [0]*50})) + bwm.predict(inputs, layername='hidden', + callbacks=[dummy_eval]) + + # check if plot was produced + assert os.path.exists(os.path.join(tmpdir.strpath, + "evaluation", bwm.name, 'hidden', + "cluster.png")) + assert os.path.exists(os.path.join(tmpdir.strpath, + "evaluation", bwm.name, 'hidden', + "cluster.eps")) + + +@pytest.mark.filterwarnings("ignore:the matrix") +def test_output_export_tsne(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + inputs = Array("x", numpy.random.random((100, 10))) + outputs = Array('y', numpy.random.randint(2, size=(100, 1)), + conditions=['random']) + + @inputlayer + @outputdense('sigmoid') + def _model(inputs, inp, oup, params): + with inputs.use('x') as layer: + outputs = Dense(3, name='hidden')(layer) + return inputs, outputs + + bwm = Janggu.create(_model, + inputs=inputs, + outputs=outputs, + name='nptest') + + bwm.compile(optimizer='adadelta', loss='binary_crossentropy') + + dummy_eval = Scorer('tsne', exporter=ExportTsne()) + + bwm.predict(inputs, layername='hidden', + callbacks=[dummy_eval]) + + dummy_eval = Scorer('tsne', exporter=ExportTsne(fform='eps', + annot={'annot':[1]*50 + [0]*50}, + figsize=(10, 10))) + bwm.predict(inputs, layername='hidden', + callbacks=[dummy_eval]) + # check if plot was produced + assert os.path.exists(os.path.join(tmpdir.strpath, + "evaluation", bwm.name, 'hidden', + "tsne.png")) + assert os.path.exists(os.path.join(tmpdir.strpath, + "evaluation", bwm.name, 'hidden', + "tsne.eps")) + + +def test_output_bed_loss_resolution_equal_stepsize(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + # generate loss + # + # resolution < stepsize + inputs = Array("x", numpy.random.random((7, 1, 1, 10))) + outputs = Array('y', numpy.random.random((7, 1, 1, 4)), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm = get_janggu_conv(inputs, outputs) + data_path = pkg_resources.resource_filename('janggu', + 'resources/10regions.bed') + + gi = GenomicIndexer.create_from_file(data_path, + binsize=200, + stepsize=200) + + dummy_eval = Scorer('loss', lambda t, p: [0.1] * len(t), + exporter=ExportBed(gindexer=gi, resolution=200)) + + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval]) + + file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, + 'loss.{}.bed') + + for cond in ['c1', 'c2', 'c3', 'c4']: + assert os.path.exists(file_.format(cond)) + + bed = BedTool(file_.format('c1')) + + nreg = 0 + for reg in bed: + numpy.testing.assert_equal(float(reg.score), 0.1) + nreg += 1 +# numpy.testing.assert_equal(breg.score, value) + + assert nreg == 7, 'There should be 7 regions in the bed file.' + + +def test_output_bed_loss_resolution_unequal_stepsize(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + # generate loss + # + # resolution < stepsize + inputs = Array("x", numpy.random.random((7, 4, 1, 10))) + outputs = Array('y', numpy.random.random((7, 4, 1, 4)), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm = get_janggu(inputs, outputs) + data_path = pkg_resources.resource_filename('janggu', + 'resources/10regions.bed') + + gi = GenomicIndexer.create_from_file(data_path, + binsize=200, + stepsize=200) + + # dummy_eval = Scorer('loss', lambda t, p: -t * numpy.log(p), + # exporter=export_bed, export_args={'gindexer': gi}) + dummy_eval = Scorer('loss', lambda t, p: [0.1] * len(t), + exporter=ExportBed(gindexer=gi, resolution=50)) + + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval]) + + file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, + 'loss.{}.bed') + + for cond in ['c1', 'c2', 'c3', 'c4']: + assert os.path.exists(file_.format(cond)) + + bed = BedTool(file_.format('c1')) + + nreg = 0 + for reg in bed: + numpy.testing.assert_equal(float(reg.score), 0.1) + nreg += 1 + + assert nreg == 28, 'There should be 28 regions in the bed file.' + + +def test_output_bed_predict_resolution_equal_stepsize(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + # generate loss + # + # resolution < stepsize + inputs = Array("x", numpy.random.random((7, 1, 1, 10))) + outputs = Array('y', numpy.random.random((7, 1, 1, 4)), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm = get_janggu_conv(inputs, outputs) + data_path = pkg_resources.resource_filename('janggu', + 'resources/10regions.bed') + + gi = GenomicIndexer.create_from_file(data_path, + binsize=200, + stepsize=200) + + dummy_eval = Scorer('pred', lambda p: [0.1] * len(p), + exporter=ExportBed(gindexer=gi, resolution=200), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm.predict(inputs, callbacks=[dummy_eval]) + + file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, + 'pred.{}.bed') + + for cond in ['c1', 'c2', 'c3', 'c4']: + assert os.path.exists(file_.format(cond)) + + bed = BedTool(file_.format('c1')) + + nreg = 0 + for reg in bed: + numpy.testing.assert_equal(float(reg.score), 0.1) + nreg += 1 + + assert nreg == 7, 'There should be 7 regions in the bed file.' + + +def test_output_bed_predict_denseout(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + # generate loss + # + # resolution < stepsize + inputs = Array("x", numpy.random.random((7, 10))) + outputs = Array('y', numpy.random.random((7, 4)), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm = get_janggu(inputs, outputs) + data_path = pkg_resources.resource_filename('janggu', + 'resources/10regions.bed') + + gi = GenomicIndexer.create_from_file(data_path, + binsize=200, + stepsize=200) + + dummy_eval = Scorer('pred', lambda p: [0.1] * len(p), + exporter=ExportBed(gindexer=gi, resolution=200), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm.predict(inputs, callbacks=[dummy_eval]) + + file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, + 'pred.{}.bed') + + for cond in ['c1', 'c2', 'c3', 'c4']: + assert os.path.exists(file_.format(cond)) + + bed = BedTool(file_.format('c1')) + + nreg = 0 + for reg in bed: + numpy.testing.assert_equal(float(reg.score), 0.1) + nreg += 1 + + assert nreg == 7, 'There should be 7 regions in the bed file.' + + +def test_output_bed_predict_resolution_unequal_stepsize(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + # generate loss + # + # resolution < stepsize + inputs = Array("x", numpy.random.random((7, 4, 1, 10))) + outputs = Array('y', numpy.random.random((7, 4, 1, 4)), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm = get_janggu(inputs, outputs) + data_path = pkg_resources.resource_filename('janggu', + 'resources/10regions.bed') + + gi = GenomicIndexer.create_from_file(data_path, + binsize=200, + stepsize=200) + + dummy_eval = Scorer('pred', lambda p: [0.1] * len(p), + exporter=ExportBed(gindexer=gi, resolution=50), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm.predict(inputs, callbacks=[dummy_eval]) + + file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, + 'pred.{}.bed') + + for cond in ['c1', 'c2', 'c3', 'c4']: + assert os.path.exists(file_.format(cond)) + + bed = BedTool(file_.format('c1')) + + nreg = 0 + for reg in bed: + numpy.testing.assert_equal(float(reg.score), 0.1) + nreg += 1 + + assert nreg == 28, 'There should be 28 regions in the bed file.' + + +def test_output_bigwig_predict_denseout(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + # generate loss + # + # resolution < stepsize + inputs = Array("x", numpy.random.random((7, 10))) + outputs = Array('y', numpy.random.random((7, 4)), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm = get_janggu(inputs, outputs) + data_path = pkg_resources.resource_filename('janggu', + 'resources/10regions.bed') + + gi = GenomicIndexer.create_from_file(data_path, + binsize=200, + stepsize=200) + + dummy_eval = Scorer('pred', lambda p: [0.1] * len(p), + exporter=ExportBigwig(gindexer=gi), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm.predict(inputs, callbacks=[dummy_eval]) + + file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, + 'pred.{}.bigwig') + + for cond in ['c1', 'c2', 'c3', 'c4']: + assert os.path.exists(file_.format(cond)) + + bw = pyBigWig.open(file_.format('c1')) + + co = bw.values('chr1', 600, 2000) + + numpy.testing.assert_allclose(numpy.mean(co), 0.1, rtol=1e-5) + + +def test_output_bigwig_predict_convout(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + # generate loss + # + # resolution < stepsize + inputs = Array("x", numpy.random.random((7, 4, 1, 10))) + outputs = Array('y', numpy.random.random((7, 4, 1, 4)), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm = get_janggu_conv(inputs, outputs) + data_path = pkg_resources.resource_filename('janggu', + 'resources/10regions.bed') + + gi = GenomicIndexer.create_from_file(data_path, + binsize=200, + stepsize=200) + + dummy_eval = Scorer('pred', lambda p: [0.2] * len(p), + exporter=ExportBigwig(gindexer=gi), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm.predict(inputs, callbacks=[dummy_eval]) + + file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, + 'pred.{}.bigwig') + + for cond in ['c1', 'c2', 'c3', 'c4']: + assert os.path.exists(file_.format(cond)) + + bw = pyBigWig.open(file_.format('c1')) + + co = bw.values('chr1', 600, 2000) + + numpy.testing.assert_allclose(numpy.mean(co), 0.2, rtol=1e-5) + + +def test_output_bigwig_loss_resolution_equal_stepsize(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + # generate loss + # + # resolution < stepsize + inputs = Array("x", numpy.random.random((7, 4, 1, 10))) + outputs = Array('y', numpy.random.random((7, 4, 1, 4)), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm = get_janggu(inputs, outputs) + data_path = pkg_resources.resource_filename('janggu', + 'resources/10regions.bed') + + gi = GenomicIndexer.create_from_file(data_path, + binsize=200, + stepsize=200) + + dummy_eval = Scorer('loss', lambda t, p: [0.2] * len(t), + exporter=ExportBigwig(gindexer=gi)) + + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval]) + + file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, + 'loss.{}.bigwig') + + for cond in ['c1', 'c2', 'c3', 'c4']: + assert os.path.exists(file_.format(cond)) + + bw = pyBigWig.open(file_.format('c1')) + + co = bw.values('chr1', 600, 2000) + + numpy.testing.assert_allclose(numpy.mean(co), 0.2, rtol=1e-5) + + +def test_output_bigwig_loss_resolution_unequal_stepsize(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + # generate loss + # + # resolution < stepsize + inputs = Array("x", numpy.random.random((7, 4, 1, 10))) + outputs = Array('y', numpy.random.random((7, 4, 1, 4)), + conditions=['c1', 'c2', 'c3', 'c4']) + + bwm = get_janggu(inputs, outputs) + data_path = pkg_resources.resource_filename('janggu', + 'resources/10regions.bed') + + gi = GenomicIndexer.create_from_file(data_path, + binsize=200, + stepsize=50) + + dummy_eval = Scorer('loss', lambda t, p: [0.2] * len(t), + exporter=ExportBigwig(gindexer=gi)) + + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval]) + + file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, + 'loss.{}.bigwig') + + for cond in ['c1', 'c2', 'c3', 'c4']: + assert os.path.exists(file_.format(cond)) + + bw = pyBigWig.open(file_.format('c1')) + + co = bw.values('chr1', 600, 2000-150) + + numpy.testing.assert_allclose(numpy.mean(co), 0.2, rtol=1e-5) + + +def test_output_tsv_score_across_conditions(tmpdir): + os.environ['JANGGU_OUTPUT'] = tmpdir.strpath + inputs = Array("x", numpy.random.random((100, 10))) + outputs = Array('y', numpy.random.randint(2, size=(100, 2)), + conditions=['c1', 'c2']) + + bwm = get_janggu(inputs, outputs) + + dummy_eval = Scorer('score', lambda y_true, y_pred: 0.15, + exporter=ExportTsv()) + dummy_evalacross = Scorer('scoreacross', + lambda y_true, y_pred: 0.15, + exporter=ExportTsv(), + percondition=False) + + bwm.evaluate(inputs, outputs, callbacks=[dummy_eval, dummy_evalacross]) + + # percondition=True + assert pandas.read_csv(os.path.join(tmpdir.strpath, "evaluation", bwm.name, + "score.tsv"), + sep='\t', header=[0]).shape == (1, 2) + # percondition=False + val = pandas.read_csv(os.path.join(tmpdir.strpath, "evaluation", bwm.name, + "scoreacross.tsv"), + sep='\t', header=[0]) + assert val['across'][0] == .15 + assert val.shape == (1, 1)