a b/src/janggu/data/genomicarray.py
1
"""Genomic arrays"""
2
3
import hashlib
4
import os
5
from collections import OrderedDict
6
7
import h5py
8
import numpy as np
9
from pybedtools import Interval
10
from scipy import sparse
11
12
from janggu.utils import _get_output_data_location
13
from janggu.utils import _iv_to_str
14
from janggu.utils import _str_to_iv
15
16
17
def _get_iv_length(length, resolution):
18
    """obtain the chromosome length for a given resolution."""
19
    if resolution is None:
20
        return 1
21
    return int(np.ceil(float(length)/resolution))
22
23
24
def get_collapser(method):
    """Resolve ``method`` to a collapsing function.

    ``method`` may be None (no collapsing), a callable (returned as-is),
    or one of the names 'mean', 'sum' or 'max', which aggregate values
    along the second array axis.
    """
    if method is None:
        return None
    if callable(method):
        return method

    named_collapsers = {
        'mean': lambda x: x.mean(axis=1),
        'sum': lambda x: x.sum(axis=1),
        'max': lambda x: x.max(axis=1),
    }
    if method in named_collapsers:
        return named_collapsers[method]

    raise ValueError('Unknown method: {}'.format(method))
40
41
def create_sha256_cache(data, parameters):
    """Build a SHA256 digest from the given data items and parameters.

    Strings that name existing files contribute their raw file content;
    numpy arrays contribute their byte representation; anything else is
    hashed via its string representation. The parameter settings are
    folded in at the end.
    """

    hasher = hashlib.sha256()

    # fold every datum into the digest
    for datum in data or []:
        if isinstance(datum, str) and os.path.exists(datum):
            # existing file: stream its content in 1 MB chunks
            with open(datum, 'rb') as stream:
                while True:
                    chunk = stream.read(1024**2)
                    if not chunk:
                        break
                    hasher.update(chunk)
        elif isinstance(datum, np.ndarray):
            hasher.update(datum.tobytes())
        else:
            hasher.update(str(datum).encode('utf-8'))

    # the parameter settings contribute to the digest as well
    hasher.update(str(parameters).encode('utf-8'))

    return hasher.hexdigest()
62
63
def _get_cachefile(cachestr, tags, fileending):
64
    """ Determine cache file location """
65
    filename = None
66
    if cachestr is not None:
67
        memmap_dir = _get_output_data_location(tags)
68
        if not os.path.exists(memmap_dir):
69
            os.makedirs(memmap_dir)
70
71
        filename = str(cachestr) + fileending
72
        filename = os.path.join(memmap_dir, filename)
73
        return filename
74
    return None
75
76
def _load_data(cachestr, tags, fileending):
    """Decide whether the data must be built from scratch.

    Returns True when no usable cache file exists (data has to be
    loaded anew) and False when a cache file is already present.
    """
    filename = _get_cachefile(cachestr, tags, fileending)
    return filename is None or not os.path.exists(filename)
83
class GenomicArray(object):  # pylint: disable=too-many-instance-attributes
    """GenomicArray stores multi-dimensional genomic information.

    It acts as a dataset for holding genomic data. For instance,
    coverage along an entire genome composed of arbitrary length chromosomes
    as well as for multiple cell-types and conditions simultaneously.
    Inspired by the HTSeq analog, the array can hold the data in different
    storage modes, including ndarray, hdf5 or as sparse dataset.

    Parameters
    ----------
    stranded : bool
        Consider stranded profiles. Default: True.
    conditions : list(str) or None
        List of cell-type or condition labels associated with the corresponding
        array dimensions. Default: None means a one-dimensional array is produced.
    typecode : str
        Datatype. Default: 'd'.
    resolution : int or None
        Resolution for storing the genomic array. Only relevant for the use
        with Cover Datasets. Default: 1.
    padding_value : float
        Padding value. Default: 0.
    order : int
        Order of the alphabet size. Only relevant for Bioseq Datasets. Default: 1.
    store_whole_genome : boolean
        Whether to store the entire genome or only the regions of interest.
        Default: True.
    collapser : None or callable
        Method to aggregate values along a given interval.
    """
    # NOTE(review): class-level mutable default; subclasses replace this with
    # their actual storage (h5py file or dict of ndarrays) during __init__.
    handle = OrderedDict()
    _condition = None
    _resolution = None
    _order = None
    # Maps a region string (chrom:start-end) to a row index; only populated
    # when store_whole_genome=False (see subclasses).
    region2index = None

    def __init__(self, stranded=True, conditions=None, typecode='d',
                 resolution=1, padding_value=0.,
                 order=1, store_whole_genome=True, collapser=None):
        self.stranded = stranded
        if conditions is None:
            conditions = ['sample']

        self.condition = conditions
        self.order = order
        self.padding_value = padding_value
        # order is validated here in addition to the property setter; the
        # property only checks positivity, this also enforces int.
        if not isinstance(order, int) or order < 1:
            raise Exception('order must be a positive integer')

        self.resolution = resolution
        self.typecode = typecode
        self._full_genome_stored = store_whole_genome
        self.collapser = collapser

    def _get_indices(self, interval, arraylen):
        """Given the original genomic coordinates,
           the array indices of the reference dataset (garray.handle)
           and the array indices of the view are returned.

        Parameters
        ----------
        interval : Interval
            Interval containing (chr, start, end)
        arraylen : int
            Length of the numpy target array.

        Returns
        -------
        tuple
            Tuple of indices corresponding to the slice in the arrays
            (ref_start, ref_end, array_start, array_end).
            ref_[start,end] indicate the slice in garray.handle
            while array_[start,end] indicates the slice in the
            target array / view.
        """
        chrom = interval.chrom
        start = self.get_iv_start(interval.start)
        end = self.get_iv_end(interval.end) - self.order + 1

        # interval starts beyond the stored chromosome: empty slices
        if start >= self.handle[chrom].shape[0]:
            return 0, 0, 0, 0
        else:
            array_start = 0
            ref_start = start

        # clip the end if the interval overshoots the chromosome length
        if end > self.handle[chrom].shape[0]:
            array_end = arraylen - (end - self.handle[chrom].shape[0])
            ref_end = self.handle[chrom].shape[0]
        else:
            array_end = arraylen
            ref_end = end

        return ref_start, ref_end, array_start, array_end

    def __setitem__(self, index, value):
        """Assign coverage for ``garray[interval, condition] = value``.

        ``value`` is expected to be a 3D array of shape
        (length, strand, condition); unstranded input with more than one
        strand column is summed over the strand axis.
        """
        interval = index[0]
        condition = index[1]
        if isinstance(condition, slice) and value.ndim != 3:
            raise ValueError('Expected 3D array with condition slice.')
        if isinstance(condition, slice):
            # normalize an open-ended slice to the explicit condition count
            condition = slice(None, value.shape[-1], None)

        # NOTE(review): the message refers to shape[-1] but the check is on
        # shape[1] (the strand axis) -- confirm which is intended.
        if self.stranded and value.shape[1] != 2:
            raise ValueError('If genomic array is in stranded mode, shape[-1] == 2 is expected')

        if not self.stranded and value.shape[1] != 1:
            value = value.sum(axis=1, keepdims=True)

        if isinstance(interval, Interval) and isinstance(condition, (int, slice)):
            start = self.get_iv_start(interval.start)
            end = self.get_iv_end(interval.end)

            # length in bins, corrected for order-k mer shortening
            length = end - start - self.order + 1
            # value should be a 2 dimensional array
            # it will be reshaped to a 2D array where the collapse operation is performed
            # along the second dimension.
            value = self._do_collapse(interval, value)

            try:
                self._setitem(interval, condition, length, value)
            except KeyError:
                # we end up here if the peak regions are not a subset of
                # the regions of interest. that might be the case if
                # peaks from the holdout proportion of the genome are tried
                # to be added.
                # unfortunately, it is also possible that store_whole_genome=False
                # and the peaks and regions of interest are just not synchronized
                # in which case nothing (or too few peaks) are added. in the latter
                # case an error would help actually, but I am not sure how to
                # check if the first or the second is the case here.
                pass
        else:
            raise IndexError("Cannot interpret interval and condition: {}".format((interval, condition)))

    def _setitem(self, interval, condition, length, value):
        """Write ``value`` into the backing storage for one interval.

        Raises KeyError when the interval is not part of the stored
        regions (caught and ignored by __setitem__).
        """
        if not self._full_genome_stored:
            # region-of-interest mode: look up the row index for the interval
            idx = self.region2index[_iv_to_str(interval.chrom, interval.start,
                                               interval.end)]

            # correcting for the overshooting starts and ends is not necessary
            # for partially loaded data
            self.handle['data'][idx, :length, :, condition] = value

        else:
            # whole-genome mode: clip value/reference slices at chromosome ends
            ref_start, ref_end, array_start, \
                array_end = self._get_indices(interval, value.shape[0])
            self.handle[interval.chrom][ref_start:ref_end, :, condition] = \
                               value[array_start:array_end]

    def _do_collapse(self, interval, value):
        """Aggregate ``value`` per resolution-sized bin using the collapser.

        If the data already has the post-collapse shape, it is returned
        unchanged.
        """
        if self.collapser is not None:

            if self.resolution is None and value.shape[0] == 1 or \
                self.resolution is not None and \
                value.shape[0] == interval.length//self.resolution:
                # collapsing becomes obsolete, because the data has already
                # the expected shape (after collapsing)
                pass
            else:
                if self.resolution is None:
                    # collapse along the entire interval
                    value = value.reshape((1,) + value.shape)
                else:
                    # if the array shape[0] is a multipe of resolution,
                    # it can simply be reshaped. otherwise,
                    # it needs to be resized before.
                    if value.shape[0] % self.resolution > 0:
                        value = np.resize(value, (
                            int(np.ceil(value.shape[0]/float(self.resolution))*self.resolution),) +
                                          value.shape[1:])
                    # collapse in bins of size resolution
                    value = value.reshape((value.shape[0]//min(self.resolution,
                                                               value.shape[0]),
                                           min(self.resolution, value.shape[0]),) + \
                                          value.shape[1:])

                value = self.collapser(value)
        return value

    def __getitem__(self, index):
        """Return the (length, strand, condition) array for an Interval.

        Regions reaching beyond the stored chromosome are padded with
        ``padding_value``.
        """
        # for now lets ignore everything except for chrom, start and end.
        if isinstance(index, Interval):
            interval = index
            chrom = interval.chrom
            start = self.get_iv_start(interval.start)
            end = self.get_iv_end(interval.end)

            # original length
            length = end-start - self.order + 1

            if not self._full_genome_stored:
                idx = self.region2index[_iv_to_str(chrom, interval.start, interval.end)]
                # correcting for the overshooting starts and ends is not necessary
                # for partially loaded data
                return self._reshape(self.handle['data'][idx],
                                     (length, 2 if self.stranded else 1,
                                      len(self.condition)))

            # unknown chromosome: return an all-padding array
            if chrom not in self.handle:
                return np.ones((length, 2 if self.stranded else 1,
                                len(self.condition)),
                               dtype=self.typecode) * self.padding_value

            if start >= 0 and end <= self.handle[chrom].shape[0]:
                end = end - self.order + 1
                # this is a short-cut, which does not require zero-padding
                return self._reshape(self.handle[chrom][start:end],
                                     (end-start, 2 if self.stranded else 1,
                                      len(self.condition)))

            # below is some functionality for zero-padding, in case the region
            # reaches out of the chromosome size

            if self.padding_value == 0.0:
                data = np.zeros((length, 2 if self.stranded else 1,
                                 len(self.condition)),
                                dtype=self.typecode)
            else:
                data = np.ones((length, 2 if self.stranded else 1,
                                len(self.condition)),
                               dtype=self.typecode) * self.padding_value

            ref_start, ref_end, array_start, array_end = self._get_indices(interval, data.shape[0])

            data[array_start:array_end, :, :] = self._reshape(self.handle[chrom][ref_start:ref_end],
                                                              (ref_end - ref_start,
                                                               2 if self.stranded else 1,
                                                               len(self.condition)))
            return data

        raise IndexError("Cannot interpret interval: {}".format(index))

    @property
    def condition(self):
        """condition"""
        return self._condition

    @condition.setter
    def condition(self, conditions):
        self._condition = conditions

    @property
    def resolution(self):
        """resolution"""
        return self._resolution

    @resolution.setter
    def resolution(self, value):
        if value is not None and value <= 0:
            raise ValueError('resolution>0 required')
        self._resolution = value

    def _reshape(self, data, shape):
        # shape not necessary here,
        # data should just fall through
        # (subclasses may override to densify/reshape sparse storage)
        return data

    def interval_length(self, chrom):
        """Method returns the interval lengths."""
        # extract the length by the interval length
        # or by the array shape
        locus = _str_to_iv(chrom)
        if len(locus) > 1:
            return locus[2] - locus[1]

        # NOTE(review): for a plain chromosome name this returns the
        # resolution (which may be None) -- confirm intended for callers.
        return self.resolution

    def scale_by_region_length(self):
        """ This method scales the regions by the region length ."""
        for chrom in self.handle:
            if self._full_genome_stored:
                self.handle[chrom][:] /= self.interval_length(chrom)
            else:
                # region-of-interest mode: scale each row by its own length
                for rstr, idx in self.region2index.items():
                    self.handle[chrom][idx] /= self.interval_length(rstr)

    def weighted_mean(self):
        """ Base pair resolution mean weighted by interval length
        """

        # summing the signal
        sums = [self.sum(chrom) for chrom in self.handle]
        sums = np.asarray(sums).sum(axis=0)

        # weights are determined by interval and chromosome length
        weights = [np.prod(self.handle[chrom].shape[:-1]) \
                   for chrom in self.handle]
        weights = np.asarray(weights).sum()
        return sums / weights

    def shift(self, means):
        """Centering the signal by the weighted mean"""
        #means = self.weighted_mean()

        for chrom in self.handle:
            # adjust base pair resoltion mean to interval length
            self.handle[chrom][:] -= means

    def rescale(self, scale):
        """ Method to rescale the signal """
        for chrom in self.handle:
            self.handle[chrom][:] /= scale

    def sum(self, chrom=None):
        """Sum signal across chromosomes."""
        if chrom is not None:
            # sum over all axes except the last (condition) axis
            return self.handle[chrom][:]\
                .sum(axis=tuple(range(self.handle[chrom].ndim - 1)))

        return np.asarray([self.handle[chrom][:]\
            .sum(axis=tuple(range(self.handle[chrom].ndim - 1)))
                           for chrom in self.handle])

    def weighted_sd(self):
        """ Interval scaled standard deviation """

        # summing the squared signal signal
        sums = [np.square(self.handle[chrom][:, :, :]).sum(
            axis=tuple(range(self.handle[chrom].ndim - 1))) \
            for chrom in self.handle]
        sums = np.asarray(sums).sum(axis=0)

        # weights are determined by interval and chromosome length
        weights = [np.prod(self.handle[chrom].shape[:-1]) \
                   for chrom in self.handle]
        weights = np.asarray(weights).sum()
        # NOTE(review): this is sqrt(E[X^2]) with Bessel-style weighting,
        # not variance around the mean -- appears to assume a shifted
        # (zero-mean) signal; confirm against callers.
        return np.sqrt(sums / (weights - 1.))

    @property
    def order(self):
        """order"""
        return self._order

    @order.setter
    def order(self, order):
        if order <= 0:
            raise ValueError('order>0 required')
        self._order = order

    def get_iv_end(self, end):
        """obtain the chromosome length for a given resolution."""
        return _get_iv_length(end, self.resolution)

    def get_iv_start(self, start):
        """obtain the chromosome length for a given resolution."""
        if self.resolution is None:
            return 0
        return start // self.resolution
432
def init_with_padding_value(padding_value, shape, dtype):
    """Allocate an array of ``shape``/``dtype`` pre-filled with the padding value.

    The common zero-padding case takes the cheaper np.zeros path instead
    of materializing ones and multiplying.
    """
    if padding_value != 0.0:
        return np.ones(shape, dtype) * padding_value
    return np.zeros(shape, dtype)
439
class HDF5GenomicArray(GenomicArray):
    """HDF5GenomicArray stores multi-dimensional genomic information.

    Implements GenomicArray.

    Parameters
    ----------
    gsize : GenomicIndexer or callable
        GenomicIndexer containing the genome sizes or a callable that
        returns a GenomicIndexer to enable lazy loading.
    stranded : bool
        Consider stranded profiles. Default: True.
    conditions : list(str) or None
        List of cell-type or condition labels associated with the corresponding
        array dimensions. Default: None means a one-dimensional array is produced.
    typecode : str
        Datatype. Default: 'd'.
    datatags : list(str) or None
        Tags describing the dataset. This is used to store the cache file.
    resolution : int
        Resolution for storing the genomic array. Only relevant for the use
        with Cover Datasets. Default: 1.
    order : int
        Order of the alphabet size. Only relevant for Bioseq Datasets. Default: 1.
    store_whole_genome : boolean
        Whether to store the entire genome or only the regions of interest.
        Default: True
    padding_value : float
        Padding value. Default: 0.
    cache : str or None
        Hash string of the data and parameters to cache the dataset. If None,
        caching is deactivated. Default: None.
    overwrite : boolean
        Whether to overwrite the cache. Default: False
    loader : callable or None
        Function to be called for loading the genomic array.
    collapser : None or callable
        Method to aggregate values along a given interval.
    verbose : boolean
        Verbosity. Default: False
    """

    def __init__(self, gsize,  # pylint: disable=too-many-locals
                 stranded=True,
                 conditions=None,
                 typecode='d',
                 datatags=None,
                 resolution=1,
                 order=1,
                 padding_value=0.,
                 store_whole_genome=True,
                 cache=None,
                 overwrite=False, loader=None,
                 normalizer=None,
                 collapser=None,
                 verbose=False):
        super(HDF5GenomicArray, self).__init__(stranded, conditions, typecode,
                                               resolution,
                                               order=order,
                                               padding_value=padding_value,
                                               store_whole_genome=store_whole_genome,
                                               collapser=collapser)

        # HDF5 backing requires a cache file to write to.
        if cache is None:
            raise ValueError('cache=True required for HDF format')

        gsize_ = None

        if not store_whole_genome:
            # materialize the (possibly lazy) indexer and map each region
            # string to its row index in the 'data' dataset
            gsize_ = gsize() if callable(gsize) else gsize
            self.region2index = {_iv_to_str(region.chrom,
                                            region.start,
                                            region.end): i \
                                                for i, region in enumerate(gsize_)}

        cachefile = _get_cachefile(cache, datatags, '.h5')
        # True when no cache file exists yet and data must be built
        load_from_file = _load_data(cache, datatags, '.h5')

        if load_from_file:
            if gsize_ is None:
                gsize_ = gsize() if callable(gsize) else gsize

            h5file = h5py.File(cachefile, 'w')

            if store_whole_genome:
                # one dataset per chromosome
                for region in gsize_:
                    shape = (_get_iv_length(region.length - self.order + 1, self.resolution),
                             2 if stranded else 1, len(self.condition))
                    h5file.create_dataset(str(region.chrom), shape,
                                          dtype=self.typecode,
                                          data=init_with_padding_value(padding_value,
                                                                       shape,
                                                                       self.typecode))
                    # NOTE(review): re-assigned on every loop iteration;
                    # could be set once after the loop.
                    self.handle = h5file
            else:
                # single 4D dataset: (region, bin, strand, condition)
                shape = (len(gsize_),
                         _get_iv_length(gsize_.binsize + 2*gsize_.flank - self.order + 1,
                                        self.resolution),
                         2 if stranded else 1, len(self.condition))
                h5file.create_dataset('data', shape,
                                      dtype=self.typecode,
                                      data=init_with_padding_value(padding_value,
                                                                   shape,
                                                                   self.typecode))
                self.handle = h5file
            # invoke the loader
            if loader:
                loader(self)

            # NOTE(review): get_normalizer is presumably defined elsewhere in
            # this module; also `overwrite` is accepted but unused here --
            # confirm cache invalidation happens in the caller.
            for norm in normalizer or []:
                get_normalizer(norm)(self)
            h5file.close()
        if verbose: print('reload {}'.format(cachefile))
        # reopen (or open an existing cache) in append mode for access
        h5file = h5py.File(cachefile, 'a', driver='stdio')

        self.handle = h5file
557
class NPGenomicArray(GenomicArray):
    """NPGenomicArray stores multi-dimensional genomic information.

    Implements GenomicArray.
    Parameters
    ----------
    gsize : GenomicIndexer or callable
        GenomicIndexer containing the genome sizes or a callable that
        returns a GenomicIndexer to enable lazy loading.
    stranded : bool
        Consider stranded profiles. Default: True.
    conditions : list(str) or None
        List of cell-type or condition labels associated with the corresponding
        array dimensions. Default: None means a one-dimensional array is produced.
    typecode : str
        Datatype. Default: 'd'.
    datatags : list(str) or None
        Tags describing the dataset. This is used to store the cache file.
    resolution : int
        Resolution for storing the genomic array. Only relevant for the use
        with Cover Datasets. Default: 1.
    order : int
        Order of the alphabet size. Only relevant for Bioseq Datasets. Default: 1.
    store_whole_genome : boolean
        Whether to store the entire genome or only the regions of interest.
        Default: True
    padding_value : float
        Padding value. Default: 0.
    cache : str or None
        Hash string of the data and parameters to cache the dataset. If None,
        caching is deactivated. Default: None.
    overwrite : boolean
        Whether to overwrite the cache. Default: False
    loader : callable or None
        Function to be called for loading the genomic array.
    normalizer : callable or None
        Normalization to be applied. This argumenet can be None,
        if no normalization is applied, or a callable that takes
        a garray and returns a normalized garray.
        Default: None.
    collapser : None or callable
        Method to aggregate values along a given interval.
    verbose : boolean
        Verbosity. Default: False
    """

    def __init__(self, gsize,  # pylint: disable=too-many-locals
                 stranded=True,
                 conditions=None,
                 typecode='d',
                 datatags=None,
                 resolution=1,
                 order=1,
                 padding_value=0.0,
                 store_whole_genome=True,
                 cache=None,
                 overwrite=False, loader=None,
                 normalizer=None, collapser=None,
                 verbose=False):

        super(NPGenomicArray, self).__init__(stranded, conditions, typecode,
                                             resolution,
                                             order=order,
                                             padding_value=padding_value,
                                             store_whole_genome=store_whole_genome,
                                             collapser=collapser)

        gsize_ = None

        if not store_whole_genome:

            # materialize the (possibly lazy) indexer and map each region
            # string to its row index in the 'data' array
            gsize_ = gsize() if callable(gsize) else gsize

            self.region2index = {_iv_to_str(region.chrom,
                                            region.start,
                                            region.end): i \
                                                for i, region in enumerate(gsize_)}

        cachefile = _get_cachefile(cache, datatags, '.npz')
        # True when no cache file exists yet and data must be built
        load_from_file = _load_data(cache, datatags, '.npz')

        if load_from_file:
            if gsize_ is None:
                gsize_ = gsize() if callable(gsize) else gsize

            if store_whole_genome:
                # one padded array per chromosome
                data = {str(region.chrom): init_with_padding_value(
                    padding_value,
                    shape=(_get_iv_length(region.length - self.order + 1,
                                          self.resolution),
                           2 if stranded else 1,
                           len(self.condition)),
                    dtype=self.typecode) for region in gsize_}
                names = [str(region.chrom) for region in gsize_]
                self.handle = data
            else:
                # single 4D array: (region, bin, strand, condition)
                data = {'data': init_with_padding_value(
                    padding_value,
                    shape=(len(gsize_),
                           _get_iv_length(gsize_.binsize + 2*gsize_.flank - self.order + 1,
                                          self.resolution) if self.resolution is not None else 1,
                           2 if stranded else 1,
                           len(self.condition)),
                    dtype=self.typecode)}
                names = ['data']
                self.handle = data

            # invoke the loader
            if loader:
                loader(self)

            # persist the freshly built arrays if caching is enabled
            if cachefile is not None:
                np.savez(cachefile, **data)

        if cachefile is not None:
            if verbose: print('reload {}'.format(cachefile))
            data = np.load(cachefile)
            names = [x for x in data]

        # here we get either the freshly loaded data or the reloaded
        # data from np.load.
        self.handle = {key: data[key] for key in names}

        # NOTE(review): get_normalizer is presumably defined elsewhere in
        # this module; `overwrite` is accepted but unused here.
        for norm in normalizer or []:
            get_normalizer(norm)(self)
686
class SparseGenomicArray(GenomicArray):
    """SparseGenomicArray stores multi-dimensional genomic information.

    Implements GenomicArray.

    The per-chromosome (or per-region) signal is held as scipy sparse
    matrices: a dok_matrix while the loader fills it, converted to csr
    for fast row slicing afterwards. The (position, strand, condition)
    axes are flattened into the 2D sparse layout
    (positions, strand * conditions) for whole-genome storage, and
    (regions, positions * strand * conditions) otherwise.

    Parameters
    ----------
    gsize : GenomicIndexer or callable
        GenomicIndexer containing the genome sizes or a callable that
        returns a GenomicIndexer to enable lazy loading.
    stranded : bool
        Consider stranded profiles. Default: True.
    conditions : list(str) or None
        List of cell-type or condition labels associated with the corresponding
        array dimensions. Default: None means a one-dimensional array is produced.
    typecode : str
        Datatype. Default: 'd'.
    datatags : list(str) or None
        Tags describing the dataset. This is used to store the cache file.
    resolution : int
        Resolution for storing the genomic array. Only relevant for the use
        with Cover Datasets. Default: 1.
    order : int
        Order of the alphabet size. Only relevant for Bioseq Datasets. Default: 1.
    store_whole_genome : boolean
        Whether to store the entire genome or only the regions of interest.
        Default: True
    padding_value : float
        Padding value. Default: 0.
    cache : str or None
        Hash string of the data and parameters to cache the dataset. If None,
        caching is deactivated. Default: None.
    overwrite : boolean
        Whether to overwrite the cache. Default: False
    loader : callable or None
        Function to be called for loading the genomic array.
    collapser : None or callable
        Method to aggregate values along a given interval.
    verbose : boolean
        Verbosity. Default: False

    Note
    ----
    Unlike the dense GenomicArray variants, this class takes no
    ``normalizer`` argument (see the sparse branch of
    ``create_genomic_array``, which does not forward one).
    """

    def __init__(self, gsize,  # pylint: disable=too-many-locals
                 stranded=True,
                 conditions=None,
                 typecode='d',
                 datatags=None,
                 resolution=1,
                 order=1,
                 store_whole_genome=True,
                 cache=None,
                 padding_value=0.0,
                 overwrite=False,
                 loader=None,
                 collapser=None,
                 verbose=False):
        super(SparseGenomicArray, self).__init__(stranded, conditions,
                                                 typecode,
                                                 resolution,
                                                 order=order,
                                                 padding_value=padding_value,
                                                 store_whole_genome=store_whole_genome,
                                                 collapser=collapser)

        # cachefile is the .npz location (None disables caching);
        # load_from_file is True when the data must be (re)built by the
        # loader rather than reloaded from an existing cache.
        cachefile = _get_cachefile(cache, datatags, '.npz')
        load_from_file = _load_data(cache, datatags, '.npz')

        gsize_ = None

        if not store_whole_genome:

            # materialize the (possibly lazy) indexer; region order fixes
            # the row index of each region in the 'data' matrix below
            gsize_ = gsize() if callable(gsize) else gsize

            self.region2index = {_iv_to_str(region.chrom,
                                            region.start,
                                            region.end): i \
                                                for i, region in enumerate(gsize_)}

        if load_from_file:
            if gsize_ is None:
                gsize_ = gsize() if callable(gsize) else gsize

            if store_whole_genome:
                # one dok matrix per chromosome:
                # rows = binned positions, cols = strand x condition
                # NOTE(review): uses the local `resolution` here but
                # `self.resolution` in the else-branch — confirm both are
                # always identical at this point.
                data = {str(region.chrom): sparse.dok_matrix(
                    (_get_iv_length(region.length - self.order + 1,
                                    resolution),
                     (2 if stranded else 1) * len(self.condition)),
                    dtype=self.typecode)
                        for region in gsize_}
            else:
                # single dok matrix:
                # rows = regions, cols = position x strand x condition
                data = {'data': sparse.dok_matrix(
                    (len(gsize_),
                     (_get_iv_length(gsize_.binsize + 2*gsize_.flank - self.order + 1,
                                     self.resolution) if self.resolution is not None else 1) *
                     (2 if stranded else 1) * len(self.condition)),
                    dtype=self.typecode)}
            self.handle = data

            # invoke the loader
            if loader:
                loader(self)

            data = self.handle

            # dok -> coo to obtain (data, row, col) triplets for caching
            data = {chrom: data[chrom].tocoo() for chrom in data}

            # each matrix is serialized as an (nnz, 3) array of
            # [value, row, col] rows
            storage = {chrom: np.column_stack([data[chrom].data,
                                               data[chrom].row,
                                               data[chrom].col]) \
                                               for chrom in data}
            # chromosome lengths are cached alongside so the matrix shapes
            # can be reconstructed on reload
            for region in gsize_:
                if store_whole_genome:
                    storage[region.chrom + '__length__'] = region.length

            names = [name for name in storage]

            if cachefile is not None:
                np.savez(cachefile, **storage)

        if cachefile is not None:
            # reload from the cache that was either just written above or
            # produced by a previous run
            if verbose: print('reload {}'.format(cachefile))
            storage = np.load(cachefile)

        # matrix entries only; the '__length__' keys are shape metadata
        names = [name for name in storage if '__length__' not in name]

        if store_whole_genome:
            # rebuild one csr matrix per chromosome from the cached triplets
            self.handle = {name: sparse.coo_matrix(
                (storage[name][:, 0],
                 (storage[name][:, 1].astype('int'),
                  storage[name][:, 2].astype('int'))),
                shape=(_get_iv_length(storage[str(name)+'__length__'], resolution),
                       (2 if stranded else 1) * len(self.condition))).tocsr()
                           for name in names}
        else:
            # gsize_ is always available for store_whole_genome=False
            # NOTE(review): the column count here omits the
            # `- self.order + 1` term that the build branch above applies
            # to binsize + 2*flank — for order > 1 the reloaded shape may
            # disagree with the saved matrix; confirm intended.
            self.handle = {name: sparse.coo_matrix(
                (storage[name][:, 0],
                 (storage[name][:, 1].astype('int'),
                  storage[name][:, 2].astype('int'))),
                shape=(len(gsize_),
                       (_get_iv_length(gsize_.binsize + 2*gsize_.flank, resolution)
                        if self.resolution is not None else 1) *
                       (2 if stranded else 1) * len(self.condition))).tocsr()
                           for name in names}

    def _reshape(self, data, shape):
        """Densify a sparse slice and unflatten it to (pos, strand, cond)."""
        # what to do with zero padding
        data = data.toarray()

        if self._full_genome_stored:
            # rows are positions; split columns into (strand, condition)
            return data.reshape(data.shape[0], data.shape[1]//(shape[-1]), shape[-1])
        else:
            # single region row; split columns into (pos, strand, condition)
            return data.reshape(data.shape[1]//(shape[-2]*shape[-1]), shape[-2], shape[-1])

    def _setitem(self, interval, condition, length, value):
        """Write a dense (pos, strand, cond) value block into the sparse store.

        Only strictly positive entries are written, preserving sparsity
        (zero entries are implicit in the sparse representation).
        """
        if not self._full_genome_stored:
            # locate the row of this region in the flat 'data' matrix
            regidx = self.region2index[_iv_to_str(interval.chrom, interval.start, interval.end)]
            nconditions = len(self.condition)
            # column stride per position: strands x conditions
            # (value.shape[-1] is the condition axis of the incoming block)
            ncondstrand = len(self.condition) * value.shape[-1]
            #end = end - self.order + 1
            idxs = np.where(value > 0)
            for idx in zip(*idxs):
                # flatten (position, strand, condition) into the column index
                basepos = idx[0] * ncondstrand
                strand = idx[1] * nconditions
                cond = condition if isinstance(condition, int) else idx[2]
                self.handle['data'][regidx,
                                    basepos + strand + cond] = value[idx]
        else:
            # map the interval onto chromosome coordinates; array_start is
            # the offset of the stored block within `value`
            ref_start, ref_end, array_start, _ = self._get_indices(interval, value.shape[0])
            idxs = np.where(value > 0)
            iarray = np.arange(ref_start, ref_end)
            for idx in zip(*idxs):
                cond = condition if isinstance(condition, int) else idx[2]
                self.handle[interval.chrom][iarray[idx[0]],
                                            idx[1] * len(self.condition)
                                            + cond] = value[idx[0] + array_start][idx[1:]]
867
868
class PercentileTrimming(object):
    """Percentile trimming normalization.

    Caps extreme values of a GenomicArray to alleviate the effect of
    outliers: for each condition, any entry exceeding the value at the
    given percentile (computed over all chromosomes jointly) is set to
    that percentile value.

    Parameters
    ----------
    percentile : float
        Percentile at which to perform chromosome-level trimming.
    """
    def __init__(self, percentile):
        self.percentile = percentile

    def __call__(self, garray):

        # pool all chromosome tracks and derive one cutoff per condition
        # (conditions live on the last axis)
        pooled = np.concatenate(np.asarray([garray.handle[chrom] for
                                            chrom in garray.handle]), axis=0)
        cutoffs = np.percentile(pooled, self.percentile, axis=(0, 1))

        # clip every condition track at its cutoff, chromosome by chromosome
        for icond, cutoff in enumerate(cutoffs):
            for chrom in garray.handle:
                track = garray.handle[chrom][:, :, icond]
                track[track > cutoff] = cutoff
                garray.handle[chrom][:, :, icond] = track
        return garray

    def __str__(self):  # pragma: no cover
        return 'PercentileTrimming({})'.format(self.percentile)

    def __repr__(self):  # pragma: no cover
        return str(self)
902
903
904
class RegionLengthNormalization(object):
    """ Normalization for variable-region length.

    Performs region length normalization of a GenomicArray, which is
    relevant when genomic features are of variable size, e.g. enhancer
    regions of different width or variable length genes.

    Parameters
    ----------
    regionmask : str or GenomicIndexer, None
        A bed file or a genomic indexer that contains the masking region
        that is considered for the signal. For instance, when normalizing
        gene expression to TPM, the mask contains exons. Otherwise, the
        TPM would normalize for the full length gene annotation.
        If None, no mask is included.
    """
    def __init__(self, regionmask=None):
        # NOTE(review): regionmask is only stored here; presumably it is
        # consumed elsewhere by the garray — confirm against callers.
        self.regionmask = regionmask

    def __call__(self, garray):
        # delegate the actual length scaling to the genomic array itself
        garray.scale_by_region_length()
        return garray

    def __str__(self):  # pragma: no cover
        return 'RegionLengthNormalization({})'.format(self.regionmask)

    def __repr__(self):  # pragma: no cover
        return str(self)
935
936
937
class ZScore(object):
    """ZScore normalization.

    Performs ZScore normalization of a GenomicArray, automatically
    adjusting for variable interval lengths.

    Parameters
    ----------
    mean : float or None
        Provided mean will be applied for zero-centering.
        If None, the mean will be determined
        from the GenomicArray and then applied.
        Default: None.
    std : float or None
        Provided standard deviation will be applied for scaling.
        If None, the std will be determined
        from the GenomicArray and then applied.
        Default: None.
    """
    def __init__(self, mean=None, std=None):
        self.mean = mean
        self.std = std

    def __call__(self, garray):

        # estimate the per-condition mean from the data unless supplied
        if self.mean is None:
            self.mean = garray.weighted_mean()

        # zero-center the signal
        garray.shift(self.mean)

        # estimate the per-condition standard deviation unless supplied
        if self.std is None:
            self.std = garray.weighted_sd()

        # scale to unit standard deviation
        garray.rescale(self.std)

        return garray

    def __str__(self):  # pragma: no cover
        return 'ZScore({},{})'.format(self.mean, self.std)

    def __repr__(self):  # pragma: no cover
        return str(self)
983
984
985
class LogTransform(object):
    """Log transformation of input signal.

    Applies log(x + 1.) to every chromosome track of a GenomicArray;
    the pseudo-count of one avoids NaNs arising from zero entries.
    """
    def __init__(self):
        pass

    def __call__(self, garray):

        # transform each chromosome track in place
        for chrom in garray.handle:
            track = garray.handle[chrom]
            track[:] = np.log(track[:] + 1.)
        return garray

    def __str__(self):  # pragma: no cover
        return 'Log'

    def __repr__(self):  # pragma: no cover
        return str(self)
1006
1007
1008
class ZScoreLog(object):
    """ZScore normalization after log transformation.

    Chains a LogTransform (log(x + 1.) to avoid NaNs from zeros)
    with ZScore normalization, automatically adjusting for variable
    interval lengths.

    Parameters
    ----------
    mean : float or None
        Provided mean will be applied for zero-centering.
        If None, the mean will be determined
        from the GenomicArray and then applied.
        Default: None.
    std : float or None
        Provided standard deviation will be applied for scaling.
        If None, the std will be determined
        from the GenomicArray and then applied.
        Default: None.
    """
    def __init__(self, mean=None, std=None):
        self.logtr = LogTransform()
        self.zscore = ZScore(mean, std)

    def __call__(self, garray):
        # log-transform first, then z-score the transformed signal
        transformed = self.logtr(garray)
        return self.zscore(transformed)

    def __str__(self):  # pragma: no cover
        return str(self.zscore) + str(self.logtr)

    def __repr__(self):  # pragma: no cover
        return str(self)
1040
1041
1042
def normalize_garray_tpm(garray):
    """Perform TPM normalization for a given GenomicArray.

    The signal is first normalized by region length (in kb), then
    divided by the total signal in millions.
    """

    # normalize by region length in bp
    garray = RegionLengthNormalization()(garray)

    # convert the length unit from bp to kb
    garray.rescale(1e-3)

    # scaling factor: per-chromosome sums, summed across chromosomes,
    # expressed in millions
    scale = garray.sum().sum(axis=0)
    scale /= 1e6

    # divide by the scaling factor
    garray.rescale(scale)

    return garray
1063
1064
1065
def get_normalizer(normalizer):
    """Map a built-in normalizer name to the corresponding
    normalizer object, or pass a custom callable through unchanged.

    Raises a ValueError for unknown normalizer names.
    """

    # custom normalizers are used as-is
    if callable(normalizer):
        return normalizer

    # built-in normalizers, constructed lazily by name
    factories = {
        'zscore': lambda: ZScore(),
        'zscorelog': lambda: ZScoreLog(),
        'binsizenorm': lambda: RegionLengthNormalization(),
        'perctrim': lambda: PercentileTrimming(99),
        'tpm': lambda: normalize_garray_tpm,
    }

    factory = factories.get(normalizer) if isinstance(normalizer, str) else None
    if factory is not None:
        return factory()

    raise ValueError('unknown normalizer: {}'.format(normalizer))
1083
1084
1085
def create_genomic_array(chroms, stranded=True, conditions=None, typecode='float32',
                         storage='hdf5', resolution=1,
                         order=1,
                         padding_value=0.0,
                         store_whole_genome=True,
                         datatags=None, cache=None, overwrite=False,
                         loader=None,
                         normalizer=None, collapser=None,
                         verbose=False):
    """Factory function for creating a GenomicArray.

    This function creates a genomic array for a given storage mode.

    Parameters
    ----------
    chroms : dict
        Dictionary with chromosome names as keys and chromosome lengths
        as values.
    stranded : bool
        Consider stranded profiles. Default: True.
    conditions : list(str) or None
        List of cell-type or condition labels associated with the corresponding
        array dimensions. Default: None means a one-dimensional array is produced.
    typecode : str
        Datatype. Default: 'float32'.
    storage : str
        Storage type can be 'ndarray', 'hdf5' or 'sparse'.
        Numpy loads the entire dataset into the memory. HDF5 keeps
        the data on disk and loads the mini-batches from disk.
        Sparse maintains sparse matrix representation of the dataset
        in the memory.
        Usage of numpy will require high memory consumption, but allows fast
        slicing operations on the dataset. HDF5 requires low memory consumption,
        but fetching the data from disk might be time consuming.
        sparse will be a good compromise if the data is indeed sparse. In this
        case, memory consumption will be low while slicing will still be fast.
    datatags : list(str) or None
        Tags describing the dataset. This is used to store the cache file.
    resolution : int
        Resolution for storing the genomic array. Only relevant for the use
        with Cover Datasets. Default: 1.
    order : int
        Order of the alphabet size. Only relevant for Bioseq Datasets. Default: 1.
    padding_value : float
        Padding value. Default: 0.0
    store_whole_genome : boolean
        Whether to store the entire genome or only the regions of interest.
        Default: True
    cache : str or None
        Hash string of the data and parameters to cache the dataset. If None,
        caching is deactivated. Default: None.
    overwrite : boolean
        Whether to overwrite the cache. Default: False
    loader : callable or None
        Function to be called for loading the genomic array.
    normalizer : callable or None
        Normalization to be applied. This argument can be None,
        if no normalization is applied, or a callable that takes
        a garray and returns a normalized garray.
        Default: None.
    collapser : str, callable or None
        Collapse method defines how the signal is aggregated for resolution>1 or resolution=None.
        For example, by summing the signal over a given interval.
    verbose : boolean
        Verbosity. Default: False

    Raises
    ------
    ValueError
        If a collapser is required (resolution=None or resolution>1) but
        missing, or if the storage type is unknown.
    """

    # a collapser is mandatory whenever multiple positions are aggregated
    # into one bin (resolution>1) or an entire interval is collapsed
    # (resolution=None)
    if (resolution is None or resolution > 1) and collapser is None:
        raise ValueError('Requiring collapser due to resolution=None or resolution>1')

    # force store_whole_genome=False if resolution=None
    if resolution is None and store_whole_genome:
        # fixed: the two string fragments previously concatenated without a
        # separating space ("compatiblewith")
        print('store_whole_genome=True ignored, because it is not compatible '
              'with resolution=None. store_whole_genome=False is used instead.')
        store_whole_genome = False

    if storage == 'hdf5':
        return HDF5GenomicArray(chroms, stranded=stranded,
                                conditions=conditions,
                                typecode=typecode,
                                datatags=datatags,
                                resolution=resolution,
                                order=order,
                                store_whole_genome=store_whole_genome,
                                cache=cache,
                                padding_value=padding_value,
                                overwrite=overwrite,
                                loader=loader,
                                normalizer=normalizer,
                                collapser=get_collapser(collapser),
                                verbose=verbose)
    elif storage == 'ndarray':
        return NPGenomicArray(chroms, stranded=stranded,
                              conditions=conditions,
                              typecode=typecode,
                              datatags=datatags,
                              resolution=resolution,
                              order=order,
                              store_whole_genome=store_whole_genome,
                              cache=cache,
                              padding_value=padding_value,
                              overwrite=overwrite,
                              loader=loader,
                              normalizer=normalizer,
                              collapser=get_collapser(collapser),
                              verbose=verbose)
    elif storage == 'sparse':
        # note: SparseGenomicArray takes no normalizer argument, so it is
        # deliberately not forwarded here
        return SparseGenomicArray(chroms, stranded=stranded,
                                  conditions=conditions,
                                  typecode=typecode,
                                  datatags=datatags,
                                  resolution=resolution,
                                  order=order,
                                  store_whole_genome=store_whole_genome,
                                  cache=cache,
                                  padding_value=padding_value,
                                  overwrite=overwrite,
                                  loader=loader,
                                  collapser=get_collapser(collapser),
                                  verbose=verbose)

    # ValueError (a subclass of Exception, so existing handlers still match)
    # instead of a bare Exception, matching the file's error convention
    raise ValueError("Storage type must be 'hdf5', 'ndarray' or 'sparse'")