Switch to unified view

a b/singlecellmultiomics/features/features.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
import numpy as np
4
import gzip
5
import itertools
6
import re
7
import functools
8
import pysam
9
from singlecellmultiomics.utils import Prefetcher
10
from copy import copy
11
import collections
12
import pandas as pd
13
14
def get_gene_id_to_gene_name_conversion_table(annotation_path_exons,
15
                                              featureTypes=['gene_name']):
16
    """Create a dictionary converting a gene id to other gene features,
17
        such as gene_name/gene_biotype etc.
18
19
    Arguments:
20
        annotation_path_exons(str) : path to GTF file (can be gzipped)
21
        featureTypes(list) : list of features to convert to, for example ['gene_name','gene_biotype']
22
23
    Returns:
24
        conversion_dict(dict) : { gene_id : 'firstFeature_secondFeature'}
25
    """
26
    conversion_table = {}
27
    with (gzip.open(annotation_path_exons, 'rt') if annotation_path_exons.endswith('.gz') else open(annotation_path_exons, 'r')) as t:
28
        for i, line in enumerate(t):
29
            parts = line.rstrip().split(None, 8)
30
            keyValues = {}
31
            for part in parts[-1].split(';'):
32
                kv = part.strip().split()
33
                if len(kv) == 2:
34
                    key = kv[0]
35
                    value = kv[1].replace('"', '')
36
                    keyValues[key] = value
37
            # determine the conversion name:
38
            if 'gene_id' in keyValues and any(
39
                    [feat in keyValues for feat in featureTypes]):
40
                conversion_table[keyValues['gene_id']] = '_'.join([
41
                    keyValues.get(feature, 'None')
42
                    for feature in featureTypes])
43
44
    return conversion_table
45
46
47
class FeatureContainer(Prefetcher):
48
49
    def __init__(self, verbose=False):
50
        self.args = locals().copy()
51
        self.startCoordinates = {}  # dict of np.array()
52
        self.features = {}
53
        self.endCoordinates = {}
54
        self.sorted = True  # Flag containing if the features are all coordinate sorted
55
        # When set to true, the class will be (very) verbose
56
        self.debug = False
57
        self.remapKeys = {}  # {'chrMT':'chrM','MT':'chrM'}  Use this to convert chromosome names between the GTF and requested locations
58
        self.verbose = verbose
59
        self.preload_list = []
60
61
    def debugMsg(self, msg):
62
        if self.verbose:
63
            print(msg)
64
65
    def __repr__(self):
66
        s= 'FeatureContainer,'
67
        if len(self.preload_list):
68
            s+= ' Preloaded files:\n'
69
            for f in self.preload_list:
70
                s+=str(f)+'\n'
71
        if len(self.features):
72
            s+=f'Features in memory for {len(self.features)} contigs\n'
73
        else:
74
            s+='No features in memory\n'
75
        return s
76
77
    def __len__(self):
78
        return sum(len(f) for f in self.features.values())
79
80
    def preload_GTF(self, **kwargs):
81
        self.preload_list.append( {'gtf':kwargs} )
82
83
    def get_gene_to_location_dict(self, meta_key='gene_name', with_strand=False):
84
        """
85
        generate dictionary, {gene_name: contig,start,end}
86
87
        Args:
88
            meta_key(str): key of the meta information used to use as primary key for the returned gene_locations
89
90
        Returns:
91
            gene_locations(dict)
92
        """
93
        location_map = {}
94
95
        for contig, start, end, name, strand, meta in self:
96
            meta =dict(meta)
97
            try:
98
                if with_strand:
99
                    location_map[ meta[meta_key]] = (contig, start,end,strand)
100
                else:
101
                    location_map[ meta[meta_key]] = (contig, start,end)
102
            except Exception as e:
103
                pass
104
105
        return location_map
106
107
108
109
    def __iter__(self):
110
        for contig, contig_features in self.features.items():
111
            for feature in contig_features:
112
                yield (contig,)+ feature
113
114
115
    def instance(self, arg_update):
116
        if 'self' in self.args:
117
            del self.args['self']
118
        clone = FeatureContainer(**self.args)
119
        for cmd in self.preload_list:
120
            for preload_type, kwargs in cmd.items():
121
                kwargs_copy = copy(kwargs)
122
                kwargs_copy.update(arg_update)
123
                if preload_type=='gtf':
124
                    clone.loadGTF(**kwargs_copy)
125
                else:
126
                    raise ValueError()
127
        return clone
128
129
130
    def prefetch(self, contig, start, end):
131
        return self.instance( {'contig':contig, 'region_start':start,  'region_end':end})
132
133
134
    def loadGTF(self, path, thirdOnly=None, identifierFields=['gene_id'],
135
                ignChr=False, select_feature_type=None, exon_select=None,
136
                head=None, store_all=False, contig=None, offset=-1,
137
                region_start=None, region_end=None):
138
        """Load annotations from a GTF file.
139
        ignChr: ignore the chr part of the Annotation chromosome
140
        """
141
        if region_end is not None or region_start is not None:
142
            assert contig is not None and region_end is not None and region_start is not None
143
144
        self.loadedGtfFeatures = thirdOnly
145
        #pattern = '^(.*) "(.*).*"'
146
        #prog = re.compile(pattern)
147
        if self.verbose:
148
            if contig is None:
149
                print(f"Loading {path} completely")
150
            elif region_start is None:
151
                print(f"Loading {path}, for contig {contig}")
152
            else:
153
                print(f"Loading {path}, for contig {contig}:{region_start}-{region_end}")
154
        added = 0
155
        is_gff= False
156
        with (gzip.open(path, 'rt') if path.endswith('.gz') else open(path, 'r')) as f:
157
            for line_id, line in enumerate(f):
158
                if head is not None and added > head:
159
                    break
160
                if line[0] == '#':
161
                    if line.startswith('##gff-version 3'):
162
                        is_gff = True
163
                    continue
164
                parts = line.rstrip().split(None, 8)
165
166
                chrom = parts[0]
167
                if contig is not None and chrom != contig:
168
                    continue
169
170
                if thirdOnly is not None:
171
                    if parts[2] not in thirdOnly:
172
                        continue
173
174
                # Example line: (gene)
175
                # 1  havana  gene    11869   14409   .   +   .   gene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; havana_gene "OTTHUMG00000000961"; havana_gene_version "2";
176
                # Example line (exon)
177
                # 1  havana  exon    11869   12227   .   +   .   gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_number "1"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; havana_gene "OTTHUMG00000000961"; havana_gene_version "2"; transcript_name "DDX11L1-002"; transcript_source "havana"; transcript_biotype "processed_transcript"; havana_transcript "OTTHUMT00000362751"; havana_transcript_version "1"; exon_id "ENSE00002234944"; exon_version "1"; tag "basic"; transcript_support_level "1";
178
                #Part, info
179
                # 0  Chromosome
180
                # 1  Source
181
                # 2  Feature type (matches thirdOnly)
182
                # 3 Feature Start
183
                # 4 Feature End
184
                # 5 .
185
                # 6 Strand
186
                # 7 .
187
                # 8 KEY "VALUE";
188
                #keyValues = { part.strip().split()[0]:part.strip().split()[1].replace('"','') for part in parts[-1].split(';') }
189
190
                #keyValues = {i.group(1) : i.group(2) for i in (prog.match(j) for j in parts[-1].split('; '))}
191
                if select_feature_type is not None and not parts[2] in select_feature_type:
192
                    continue
193
194
                exon = parts[7]
195
                if exon_select is not None and exon not in exon_select:
196
                    continue
197
198
                keyValues = {}
199
                for part in parts[-1].split(';'):
200
                    if is_gff:
201
                        kv = part.strip().split('=',1)
202
                    else:
203
                        kv = part.strip().split()
204
                    if len(kv) == 2:
205
                        key = kv[0]
206
                        value = kv[1].replace('"', '')
207
                        keyValues[key] = value
208
209
                #self.addFeature( self.remapKeys.get(parts[0], parts[0]), int(parts[3]), int(parts[4]), parts[9].replace('"','').replace(';',''))
210
211
                chrom = self.remapKeys.get(chrom, chrom)
212
                chromosome = chrom if ignChr == False else chrom.replace(
213
                    'chr', '')
214
215
                if identifierFields is None:
216
                    if parts[2] == 'exon':
217
                        featureName = keyValues['exon_id']
218
                        #featureName = ','.join([keyValues['exon_id'],keyValues['transcript_id']])
219
                    elif parts[2] == 'gene':
220
                        featureName = keyValues['gene_id']
221
                    elif parts[2] == 'transcript':
222
                        featureName = keyValues['transcript_id']
223
                    else:
224
                        featureName = ','.join(
225
                            [parts[2], parts[3], parts[4], keyValues['transcript_id']])
226
                else:
227
                    featureName = ','.join(
228
                        [keyValues.get(i, 'none') for i in identifierFields if i in keyValues])
229
230
231
                start = int( parts[3] ) + offset
232
                end = int( parts[4] ) + offset
233
234
                if region_end is not None and region_start is not None and ( end<region_start or start>region_end):
235
                    continue
236
237
                if store_all:
238
                    keyValues['type'] = parts[2]
239
                    self.addFeature(
240
                        self.remapKeys.get(
241
                            chromosome, chromosome),start,end, strand=parts[6], name=featureName, data=tuple(
242
                            keyValues.items()))
243
244
                else:
245
                    self.addFeature(
246
                        self.remapKeys.get(
247
                            chromosome, chromosome), start,end, strand=parts[6], name=featureName, data=','.join(
248
                            (':'.join(
249
                                ('type', parts[2])), ':'.join(
250
                                ('gene_id', keyValues['gene_id'])))))
251
                added += 1
252
253
            if self.verbose:
254
                print("Loaded %s features, now sorting" %
255
                      sum([len(self.features[c]) for c in self.features]))
256
            self.sort()
257
            if self.verbose:
258
                print("done sorting")
259
        if self.verbose:
260
            print("The following chromosomes are available:")
261
            print(', '.join(sorted(list(self.startCoordinates.keys()))))
262
263
    def annotateUTRs(self, utrs=['three_prime_utr', 'five_prime_utr']):
264
        """flag the exons that contain a utr"""
265
266
        chromosomes = self.features.keys()
267
        typeRegex = re.compile('.*type:([^,]*).*')
268
269
        for c in chromosomes:
270
            print('chromosome:%s' % (c))
271
            types = np.array([typeRegex.match(f[-1]).group(1)
272
                              for f in self.features[c]])
273
            featStartEndStrand = np.array([[f[0] for f in self.features[c]], [
274
                                          f[1] for f in self.features[c]], [f[3] for f in self.features[c]]])
275
276
            fe = np.where(types == 'exon')[0]
277
            names = np.array([f[2] for f in self.features[c]])[fe]
278
            starts = featStartEndStrand[0, :][fe]
279
280
            for utr in utrs:
281
                isUtrF = 'is_' + utr + ':False'
282
                isUtrT = 'is_' + utr + ':True'
283
                utrRegex = re.compile(isUtrF)
284
285
                for position in fe:
286
                    (hitStart, hitEnd, name, hitStrand,
287
                     data) = self.features[c][position]
288
                    data = ','.join((data, isUtrF))
289
                    self.features[c][position] = (
290
                        hitStart, hitEnd, name, hitStrand, data)
291
                lu = np.where(types == utr)[0]
292
                if lu.size:
293
                    foo, idx = np.unique(
294
                        featStartEndStrand[:, lu], axis=1, return_index=True)
295
                    lu = lu[idx]
296
297
                    for i in lu:
298
                        hits = self.findFeaturesBetween(
299
                            c, self.features[c][i][0], self.features[c][i][0], self.features[c][i][3])
300
                        hitNames = np.array([h[2] for h in hits])
301
                        hitStarts = np.array([h[0] for h in hits])
302
                        for index, n in enumerate(hitNames):
303
                            fl = starts == hitStarts[index]
304
                            fn = names[fl] == n
305
                            positions = fe[fl][fn]
306
                            for position in positions:
307
                                (hitStart, hitEnd, name, hitStrand,
308
                                 data) = self.features[c][position]
309
                                data = utrRegex.sub(isUtrT, data)
310
                                self.features[c][position] = (
311
                                    hitStart, hitEnd, name, hitStrand, data)
312
313
                        # hitTypes = np.array([typeRegex.match(h[-1]).group(1) for h in hits])#np.array([h[-1]['type'] for h in hits])
314
                        #hitNames = np.array([h[2] for h in hits])
315
                        #fh = hitTypes=='exon'
316
317
                        # if np.any(fh):
318
                        #   hitNames=np.unique(hitNames[fh])
319
                        #   for n in hitNames:
320
                        #       fn = names==n
321
                        #       positions = fe[fn]
322
                        #       for position in positions:
323
                        #           (hitStart, hitEnd, name, hitStrand, data) = self.features[c][position]
324
                        #           data = utrRegex.sub(isUtrT,data)
325
                        #           self.features[c][position] = (hitStart, hitEnd, name, hitStrand, data)
326
327
    def loadBED(self, path, ignChr=False, parseBlocks=True):
328
        """Load UCSC based table. """
329
330
        """
331
        parseBlocks ; parse the defined blocks as separate features
332
        """
333
        with (gzip.open(path, 'rt') if '.gz' in path else open(path, 'r')) as f:
334
            # chr1   14662   187832  calJac3:ENSCJAT00000061222.1-1.1    943 -
335
            # 187832  187832  0   9   5,129,69,110,42,23,133,202,78,
336
            # 0,38,307,1133,1243,1944,1970,172713,173092,
337
            for line_idx,line in enumerate(f):
338
                if line.split()[0] == "track":
339
                    continue
340
                blockAvail = False
341
342
                parts = line.strip().split()
343
                strand = None
344
                name = None
345
                score = None
346
                if len(parts)==12:
347
                    chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRGB, blockCount, blockSizes, blockStarts = parts
348
                    blockAvail = True
349
                elif len(parts)==10:
350
                    chrom, chromStart, chromEnd, name, score, strand, itemRGB, blockCount, blockSizes, blockStarts = parts
351
                    blockAvail = True
352
                elif len(parts)==6:
353
                    chrom, chromStart, chromEnd, name, score, strand = parts
354
                elif len(parts)>=4:
355
                    chrom, chromStart, chromEnd, value = parts[:4]
356
                    name = value
357
                elif len(parts)==3:
358
                    chrom, chromStart, chromEnd = parts[:3]
359
                    name = str(line_idx)
360
                elif len(parts)==2:
361
                    chrom, chromStart = parts[:2]
362
                    chromEnd = int(chromStart)+1
363
                    name = str(line_idx)
364
                else:
365
                    raise ValueError('Could not read the supplied bed file, it has too little columns, expecting at least 3 columns: contig,start,end[,value]')
366
367
                chrom = self.remapKeys.get(chrom, chrom)
368
                chrom = chrom if ignChr == False else chrom.replace('chr', '')
369
370
                if blockAvail and parseBlocks:
371
                    blockStarts = blockStarts.split(',')
372
                    blocks = [
373
                        (int(
374
                            blockStarts[index]) +
375
                            int(chromStart),
376
                            int(blockSize)) for index,
377
                        blockSize in enumerate(
378
                            blockSizes.split(',')) if len(blockSize) > 0]
379
                    if len(blocks) != int(blockCount):
380
                        raise ValueError(
381
                            'BlockCount at line %s do not match actual amount of blocks present' %
382
                            line)
383
                else:
384
                    blocks = [
385
                        (int(chromStart), int(chromEnd) - int(chromStart),)]
386
387
                # Report very big annotations?
388
                if self.debug:
389
                    if max((e - s for s, e in blocks)) > 10000:
390
                        print('')
391
                        print(line, chromStart, chromEnd, blocks)
392
                for start, length in blocks:
393
                    self.addFeature(
394
                        chrom,
395
                        start,
396
                        start + length,
397
                        name,
398
                        strand=strand,
399
                        data=None)
400
            print("Loaded %s features, now sorting" %
401
                  sum([len(self.features[c]) for c in self.features]))
402
            self.sort()
403
            print("done sorting")
404
405
    def getReferenceList(self):
406
        return(list(self.startCoordinates.keys()))
407
408
    def addFeature(self, chromosome, start, end, name, strand=None, data=None):
409
        if strand is not None and strand not in ['+', '-']:
410
            raise ValueError('Invalid strand specified: %s' % strand)
411
        # if not isinstance(start, int) or not isinstance(end, int):
412
        #     raise ValueError('Start and end coordinates should be integers')
413
414
        if self.debug:
415
            self.debugMsg(
416
                "Adding feature: chromosome:%s, start:%s, end:%s, name:%s, strand:%s, data:%s" %
417
                (chromosome, start, end, name, strand, data))
418
419
        if chromosome not in self.features:
420
            self.features[chromosome] = list()
421
422
        self.features[chromosome].append((start, end, name, strand, data))
423
        self.sorted = False
424
425
    def getCentroids(self):
426
        centroids = {}
427
        for chromosome in self.startCoordinates:
428
            centroids[chromosome] = {}
429
430
            for start, end, name, strand, data in self.features[chromosome]:
431
                centroids[chromosome][name] = (end - start) / 2 + start
432
        return(centroids)
433
434
    def findFeaturesBetween(
435
            self,
436
            chromosome,
437
            sampleStart,
438
            sampleEnd,
439
            strand=None):
440
441
        if chromosome not in self.startCoordinates:
442
            if self.debug:
443
                self.debugMsg(
444
                    "Chromosome %s is not present in the annotations" %
445
                    chromosome)
446
            return([])
447
        startIndex = max(
448
            0,
449
            np.searchsorted(
450
                self.startCoordinates[chromosome],
451
                sampleStart,
452
                'left') - 1)
453
        startIndex = min(
454
            startIndex, max(
455
                0, np.searchsorted(
456
                    self.endCoordinates[chromosome], sampleEnd, 'left')))
457
        hits = set()
458
        x = True
459
        while(x and startIndex < len(self.features[chromosome])):
460
            d = self.features[chromosome][startIndex]
461
            (hitStart, hitEnd, name, hitStrand, data) = d
462
            if hitStart > sampleEnd:
463
                x = False
464
            else:
465
                # Does the feature overlap the sampling region
466
                if(max(sampleStart, hitStart) <= min(sampleEnd, hitEnd)):
467
                    #print(sampleStart,sampleEnd, hitStart,hitEnd, name)
468
                    # Check the strand
469
                    if strand is None or (strand == hitStrand):
470
                        hits.add(d)
471
            startIndex += 1
472
473
        hits.update(set(self.findFeaturesAt(chromosome, sampleStart, strand)))
474
        hits.update(set(self.findFeaturesAt(chromosome, sampleEnd, strand)))
475
476
        return(list(hits))
477
478
    def sort(self):
479
        """ Build coordinate sorted datastructure to perform fast lookups."""
480
        self.endIndexes = {}
481
        self.endIndexLookup = {}
482
        self.fastIndex = {}
483
        self.maxFeatureSizes = {}
484
        self.sorted = True
485
        for chromosome in self.features.keys():
486
            # Sort in place and return new indices
487
            self.features[chromosome].sort()
488
            self.startCoordinates[chromosome] = np.fromiter(
489
                (tup[0] for tup in self.features[chromosome]), dtype=np.int64)
490
            self.endCoordinates[chromosome] = np.fromiter(
491
                (tup[1] for tup in self.features[chromosome]), dtype=np.uint64)
492
            self.endIndexes[chromosome] = np.argsort(
493
                self.endCoordinates[chromosome])
494
            self.endCoordinates[chromosome] = self.endCoordinates[chromosome][self.endIndexes[chromosome]]
495
            self.endIndexLookup[chromosome] = {
496
                inR: orig for orig, inR in enumerate(
497
                    self.endIndexes[chromosome])}
498
499
            ####################### perform magic indexing ####################
500
            maxLengthFeature = np.max([tup[1] - tup[0]
501
                                       for tup in self.features[chromosome]])
502
            self.maxFeatureSizes[chromosome] = maxLengthFeature
503
504
            lowestStarts = np.fromiter(
505
                (min(
506
                    (f[0] for f in self.findFeaturesAt(
507
                        chromosome,
508
                        feature[0],
509
                        optim='nb'))) for feature in self.features[chromosome]),
510
                dtype=np.int64)
511
512
            self.fastIndex[chromosome] = np.searchsorted(
513
                self.startCoordinates[chromosome], lowestStarts, 'left')
514
            ########################
515
516
        # find the longest feature
517
518
    """Return a feature left of the lookupCoordinate"""
519
520
    def findNearestLeftFeature(
521
            self,
522
            chromosome,
523
            lookupCoordinate,
524
            strand=None):
525
        if chromosome not in self.features:
526
            return([])
527
        if not self.sorted:
528
            self.sort()
529
        """ Find closest feature left of the supplied coordinate """
530
        index = np.clip(
531
            np.searchsorted(
532
                self.endCoordinates[chromosome], lookupCoordinate, side='left'), 0, len(
533
                self.endCoordinates))
534
        self.debugMsg(
535
            "Looking up %s, Initial index is %s (start: %s, end %s)" %
536
            (lookupCoordinate,
537
             index,
538
             self.features[chromosome][index][0] if index < len(
539
                 self.features[chromosome]) else 'No feature hit',
540
                self.features[chromosome][index][1] if index < len(
541
                 self.features[chromosome]) else 'No feature hit'))
542
        #index = np.clip(index, 1, len(self.endCoordinates)-1)
543
        # Check if the index is zero, and this feature is actually more right:
544
        if index > (len(self.features[chromosome]) - 1):
545
            self.debugMsg("overflow INDEX condition, decreasing index")
546
            index -= 1
547
548
        if self.features[chromosome][index][0] > lookupCoordinate:
549
            self.debugMsg("Zero INDEX condition. Rejected feature.")
550
            return([])
551
552
        hitStrand = self.features[chromosome][index][3]
553
        # Find first feature which is on the same strand...
554
        while strand is not None and (hitStrand != strand) and index > 0:
555
            index -= 1
556
            if index > 0:
557
                hitStrand = self.features[chromosome][index][3]
558
        if index < 0:
559
            return([])
560
        return([self.features[chromosome][index]])
561
562
    def findNearestRightFeature(
563
            self,
564
            chromosome,
565
            lookupCoordinate,
566
            strand=None):
567
        if chromosome not in self.features:
568
            return([])
569
        if not self.sorted:
570
            self.sort()
571
        """ Find closest feature left of the supplied coordinate """
572
        index = np.searchsorted(
573
            self.startCoordinates[chromosome],
574
            lookupCoordinate + 1,
575
            side='left')
576
        self.debugMsg(
577
            "Looking up %s, Initial index is %s (start: %s, end %s), strand %s" %
578
            (lookupCoordinate,
579
             index,
580
             self.features[chromosome][index][0] if index < len(
581
                 self.features[chromosome]) else 'No feature hit',
582
                self.features[chromosome][index][1] if index < len(
583
                 self.features[chromosome]) else 'No feature hit',
584
                strand))
585
586
        # Check if the index is maxlen, and this feature is actually more right
587
        while not index < len(self.features[chromosome]):
588
            self.debugMsg("No feature on the right")
589
            return([])
590
            index -= 1
591
592
        self.debugMsg("Index is now %s" % index)
593
        if self.features[chromosome][index][1] < lookupCoordinate:
594
            self.debugMsg("Zero INDEX condition. Rejected feature.")
595
            return([])
596
597
        # print(self.features[chromosome][index])
598
        hitStrand = self.features[chromosome][index][3]
599
        # Find first feature which is on the same strand...
600
        while strand is not None and (hitStrand != strand):
601
            index += 1
602
            if not index < len(self.features[chromosome]):
603
                self.debugMsg("No feature on the right")
604
                return([])
605
            hitStrand = self.features[chromosome][index][3]
606
        if index < 0:
607
            return([])
608
        return([self.features[chromosome][index]])
609
610
    @functools.lru_cache(maxsize=512)
611
    def findNearestFeature(self, chromosome, lookupCoordinate, strand=None):
612
613
        s = self.findFeaturesAt(chromosome, lookupCoordinate, strand=None)
614
        if len(s):
615
            self.debugMsg(
616
                'Issued nearest feature search, but the coordinate lies within %s feature(s), returning those' %
617
                len(s))
618
            return(s)
619
620
        fr = self.findNearestRightFeature(
621
            chromosome, lookupCoordinate=lookupCoordinate, strand=strand)
622
        fl = self.findNearestLeftFeature(
623
            chromosome, lookupCoordinate=lookupCoordinate, strand=strand)
624
        self.debugMsg(
625
            'Feature R presence %s, feature L presence: %s' %
626
            (len(fr), len(fl)))
627
        if len(fr) == 0 and len(fl) == 0:
628
            self.debugMsg("Returning no hits")
629
            return([])
630
        elif len(fr) == 0:
631
            self.debugMsg("Returning Left as right is empty")
632
            return(fl)
633
        elif len(fl) == 0:
634
            self.debugMsg("Returning Right as left is empty")
635
            return(fr)
636
        else:
637
            distanceR, distanceL = fr[0][0] - \
638
                lookupCoordinate, lookupCoordinate - fl[0][1]
639
            self.debugMsg("Distances: %s and %s" % (distanceR, distanceL))
640
            if distanceR < distanceL:
641
                return([fr[0]])
642
            elif distanceL < distanceR:
643
                return([fl[0]])
644
            else:
645
                return([fl[0], fr[0]])
646
647
    @functools.lru_cache(maxsize=512)
648
    def findFeaturesAt(
649
            self,
650
            chromosome,
651
            lookupCoordinate,
652
            strand=None,
653
            optim='bdbnb'):
654
        return self._findFeaturesAt(
655
                chromosome,
656
                lookupCoordinate,
657
                strand=strand,
658
                optim=optim)
659
660
    def _findFeaturesAt(
661
            self,
662
            chromosome,
663
            lookupCoordinate,
664
            strand=None,
665
            optim='bdbnb'):
666
        if not self.sorted:
667
            self.sort()
668
        """Obtain the features at a give coordinate and optionally strand."""
669
670
        if chromosome not in self.startCoordinates:
671
            if self.debug:
672
                self.debugMsg(
673
                    "Chromosome %s is not present in the annotations" %
674
                    chromosome)
675
            return([])
676
677
        s = np.searchsorted(
678
            self.startCoordinates[chromosome],
679
            lookupCoordinate + 1,
680
            side='left')
681
        if optim == 'bdbnb':
682
            startRange = self.fastIndex[chromosome][s - 1]
683
            # We stop looking at the rightmost h
684
            endRange = min(s, len(self.features[chromosome]))
685
            if self.debug:
686
                self.debugMsg("index: %s" % self.fastIndex[chromosome])
687
                self.debugMsg("Fast index result: %s" %
688
                              self.fastIndex[chromosome][s - 1])
689
                ml = lookupCoordinate - self.maxFeatureSizes[chromosome]
690
                self.debugMsg(
691
                    "Vanilla index result: %s" %
692
                    np.searchsorted(
693
                        self.startCoordinates[chromosome],
694
                        ml,
695
                        side='left'))
696
                self.debugMsg("Start looking from %s" % startRange)
697
                self.debugMsg("To %s" % endRange)
698
699
            return([self.features[chromosome][i] for i in range(startRange, endRange) if (self.features[chromosome][i][1] >= lookupCoordinate and (strand is None or self.features[chromosome][i][3] == strand))])
700
        elif optim == 'nb':
701
            # Be smarter: take only segments where the end coordinate is bigger
702
            # than the lookupCoordinate
703
            # We start looking from the most left index possible given our
704
            # knowledge of the longest feature:
705
            ml = lookupCoordinate - self.maxFeatureSizes[chromosome]
706
            startRange = np.searchsorted(
707
                self.startCoordinates[chromosome],
708
                ml,
709
                side='left')  # Find where the left most index lies
710
            # For more optimalisation we want to skip the searchsorted and
711
            # prefetch the lookup array?
712
713
            # We stop looking at the leftmost h
714
            endRange = min(s, len(self.features[chromosome]))
715
            if self.debug:
716
                self.debugMsg("Start looking from %s" % startRange)
717
                self.debugMsg("To %s" % endRange)
718
                #self.debugMsg("start is %s"%self.startIndexLookup[chromosome][k])
719
            candidates = set(self.features[chromosome][i] for i in range(
720
                startRange, endRange) if self.features[chromosome][i][1] >= lookupCoordinate)
721
722
        elif optim == 'optim':
723
            s = np.searchsorted(
724
                self.startCoordinates[chromosome],
725
                lookupCoordinate + 1,
726
                side='left')
727
            # Be smarter: take only segments where the end coordinate is bigger
728
            # than the lookupCoordinate
729
            candidates = set(
730
                self.features[chromosome][i] for i in range(
731
                    0, min(
732
                        s, len(
733
                            self.features[chromosome]))) if self.features[chromosome][i][1] >= lookupCoordinate)
734
        else:
735
            e = np.searchsorted(
736
                self.endCoordinates[chromosome],
737
                lookupCoordinate - 1,
738
                side='right')
739
            leftSet = set(
740
                self.features[chromosome][i] for i in range(
741
                    0, min(
742
                        s, len(
743
                            self.features[chromosome]))))
744
            rightSet = set(self.features[chromosome][self.endIndexLookup[chromosome][i]] for i in range(
745
                min(e, len(self.features[chromosome])), len(self.features[chromosome])))
746
            candidates = leftSet.intersection(rightSet)
747
        return([candidate for candidate in candidates if (strand is None or candidate[3] == strand)])
748
        """
749
        if self.debug:
750
            self.debugMsg("Looking up %s %s %s Hit %s %s" % (chromosome, lookupCoordinate, strand, s, e))
751
            self.debugMsg(self.startCoordinates[chromosome])
752
        hits = []
753
        for i in range(s, self.endIndexLookup[chromosome][e] ): #min(s+1, len(self.features[chromosome]))
754
            if self.features[chromosome][i][0]<=lookupCoordinate and self.features[chromosome][i][1]>=lookupCoordinate:
755
                if self.debug:
756
                    self.debugMsg("  Strandless Hit feature %s %s %s [%s %s]" % (self.features[chromosome][i][0], self.features[chromosome][i][1], self.features[chromosome][i][2], self.features[chromosome][i][3],  self.features[chromosome][i][4]))
757
                if strand is None or strand==self.features[chromosome][i][3]:
758
                    hits.append(self.features[chromosome][i])
759
                elif self.debug:
760
                    self.debugMsg("   Strand miss %s for %s %s" % (strand, i, self.features[chromosome][i][3] ))
761
            elif self.debug:
762
                self.debugMsg("  Missed feature %s %s %s [%s %s]" % (self.features[chromosome][i][0], self.features[chromosome][i][1], self.features[chromosome][i][2], self.features[chromosome][i][3],  self.features[chromosome][i][4]))
763
                self.debugMsg("    reason:"  "%s <= coord" % self.features[chromosome][i][0] if  self.features[chromosome][i][0]<=lookupCoordinate else "%s > coord" % self.features[chromosome][i][1] )
764
        return(hits)
765
        """
766
767
    def findFeaturesAtPysamAlign(self, pysamRead, strand=None, method=1):
768
        """Obtain all features mapping the pysam aligned segment.
769
        method 0: Query EVERY base
770
        method 1: Query every subsequent block of reads (pysam aligned segment .get_blocks)
771
        """
772
        if pysamRead.reference_name not in self.startCoordinates:
773
            if self.debug:
774
                self.debugMsg(
775
                    "Chromosome %s is not present in the annotations" %
776
                    pysamRead.reference_name)
777
            return([])
778
        if method == 0:
779
            hits = set()
780
            for queryPos, referencePos in pysamRead.get_aligned_pairs(
781
                    matches_only=True, with_seq=False):
782
                hits.update(set(self.findFeaturesAt(
783
                    pysamRead.reference_name, referencePos, strand=strand)))
784
            return(hits)
785
        else:
786
            return(set(itertools.chain.from_iterable([
787
                self.findFeaturesBetween(
788
                    pysamRead.reference_name,
789
                    lookupCoordinateStart,
790
                    lookupCoordinateEnd,
791
                    strand=strand)
792
                for lookupCoordinateStart, lookupCoordinateEnd
793
                in pysamRead.get_blocks()])))
794
795
    def findFeaturesBetweenBRK(
796
            self,
797
            chromosome,
798
            lookupCoordinateStart,
799
            lookupCoordinateEnd,
800
            strand=None):
801
        """Obtain all features between Start and end coordinate."""
802
        if chromosome not in self.startCoordinates:
803
            if self.debug:
804
                self.debugMsg(
805
                    "Chromosome %s is not present in the annotations" %
806
                    chromosome)
807
            return([])
808
809
        """Find all features which are present at both the supplied start and end coordinate"""
810
        startHits = self.findFeaturesAt(
811
            chromosome, lookupCoordinateStart, strand)
812
        endHits = self.findFeaturesAt(chromosome, lookupCoordinateEnd, strand)
813
        if self.debug:
814
            self.debugMsg("Hits at %s %s %s" %
815
                          (chromosome, lookupCoordinateEnd, strand))
816
            self.debugMsg("  %s" % startHits)
817
            self.debugMsg("  %s" % endHits)
818
        return(list(set(startHits).intersection(set(endHits))))
819
820
    def loadSNPSFromVcf(self, vcfFilePath:str, locations: set=None):
821
        # Should be deprecated
822
        self.loadVariantsFromVcf(vcfFilePath,locations,snp_only=True)
823
824
    def loadVariantsFromVcf(self, vcfFilePath:str, locations: set=None, locations_only: bool=False, snp_only=False):
825
826
        for rec in pysam.VariantFile(vcfFilePath):
827
828
            if locations is None or (rec.chrom, rec.pos) in locations:
829
                if locations_only:
830
                    self.addFeature(rec.chrom, rec.pos, rec.pos+max(1,len(rec.ref)), name='variant')
831
                    continue
832
833
                for sample in rec.samples:
834
                    # get genotypes:
835
                    for allele in rec.samples[sample].alleles:
836
                        try:
837
                            self.addVariant(
838
                                chromosome=rec.chrom,
839
                                start=rec.pos,
840
                                value=allele,
841
                                name='SNP',
842
                                variantType='SNP',
843
                                end=None)
844
                        except BaseException:
845
                            if not snp_only:
846
                                self.addFeature(rec.chrom, rec.pos, rec.pos+max(1,len(rec.ref)), name='variant')
847
                            pass
848
        self.sort()
849
850
    def addVariant(
851
            self,
852
            chromosome,
853
            start,
854
            value=None,
855
            name='SNP',
856
            variantType='SNP',
857
            end=None):
858
        end = end if end is not None else start + 1
859
        if value not in ['A', 'T', 'C', 'G']:
860
            raise ValueError('%s is not a base' % value)
861
        self.addFeature(chromosome, start, end, name=name, data=('SNP', value))
862
863
864
def massIdConvert(
865
        baseIds,
866
        pathToIdMapping='/media/sf_data/references/human/HUMAN_9606_idmapping_selected.tab.gz',
867
        targetCol=1):
868
    """Convert GENE identifiers into another format.
869
    Get a conversion table from ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/
870
    """
871
    converted = {}
872
    baseIds = set(baseIds)
873
    h = gzip.open(pathToIdMapping)
874
    for l in h:
875
        line = l.decode('utf8')
876
        for identifier in baseIds:
877
            if identifier in line:
878
                convertedTo = line.split()[targetCol]
879
                if len(convertedTo):
880
                    if identifier not in converted:
881
                        converted[identifier] = []
882
                    converted[identifier].append(convertedTo)
883
884
    return(converted)
885
886
887
class FeatureAnnotatedObject():
888
889
    def __init__(self, features, stranded, capture_locations, auto_set_intron_exon_features ):
890
891
        self.features = features
892
        self.hits = collections.defaultdict(set)  # feature -> hit_bases
893
        self.stranded = stranded
894
        self.is_annotated = False
895
        self.capture_locations = capture_locations
896
        if capture_locations:
897
            self.feature_locations = {} #feature->locations (chrom,start,end, strand)
898
899
        self.junctions = set()
900
        self.genes = set()
901
        self.introns = set()
902
        self.exons = set()
903
        self.exon_hit_gene_names = set()  # readable names
904
        self.is_spliced = None
905
906
        if auto_set_intron_exon_features:
907
            self.set_intron_exon_features()
908
909
910
    def set_spliced(self, is_spliced):
911
        """ Set wether the transcript is spliced, False has priority over True """
912
        if self.is_spliced and not is_spliced:
913
            # has already been set
914
            self.is_spliced = False
915
        else:
916
            self.is_spliced = is_spliced
917
918
    def get_hit_df(self):
919
        """Obtain dataframe with hits
920
        Returns:
921
            pd.DataFrame
922
        """
923
        if not self.is_annotated:
924
            self.set_intron_exon_features()
925
926
        d = {}
927
        tabulated_hits = []
928
        for hit, locations in self.hits.items():
929
            if not isinstance(hit, tuple):
930
                continue
931
            meta = dict(list(hit))
932
            for location in locations:
933
                location_dict = {
934
                    'chromosome': location[0],
935
                    'start': location[1][0],
936
                    'end': location[1][1]}
937
                location_dict.update(meta)
938
                tabulated_hits.append(location_dict)
939
940
        return pd.DataFrame(tabulated_hits)
941
942
    def write_tags(self):
943
944
        if len(self.exons) > 0:
945
            self.set_meta('EX', ','.join(sorted([str(x) for x in self.exons])))
946
        else:
947
            self.set_meta('EX',None)
948
949
        if len(self.introns) > 0:
950
            self.set_meta('IN', ','.join(
951
                sorted([str(x) for x in self.introns])))
952
        else:
953
            self.set_meta('IN',None)
954
955
        if len(self.genes) > 0:
956
            self.set_meta('GN', ','.join(sorted([str(x) for x in self.genes])))
957
        else:
958
            self.set_meta('GN',None)
959
960
        if len(self.junctions) > 0:
961
            self.set_meta('JN', ','.join(
962
                sorted([str(x) for x in self.junctions])))
963
            # Is transcriptome
964
            self.set_meta('IT', 1)
965
        elif len(self.genes) > 0:
966
            # Maps to gene but not junction
967
            self.set_meta('IT', 0.5)
968
            self.set_meta('JN',None)
969
        else:
970
            # Doesn't map to gene
971
            self.set_meta('IT', 0)
972
            self.set_meta('JN', None)
973
974
        if self.is_spliced is True:
975
            self.set_meta('SP', True)
976
        elif self.is_spliced is False:
977
            self.set_meta('SP', False)
978
        if len(self.exon_hit_gene_names):
979
            self.set_meta('gn', ';'.join(list(self.exon_hit_gene_names)))
980
        else:
981
            self.set_meta('gn',None)
982
983
    def set_intron_exon_features(self):
984
        if not self.is_annotated:
985
            self.annotate()
986
987
        # Collect all hits:
988
        hits = self.hits.keys()
989
990
        # (gene, transcript) -> set( exon_id  .. )
991
        exon_hits = collections.defaultdict(set)
992
        intron_hits = collections.defaultdict(set)
993
994
        for hit, locations in self.hits.items():
995
            if not isinstance(hit, tuple):
996
                continue
997
998
            meta = dict(list(hit))
999
            if 'gene_id' not in meta:
1000
                continue
1001
            if meta.get('type') == 'exon':
1002
                if 'transcript_id' not in meta:
1003
                    continue
1004
                self.genes.add(meta['gene_id'])
1005
                self.exons.add(meta['exon_id'])
1006
                if 'transcript_id' not in meta:
1007
                    raise ValueError(
1008
                        "Please use an Intron GTF file generated using -id 'transcript_id'")
1009
                exon_hits[(meta['gene_id'], meta['transcript_id'])].add(
1010
                    meta['exon_id'])
1011
                if 'gene_name' in meta:
1012
                    self.exon_hit_gene_names.add(meta['gene_name'])
1013
            elif meta.get('type') == 'intron':
1014
                self.genes.add(meta['gene_id'])
1015
                self.introns.add(meta['gene_id'])
1016
1017
        # Find junctions and add all annotations to annotation sets
1018
        debug = []
1019
1020
        for (gene, transcript), exons_overlapping in exon_hits.items():
1021
            # If two exons are detected from the same gene we detected a
1022
            # junction:
1023
            if len(exons_overlapping) > 1:
1024
                self.junctions.add(gene)
1025
1026
                # We found two exons and an intron:
1027
                if gene in self.introns:
1028
                    self.set_spliced(False)
1029
                else:
1030
                    self.set_spliced(True)
1031
1032
            debug.append(
1033
                f'{gene}_{transcript}:' +
1034
                ','.join(
1035
                    list(exons_overlapping)))
1036
1037
        # Write exon dictionary:
1038
        self.set_meta('DB', ';'.join(debug))
1039
1040
1041
if __name__ == "__main__":
1042
    """The following are all test functions for the annotation class"""
1043
1044
    from colorama import Fore  # ,Back, Style
1045
    from colorama import Back
1046
    from colorama import Style
1047
    from colorama import init
1048
    init(autoreset=True)
1049
1050
    def formatColor(string):
1051
        return(string.replace("[GREEN]", Fore.GREEN).replace("[RED]", Fore.RED).replace("[DIM]", Style.DIM).replace("[RESET]", Style.RESET_ALL).replace("[BRIGHT]", Style.BRIGHT).replace("[NORMAL]", Style.NORMAL))
1052
1053
    def printFormatted(string):
1054
        print(formatColor(str(string)))
1055
1056
    def printFormattedDim(string):
1057
        print(formatColor("   [DIM]%s" % str(string)))
1058
    """Self tests:"""
1059
1060
    # addFeature( chromosome, start, end, name, strand=None, data=None):
1061
    # Build the reference:
1062
    print(
1063
        """
1064
    .......(1)---------------------->
1065
    ..............<----------------(2)
1066
    chr1   100    110             200
1067
1068
1069
    .......(3)---------------------->
1070
    .......(4)----->
1071
    .......(5)<-------------
1072
    chr2   100    110     150     200
1073
    .
1074
    """
1075
    )
1076
1077
    def expect(result, desired, presenceTestOnly=False):
1078
        if presenceTestOnly:
1079
            printFormatted(
1080
                "Expecting %s, %s" %
1081
                (desired,
1082
                 ("[BRIGHT][GREEN] SUCCES [RESET][DIM]%s\n" %
1083
                  result) if any(
1084
                     r[2] in desired for r in result) else '[RED]FAIL %s\n' %
1085
                    result))
1086
            return
1087
        if desired is None:
1088
            printFormatted(
1089
                "Expecting %s, %s" %
1090
                (desired,
1091
                 ("[BRIGHT][GREEN] SUCCES [RESET][DIM]%s\n" %
1092
                  result) if len(result) == 0 else '[RED]FAIL %s' %
1093
                    result))
1094
        elif isinstance(desired, list):
1095
            printFormatted(
1096
                "Expecting %s, %s" %
1097
                (desired, ("[BRIGHT][GREEN] SUCCES [RESET][DIM]%s\n" %
1098
                           result) if len(result) == len(desired) and all(
1099
                    r[2] in desired for r in result) else '[RED]FAIL %s\n' %
1100
                    result))
1101
        else:
1102
            printFormatted(
1103
                "Expecting %s, %s" %
1104
                (desired,
1105
                 ("[BRIGHT][GREEN] SUCCES [RESET][DIM]%s\n" %
1106
                  result) if len(result) == 1 and result[0][2] == desired else '[RED]FAIL %s\n' %
1107
                    result))
1108
    f = FeatureContainer()
1109
    f.debug = True
1110
    f.debugMsg = printFormattedDim
1111
    f.addFeature('chrY', 1, 3, 'A', '+', '')
1112
    f.addFeature('chrY', 5, 8, 'B', '+', '')
1113
    f.sort()
1114
    expect(f.findFeaturesAt('chrY', 0, '+'), None)
1115
    expect(f.findFeaturesAt('chrY', 1, '+'), 'A')
1116
    expect(f.findFeaturesAt('chrY', 2, '+'), 'A')
1117
    expect(f.findFeaturesAt('chrY', 3, '+'), 'A')
1118
    expect(f.findFeaturesAt('chrY', 4, '+'), None)
1119
    expect(f.findFeaturesAt('chrY', 5, '+'), 'B')
1120
    expect(f.findFeaturesAt('chrY', 6, '+'), 'B')
1121
    expect(f.findFeaturesAt('chrY', 8, '+'), 'B')
1122
1123
    f = FeatureContainer()
1124
    f.debug = True
1125
    f.debugMsg = printFormattedDim
1126
    f.addFeature('chrX', 10, 1000, 'parentB', '+', '')
1127
    f.addFeature('chrX', 500, 900, 'nestedB', '+', '')
1128
    f.addFeature('chrX', 10000, 12000, 'C', '+', '')
1129
    f.addFeature('chrX', 100000, 120000, 'D', '+', '')
1130
    f.sort()
1131
    print(f.findFeaturesAt('chrX', 550, '+'))
1132
    print(f.findFeaturesAt('chrX', 10000, '+'))
1133
    print(f.findFeaturesAt('chrX', 100000000, '+'))
1134
    expect(f.findFeaturesAt('chrX', 9, '+'), None)
1135
    expect(f.findFeaturesAt('chrX', 12001, '+'), None)
1136
    expect(f.findFeaturesAt('chrX', 12000, '+'), 'C')
1137
    expect(f.findFeaturesAt('chrX', 120000, '+'), 'D')
1138
    expect(f.findFeaturesAt('chrX', 10, '+'), 'parentB')
1139
1140
    f = FeatureContainer()
1141
    f.debug = True
1142
    f.debugMsg = printFormattedDim
1143
1144
    f.addFeature(
1145
        'chr1',
1146
        100,
1147
        200,
1148
        '1',
1149
        '+',
1150
        'A forward feature from 100 to 200 chr1')
1151
    f.addFeature(
1152
        'chr1',
1153
        110,
1154
        200,
1155
        '2',
1156
        '-',
1157
        'A reverse feature from 110 to 200 chr1')
1158
    f.addFeature(
1159
        'chr2',
1160
        100,
1161
        200,
1162
        '3',
1163
        '+',
1164
        'A forward feature from 100 to 200 chr2')
1165
    f.addFeature(
1166
        'chr2',
1167
        100,
1168
        110,
1169
        '4',
1170
        '+',
1171
        'A forward feature from 100 to 110 chr2')
1172
    f.addFeature(
1173
        'chr2',
1174
        100,
1175
        150,
1176
        '5',
1177
        '-',
1178
        'A reverse feature from 100 to 150 chr2')
1179
1180
    f.addFeature('chr3', 100, 150, '6', '-', 'feature 6')
1181
    f.addFeature('chr3', 200, 250, '7', '-', 'feature 7')
1182
    f.addFeature('chr3', 200, 450, '8', '-', 'feature 8')
1183
    f.addFeature('chr3', 10, 15, '9', '-', 'feature 9')
1184
1185
    printFormatted("[BRIGHT]Test for reference presence:")
1186
    result = f.getReferenceList()
1187
    desired = ['chr1', 'chr2', 'chr3']
1188
    printFormatted(
1189
        "Expecting %s, %s" %
1190
        (desired,
1191
         ("[BRIGHT][GREEN] SUCCES [RESET][DIM]%s\n" %
1192
          result) if len(result) == len(desired) and all(
1193
             r in desired for r in result) else '[RED]FAIL %s\n' %
1194
            result))
1195
1196
    printFormatted("[BRIGHT]Test on leftmost start:")
1197
    expect(f.findFeaturesAt('chr1', 100, '+'), '1')
1198
1199
    printFormatted("[BRIGHT]Test on rightmost end:")
1200
    expect(f.findFeaturesAt('chr1', 200, '+'), '1')
1201
1202
    printFormatted("[BRIGHT]Test on random location within feature:")
1203
    expect(f.findFeaturesAt('chr1', 120, '+'), '1')
1204
1205
    printFormatted("[BRIGHT]Test on random location within feature:")
1206
    expect(f.findFeaturesAt('chr2', 120, '+'), '3')
1207
1208
    printFormatted("[BRIGHT]Test on limit location of feature:")
1209
    expect(f.findFeaturesAt('chr2', 200, '+'), '3')
1210
1211
    printFormatted(
1212
        "[BRIGHT]Test on non matching location (match available on other side, and one base left of coord):")
1213
    expect(f.findFeaturesAt('chr2', 151, '-'), None)
1214
1215
    printFormatted(
1216
        "[BRIGHT]Tests on double matching locations without strand spec")
1217
    expect(f.findFeaturesAt('chr1', 120), ['1', '2'])
1218
    expect(f.findFeaturesAt('chr2', 105), ['3', '4', '5'])
1219
1220
    printFormatted("[BRIGHT] ==== Range tests... ====")
1221
    printFormatted(
1222
        "[BRIGHT]Test for matching start and end coordinates overlapping one feature")
1223
    expect(f.findFeaturesBetween('chr2', 102, 200, '+'), ['3', '4'])
1224
1225
    printFormatted("[BRIGHT]Test for matching all features on chromosome")
1226
    expect(f.findFeaturesBetween('chr2', 0, 20000, None), ['3', '4', '5'])
1227
1228
    printFormatted(
1229
        "[BRIGHT]Test for matching all but one features on chromosome")
1230
    expect(f.findFeaturesBetween('chr3', 151, 20000, None), ['8', '7'])
1231
1232
    printFormatted(
1233
        "[BRIGHT]Test for matching all but one features on chromosome")
1234
    expect(f.findFeaturesBetween('chr3', 0, 195, None), ['6', '9'])
1235
1236
    printFormatted("[BRIGHT]Test for range finding non-existent feature")
1237
    expect(f.findFeaturesBetween('chr2', 2001, 2000, '+'), None)
1238
1239
    printFormatted(
1240
        "[BRIGHT]Test for finding non-existent feature LEFT NEXT to the point")
1241
    expect(f.findNearestLeftFeature('chr2', 50, '+'), None)
1242
1243
    printFormatted(
1244
        "[BRIGHT]Test for finding existent feature LEFT NEXT to the point")
1245
    expect(f.findNearestLeftFeature('chr2', 250, None), '3')
1246
1247
    printFormatted(
1248
        "[BRIGHT]Test for finding non-existent feature RIGHT NEXT to the point")
1249
    expect(f.findNearestRightFeature('chr2', 250, '+'), None)
1250
1251
    printFormatted(
1252
        "[BRIGHT]Test for finding existent feature RIGHT NEXT to the point")
1253
    expect(f.findNearestRightFeature('chr2', 0, '-'), '5')
1254
1255
    printFormatted("[BRIGHT]Test for finding closest feature")
1256
    print(f.findNearestFeature('chr1', 0, None))
1257
    expect(f.findNearestFeature('chr1', 0, None), '1')
1258
1259
    # Sharing related stuff
1260
    """from multiprocessing.managers import BaseManager
1261
    import multiprocessing
1262
    # Share the genome annotations over multiple processes:
1263
    class bdbsSharedObjectManager(BaseManager): pass
1264
    def Manager():
1265
        m = bdbsSharedObjectManager()
1266
        m.start()
1267
        return m
1268
    # initialisation #
1269
    bdbsSharedObjectManager.register('FeatureContainer', FeatureContainer)
1270
    sharedDataManager = Manager()
1271
    f = sharedDataManager.FeatureContainer()
1272
1273
1274
    def findFeaturesAt(args):
1275
        featureContainer, chromosome, position, strand = args
1276
        featureContainer.findFeaturesAt(chromosome, position, strand )
1277
1278
    print('Running with pool')
1279
    pool = multiprocessing.Pool(8)
1280
    print("With index:")
1281
    bar = progressbar.ProgressBar(max_value=N)
1282
    for q,result in enumerate(pool.imap( findFeaturesAt, (  (f, 'chr1', q, '+') for q in range(N) ),1000)):
1283
         bar.update(q)
1284
        #expect( f.findFeaturesAt('chr1',q,'+'), '1', True)
1285
    bar.finish()
1286
    exit()
1287
    """
1288
    #######################################
1289
    import random
1290
    import progressbar
1291
    print('Creating random features')
1292
1293
    N = 1000000
1294
    f.debug = False
1295
    printFormatted(
1296
        "[BRIGHT]Test with %s random features added to the chromosome" %
1297
        N)
1298
    expect(f.findFeaturesAt('chr1', 100, '+'), '1', True)
1299
    for i in range(0, N):
1300
        s = random.randint(0, 100_000_000)
1301
        f.addFeature('chr1', s, s +
1302
                     random.randint(1, 1000), 'art_%s' %
1303
                     i, ['-', '+'][random.randint(0, 1)], 'A random feature')
1304
        #f.findNearestFeature('chr1', random.randint(0,1000), '+')
1305
    print('Constructing index...')
1306
    f.sort()
1307
    #######################################
1308
1309
    print("With index:")
1310
    bar = progressbar.ProgressBar(max_value=N)
1311
    for q in range(N):
1312
        f.findFeaturesAt('chr1', q, '+')
1313
        bar.update(q)
1314
        #expect( f.findFeaturesAt('chr1',q,'+'), '1', True)
1315
    bar.finish()
1316
1317
    print("Without index:")
1318
    bar = progressbar.ProgressBar(max_value=N)
1319
    for q in range(N):
1320
        f.findFeaturesAt('chr1', q, '+', optim='nb')
1321
        bar.update(q)
1322
        #expect( f.findFeaturesAt('chr1',q,'+'), '1', True)
1323
    bar.finish()