Switch to unified view

a b/singlecellmultiomics/utils/bdbbio.py
1
from Bio import SeqIO
2
from Bio.Seq import Seq
3
import os
4
import uuid
5
from Bio import pairwise2
6
import math
7
import subprocess
8
from colorama import Fore #,Back, Style
9
from colorama import Back
10
from colorama import Style
11
from colorama import init
12
init(autoreset=True)
13
import numpy as np
14
import re
15
from itertools import takewhile,repeat
16
#import localInstall
17
18
if not os.name=='nt':
19
    import pysam
20
### custom path config: (not used when not available)
21
22
#localInstall.addToPath('/hpc/hub_oudenaarden/bdebarbanson/bin/samtools-1.3.1/')
23
#localInstall.addToPath('/hpc/hub_oudenaarden/bin/software/bwa-0.7.10/')
24
#localInstall.addToPath('/media/sf_externalTools/')
25
26
### end
27
28
class AlignmentVisualiser:
29
    def __init__(self, initialiser):
30
31
        if type(initialiser) is pysam.AlignedSegment:
32
            self.alignment = initialiser
33
34
    def getPhredScoreColor(self, phredScore):
35
36
        p = ord(phredScore)-33.0
37
        if p>32:
38
            return(Fore.GREEN + Style.BRIGHT)
39
        if p>25:
40
            return(Fore.GREEN)
41
        if p>20:
42
            return(Fore.GREEN + Style.DIM)
43
        if p>18:
44
            return(Fore.YELLOW)
45
        return(Fore.RED)
46
47
    def visualizeAlignment(self, annotateReference=None, annotateQuery=None, intronSummary=True):
48
49
        reference = []
50
        query = []
51
        alignmentSymbols = []
52
        qualitieSymbols = []
53
        prevIntron = 0
54
        for index,pair in  enumerate(self.alignment.get_aligned_pairs( matches_only=False, with_seq=True) ):
55
            #Pair structure = (queryPosition, referencePosition, referenceBase)
56
57
            if pair[0] is None and pair[2] is None:
58
                prevIntron+=1
59
                continue
60
            else:
61
                if prevIntron>0:
62
                    intronString = ' N%s ' % prevIntron
63
                    alignmentSymbols.append(Style.DIM + intronString + Style.RESET_ALL)
64
                    qualitieSymbols.append(' '*len(intronString))
65
                    reference.append(Style.DIM +  ('-'*len(intronString)) + Style.RESET_ALL)
66
                    query.append( Style.DIM +  (' '*len(intronString)) + Style.RESET_ALL)
67
                    prevIntron = 0
68
69
            queryPosition = pair[0]
70
            queryNucleotide= "%s%s-%s"% (Back.RED, Fore.WHITE, Style.RESET_ALL)
71
            referenceNucleotide= ' '# % (Fore.RED, Style.RESET_ALL) #if queryPosition>self.alignment.query_alignment_start and queryPosition<=self.alignment.query_alignment_end else ' '
72
            queryPhred=' '
73
            if queryPosition!=None:
74
                queryNucleotide = self.alignment.query_sequence[queryPosition].upper()
75
                queryPhred = self.alignment.qual[queryPosition]
76
                qualitieSymbols.append(self.getPhredScoreColor(queryPhred) + queryPhred + Style.RESET_ALL)
77
            else:
78
                qualitieSymbols.append(' ')
79
80
            #print("Q:%s  S:%s" % (queryPosition, self.alignment.query_alignment_start))
81
82
            if pair[2]!=None:
83
                referenceNucleotide =  pair[2].upper()
84
            elif queryPosition is None:
85
                referenceNucleotide = "%s%s-%s"% (Back.RED, Fore.WHITE, Style.RESET_ALL)
86
            elif queryPosition>self.alignment.query_alignment_start and queryPosition<=self.alignment.query_alignment_end:
87
88
                referenceNucleotide = "%s%s-%s"% (Back.RED, Fore.WHITE, Style.RESET_ALL)
89
90
            if pair[2]==queryNucleotide:
91
                alignmentSymbols.append('|')
92
            else:
93
                alignmentSymbols.append(' ')
94
95
            reference.append(referenceNucleotide)
96
            if queryPosition!=None and annotateQuery!=None:
97
98
                if ((queryPosition - self.alignment.query_length) in  annotateQuery and  (queryPosition - self.alignment.query_length<0 )):
99
                    query.append(getattr(Back,annotateQuery[(queryPosition - self.alignment.query_length)]) + Fore.BLACK + queryNucleotide + Style.RESET_ALL)
100
                elif queryPosition in annotateQuery:
101
                    query.append(getattr(Back,annotateQuery[queryPosition]) + Fore.WHITE + queryNucleotide + Style.RESET_ALL)
102
                else:
103
                    #DIM soft clipped things:
104
                    if queryPosition<self.alignment.query_alignment_start or queryPosition>self.alignment.query_alignment_end:
105
                        query.append(Style.DIM + Fore.YELLOW + queryNucleotide + Style.RESET_ALL)
106
107
                    else:
108
                        query.append(queryNucleotide)
109
            else:
110
                query.append(queryNucleotide)
111
        rStart = self.alignment.reference_end if  self.alignment.is_reverse else self.alignment.reference_start
112
        rEnd =  self.alignment.reference_start if  self.alignment.is_reverse else self.alignment.reference_end
113
        refStartStr = str(rStart)
114
        qStartStr = str(self.alignment.query_alignment_start)
115
        labelSize = max(len(refStartStr), len(qStartStr))
116
117
118
        annotatedCigar = []
119
        for operation,length in self.alignment.cigartuples:
120
            annotatedCigar.append( '%s%s%s' % (
121
                [ '%s%sM%s'% (Fore.GREEN, Style.BRIGHT, Style.NORMAL),
122
                 '%s%sI%s' % (Fore.YELLOW,Style.BRIGHT, Style.NORMAL),
123
                 '%s%sD%s' % (Fore.RED,Style.BRIGHT, Style.NORMAL),
124
                 '%sN' % Fore.MAGENTA,
125
                 '%s%sS' % (Style.DIM,Fore.RED),
126
                 '%s%sH' % (Style.DIM,Fore.RED),
127
                 '%s%sP' % (Style.DIM,Fore.RED),
128
                 '%s%s=' % (Style.DIM,Fore.GREEN),
129
                 '%s%sX' % (Style.DIM,Fore.BLUE)]
130
                [operation]
131
                , length, Style.RESET_ALL))
132
133
        multiMapping = ''
134
        try:
135
            multiMappingCount = int(self.alignment.get_tag('NH:i'))
136
137
            if multiMappingCount==1:
138
                multiMapping= Fore.GREEN + 'Unique map' + Style.RESET_ALL
139
            elif multiMappingCount>1:
140
                multiMapping= Fore.WHITE + Back.RED + ('Mapping to at least %s locations' % multiMappingCount) + Style.RESET_ALL
141
        except:
142
            pass
143
144
        print( ('%sReference:%s %s %s %s MQ:%s %s' % ( Style.BRIGHT, Style.RESET_ALL, self.alignment.reference_name, "".join(annotatedCigar), ("FWD" if not self.alignment.is_reverse else "REV"), self.alignment.mapping_quality, multiMapping))+ (' MULTIMAPPING' if self.alignment.is_secondary else '' )  )
145
        print(('%s%s %s %s (%s%s)' % (
146
            Style.DIM+refStartStr+Style.RESET_ALL,
147
            " "*(labelSize-len(refStartStr)),
148
            "".join(reference),
149
            (Style.DIM+str(rEnd)+Style.RESET_ALL),
150
            "" if self.alignment.is_reverse else "+",
151
            (Style.DIM+str(rEnd-rStart)+Style.RESET_ALL))))
152
        print(('%s%s' % (" "*(labelSize+1), "".join(alignmentSymbols))))
153
        print(('%s%s %s %s (%s)' % (
154
            " "*(labelSize-len(qStartStr)),
155
            Style.DIM+qStartStr+Style.RESET_ALL,
156
            "".join(query),
157
            (Style.DIM+str(self.alignment.query_alignment_end)+Style.RESET_ALL),
158
            (Style.DIM+str(self.alignment.query_alignment_end-self.alignment.query_alignment_start)+Style.RESET_ALL),
159
            )))
160
        print(('%s%s' % (" "*(labelSize+1), "".join(qualitieSymbols))))
161
        print(('%sQuery:%s %s' % (Style.BRIGHT, Style.RESET_ALL,self.alignment.query_name)))
162
163
def getLevenshteinDistance(source, target, maxDist=99999999999999):
164
    if len(source) < len(target):
165
        return getLevenshteinDistance(target, source)
166
167
    # So now we have len(source) >= len(target).
168
    if len(target) == 0:
169
        return len(source)
170
171
    # We call tuple() to force strings to be used as sequences
172
    # ('c', 'a', 't', 's') - numpy uses them as values by default.
173
    source = np.array(tuple(source))
174
    target = np.array(tuple(target))
175
176
    # We use a dynamic programming algorithm, but with the
177
    # added optimization that we only need the last two rows
178
    # of the matrix.
179
    previous_row = np.arange(target.size + 1)
180
181
    for s in source:
182
        # Insertion (target grows longer than source):
183
        current_row = previous_row + 1
184
185
        # Substitution or matching:
186
        # Target and source items are aligned, and either
187
        # are different (cost of 1), or are the same (cost of 0).
188
        current_row[1:] = np.minimum(
189
                current_row[1:],
190
                np.add(previous_row[:-1], target != s))
191
192
        # Deletion (target grows shorter than source):
193
        current_row[1:] = np.minimum(
194
                current_row[1:],
195
                current_row[0:-1] + 1)
196
197
        previous_row = current_row
198
        if( np.amin(previous_row)>maxDist):
199
            return(None)
200
201
202
    return previous_row[-1]
203
204
205
def cigarStringToDict(cigarString):
206
    if cigarString==None:
207
        cs = ''
208
    else:
209
        cs = cigarString
210
    cigarStringDict = {}
211
    for symbol in ['M','D','I','S']:
212
         # Extract the integer which is found before the symbol supplied
213
        # returns 0 when the symbol is not found as the sum of the list will be zero
214
        matches = re.findall('[0-9]*%s' % symbol, cs)
215
        cigarStringDict[symbol] = sum( [ int(x.replace(symbol,'')) for x in matches])
216
217
    return( cigarStringDict)
218
219
220
def distanceLineToPoint(x1, y1, x2, y2, x0, y0):
221
222
    return( float( abs( (y2-y1)*x0 - (x2 - x1 )*y0 + x2*y1 - y2*x1 ) ) / (math.sqrt( math.pow(y2-y1,2) + math.pow(x2-x1,2))) )
223
224
225
def fixGraphML(graphmlPath):
226
    with open(graphmlPath) as f:
227
        lines = []
228
        for line in f:
229
            lines.append( line.replace('"d3"','"fixedD3"') )
230
    if lines!=None:
231
        with open(graphmlPath,'w') as f:
232
            for line in lines:
233
                f.write(line)
234
235
236
def getHammingDistance(s1, s2,maxDist=999999):
237
    d = 0
238
    for index,base in enumerate(s1):
239
        d+=(base!=s2[index] and s2[index]!="N" and base!="N")
240
        if d>maxDist:
241
            return(maxDist+1)
242
    return(d)
243
244
245
def getHammingIndices( seqA, seqB, maxIndices=999999 ):
246
247
    indices = []
248
    for index,base in enumerate(seqA):
249
        if len(seqB)<index:
250
            break
251
        if base!=seqB[index]:
252
            indices.append(index)
253
            if indices>=maxIndices:
254
                return(indices)
255
256
    return(indices)
257
258
def humanReadable(value, targetDigits=2,fp=0):
259
260
        if value == 0:
261
                return('0')
262
263
        baseId = int(math.floor( math.log10(float(value))/3.0 ))
264
        suffix = ""
265
        if baseId==0:
266
                sVal =  str(round(value,targetDigits))
267
                if len(sVal)>targetDigits and sVal.find('.'):
268
                        sVal = sVal.split('.')[0]
269
270
        elif baseId>0:
271
272
                sStrD = max(0,targetDigits-len(str( '{:.0f}'.format((value/(math.pow(10,baseId*3)))) )))
273
274
275
                sVal = ('{:.%sf}' % min(fp, sStrD)).format((value/(math.pow(10,baseId*3))))
276
                suffix = 'kMGTYZ'[baseId-1]
277
        else:
278
279
                sStrD = max(0,targetDigits-len(str( '{:.0f}'.format((value*(math.pow(10,-baseId*3)))) )))
280
                sVal = ('{:.%sf}' %  min(fp, sStrD)).format((value*(math.pow(10,-baseId*3))))
281
                suffix = 'mnpf'[-baseId-1]
282
283
                if len(sVal)+1>targetDigits:
284
                        # :(
285
                        sVal = str(round(value,fp))[1:]
286
                        suffix = ''
287
288
289
        return('%s%s' % (sVal,suffix))
290
291
292
293
294
##Pathtools
295
def decodePath( path, spacers={'flag': ',', 'param':'_', 'kv':'=', 'invalid':['=','_']}):
296
297
298
        invalidFileNameCharacters = spacers['invalid']
299
        flagSpacer = spacers['flag']
300
        paramSpacer = spacers['param']
301
        keyValueSpacer = spacers['kv']
302
303
        decodedPath = {
304
            'runId':None,
305
            'step':-1,
306
            'name':'unknown',
307
            'parameters':{},
308
            'flags' : [],
309
            'extension':''
310
        }
311
312
        parts = os.path.splitext(path)
313
        basePath = parts[0]
314
        decodedPath['extension'] = parts[1]
315
316
        #Check if the anlysis base directory is in the path:
317
        parts = basePath.split('/')
318
        if 'analysis_' in parts[0]:
319
            decodedPath['runId'] = parts[0]
320
            toDecode = parts[1]
321
        else:
322
            toDecode = path
323
324
        keyValuePairs = basePath.split(paramSpacer)
325
        for pair in keyValuePairs:
326
            keyValue = pair.split(keyValueSpacer)
327
            if len(keyValue)!=2:
328
                print(('Parameter decoding failed: %s, this probably means input files contain invalid characters such as %s' % (pair, ", ".join(invalidFileNameCharacters))))
329
            else:
330
                key = keyValue[0]
331
                value = keyValue[1]
332
                if key!='flags' and key!='name':
333
                    decodedPath['parameters'][key] = value
334
                elif key=='flags':
335
                    decodedPath['flags'] = value.split(flagSpacer)
336
                elif key=='name':
337
                    decodedPath['name'] = value
338
339
        return(decodedPath)
340
341
342
def invBase(nucleotide):
343
344
        if nucleotide=='A':
345
            return(['T','C','G'])
346
        if nucleotide=='C':
347
            return(['T','A','G'])
348
        if nucleotide=='T':
349
            return(['C','A','G'])
350
        if nucleotide=='G':
351
            return(['C','A','T'])
352
        return([''])
353
354
def encodePath(step, name, params, extension, spacers={'flag': ',', 'param':'_', 'kv':'=', 'invalid':['=','_']}):
355
356
    invalidFileNameCharacters = spacers['invalid']
357
    flagSpacer = spacers['flag']
358
    paramSpacer = spacers['param']
359
    keyValueSpacer = spacers['kv']
360
361
    #Todo get keyValueSpacer in here
362
    encodedPath = 'step={:0>2d}'.format(step)
363
    encodedPath += '_name=%s' % name
364
    for parname in params:
365
        if parname.lower()!="flags":
366
            if not( any(i in parname for i in invalidFileNameCharacters) or any(i in params[parname] for i in invalidFileNameCharacters) ):
367
                encodedPath+='%s%s%s%s' % (paramSpacer, parname, keyValueSpacer, params[parname])
368
            else:
369
                raise ValueError('Parameter encoding failed: %s %s contains at least one invalid character' % (parname, params[parname]) )
370
371
    #Todo check flags for mistakes
372
    if 'flags' in params:
373
        encodedPath+='%sflags%s%s' % (paramSpacer, keyValueSpacer,flagSpacer.join(params['flags']))
374
375
    if extension!=None and extension!=False:
376
        encodedPath = '%s.%s' % (encodedPath, extension)
377
378
    return(encodedPath)
379
380
381
def locateIndel( seqA, seqB, verbose=False ):
382
383
    alignment = pairwise2.align.globalmx(seqA, seqB, 2, -1, one_alignment_only=True)[0]
384
385
    #pairwise2.format_alignment(*alignment)
386
    indelLocA = -1
387
    indelLocB = -1
388
389
    if alignment:
390
        align1 = alignment[0]
391
        align2 = alignment[1]
392
393
        localAlign1 = align1.strip('-')
394
        localAlign2 = align2.strip('-')
395
396
        dels = min( localAlign1.count('-'), localAlign2.count('-') )
397
        ins =  max( localAlign1.count('-'), localAlign2.count('-') )
398
        if (ins==1) and dels==0:
399
400
            seqNHasIndel = localAlign2.count('-')==0 #bool: 0: seqA has indel, 1: seqB has indel
401
402
            #Find indel location:
403
404
            if seqNHasIndel:
405
                indelLocA = align1.find('-')
406
                if verbose:
407
                    print(('seqA:%s\nseqB:%s'% (align1,align2)))
408
                    print((' '*(indelLocA+5)+'^'))
409
            else:
410
                indelLocB = align2.find('-')
411
                if verbose:
412
                    print(('seqA:%s\nseqB:%s'% (align1,align2)))
413
                    print((' '*(indelLocB+5)+'^'))
414
415
416
            #print(alignment[2]) is score
417
418
        return(indelLocA,indelLocB)
419
420
421
class Histogram():
422
    def __init__(self):
423
        self.data = {}
424
425
    def addCount(self,idx, count=1):
426
        if not idx in self.data:
427
            self.data[idx] = 0
428
        self.data[idx]+=count
429
430
    def write(self, path):
431
        with open(path,'w') as f:
432
            for index in self.data:
433
                f.write('%s;%s\n' % (index, self.data[index]))
434
435
436
437
globalWritingUpdates = False
438
class InternalTool():
439
    def __init__(self, screenName = "unnamed"):
440
        self.verbose = True
441
        self.screenName = screenName
442
443
    def setVerbosity(self, verbosity):
444
        self.verbose = verbosity
445
446
    def __priorRunningPrint(self):
447
        if self.isUpdating():
448
            print('')
449
            self.setUpdating(False)
450
451
    def isUpdating(self):
452
        global globalWritingUpdates
453
        if globalWritingUpdates:
454
            return(True)
455
456
    def setUpdating(self, boolean):
457
458
        global globalWritingUpdates
459
        globalWritingUpdates = boolean
460
461
462
    def log(self, message):
463
        if self.verbose:
464
            self.__priorRunningPrint()
465
            print((Fore.GREEN + self.screenName + ', INFO: ' + Style.RESET_ALL + message))
466
467
    def warn(self, message):
468
        self.__priorRunningPrint()
469
        print((Fore.RED + self.screenName + ', WARN: ' + Style.RESET_ALL + message))
470
471
    def softWarn(self, message):
472
        self.__priorRunningPrint()
473
        print((Fore.YELLOW + self.screenName + ', SOFT WARN: ' + Style.RESET_ALL + message))
474
475
476
477
    def update(self, message):
478
        print('\r' + Fore.GREEN + self.screenName + ', UPDATE: ' + Style.RESET_ALL +message, end=' ')
479
        self.setUpdating(True)
480
481
482
    def info(self, message):
483
        self.log(message)
484
485
486
    def fatal(self, message):
487
        self.__priorRunningPrint()
488
        print((Fore.RED + self.screenName + ', FATAL ERROR: ' + Style.RESET_ALL + message))
489
        exit()
490
491
492
class ExternalTool(InternalTool):
493
    def __init__(self, screenName = "unnamed"):
494
        InternalTool.__init__(self)
495
        #Locations to look for binaries when they are not found on the PATH
496
497
498
    def setExecutablePath(self, path):
499
        self.executable = path
500
501
    #Only works in unix and cygwin
502
    def isAvailable(self):
503
        available = subprocess.call("type " + self.executable.split()[0], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
504
        return(available or os.path.isfile(self.executable))
505
506
507
class FastqToFasta(ExternalTool):
508
    def __init__(self):
509
        ExternalTool.__init__(self)
510
        self.setExecutablePath('fastq_to_fasta')
511
        self.screenName = "FASTQ_TO_FASTA"
512
513
    def execute(self, fqPath, fastaPath):
514
        os.system('%s -n -i %s -o %s'  % (self.executable, fqPath, fastaPath ))
515
516
517
class FastQQualityFilter(ExternalTool):
518
519
    def __init__(self):
520
        ExternalTool.__init__(self)
521
        self.setExecutablePath('fastq_quality_filter')
522
        self.minBaseQuality = 36
523
        self.sequenceCountSuffix = 'fastqQualityResultCount'
524
        self.screenName = "FASTQ_QUALITY_FILTER"
525
526
    def setMinBaseQuality(self, minBaseQuality):
527
        self.minBaseQuality = minBaseQuality
528
529
    def getCountPath(self,filteredTargetPath):
530
        return( '%s_%s' % (filteredTargetPath,self.sequenceCountSuffix) )
531
532
533
    def execute(self, fqPath, filteredTargetPath):
534
        os.system('%s -q %s -i %s -o %s -v -sam > %s' % (self.executable, self.minBaseQuality, fqPath, filteredTargetPath, self.getCountPath()))
535
        r = 0
536
        with open(self.getCountPath(), 'w') as f:
537
            r = int(f.readline())
538
539
        return(r)
540
541
class ART(ExternalTool):
542
543
    def __init__(self):
544
        ExternalTool.__init__(self)
545
        self.setExecutablePath('./art_bin_ChocolateCherryCake/art_illumina')
546
        self.screenName = "ART"
547
548
    def simulateAmpliconReads(self,sequenceAbundanceCounter, prefix, readSize=76, coverage=10, reverseComplementAmplicon=False):
549
550
        sequences = []
551
        print(( sequenceAbundanceCounter.most_common(1)))
552
        (v,hCount) = sequenceAbundanceCounter.most_common(1)[0]
553
        for iteration in range(0,hCount):
554
555
            for sequence,count in sequenceAbundanceCounter.most_common():
556
                if count>=hCount:
557
                    sequences.append(sequence)
558
                else:
559
                    break
560
            hCount-=1
561
562
563
        bpythonSeqs=[]
564
        for index,sequence in enumerate(sequences):
565
            if reverseComplementAmplicon :
566
                bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequence).reverse_complement(), '%s-%s' % (str(index),str(sequence))))
567
            else:
568
                bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequence), '%s-%s' % (str(index),str(sequence))))
569
570
        fastaPath = getTempFileName('art')+'.fa'
571
        SeqIO.write(bpythonSeqs, fastaPath, "fasta")
572
573
        os.system('%s -amp -i %s -l %s -f %s -o %s -sam -ss HS25' %(self.executable, fastaPath, readSize, coverage, prefix))
574
575
576
577
    def setMinBaseQuality(self, minBaseQuality):
578
        self.minBaseQuality = minBaseQuality
579
580
    def getCountPath(self,filteredTargetPath):
581
        return( '%s_%s' % (filteredTargetPath,self.sequenceCountSuffix) )
582
583
584
    def execute(self, fqPath, filteredTargetPath):
585
        os.system('%s -q %s -i %s -o %s -v > %s' % (self.executable, self.minBaseQuality, fqPath, filteredTargetPath, self.getCountPath()))
586
        r = 0
587
        with open(self.getCountPath(), 'w') as f:
588
            r = int(f.readline())
589
590
        return(r)
591
592
593
class NeedleAll(ExternalTool):
594
595
    def __init__(self):
596
        ExternalTool.__init__(self)
597
        self.setExecutablePath('needleall')
598
        self.screenName = "NEEDLE_ALL"
599
600
601
    def execute(self, fqPathA, fqPathB, outputFilePath):
602
        command = '%s -auto -stdout -asequence "%s" -bsequence "%s" > %s' % (self.executable, fqPathA, fqPathB, outputFilePath)
603
        print(command)
604
        os.system(command)
605
        r = 0
606
607
        #This is an inverted distance matrix
608
        invScores = DistanceMatrix()
609
610
        #find max score:
611
        maxScore = 0
612
        # To perform T-SNE a normalised matrix is required: the total sum should be 1
613
        totalScore = 0
614
        cells = 0
615
        with open(outputFilePath, 'r') as f:
616
           for line in f:
617
                #print(line)
618
                parts = line.strip().replace('(','').replace(')','').split(' ')
619
                if len(parts)==4:
620
                    value = float(parts[3])
621
                    if value>maxScore:
622
                        maxScore=value
623
                    totalScore+=value
624
                    cells+=1
625
626
        #Normalise the scores
627
628
629
        matrixSum = 0
630
        with open(outputFilePath, 'r') as f:
631
           for line in f:
632
                #print(line)
633
                parts = line.strip().replace('(','').replace(')','').split(' ')
634
                #print('%s %s %s %s' % (parts[0], parts[1],parts[2],parts[3]))
635
                if len(parts)==4:
636
                    if float(parts[3])>0:
637
                        value = (maxScore-float(parts[3]))
638
                        matrixSum+= value
639
                        invScores.setDistance(str(parts[0]), str(parts[1]), value)
640
641
        print(('Sum of distance matrix is %s' % matrixSum))
642
        return(invScores)
643
644
645
646
class ClustalOmega(ExternalTool):
647
648
    def __init__(self):
649
        ExternalTool.__init__(self)
650
        self.setExecutablePath('clustalo')
651
        self.screenName = "CLUSTAL-OMEGA"
652
653
    def alignFasta(self, fastaPath,outputFastaFilePath):
654
        self.log('aligning %s to %s' % (fastaPath, outputFastaFilePath))
655
        os.system('%s -i %s -o %s --force' % (self.executable, fastaPath, outputFastaFilePath))
656
657
658
    def alignFastqs(self, fqPath, outputFastaFilePath):
659
        #self.log('converting %s to fasta' % fqpath)
660
        f = FastqToFasta()
661
        tmpFasta = getTempFileName()
662
        f.execute(fqPath, tmpFasta)
663
        self.alignFasta(tmpFasta,outputFastaFilePath)
664
        os.remove(tmpFasta)
665
666
667
668
669
class SamTools(ExternalTool):
670
    def __init__(self):
671
        ExternalTool.__init__(self)
672
        self.setExecutablePath('samtools')
673
        self.screenName = "SAMTOOLS"
674
675
    def samToSortedBam(self, samPath, bamPath, deleteSam=False):
676
        self.log('Converting SAM to sorted BAM')
677
        os.system('%s view -bS %s | %s sort - -o %s.bam' % (self.executable, samPath, self.executable, bamPath))
678
        self.log('Completed conversion')
679
        if deleteSam:
680
            os.remove(samPath)
681
682
    def bamToSortedBam(self, bamPath, sortedBamPath):
683
        self.log('Converting BAM to sorted BAM')
684
        os.system('%s sort -o %s.bam %s' % (self.executable, sortedBamPath, bamPath))
685
        self.log('Completed conversion')
686
687
688
    def index(self, bamPath):
689
        self.log('Indexing BAM file')
690
        os.system('%s index %s' % (self.executable, bamPath))
691
        self.log('Completed indexing')
692
693
    def readBamToDictOld(self,bamFilePath, columns={'cigar':5, 'left':3,'scarSequence':9}, index='scarSequence', appendTo={}): # columns:{ targetName:columnIndex, ... }
694
        retDict = appendTo
695
        #os.system("samtools view %s | cut -f1,6 | grep -oEi '^.*([0-9]+D){1,1}.*' ")
696
        #tempfile = './temp/temp_' + str(uuid.uuid1())
697
        tempfile = getTempFileName()
698
        self.log("samtools view %s > %s" %(bamFilePath,tempfile))
699
        os.system("samtools view %s > %s" %(bamFilePath,tempfile))
700
        with open(tempfile) as f:
701
            #Go over all lines in the BAM file
702
            for line in f:
703
                #Extract all columns:
704
                parts = line.replace('\n','').split()
705
                #Extract the primary key value:
706
                indexKey = parts[columns[index]]
707
                #Initialise the existence of the primary key value in the return dictionary
708
                if not indexKey in retDict:
709
                    retDict[indexKey] = {}
710
                #Iterate over the keysm except the index
711
                for key in columns:
712
                    if key is not index:
713
                        #Get the value of the key on this line in the BAM file
714
                        v = parts[columns[key]]
715
                        if not v in retDict[indexKey]:
716
                            retDict[indexKey][v] = 1
717
                        else:
718
                            retDict[indexKey][v] += 1
719
        os.remove(tempfile)
720
        return(retDict)
721
722
    def readBamToDict(self,bamFilePath, columns={'cigar':5, 'id':0,'left':3,'scarSequence':9}, index='scarSequence', appendTo={}): # columns:{ targetName:columnIndex, ... }
723
        retDict = appendTo
724
        #os.system("samtools view %s | cut -f1,6 | grep -oEi '^.*([0-9]+D){1,1}.*' ")
725
        tempfile = './temp/temp_' + str(uuid.uuid1())
726
        self.log("samtools view %s > %s" %(bamFilePath,tempfile))
727
        os.system("samtools view %s > %s" %(bamFilePath,tempfile))
728
        with open(tempfile) as f:
729
            #Go over all lines in the BAM file
730
            for line in f:
731
                #Extract all columns:
732
                parts = line.replace('\n','').split()
733
                #Extract the primary key value:
734
                indexKey = parts[columns[index]]
735
                #Initialise the existence of the primary key value in the return dictionary
736
                if not indexKey in retDict:
737
                    retDict[indexKey] = {}
738
                #Iterate over the keysm except the index
739
                for key in columns:
740
                    if key is not index:
741
                        #Get the value of the key on this line in the BAM file
742
                        if columns[key]<len(parts):
743
                            v = parts[columns[key]]
744
                            retDict[indexKey][key] = v
745
746
        try:
747
            os.remove(tempfile)
748
        except:
749
            print(('Temporary file could not be removed %s' % tempfile))
750
            pass
751
        return(retDict)
752
753
754
# Make sure to index the reference genome!
755
class BWA_MEM(ExternalTool):
756
    def __init__(self):
757
        ExternalTool.__init__(self)
758
        self.setExecutablePath('bwa mem')
759
        self.screenName = "BWA MEM"
760
        self.fixedFlags = []
761
762
    def setReferencePath(self, referenceGenomePath):
763
        self.referenceGenomePath = referenceGenomePath
764
765
    def execute(self, fqPath, samPath, readGroupInfo=None, secondSplitHits=False, getSecondaryAlignments=False):
766
        if not isinstance(fqPath, str):
767
            fqPath = " ".join(fqPath)
768
769
        if self.referenceGenomePath!=None:
770
            self.log('Aligning %s to %s' %(fqPath, self.referenceGenomePath))
771
772
            flags=[]+self.fixedFlags
773
            if secondSplitHits:
774
                flags.append("-M")
775
            if readGroupInfo!=None:
776
                flags.append("-R '%s'" % readGroupInfo)
777
            if getSecondaryAlignments:
778
                flags.append("-a")
779
780
            self.log("Flags: %s" % (" ".join(flags)))
781
            cmd = '%s %s %s %s > %s' % (self.executable, " ".join(flags),  self.referenceGenomePath, fqPath, samPath)
782
            self.log(cmd)
783
            os.system(cmd)
784
        else:
785
            self.warn('Warning; no reference genome specified for BWA MEM, canceled alignment.')
786
787
# Make sure to index the reference genome!
788
class STAR(ExternalTool):
789
    def __init__(self):
790
        ExternalTool.__init__(self)
791
        self.setExecutablePath('STAR')
792
        self.screenName = "STAR"
793
        self.threads = 4
794
795
    def index(self):
796
797
        if not os.path.exists(self.refDirectory):
798
            try:
799
                os.mkdir(self.refDirectory )
800
            except:
801
                self.fatal('Could not create index directory on %s' % self.refDirectory)
802
803
        os.system('%s --runMode genomeGenerate --genomeDir %s --genomeFastaFiles %s --runThreadN %s' %
804
                  (self.executable, self.refDirectory,  self.referenceGenomePath, self.threads))
805
806
807
808
    def setReferencePath(self, referenceGenomePath):
809
        self.referenceGenomePath = referenceGenomePath
810
        self.refDirectory = '%s_STAR_INDEX' % os.path.basename(self.referenceGenomePath)
811
        if not os.path.exists(self.refDirectory):
812
            self.info("Reference index %s does not exist." % referenceGenomePath)
813
            self.index()
814
815
816
    def execute(self, fqPath, outputPath):
817
        os.system('%s --genomeDir %s --readFilesIn %s --runThreadN %s --outFileNamePrefix %s' %
818
                (self.executable, self.refDirectory, fqPath, self.threads, outputPath))
819
820
821
    #!!!sjdbOverhang  should match the length of the reads minus one!!!
822
    def pass2alignment(self, SJouttabPath, sjdbOverhang=74):
823
824
        base = os.path.basename(SJouttabPath)
825
826
        self.p2refDirectory = '%s_P2_STAR_INDEX' % os.path.basename(SJouttabPath)
827
        if not os.path.exists(self.p2refDirectory):
828
            try:
829
                os.mkdir(self.p2refDirectory )
830
            except:
831
                self.fatal('Could not create index directory on %s' % self.p2refDirectory)
832
833
834
    def mapSingleEnd(self, fqPath, samPath):
835
        if not isinstance(fqPath, str):
836
            fqPath = " ".join(fqPath)
837
838
        if self.referenceGenomePath!=None:
839
            self.log('Aligning %s to %s' %(fqPath, self.referenceGenomePath))
840
            os.system('%s --genomeDir %s --readFilesIn %s --runThreadN %s' %
841
                (self.executable, self.refDirectory, fqPath, self.threads))
842
        else:
843
            self.warn('Warning; no reference genome specified for STAR, canceled alignment.')
844
845
class GSNAP(ExternalTool):
846
    def __init__(self):
847
        ExternalTool.__init__(self)
848
        self.setExecutablePath('gsnap')
849
        self.screenName = "GSNAP"
850
        #self.cmd=  'sudo gsnap -d lintrace analysis_bdefa5bc-6b5c-11e5-a487-0800279bd60a_fullScarList_Protocol1-DNA-seq.fq -t 4 -A sam > analysis_bdefa5bc-6b5c-11e5-a487-0800279bd60a_fullScarList_Protocol1-DNA-seq.sam'
851
852
    #Reference genome path should be the name of the reference build folder
853
    def setReferencePath(self, referenceGenomePath):
854
        self.referenceGenomePath = referenceGenomePath
855
856
    def execute(self, fqPath, samPath):
857
        if self.referenceGenomePath!=None:
858
            self.log('Aligning %s to %s' %(fqPath, self.referenceGenomePath))
859
            os.system('%s -d %s %s --format=sam > %s' % (self.executable, self.referenceGenomePath, fqPath, samPath))
860
        else:
861
            self.warn('Warning; no reference genome specified for GSNAP, canceled alignment.')
862
863
864
def getTempFileName(centerFix=""):
865
    if not os.path.exists('./temp'):
866
        os.mkdir('./temp')
867
    return('./temp/temp_' + centerFix + str(uuid.uuid1()))
868
869
def mapReads(readsFile, baseName, mapper, samTools=None,readGroupInfo=None, secondSplitHits=False):
870
871
    if samTools==None:
872
        samTools = SamTools()
873
874
    basePath = '%s/%s' % ( '.', baseName)
875
    samPath = '%s.sam' %( basePath)
876
    bamPath = '%s.bam' %( basePath)
877
878
879
    if readGroupInfo==None and secondSplitHits==False:
880
        mapper.execute(readsFile, samPath)
881
    else:
882
        mapper.execute(readsFile, samPath, readGroupInfo=None, secondSplitHits=False)
883
884
    samTools.samToSortedBam(samPath, basePath, True)
885
    samTools.index(bamPath)
886
    return({'samPath':samPath, 'bamPath':bamPath})
887
888
def mapSequencesToDict(mapper, sequences):
889
890
    bpythonSeqs=[]
891
    for index,sequence in enumerate(sequences):
892
        bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequence), str(index)))
893
894
    fastaPath = getTempFileName('bwa')+'.fa'
895
    SeqIO.write(bpythonSeqs, fastaPath, "fasta")
896
    bPath = getTempFileName('samtools')
897
    samPath = bPath+'.sam'
898
    bamPath = bPath+'.bam'
899
    mapper.execute(fastaPath, samPath)
900
    samtools = SamTools()
901
    samtools.samToSortedBam(samPath, bPath, True)
902
    samtools.index(bamPath)
903
    d = samtools.readBamToDict(bamPath)
904
    return(d)
905
906
def mapDictOfSequences(mapper, sequenceDict, bPath=None):
907
908
    bpythonSeqs=[]
909
    for seqId in sequenceDict:
910
        bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequenceDict[seqId]), str(seqId)))
911
912
    fastaPath = getTempFileName('bwa')+'.fa'
913
    SeqIO.write(bpythonSeqs, fastaPath, "fasta")
914
    if bPath==None:
915
        bPath = getTempFileName('samtools')
916
    samPath = bPath+'.sam'
917
    bamPath = bPath+'.bam'
918
    mapper.execute(fastaPath, samPath)
919
    samtools = SamTools()
920
    samtools.samToSortedBam(samPath, bPath, False)
921
    samtools.index(bamPath)
922
    return(bamPath)
923
924
def mapReadsToDict(mapper, fqPath, index='id',basePath=None, readGroupInfo=None, secondSplitHits=False):
925
    if basePath==None:
926
        bPath = getTempFileName('samtools')
927
    else:
928
        bPath=basePath
929
    samPath = bPath+'.sam'
930
    bamPath = bPath+'.bam'
931
    mapper.execute(fqPath, samPath,  readGroupInfo, secondSplitHits)
932
    samtools = SamTools()
933
    samtools.samToSortedBam(samPath, bPath, True)
934
    samtools.index(bamPath)
935
    r = {}
936
    d = samtools.readBamToDict(bamPath, index=index, appendTo=r)
937
    return(d)
938
939
# Reference to this function: http://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python
940
# This method is the quickest way to count lines in big files. (Quicker than wc..)
941
def fileLineCount(filename):
942
    f = open(filename, 'rb')
943
    bufgen = takewhile(lambda x: x, (f.read(1024*1024) for _ in repeat(None)))
944
    return sum( buf.count(b'\n') for buf in bufgen )
945
946
947
# Deprecated: should use Numpy sparse matrix
948
class DistanceMatrix():
949
    def __init__(self, maxDist=1000000000):
950
        self.data = {}
951
        self.keys = set()
952
        self.maxDist = maxDist
953
954
        self.finalKeying = True # Do not store keys on every update
955
956
    def setDistances(self,fromList, toList, distanceList ):
957
        # Add keys:
958
        if not self.finalKeying:
959
            self.keys.update(fromList+toList)
960
961
        for index,a in enumerate(fromList):
962
            self._update(a,toList[index], distanceList[index])
963
964
    def getNumpyArray(self):
965
        #Make sure the keys are properly indexed
966
        self._performKeying()
967
968
        #Initialise empty matrix:
969
        narray = np.zeros((len(self.keys), len(self.keys)))
970
971
        for aIndex,keyA in enumerate(self.keys):
972
            for bIndex,keyB in enumerate(self.keys):
973
                narray[aIndex,bIndex] = self.getDistance(keyA,keyB)
974
975
        return(narray)
976
977
978
    def _update(self, a,b,distance):
979
        if distance<=self.maxDist:
980
            if (not a in self.data) and (not b in self.data):
981
                self.data[a] = {}
982
                self.data[a][b] = distance
983
                return(True)
984
            if a in self.data and b in self.data[a]:
985
                self.data[a][b] = distance
986
                return(True)
987
            if b in self.data and a in self.data[b]:
988
                self.data[b][a] = distance
989
                return(True)
990
991
992
        # A already exists but B does not: (And b also does not exists as main entry)
993
        if a in self.data and not b in self.data and not b in self.data[a]:
994
            self.data[a][b] = distance
995
            return(True)
996
        elif b in self.data:
997
            if not a in self.data[b]:
998
                self.data[b][a] = distance
999
                return(True)
1000
        return(False)
1001
1002
1003
    def setDistance(self, a,b,distance ):
1004
        if not self.finalKeying:
1005
            if a not in self.keys and b not in self.keys:
1006
                self.keys.update([a,b])
1007
            else:
1008
                if a not in self.keys:
1009
                    self.keys.add(a)
1010
                if b not in self.keys:
1011
                    self.keys.add(b)
1012
1013
        return( self._update(a,b, distance ))
1014
1015
1016
    def getDistance(self, a,b ):
1017
        if a in self.data and b in self.data[a]:
1018
            return(self.data[a][b])
1019
        if b in self.data and a in self.data[b]:
1020
            return(self.data[b][a])
1021
        return(self.maxDist)
1022
1023
1024
    def _performKeying(self):
1025
        #Retrieve all keys
1026
        self.keys = set()
1027
        addKeys = []
1028
        self.keys.update(list(self.data.keys()))
1029
        for a in self.data:
1030
            for b in self.data[a]:
1031
                if b not in self.keys:
1032
                    addKeys.append(b)
1033
        self.keys.update(addKeys)
1034
1035
    def writeToFile(self, path):
1036
1037
        if self.finalKeying:
1038
            self._performKeying()
1039
1040
        separator = ';'
1041
        with open(path, 'w') as f:
1042
            values= ['name']
1043
            for keyA in self.keys:
1044
                values.append(keyA)
1045
            f.write('%s\n' % (separator.join(values)))
1046
            for keyA in self.keys:
1047
                values = []
1048
                for keyB in self.keys:
1049
                    values.append(str(self.getDistance(keyA, keyB)))
1050
                f.write('%s%s%s\n' % (keyA, separator,separator.join(values)))
1051
1052
1053
    def loadFromFile(self, path,separator=None):
1054
        with open(path) as f:
1055
            idx = 0
1056
            header = {}
1057
            for line in f:
1058
                parts = line.rstrip().split(separator)
1059
                if idx==0:
1060
                    for keyId,keyName in enumerate(parts):
1061
                        header[keyId] = keyName
1062
                else:
1063
1064
                    for partIndex, part in parts:
1065
                        if partIndex==0:
1066
                            keyA = part
1067
                        else:
1068
                            keyB=header[partIndex]
1069
                            self.setDistance(keyA,keyB, float(part))
1070
1071
                idx+=1