|
a |
|
b/singlecellmultiomics/utils/bdbbio.py |
|
|
1 |
from Bio import SeqIO |
|
|
2 |
from Bio.Seq import Seq |
|
|
3 |
import os |
|
|
4 |
import uuid |
|
|
5 |
from Bio import pairwise2 |
|
|
6 |
import math |
|
|
7 |
import subprocess |
|
|
8 |
from colorama import Fore #,Back, Style |
|
|
9 |
from colorama import Back |
|
|
10 |
from colorama import Style |
|
|
11 |
from colorama import init |
|
|
12 |
init(autoreset=True) |
|
|
13 |
import numpy as np |
|
|
14 |
import re |
|
|
15 |
from itertools import takewhile,repeat |
|
|
16 |
#import localInstall |
|
|
17 |
|
|
|
18 |
if not os.name=='nt': |
|
|
19 |
import pysam |
|
|
20 |
### custom path config: (not used when not available) |
|
|
21 |
|
|
|
22 |
#localInstall.addToPath('/hpc/hub_oudenaarden/bdebarbanson/bin/samtools-1.3.1/') |
|
|
23 |
#localInstall.addToPath('/hpc/hub_oudenaarden/bin/software/bwa-0.7.10/') |
|
|
24 |
#localInstall.addToPath('/media/sf_externalTools/') |
|
|
25 |
|
|
|
26 |
### end |
|
|
27 |
|
|
|
28 |
class AlignmentVisualiser: |
|
|
29 |
def __init__(self, initialiser): |
|
|
30 |
|
|
|
31 |
if type(initialiser) is pysam.AlignedSegment: |
|
|
32 |
self.alignment = initialiser |
|
|
33 |
|
|
|
34 |
def getPhredScoreColor(self, phredScore): |
|
|
35 |
|
|
|
36 |
p = ord(phredScore)-33.0 |
|
|
37 |
if p>32: |
|
|
38 |
return(Fore.GREEN + Style.BRIGHT) |
|
|
39 |
if p>25: |
|
|
40 |
return(Fore.GREEN) |
|
|
41 |
if p>20: |
|
|
42 |
return(Fore.GREEN + Style.DIM) |
|
|
43 |
if p>18: |
|
|
44 |
return(Fore.YELLOW) |
|
|
45 |
return(Fore.RED) |
|
|
46 |
|
|
|
47 |
def visualizeAlignment(self, annotateReference=None, annotateQuery=None, intronSummary=True): |
|
|
48 |
|
|
|
49 |
reference = [] |
|
|
50 |
query = [] |
|
|
51 |
alignmentSymbols = [] |
|
|
52 |
qualitieSymbols = [] |
|
|
53 |
prevIntron = 0 |
|
|
54 |
for index,pair in enumerate(self.alignment.get_aligned_pairs( matches_only=False, with_seq=True) ): |
|
|
55 |
#Pair structure = (queryPosition, referencePosition, referenceBase) |
|
|
56 |
|
|
|
57 |
if pair[0] is None and pair[2] is None: |
|
|
58 |
prevIntron+=1 |
|
|
59 |
continue |
|
|
60 |
else: |
|
|
61 |
if prevIntron>0: |
|
|
62 |
intronString = ' N%s ' % prevIntron |
|
|
63 |
alignmentSymbols.append(Style.DIM + intronString + Style.RESET_ALL) |
|
|
64 |
qualitieSymbols.append(' '*len(intronString)) |
|
|
65 |
reference.append(Style.DIM + ('-'*len(intronString)) + Style.RESET_ALL) |
|
|
66 |
query.append( Style.DIM + (' '*len(intronString)) + Style.RESET_ALL) |
|
|
67 |
prevIntron = 0 |
|
|
68 |
|
|
|
69 |
queryPosition = pair[0] |
|
|
70 |
queryNucleotide= "%s%s-%s"% (Back.RED, Fore.WHITE, Style.RESET_ALL) |
|
|
71 |
referenceNucleotide= ' '# % (Fore.RED, Style.RESET_ALL) #if queryPosition>self.alignment.query_alignment_start and queryPosition<=self.alignment.query_alignment_end else ' ' |
|
|
72 |
queryPhred=' ' |
|
|
73 |
if queryPosition!=None: |
|
|
74 |
queryNucleotide = self.alignment.query_sequence[queryPosition].upper() |
|
|
75 |
queryPhred = self.alignment.qual[queryPosition] |
|
|
76 |
qualitieSymbols.append(self.getPhredScoreColor(queryPhred) + queryPhred + Style.RESET_ALL) |
|
|
77 |
else: |
|
|
78 |
qualitieSymbols.append(' ') |
|
|
79 |
|
|
|
80 |
#print("Q:%s S:%s" % (queryPosition, self.alignment.query_alignment_start)) |
|
|
81 |
|
|
|
82 |
if pair[2]!=None: |
|
|
83 |
referenceNucleotide = pair[2].upper() |
|
|
84 |
elif queryPosition is None: |
|
|
85 |
referenceNucleotide = "%s%s-%s"% (Back.RED, Fore.WHITE, Style.RESET_ALL) |
|
|
86 |
elif queryPosition>self.alignment.query_alignment_start and queryPosition<=self.alignment.query_alignment_end: |
|
|
87 |
|
|
|
88 |
referenceNucleotide = "%s%s-%s"% (Back.RED, Fore.WHITE, Style.RESET_ALL) |
|
|
89 |
|
|
|
90 |
if pair[2]==queryNucleotide: |
|
|
91 |
alignmentSymbols.append('|') |
|
|
92 |
else: |
|
|
93 |
alignmentSymbols.append(' ') |
|
|
94 |
|
|
|
95 |
reference.append(referenceNucleotide) |
|
|
96 |
if queryPosition!=None and annotateQuery!=None: |
|
|
97 |
|
|
|
98 |
if ((queryPosition - self.alignment.query_length) in annotateQuery and (queryPosition - self.alignment.query_length<0 )): |
|
|
99 |
query.append(getattr(Back,annotateQuery[(queryPosition - self.alignment.query_length)]) + Fore.BLACK + queryNucleotide + Style.RESET_ALL) |
|
|
100 |
elif queryPosition in annotateQuery: |
|
|
101 |
query.append(getattr(Back,annotateQuery[queryPosition]) + Fore.WHITE + queryNucleotide + Style.RESET_ALL) |
|
|
102 |
else: |
|
|
103 |
#DIM soft clipped things: |
|
|
104 |
if queryPosition<self.alignment.query_alignment_start or queryPosition>self.alignment.query_alignment_end: |
|
|
105 |
query.append(Style.DIM + Fore.YELLOW + queryNucleotide + Style.RESET_ALL) |
|
|
106 |
|
|
|
107 |
else: |
|
|
108 |
query.append(queryNucleotide) |
|
|
109 |
else: |
|
|
110 |
query.append(queryNucleotide) |
|
|
111 |
rStart = self.alignment.reference_end if self.alignment.is_reverse else self.alignment.reference_start |
|
|
112 |
rEnd = self.alignment.reference_start if self.alignment.is_reverse else self.alignment.reference_end |
|
|
113 |
refStartStr = str(rStart) |
|
|
114 |
qStartStr = str(self.alignment.query_alignment_start) |
|
|
115 |
labelSize = max(len(refStartStr), len(qStartStr)) |
|
|
116 |
|
|
|
117 |
|
|
|
118 |
annotatedCigar = [] |
|
|
119 |
for operation,length in self.alignment.cigartuples: |
|
|
120 |
annotatedCigar.append( '%s%s%s' % ( |
|
|
121 |
[ '%s%sM%s'% (Fore.GREEN, Style.BRIGHT, Style.NORMAL), |
|
|
122 |
'%s%sI%s' % (Fore.YELLOW,Style.BRIGHT, Style.NORMAL), |
|
|
123 |
'%s%sD%s' % (Fore.RED,Style.BRIGHT, Style.NORMAL), |
|
|
124 |
'%sN' % Fore.MAGENTA, |
|
|
125 |
'%s%sS' % (Style.DIM,Fore.RED), |
|
|
126 |
'%s%sH' % (Style.DIM,Fore.RED), |
|
|
127 |
'%s%sP' % (Style.DIM,Fore.RED), |
|
|
128 |
'%s%s=' % (Style.DIM,Fore.GREEN), |
|
|
129 |
'%s%sX' % (Style.DIM,Fore.BLUE)] |
|
|
130 |
[operation] |
|
|
131 |
, length, Style.RESET_ALL)) |
|
|
132 |
|
|
|
133 |
multiMapping = '' |
|
|
134 |
try: |
|
|
135 |
multiMappingCount = int(self.alignment.get_tag('NH:i')) |
|
|
136 |
|
|
|
137 |
if multiMappingCount==1: |
|
|
138 |
multiMapping= Fore.GREEN + 'Unique map' + Style.RESET_ALL |
|
|
139 |
elif multiMappingCount>1: |
|
|
140 |
multiMapping= Fore.WHITE + Back.RED + ('Mapping to at least %s locations' % multiMappingCount) + Style.RESET_ALL |
|
|
141 |
except: |
|
|
142 |
pass |
|
|
143 |
|
|
|
144 |
print( ('%sReference:%s %s %s %s MQ:%s %s' % ( Style.BRIGHT, Style.RESET_ALL, self.alignment.reference_name, "".join(annotatedCigar), ("FWD" if not self.alignment.is_reverse else "REV"), self.alignment.mapping_quality, multiMapping))+ (' MULTIMAPPING' if self.alignment.is_secondary else '' ) ) |
|
|
145 |
print(('%s%s %s %s (%s%s)' % ( |
|
|
146 |
Style.DIM+refStartStr+Style.RESET_ALL, |
|
|
147 |
" "*(labelSize-len(refStartStr)), |
|
|
148 |
"".join(reference), |
|
|
149 |
(Style.DIM+str(rEnd)+Style.RESET_ALL), |
|
|
150 |
"" if self.alignment.is_reverse else "+", |
|
|
151 |
(Style.DIM+str(rEnd-rStart)+Style.RESET_ALL)))) |
|
|
152 |
print(('%s%s' % (" "*(labelSize+1), "".join(alignmentSymbols)))) |
|
|
153 |
print(('%s%s %s %s (%s)' % ( |
|
|
154 |
" "*(labelSize-len(qStartStr)), |
|
|
155 |
Style.DIM+qStartStr+Style.RESET_ALL, |
|
|
156 |
"".join(query), |
|
|
157 |
(Style.DIM+str(self.alignment.query_alignment_end)+Style.RESET_ALL), |
|
|
158 |
(Style.DIM+str(self.alignment.query_alignment_end-self.alignment.query_alignment_start)+Style.RESET_ALL), |
|
|
159 |
))) |
|
|
160 |
print(('%s%s' % (" "*(labelSize+1), "".join(qualitieSymbols)))) |
|
|
161 |
print(('%sQuery:%s %s' % (Style.BRIGHT, Style.RESET_ALL,self.alignment.query_name))) |
|
|
162 |
|
|
|
163 |
def getLevenshteinDistance(source, target, maxDist=99999999999999): |
|
|
164 |
if len(source) < len(target): |
|
|
165 |
return getLevenshteinDistance(target, source) |
|
|
166 |
|
|
|
167 |
# So now we have len(source) >= len(target). |
|
|
168 |
if len(target) == 0: |
|
|
169 |
return len(source) |
|
|
170 |
|
|
|
171 |
# We call tuple() to force strings to be used as sequences |
|
|
172 |
# ('c', 'a', 't', 's') - numpy uses them as values by default. |
|
|
173 |
source = np.array(tuple(source)) |
|
|
174 |
target = np.array(tuple(target)) |
|
|
175 |
|
|
|
176 |
# We use a dynamic programming algorithm, but with the |
|
|
177 |
# added optimization that we only need the last two rows |
|
|
178 |
# of the matrix. |
|
|
179 |
previous_row = np.arange(target.size + 1) |
|
|
180 |
|
|
|
181 |
for s in source: |
|
|
182 |
# Insertion (target grows longer than source): |
|
|
183 |
current_row = previous_row + 1 |
|
|
184 |
|
|
|
185 |
# Substitution or matching: |
|
|
186 |
# Target and source items are aligned, and either |
|
|
187 |
# are different (cost of 1), or are the same (cost of 0). |
|
|
188 |
current_row[1:] = np.minimum( |
|
|
189 |
current_row[1:], |
|
|
190 |
np.add(previous_row[:-1], target != s)) |
|
|
191 |
|
|
|
192 |
# Deletion (target grows shorter than source): |
|
|
193 |
current_row[1:] = np.minimum( |
|
|
194 |
current_row[1:], |
|
|
195 |
current_row[0:-1] + 1) |
|
|
196 |
|
|
|
197 |
previous_row = current_row |
|
|
198 |
if( np.amin(previous_row)>maxDist): |
|
|
199 |
return(None) |
|
|
200 |
|
|
|
201 |
|
|
|
202 |
return previous_row[-1] |
|
|
203 |
|
|
|
204 |
|
|
|
205 |
def cigarStringToDict(cigarString): |
|
|
206 |
if cigarString==None: |
|
|
207 |
cs = '' |
|
|
208 |
else: |
|
|
209 |
cs = cigarString |
|
|
210 |
cigarStringDict = {} |
|
|
211 |
for symbol in ['M','D','I','S']: |
|
|
212 |
# Extract the integer which is found before the symbol supplied |
|
|
213 |
# returns 0 when the symbol is not found as the sum of the list will be zero |
|
|
214 |
matches = re.findall('[0-9]*%s' % symbol, cs) |
|
|
215 |
cigarStringDict[symbol] = sum( [ int(x.replace(symbol,'')) for x in matches]) |
|
|
216 |
|
|
|
217 |
return( cigarStringDict) |
|
|
218 |
|
|
|
219 |
|
|
|
220 |
def distanceLineToPoint(x1, y1, x2, y2, x0, y0): |
|
|
221 |
|
|
|
222 |
return( float( abs( (y2-y1)*x0 - (x2 - x1 )*y0 + x2*y1 - y2*x1 ) ) / (math.sqrt( math.pow(y2-y1,2) + math.pow(x2-x1,2))) ) |
|
|
223 |
|
|
|
224 |
|
|
|
225 |
def fixGraphML(graphmlPath): |
|
|
226 |
with open(graphmlPath) as f: |
|
|
227 |
lines = [] |
|
|
228 |
for line in f: |
|
|
229 |
lines.append( line.replace('"d3"','"fixedD3"') ) |
|
|
230 |
if lines!=None: |
|
|
231 |
with open(graphmlPath,'w') as f: |
|
|
232 |
for line in lines: |
|
|
233 |
f.write(line) |
|
|
234 |
|
|
|
235 |
|
|
|
236 |
def getHammingDistance(s1, s2,maxDist=999999): |
|
|
237 |
d = 0 |
|
|
238 |
for index,base in enumerate(s1): |
|
|
239 |
d+=(base!=s2[index] and s2[index]!="N" and base!="N") |
|
|
240 |
if d>maxDist: |
|
|
241 |
return(maxDist+1) |
|
|
242 |
return(d) |
|
|
243 |
|
|
|
244 |
|
|
|
245 |
def getHammingIndices( seqA, seqB, maxIndices=999999 ): |
|
|
246 |
|
|
|
247 |
indices = [] |
|
|
248 |
for index,base in enumerate(seqA): |
|
|
249 |
if len(seqB)<index: |
|
|
250 |
break |
|
|
251 |
if base!=seqB[index]: |
|
|
252 |
indices.append(index) |
|
|
253 |
if indices>=maxIndices: |
|
|
254 |
return(indices) |
|
|
255 |
|
|
|
256 |
return(indices) |
|
|
257 |
|
|
|
258 |
def humanReadable(value, targetDigits=2,fp=0): |
|
|
259 |
|
|
|
260 |
if value == 0: |
|
|
261 |
return('0') |
|
|
262 |
|
|
|
263 |
baseId = int(math.floor( math.log10(float(value))/3.0 )) |
|
|
264 |
suffix = "" |
|
|
265 |
if baseId==0: |
|
|
266 |
sVal = str(round(value,targetDigits)) |
|
|
267 |
if len(sVal)>targetDigits and sVal.find('.'): |
|
|
268 |
sVal = sVal.split('.')[0] |
|
|
269 |
|
|
|
270 |
elif baseId>0: |
|
|
271 |
|
|
|
272 |
sStrD = max(0,targetDigits-len(str( '{:.0f}'.format((value/(math.pow(10,baseId*3)))) ))) |
|
|
273 |
|
|
|
274 |
|
|
|
275 |
sVal = ('{:.%sf}' % min(fp, sStrD)).format((value/(math.pow(10,baseId*3)))) |
|
|
276 |
suffix = 'kMGTYZ'[baseId-1] |
|
|
277 |
else: |
|
|
278 |
|
|
|
279 |
sStrD = max(0,targetDigits-len(str( '{:.0f}'.format((value*(math.pow(10,-baseId*3)))) ))) |
|
|
280 |
sVal = ('{:.%sf}' % min(fp, sStrD)).format((value*(math.pow(10,-baseId*3)))) |
|
|
281 |
suffix = 'mnpf'[-baseId-1] |
|
|
282 |
|
|
|
283 |
if len(sVal)+1>targetDigits: |
|
|
284 |
# :( |
|
|
285 |
sVal = str(round(value,fp))[1:] |
|
|
286 |
suffix = '' |
|
|
287 |
|
|
|
288 |
|
|
|
289 |
return('%s%s' % (sVal,suffix)) |
|
|
290 |
|
|
|
291 |
|
|
|
292 |
|
|
|
293 |
|
|
|
294 |
##Pathtools |
|
|
295 |
def decodePath( path, spacers={'flag': ',', 'param':'_', 'kv':'=', 'invalid':['=','_']}): |
|
|
296 |
|
|
|
297 |
|
|
|
298 |
invalidFileNameCharacters = spacers['invalid'] |
|
|
299 |
flagSpacer = spacers['flag'] |
|
|
300 |
paramSpacer = spacers['param'] |
|
|
301 |
keyValueSpacer = spacers['kv'] |
|
|
302 |
|
|
|
303 |
decodedPath = { |
|
|
304 |
'runId':None, |
|
|
305 |
'step':-1, |
|
|
306 |
'name':'unknown', |
|
|
307 |
'parameters':{}, |
|
|
308 |
'flags' : [], |
|
|
309 |
'extension':'' |
|
|
310 |
} |
|
|
311 |
|
|
|
312 |
parts = os.path.splitext(path) |
|
|
313 |
basePath = parts[0] |
|
|
314 |
decodedPath['extension'] = parts[1] |
|
|
315 |
|
|
|
316 |
#Check if the anlysis base directory is in the path: |
|
|
317 |
parts = basePath.split('/') |
|
|
318 |
if 'analysis_' in parts[0]: |
|
|
319 |
decodedPath['runId'] = parts[0] |
|
|
320 |
toDecode = parts[1] |
|
|
321 |
else: |
|
|
322 |
toDecode = path |
|
|
323 |
|
|
|
324 |
keyValuePairs = basePath.split(paramSpacer) |
|
|
325 |
for pair in keyValuePairs: |
|
|
326 |
keyValue = pair.split(keyValueSpacer) |
|
|
327 |
if len(keyValue)!=2: |
|
|
328 |
print(('Parameter decoding failed: %s, this probably means input files contain invalid characters such as %s' % (pair, ", ".join(invalidFileNameCharacters)))) |
|
|
329 |
else: |
|
|
330 |
key = keyValue[0] |
|
|
331 |
value = keyValue[1] |
|
|
332 |
if key!='flags' and key!='name': |
|
|
333 |
decodedPath['parameters'][key] = value |
|
|
334 |
elif key=='flags': |
|
|
335 |
decodedPath['flags'] = value.split(flagSpacer) |
|
|
336 |
elif key=='name': |
|
|
337 |
decodedPath['name'] = value |
|
|
338 |
|
|
|
339 |
return(decodedPath) |
|
|
340 |
|
|
|
341 |
|
|
|
342 |
def invBase(nucleotide): |
|
|
343 |
|
|
|
344 |
if nucleotide=='A': |
|
|
345 |
return(['T','C','G']) |
|
|
346 |
if nucleotide=='C': |
|
|
347 |
return(['T','A','G']) |
|
|
348 |
if nucleotide=='T': |
|
|
349 |
return(['C','A','G']) |
|
|
350 |
if nucleotide=='G': |
|
|
351 |
return(['C','A','T']) |
|
|
352 |
return(['']) |
|
|
353 |
|
|
|
354 |
def encodePath(step, name, params, extension, spacers={'flag': ',', 'param':'_', 'kv':'=', 'invalid':['=','_']}): |
|
|
355 |
|
|
|
356 |
invalidFileNameCharacters = spacers['invalid'] |
|
|
357 |
flagSpacer = spacers['flag'] |
|
|
358 |
paramSpacer = spacers['param'] |
|
|
359 |
keyValueSpacer = spacers['kv'] |
|
|
360 |
|
|
|
361 |
#Todo get keyValueSpacer in here |
|
|
362 |
encodedPath = 'step={:0>2d}'.format(step) |
|
|
363 |
encodedPath += '_name=%s' % name |
|
|
364 |
for parname in params: |
|
|
365 |
if parname.lower()!="flags": |
|
|
366 |
if not( any(i in parname for i in invalidFileNameCharacters) or any(i in params[parname] for i in invalidFileNameCharacters) ): |
|
|
367 |
encodedPath+='%s%s%s%s' % (paramSpacer, parname, keyValueSpacer, params[parname]) |
|
|
368 |
else: |
|
|
369 |
raise ValueError('Parameter encoding failed: %s %s contains at least one invalid character' % (parname, params[parname]) ) |
|
|
370 |
|
|
|
371 |
#Todo check flags for mistakes |
|
|
372 |
if 'flags' in params: |
|
|
373 |
encodedPath+='%sflags%s%s' % (paramSpacer, keyValueSpacer,flagSpacer.join(params['flags'])) |
|
|
374 |
|
|
|
375 |
if extension!=None and extension!=False: |
|
|
376 |
encodedPath = '%s.%s' % (encodedPath, extension) |
|
|
377 |
|
|
|
378 |
return(encodedPath) |
|
|
379 |
|
|
|
380 |
|
|
|
381 |
def locateIndel( seqA, seqB, verbose=False ): |
|
|
382 |
|
|
|
383 |
alignment = pairwise2.align.globalmx(seqA, seqB, 2, -1, one_alignment_only=True)[0] |
|
|
384 |
|
|
|
385 |
#pairwise2.format_alignment(*alignment) |
|
|
386 |
indelLocA = -1 |
|
|
387 |
indelLocB = -1 |
|
|
388 |
|
|
|
389 |
if alignment: |
|
|
390 |
align1 = alignment[0] |
|
|
391 |
align2 = alignment[1] |
|
|
392 |
|
|
|
393 |
localAlign1 = align1.strip('-') |
|
|
394 |
localAlign2 = align2.strip('-') |
|
|
395 |
|
|
|
396 |
dels = min( localAlign1.count('-'), localAlign2.count('-') ) |
|
|
397 |
ins = max( localAlign1.count('-'), localAlign2.count('-') ) |
|
|
398 |
if (ins==1) and dels==0: |
|
|
399 |
|
|
|
400 |
seqNHasIndel = localAlign2.count('-')==0 #bool: 0: seqA has indel, 1: seqB has indel |
|
|
401 |
|
|
|
402 |
#Find indel location: |
|
|
403 |
|
|
|
404 |
if seqNHasIndel: |
|
|
405 |
indelLocA = align1.find('-') |
|
|
406 |
if verbose: |
|
|
407 |
print(('seqA:%s\nseqB:%s'% (align1,align2))) |
|
|
408 |
print((' '*(indelLocA+5)+'^')) |
|
|
409 |
else: |
|
|
410 |
indelLocB = align2.find('-') |
|
|
411 |
if verbose: |
|
|
412 |
print(('seqA:%s\nseqB:%s'% (align1,align2))) |
|
|
413 |
print((' '*(indelLocB+5)+'^')) |
|
|
414 |
|
|
|
415 |
|
|
|
416 |
#print(alignment[2]) is score |
|
|
417 |
|
|
|
418 |
return(indelLocA,indelLocB) |
|
|
419 |
|
|
|
420 |
|
|
|
421 |
class Histogram(): |
|
|
422 |
def __init__(self): |
|
|
423 |
self.data = {} |
|
|
424 |
|
|
|
425 |
def addCount(self,idx, count=1): |
|
|
426 |
if not idx in self.data: |
|
|
427 |
self.data[idx] = 0 |
|
|
428 |
self.data[idx]+=count |
|
|
429 |
|
|
|
430 |
def write(self, path): |
|
|
431 |
with open(path,'w') as f: |
|
|
432 |
for index in self.data: |
|
|
433 |
f.write('%s;%s\n' % (index, self.data[index])) |
|
|
434 |
|
|
|
435 |
|
|
|
436 |
|
|
|
437 |
globalWritingUpdates = False |
|
|
438 |
class InternalTool(): |
|
|
439 |
def __init__(self, screenName = "unnamed"): |
|
|
440 |
self.verbose = True |
|
|
441 |
self.screenName = screenName |
|
|
442 |
|
|
|
443 |
def setVerbosity(self, verbosity): |
|
|
444 |
self.verbose = verbosity |
|
|
445 |
|
|
|
446 |
def __priorRunningPrint(self): |
|
|
447 |
if self.isUpdating(): |
|
|
448 |
print('') |
|
|
449 |
self.setUpdating(False) |
|
|
450 |
|
|
|
451 |
def isUpdating(self): |
|
|
452 |
global globalWritingUpdates |
|
|
453 |
if globalWritingUpdates: |
|
|
454 |
return(True) |
|
|
455 |
|
|
|
456 |
def setUpdating(self, boolean): |
|
|
457 |
|
|
|
458 |
global globalWritingUpdates |
|
|
459 |
globalWritingUpdates = boolean |
|
|
460 |
|
|
|
461 |
|
|
|
462 |
def log(self, message): |
|
|
463 |
if self.verbose: |
|
|
464 |
self.__priorRunningPrint() |
|
|
465 |
print((Fore.GREEN + self.screenName + ', INFO: ' + Style.RESET_ALL + message)) |
|
|
466 |
|
|
|
467 |
def warn(self, message): |
|
|
468 |
self.__priorRunningPrint() |
|
|
469 |
print((Fore.RED + self.screenName + ', WARN: ' + Style.RESET_ALL + message)) |
|
|
470 |
|
|
|
471 |
def softWarn(self, message): |
|
|
472 |
self.__priorRunningPrint() |
|
|
473 |
print((Fore.YELLOW + self.screenName + ', SOFT WARN: ' + Style.RESET_ALL + message)) |
|
|
474 |
|
|
|
475 |
|
|
|
476 |
|
|
|
477 |
def update(self, message): |
|
|
478 |
print('\r' + Fore.GREEN + self.screenName + ', UPDATE: ' + Style.RESET_ALL +message, end=' ') |
|
|
479 |
self.setUpdating(True) |
|
|
480 |
|
|
|
481 |
|
|
|
482 |
def info(self, message): |
|
|
483 |
self.log(message) |
|
|
484 |
|
|
|
485 |
|
|
|
486 |
def fatal(self, message): |
|
|
487 |
self.__priorRunningPrint() |
|
|
488 |
print((Fore.RED + self.screenName + ', FATAL ERROR: ' + Style.RESET_ALL + message)) |
|
|
489 |
exit() |
|
|
490 |
|
|
|
491 |
|
|
|
492 |
class ExternalTool(InternalTool): |
|
|
493 |
def __init__(self, screenName = "unnamed"): |
|
|
494 |
InternalTool.__init__(self) |
|
|
495 |
#Locations to look for binaries when they are not found on the PATH |
|
|
496 |
|
|
|
497 |
|
|
|
498 |
def setExecutablePath(self, path): |
|
|
499 |
self.executable = path |
|
|
500 |
|
|
|
501 |
#Only works in unix and cygwin |
|
|
502 |
def isAvailable(self): |
|
|
503 |
available = subprocess.call("type " + self.executable.split()[0], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 |
|
|
504 |
return(available or os.path.isfile(self.executable)) |
|
|
505 |
|
|
|
506 |
|
|
|
507 |
class FastqToFasta(ExternalTool): |
|
|
508 |
def __init__(self): |
|
|
509 |
ExternalTool.__init__(self) |
|
|
510 |
self.setExecutablePath('fastq_to_fasta') |
|
|
511 |
self.screenName = "FASTQ_TO_FASTA" |
|
|
512 |
|
|
|
513 |
def execute(self, fqPath, fastaPath): |
|
|
514 |
os.system('%s -n -i %s -o %s' % (self.executable, fqPath, fastaPath )) |
|
|
515 |
|
|
|
516 |
|
|
|
517 |
class FastQQualityFilter(ExternalTool): |
|
|
518 |
|
|
|
519 |
def __init__(self): |
|
|
520 |
ExternalTool.__init__(self) |
|
|
521 |
self.setExecutablePath('fastq_quality_filter') |
|
|
522 |
self.minBaseQuality = 36 |
|
|
523 |
self.sequenceCountSuffix = 'fastqQualityResultCount' |
|
|
524 |
self.screenName = "FASTQ_QUALITY_FILTER" |
|
|
525 |
|
|
|
526 |
def setMinBaseQuality(self, minBaseQuality): |
|
|
527 |
self.minBaseQuality = minBaseQuality |
|
|
528 |
|
|
|
529 |
def getCountPath(self,filteredTargetPath): |
|
|
530 |
return( '%s_%s' % (filteredTargetPath,self.sequenceCountSuffix) ) |
|
|
531 |
|
|
|
532 |
|
|
|
533 |
def execute(self, fqPath, filteredTargetPath): |
|
|
534 |
os.system('%s -q %s -i %s -o %s -v -sam > %s' % (self.executable, self.minBaseQuality, fqPath, filteredTargetPath, self.getCountPath())) |
|
|
535 |
r = 0 |
|
|
536 |
with open(self.getCountPath(), 'w') as f: |
|
|
537 |
r = int(f.readline()) |
|
|
538 |
|
|
|
539 |
return(r) |
|
|
540 |
|
|
|
541 |
class ART(ExternalTool): |
|
|
542 |
|
|
|
543 |
def __init__(self): |
|
|
544 |
ExternalTool.__init__(self) |
|
|
545 |
self.setExecutablePath('./art_bin_ChocolateCherryCake/art_illumina') |
|
|
546 |
self.screenName = "ART" |
|
|
547 |
|
|
|
548 |
def simulateAmpliconReads(self,sequenceAbundanceCounter, prefix, readSize=76, coverage=10, reverseComplementAmplicon=False): |
|
|
549 |
|
|
|
550 |
sequences = [] |
|
|
551 |
print(( sequenceAbundanceCounter.most_common(1))) |
|
|
552 |
(v,hCount) = sequenceAbundanceCounter.most_common(1)[0] |
|
|
553 |
for iteration in range(0,hCount): |
|
|
554 |
|
|
|
555 |
for sequence,count in sequenceAbundanceCounter.most_common(): |
|
|
556 |
if count>=hCount: |
|
|
557 |
sequences.append(sequence) |
|
|
558 |
else: |
|
|
559 |
break |
|
|
560 |
hCount-=1 |
|
|
561 |
|
|
|
562 |
|
|
|
563 |
bpythonSeqs=[] |
|
|
564 |
for index,sequence in enumerate(sequences): |
|
|
565 |
if reverseComplementAmplicon : |
|
|
566 |
bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequence).reverse_complement(), '%s-%s' % (str(index),str(sequence)))) |
|
|
567 |
else: |
|
|
568 |
bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequence), '%s-%s' % (str(index),str(sequence)))) |
|
|
569 |
|
|
|
570 |
fastaPath = getTempFileName('art')+'.fa' |
|
|
571 |
SeqIO.write(bpythonSeqs, fastaPath, "fasta") |
|
|
572 |
|
|
|
573 |
os.system('%s -amp -i %s -l %s -f %s -o %s -sam -ss HS25' %(self.executable, fastaPath, readSize, coverage, prefix)) |
|
|
574 |
|
|
|
575 |
|
|
|
576 |
|
|
|
577 |
def setMinBaseQuality(self, minBaseQuality): |
|
|
578 |
self.minBaseQuality = minBaseQuality |
|
|
579 |
|
|
|
580 |
def getCountPath(self,filteredTargetPath): |
|
|
581 |
return( '%s_%s' % (filteredTargetPath,self.sequenceCountSuffix) ) |
|
|
582 |
|
|
|
583 |
|
|
|
584 |
def execute(self, fqPath, filteredTargetPath): |
|
|
585 |
os.system('%s -q %s -i %s -o %s -v > %s' % (self.executable, self.minBaseQuality, fqPath, filteredTargetPath, self.getCountPath())) |
|
|
586 |
r = 0 |
|
|
587 |
with open(self.getCountPath(), 'w') as f: |
|
|
588 |
r = int(f.readline()) |
|
|
589 |
|
|
|
590 |
return(r) |
|
|
591 |
|
|
|
592 |
|
|
|
593 |
class NeedleAll(ExternalTool): |
|
|
594 |
|
|
|
595 |
def __init__(self): |
|
|
596 |
ExternalTool.__init__(self) |
|
|
597 |
self.setExecutablePath('needleall') |
|
|
598 |
self.screenName = "NEEDLE_ALL" |
|
|
599 |
|
|
|
600 |
|
|
|
601 |
def execute(self, fqPathA, fqPathB, outputFilePath): |
|
|
602 |
command = '%s -auto -stdout -asequence "%s" -bsequence "%s" > %s' % (self.executable, fqPathA, fqPathB, outputFilePath) |
|
|
603 |
print(command) |
|
|
604 |
os.system(command) |
|
|
605 |
r = 0 |
|
|
606 |
|
|
|
607 |
#This is an inverted distance matrix |
|
|
608 |
invScores = DistanceMatrix() |
|
|
609 |
|
|
|
610 |
#find max score: |
|
|
611 |
maxScore = 0 |
|
|
612 |
# To perform T-SNE a normalised matrix is required: the total sum should be 1 |
|
|
613 |
totalScore = 0 |
|
|
614 |
cells = 0 |
|
|
615 |
with open(outputFilePath, 'r') as f: |
|
|
616 |
for line in f: |
|
|
617 |
#print(line) |
|
|
618 |
parts = line.strip().replace('(','').replace(')','').split(' ') |
|
|
619 |
if len(parts)==4: |
|
|
620 |
value = float(parts[3]) |
|
|
621 |
if value>maxScore: |
|
|
622 |
maxScore=value |
|
|
623 |
totalScore+=value |
|
|
624 |
cells+=1 |
|
|
625 |
|
|
|
626 |
#Normalise the scores |
|
|
627 |
|
|
|
628 |
|
|
|
629 |
matrixSum = 0 |
|
|
630 |
with open(outputFilePath, 'r') as f: |
|
|
631 |
for line in f: |
|
|
632 |
#print(line) |
|
|
633 |
parts = line.strip().replace('(','').replace(')','').split(' ') |
|
|
634 |
#print('%s %s %s %s' % (parts[0], parts[1],parts[2],parts[3])) |
|
|
635 |
if len(parts)==4: |
|
|
636 |
if float(parts[3])>0: |
|
|
637 |
value = (maxScore-float(parts[3])) |
|
|
638 |
matrixSum+= value |
|
|
639 |
invScores.setDistance(str(parts[0]), str(parts[1]), value) |
|
|
640 |
|
|
|
641 |
print(('Sum of distance matrix is %s' % matrixSum)) |
|
|
642 |
return(invScores) |
|
|
643 |
|
|
|
644 |
|
|
|
645 |
|
|
|
646 |
class ClustalOmega(ExternalTool): |
|
|
647 |
|
|
|
648 |
def __init__(self): |
|
|
649 |
ExternalTool.__init__(self) |
|
|
650 |
self.setExecutablePath('clustalo') |
|
|
651 |
self.screenName = "CLUSTAL-OMEGA" |
|
|
652 |
|
|
|
653 |
def alignFasta(self, fastaPath,outputFastaFilePath): |
|
|
654 |
self.log('aligning %s to %s' % (fastaPath, outputFastaFilePath)) |
|
|
655 |
os.system('%s -i %s -o %s --force' % (self.executable, fastaPath, outputFastaFilePath)) |
|
|
656 |
|
|
|
657 |
|
|
|
658 |
def alignFastqs(self, fqPath, outputFastaFilePath): |
|
|
659 |
#self.log('converting %s to fasta' % fqpath) |
|
|
660 |
f = FastqToFasta() |
|
|
661 |
tmpFasta = getTempFileName() |
|
|
662 |
f.execute(fqPath, tmpFasta) |
|
|
663 |
self.alignFasta(tmpFasta,outputFastaFilePath) |
|
|
664 |
os.remove(tmpFasta) |
|
|
665 |
|
|
|
666 |
|
|
|
667 |
|
|
|
668 |
|
|
|
669 |
class SamTools(ExternalTool): |
|
|
670 |
def __init__(self): |
|
|
671 |
ExternalTool.__init__(self) |
|
|
672 |
self.setExecutablePath('samtools') |
|
|
673 |
self.screenName = "SAMTOOLS" |
|
|
674 |
|
|
|
675 |
def samToSortedBam(self, samPath, bamPath, deleteSam=False): |
|
|
676 |
self.log('Converting SAM to sorted BAM') |
|
|
677 |
os.system('%s view -bS %s | %s sort - -o %s.bam' % (self.executable, samPath, self.executable, bamPath)) |
|
|
678 |
self.log('Completed conversion') |
|
|
679 |
if deleteSam: |
|
|
680 |
os.remove(samPath) |
|
|
681 |
|
|
|
682 |
def bamToSortedBam(self, bamPath, sortedBamPath): |
|
|
683 |
self.log('Converting BAM to sorted BAM') |
|
|
684 |
os.system('%s sort -o %s.bam %s' % (self.executable, sortedBamPath, bamPath)) |
|
|
685 |
self.log('Completed conversion') |
|
|
686 |
|
|
|
687 |
|
|
|
688 |
def index(self, bamPath): |
|
|
689 |
self.log('Indexing BAM file') |
|
|
690 |
os.system('%s index %s' % (self.executable, bamPath)) |
|
|
691 |
self.log('Completed indexing') |
|
|
692 |
|
|
|
693 |
def readBamToDictOld(self,bamFilePath, columns={'cigar':5, 'left':3,'scarSequence':9}, index='scarSequence', appendTo={}): # columns:{ targetName:columnIndex, ... } |
|
|
694 |
retDict = appendTo |
|
|
695 |
#os.system("samtools view %s | cut -f1,6 | grep -oEi '^.*([0-9]+D){1,1}.*' ") |
|
|
696 |
#tempfile = './temp/temp_' + str(uuid.uuid1()) |
|
|
697 |
tempfile = getTempFileName() |
|
|
698 |
self.log("samtools view %s > %s" %(bamFilePath,tempfile)) |
|
|
699 |
os.system("samtools view %s > %s" %(bamFilePath,tempfile)) |
|
|
700 |
with open(tempfile) as f: |
|
|
701 |
#Go over all lines in the BAM file |
|
|
702 |
for line in f: |
|
|
703 |
#Extract all columns: |
|
|
704 |
parts = line.replace('\n','').split() |
|
|
705 |
#Extract the primary key value: |
|
|
706 |
indexKey = parts[columns[index]] |
|
|
707 |
#Initialise the existence of the primary key value in the return dictionary |
|
|
708 |
if not indexKey in retDict: |
|
|
709 |
retDict[indexKey] = {} |
|
|
710 |
#Iterate over the keysm except the index |
|
|
711 |
for key in columns: |
|
|
712 |
if key is not index: |
|
|
713 |
#Get the value of the key on this line in the BAM file |
|
|
714 |
v = parts[columns[key]] |
|
|
715 |
if not v in retDict[indexKey]: |
|
|
716 |
retDict[indexKey][v] = 1 |
|
|
717 |
else: |
|
|
718 |
retDict[indexKey][v] += 1 |
|
|
719 |
os.remove(tempfile) |
|
|
720 |
return(retDict) |
|
|
721 |
|
|
|
722 |
def readBamToDict(self,bamFilePath, columns={'cigar':5, 'id':0,'left':3,'scarSequence':9}, index='scarSequence', appendTo={}): # columns:{ targetName:columnIndex, ... } |
|
|
723 |
retDict = appendTo |
|
|
724 |
#os.system("samtools view %s | cut -f1,6 | grep -oEi '^.*([0-9]+D){1,1}.*' ") |
|
|
725 |
tempfile = './temp/temp_' + str(uuid.uuid1()) |
|
|
726 |
self.log("samtools view %s > %s" %(bamFilePath,tempfile)) |
|
|
727 |
os.system("samtools view %s > %s" %(bamFilePath,tempfile)) |
|
|
728 |
with open(tempfile) as f: |
|
|
729 |
#Go over all lines in the BAM file |
|
|
730 |
for line in f: |
|
|
731 |
#Extract all columns: |
|
|
732 |
parts = line.replace('\n','').split() |
|
|
733 |
#Extract the primary key value: |
|
|
734 |
indexKey = parts[columns[index]] |
|
|
735 |
#Initialise the existence of the primary key value in the return dictionary |
|
|
736 |
if not indexKey in retDict: |
|
|
737 |
retDict[indexKey] = {} |
|
|
738 |
#Iterate over the keysm except the index |
|
|
739 |
for key in columns: |
|
|
740 |
if key is not index: |
|
|
741 |
#Get the value of the key on this line in the BAM file |
|
|
742 |
if columns[key]<len(parts): |
|
|
743 |
v = parts[columns[key]] |
|
|
744 |
retDict[indexKey][key] = v |
|
|
745 |
|
|
|
746 |
try: |
|
|
747 |
os.remove(tempfile) |
|
|
748 |
except: |
|
|
749 |
print(('Temporary file could not be removed %s' % tempfile)) |
|
|
750 |
pass |
|
|
751 |
return(retDict) |
|
|
752 |
|
|
|
753 |
|
|
|
754 |
# Make sure to index the reference genome! |
|
|
755 |
class BWA_MEM(ExternalTool): |
|
|
756 |
def __init__(self): |
|
|
757 |
ExternalTool.__init__(self) |
|
|
758 |
self.setExecutablePath('bwa mem') |
|
|
759 |
self.screenName = "BWA MEM" |
|
|
760 |
self.fixedFlags = [] |
|
|
761 |
|
|
|
762 |
def setReferencePath(self, referenceGenomePath): |
|
|
763 |
self.referenceGenomePath = referenceGenomePath |
|
|
764 |
|
|
|
765 |
def execute(self, fqPath, samPath, readGroupInfo=None, secondSplitHits=False, getSecondaryAlignments=False): |
|
|
766 |
if not isinstance(fqPath, str): |
|
|
767 |
fqPath = " ".join(fqPath) |
|
|
768 |
|
|
|
769 |
if self.referenceGenomePath!=None: |
|
|
770 |
self.log('Aligning %s to %s' %(fqPath, self.referenceGenomePath)) |
|
|
771 |
|
|
|
772 |
flags=[]+self.fixedFlags |
|
|
773 |
if secondSplitHits: |
|
|
774 |
flags.append("-M") |
|
|
775 |
if readGroupInfo!=None: |
|
|
776 |
flags.append("-R '%s'" % readGroupInfo) |
|
|
777 |
if getSecondaryAlignments: |
|
|
778 |
flags.append("-a") |
|
|
779 |
|
|
|
780 |
self.log("Flags: %s" % (" ".join(flags))) |
|
|
781 |
cmd = '%s %s %s %s > %s' % (self.executable, " ".join(flags), self.referenceGenomePath, fqPath, samPath) |
|
|
782 |
self.log(cmd) |
|
|
783 |
os.system(cmd) |
|
|
784 |
else: |
|
|
785 |
self.warn('Warning; no reference genome specified for BWA MEM, canceled alignment.') |
|
|
786 |
|
|
|
787 |
# Make sure to index the reference genome! |
|
|
788 |
class STAR(ExternalTool): |
|
|
789 |
def __init__(self): |
|
|
790 |
ExternalTool.__init__(self) |
|
|
791 |
self.setExecutablePath('STAR') |
|
|
792 |
self.screenName = "STAR" |
|
|
793 |
self.threads = 4 |
|
|
794 |
|
|
|
795 |
def index(self): |
|
|
796 |
|
|
|
797 |
if not os.path.exists(self.refDirectory): |
|
|
798 |
try: |
|
|
799 |
os.mkdir(self.refDirectory ) |
|
|
800 |
except: |
|
|
801 |
self.fatal('Could not create index directory on %s' % self.refDirectory) |
|
|
802 |
|
|
|
803 |
os.system('%s --runMode genomeGenerate --genomeDir %s --genomeFastaFiles %s --runThreadN %s' % |
|
|
804 |
(self.executable, self.refDirectory, self.referenceGenomePath, self.threads)) |
|
|
805 |
|
|
|
806 |
|
|
|
807 |
|
|
|
808 |
def setReferencePath(self, referenceGenomePath): |
|
|
809 |
self.referenceGenomePath = referenceGenomePath |
|
|
810 |
self.refDirectory = '%s_STAR_INDEX' % os.path.basename(self.referenceGenomePath) |
|
|
811 |
if not os.path.exists(self.refDirectory): |
|
|
812 |
self.info("Reference index %s does not exist." % referenceGenomePath) |
|
|
813 |
self.index() |
|
|
814 |
|
|
|
815 |
|
|
|
816 |
def execute(self, fqPath, outputPath): |
|
|
817 |
os.system('%s --genomeDir %s --readFilesIn %s --runThreadN %s --outFileNamePrefix %s' % |
|
|
818 |
(self.executable, self.refDirectory, fqPath, self.threads, outputPath)) |
|
|
819 |
|
|
|
820 |
|
|
|
821 |
#!!!sjdbOverhang should match the length of the reads minus one!!! |
|
|
822 |
def pass2alignment(self, SJouttabPath, sjdbOverhang=74): |
|
|
823 |
|
|
|
824 |
base = os.path.basename(SJouttabPath) |
|
|
825 |
|
|
|
826 |
self.p2refDirectory = '%s_P2_STAR_INDEX' % os.path.basename(SJouttabPath) |
|
|
827 |
if not os.path.exists(self.p2refDirectory): |
|
|
828 |
try: |
|
|
829 |
os.mkdir(self.p2refDirectory ) |
|
|
830 |
except: |
|
|
831 |
self.fatal('Could not create index directory on %s' % self.p2refDirectory) |
|
|
832 |
|
|
|
833 |
|
|
|
834 |
def mapSingleEnd(self, fqPath, samPath): |
|
|
835 |
if not isinstance(fqPath, str): |
|
|
836 |
fqPath = " ".join(fqPath) |
|
|
837 |
|
|
|
838 |
if self.referenceGenomePath!=None: |
|
|
839 |
self.log('Aligning %s to %s' %(fqPath, self.referenceGenomePath)) |
|
|
840 |
os.system('%s --genomeDir %s --readFilesIn %s --runThreadN %s' % |
|
|
841 |
(self.executable, self.refDirectory, fqPath, self.threads)) |
|
|
842 |
else: |
|
|
843 |
self.warn('Warning; no reference genome specified for STAR, canceled alignment.') |
|
|
844 |
|
|
|
845 |
class GSNAP(ExternalTool): |
|
|
846 |
def __init__(self): |
|
|
847 |
ExternalTool.__init__(self) |
|
|
848 |
self.setExecutablePath('gsnap') |
|
|
849 |
self.screenName = "GSNAP" |
|
|
850 |
#self.cmd= 'sudo gsnap -d lintrace analysis_bdefa5bc-6b5c-11e5-a487-0800279bd60a_fullScarList_Protocol1-DNA-seq.fq -t 4 -A sam > analysis_bdefa5bc-6b5c-11e5-a487-0800279bd60a_fullScarList_Protocol1-DNA-seq.sam' |
|
|
851 |
|
|
|
852 |
#Reference genome path should be the name of the reference build folder |
|
|
853 |
def setReferencePath(self, referenceGenomePath): |
|
|
854 |
self.referenceGenomePath = referenceGenomePath |
|
|
855 |
|
|
|
856 |
def execute(self, fqPath, samPath): |
|
|
857 |
if self.referenceGenomePath!=None: |
|
|
858 |
self.log('Aligning %s to %s' %(fqPath, self.referenceGenomePath)) |
|
|
859 |
os.system('%s -d %s %s --format=sam > %s' % (self.executable, self.referenceGenomePath, fqPath, samPath)) |
|
|
860 |
else: |
|
|
861 |
self.warn('Warning; no reference genome specified for GSNAP, canceled alignment.') |
|
|
862 |
|
|
|
863 |
|
|
|
864 |
def getTempFileName(centerFix=""): |
|
|
865 |
if not os.path.exists('./temp'): |
|
|
866 |
os.mkdir('./temp') |
|
|
867 |
return('./temp/temp_' + centerFix + str(uuid.uuid1())) |
|
|
868 |
|
|
|
869 |
def mapReads(readsFile, baseName, mapper, samTools=None,readGroupInfo=None, secondSplitHits=False): |
|
|
870 |
|
|
|
871 |
if samTools==None: |
|
|
872 |
samTools = SamTools() |
|
|
873 |
|
|
|
874 |
basePath = '%s/%s' % ( '.', baseName) |
|
|
875 |
samPath = '%s.sam' %( basePath) |
|
|
876 |
bamPath = '%s.bam' %( basePath) |
|
|
877 |
|
|
|
878 |
|
|
|
879 |
if readGroupInfo==None and secondSplitHits==False: |
|
|
880 |
mapper.execute(readsFile, samPath) |
|
|
881 |
else: |
|
|
882 |
mapper.execute(readsFile, samPath, readGroupInfo=None, secondSplitHits=False) |
|
|
883 |
|
|
|
884 |
samTools.samToSortedBam(samPath, basePath, True) |
|
|
885 |
samTools.index(bamPath) |
|
|
886 |
return({'samPath':samPath, 'bamPath':bamPath}) |
|
|
887 |
|
|
|
888 |
def mapSequencesToDict(mapper, sequences): |
|
|
889 |
|
|
|
890 |
bpythonSeqs=[] |
|
|
891 |
for index,sequence in enumerate(sequences): |
|
|
892 |
bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequence), str(index))) |
|
|
893 |
|
|
|
894 |
fastaPath = getTempFileName('bwa')+'.fa' |
|
|
895 |
SeqIO.write(bpythonSeqs, fastaPath, "fasta") |
|
|
896 |
bPath = getTempFileName('samtools') |
|
|
897 |
samPath = bPath+'.sam' |
|
|
898 |
bamPath = bPath+'.bam' |
|
|
899 |
mapper.execute(fastaPath, samPath) |
|
|
900 |
samtools = SamTools() |
|
|
901 |
samtools.samToSortedBam(samPath, bPath, True) |
|
|
902 |
samtools.index(bamPath) |
|
|
903 |
d = samtools.readBamToDict(bamPath) |
|
|
904 |
return(d) |
|
|
905 |
|
|
|
906 |
def mapDictOfSequences(mapper, sequenceDict, bPath=None): |
|
|
907 |
|
|
|
908 |
bpythonSeqs=[] |
|
|
909 |
for seqId in sequenceDict: |
|
|
910 |
bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequenceDict[seqId]), str(seqId))) |
|
|
911 |
|
|
|
912 |
fastaPath = getTempFileName('bwa')+'.fa' |
|
|
913 |
SeqIO.write(bpythonSeqs, fastaPath, "fasta") |
|
|
914 |
if bPath==None: |
|
|
915 |
bPath = getTempFileName('samtools') |
|
|
916 |
samPath = bPath+'.sam' |
|
|
917 |
bamPath = bPath+'.bam' |
|
|
918 |
mapper.execute(fastaPath, samPath) |
|
|
919 |
samtools = SamTools() |
|
|
920 |
samtools.samToSortedBam(samPath, bPath, False) |
|
|
921 |
samtools.index(bamPath) |
|
|
922 |
return(bamPath) |
|
|
923 |
|
|
|
924 |
def mapReadsToDict(mapper, fqPath, index='id',basePath=None, readGroupInfo=None, secondSplitHits=False): |
|
|
925 |
if basePath==None: |
|
|
926 |
bPath = getTempFileName('samtools') |
|
|
927 |
else: |
|
|
928 |
bPath=basePath |
|
|
929 |
samPath = bPath+'.sam' |
|
|
930 |
bamPath = bPath+'.bam' |
|
|
931 |
mapper.execute(fqPath, samPath, readGroupInfo, secondSplitHits) |
|
|
932 |
samtools = SamTools() |
|
|
933 |
samtools.samToSortedBam(samPath, bPath, True) |
|
|
934 |
samtools.index(bamPath) |
|
|
935 |
r = {} |
|
|
936 |
d = samtools.readBamToDict(bamPath, index=index, appendTo=r) |
|
|
937 |
return(d) |
|
|
938 |
|
|
|
939 |
# Reference to this function: http://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python |
|
|
940 |
# This method is the quickest way to count lines in big files. (Quicker than wc..) |
|
|
941 |
def fileLineCount(filename): |
|
|
942 |
f = open(filename, 'rb') |
|
|
943 |
bufgen = takewhile(lambda x: x, (f.read(1024*1024) for _ in repeat(None))) |
|
|
944 |
return sum( buf.count(b'\n') for buf in bufgen ) |
|
|
945 |
|
|
|
946 |
|
|
|
947 |
# Deprecated: should use Numpy sparse matrix |
|
|
948 |
class DistanceMatrix(): |
|
|
949 |
def __init__(self, maxDist=1000000000): |
|
|
950 |
self.data = {} |
|
|
951 |
self.keys = set() |
|
|
952 |
self.maxDist = maxDist |
|
|
953 |
|
|
|
954 |
self.finalKeying = True # Do not store keys on every update |
|
|
955 |
|
|
|
956 |
def setDistances(self,fromList, toList, distanceList ): |
|
|
957 |
# Add keys: |
|
|
958 |
if not self.finalKeying: |
|
|
959 |
self.keys.update(fromList+toList) |
|
|
960 |
|
|
|
961 |
for index,a in enumerate(fromList): |
|
|
962 |
self._update(a,toList[index], distanceList[index]) |
|
|
963 |
|
|
|
964 |
def getNumpyArray(self): |
|
|
965 |
#Make sure the keys are properly indexed |
|
|
966 |
self._performKeying() |
|
|
967 |
|
|
|
968 |
#Initialise empty matrix: |
|
|
969 |
narray = np.zeros((len(self.keys), len(self.keys))) |
|
|
970 |
|
|
|
971 |
for aIndex,keyA in enumerate(self.keys): |
|
|
972 |
for bIndex,keyB in enumerate(self.keys): |
|
|
973 |
narray[aIndex,bIndex] = self.getDistance(keyA,keyB) |
|
|
974 |
|
|
|
975 |
return(narray) |
|
|
976 |
|
|
|
977 |
|
|
|
978 |
def _update(self, a,b,distance): |
|
|
979 |
if distance<=self.maxDist: |
|
|
980 |
if (not a in self.data) and (not b in self.data): |
|
|
981 |
self.data[a] = {} |
|
|
982 |
self.data[a][b] = distance |
|
|
983 |
return(True) |
|
|
984 |
if a in self.data and b in self.data[a]: |
|
|
985 |
self.data[a][b] = distance |
|
|
986 |
return(True) |
|
|
987 |
if b in self.data and a in self.data[b]: |
|
|
988 |
self.data[b][a] = distance |
|
|
989 |
return(True) |
|
|
990 |
|
|
|
991 |
|
|
|
992 |
# A already exists but B does not: (And b also does not exists as main entry) |
|
|
993 |
if a in self.data and not b in self.data and not b in self.data[a]: |
|
|
994 |
self.data[a][b] = distance |
|
|
995 |
return(True) |
|
|
996 |
elif b in self.data: |
|
|
997 |
if not a in self.data[b]: |
|
|
998 |
self.data[b][a] = distance |
|
|
999 |
return(True) |
|
|
1000 |
return(False) |
|
|
1001 |
|
|
|
1002 |
|
|
|
1003 |
def setDistance(self, a,b,distance ): |
|
|
1004 |
if not self.finalKeying: |
|
|
1005 |
if a not in self.keys and b not in self.keys: |
|
|
1006 |
self.keys.update([a,b]) |
|
|
1007 |
else: |
|
|
1008 |
if a not in self.keys: |
|
|
1009 |
self.keys.add(a) |
|
|
1010 |
if b not in self.keys: |
|
|
1011 |
self.keys.add(b) |
|
|
1012 |
|
|
|
1013 |
return( self._update(a,b, distance )) |
|
|
1014 |
|
|
|
1015 |
|
|
|
1016 |
def getDistance(self, a,b ): |
|
|
1017 |
if a in self.data and b in self.data[a]: |
|
|
1018 |
return(self.data[a][b]) |
|
|
1019 |
if b in self.data and a in self.data[b]: |
|
|
1020 |
return(self.data[b][a]) |
|
|
1021 |
return(self.maxDist) |
|
|
1022 |
|
|
|
1023 |
|
|
|
1024 |
def _performKeying(self): |
|
|
1025 |
#Retrieve all keys |
|
|
1026 |
self.keys = set() |
|
|
1027 |
addKeys = [] |
|
|
1028 |
self.keys.update(list(self.data.keys())) |
|
|
1029 |
for a in self.data: |
|
|
1030 |
for b in self.data[a]: |
|
|
1031 |
if b not in self.keys: |
|
|
1032 |
addKeys.append(b) |
|
|
1033 |
self.keys.update(addKeys) |
|
|
1034 |
|
|
|
1035 |
def writeToFile(self, path): |
|
|
1036 |
|
|
|
1037 |
if self.finalKeying: |
|
|
1038 |
self._performKeying() |
|
|
1039 |
|
|
|
1040 |
separator = ';' |
|
|
1041 |
with open(path, 'w') as f: |
|
|
1042 |
values= ['name'] |
|
|
1043 |
for keyA in self.keys: |
|
|
1044 |
values.append(keyA) |
|
|
1045 |
f.write('%s\n' % (separator.join(values))) |
|
|
1046 |
for keyA in self.keys: |
|
|
1047 |
values = [] |
|
|
1048 |
for keyB in self.keys: |
|
|
1049 |
values.append(str(self.getDistance(keyA, keyB))) |
|
|
1050 |
f.write('%s%s%s\n' % (keyA, separator,separator.join(values))) |
|
|
1051 |
|
|
|
1052 |
|
|
|
1053 |
def loadFromFile(self, path,separator=None): |
|
|
1054 |
with open(path) as f: |
|
|
1055 |
idx = 0 |
|
|
1056 |
header = {} |
|
|
1057 |
for line in f: |
|
|
1058 |
parts = line.rstrip().split(separator) |
|
|
1059 |
if idx==0: |
|
|
1060 |
for keyId,keyName in enumerate(parts): |
|
|
1061 |
header[keyId] = keyName |
|
|
1062 |
else: |
|
|
1063 |
|
|
|
1064 |
for partIndex, part in parts: |
|
|
1065 |
if partIndex==0: |
|
|
1066 |
keyA = part |
|
|
1067 |
else: |
|
|
1068 |
keyB=header[partIndex] |
|
|
1069 |
self.setDistance(keyA,keyB, float(part)) |
|
|
1070 |
|
|
|
1071 |
idx+=1 |