from Bio import SeqIO
from Bio.Seq import Seq
import os
import uuid
from Bio import pairwise2
import math
import subprocess
from colorama import Fore #,Back, Style
from colorama import Back
from colorama import Style
from colorama import init
init(autoreset=True)
import numpy as np
import re
from itertools import takewhile,repeat
#import localInstall
if not os.name=='nt':
import pysam
### custom path config: (not used when not available)
#localInstall.addToPath('/hpc/hub_oudenaarden/bdebarbanson/bin/samtools-1.3.1/')
#localInstall.addToPath('/hpc/hub_oudenaarden/bin/software/bwa-0.7.10/')
#localInstall.addToPath('/media/sf_externalTools/')
### end
class AlignmentVisualiser:
def __init__(self, initialiser):
if type(initialiser) is pysam.AlignedSegment:
self.alignment = initialiser
def getPhredScoreColor(self, phredScore):
p = ord(phredScore)-33.0
if p>32:
return(Fore.GREEN + Style.BRIGHT)
if p>25:
return(Fore.GREEN)
if p>20:
return(Fore.GREEN + Style.DIM)
if p>18:
return(Fore.YELLOW)
return(Fore.RED)
def visualizeAlignment(self, annotateReference=None, annotateQuery=None, intronSummary=True):
reference = []
query = []
alignmentSymbols = []
qualitieSymbols = []
prevIntron = 0
for index,pair in enumerate(self.alignment.get_aligned_pairs( matches_only=False, with_seq=True) ):
#Pair structure = (queryPosition, referencePosition, referenceBase)
if pair[0] is None and pair[2] is None:
prevIntron+=1
continue
else:
if prevIntron>0:
intronString = ' N%s ' % prevIntron
alignmentSymbols.append(Style.DIM + intronString + Style.RESET_ALL)
qualitieSymbols.append(' '*len(intronString))
reference.append(Style.DIM + ('-'*len(intronString)) + Style.RESET_ALL)
query.append( Style.DIM + (' '*len(intronString)) + Style.RESET_ALL)
prevIntron = 0
queryPosition = pair[0]
queryNucleotide= "%s%s-%s"% (Back.RED, Fore.WHITE, Style.RESET_ALL)
referenceNucleotide= ' '# % (Fore.RED, Style.RESET_ALL) #if queryPosition>self.alignment.query_alignment_start and queryPosition<=self.alignment.query_alignment_end else ' '
queryPhred=' '
if queryPosition!=None:
queryNucleotide = self.alignment.query_sequence[queryPosition].upper()
queryPhred = self.alignment.qual[queryPosition]
qualitieSymbols.append(self.getPhredScoreColor(queryPhred) + queryPhred + Style.RESET_ALL)
else:
qualitieSymbols.append(' ')
#print("Q:%s S:%s" % (queryPosition, self.alignment.query_alignment_start))
if pair[2]!=None:
referenceNucleotide = pair[2].upper()
elif queryPosition is None:
referenceNucleotide = "%s%s-%s"% (Back.RED, Fore.WHITE, Style.RESET_ALL)
elif queryPosition>self.alignment.query_alignment_start and queryPosition<=self.alignment.query_alignment_end:
referenceNucleotide = "%s%s-%s"% (Back.RED, Fore.WHITE, Style.RESET_ALL)
if pair[2]==queryNucleotide:
alignmentSymbols.append('|')
else:
alignmentSymbols.append(' ')
reference.append(referenceNucleotide)
if queryPosition!=None and annotateQuery!=None:
if ((queryPosition - self.alignment.query_length) in annotateQuery and (queryPosition - self.alignment.query_length<0 )):
query.append(getattr(Back,annotateQuery[(queryPosition - self.alignment.query_length)]) + Fore.BLACK + queryNucleotide + Style.RESET_ALL)
elif queryPosition in annotateQuery:
query.append(getattr(Back,annotateQuery[queryPosition]) + Fore.WHITE + queryNucleotide + Style.RESET_ALL)
else:
#DIM soft clipped things:
if queryPosition<self.alignment.query_alignment_start or queryPosition>self.alignment.query_alignment_end:
query.append(Style.DIM + Fore.YELLOW + queryNucleotide + Style.RESET_ALL)
else:
query.append(queryNucleotide)
else:
query.append(queryNucleotide)
rStart = self.alignment.reference_end if self.alignment.is_reverse else self.alignment.reference_start
rEnd = self.alignment.reference_start if self.alignment.is_reverse else self.alignment.reference_end
refStartStr = str(rStart)
qStartStr = str(self.alignment.query_alignment_start)
labelSize = max(len(refStartStr), len(qStartStr))
annotatedCigar = []
for operation,length in self.alignment.cigartuples:
annotatedCigar.append( '%s%s%s' % (
[ '%s%sM%s'% (Fore.GREEN, Style.BRIGHT, Style.NORMAL),
'%s%sI%s' % (Fore.YELLOW,Style.BRIGHT, Style.NORMAL),
'%s%sD%s' % (Fore.RED,Style.BRIGHT, Style.NORMAL),
'%sN' % Fore.MAGENTA,
'%s%sS' % (Style.DIM,Fore.RED),
'%s%sH' % (Style.DIM,Fore.RED),
'%s%sP' % (Style.DIM,Fore.RED),
'%s%s=' % (Style.DIM,Fore.GREEN),
'%s%sX' % (Style.DIM,Fore.BLUE)]
[operation]
, length, Style.RESET_ALL))
multiMapping = ''
try:
multiMappingCount = int(self.alignment.get_tag('NH:i'))
if multiMappingCount==1:
multiMapping= Fore.GREEN + 'Unique map' + Style.RESET_ALL
elif multiMappingCount>1:
multiMapping= Fore.WHITE + Back.RED + ('Mapping to at least %s locations' % multiMappingCount) + Style.RESET_ALL
except:
pass
print( ('%sReference:%s %s %s %s MQ:%s %s' % ( Style.BRIGHT, Style.RESET_ALL, self.alignment.reference_name, "".join(annotatedCigar), ("FWD" if not self.alignment.is_reverse else "REV"), self.alignment.mapping_quality, multiMapping))+ (' MULTIMAPPING' if self.alignment.is_secondary else '' ) )
print(('%s%s %s %s (%s%s)' % (
Style.DIM+refStartStr+Style.RESET_ALL,
" "*(labelSize-len(refStartStr)),
"".join(reference),
(Style.DIM+str(rEnd)+Style.RESET_ALL),
"" if self.alignment.is_reverse else "+",
(Style.DIM+str(rEnd-rStart)+Style.RESET_ALL))))
print(('%s%s' % (" "*(labelSize+1), "".join(alignmentSymbols))))
print(('%s%s %s %s (%s)' % (
" "*(labelSize-len(qStartStr)),
Style.DIM+qStartStr+Style.RESET_ALL,
"".join(query),
(Style.DIM+str(self.alignment.query_alignment_end)+Style.RESET_ALL),
(Style.DIM+str(self.alignment.query_alignment_end-self.alignment.query_alignment_start)+Style.RESET_ALL),
)))
print(('%s%s' % (" "*(labelSize+1), "".join(qualitieSymbols))))
print(('%sQuery:%s %s' % (Style.BRIGHT, Style.RESET_ALL,self.alignment.query_name)))
def getLevenshteinDistance(source, target, maxDist=99999999999999):
if len(source) < len(target):
return getLevenshteinDistance(target, source)
# So now we have len(source) >= len(target).
if len(target) == 0:
return len(source)
# We call tuple() to force strings to be used as sequences
# ('c', 'a', 't', 's') - numpy uses them as values by default.
source = np.array(tuple(source))
target = np.array(tuple(target))
# We use a dynamic programming algorithm, but with the
# added optimization that we only need the last two rows
# of the matrix.
previous_row = np.arange(target.size + 1)
for s in source:
# Insertion (target grows longer than source):
current_row = previous_row + 1
# Substitution or matching:
# Target and source items are aligned, and either
# are different (cost of 1), or are the same (cost of 0).
current_row[1:] = np.minimum(
current_row[1:],
np.add(previous_row[:-1], target != s))
# Deletion (target grows shorter than source):
current_row[1:] = np.minimum(
current_row[1:],
current_row[0:-1] + 1)
previous_row = current_row
if( np.amin(previous_row)>maxDist):
return(None)
return previous_row[-1]
def cigarStringToDict(cigarString):
if cigarString==None:
cs = ''
else:
cs = cigarString
cigarStringDict = {}
for symbol in ['M','D','I','S']:
# Extract the integer which is found before the symbol supplied
# returns 0 when the symbol is not found as the sum of the list will be zero
matches = re.findall('[0-9]*%s' % symbol, cs)
cigarStringDict[symbol] = sum( [ int(x.replace(symbol,'')) for x in matches])
return( cigarStringDict)
def distanceLineToPoint(x1, y1, x2, y2, x0, y0):
return( float( abs( (y2-y1)*x0 - (x2 - x1 )*y0 + x2*y1 - y2*x1 ) ) / (math.sqrt( math.pow(y2-y1,2) + math.pow(x2-x1,2))) )
def fixGraphML(graphmlPath):
with open(graphmlPath) as f:
lines = []
for line in f:
lines.append( line.replace('"d3"','"fixedD3"') )
if lines!=None:
with open(graphmlPath,'w') as f:
for line in lines:
f.write(line)
def getHammingDistance(s1, s2,maxDist=999999):
d = 0
for index,base in enumerate(s1):
d+=(base!=s2[index] and s2[index]!="N" and base!="N")
if d>maxDist:
return(maxDist+1)
return(d)
def getHammingIndices( seqA, seqB, maxIndices=999999 ):
indices = []
for index,base in enumerate(seqA):
if len(seqB)<index:
break
if base!=seqB[index]:
indices.append(index)
if indices>=maxIndices:
return(indices)
return(indices)
def humanReadable(value, targetDigits=2,fp=0):
if value == 0:
return('0')
baseId = int(math.floor( math.log10(float(value))/3.0 ))
suffix = ""
if baseId==0:
sVal = str(round(value,targetDigits))
if len(sVal)>targetDigits and sVal.find('.'):
sVal = sVal.split('.')[0]
elif baseId>0:
sStrD = max(0,targetDigits-len(str( '{:.0f}'.format((value/(math.pow(10,baseId*3)))) )))
sVal = ('{:.%sf}' % min(fp, sStrD)).format((value/(math.pow(10,baseId*3))))
suffix = 'kMGTYZ'[baseId-1]
else:
sStrD = max(0,targetDigits-len(str( '{:.0f}'.format((value*(math.pow(10,-baseId*3)))) )))
sVal = ('{:.%sf}' % min(fp, sStrD)).format((value*(math.pow(10,-baseId*3))))
suffix = 'mnpf'[-baseId-1]
if len(sVal)+1>targetDigits:
# :(
sVal = str(round(value,fp))[1:]
suffix = ''
return('%s%s' % (sVal,suffix))
##Pathtools
def decodePath( path, spacers={'flag': ',', 'param':'_', 'kv':'=', 'invalid':['=','_']}):
invalidFileNameCharacters = spacers['invalid']
flagSpacer = spacers['flag']
paramSpacer = spacers['param']
keyValueSpacer = spacers['kv']
decodedPath = {
'runId':None,
'step':-1,
'name':'unknown',
'parameters':{},
'flags' : [],
'extension':''
}
parts = os.path.splitext(path)
basePath = parts[0]
decodedPath['extension'] = parts[1]
#Check if the anlysis base directory is in the path:
parts = basePath.split('/')
if 'analysis_' in parts[0]:
decodedPath['runId'] = parts[0]
toDecode = parts[1]
else:
toDecode = path
keyValuePairs = basePath.split(paramSpacer)
for pair in keyValuePairs:
keyValue = pair.split(keyValueSpacer)
if len(keyValue)!=2:
print(('Parameter decoding failed: %s, this probably means input files contain invalid characters such as %s' % (pair, ", ".join(invalidFileNameCharacters))))
else:
key = keyValue[0]
value = keyValue[1]
if key!='flags' and key!='name':
decodedPath['parameters'][key] = value
elif key=='flags':
decodedPath['flags'] = value.split(flagSpacer)
elif key=='name':
decodedPath['name'] = value
return(decodedPath)
def invBase(nucleotide):
if nucleotide=='A':
return(['T','C','G'])
if nucleotide=='C':
return(['T','A','G'])
if nucleotide=='T':
return(['C','A','G'])
if nucleotide=='G':
return(['C','A','T'])
return([''])
def encodePath(step, name, params, extension, spacers={'flag': ',', 'param':'_', 'kv':'=', 'invalid':['=','_']}):
invalidFileNameCharacters = spacers['invalid']
flagSpacer = spacers['flag']
paramSpacer = spacers['param']
keyValueSpacer = spacers['kv']
#Todo get keyValueSpacer in here
encodedPath = 'step={:0>2d}'.format(step)
encodedPath += '_name=%s' % name
for parname in params:
if parname.lower()!="flags":
if not( any(i in parname for i in invalidFileNameCharacters) or any(i in params[parname] for i in invalidFileNameCharacters) ):
encodedPath+='%s%s%s%s' % (paramSpacer, parname, keyValueSpacer, params[parname])
else:
raise ValueError('Parameter encoding failed: %s %s contains at least one invalid character' % (parname, params[parname]) )
#Todo check flags for mistakes
if 'flags' in params:
encodedPath+='%sflags%s%s' % (paramSpacer, keyValueSpacer,flagSpacer.join(params['flags']))
if extension!=None and extension!=False:
encodedPath = '%s.%s' % (encodedPath, extension)
return(encodedPath)
def locateIndel( seqA, seqB, verbose=False ):
alignment = pairwise2.align.globalmx(seqA, seqB, 2, -1, one_alignment_only=True)[0]
#pairwise2.format_alignment(*alignment)
indelLocA = -1
indelLocB = -1
if alignment:
align1 = alignment[0]
align2 = alignment[1]
localAlign1 = align1.strip('-')
localAlign2 = align2.strip('-')
dels = min( localAlign1.count('-'), localAlign2.count('-') )
ins = max( localAlign1.count('-'), localAlign2.count('-') )
if (ins==1) and dels==0:
seqNHasIndel = localAlign2.count('-')==0 #bool: 0: seqA has indel, 1: seqB has indel
#Find indel location:
if seqNHasIndel:
indelLocA = align1.find('-')
if verbose:
print(('seqA:%s\nseqB:%s'% (align1,align2)))
print((' '*(indelLocA+5)+'^'))
else:
indelLocB = align2.find('-')
if verbose:
print(('seqA:%s\nseqB:%s'% (align1,align2)))
print((' '*(indelLocB+5)+'^'))
#print(alignment[2]) is score
return(indelLocA,indelLocB)
class Histogram():
def __init__(self):
self.data = {}
def addCount(self,idx, count=1):
if not idx in self.data:
self.data[idx] = 0
self.data[idx]+=count
def write(self, path):
with open(path,'w') as f:
for index in self.data:
f.write('%s;%s\n' % (index, self.data[index]))
globalWritingUpdates = False
class InternalTool():
def __init__(self, screenName = "unnamed"):
self.verbose = True
self.screenName = screenName
def setVerbosity(self, verbosity):
self.verbose = verbosity
def __priorRunningPrint(self):
if self.isUpdating():
print('')
self.setUpdating(False)
def isUpdating(self):
global globalWritingUpdates
if globalWritingUpdates:
return(True)
def setUpdating(self, boolean):
global globalWritingUpdates
globalWritingUpdates = boolean
def log(self, message):
if self.verbose:
self.__priorRunningPrint()
print((Fore.GREEN + self.screenName + ', INFO: ' + Style.RESET_ALL + message))
def warn(self, message):
self.__priorRunningPrint()
print((Fore.RED + self.screenName + ', WARN: ' + Style.RESET_ALL + message))
def softWarn(self, message):
self.__priorRunningPrint()
print((Fore.YELLOW + self.screenName + ', SOFT WARN: ' + Style.RESET_ALL + message))
def update(self, message):
print('\r' + Fore.GREEN + self.screenName + ', UPDATE: ' + Style.RESET_ALL +message, end=' ')
self.setUpdating(True)
def info(self, message):
self.log(message)
def fatal(self, message):
self.__priorRunningPrint()
print((Fore.RED + self.screenName + ', FATAL ERROR: ' + Style.RESET_ALL + message))
exit()
class ExternalTool(InternalTool):
def __init__(self, screenName = "unnamed"):
InternalTool.__init__(self)
#Locations to look for binaries when they are not found on the PATH
def setExecutablePath(self, path):
self.executable = path
#Only works in unix and cygwin
def isAvailable(self):
available = subprocess.call("type " + self.executable.split()[0], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
return(available or os.path.isfile(self.executable))
class FastqToFasta(ExternalTool):
def __init__(self):
ExternalTool.__init__(self)
self.setExecutablePath('fastq_to_fasta')
self.screenName = "FASTQ_TO_FASTA"
def execute(self, fqPath, fastaPath):
os.system('%s -n -i %s -o %s' % (self.executable, fqPath, fastaPath ))
class FastQQualityFilter(ExternalTool):
def __init__(self):
ExternalTool.__init__(self)
self.setExecutablePath('fastq_quality_filter')
self.minBaseQuality = 36
self.sequenceCountSuffix = 'fastqQualityResultCount'
self.screenName = "FASTQ_QUALITY_FILTER"
def setMinBaseQuality(self, minBaseQuality):
self.minBaseQuality = minBaseQuality
def getCountPath(self,filteredTargetPath):
return( '%s_%s' % (filteredTargetPath,self.sequenceCountSuffix) )
def execute(self, fqPath, filteredTargetPath):
os.system('%s -q %s -i %s -o %s -v -sam > %s' % (self.executable, self.minBaseQuality, fqPath, filteredTargetPath, self.getCountPath()))
r = 0
with open(self.getCountPath(), 'w') as f:
r = int(f.readline())
return(r)
class ART(ExternalTool):
def __init__(self):
ExternalTool.__init__(self)
self.setExecutablePath('./art_bin_ChocolateCherryCake/art_illumina')
self.screenName = "ART"
def simulateAmpliconReads(self,sequenceAbundanceCounter, prefix, readSize=76, coverage=10, reverseComplementAmplicon=False):
sequences = []
print(( sequenceAbundanceCounter.most_common(1)))
(v,hCount) = sequenceAbundanceCounter.most_common(1)[0]
for iteration in range(0,hCount):
for sequence,count in sequenceAbundanceCounter.most_common():
if count>=hCount:
sequences.append(sequence)
else:
break
hCount-=1
bpythonSeqs=[]
for index,sequence in enumerate(sequences):
if reverseComplementAmplicon :
bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequence).reverse_complement(), '%s-%s' % (str(index),str(sequence))))
else:
bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequence), '%s-%s' % (str(index),str(sequence))))
fastaPath = getTempFileName('art')+'.fa'
SeqIO.write(bpythonSeqs, fastaPath, "fasta")
os.system('%s -amp -i %s -l %s -f %s -o %s -sam -ss HS25' %(self.executable, fastaPath, readSize, coverage, prefix))
def setMinBaseQuality(self, minBaseQuality):
self.minBaseQuality = minBaseQuality
def getCountPath(self,filteredTargetPath):
return( '%s_%s' % (filteredTargetPath,self.sequenceCountSuffix) )
def execute(self, fqPath, filteredTargetPath):
os.system('%s -q %s -i %s -o %s -v > %s' % (self.executable, self.minBaseQuality, fqPath, filteredTargetPath, self.getCountPath()))
r = 0
with open(self.getCountPath(), 'w') as f:
r = int(f.readline())
return(r)
class NeedleAll(ExternalTool):
def __init__(self):
ExternalTool.__init__(self)
self.setExecutablePath('needleall')
self.screenName = "NEEDLE_ALL"
def execute(self, fqPathA, fqPathB, outputFilePath):
command = '%s -auto -stdout -asequence "%s" -bsequence "%s" > %s' % (self.executable, fqPathA, fqPathB, outputFilePath)
print(command)
os.system(command)
r = 0
#This is an inverted distance matrix
invScores = DistanceMatrix()
#find max score:
maxScore = 0
# To perform T-SNE a normalised matrix is required: the total sum should be 1
totalScore = 0
cells = 0
with open(outputFilePath, 'r') as f:
for line in f:
#print(line)
parts = line.strip().replace('(','').replace(')','').split(' ')
if len(parts)==4:
value = float(parts[3])
if value>maxScore:
maxScore=value
totalScore+=value
cells+=1
#Normalise the scores
matrixSum = 0
with open(outputFilePath, 'r') as f:
for line in f:
#print(line)
parts = line.strip().replace('(','').replace(')','').split(' ')
#print('%s %s %s %s' % (parts[0], parts[1],parts[2],parts[3]))
if len(parts)==4:
if float(parts[3])>0:
value = (maxScore-float(parts[3]))
matrixSum+= value
invScores.setDistance(str(parts[0]), str(parts[1]), value)
print(('Sum of distance matrix is %s' % matrixSum))
return(invScores)
class ClustalOmega(ExternalTool):
def __init__(self):
ExternalTool.__init__(self)
self.setExecutablePath('clustalo')
self.screenName = "CLUSTAL-OMEGA"
def alignFasta(self, fastaPath,outputFastaFilePath):
self.log('aligning %s to %s' % (fastaPath, outputFastaFilePath))
os.system('%s -i %s -o %s --force' % (self.executable, fastaPath, outputFastaFilePath))
def alignFastqs(self, fqPath, outputFastaFilePath):
#self.log('converting %s to fasta' % fqpath)
f = FastqToFasta()
tmpFasta = getTempFileName()
f.execute(fqPath, tmpFasta)
self.alignFasta(tmpFasta,outputFastaFilePath)
os.remove(tmpFasta)
class SamTools(ExternalTool):
def __init__(self):
ExternalTool.__init__(self)
self.setExecutablePath('samtools')
self.screenName = "SAMTOOLS"
def samToSortedBam(self, samPath, bamPath, deleteSam=False):
self.log('Converting SAM to sorted BAM')
os.system('%s view -bS %s | %s sort - -o %s.bam' % (self.executable, samPath, self.executable, bamPath))
self.log('Completed conversion')
if deleteSam:
os.remove(samPath)
def bamToSortedBam(self, bamPath, sortedBamPath):
self.log('Converting BAM to sorted BAM')
os.system('%s sort -o %s.bam %s' % (self.executable, sortedBamPath, bamPath))
self.log('Completed conversion')
def index(self, bamPath):
self.log('Indexing BAM file')
os.system('%s index %s' % (self.executable, bamPath))
self.log('Completed indexing')
def readBamToDictOld(self,bamFilePath, columns={'cigar':5, 'left':3,'scarSequence':9}, index='scarSequence', appendTo={}): # columns:{ targetName:columnIndex, ... }
retDict = appendTo
#os.system("samtools view %s | cut -f1,6 | grep -oEi '^.*([0-9]+D){1,1}.*' ")
#tempfile = './temp/temp_' + str(uuid.uuid1())
tempfile = getTempFileName()
self.log("samtools view %s > %s" %(bamFilePath,tempfile))
os.system("samtools view %s > %s" %(bamFilePath,tempfile))
with open(tempfile) as f:
#Go over all lines in the BAM file
for line in f:
#Extract all columns:
parts = line.replace('\n','').split()
#Extract the primary key value:
indexKey = parts[columns[index]]
#Initialise the existence of the primary key value in the return dictionary
if not indexKey in retDict:
retDict[indexKey] = {}
#Iterate over the keysm except the index
for key in columns:
if key is not index:
#Get the value of the key on this line in the BAM file
v = parts[columns[key]]
if not v in retDict[indexKey]:
retDict[indexKey][v] = 1
else:
retDict[indexKey][v] += 1
os.remove(tempfile)
return(retDict)
def readBamToDict(self,bamFilePath, columns={'cigar':5, 'id':0,'left':3,'scarSequence':9}, index='scarSequence', appendTo={}): # columns:{ targetName:columnIndex, ... }
retDict = appendTo
#os.system("samtools view %s | cut -f1,6 | grep -oEi '^.*([0-9]+D){1,1}.*' ")
tempfile = './temp/temp_' + str(uuid.uuid1())
self.log("samtools view %s > %s" %(bamFilePath,tempfile))
os.system("samtools view %s > %s" %(bamFilePath,tempfile))
with open(tempfile) as f:
#Go over all lines in the BAM file
for line in f:
#Extract all columns:
parts = line.replace('\n','').split()
#Extract the primary key value:
indexKey = parts[columns[index]]
#Initialise the existence of the primary key value in the return dictionary
if not indexKey in retDict:
retDict[indexKey] = {}
#Iterate over the keysm except the index
for key in columns:
if key is not index:
#Get the value of the key on this line in the BAM file
if columns[key]<len(parts):
v = parts[columns[key]]
retDict[indexKey][key] = v
try:
os.remove(tempfile)
except:
print(('Temporary file could not be removed %s' % tempfile))
pass
return(retDict)
# Make sure to index the reference genome!
class BWA_MEM(ExternalTool):
def __init__(self):
ExternalTool.__init__(self)
self.setExecutablePath('bwa mem')
self.screenName = "BWA MEM"
self.fixedFlags = []
def setReferencePath(self, referenceGenomePath):
self.referenceGenomePath = referenceGenomePath
def execute(self, fqPath, samPath, readGroupInfo=None, secondSplitHits=False, getSecondaryAlignments=False):
if not isinstance(fqPath, str):
fqPath = " ".join(fqPath)
if self.referenceGenomePath!=None:
self.log('Aligning %s to %s' %(fqPath, self.referenceGenomePath))
flags=[]+self.fixedFlags
if secondSplitHits:
flags.append("-M")
if readGroupInfo!=None:
flags.append("-R '%s'" % readGroupInfo)
if getSecondaryAlignments:
flags.append("-a")
self.log("Flags: %s" % (" ".join(flags)))
cmd = '%s %s %s %s > %s' % (self.executable, " ".join(flags), self.referenceGenomePath, fqPath, samPath)
self.log(cmd)
os.system(cmd)
else:
self.warn('Warning; no reference genome specified for BWA MEM, canceled alignment.')
# Make sure to index the reference genome!
class STAR(ExternalTool):
def __init__(self):
ExternalTool.__init__(self)
self.setExecutablePath('STAR')
self.screenName = "STAR"
self.threads = 4
def index(self):
if not os.path.exists(self.refDirectory):
try:
os.mkdir(self.refDirectory )
except:
self.fatal('Could not create index directory on %s' % self.refDirectory)
os.system('%s --runMode genomeGenerate --genomeDir %s --genomeFastaFiles %s --runThreadN %s' %
(self.executable, self.refDirectory, self.referenceGenomePath, self.threads))
def setReferencePath(self, referenceGenomePath):
self.referenceGenomePath = referenceGenomePath
self.refDirectory = '%s_STAR_INDEX' % os.path.basename(self.referenceGenomePath)
if not os.path.exists(self.refDirectory):
self.info("Reference index %s does not exist." % referenceGenomePath)
self.index()
def execute(self, fqPath, outputPath):
os.system('%s --genomeDir %s --readFilesIn %s --runThreadN %s --outFileNamePrefix %s' %
(self.executable, self.refDirectory, fqPath, self.threads, outputPath))
#!!!sjdbOverhang should match the length of the reads minus one!!!
def pass2alignment(self, SJouttabPath, sjdbOverhang=74):
base = os.path.basename(SJouttabPath)
self.p2refDirectory = '%s_P2_STAR_INDEX' % os.path.basename(SJouttabPath)
if not os.path.exists(self.p2refDirectory):
try:
os.mkdir(self.p2refDirectory )
except:
self.fatal('Could not create index directory on %s' % self.p2refDirectory)
def mapSingleEnd(self, fqPath, samPath):
if not isinstance(fqPath, str):
fqPath = " ".join(fqPath)
if self.referenceGenomePath!=None:
self.log('Aligning %s to %s' %(fqPath, self.referenceGenomePath))
os.system('%s --genomeDir %s --readFilesIn %s --runThreadN %s' %
(self.executable, self.refDirectory, fqPath, self.threads))
else:
self.warn('Warning; no reference genome specified for STAR, canceled alignment.')
class GSNAP(ExternalTool):
def __init__(self):
ExternalTool.__init__(self)
self.setExecutablePath('gsnap')
self.screenName = "GSNAP"
#self.cmd= 'sudo gsnap -d lintrace analysis_bdefa5bc-6b5c-11e5-a487-0800279bd60a_fullScarList_Protocol1-DNA-seq.fq -t 4 -A sam > analysis_bdefa5bc-6b5c-11e5-a487-0800279bd60a_fullScarList_Protocol1-DNA-seq.sam'
#Reference genome path should be the name of the reference build folder
def setReferencePath(self, referenceGenomePath):
self.referenceGenomePath = referenceGenomePath
def execute(self, fqPath, samPath):
if self.referenceGenomePath!=None:
self.log('Aligning %s to %s' %(fqPath, self.referenceGenomePath))
os.system('%s -d %s %s --format=sam > %s' % (self.executable, self.referenceGenomePath, fqPath, samPath))
else:
self.warn('Warning; no reference genome specified for GSNAP, canceled alignment.')
def getTempFileName(centerFix=""):
if not os.path.exists('./temp'):
os.mkdir('./temp')
return('./temp/temp_' + centerFix + str(uuid.uuid1()))
def mapReads(readsFile, baseName, mapper, samTools=None,readGroupInfo=None, secondSplitHits=False):
if samTools==None:
samTools = SamTools()
basePath = '%s/%s' % ( '.', baseName)
samPath = '%s.sam' %( basePath)
bamPath = '%s.bam' %( basePath)
if readGroupInfo==None and secondSplitHits==False:
mapper.execute(readsFile, samPath)
else:
mapper.execute(readsFile, samPath, readGroupInfo=None, secondSplitHits=False)
samTools.samToSortedBam(samPath, basePath, True)
samTools.index(bamPath)
return({'samPath':samPath, 'bamPath':bamPath})
def mapSequencesToDict(mapper, sequences):
bpythonSeqs=[]
for index,sequence in enumerate(sequences):
bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequence), str(index)))
fastaPath = getTempFileName('bwa')+'.fa'
SeqIO.write(bpythonSeqs, fastaPath, "fasta")
bPath = getTempFileName('samtools')
samPath = bPath+'.sam'
bamPath = bPath+'.bam'
mapper.execute(fastaPath, samPath)
samtools = SamTools()
samtools.samToSortedBam(samPath, bPath, True)
samtools.index(bamPath)
d = samtools.readBamToDict(bamPath)
return(d)
def mapDictOfSequences(mapper, sequenceDict, bPath=None):
bpythonSeqs=[]
for seqId in sequenceDict:
bpythonSeqs.append(SeqIO.SeqRecord(Seq(sequenceDict[seqId]), str(seqId)))
fastaPath = getTempFileName('bwa')+'.fa'
SeqIO.write(bpythonSeqs, fastaPath, "fasta")
if bPath==None:
bPath = getTempFileName('samtools')
samPath = bPath+'.sam'
bamPath = bPath+'.bam'
mapper.execute(fastaPath, samPath)
samtools = SamTools()
samtools.samToSortedBam(samPath, bPath, False)
samtools.index(bamPath)
return(bamPath)
def mapReadsToDict(mapper, fqPath, index='id',basePath=None, readGroupInfo=None, secondSplitHits=False):
if basePath==None:
bPath = getTempFileName('samtools')
else:
bPath=basePath
samPath = bPath+'.sam'
bamPath = bPath+'.bam'
mapper.execute(fqPath, samPath, readGroupInfo, secondSplitHits)
samtools = SamTools()
samtools.samToSortedBam(samPath, bPath, True)
samtools.index(bamPath)
r = {}
d = samtools.readBamToDict(bamPath, index=index, appendTo=r)
return(d)
# Reference to this function: http://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python
# This method is the quickest way to count lines in big files. (Quicker than wc..)
def fileLineCount(filename):
f = open(filename, 'rb')
bufgen = takewhile(lambda x: x, (f.read(1024*1024) for _ in repeat(None)))
return sum( buf.count(b'\n') for buf in bufgen )
# Deprecated: should use Numpy sparse matrix
class DistanceMatrix():
def __init__(self, maxDist=1000000000):
self.data = {}
self.keys = set()
self.maxDist = maxDist
self.finalKeying = True # Do not store keys on every update
def setDistances(self,fromList, toList, distanceList ):
# Add keys:
if not self.finalKeying:
self.keys.update(fromList+toList)
for index,a in enumerate(fromList):
self._update(a,toList[index], distanceList[index])
def getNumpyArray(self):
#Make sure the keys are properly indexed
self._performKeying()
#Initialise empty matrix:
narray = np.zeros((len(self.keys), len(self.keys)))
for aIndex,keyA in enumerate(self.keys):
for bIndex,keyB in enumerate(self.keys):
narray[aIndex,bIndex] = self.getDistance(keyA,keyB)
return(narray)
def _update(self, a,b,distance):
if distance<=self.maxDist:
if (not a in self.data) and (not b in self.data):
self.data[a] = {}
self.data[a][b] = distance
return(True)
if a in self.data and b in self.data[a]:
self.data[a][b] = distance
return(True)
if b in self.data and a in self.data[b]:
self.data[b][a] = distance
return(True)
# A already exists but B does not: (And b also does not exists as main entry)
if a in self.data and not b in self.data and not b in self.data[a]:
self.data[a][b] = distance
return(True)
elif b in self.data:
if not a in self.data[b]:
self.data[b][a] = distance
return(True)
return(False)
def setDistance(self, a,b,distance ):
if not self.finalKeying:
if a not in self.keys and b not in self.keys:
self.keys.update([a,b])
else:
if a not in self.keys:
self.keys.add(a)
if b not in self.keys:
self.keys.add(b)
return( self._update(a,b, distance ))
def getDistance(self, a,b ):
if a in self.data and b in self.data[a]:
return(self.data[a][b])
if b in self.data and a in self.data[b]:
return(self.data[b][a])
return(self.maxDist)
def _performKeying(self):
#Retrieve all keys
self.keys = set()
addKeys = []
self.keys.update(list(self.data.keys()))
for a in self.data:
for b in self.data[a]:
if b not in self.keys:
addKeys.append(b)
self.keys.update(addKeys)
def writeToFile(self, path):
if self.finalKeying:
self._performKeying()
separator = ';'
with open(path, 'w') as f:
values= ['name']
for keyA in self.keys:
values.append(keyA)
f.write('%s\n' % (separator.join(values)))
for keyA in self.keys:
values = []
for keyB in self.keys:
values.append(str(self.getDistance(keyA, keyB)))
f.write('%s%s%s\n' % (keyA, separator,separator.join(values)))
def loadFromFile(self, path,separator=None):
with open(path) as f:
idx = 0
header = {}
for line in f:
parts = line.rstrip().split(separator)
if idx==0:
for keyId,keyName in enumerate(parts):
header[keyId] = keyName
else:
for partIndex, part in parts:
if partIndex==0:
keyA = part
else:
keyB=header[partIndex]
self.setDistance(keyA,keyB, float(part))
idx+=1