--- a +++ b/singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py @@ -0,0 +1,405 @@ +from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import UmiBarcodeDemuxMethod, NonMultiplexable, IlluminaBaseDemultiplexer, phredToFastqHeaderSafeQualities +from singlecellmultiomics.modularDemultiplexer.demultiplexModules import CELSeq2_c8_u6 +import re +from singlecellmultiomics.utils import reverse_complement +# SCCHIC using NLAIII adapter, 384 well format with 3bp UMI followed by +# "A" base + + +class SCCHIC_384w_c8_u3(UmiBarcodeDemuxMethod): + def __init__(self, barcodeFileParser, random_primer_read=1,random_primer_length=6, **kwargs): + self.barcodeFileAlias = 'maya_384NLA' + UmiBarcodeDemuxMethod.__init__( + self, + umiRead=0, + umiStart=0, + umiLength=3, + barcodeRead=0, + barcodeStart=3, + barcodeLength=8, + random_primer_read=random_primer_read, + random_primer_length=random_primer_length, + barcodeFileAlias=self.barcodeFileAlias, + barcodeFileParser=barcodeFileParser, + **kwargs) + self.shortName = 'scCHIC384C8U3' + self.longName = 'Single cell CHIC, 384well CB: 8bp UMI: 3bp, RP: 6BP' + self.autoDetectable = True + self.description = '384 well format. 3bp umi followed by 8bp barcode and a single A. R2 ends with a 6bp random primer' + + self.sequenceCapture[0] = slice( + self.barcodeLength + self.umiLength + 1, + None) # dont capture the first base + + def demultiplex(self, records, **kwargs): + + if kwargs.get( + 'probe') and records[0].sequence[self.barcodeLength + self.umiLength] != 'T': + raise NonMultiplexable + + + # add first 2 bases as ligation tag: + ligation_start = self.barcodeLength + self.umiLength + ligation_end = ligation_start + 2 + ligation_sequence = records[0].sequence[ligation_start:ligation_end] + ligation_qualities = records[0].qual[ligation_start:ligation_end] + + taggedRecords = UmiBarcodeDemuxMethod.demultiplex( + self, records, **kwargs) + + taggedRecords[0].addTagByTag( + 'lh', + ligation_sequence, + isPhred=False, + make_safe=False) + taggedRecords[0].addTagByTag( + 'lq', + ligation_qualities, + isPhred=True, + make_safe=False) + taggedRecords[1].addTagByTag( + 'lh', + ligation_sequence, + isPhred=False, + make_safe=False) + taggedRecords[1].addTagByTag( + 'lq', + ligation_qualities, + isPhred=True, + make_safe=False) + #taggedRecords[0].sequence = taggedRecords[0].sequence[1:] + #taggedRecords[0].qualities = taggedRecords[0].qualities[1:] + return taggedRecords + +class SCCHIC_384w_c8_u3_direct_ligation(UmiBarcodeDemuxMethod): + def __init__(self, barcodeFileParser, random_primer_read=None,random_primer_length=None, **kwargs): + self.barcodeFileAlias = 'maya_384NLA' + UmiBarcodeDemuxMethod.__init__( + self, + umiRead=0, + umiStart=0, + umiLength=3, + barcodeRead=0, + barcodeStart=3, + barcodeLength=8, + random_primer_read=random_primer_read, + random_primer_length=random_primer_length, + barcodeFileAlias=self.barcodeFileAlias, + barcodeFileParser=barcodeFileParser, + **kwargs) + self.shortName = 'scCHIC384C8U3l' + self.longName = 'Single cell CHIC, 384well CB: 8bp UMI: 3bp, no RP' + self.autoDetectable = False + self.description = '384 well format. 3bp umi followed by 8bp barcode and a single A. R2 does not contain a random primer' + + self.sequenceCapture[0] = slice( + self.barcodeLength + self.umiLength + 1, + None) # dont capture the first base + + def demultiplex(self, records, **kwargs): + + if kwargs.get( + 'probe') and records[0].sequence[self.barcodeLength + self.umiLength] != 'T': + raise NonMultiplexable + + + # add first 2 bases as ligation tag: + ligation_start = self.barcodeLength + self.umiLength + ligation_end = ligation_start + 2 + ligation_sequence = records[0].sequence[ligation_start:ligation_end] + ligation_qualities = records[0].qual[ligation_start:ligation_end] + + taggedRecords = UmiBarcodeDemuxMethod.demultiplex( + self, records, **kwargs) + + taggedRecords[0].addTagByTag( + 'lh', + ligation_sequence, + isPhred=False, + make_safe=False) + taggedRecords[0].addTagByTag( + 'lq', + ligation_qualities, + isPhred=True, + make_safe=False) + taggedRecords[1].addTagByTag( + 'lh', + ligation_sequence, + isPhred=False, + make_safe=False) + taggedRecords[1].addTagByTag( + 'lq', + ligation_qualities, + isPhred=True, + make_safe=False) + #taggedRecords[0].sequence = taggedRecords[0].sequence[1:] + #taggedRecords[0].qualities = taggedRecords[0].qualities[1:] + return taggedRecords + + +class SCCHIC_384w_c8_u3_direct_ligation_SINGLE_END(UmiBarcodeDemuxMethod): + def __init__(self, barcodeFileParser, random_primer_read=None,random_primer_length=None, **kwargs): + self.barcodeFileAlias = 'maya_384NLA' + UmiBarcodeDemuxMethod.__init__( + self, + umiRead=0, + umiStart=0, + umiLength=3, + barcodeRead=0, + barcodeStart=3, + barcodeLength=8, + random_primer_read=None, + random_primer_length=None, + barcodeFileAlias=self.barcodeFileAlias, + barcodeFileParser=barcodeFileParser, + **kwargs) + self.shortName = 'scCHIC384C8U3se' + self.longName = 'Single cell CHIC, 384well CB: 8bp UMI: 3bp, single end, no RP' + self.autoDetectable = True + self.description = '384 well format. 3bp umi followed by 8bp barcode and a single A. No read 2' + + self.sequenceCapture[0] = slice( + self.barcodeLength + self.umiLength + 1, + None) # dont capture the first base + + def demultiplex(self, records, **kwargs): + + if kwargs.get( + 'probe') and records[0].sequence[self.barcodeLength + self.umiLength] != 'T': + raise NonMultiplexable + + if len(records) != 1: + raise NonMultiplexable + + # add first 2 bases as ligation tag: + ligation_start = self.barcodeLength + self.umiLength + ligation_end = ligation_start + 2 + ligation_sequence = records[0].sequence[ligation_start:ligation_end] + ligation_qualities = records[0].qual[ligation_start:ligation_end] + + taggedRecords = UmiBarcodeDemuxMethod.demultiplex( + self, records, **kwargs) + + taggedRecords[0].addTagByTag( + 'lh', + ligation_sequence, + isPhred=False, + make_safe=False) + taggedRecords[0].addTagByTag( + 'lq', + ligation_qualities, + isPhred=True, + make_safe=False) + + return taggedRecords + +class SCCHIC_384w_c8_u3_pdt(IlluminaBaseDemultiplexer): + + def __init__( + self, + barcodeFileParser=None, + indexFileParser=None, + indexFileAlias='illumina_merged_ThruPlex48S_RP', + + **kwargs): + + IlluminaBaseDemultiplexer.__init__( + self, + indexFileParser=indexFileParser, + indexFileAlias=indexFileAlias) + + self.description = '384 well format, mixed transcriptome and CHiC. scCHiC: 3bp umi followed by 8bp barcode and a single A. R2 ends with a 6bp random primer. Transcriptome: cs2 + template switching oligo' + self.shortName = 'CHICTV' + + self.autoDetectable = False + + # The demultiplexer used for the chic reads: + self.chic_demux = SCCHIC_384w_c8_u3(barcodeFileParser=barcodeFileParser,random_primer_read=None,**kwargs) + + self.barcodeSummary = self.chic_demux.barcodeSummary + self.longName = f'{self.chic_demux.longName} and TV primer ' + + def __repr__(self): + return f'{self.longName} {self.description}' + + def demultiplex(self, records, **kwargs): + + # Check if the supplied reads are mate-pair: + if len(records) != 2: + raise NonMultiplexable('Not mate pair') + + # Check if the TSO oligo is present.. + if 'AGACTCTTT' in records[0].sequence: + + + taggedRecords = self.chic_demux.demultiplex(records,**kwargs) + + + if not 'AGACTCTTT' in taggedRecords[0].sequence: + raise NonMultiplexable('No match to transcriptome or CHiC') + + oligo_position = taggedRecords[0].sequence.index('AGACTCTTT') + umi_start = max(0,oligo_position-6) + umi = taggedRecords[0].sequence[umi_start:oligo_position] + + # Set umi : + for r in taggedRecords: + r.tags['tu'] = umi + r.tags['MX'] = "CTV" + + # Clip the read down: + taggedRecords[0].sequence = taggedRecords[0].sequence[:oligo_position] + taggedRecords[0].qualities = taggedRecords[0].qualities[:oligo_position] + + return taggedRecords + raise NonMultiplexable('No match to transcriptome or CHiC') + +class SCCHIC_384w_c8_u3_cs2(UmiBarcodeDemuxMethod): + + def __init__(self, barcodeFileParser, random_primer_read=None,random_primer_length=None, **kwargs): + self.barcodeFileAlias = 'maya_384NLA' + UmiBarcodeDemuxMethod.__init__( + self, + umiRead=0, + umiStart=0, + umiLength=3, + barcodeRead=0, + barcodeStart=3, + barcodeLength=8, + random_primer_read=random_primer_read, + random_primer_length=random_primer_length, + barcodeFileAlias=self.barcodeFileAlias, + barcodeFileParser=barcodeFileParser, + **kwargs) + + self.description = '384 well format, mixed transcriptome and CHiC. scCHiC: 3bp umi followed by 8bp barcode and a single A. R2 has no random primer. Transcriptome is VASA' + self.shortName = 'TCHIC' + + self.autoDetectable = False + + # The demultiplexer used for the transcriptome reads: + self.transcriptome_demux = CELSeq2_c8_u6(barcodeFileParser=barcodeFileParser,**kwargs) + + # Contains expected bleedthrough sequence + self.id_to_cs2_barcode = { v:k + 'TTTTT' for k,v in barcodeFileParser['celseq2'].items() } + assert len(self.id_to_cs2_barcode)>0 + self.tx_umi_len = 6 + + # The demultiplexer used for the chic reads: + self.chic_demux = SCCHIC_384w_c8_u3_direct_ligation(barcodeFileParser=barcodeFileParser,**kwargs) + + self.barcodeSummary = f'{self.chic_demux.barcodeSummary} and {self.transcriptome_demux.barcodeSummary}' + self.longName = f'{self.chic_demux.longName} and {self.transcriptome_demux.longName}' + + self.poly_length=10 + self.poly_A = self.poly_length*'A' + self.poly_T = self.poly_length*'T' + self.poly_G = self.poly_length*'G' + + self.r2_trimmer = re.compile('[GA]*$') + + self.sequenceCapture[0] = slice( + self.barcodeLength + self.umiLength + 1, + None) # dont capture the first base + + + def trim_r2(self, sequence, qualities ): + + start = sequence.find(self.poly_A) + if start != -1: + sequence, qualities = sequence[:start], qualities[:start] + + start = sequence.find(self.poly_G) + if start != -1: + sequence, qualities = sequence[:start], qualities[:start] + + # Trim any trailing A and G bases from the end and # Trim down 3 bases + sequence = self.r2_trimmer.sub('',sequence)[:-3] + qualities = qualities[:len(sequence)] + + + return sequence, qualities + + + def __repr__(self): + return f'{self.longName} {self.description}' + + def extract_vasa_umi(self, sequence, vasa_barcode): + # Extract vasa umi from sequence given the vasa barcode + # Returns None when the it cannot be extracted + vasa_umi_end = sequence.find(vasa_barcode) + vasa_umi_start = max(0,vasa_umi_end - self.tx_umi_len) + if vasa_umi_end - vasa_umi_start > 0: + return sequence[vasa_umi_start:vasa_umi_end] + return None + + def demultiplex(self, records, **kwargs): + + # Check if the supplied reads are mate-pair: + if len(records) != 2: + raise NonMultiplexable('Not mate pair') + + + # add first 2 bases as ligation tag: + ligation_start = self.barcodeLength + self.umiLength + ligation_end = ligation_start + 2 + ligation_sequence = records[0].sequence[ligation_start:ligation_end] + ligation_qualities = records[0].qual[ligation_start:ligation_end] + # Obtain the chic barcode and umi: + try: + taggedRecords = UmiBarcodeDemuxMethod.demultiplex(self,records, **kwargs) + except NonMultiplexable: + raise + + # Trim ligation motif: + # add first 2 bases as ligation tag: + ud = { + 'lh':ligation_sequence, + 'lq':phredToFastqHeaderSafeQualities(ligation_qualities), + 'dt':'CHIC' + } + + taggedRecords[0].tags.update(ud) + taggedRecords[1].tags.update(ud) + + # Check for tx contamination: + expected_barcode = self.id_to_cs2_barcode.get( taggedRecords[0].tags['bi'] ) + if expected_barcode is None: + raise ValueError(taggedRecords[0].tags['bi']) + if expected_barcode in taggedRecords[0].sequence or expected_barcode in reverse_complement(taggedRecords[1].sequence): + # tx contaminant: + # Obtain vasa UMI: + if expected_barcode in taggedRecords[0].sequence: + # NNNNNNNNN [UMI] BC POLY T + tx_umi = self.extract_vasa_umi(taggedRecords[0].sequence, expected_barcode) + else: + rc2 = reverse_complement(taggedRecords[1].sequence) + tx_umi = self.extract_vasa_umi(rc2, expected_barcode) + + # Trim read2 down + taggedRecords[1].sequence, taggedRecords[1].qualities = self.trim_r2(taggedRecords[1].sequence, taggedRecords[1].qualities) + for r in taggedRecords: + r.tags['dt'] = 'VASA' + if tx_umi is not None: + r.tags['rx'] = tx_umi + + #r.tags['MX'] = 'CS2' + return taggedRecords + elif 'TTTTTTTTTTTTTTTTTTTTTTT' in taggedRecords[0].sequence or 'TTTTTTTTTTTTTTTTTTTTTTT' in taggedRecords[1].sequence: + raise NonMultiplexable('PolyT') + elif 'AGTCCGACGAT' in taggedRecords[0].sequence[:30] or 'GTTCTACAGT' in taggedRecords[0].sequence[:30] or 'TAATACGACTCACTATAGGG' in taggedRecords[0].sequence: + + # desc = 'none?' + # if 'AGTCCGACGAT' in taggedRecords[0].sequence[:30]: + # desc = 'AGTCCGACGAT' + # elif 'GTTCTACAGT' in taggedRecords[0].sequence[:30]: + # desc = 'GTTCTACAGT' + # elif 'TAATACGACTCACTATAGGG' in taggedRecords[0].sequence: + # desc = 'TAATACGACTCACTATAGGG' + + for r in taggedRecords: + r.tags['dt'] = 'VASA' + r.tags['RR'] = 'T7_found' + #r.tags['TR'] = desc + + return taggedRecords