SingleCellMultiOmics / Git / Diff of /singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
Diff of /singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py [000000] .. [2c420a]
Switch to side-by-side view

--- a
+++ b/singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py
@@ -0,0 +1,405 @@
+from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import UmiBarcodeDemuxMethod, NonMultiplexable, IlluminaBaseDemultiplexer, phredToFastqHeaderSafeQualities
+from singlecellmultiomics.modularDemultiplexer.demultiplexModules import CELSeq2_c8_u6
+import re
+from singlecellmultiomics.utils import reverse_complement
+# SCCHIC using NLAIII adapter, 384 well format with 3bp UMI followed by
+# "A" base
+
+
+class SCCHIC_384w_c8_u3(UmiBarcodeDemuxMethod):
+    def __init__(self, barcodeFileParser, random_primer_read=1,random_primer_length=6, **kwargs):
+        self.barcodeFileAlias = 'maya_384NLA'
+        UmiBarcodeDemuxMethod.__init__(
+            self,
+            umiRead=0,
+            umiStart=0,
+            umiLength=3,
+            barcodeRead=0,
+            barcodeStart=3,
+            barcodeLength=8,
+            random_primer_read=random_primer_read,
+            random_primer_length=random_primer_length,
+            barcodeFileAlias=self.barcodeFileAlias,
+            barcodeFileParser=barcodeFileParser,
+            **kwargs)
+        self.shortName = 'scCHIC384C8U3'
+        self.longName = 'Single cell CHIC, 384well CB: 8bp UMI: 3bp, RP: 6BP'
+        self.autoDetectable = True
+        self.description = '384 well format. 3bp umi followed by 8bp barcode and a single A. R2 ends with a 6bp random primer'
+
+        self.sequenceCapture[0] = slice(
+            self.barcodeLength + self.umiLength + 1,
+            None)  # dont capture the first base
+
+    def demultiplex(self, records, **kwargs):
+
+        if kwargs.get(
+                'probe') and records[0].sequence[self.barcodeLength + self.umiLength] != 'T':
+            raise NonMultiplexable
+
+
+        # add first 2 bases as ligation tag:
+        ligation_start = self.barcodeLength + self.umiLength
+        ligation_end = ligation_start + 2
+        ligation_sequence = records[0].sequence[ligation_start:ligation_end]
+        ligation_qualities = records[0].qual[ligation_start:ligation_end]
+
+        taggedRecords = UmiBarcodeDemuxMethod.demultiplex(
+            self, records, **kwargs)
+
+        taggedRecords[0].addTagByTag(
+            'lh',
+            ligation_sequence,
+            isPhred=False,
+            make_safe=False)
+        taggedRecords[0].addTagByTag(
+            'lq',
+            ligation_qualities,
+            isPhred=True,
+            make_safe=False)
+        taggedRecords[1].addTagByTag(
+            'lh',
+            ligation_sequence,
+            isPhred=False,
+            make_safe=False)
+        taggedRecords[1].addTagByTag(
+            'lq',
+            ligation_qualities,
+            isPhred=True,
+            make_safe=False)
+        #taggedRecords[0].sequence = taggedRecords[0].sequence[1:]
+        #taggedRecords[0].qualities = taggedRecords[0].qualities[1:]
+        return taggedRecords
+
+class SCCHIC_384w_c8_u3_direct_ligation(UmiBarcodeDemuxMethod):
+    def __init__(self, barcodeFileParser, random_primer_read=None,random_primer_length=None, **kwargs):
+        self.barcodeFileAlias = 'maya_384NLA'
+        UmiBarcodeDemuxMethod.__init__(
+            self,
+            umiRead=0,
+            umiStart=0,
+            umiLength=3,
+            barcodeRead=0,
+            barcodeStart=3,
+            barcodeLength=8,
+            random_primer_read=random_primer_read,
+            random_primer_length=random_primer_length,
+            barcodeFileAlias=self.barcodeFileAlias,
+            barcodeFileParser=barcodeFileParser,
+            **kwargs)
+        self.shortName = 'scCHIC384C8U3l'
+        self.longName = 'Single cell CHIC, 384well CB: 8bp UMI: 3bp, no RP'
+        self.autoDetectable = False
+        self.description = '384 well format. 3bp umi followed by 8bp barcode and a single A. R2 does not contain a random primer'
+
+        self.sequenceCapture[0] = slice(
+            self.barcodeLength + self.umiLength + 1,
+            None)  # dont capture the first base
+
+    def demultiplex(self, records, **kwargs):
+
+        if kwargs.get(
+                'probe') and records[0].sequence[self.barcodeLength + self.umiLength] != 'T':
+            raise NonMultiplexable
+
+
+        # add first 2 bases as ligation tag:
+        ligation_start = self.barcodeLength + self.umiLength
+        ligation_end = ligation_start + 2
+        ligation_sequence = records[0].sequence[ligation_start:ligation_end]
+        ligation_qualities = records[0].qual[ligation_start:ligation_end]
+
+        taggedRecords = UmiBarcodeDemuxMethod.demultiplex(
+            self, records, **kwargs)
+
+        taggedRecords[0].addTagByTag(
+            'lh',
+            ligation_sequence,
+            isPhred=False,
+            make_safe=False)
+        taggedRecords[0].addTagByTag(
+            'lq',
+            ligation_qualities,
+            isPhred=True,
+            make_safe=False)
+        taggedRecords[1].addTagByTag(
+            'lh',
+            ligation_sequence,
+            isPhred=False,
+            make_safe=False)
+        taggedRecords[1].addTagByTag(
+            'lq',
+            ligation_qualities,
+            isPhred=True,
+            make_safe=False)
+        #taggedRecords[0].sequence = taggedRecords[0].sequence[1:]
+        #taggedRecords[0].qualities = taggedRecords[0].qualities[1:]
+        return taggedRecords
+
+
+class SCCHIC_384w_c8_u3_direct_ligation_SINGLE_END(UmiBarcodeDemuxMethod):
+    def __init__(self, barcodeFileParser, random_primer_read=None,random_primer_length=None, **kwargs):
+        self.barcodeFileAlias = 'maya_384NLA'
+        UmiBarcodeDemuxMethod.__init__(
+            self,
+            umiRead=0,
+            umiStart=0,
+            umiLength=3,
+            barcodeRead=0,
+            barcodeStart=3,
+            barcodeLength=8,
+            random_primer_read=None,
+            random_primer_length=None,
+            barcodeFileAlias=self.barcodeFileAlias,
+            barcodeFileParser=barcodeFileParser,
+            **kwargs)
+        self.shortName = 'scCHIC384C8U3se'
+        self.longName = 'Single cell CHIC, 384well CB: 8bp UMI: 3bp, single end, no RP'
+        self.autoDetectable = True
+        self.description = '384 well format. 3bp umi followed by 8bp barcode and a single A. No read 2'
+
+        self.sequenceCapture[0] = slice(
+            self.barcodeLength + self.umiLength + 1,
+            None)  # dont capture the first base
+
+    def demultiplex(self, records, **kwargs):
+
+        if kwargs.get(
+                'probe') and records[0].sequence[self.barcodeLength + self.umiLength] != 'T':
+            raise NonMultiplexable
+
+        if len(records) != 1:
+            raise NonMultiplexable
+
+        # add first 2 bases as ligation tag:
+        ligation_start = self.barcodeLength + self.umiLength
+        ligation_end = ligation_start + 2
+        ligation_sequence = records[0].sequence[ligation_start:ligation_end]
+        ligation_qualities = records[0].qual[ligation_start:ligation_end]
+
+        taggedRecords = UmiBarcodeDemuxMethod.demultiplex(
+            self, records, **kwargs)
+
+        taggedRecords[0].addTagByTag(
+            'lh',
+            ligation_sequence,
+            isPhred=False,
+            make_safe=False)
+        taggedRecords[0].addTagByTag(
+            'lq',
+            ligation_qualities,
+            isPhred=True,
+            make_safe=False)
+
+        return taggedRecords
+
+class SCCHIC_384w_c8_u3_pdt(IlluminaBaseDemultiplexer):
+
+    def __init__(
+            self,
+            barcodeFileParser=None,
+            indexFileParser=None,
+            indexFileAlias='illumina_merged_ThruPlex48S_RP',
+
+            **kwargs):
+
+        IlluminaBaseDemultiplexer.__init__(
+            self,
+            indexFileParser=indexFileParser,
+            indexFileAlias=indexFileAlias)
+
+        self.description = '384 well format, mixed transcriptome and CHiC. scCHiC: 3bp umi followed by 8bp barcode and a single A. R2 ends with a 6bp random primer. Transcriptome: cs2 + template switching oligo'
+        self.shortName = 'CHICTV'
+
+        self.autoDetectable = False
+
+        # The demultiplexer used for the chic reads:
+        self.chic_demux =  SCCHIC_384w_c8_u3(barcodeFileParser=barcodeFileParser,random_primer_read=None,**kwargs)
+
+        self.barcodeSummary = self.chic_demux.barcodeSummary
+        self.longName = f'{self.chic_demux.longName} and TV primer '
+
+    def __repr__(self):
+        return f'{self.longName} {self.description}'
+
+    def demultiplex(self, records, **kwargs):
+
+        # Check if the supplied reads are mate-pair:
+        if len(records) != 2:
+            raise NonMultiplexable('Not mate pair')
+
+        # Check if the TSO oligo is present..
+        if 'AGACTCTTT' in records[0].sequence:
+
+
+            taggedRecords = self.chic_demux.demultiplex(records,**kwargs)
+
+
+            if not 'AGACTCTTT' in taggedRecords[0].sequence:
+                raise NonMultiplexable('No match to transcriptome or CHiC')
+
+            oligo_position = taggedRecords[0].sequence.index('AGACTCTTT')
+            umi_start = max(0,oligo_position-6)
+            umi = taggedRecords[0].sequence[umi_start:oligo_position]
+
+            # Set umi :
+            for r in taggedRecords:
+                r.tags['tu'] = umi
+                r.tags['MX'] = "CTV"
+
+            # Clip the read down:
+            taggedRecords[0].sequence = taggedRecords[0].sequence[:oligo_position]
+            taggedRecords[0].qualities = taggedRecords[0].qualities[:oligo_position]
+
+            return taggedRecords
+        raise NonMultiplexable('No match to transcriptome or CHiC')
+
+class SCCHIC_384w_c8_u3_cs2(UmiBarcodeDemuxMethod):
+
+    def __init__(self, barcodeFileParser, random_primer_read=None,random_primer_length=None, **kwargs):
+        self.barcodeFileAlias = 'maya_384NLA'
+        UmiBarcodeDemuxMethod.__init__(
+            self,
+            umiRead=0,
+            umiStart=0,
+            umiLength=3,
+            barcodeRead=0,
+            barcodeStart=3,
+            barcodeLength=8,
+            random_primer_read=random_primer_read,
+            random_primer_length=random_primer_length,
+            barcodeFileAlias=self.barcodeFileAlias,
+            barcodeFileParser=barcodeFileParser,
+            **kwargs)
+
+        self.description = '384 well format, mixed transcriptome and CHiC. scCHiC: 3bp umi followed by 8bp barcode and a single A. R2 has no random primer. Transcriptome is VASA'
+        self.shortName = 'TCHIC'
+
+        self.autoDetectable = False
+
+        # The demultiplexer used for the transcriptome reads:
+        self.transcriptome_demux =  CELSeq2_c8_u6(barcodeFileParser=barcodeFileParser,**kwargs)
+
+        # Contains expected bleedthrough sequence
+        self.id_to_cs2_barcode = { v:k + 'TTTTT' for k,v in barcodeFileParser['celseq2'].items() }
+        assert len(self.id_to_cs2_barcode)>0
+        self.tx_umi_len = 6
+
+        # The demultiplexer used for the chic reads:
+        self.chic_demux =  SCCHIC_384w_c8_u3_direct_ligation(barcodeFileParser=barcodeFileParser,**kwargs)
+
+        self.barcodeSummary = f'{self.chic_demux.barcodeSummary} and {self.transcriptome_demux.barcodeSummary}'
+        self.longName = f'{self.chic_demux.longName} and {self.transcriptome_demux.longName}'
+
+        self.poly_length=10
+        self.poly_A = self.poly_length*'A'
+        self.poly_T = self.poly_length*'T'
+        self.poly_G = self.poly_length*'G'
+
+        self.r2_trimmer = re.compile('[GA]*$')
+
+        self.sequenceCapture[0] = slice(
+            self.barcodeLength + self.umiLength + 1,
+            None)  # dont capture the first base
+
+
+    def trim_r2(self, sequence, qualities ):
+
+        start = sequence.find(self.poly_A)
+        if start != -1:
+            sequence, qualities = sequence[:start], qualities[:start]
+
+        start = sequence.find(self.poly_G)
+        if start != -1:
+            sequence, qualities = sequence[:start], qualities[:start]
+
+        # Trim any trailing A and G bases from the end and # Trim down 3 bases
+        sequence = self.r2_trimmer.sub('',sequence)[:-3]
+        qualities = qualities[:len(sequence)]
+
+
+        return sequence, qualities
+
+
+    def __repr__(self):
+        return f'{self.longName} {self.description}'
+
+    def extract_vasa_umi(self, sequence, vasa_barcode):
+        # Extract vasa umi from sequence given the vasa barcode
+        # Returns None when the it cannot be extracted
+        vasa_umi_end = sequence.find(vasa_barcode)
+        vasa_umi_start = max(0,vasa_umi_end - self.tx_umi_len)
+        if vasa_umi_end - vasa_umi_start > 0:
+            return sequence[vasa_umi_start:vasa_umi_end]
+        return None
+
+    def demultiplex(self, records, **kwargs):
+
+        # Check if the supplied reads are mate-pair:
+        if len(records) != 2:
+            raise NonMultiplexable('Not mate pair')
+
+
+        # add first 2 bases as ligation tag:
+        ligation_start = self.barcodeLength + self.umiLength
+        ligation_end = ligation_start + 2
+        ligation_sequence = records[0].sequence[ligation_start:ligation_end]
+        ligation_qualities = records[0].qual[ligation_start:ligation_end]
+        # Obtain the chic barcode and umi:
+        try:
+            taggedRecords = UmiBarcodeDemuxMethod.demultiplex(self,records, **kwargs)
+        except NonMultiplexable:
+            raise
+
+        # Trim ligation motif:
+        # add first 2 bases as ligation tag:
+        ud = {
+            'lh':ligation_sequence,
+            'lq':phredToFastqHeaderSafeQualities(ligation_qualities),
+            'dt':'CHIC'
+        }
+
+        taggedRecords[0].tags.update(ud)
+        taggedRecords[1].tags.update(ud)
+
+        # Check for tx contamination:
+        expected_barcode = self.id_to_cs2_barcode.get( taggedRecords[0].tags['bi'] )
+        if expected_barcode is None:
+            raise ValueError(taggedRecords[0].tags['bi'])
+        if expected_barcode in taggedRecords[0].sequence or expected_barcode in reverse_complement(taggedRecords[1].sequence):
+            # tx contaminant:
+            # Obtain vasa UMI:
+            if expected_barcode in taggedRecords[0].sequence:
+                # NNNNNNNNN [UMI] BC POLY T
+                tx_umi = self.extract_vasa_umi(taggedRecords[0].sequence, expected_barcode)
+            else:
+                rc2 = reverse_complement(taggedRecords[1].sequence)
+                tx_umi = self.extract_vasa_umi(rc2, expected_barcode)
+
+            # Trim read2 down
+            taggedRecords[1].sequence, taggedRecords[1].qualities = self.trim_r2(taggedRecords[1].sequence, taggedRecords[1].qualities)
+            for r in taggedRecords:
+                r.tags['dt'] = 'VASA'
+                if tx_umi is not None:
+                    r.tags['rx'] = tx_umi
+
+                #r.tags['MX'] = 'CS2'
+            return taggedRecords
+        elif 'TTTTTTTTTTTTTTTTTTTTTTT' in taggedRecords[0].sequence or  'TTTTTTTTTTTTTTTTTTTTTTT' in taggedRecords[1].sequence:
+            raise NonMultiplexable('PolyT')
+        elif 'AGTCCGACGAT' in taggedRecords[0].sequence[:30] or 'GTTCTACAGT' in taggedRecords[0].sequence[:30] or 'TAATACGACTCACTATAGGG' in taggedRecords[0].sequence:
+
+            # desc = 'none?'
+            # if 'AGTCCGACGAT' in taggedRecords[0].sequence[:30]:
+            #     desc = 'AGTCCGACGAT'
+            # elif 'GTTCTACAGT' in taggedRecords[0].sequence[:30]:
+            #     desc = 'GTTCTACAGT'
+            # elif 'TAATACGACTCACTATAGGG' in taggedRecords[0].sequence:
+            #     desc = 'TAATACGACTCACTATAGGG'
+
+            for r in taggedRecords:
+                r.tags['dt'] = 'VASA'
+                r.tags['RR'] = 'T7_found'
+                #r.tags['TR'] = desc
+
+        return taggedRecords