--- a +++ b/singlecellmultiomics/fragment/chic.py @@ -0,0 +1,199 @@ +from singlecellmultiomics.fragment import Fragment +from singlecellmultiomics.utils.sequtils import hamming_distance + + +class CHICFragment(Fragment): + def __init__(self, + reads, + R1_primer_length=4, + R2_primer_length=6, # Is automatically detected now + assignment_radius=0, + umi_hamming_distance=1, + invert_strand=False, + no_umi_cigar_processing=False, + **kwargs + ): + self.invert_strand = invert_strand + + + # Perform random primer autodect, + # When the demux profile is MX:Z:scCHIC384C8U3l, then there is no random primer + for read in reads: + if read is not None and read.has_tag('MX'): + if read.get_tag('MX')[-1]=='l': + R2_primer_length = 0 + break + + Fragment.__init__(self, + reads, + assignment_radius=assignment_radius, + R1_primer_length=R1_primer_length, + R2_primer_length=R2_primer_length, + umi_hamming_distance=umi_hamming_distance, + max_NUC_stretch = 18, + **kwargs + + ) + + # set CHIC cut site given reads + self.no_umi_cigar_processing = no_umi_cigar_processing + self.strand = None + self.ligation_motif = None + self.site_location = None + self.cut_site_strand = None + self.found_valid_site = False + self.identify_site() + + + if self.is_valid(): + + if self.assignment_radius==0: + self.match_hash = ( + self.strand, + self.cut_site_strand, + self.site_location[0], + self.site_location[1], + self.sample) + else: + self.match_hash = ( + self.cut_site_strand, + self.site_location[0], + self.sample) + else: + self.match_hash = None + + def set_site( + self, + site_chrom, + site_pos, + site_strand=None, + is_trimmed=False): + if self.found_valid_site: + self.set_meta('DS', site_pos) + if site_strand is not None: + self.set_meta('RS', site_strand) + self.set_strand(site_strand) + self.site_location = (site_chrom, site_pos) + self.cut_site_strand = site_strand + + def identify_site(self): + self.found_valid_site = False + R1 = self.get_R1() + + if R1 is None: + self.set_rejection_reason("R1_undefined") + return None + + if R1.has_tag('lh'): + self.ligation_motif = R1.get_tag('lh') + + """ Valid configs: + Observed read pair: + T######## R1 ########## ^ ########## R2 ########## + Real molecule: + A########################################################### + + Real molecule: and adapter: + ADAPTER: 5' NNNNT 3' + A{A:80%,N:20%}NNN [CHIC MOLECULE] + ^ real cut site + """ + + is_trimmed = (R1.has_tag('MX') and R1.get_tag( + 'MX').startswith('scCHIC')) + + if R1.is_unmapped: + self.set_rejection_reason("R1_unmapped") + return(None) + + ### Check if the orientation of the reads makes sense, reads need to point inwards + # | ----- > < ---- | + if self.has_R2(): + R2 = self.get_R2() + if not R2.is_unmapped: + if R1.is_reverse and not R2.is_reverse: + pass + elif not R1.is_reverse and R2.is_reverse: + pass + else: + # it does not make sense ... + self.set_rejection_reason("orientation") + return(None) + + + # Identify the start coordinate of Read 1 by reading the amount of softclips on the start of the read + r1_start =(R1.reference_end+1 if R1.is_reverse else R1.reference_start-2) + if not self.no_umi_cigar_processing: + if R1.is_reverse: + if R1.cigartuples[-1][0]==4: # softclipped at end + r1_start+=R1.cigartuples[-1][1] + else: + if R1.cigartuples[0][0]==4: # softclipped at start + r1_start-=R1.cigartuples[0][1] + + if is_trimmed: + # The first base of the read has been taken off and the lh tag is + # already set, this can be copied to RZ + self.found_valid_site = True + self.set_site( + site_strand=not R1.is_reverse if self.invert_strand else R1.is_reverse, + # We sequence the other strand (Starting with a T, this is an A in the molecule), the digestion thus happened on the other strand + # On the next line we asume that the mnsase cut is one base after the ligated A, but it can be more bases upstream + site_chrom=R1.reference_name, + site_pos=r1_start, + is_trimmed=True + ) + + else: + self.found_valid_site = True + self.set_site( + site_strand=not R1.is_reverse if self.invert_strand else R1.is_reverse, + # We sequence the other strand (Starting with a T, this is an A in the molecule), the digestion thus happened on the other strand + # On the next line we asume that the mnsase cut is one base after the ligated A, but it can be more bases upstream + site_chrom=R1.reference_name, + site_pos=(r1_start - 1 if R1.is_reverse else r1_start + 1), + is_trimmed=False) + + def is_valid(self): + if self.qcfail: + return False + + if self.max_fragment_size is not None: + try: + size = self.get_fragment_size() + if size>self.max_fragment_size: + return False + except Exception: + pass + + return self.found_valid_site + + def get_site_location(self): + if self.site_location is not None: + return self.site_location + else: + # We need some kind of coordinate... + for read in self: + if read is not None and read.reference_name is not None and read.reference_start is not None: + return read.reference_name, read.reference_start + + def __repr__(self): + return Fragment.__repr__( + self) + f'\n\tMNase cut site:{self.get_site_location()}' + + def __eq__(self, other): + # Make sure fragments map to the same strand, cheap comparisons + if self.match_hash != other.match_hash: + return False + + if self.site_location is None or other.site_location is None: + return False + + # Check distance between the fragments: + if self.assignment_radius>0 and abs(self.site_location[1]-other.site_location[1])>self.assignment_radius: + return False + + + # Make sure UMI's are similar enough, more expensive hamming distance + # calculation + return self.umi_eq(other)