SingleCellMultiOmics / Git / [45ad7e] /singlecellmultiomics/bamProcessing/bamExtractNearMolecules.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[45ad7e]: / singlecellmultiomics / bamProcessing / bamExtractNearMolecules.py
History
Download this file
146 lines (104 with data), 5.4 kB

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pysam
from array import array
from singlecellmultiomics.bamProcessing.bamAnalyzeCutDistances import get_stranded_pairwise_counts, get_sc_cut_dictionary, strict_read_counts_function
from singlecellmultiomics.bamProcessing.bamFunctions import get_contigs_with_reads, sorted_bam_file
from multiprocessing import Pool
from singlecellmultiomics.utils import is_main_chromosome
from collections import defaultdict
from pysamiterators import MatePairIterator
from singlecellmultiomics.molecule import MoleculeIterator, CHICMolecule
from singlecellmultiomics.fragment import CHICFragment
from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_path_from_bam

def get_nearby_reads():

    pairing_indices = {}
    current_pair_id = 0

    with pysam.AlignmentFile(bam_path,threads=8) as alignments:
        for contig in contigs:

            for R1, R2 in MatePairIterator(alignments, contig=contig):
                if R1 is None or R1.is_qcfail or R1.mapping_quality<50:
                    continue

                # Obtain the cut site and strand
                cut_location = R1.get_tag('DS')
                cell = R1.get_tag( 'SM')
                strand = R1.is_reverse

                if not cell in selection:
                    continue

                if not (R1.reference_name, cut_location) in selection[cell] :
                    continue

                # Obtain paired cut site
                paired_cut = pairings[(cell,contig,cut_location)]
                R1.set_tag('PC',int(paired_cut))
                if R2 is not None:
                    R2.set_tag('PC',int(paired_cut))

                # Set pair ID:
                k = (cell, contig, min(paired_cut, cut_location))
                if not k in pairing_indices:
                    pairing_indices[k] = current_pair_id
                    pid = current_pair_id
                    current_pair_id+=1
                else:
                    pid = pairing_indices.get(k,-1)

                R1.set_tag('BX',pid)
                if R2 is not None:
                    R2.set_tag('BX',pid)

                yield R1, R2


def crc(reads, molecule):
    bx = molecule[0][0].get_tag('BX')
    for r in reads:
        r.set_tag('BX',bx)


def extract_near_cuts(bam_path, output_path):

    cuts_stranded = get_sc_cut_dictionary(bam_path,strand_specific=True, filter_function=strict_read_counts_function)
    contigs = [contig for contig in get_contigs_with_reads(bam_path) if is_main_chromosome(contig) and contig[0]!='J' and contig[0]!='M']



if __name__ == '__main__':

    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Extract nearby molecules. Sets BX tag for nearby molecules and PC (paired cut) location')

    argparser.add_argument('alignmentfile', type=str)
    argparser.add_argument('outputfile', type=str, help='output bam')

    args = argparser.parse_args()

    #extract_near_cuts(args.alignmentfile, args.outputfile)
    bam_path = args.alignmentfile
    output_path = args.outputfile
    cuts_stranded = get_sc_cut_dictionary(bam_path,strand_specific=True, filter_function=strict_read_counts_function)
    contigs = [contig for contig in get_contigs_with_reads(bam_path) if is_main_chromosome(contig) and contig[0]!='J' and contig[0]!='M']

    def create_molecule_selection(contig) :

        selection = defaultdict( set )
        pairings =  dict()


        for cell in cuts_stranded[contig]:
            fwd_cuts = np.array( sorted([pos for strand, pos in cuts_stranded[contig][cell] if not strand]) )
            rev_cuts = np.array( sorted([pos for strand, pos in cuts_stranded[contig][cell] if strand]) )


            for cut in fwd_cuts:
                distances = rev_cuts - cut

                near_idcs = (distances>0) & (distances<1500)

                if sum(near_idcs)>0:
                    for c in rev_cuts[near_idcs]:
                        selection[cell].add( (contig, c ))
                        pairings[(cell,contig,c)] = cut
                    selection[cell].add( (contig, cut ))
                    pairings[(cell,contig,cut)] = c


                continue
        return selection, pairings

    selection = defaultdict( set )
    pairings =  dict()
    with Pool() as workers:

        for r, pair_res in workers.imap(create_molecule_selection, contigs):
            for cell, sel in r.items():
                selection[cell].update(sel)
            pairings.update( pair_res )

    molecule = None


    with pysam.AlignmentFile(bam_path,threads=8) as alignments, \
        pysam.FastaFile(get_reference_path_from_bam(alignments)) as reference:
        with sorted_bam_file(output_path,alignments) as out:
            for molecule in MoleculeIterator(get_nearby_reads(),

                                             molecule_class=CHICMolecule,
                                             fragment_class = CHICFragment,
                                             molecule_class_args = {'reference':reference}
                                            ):

                molecule.write_pysam(out, consensus=True,consensus_read_callback=crc, consensus_read_callback_kwargs={'molecule':molecule})