SingleCellMultiOmics / Git / [45ad7e] /singlecellmultiomics/bamProcessing/bamToNucleosomePositions.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[45ad7e]: / singlecellmultiomics / bamProcessing / bamToNucleosomePositions.py
History
Download this file
267 lines (195 with data), 10.1 kB

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from singlecellmultiomics.molecule import MoleculeIterator, CHICMolecule
from singlecellmultiomics.fragment import CHICFragment
from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_path_from_bam, get_contigs
from collections import defaultdict
import pyBigWig
import pysam
from singlecellmultiomics.bamProcessing.bamBinCounts import get_binned_counts
import pandas as pd
import argparse
from singlecellmultiomics.bamProcessing import get_contig_sizes
from singlecellmultiomics.utils import is_autosome, pool_wrapper
import numpy as np
from multiprocessing import Pool
from more_itertools import windowed
from glob import glob
import pyBigWig
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks
import argparse
from itertools import product
import asyncio

def get_aligned_chic_len(molecule):

    for f in molecule:
        f.R2_primer_length = 6

    mlen = molecule.estimated_max_length
    if mlen is None:
        return None

    return mlen + 2 # 2 extra bases because 1 base is potentially lost on both sides

def _generate_molecule_coordinate_dict(args, max_fragment_size=800):

    bam_path,  sel_contig= args
    molecule_coordinate_dict = defaultdict(list) # cell->[ (start,end), (start,end) ..]
    with pysam.AlignmentFile(bam_path) as alignments:
       # pysam.FastaFile(get_reference_path_from_bam(alignments)) as reference:

        for molecule in MoleculeIterator(alignments,
                                         contig=sel_contig,
                                         molecule_class=CHICMolecule,
                                         fragment_class = CHICFragment,
                                         #molecule_class_args = {'reference':reference}
                                        ):



            contig, start, strand = molecule.get_cut_site()
            if not molecule.is_completely_matching:
                continue

            if molecule.get_mean_mapping_qual()<55:
                continue
            slen = get_aligned_chic_len(molecule)

            if slen is None or slen>max_fragment_size:
                continue

            if strand:
                end = start - slen
            else:
                end = start + slen


            molecule_coordinate_dict[molecule.sample].append( (start,end) )
    for sample in molecule_coordinate_dict:
        molecule_coordinate_dict[sample] =  sorted(molecule_coordinate_dict[sample])
    return sel_contig, molecule_coordinate_dict


def generate_molecule_coordinate_dict(bams, n_threads=None):

    molecule_coordinate_dicts = {}
    contigs = get_contigs(bams[0])

    with Pool(n_threads) as workers:
        for contig,d in workers.imap( _generate_molecule_coordinate_dict,
                                     product(bams,filter(is_autosome, contigs))):
            if not contig in molecule_coordinate_dicts:

                molecule_coordinate_dicts[contig] = d
            else:
                molecule_coordinate_dicts[contig].update( d )
            print(f'Finished {contig}')
    return molecule_coordinate_dicts


    #bin_size = 100_000

#distances_per_bin = defaultdict(lambda: defaultdict(list)  )

def contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25):
    #molecule_coordinate_dicts[contig] = d
    # n = size of contig
    vote_nucleosome = np.zeros(int( n/p_nuc_bin_size) )
    vote_linker = np.zeros(int( n/p_nuc_bin_size) )

    for cell in d.keys():
        if len(d[cell])<min_total_cuts_per_cell:
            continue
        for (start_a, end_a),(start_b,end_b) in windowed( d[cell],2):

            o_start_a = start_a

            #Check if the fragments are not overlapping:
            (start_a, end_a) = min(start_a, end_a), max(start_a, end_a)
            (start_b, end_b) = min(start_b, end_b), max(start_b, end_b)

            # The locations of the nucleosomes are constrained:
            # \\ start_a  -----  end_a \\     \\ start_b  ---- end_b \\
            if end_a - start_a >= 147 and end_a - start_a < (147*2+10): # it could contain a single nucleosome, but not more
                s = int(((start_a + 70 ))/p_nuc_bin_size)
                e = int(((end_a - 70))/p_nuc_bin_size)
                vote_nucleosome[ s:(e+1) ] += 1/(e-s)

            # Any starting point of a molecule is _always_ part of a linker
            # lets say += 25bp

            c=o_start_a
            s = int(((c-linker_vote_radius))/p_nuc_bin_size)
            e = int(((c+linker_vote_radius))/p_nuc_bin_size)+1
            vote_linker[s:(e+1)] += 1/(e-s)

            if end_a > start_b:
                continue

            if start_b - end_a  > max_linker_length: # The distance is larger than 90bp, which is a very long linker. Skip the linker vote
                continue

            s = int(((end_a))/p_nuc_bin_size)
            e = int(((start_b))/p_nuc_bin_size)


            vote_linker[s:(e+1)] += 1/(e-s+1)
    return vote_nucleosome, vote_linker


async def write_to_bw(handle, starts, ends,  values, contig,size=None):

    if size is not None:
        print( ends[ends>=size] )

    handle.addEntries(
        [contig]*len(starts), #Contig
        starts.astype(np.int64) , #Start
        ends= ends.astype(np.int64) ,  #end
        values=  values.astype(np.float64)
    )

async def coordinate_dicts_to_nucleosome_position_files(bams, molecule_coordinate_dicts, p_nuc_bin_size = 5, alias='npos', n_threads=None ):

    contigs = get_contigs(bams[0])
    sizes = get_contig_sizes(bams[0])


    # Create output bigwig file handles and worker pool
    linker_vote_write_path= f'{alias}_linkers.bw'
    nuc_vote_write_path= f'{alias}_nucleosomes.bw'
    merged_vote_write_path= f'{alias}_nucleosomes_min_linkers.bw'
    centroids = f'{alias}_nucleosome_centers.bed'


    with Pool(n_threads) as workers, \
        pyBigWig.open(linker_vote_write_path,'w') as linkers_out, \
        pyBigWig.open(nuc_vote_write_path,'w') as nuc_out, \
        pyBigWig.open(merged_vote_write_path,'w') as merged_out, \
        open(centroids,'w') as centroids_out:

        # Add headers for all output bigwigfiles
        for h in (linkers_out, nuc_out, merged_out):
            # Write header
            h.addHeader(list(zip(contigs, [sizes[c] for c in contigs])))


        # Obtain nucleosome and linker votes for each contig
        for ret_contig, (vote_nucleosome, vote_linker) in zip(
                    molecule_coordinate_dicts.keys(),
                    workers.imap(pool_wrapper,
            (
            (contig_coordinate_dict_to_votes,
            {
                'd':d,
                'n':sizes[contig],
                'min_total_cuts_per_cell':3,
                'p_nuc_bin_size':5,
                'max_linker_length':90,
                'linker_vote_radius':25
            })
                for contig, d in molecule_coordinate_dicts.items()
            ))):

            print(f'Writing votes for {ret_contig}')
            contig_len = sizes[ret_contig]

            smooth_linkers = gaussian_filter1d(vote_linker,2)
            smooth_nuc = gaussian_filter1d(vote_nucleosome,2)

            # Write nucleosome vote track:
            starts = np.array( np.linspace(0, contig_len-p_nuc_bin_size-1, len(vote_nucleosome)) )

            size = sizes[ret_contig]
            # Check if all numbers are preceding...
            d = np.diff(starts)
            d = d[d<1]
            if len(d):
                print(d)
            else:
                print(f'{len(starts)} Coordinates are in correct order')


            await asyncio.gather(
                write_to_bw(nuc_out, starts, starts+p_nuc_bin_size,  np.nan_to_num(smooth_nuc), ret_contig, size),
                write_to_bw(linkers_out, starts, starts+p_nuc_bin_size,  np.nan_to_num(smooth_linkers), ret_contig, size),
                write_to_bw(merged_out, starts, starts+p_nuc_bin_size,  np.nan_to_num(smooth_nuc - smooth_linkers), ret_contig, size=size)
            )


            # Use the find peak function to select properly spaced nucleosome positions.
            print(f'Writing centroids for {ret_contig}')
            mindist = 147
            min_dist_bins = int( mindist / p_nuc_bin_size )
            estimated_nucleosome_positions = find_peaks( gaussian_filter1d(smooth_nuc - smooth_linkers,2),distance=min_dist_bins)[0]

            for nucpos, score in zip( estimated_nucleosome_positions*p_nuc_bin_size, smooth_nuc[estimated_nucleosome_positions]):
                centroids_out.write(f'{ret_contig}\t{nucpos}\t{nucpos+p_nuc_bin_size}\t{score}\n')



    #contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25)


if __name__ == '__main__':

    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Estimate nucleosome positions from scCHiC seq bam file(s)')

    argparser.add_argument('alignmentfiles', type=str, nargs='+')
    argparser.add_argument('-o', type=str, required=True, help='output prefix')

    argparser.add_argument('-bin_size', type=int,  default=3, help='Nucleosome position precision (bp), increasing this value increases memory consumption linearly')
    argparser.add_argument('-n_threads', type=int, help='Amount of worker threads')

    args = argparser.parse_args()

    async def main(args):
        # Run the analysis:
        print('Creating molecule coordinate database')
        d = generate_molecule_coordinate_dict(args.alignmentfiles, args.n_threads)
        print('Estimating nucleosome positions')
        await coordinate_dicts_to_nucleosome_position_files(args.alignmentfiles,
                    d,
                    p_nuc_bin_size = args.bin_size,
                    n_threads=args.n_threads,
                    alias=args.o)


    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(args))

    #asyncio.run(main(args)) # python >= 3.7