SingleCellMultiOmics / Git / Diff of /singlecellmultiomics/bamProcessing/bamToNucleosomePositions.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
Diff of /singlecellmultiomics/bamProcessing/bamToNucleosomePositions.py [000000] .. [2c420a]
Switch to side-by-side view

--- a
+++ b/singlecellmultiomics/bamProcessing/bamToNucleosomePositions.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from singlecellmultiomics.molecule import MoleculeIterator, CHICMolecule
+from singlecellmultiomics.fragment import CHICFragment
+from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_path_from_bam, get_contigs
+from collections import defaultdict
+import pyBigWig
+import pysam
+from singlecellmultiomics.bamProcessing.bamBinCounts import get_binned_counts
+import pandas as pd
+import argparse
+from singlecellmultiomics.bamProcessing import get_contig_sizes
+from singlecellmultiomics.utils import is_autosome, pool_wrapper
+import numpy as np
+from multiprocessing import Pool
+from more_itertools import windowed
+from glob import glob
+import pyBigWig
+from scipy.ndimage import gaussian_filter1d
+from scipy.signal import find_peaks
+import argparse
+from itertools import product
+import asyncio
+
+def get_aligned_chic_len(molecule):
+
+    for f in molecule:
+        f.R2_primer_length = 6
+
+    mlen = molecule.estimated_max_length
+    if mlen is None:
+        return None
+
+    return mlen + 2 # 2 extra bases because 1 base is potentially lost on both sides
+
+def _generate_molecule_coordinate_dict(args, max_fragment_size=800):
+
+    bam_path,  sel_contig= args
+    molecule_coordinate_dict = defaultdict(list) # cell->[ (start,end), (start,end) ..]
+    with pysam.AlignmentFile(bam_path) as alignments:
+       # pysam.FastaFile(get_reference_path_from_bam(alignments)) as reference:
+
+        for molecule in MoleculeIterator(alignments,
+                                         contig=sel_contig,
+                                         molecule_class=CHICMolecule,
+                                         fragment_class = CHICFragment,
+                                         #molecule_class_args = {'reference':reference}
+                                        ):
+
+
+
+            contig, start, strand = molecule.get_cut_site()
+            if not molecule.is_completely_matching:
+                continue
+
+            if molecule.get_mean_mapping_qual()<55:
+                continue
+            slen = get_aligned_chic_len(molecule)
+
+            if slen is None or slen>max_fragment_size:
+                continue
+
+            if strand:
+                end = start - slen
+            else:
+                end = start + slen
+
+
+            molecule_coordinate_dict[molecule.sample].append( (start,end) )
+    for sample in molecule_coordinate_dict:
+        molecule_coordinate_dict[sample] =  sorted(molecule_coordinate_dict[sample])
+    return sel_contig, molecule_coordinate_dict
+
+
+def generate_molecule_coordinate_dict(bams, n_threads=None):
+
+    molecule_coordinate_dicts = {}
+    contigs = get_contigs(bams[0])
+
+    with Pool(n_threads) as workers:
+        for contig,d in workers.imap( _generate_molecule_coordinate_dict,
+                                     product(bams,filter(is_autosome, contigs))):
+            if not contig in molecule_coordinate_dicts:
+
+                molecule_coordinate_dicts[contig] = d
+            else:
+                molecule_coordinate_dicts[contig].update( d )
+            print(f'Finished {contig}')
+    return molecule_coordinate_dicts
+
+
+    #bin_size = 100_000
+
+#distances_per_bin = defaultdict(lambda: defaultdict(list)  )
+
+def contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25):
+    #molecule_coordinate_dicts[contig] = d
+    # n = size of contig
+    vote_nucleosome = np.zeros(int( n/p_nuc_bin_size) )
+    vote_linker = np.zeros(int( n/p_nuc_bin_size) )
+
+    for cell in d.keys():
+        if len(d[cell])<min_total_cuts_per_cell:
+            continue
+        for (start_a, end_a),(start_b,end_b) in windowed( d[cell],2):
+
+            o_start_a = start_a
+
+            #Check if the fragments are not overlapping:
+            (start_a, end_a) = min(start_a, end_a), max(start_a, end_a)
+            (start_b, end_b) = min(start_b, end_b), max(start_b, end_b)
+
+            # The locations of the nucleosomes are constrained:
+            # \\ start_a  -----  end_a \\     \\ start_b  ---- end_b \\
+            if end_a - start_a >= 147 and end_a - start_a < (147*2+10): # it could contain a single nucleosome, but not more
+                s = int(((start_a + 70 ))/p_nuc_bin_size)
+                e = int(((end_a - 70))/p_nuc_bin_size)
+                vote_nucleosome[ s:(e+1) ] += 1/(e-s)
+
+            # Any starting point of a molecule is _always_ part of a linker
+            # lets say += 25bp
+
+            c=o_start_a
+            s = int(((c-linker_vote_radius))/p_nuc_bin_size)
+            e = int(((c+linker_vote_radius))/p_nuc_bin_size)+1
+            vote_linker[s:(e+1)] += 1/(e-s)
+
+            if end_a > start_b:
+                continue
+
+            if start_b - end_a  > max_linker_length: # The distance is larger than 90bp, which is a very long linker. Skip the linker vote
+                continue
+
+            s = int(((end_a))/p_nuc_bin_size)
+            e = int(((start_b))/p_nuc_bin_size)
+
+
+            vote_linker[s:(e+1)] += 1/(e-s+1)
+    return vote_nucleosome, vote_linker
+
+
+async def write_to_bw(handle, starts, ends,  values, contig,size=None):
+
+    if size is not None:
+        print( ends[ends>=size] )
+
+    handle.addEntries(
+        [contig]*len(starts), #Contig
+        starts.astype(np.int64) , #Start
+        ends= ends.astype(np.int64) ,  #end
+        values=  values.astype(np.float64)
+    )
+
+async def coordinate_dicts_to_nucleosome_position_files(bams, molecule_coordinate_dicts, p_nuc_bin_size = 5, alias='npos', n_threads=None ):
+
+    contigs = get_contigs(bams[0])
+    sizes = get_contig_sizes(bams[0])
+
+
+    # Create output bigwig file handles and worker pool
+    linker_vote_write_path= f'{alias}_linkers.bw'
+    nuc_vote_write_path= f'{alias}_nucleosomes.bw'
+    merged_vote_write_path= f'{alias}_nucleosomes_min_linkers.bw'
+    centroids = f'{alias}_nucleosome_centers.bed'
+
+
+    with Pool(n_threads) as workers, \
+        pyBigWig.open(linker_vote_write_path,'w') as linkers_out, \
+        pyBigWig.open(nuc_vote_write_path,'w') as nuc_out, \
+        pyBigWig.open(merged_vote_write_path,'w') as merged_out, \
+        open(centroids,'w') as centroids_out:
+
+        # Add headers for all output bigwigfiles
+        for h in (linkers_out, nuc_out, merged_out):
+            # Write header
+            h.addHeader(list(zip(contigs, [sizes[c] for c in contigs])))
+
+
+        # Obtain nucleosome and linker votes for each contig
+        for ret_contig, (vote_nucleosome, vote_linker) in zip(
+                    molecule_coordinate_dicts.keys(),
+                    workers.imap(pool_wrapper,
+            (
+            (contig_coordinate_dict_to_votes,
+            {
+                'd':d,
+                'n':sizes[contig],
+                'min_total_cuts_per_cell':3,
+                'p_nuc_bin_size':5,
+                'max_linker_length':90,
+                'linker_vote_radius':25
+            })
+                for contig, d in molecule_coordinate_dicts.items()
+            ))):
+
+            print(f'Writing votes for {ret_contig}')
+            contig_len = sizes[ret_contig]
+
+            smooth_linkers = gaussian_filter1d(vote_linker,2)
+            smooth_nuc = gaussian_filter1d(vote_nucleosome,2)
+
+            # Write nucleosome vote track:
+            starts = np.array( np.linspace(0, contig_len-p_nuc_bin_size-1, len(vote_nucleosome)) )
+
+            size = sizes[ret_contig]
+            # Check if all numbers are preceding...
+            d = np.diff(starts)
+            d = d[d<1]
+            if len(d):
+                print(d)
+            else:
+                print(f'{len(starts)} Coordinates are in correct order')
+
+
+            await asyncio.gather(
+                write_to_bw(nuc_out, starts, starts+p_nuc_bin_size,  np.nan_to_num(smooth_nuc), ret_contig, size),
+                write_to_bw(linkers_out, starts, starts+p_nuc_bin_size,  np.nan_to_num(smooth_linkers), ret_contig, size),
+                write_to_bw(merged_out, starts, starts+p_nuc_bin_size,  np.nan_to_num(smooth_nuc - smooth_linkers), ret_contig, size=size)
+            )
+
+
+            # Use the find peak function to select properly spaced nucleosome positions.
+            print(f'Writing centroids for {ret_contig}')
+            mindist = 147
+            min_dist_bins = int( mindist / p_nuc_bin_size )
+            estimated_nucleosome_positions = find_peaks( gaussian_filter1d(smooth_nuc - smooth_linkers,2),distance=min_dist_bins)[0]
+
+            for nucpos, score in zip( estimated_nucleosome_positions*p_nuc_bin_size, smooth_nuc[estimated_nucleosome_positions]):
+                centroids_out.write(f'{ret_contig}\t{nucpos}\t{nucpos+p_nuc_bin_size}\t{score}\n')
+
+
+
+    #contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25)
+
+
+if __name__ == '__main__':
+
+    argparser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description='Estimate nucleosome positions from scCHiC seq bam file(s)')
+
+    argparser.add_argument('alignmentfiles', type=str, nargs='+')
+    argparser.add_argument('-o', type=str, required=True, help='output prefix')
+
+    argparser.add_argument('-bin_size', type=int,  default=3, help='Nucleosome position precision (bp), increasing this value increases memory consumption linearly')
+    argparser.add_argument('-n_threads', type=int, help='Amount of worker threads')
+
+    args = argparser.parse_args()
+
+    async def main(args):
+        # Run the analysis:
+        print('Creating molecule coordinate database')
+        d = generate_molecule_coordinate_dict(args.alignmentfiles, args.n_threads)
+        print('Estimating nucleosome positions')
+        await coordinate_dicts_to_nucleosome_position_files(args.alignmentfiles,
+                    d,
+                    p_nuc_bin_size = args.bin_size,
+                    n_threads=args.n_threads,
+                    alias=args.o)
+
+
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(main(args))
+
+    #asyncio.run(main(args)) # python >= 3.7