--- a +++ b/singlecellmultiomics/bamProcessing/bamToNucleosomePositions.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from singlecellmultiomics.molecule import MoleculeIterator, CHICMolecule +from singlecellmultiomics.fragment import CHICFragment +from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_path_from_bam, get_contigs +from collections import defaultdict +import pyBigWig +import pysam +from singlecellmultiomics.bamProcessing.bamBinCounts import get_binned_counts +import pandas as pd +import argparse +from singlecellmultiomics.bamProcessing import get_contig_sizes +from singlecellmultiomics.utils import is_autosome, pool_wrapper +import numpy as np +from multiprocessing import Pool +from more_itertools import windowed +from glob import glob +import pyBigWig +from scipy.ndimage import gaussian_filter1d +from scipy.signal import find_peaks +import argparse +from itertools import product +import asyncio + +def get_aligned_chic_len(molecule): + + for f in molecule: + f.R2_primer_length = 6 + + mlen = molecule.estimated_max_length + if mlen is None: + return None + + return mlen + 2 # 2 extra bases because 1 base is potentially lost on both sides + +def _generate_molecule_coordinate_dict(args, max_fragment_size=800): + + bam_path, sel_contig= args + molecule_coordinate_dict = defaultdict(list) # cell->[ (start,end), (start,end) ..] + with pysam.AlignmentFile(bam_path) as alignments: + # pysam.FastaFile(get_reference_path_from_bam(alignments)) as reference: + + for molecule in MoleculeIterator(alignments, + contig=sel_contig, + molecule_class=CHICMolecule, + fragment_class = CHICFragment, + #molecule_class_args = {'reference':reference} + ): + + + + contig, start, strand = molecule.get_cut_site() + if not molecule.is_completely_matching: + continue + + if molecule.get_mean_mapping_qual()<55: + continue + slen = get_aligned_chic_len(molecule) + + if slen is None or slen>max_fragment_size: + continue + + if strand: + end = start - slen + else: + end = start + slen + + + molecule_coordinate_dict[molecule.sample].append( (start,end) ) + for sample in molecule_coordinate_dict: + molecule_coordinate_dict[sample] = sorted(molecule_coordinate_dict[sample]) + return sel_contig, molecule_coordinate_dict + + +def generate_molecule_coordinate_dict(bams, n_threads=None): + + molecule_coordinate_dicts = {} + contigs = get_contigs(bams[0]) + + with Pool(n_threads) as workers: + for contig,d in workers.imap( _generate_molecule_coordinate_dict, + product(bams,filter(is_autosome, contigs))): + if not contig in molecule_coordinate_dicts: + + molecule_coordinate_dicts[contig] = d + else: + molecule_coordinate_dicts[contig].update( d ) + print(f'Finished {contig}') + return molecule_coordinate_dicts + + + #bin_size = 100_000 + +#distances_per_bin = defaultdict(lambda: defaultdict(list) ) + +def contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25): + #molecule_coordinate_dicts[contig] = d + # n = size of contig + vote_nucleosome = np.zeros(int( n/p_nuc_bin_size) ) + vote_linker = np.zeros(int( n/p_nuc_bin_size) ) + + for cell in d.keys(): + if len(d[cell])<min_total_cuts_per_cell: + continue + for (start_a, end_a),(start_b,end_b) in windowed( d[cell],2): + + o_start_a = start_a + + #Check if the fragments are not overlapping: + (start_a, end_a) = min(start_a, end_a), max(start_a, end_a) + (start_b, end_b) = min(start_b, end_b), max(start_b, end_b) + + # The locations of the nucleosomes are constrained: + # \\ start_a ----- end_a \\ \\ start_b ---- end_b \\ + if end_a - start_a >= 147 and end_a - start_a < (147*2+10): # it could contain a single nucleosome, but not more + s = int(((start_a + 70 ))/p_nuc_bin_size) + e = int(((end_a - 70))/p_nuc_bin_size) + vote_nucleosome[ s:(e+1) ] += 1/(e-s) + + # Any starting point of a molecule is _always_ part of a linker + # lets say += 25bp + + c=o_start_a + s = int(((c-linker_vote_radius))/p_nuc_bin_size) + e = int(((c+linker_vote_radius))/p_nuc_bin_size)+1 + vote_linker[s:(e+1)] += 1/(e-s) + + if end_a > start_b: + continue + + if start_b - end_a > max_linker_length: # The distance is larger than 90bp, which is a very long linker. Skip the linker vote + continue + + s = int(((end_a))/p_nuc_bin_size) + e = int(((start_b))/p_nuc_bin_size) + + + vote_linker[s:(e+1)] += 1/(e-s+1) + return vote_nucleosome, vote_linker + + +async def write_to_bw(handle, starts, ends, values, contig,size=None): + + if size is not None: + print( ends[ends>=size] ) + + handle.addEntries( + [contig]*len(starts), #Contig + starts.astype(np.int64) , #Start + ends= ends.astype(np.int64) , #end + values= values.astype(np.float64) + ) + +async def coordinate_dicts_to_nucleosome_position_files(bams, molecule_coordinate_dicts, p_nuc_bin_size = 5, alias='npos', n_threads=None ): + + contigs = get_contigs(bams[0]) + sizes = get_contig_sizes(bams[0]) + + + # Create output bigwig file handles and worker pool + linker_vote_write_path= f'{alias}_linkers.bw' + nuc_vote_write_path= f'{alias}_nucleosomes.bw' + merged_vote_write_path= f'{alias}_nucleosomes_min_linkers.bw' + centroids = f'{alias}_nucleosome_centers.bed' + + + with Pool(n_threads) as workers, \ + pyBigWig.open(linker_vote_write_path,'w') as linkers_out, \ + pyBigWig.open(nuc_vote_write_path,'w') as nuc_out, \ + pyBigWig.open(merged_vote_write_path,'w') as merged_out, \ + open(centroids,'w') as centroids_out: + + # Add headers for all output bigwigfiles + for h in (linkers_out, nuc_out, merged_out): + # Write header + h.addHeader(list(zip(contigs, [sizes[c] for c in contigs]))) + + + # Obtain nucleosome and linker votes for each contig + for ret_contig, (vote_nucleosome, vote_linker) in zip( + molecule_coordinate_dicts.keys(), + workers.imap(pool_wrapper, + ( + (contig_coordinate_dict_to_votes, + { + 'd':d, + 'n':sizes[contig], + 'min_total_cuts_per_cell':3, + 'p_nuc_bin_size':5, + 'max_linker_length':90, + 'linker_vote_radius':25 + }) + for contig, d in molecule_coordinate_dicts.items() + ))): + + print(f'Writing votes for {ret_contig}') + contig_len = sizes[ret_contig] + + smooth_linkers = gaussian_filter1d(vote_linker,2) + smooth_nuc = gaussian_filter1d(vote_nucleosome,2) + + # Write nucleosome vote track: + starts = np.array( np.linspace(0, contig_len-p_nuc_bin_size-1, len(vote_nucleosome)) ) + + size = sizes[ret_contig] + # Check if all numbers are preceding... + d = np.diff(starts) + d = d[d<1] + if len(d): + print(d) + else: + print(f'{len(starts)} Coordinates are in correct order') + + + await asyncio.gather( + write_to_bw(nuc_out, starts, starts+p_nuc_bin_size, np.nan_to_num(smooth_nuc), ret_contig, size), + write_to_bw(linkers_out, starts, starts+p_nuc_bin_size, np.nan_to_num(smooth_linkers), ret_contig, size), + write_to_bw(merged_out, starts, starts+p_nuc_bin_size, np.nan_to_num(smooth_nuc - smooth_linkers), ret_contig, size=size) + ) + + + # Use the find peak function to select properly spaced nucleosome positions. + print(f'Writing centroids for {ret_contig}') + mindist = 147 + min_dist_bins = int( mindist / p_nuc_bin_size ) + estimated_nucleosome_positions = find_peaks( gaussian_filter1d(smooth_nuc - smooth_linkers,2),distance=min_dist_bins)[0] + + for nucpos, score in zip( estimated_nucleosome_positions*p_nuc_bin_size, smooth_nuc[estimated_nucleosome_positions]): + centroids_out.write(f'{ret_contig}\t{nucpos}\t{nucpos+p_nuc_bin_size}\t{score}\n') + + + + #contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25) + + +if __name__ == '__main__': + + argparser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='Estimate nucleosome positions from scCHiC seq bam file(s)') + + argparser.add_argument('alignmentfiles', type=str, nargs='+') + argparser.add_argument('-o', type=str, required=True, help='output prefix') + + argparser.add_argument('-bin_size', type=int, default=3, help='Nucleosome position precision (bp), increasing this value increases memory consumption linearly') + argparser.add_argument('-n_threads', type=int, help='Amount of worker threads') + + args = argparser.parse_args() + + async def main(args): + # Run the analysis: + print('Creating molecule coordinate database') + d = generate_molecule_coordinate_dict(args.alignmentfiles, args.n_threads) + print('Estimating nucleosome positions') + await coordinate_dicts_to_nucleosome_position_files(args.alignmentfiles, + d, + p_nuc_bin_size = args.bin_size, + n_threads=args.n_threads, + alias=args.o) + + + loop = asyncio.get_event_loop() + loop.run_until_complete(main(args)) + + #asyncio.run(main(args)) # python >= 3.7