|
a |
|
b/singlecellmultiomics/bamProcessing/bamToNucleosomePositions.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
|
|
|
4 |
from singlecellmultiomics.molecule import MoleculeIterator, CHICMolecule |
|
|
5 |
from singlecellmultiomics.fragment import CHICFragment |
|
|
6 |
from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_path_from_bam, get_contigs |
|
|
7 |
from collections import defaultdict |
|
|
8 |
import pyBigWig |
|
|
9 |
import pysam |
|
|
10 |
from singlecellmultiomics.bamProcessing.bamBinCounts import get_binned_counts |
|
|
11 |
import pandas as pd |
|
|
12 |
import argparse |
|
|
13 |
from singlecellmultiomics.bamProcessing import get_contig_sizes |
|
|
14 |
from singlecellmultiomics.utils import is_autosome, pool_wrapper |
|
|
15 |
import numpy as np |
|
|
16 |
from multiprocessing import Pool |
|
|
17 |
from more_itertools import windowed |
|
|
18 |
from glob import glob |
|
|
19 |
import pyBigWig |
|
|
20 |
from scipy.ndimage import gaussian_filter1d |
|
|
21 |
from scipy.signal import find_peaks |
|
|
22 |
import argparse |
|
|
23 |
from itertools import product |
|
|
24 |
import asyncio |
|
|
25 |
|
|
|
26 |
def get_aligned_chic_len(molecule): |
|
|
27 |
|
|
|
28 |
for f in molecule: |
|
|
29 |
f.R2_primer_length = 6 |
|
|
30 |
|
|
|
31 |
mlen = molecule.estimated_max_length |
|
|
32 |
if mlen is None: |
|
|
33 |
return None |
|
|
34 |
|
|
|
35 |
return mlen + 2 # 2 extra bases because 1 base is potentially lost on both sides |
|
|
36 |
|
|
|
37 |
def _generate_molecule_coordinate_dict(args, max_fragment_size=800): |
|
|
38 |
|
|
|
39 |
bam_path, sel_contig= args |
|
|
40 |
molecule_coordinate_dict = defaultdict(list) # cell->[ (start,end), (start,end) ..] |
|
|
41 |
with pysam.AlignmentFile(bam_path) as alignments: |
|
|
42 |
# pysam.FastaFile(get_reference_path_from_bam(alignments)) as reference: |
|
|
43 |
|
|
|
44 |
for molecule in MoleculeIterator(alignments, |
|
|
45 |
contig=sel_contig, |
|
|
46 |
molecule_class=CHICMolecule, |
|
|
47 |
fragment_class = CHICFragment, |
|
|
48 |
#molecule_class_args = {'reference':reference} |
|
|
49 |
): |
|
|
50 |
|
|
|
51 |
|
|
|
52 |
|
|
|
53 |
contig, start, strand = molecule.get_cut_site() |
|
|
54 |
if not molecule.is_completely_matching: |
|
|
55 |
continue |
|
|
56 |
|
|
|
57 |
if molecule.get_mean_mapping_qual()<55: |
|
|
58 |
continue |
|
|
59 |
slen = get_aligned_chic_len(molecule) |
|
|
60 |
|
|
|
61 |
if slen is None or slen>max_fragment_size: |
|
|
62 |
continue |
|
|
63 |
|
|
|
64 |
if strand: |
|
|
65 |
end = start - slen |
|
|
66 |
else: |
|
|
67 |
end = start + slen |
|
|
68 |
|
|
|
69 |
|
|
|
70 |
molecule_coordinate_dict[molecule.sample].append( (start,end) ) |
|
|
71 |
for sample in molecule_coordinate_dict: |
|
|
72 |
molecule_coordinate_dict[sample] = sorted(molecule_coordinate_dict[sample]) |
|
|
73 |
return sel_contig, molecule_coordinate_dict |
|
|
74 |
|
|
|
75 |
|
|
|
76 |
def generate_molecule_coordinate_dict(bams, n_threads=None): |
|
|
77 |
|
|
|
78 |
molecule_coordinate_dicts = {} |
|
|
79 |
contigs = get_contigs(bams[0]) |
|
|
80 |
|
|
|
81 |
with Pool(n_threads) as workers: |
|
|
82 |
for contig,d in workers.imap( _generate_molecule_coordinate_dict, |
|
|
83 |
product(bams,filter(is_autosome, contigs))): |
|
|
84 |
if not contig in molecule_coordinate_dicts: |
|
|
85 |
|
|
|
86 |
molecule_coordinate_dicts[contig] = d |
|
|
87 |
else: |
|
|
88 |
molecule_coordinate_dicts[contig].update( d ) |
|
|
89 |
print(f'Finished {contig}') |
|
|
90 |
return molecule_coordinate_dicts |
|
|
91 |
|
|
|
92 |
|
|
|
93 |
#bin_size = 100_000 |
|
|
94 |
|
|
|
95 |
#distances_per_bin = defaultdict(lambda: defaultdict(list) ) |
|
|
96 |
|
|
|
97 |
def contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25): |
|
|
98 |
#molecule_coordinate_dicts[contig] = d |
|
|
99 |
# n = size of contig |
|
|
100 |
vote_nucleosome = np.zeros(int( n/p_nuc_bin_size) ) |
|
|
101 |
vote_linker = np.zeros(int( n/p_nuc_bin_size) ) |
|
|
102 |
|
|
|
103 |
for cell in d.keys(): |
|
|
104 |
if len(d[cell])<min_total_cuts_per_cell: |
|
|
105 |
continue |
|
|
106 |
for (start_a, end_a),(start_b,end_b) in windowed( d[cell],2): |
|
|
107 |
|
|
|
108 |
o_start_a = start_a |
|
|
109 |
|
|
|
110 |
#Check if the fragments are not overlapping: |
|
|
111 |
(start_a, end_a) = min(start_a, end_a), max(start_a, end_a) |
|
|
112 |
(start_b, end_b) = min(start_b, end_b), max(start_b, end_b) |
|
|
113 |
|
|
|
114 |
# The locations of the nucleosomes are constrained: |
|
|
115 |
# \\ start_a ----- end_a \\ \\ start_b ---- end_b \\ |
|
|
116 |
if end_a - start_a >= 147 and end_a - start_a < (147*2+10): # it could contain a single nucleosome, but not more |
|
|
117 |
s = int(((start_a + 70 ))/p_nuc_bin_size) |
|
|
118 |
e = int(((end_a - 70))/p_nuc_bin_size) |
|
|
119 |
vote_nucleosome[ s:(e+1) ] += 1/(e-s) |
|
|
120 |
|
|
|
121 |
# Any starting point of a molecule is _always_ part of a linker |
|
|
122 |
# lets say += 25bp |
|
|
123 |
|
|
|
124 |
c=o_start_a |
|
|
125 |
s = int(((c-linker_vote_radius))/p_nuc_bin_size) |
|
|
126 |
e = int(((c+linker_vote_radius))/p_nuc_bin_size)+1 |
|
|
127 |
vote_linker[s:(e+1)] += 1/(e-s) |
|
|
128 |
|
|
|
129 |
if end_a > start_b: |
|
|
130 |
continue |
|
|
131 |
|
|
|
132 |
if start_b - end_a > max_linker_length: # The distance is larger than 90bp, which is a very long linker. Skip the linker vote |
|
|
133 |
continue |
|
|
134 |
|
|
|
135 |
s = int(((end_a))/p_nuc_bin_size) |
|
|
136 |
e = int(((start_b))/p_nuc_bin_size) |
|
|
137 |
|
|
|
138 |
|
|
|
139 |
vote_linker[s:(e+1)] += 1/(e-s+1) |
|
|
140 |
return vote_nucleosome, vote_linker |
|
|
141 |
|
|
|
142 |
|
|
|
143 |
async def write_to_bw(handle, starts, ends, values, contig,size=None): |
|
|
144 |
|
|
|
145 |
if size is not None: |
|
|
146 |
print( ends[ends>=size] ) |
|
|
147 |
|
|
|
148 |
handle.addEntries( |
|
|
149 |
[contig]*len(starts), #Contig |
|
|
150 |
starts.astype(np.int64) , #Start |
|
|
151 |
ends= ends.astype(np.int64) , #end |
|
|
152 |
values= values.astype(np.float64) |
|
|
153 |
) |
|
|
154 |
|
|
|
155 |
async def coordinate_dicts_to_nucleosome_position_files(bams, molecule_coordinate_dicts, p_nuc_bin_size = 5, alias='npos', n_threads=None ): |
|
|
156 |
|
|
|
157 |
contigs = get_contigs(bams[0]) |
|
|
158 |
sizes = get_contig_sizes(bams[0]) |
|
|
159 |
|
|
|
160 |
|
|
|
161 |
# Create output bigwig file handles and worker pool |
|
|
162 |
linker_vote_write_path= f'{alias}_linkers.bw' |
|
|
163 |
nuc_vote_write_path= f'{alias}_nucleosomes.bw' |
|
|
164 |
merged_vote_write_path= f'{alias}_nucleosomes_min_linkers.bw' |
|
|
165 |
centroids = f'{alias}_nucleosome_centers.bed' |
|
|
166 |
|
|
|
167 |
|
|
|
168 |
with Pool(n_threads) as workers, \ |
|
|
169 |
pyBigWig.open(linker_vote_write_path,'w') as linkers_out, \ |
|
|
170 |
pyBigWig.open(nuc_vote_write_path,'w') as nuc_out, \ |
|
|
171 |
pyBigWig.open(merged_vote_write_path,'w') as merged_out, \ |
|
|
172 |
open(centroids,'w') as centroids_out: |
|
|
173 |
|
|
|
174 |
# Add headers for all output bigwigfiles |
|
|
175 |
for h in (linkers_out, nuc_out, merged_out): |
|
|
176 |
# Write header |
|
|
177 |
h.addHeader(list(zip(contigs, [sizes[c] for c in contigs]))) |
|
|
178 |
|
|
|
179 |
|
|
|
180 |
# Obtain nucleosome and linker votes for each contig |
|
|
181 |
for ret_contig, (vote_nucleosome, vote_linker) in zip( |
|
|
182 |
molecule_coordinate_dicts.keys(), |
|
|
183 |
workers.imap(pool_wrapper, |
|
|
184 |
( |
|
|
185 |
(contig_coordinate_dict_to_votes, |
|
|
186 |
{ |
|
|
187 |
'd':d, |
|
|
188 |
'n':sizes[contig], |
|
|
189 |
'min_total_cuts_per_cell':3, |
|
|
190 |
'p_nuc_bin_size':5, |
|
|
191 |
'max_linker_length':90, |
|
|
192 |
'linker_vote_radius':25 |
|
|
193 |
}) |
|
|
194 |
for contig, d in molecule_coordinate_dicts.items() |
|
|
195 |
))): |
|
|
196 |
|
|
|
197 |
print(f'Writing votes for {ret_contig}') |
|
|
198 |
contig_len = sizes[ret_contig] |
|
|
199 |
|
|
|
200 |
smooth_linkers = gaussian_filter1d(vote_linker,2) |
|
|
201 |
smooth_nuc = gaussian_filter1d(vote_nucleosome,2) |
|
|
202 |
|
|
|
203 |
# Write nucleosome vote track: |
|
|
204 |
starts = np.array( np.linspace(0, contig_len-p_nuc_bin_size-1, len(vote_nucleosome)) ) |
|
|
205 |
|
|
|
206 |
size = sizes[ret_contig] |
|
|
207 |
# Check if all numbers are preceding... |
|
|
208 |
d = np.diff(starts) |
|
|
209 |
d = d[d<1] |
|
|
210 |
if len(d): |
|
|
211 |
print(d) |
|
|
212 |
else: |
|
|
213 |
print(f'{len(starts)} Coordinates are in correct order') |
|
|
214 |
|
|
|
215 |
|
|
|
216 |
await asyncio.gather( |
|
|
217 |
write_to_bw(nuc_out, starts, starts+p_nuc_bin_size, np.nan_to_num(smooth_nuc), ret_contig, size), |
|
|
218 |
write_to_bw(linkers_out, starts, starts+p_nuc_bin_size, np.nan_to_num(smooth_linkers), ret_contig, size), |
|
|
219 |
write_to_bw(merged_out, starts, starts+p_nuc_bin_size, np.nan_to_num(smooth_nuc - smooth_linkers), ret_contig, size=size) |
|
|
220 |
) |
|
|
221 |
|
|
|
222 |
|
|
|
223 |
# Use the find peak function to select properly spaced nucleosome positions. |
|
|
224 |
print(f'Writing centroids for {ret_contig}') |
|
|
225 |
mindist = 147 |
|
|
226 |
min_dist_bins = int( mindist / p_nuc_bin_size ) |
|
|
227 |
estimated_nucleosome_positions = find_peaks( gaussian_filter1d(smooth_nuc - smooth_linkers,2),distance=min_dist_bins)[0] |
|
|
228 |
|
|
|
229 |
for nucpos, score in zip( estimated_nucleosome_positions*p_nuc_bin_size, smooth_nuc[estimated_nucleosome_positions]): |
|
|
230 |
centroids_out.write(f'{ret_contig}\t{nucpos}\t{nucpos+p_nuc_bin_size}\t{score}\n') |
|
|
231 |
|
|
|
232 |
|
|
|
233 |
|
|
|
234 |
#contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25) |
|
|
235 |
|
|
|
236 |
|
|
|
237 |
if __name__ == '__main__': |
|
|
238 |
|
|
|
239 |
argparser = argparse.ArgumentParser( |
|
|
240 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
|
|
241 |
description='Estimate nucleosome positions from scCHiC seq bam file(s)') |
|
|
242 |
|
|
|
243 |
argparser.add_argument('alignmentfiles', type=str, nargs='+') |
|
|
244 |
argparser.add_argument('-o', type=str, required=True, help='output prefix') |
|
|
245 |
|
|
|
246 |
argparser.add_argument('-bin_size', type=int, default=3, help='Nucleosome position precision (bp), increasing this value increases memory consumption linearly') |
|
|
247 |
argparser.add_argument('-n_threads', type=int, help='Amount of worker threads') |
|
|
248 |
|
|
|
249 |
args = argparser.parse_args() |
|
|
250 |
|
|
|
251 |
async def main(args): |
|
|
252 |
# Run the analysis: |
|
|
253 |
print('Creating molecule coordinate database') |
|
|
254 |
d = generate_molecule_coordinate_dict(args.alignmentfiles, args.n_threads) |
|
|
255 |
print('Estimating nucleosome positions') |
|
|
256 |
await coordinate_dicts_to_nucleosome_position_files(args.alignmentfiles, |
|
|
257 |
d, |
|
|
258 |
p_nuc_bin_size = args.bin_size, |
|
|
259 |
n_threads=args.n_threads, |
|
|
260 |
alias=args.o) |
|
|
261 |
|
|
|
262 |
|
|
|
263 |
loop = asyncio.get_event_loop() |
|
|
264 |
loop.run_until_complete(main(args)) |
|
|
265 |
|
|
|
266 |
#asyncio.run(main(args)) # python >= 3.7 |