[45ad7e]: / singlecellmultiomics / bamProcessing / bamToNucleosomePositions.py

Download this file

267 lines (195 with data), 10.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from singlecellmultiomics.molecule import MoleculeIterator, CHICMolecule
from singlecellmultiomics.fragment import CHICFragment
from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_path_from_bam, get_contigs
from collections import defaultdict
import pyBigWig
import pysam
from singlecellmultiomics.bamProcessing.bamBinCounts import get_binned_counts
import pandas as pd
import argparse
from singlecellmultiomics.bamProcessing import get_contig_sizes
from singlecellmultiomics.utils import is_autosome, pool_wrapper
import numpy as np
from multiprocessing import Pool
from more_itertools import windowed
from glob import glob
import pyBigWig
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks
import argparse
from itertools import product
import asyncio
def get_aligned_chic_len(molecule):
for f in molecule:
f.R2_primer_length = 6
mlen = molecule.estimated_max_length
if mlen is None:
return None
return mlen + 2 # 2 extra bases because 1 base is potentially lost on both sides
def _generate_molecule_coordinate_dict(args, max_fragment_size=800):
bam_path, sel_contig= args
molecule_coordinate_dict = defaultdict(list) # cell->[ (start,end), (start,end) ..]
with pysam.AlignmentFile(bam_path) as alignments:
# pysam.FastaFile(get_reference_path_from_bam(alignments)) as reference:
for molecule in MoleculeIterator(alignments,
contig=sel_contig,
molecule_class=CHICMolecule,
fragment_class = CHICFragment,
#molecule_class_args = {'reference':reference}
):
contig, start, strand = molecule.get_cut_site()
if not molecule.is_completely_matching:
continue
if molecule.get_mean_mapping_qual()<55:
continue
slen = get_aligned_chic_len(molecule)
if slen is None or slen>max_fragment_size:
continue
if strand:
end = start - slen
else:
end = start + slen
molecule_coordinate_dict[molecule.sample].append( (start,end) )
for sample in molecule_coordinate_dict:
molecule_coordinate_dict[sample] = sorted(molecule_coordinate_dict[sample])
return sel_contig, molecule_coordinate_dict
def generate_molecule_coordinate_dict(bams, n_threads=None):
molecule_coordinate_dicts = {}
contigs = get_contigs(bams[0])
with Pool(n_threads) as workers:
for contig,d in workers.imap( _generate_molecule_coordinate_dict,
product(bams,filter(is_autosome, contigs))):
if not contig in molecule_coordinate_dicts:
molecule_coordinate_dicts[contig] = d
else:
molecule_coordinate_dicts[contig].update( d )
print(f'Finished {contig}')
return molecule_coordinate_dicts
#bin_size = 100_000
#distances_per_bin = defaultdict(lambda: defaultdict(list) )
def contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25):
#molecule_coordinate_dicts[contig] = d
# n = size of contig
vote_nucleosome = np.zeros(int( n/p_nuc_bin_size) )
vote_linker = np.zeros(int( n/p_nuc_bin_size) )
for cell in d.keys():
if len(d[cell])<min_total_cuts_per_cell:
continue
for (start_a, end_a),(start_b,end_b) in windowed( d[cell],2):
o_start_a = start_a
#Check if the fragments are not overlapping:
(start_a, end_a) = min(start_a, end_a), max(start_a, end_a)
(start_b, end_b) = min(start_b, end_b), max(start_b, end_b)
# The locations of the nucleosomes are constrained:
# \\ start_a ----- end_a \\ \\ start_b ---- end_b \\
if end_a - start_a >= 147 and end_a - start_a < (147*2+10): # it could contain a single nucleosome, but not more
s = int(((start_a + 70 ))/p_nuc_bin_size)
e = int(((end_a - 70))/p_nuc_bin_size)
vote_nucleosome[ s:(e+1) ] += 1/(e-s)
# Any starting point of a molecule is _always_ part of a linker
# lets say += 25bp
c=o_start_a
s = int(((c-linker_vote_radius))/p_nuc_bin_size)
e = int(((c+linker_vote_radius))/p_nuc_bin_size)+1
vote_linker[s:(e+1)] += 1/(e-s)
if end_a > start_b:
continue
if start_b - end_a > max_linker_length: # The distance is larger than 90bp, which is a very long linker. Skip the linker vote
continue
s = int(((end_a))/p_nuc_bin_size)
e = int(((start_b))/p_nuc_bin_size)
vote_linker[s:(e+1)] += 1/(e-s+1)
return vote_nucleosome, vote_linker
async def write_to_bw(handle, starts, ends, values, contig,size=None):
if size is not None:
print( ends[ends>=size] )
handle.addEntries(
[contig]*len(starts), #Contig
starts.astype(np.int64) , #Start
ends= ends.astype(np.int64) , #end
values= values.astype(np.float64)
)
async def coordinate_dicts_to_nucleosome_position_files(bams, molecule_coordinate_dicts, p_nuc_bin_size = 5, alias='npos', n_threads=None ):
contigs = get_contigs(bams[0])
sizes = get_contig_sizes(bams[0])
# Create output bigwig file handles and worker pool
linker_vote_write_path= f'{alias}_linkers.bw'
nuc_vote_write_path= f'{alias}_nucleosomes.bw'
merged_vote_write_path= f'{alias}_nucleosomes_min_linkers.bw'
centroids = f'{alias}_nucleosome_centers.bed'
with Pool(n_threads) as workers, \
pyBigWig.open(linker_vote_write_path,'w') as linkers_out, \
pyBigWig.open(nuc_vote_write_path,'w') as nuc_out, \
pyBigWig.open(merged_vote_write_path,'w') as merged_out, \
open(centroids,'w') as centroids_out:
# Add headers for all output bigwigfiles
for h in (linkers_out, nuc_out, merged_out):
# Write header
h.addHeader(list(zip(contigs, [sizes[c] for c in contigs])))
# Obtain nucleosome and linker votes for each contig
for ret_contig, (vote_nucleosome, vote_linker) in zip(
molecule_coordinate_dicts.keys(),
workers.imap(pool_wrapper,
(
(contig_coordinate_dict_to_votes,
{
'd':d,
'n':sizes[contig],
'min_total_cuts_per_cell':3,
'p_nuc_bin_size':5,
'max_linker_length':90,
'linker_vote_radius':25
})
for contig, d in molecule_coordinate_dicts.items()
))):
print(f'Writing votes for {ret_contig}')
contig_len = sizes[ret_contig]
smooth_linkers = gaussian_filter1d(vote_linker,2)
smooth_nuc = gaussian_filter1d(vote_nucleosome,2)
# Write nucleosome vote track:
starts = np.array( np.linspace(0, contig_len-p_nuc_bin_size-1, len(vote_nucleosome)) )
size = sizes[ret_contig]
# Check if all numbers are preceding...
d = np.diff(starts)
d = d[d<1]
if len(d):
print(d)
else:
print(f'{len(starts)} Coordinates are in correct order')
await asyncio.gather(
write_to_bw(nuc_out, starts, starts+p_nuc_bin_size, np.nan_to_num(smooth_nuc), ret_contig, size),
write_to_bw(linkers_out, starts, starts+p_nuc_bin_size, np.nan_to_num(smooth_linkers), ret_contig, size),
write_to_bw(merged_out, starts, starts+p_nuc_bin_size, np.nan_to_num(smooth_nuc - smooth_linkers), ret_contig, size=size)
)
# Use the find peak function to select properly spaced nucleosome positions.
print(f'Writing centroids for {ret_contig}')
mindist = 147
min_dist_bins = int( mindist / p_nuc_bin_size )
estimated_nucleosome_positions = find_peaks( gaussian_filter1d(smooth_nuc - smooth_linkers,2),distance=min_dist_bins)[0]
for nucpos, score in zip( estimated_nucleosome_positions*p_nuc_bin_size, smooth_nuc[estimated_nucleosome_positions]):
centroids_out.write(f'{ret_contig}\t{nucpos}\t{nucpos+p_nuc_bin_size}\t{score}\n')
#contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25)
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Estimate nucleosome positions from scCHiC seq bam file(s)')
argparser.add_argument('alignmentfiles', type=str, nargs='+')
argparser.add_argument('-o', type=str, required=True, help='output prefix')
argparser.add_argument('-bin_size', type=int, default=3, help='Nucleosome position precision (bp), increasing this value increases memory consumption linearly')
argparser.add_argument('-n_threads', type=int, help='Amount of worker threads')
args = argparser.parse_args()
async def main(args):
# Run the analysis:
print('Creating molecule coordinate database')
d = generate_molecule_coordinate_dict(args.alignmentfiles, args.n_threads)
print('Estimating nucleosome positions')
await coordinate_dicts_to_nucleosome_position_files(args.alignmentfiles,
d,
p_nuc_bin_size = args.bin_size,
n_threads=args.n_threads,
alias=args.o)
loop = asyncio.get_event_loop()
loop.run_until_complete(main(args))
#asyncio.run(main(args)) # python >= 3.7