Switch to unified view

a b/singlecellmultiomics/bamProcessing/bamToNucleosomePositions.py
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
4
from singlecellmultiomics.molecule import MoleculeIterator, CHICMolecule
5
from singlecellmultiomics.fragment import CHICFragment
6
from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_path_from_bam, get_contigs
7
from collections import defaultdict
8
import pyBigWig
9
import pysam
10
from singlecellmultiomics.bamProcessing.bamBinCounts import get_binned_counts
11
import pandas as pd
12
import argparse
13
from singlecellmultiomics.bamProcessing import get_contig_sizes
14
from singlecellmultiomics.utils import is_autosome, pool_wrapper
15
import numpy as np
16
from multiprocessing import Pool
17
from more_itertools import windowed
18
from glob import glob
19
import pyBigWig
20
from scipy.ndimage import gaussian_filter1d
21
from scipy.signal import find_peaks
22
import argparse
23
from itertools import product
24
import asyncio
25
26
def get_aligned_chic_len(molecule):
27
28
    for f in molecule:
29
        f.R2_primer_length = 6
30
31
    mlen = molecule.estimated_max_length
32
    if mlen is None:
33
        return None
34
35
    return mlen + 2 # 2 extra bases because 1 base is potentially lost on both sides
36
37
def _generate_molecule_coordinate_dict(args, max_fragment_size=800):
38
39
    bam_path,  sel_contig= args
40
    molecule_coordinate_dict = defaultdict(list) # cell->[ (start,end), (start,end) ..]
41
    with pysam.AlignmentFile(bam_path) as alignments:
42
       # pysam.FastaFile(get_reference_path_from_bam(alignments)) as reference:
43
44
        for molecule in MoleculeIterator(alignments,
45
                                         contig=sel_contig,
46
                                         molecule_class=CHICMolecule,
47
                                         fragment_class = CHICFragment,
48
                                         #molecule_class_args = {'reference':reference}
49
                                        ):
50
51
52
53
            contig, start, strand = molecule.get_cut_site()
54
            if not molecule.is_completely_matching:
55
                continue
56
57
            if molecule.get_mean_mapping_qual()<55:
58
                continue
59
            slen = get_aligned_chic_len(molecule)
60
61
            if slen is None or slen>max_fragment_size:
62
                continue
63
64
            if strand:
65
                end = start - slen
66
            else:
67
                end = start + slen
68
69
70
            molecule_coordinate_dict[molecule.sample].append( (start,end) )
71
    for sample in molecule_coordinate_dict:
72
        molecule_coordinate_dict[sample] =  sorted(molecule_coordinate_dict[sample])
73
    return sel_contig, molecule_coordinate_dict
74
75
76
def generate_molecule_coordinate_dict(bams, n_threads=None):
77
78
    molecule_coordinate_dicts = {}
79
    contigs = get_contigs(bams[0])
80
81
    with Pool(n_threads) as workers:
82
        for contig,d in workers.imap( _generate_molecule_coordinate_dict,
83
                                     product(bams,filter(is_autosome, contigs))):
84
            if not contig in molecule_coordinate_dicts:
85
86
                molecule_coordinate_dicts[contig] = d
87
            else:
88
                molecule_coordinate_dicts[contig].update( d )
89
            print(f'Finished {contig}')
90
    return molecule_coordinate_dicts
91
92
93
    #bin_size = 100_000
94
95
#distances_per_bin = defaultdict(lambda: defaultdict(list)  )
96
97
def contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25):
98
    #molecule_coordinate_dicts[contig] = d
99
    # n = size of contig
100
    vote_nucleosome = np.zeros(int( n/p_nuc_bin_size) )
101
    vote_linker = np.zeros(int( n/p_nuc_bin_size) )
102
103
    for cell in d.keys():
104
        if len(d[cell])<min_total_cuts_per_cell:
105
            continue
106
        for (start_a, end_a),(start_b,end_b) in windowed( d[cell],2):
107
108
            o_start_a = start_a
109
110
            #Check if the fragments are not overlapping:
111
            (start_a, end_a) = min(start_a, end_a), max(start_a, end_a)
112
            (start_b, end_b) = min(start_b, end_b), max(start_b, end_b)
113
114
            # The locations of the nucleosomes are constrained:
115
            # \\ start_a  -----  end_a \\     \\ start_b  ---- end_b \\
116
            if end_a - start_a >= 147 and end_a - start_a < (147*2+10): # it could contain a single nucleosome, but not more
117
                s = int(((start_a + 70 ))/p_nuc_bin_size)
118
                e = int(((end_a - 70))/p_nuc_bin_size)
119
                vote_nucleosome[ s:(e+1) ] += 1/(e-s)
120
121
            # Any starting point of a molecule is _always_ part of a linker
122
            # lets say += 25bp
123
124
            c=o_start_a
125
            s = int(((c-linker_vote_radius))/p_nuc_bin_size)
126
            e = int(((c+linker_vote_radius))/p_nuc_bin_size)+1
127
            vote_linker[s:(e+1)] += 1/(e-s)
128
129
            if end_a > start_b:
130
                continue
131
132
            if start_b - end_a  > max_linker_length: # The distance is larger than 90bp, which is a very long linker. Skip the linker vote
133
                continue
134
135
            s = int(((end_a))/p_nuc_bin_size)
136
            e = int(((start_b))/p_nuc_bin_size)
137
138
139
            vote_linker[s:(e+1)] += 1/(e-s+1)
140
    return vote_nucleosome, vote_linker
141
142
143
async def write_to_bw(handle, starts, ends,  values, contig,size=None):
144
145
    if size is not None:
146
        print( ends[ends>=size] )
147
148
    handle.addEntries(
149
        [contig]*len(starts), #Contig
150
        starts.astype(np.int64) , #Start
151
        ends= ends.astype(np.int64) ,  #end
152
        values=  values.astype(np.float64)
153
    )
154
155
async def coordinate_dicts_to_nucleosome_position_files(bams, molecule_coordinate_dicts, p_nuc_bin_size = 5, alias='npos', n_threads=None ):
156
157
    contigs = get_contigs(bams[0])
158
    sizes = get_contig_sizes(bams[0])
159
160
161
    # Create output bigwig file handles and worker pool
162
    linker_vote_write_path= f'{alias}_linkers.bw'
163
    nuc_vote_write_path= f'{alias}_nucleosomes.bw'
164
    merged_vote_write_path= f'{alias}_nucleosomes_min_linkers.bw'
165
    centroids = f'{alias}_nucleosome_centers.bed'
166
167
168
    with Pool(n_threads) as workers, \
169
        pyBigWig.open(linker_vote_write_path,'w') as linkers_out, \
170
        pyBigWig.open(nuc_vote_write_path,'w') as nuc_out, \
171
        pyBigWig.open(merged_vote_write_path,'w') as merged_out, \
172
        open(centroids,'w') as centroids_out:
173
174
        # Add headers for all output bigwigfiles
175
        for h in (linkers_out, nuc_out, merged_out):
176
            # Write header
177
            h.addHeader(list(zip(contigs, [sizes[c] for c in contigs])))
178
179
180
        # Obtain nucleosome and linker votes for each contig
181
        for ret_contig, (vote_nucleosome, vote_linker) in zip(
182
                    molecule_coordinate_dicts.keys(),
183
                    workers.imap(pool_wrapper,
184
            (
185
            (contig_coordinate_dict_to_votes,
186
            {
187
                'd':d,
188
                'n':sizes[contig],
189
                'min_total_cuts_per_cell':3,
190
                'p_nuc_bin_size':5,
191
                'max_linker_length':90,
192
                'linker_vote_radius':25
193
            })
194
                for contig, d in molecule_coordinate_dicts.items()
195
            ))):
196
197
            print(f'Writing votes for {ret_contig}')
198
            contig_len = sizes[ret_contig]
199
200
            smooth_linkers = gaussian_filter1d(vote_linker,2)
201
            smooth_nuc = gaussian_filter1d(vote_nucleosome,2)
202
203
            # Write nucleosome vote track:
204
            starts = np.array( np.linspace(0, contig_len-p_nuc_bin_size-1, len(vote_nucleosome)) )
205
206
            size = sizes[ret_contig]
207
            # Check if all numbers are preceding...
208
            d = np.diff(starts)
209
            d = d[d<1]
210
            if len(d):
211
                print(d)
212
            else:
213
                print(f'{len(starts)} Coordinates are in correct order')
214
215
216
            await asyncio.gather(
217
                write_to_bw(nuc_out, starts, starts+p_nuc_bin_size,  np.nan_to_num(smooth_nuc), ret_contig, size),
218
                write_to_bw(linkers_out, starts, starts+p_nuc_bin_size,  np.nan_to_num(smooth_linkers), ret_contig, size),
219
                write_to_bw(merged_out, starts, starts+p_nuc_bin_size,  np.nan_to_num(smooth_nuc - smooth_linkers), ret_contig, size=size)
220
            )
221
222
223
            # Use the find peak function to select properly spaced nucleosome positions.
224
            print(f'Writing centroids for {ret_contig}')
225
            mindist = 147
226
            min_dist_bins = int( mindist / p_nuc_bin_size )
227
            estimated_nucleosome_positions = find_peaks( gaussian_filter1d(smooth_nuc - smooth_linkers,2),distance=min_dist_bins)[0]
228
229
            for nucpos, score in zip( estimated_nucleosome_positions*p_nuc_bin_size, smooth_nuc[estimated_nucleosome_positions]):
230
                centroids_out.write(f'{ret_contig}\t{nucpos}\t{nucpos+p_nuc_bin_size}\t{score}\n')
231
232
233
234
    #contig_coordinate_dict_to_votes(d, n, min_total_cuts_per_cell=3, p_nuc_bin_size=5, max_linker_length=90, linker_vote_radius=25)
235
236
237
if __name__ == '__main__':
238
239
    argparser = argparse.ArgumentParser(
240
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
241
        description='Estimate nucleosome positions from scCHiC seq bam file(s)')
242
243
    argparser.add_argument('alignmentfiles', type=str, nargs='+')
244
    argparser.add_argument('-o', type=str, required=True, help='output prefix')
245
246
    argparser.add_argument('-bin_size', type=int,  default=3, help='Nucleosome position precision (bp), increasing this value increases memory consumption linearly')
247
    argparser.add_argument('-n_threads', type=int, help='Amount of worker threads')
248
249
    args = argparser.parse_args()
250
251
    async def main(args):
252
        # Run the analysis:
253
        print('Creating molecule coordinate database')
254
        d = generate_molecule_coordinate_dict(args.alignmentfiles, args.n_threads)
255
        print('Estimating nucleosome positions')
256
        await coordinate_dicts_to_nucleosome_position_files(args.alignmentfiles,
257
                    d,
258
                    p_nuc_bin_size = args.bin_size,
259
                    n_threads=args.n_threads,
260
                    alias=args.o)
261
262
263
    loop = asyncio.get_event_loop()
264
    loop.run_until_complete(main(args))
265
266
    #asyncio.run(main(args)) # python >= 3.7