|
a |
|
b/singlecellmultiomics/bamProcessing/bamToMethylationCalls.py |
|
|
1 |
#!/usr/bin/env python |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
|
|
|
4 |
import matplotlib |
|
|
5 |
matplotlib.rcParams['figure.dpi'] = 160 |
|
|
6 |
matplotlib.use('Agg') |
|
|
7 |
import matplotlib.pyplot as plt |
|
|
8 |
import pandas as pd |
|
|
9 |
import seaborn as sns |
|
|
10 |
import multiprocessing |
|
|
11 |
from singlecellmultiomics.bamProcessing.bamBinCounts import generate_commands, count_methylation_binned |
|
|
12 |
import argparse |
|
|
13 |
from colorama import Fore, Style |
|
|
14 |
from singlecellmultiomics.utils.export import dataframe_to_wig |
|
|
15 |
from singlecellmultiomics.methylation import MethylationCountMatrix |
|
|
16 |
from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_from_pysam_alignmentFile |
|
|
17 |
from colorama import Fore,Style |
|
|
18 |
|
|
|
19 |
|
|
|
20 |
def prefilter(counts, cell_names, min_measurements, min_variance): |
|
|
21 |
if min_measurements>0: |
|
|
22 |
counts = counts.loc[:, (counts >= 0).sum() > min_measurements] |
|
|
23 |
if min_variance>0: |
|
|
24 |
return counts.loc[:, counts.var() >= min_variance].reindex(cell_names) |
|
|
25 |
else: |
|
|
26 |
return counts.reindex(cell_names) |
|
|
27 |
|
|
|
28 |
|
|
|
29 |
def panda_and_prefilter(args): |
|
|
30 |
d, args = args # counts_dict ,(cell_names, min_measurements, min_variance) |
|
|
31 |
return prefilter(pd.DataFrame(d), *args) |
|
|
32 |
|
|
|
33 |
|
|
|
34 |
def get_methylation_count_matrix(bam_path, |
|
|
35 |
bin_size: int, |
|
|
36 |
bp_per_job: int, |
|
|
37 |
min_samples: int = None, |
|
|
38 |
min_variance: int = None, |
|
|
39 |
min_mapping_qual: int = None, |
|
|
40 |
skip_contigs: set = None, |
|
|
41 |
known_variants: str = None, |
|
|
42 |
maxtime: int = None, |
|
|
43 |
head: int=None, |
|
|
44 |
threads: int = None, |
|
|
45 |
count_reads: bool = True, |
|
|
46 |
**kwargs |
|
|
47 |
): |
|
|
48 |
|
|
|
49 |
|
|
|
50 |
all_kwargs = {'known': known_variants, |
|
|
51 |
'maxtime': maxtime, |
|
|
52 |
'single_location':bin_size==1, |
|
|
53 |
'min_samples':min_samples, |
|
|
54 |
'min_variance':min_variance, |
|
|
55 |
'threads':threads, |
|
|
56 |
'count_reads':count_reads |
|
|
57 |
} |
|
|
58 |
|
|
|
59 |
all_kwargs.update(kwargs) |
|
|
60 |
commands = generate_commands( |
|
|
61 |
alignments_path=bam_path, |
|
|
62 |
bin_size=bin_size if bin_size!=1 else bp_per_job, |
|
|
63 |
key_tags=None, |
|
|
64 |
max_fragment_size=0, |
|
|
65 |
dedup=True, |
|
|
66 |
head=head, |
|
|
67 |
bins_per_job= int(bp_per_job / bin_size) if bin_size!=1 else 1, min_mq=min_mapping_qual, |
|
|
68 |
kwargs=all_kwargs, |
|
|
69 |
skip_contigs=skip_contigs |
|
|
70 |
) |
|
|
71 |
|
|
|
72 |
|
|
|
73 |
count_mat = MethylationCountMatrix() |
|
|
74 |
read_count_mat = dict() |
|
|
75 |
|
|
|
76 |
if threads==1: |
|
|
77 |
for command in commands: |
|
|
78 |
result, result_read_counts = count_methylation_binned(command) |
|
|
79 |
#result.prune(min_samples=min_samples, min_variance=min_variance) |
|
|
80 |
count_mat.update( result ) |
|
|
81 |
if count_reads: |
|
|
82 |
read_count_mat.update(result_read_counts) |
|
|
83 |
|
|
|
84 |
else: |
|
|
85 |
with multiprocessing.Pool(threads) as workers: |
|
|
86 |
|
|
|
87 |
for result,result_read_counts in workers.imap_unordered(count_methylation_binned, commands): |
|
|
88 |
#result.prune(min_samples=min_samples, min_variance=min_variance) |
|
|
89 |
count_mat.update(result) |
|
|
90 |
if count_reads: |
|
|
91 |
read_count_mat.update(result_read_counts) |
|
|
92 |
|
|
|
93 |
return count_mat, read_count_mat |
|
|
94 |
|
|
|
95 |
|
|
|
96 |
|
|
|
97 |
if __name__ == '__main__': |
|
|
98 |
argparser = argparse.ArgumentParser( |
|
|
99 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
|
|
100 |
description="""Extract methylation calls from bam file |
|
|
101 |
""") |
|
|
102 |
argparser.add_argument('bamfile', metavar='bamfile', type=str) |
|
|
103 |
argparser.add_argument('-bin_size', default=500, type=int, help='bin size, set to 1 for single CpG') |
|
|
104 |
argparser.add_argument('-bp_per_job', default=1_000_000, type=int, help='Amount of basepairs to be processed per thread per chunk') |
|
|
105 |
argparser.add_argument('-threads', default=None, type=int, help='Amount of threads to use for counting, None to use the amount of available threads') |
|
|
106 |
argparser.add_argument('-threads_agg', default=1, type=int, help='Amount of threads to use for aggregation. Aggregation is very memory intensive, so this amount of threads should probably be lower than -threads') |
|
|
107 |
argparser.add_argument('-reference', default=None, type=str, help='Path to reference fasta file') |
|
|
108 |
|
|
|
109 |
argparser.add_argument('--mirror_cpg_dyad',action='store_true', help='Count CpG methylation of a single dyad as one position') |
|
|
110 |
argparser.add_argument('--stranded',action='store_true', help='Perform strand specific methylation calls') |
|
|
111 |
|
|
|
112 |
|
|
|
113 |
fi = argparser.add_argument_group("Filters") |
|
|
114 |
fi.add_argument('-min_variance', default=None, type=float) |
|
|
115 |
fi.add_argument('-min_mapping_qual', default=40, type=int) |
|
|
116 |
fi.add_argument('-head', default=None, type=int, help='Process the first n bins') |
|
|
117 |
fi.add_argument('-min_samples', default=1, type=int) |
|
|
118 |
fi.add_argument('-skip_contigs', type=str, help='Comma separated contigs to skip', default='MT,chrM') |
|
|
119 |
fi.add_argument('-known_variants', |
|
|
120 |
help='VCF file with known variants, will be not taken into account as methylated/unmethylated', |
|
|
121 |
type=str) |
|
|
122 |
|
|
|
123 |
fi.add_argument('-contexts_to_capture', default=None, type=str, help='Contexts to capture, comma separated')\ |
|
|
124 |
|
|
|
125 |
og = argparser.add_argument_group("Output") |
|
|
126 |
#og.add_argument('-bed', type=str, help='Bed file to write methylation calls to') |
|
|
127 |
og.add_argument('-default_sample_name', type=str, help='Default sample name to use when SM tag is not available in the read') |
|
|
128 |
og.add_argument('-wig_beta', type=str, help='WIG file to write mean methylation per bin to') |
|
|
129 |
og.add_argument('-wig_n_samples', type=str, help='WIG file to write amount of samples covering to') |
|
|
130 |
og.add_argument('-capture_read_depth', default=None, type=str, help='Capture read depth into this csv file') |
|
|
131 |
|
|
|
132 |
og.add_argument('-betas', type=str, help='CSV or pickle file to write single cell methylation betas to') |
|
|
133 |
#og.add_argument('-mets', type=str, help='CSV or pickle file to write single cell methylation unmethylated frame to') |
|
|
134 |
#og.add_argument('-unmets', type=str, help='CSV or pickle file to write single cell methylation frame to') |
|
|
135 |
|
|
|
136 |
og.add_argument('-bismark_tabfile', type=str, help='Tabulated file to write to, contains: chr | start | end | unmethylated_counts | methylated_counts | beta_value') |
|
|
137 |
og.add_argument('-tabfile', type=str, |
|
|
138 |
help='Tabulated file to write to, contains: chr | start | end | unmethylated_counts | methylated_counts | beta_value | variance | n_samples') |
|
|
139 |
og.add_argument('-distmat', type=str, help='CSV or pickle file to write single cell distance matrix to') |
|
|
140 |
og.add_argument('-distmat_plot', type=str, help='.PNG file to write distance matrix image to') |
|
|
141 |
|
|
|
142 |
args = argparser.parse_args() |
|
|
143 |
|
|
|
144 |
if args.contexts_to_capture is None: |
|
|
145 |
contexts_to_capture = None |
|
|
146 |
context_radius = None |
|
|
147 |
else: |
|
|
148 |
assert args.mirror_cpg_dyad is False, '--mirror_cpg_dyad can only be used when no contexts_to_capture are supplied. (CpG mode) ' |
|
|
149 |
contexts_to_capture = args.contexts_to_capture.split(',') |
|
|
150 |
lengths = set( (len(context) for context in contexts_to_capture ) ) |
|
|
151 |
assert len(lengths)==1, 'Please only supply contexts of the same length' |
|
|
152 |
context_radius = int((list(lengths)[0] - 1) / 2) |
|
|
153 |
for context in contexts_to_capture: |
|
|
154 |
assert context[context_radius]=='C', 'Supply only contexts with a C in the middle' |
|
|
155 |
|
|
|
156 |
assert args.reference is not None, 'Please supply supply a fasta file using -reference' |
|
|
157 |
|
|
|
158 |
|
|
|
159 |
if args.stranded: |
|
|
160 |
assert args.mirror_cpg_dyad is False, '--mirror_cpg_dyad can only be used in non-strand specific mode ' |
|
|
161 |
assert args.wig_beta is None, '-wig_beta cannot be used in stranded mode as WIG files are not strand specific' |
|
|
162 |
assert args.wig_n_samples is None, '-wig_n_samples cannot be used in stranded mode as WIG files are not strand specific' |
|
|
163 |
|
|
|
164 |
if args.distmat_plot is not None and not args.distmat_plot.endswith('.png'): |
|
|
165 |
args.distmat_plot += '.png' |
|
|
166 |
|
|
|
167 |
print('Obtaining counts ', end="") |
|
|
168 |
counts, read_dictionary = get_methylation_count_matrix(bam_path=args.bamfile, |
|
|
169 |
bin_size=args.bin_size, |
|
|
170 |
bp_per_job=args.bp_per_job, |
|
|
171 |
min_samples=args.min_samples, |
|
|
172 |
min_variance=args.min_variance, |
|
|
173 |
known_variants=args.known_variants, |
|
|
174 |
skip_contigs=args.skip_contigs.split(','), |
|
|
175 |
min_mapping_qual=args.min_mapping_qual, |
|
|
176 |
head=args.head, |
|
|
177 |
threads=args.threads, |
|
|
178 |
single_location=(args.bin_size==1), |
|
|
179 |
contexts_to_capture=contexts_to_capture, |
|
|
180 |
context_radius=context_radius, |
|
|
181 |
reference_path=args.reference, |
|
|
182 |
dyad_mode=args.mirror_cpg_dyad, |
|
|
183 |
stranded=args.stranded, |
|
|
184 |
count_reads=args.capture_read_depth is not None |
|
|
185 |
|
|
|
186 |
) |
|
|
187 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |
|
|
188 |
|
|
|
189 |
if args.capture_read_depth: |
|
|
190 |
print('Writing read depth file ', end="") |
|
|
191 |
pd.DataFrame(read_dictionary).sort_index().sort_index(1).to_csv(args.capture_read_depth) |
|
|
192 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |
|
|
193 |
|
|
|
194 |
|
|
|
195 |
print(counts) |
|
|
196 |
|
|
|
197 |
if args.betas is not None: |
|
|
198 |
print('Writing counts ', end="") |
|
|
199 |
if args.betas.endswith('.pickle.gz'): |
|
|
200 |
counts.to_pickle(args.betas) |
|
|
201 |
else: |
|
|
202 |
counts.get_frame('beta').to_csv(args.betas) |
|
|
203 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |
|
|
204 |
|
|
|
205 |
if args.wig_beta is not None: |
|
|
206 |
# Calculate mean beta value and write to wig: |
|
|
207 |
print('Writing WIG beta', end="") |
|
|
208 |
dataframe_to_wig(counts.get_bulk_frame()[['beta']], args.wig_beta, span=args.bin_size) |
|
|
209 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |
|
|
210 |
|
|
|
211 |
if args.wig_n_samples is not None: |
|
|
212 |
# Calculate mean beta value and write to wig: |
|
|
213 |
print('Writing WIG n_samples', end="") |
|
|
214 |
dataframe_to_wig(counts.get_bulk_frame()[['n_samples']], args.wig_n_samples, span=args.bin_size) |
|
|
215 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |
|
|
216 |
|
|
|
217 |
if args.distmat is not None or args.distmat_plot is not None: |
|
|
218 |
print('Calculating distance matrix', end="") |
|
|
219 |
dmat = counts.get_sample_distance_matrix() |
|
|
220 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |
|
|
221 |
if args.distmat_plot is not None: |
|
|
222 |
print('Writing distmat_image', end="") |
|
|
223 |
try: |
|
|
224 |
sns.clustermap(dmat) |
|
|
225 |
plt.tight_layout() |
|
|
226 |
plt.savefig(args.distmat_plot) |
|
|
227 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |
|
|
228 |
except Exception as e: |
|
|
229 |
print(e) |
|
|
230 |
print(f" [ {Fore.RED}FAIL{Style.RESET_ALL} ] ") |
|
|
231 |
|
|
|
232 |
if args.distmat is not None: |
|
|
233 |
print('Writing distance matrix', end="") |
|
|
234 |
if args.distmat.endswith('.pickle.gz'): |
|
|
235 |
dmat.to_pickle(args.distmat) |
|
|
236 |
else: |
|
|
237 |
dmat.to_csv(args.distmat, sep='\t') |
|
|
238 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |
|
|
239 |
|
|
|
240 |
if args.bismark_tabfile is not None: |
|
|
241 |
print('Writing bismark_tabfile matrix', end="") |
|
|
242 |
bf = counts.get_bulk_frame() |
|
|
243 |
|
|
|
244 |
bf.index.set_names(['chr','start', 'end'] + (['strand'] if args.stranded else []), inplace=True) |
|
|
245 |
bf[['unmethylated', 'methylated', 'beta']].to_csv(args.bismark_tabfile, sep='\t') |
|
|
246 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |
|
|
247 |
|
|
|
248 |
if args.tabfile is not None: |
|
|
249 |
print('Writing tabfile', end="") |
|
|
250 |
if args.tabfile.endswith('.pickle.gz'): |
|
|
251 |
counts.get_bulk_frame().to_pickle(args.tabfile) |
|
|
252 |
else: |
|
|
253 |
counts.get_bulk_frame().to_csv(args.tabfile, sep='\t') |
|
|
254 |
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ") |