[2c420a]: / singlecellmultiomics / bamProcessing / bamToMethylationCalls.py

Download this file

255 lines (209 with data), 12.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import multiprocessing
from singlecellmultiomics.bamProcessing.bamBinCounts import generate_commands, count_methylation_binned
import argparse
from colorama import Fore, Style
from singlecellmultiomics.utils.export import dataframe_to_wig
from singlecellmultiomics.methylation import MethylationCountMatrix
from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_from_pysam_alignmentFile
from colorama import Fore,Style
def prefilter(counts, cell_names, min_measurements, min_variance):
if min_measurements>0:
counts = counts.loc[:, (counts >= 0).sum() > min_measurements]
if min_variance>0:
return counts.loc[:, counts.var() >= min_variance].reindex(cell_names)
else:
return counts.reindex(cell_names)
def panda_and_prefilter(args):
d, args = args # counts_dict ,(cell_names, min_measurements, min_variance)
return prefilter(pd.DataFrame(d), *args)
def get_methylation_count_matrix(bam_path,
bin_size: int,
bp_per_job: int,
min_samples: int = None,
min_variance: int = None,
min_mapping_qual: int = None,
skip_contigs: set = None,
known_variants: str = None,
maxtime: int = None,
head: int=None,
threads: int = None,
count_reads: bool = True,
**kwargs
):
all_kwargs = {'known': known_variants,
'maxtime': maxtime,
'single_location':bin_size==1,
'min_samples':min_samples,
'min_variance':min_variance,
'threads':threads,
'count_reads':count_reads
}
all_kwargs.update(kwargs)
commands = generate_commands(
alignments_path=bam_path,
bin_size=bin_size if bin_size!=1 else bp_per_job,
key_tags=None,
max_fragment_size=0,
dedup=True,
head=head,
bins_per_job= int(bp_per_job / bin_size) if bin_size!=1 else 1, min_mq=min_mapping_qual,
kwargs=all_kwargs,
skip_contigs=skip_contigs
)
count_mat = MethylationCountMatrix()
read_count_mat = dict()
if threads==1:
for command in commands:
result, result_read_counts = count_methylation_binned(command)
#result.prune(min_samples=min_samples, min_variance=min_variance)
count_mat.update( result )
if count_reads:
read_count_mat.update(result_read_counts)
else:
with multiprocessing.Pool(threads) as workers:
for result,result_read_counts in workers.imap_unordered(count_methylation_binned, commands):
#result.prune(min_samples=min_samples, min_variance=min_variance)
count_mat.update(result)
if count_reads:
read_count_mat.update(result_read_counts)
return count_mat, read_count_mat
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="""Extract methylation calls from bam file
""")
argparser.add_argument('bamfile', metavar='bamfile', type=str)
argparser.add_argument('-bin_size', default=500, type=int, help='bin size, set to 1 for single CpG')
argparser.add_argument('-bp_per_job', default=1_000_000, type=int, help='Amount of basepairs to be processed per thread per chunk')
argparser.add_argument('-threads', default=None, type=int, help='Amount of threads to use for counting, None to use the amount of available threads')
argparser.add_argument('-threads_agg', default=1, type=int, help='Amount of threads to use for aggregation. Aggregation is very memory intensive, so this amount of threads should probably be lower than -threads')
argparser.add_argument('-reference', default=None, type=str, help='Path to reference fasta file')
argparser.add_argument('--mirror_cpg_dyad',action='store_true', help='Count CpG methylation of a single dyad as one position')
argparser.add_argument('--stranded',action='store_true', help='Perform strand specific methylation calls')
fi = argparser.add_argument_group("Filters")
fi.add_argument('-min_variance', default=None, type=float)
fi.add_argument('-min_mapping_qual', default=40, type=int)
fi.add_argument('-head', default=None, type=int, help='Process the first n bins')
fi.add_argument('-min_samples', default=1, type=int)
fi.add_argument('-skip_contigs', type=str, help='Comma separated contigs to skip', default='MT,chrM')
fi.add_argument('-known_variants',
help='VCF file with known variants, will be not taken into account as methylated/unmethylated',
type=str)
fi.add_argument('-contexts_to_capture', default=None, type=str, help='Contexts to capture, comma separated')\
og = argparser.add_argument_group("Output")
#og.add_argument('-bed', type=str, help='Bed file to write methylation calls to')
og.add_argument('-default_sample_name', type=str, help='Default sample name to use when SM tag is not available in the read')
og.add_argument('-wig_beta', type=str, help='WIG file to write mean methylation per bin to')
og.add_argument('-wig_n_samples', type=str, help='WIG file to write amount of samples covering to')
og.add_argument('-capture_read_depth', default=None, type=str, help='Capture read depth into this csv file')
og.add_argument('-betas', type=str, help='CSV or pickle file to write single cell methylation betas to')
#og.add_argument('-mets', type=str, help='CSV or pickle file to write single cell methylation unmethylated frame to')
#og.add_argument('-unmets', type=str, help='CSV or pickle file to write single cell methylation frame to')
og.add_argument('-bismark_tabfile', type=str, help='Tabulated file to write to, contains: chr | start | end | unmethylated_counts | methylated_counts | beta_value')
og.add_argument('-tabfile', type=str,
help='Tabulated file to write to, contains: chr | start | end | unmethylated_counts | methylated_counts | beta_value | variance | n_samples')
og.add_argument('-distmat', type=str, help='CSV or pickle file to write single cell distance matrix to')
og.add_argument('-distmat_plot', type=str, help='.PNG file to write distance matrix image to')
args = argparser.parse_args()
if args.contexts_to_capture is None:
contexts_to_capture = None
context_radius = None
else:
assert args.mirror_cpg_dyad is False, '--mirror_cpg_dyad can only be used when no contexts_to_capture are supplied. (CpG mode) '
contexts_to_capture = args.contexts_to_capture.split(',')
lengths = set( (len(context) for context in contexts_to_capture ) )
assert len(lengths)==1, 'Please only supply contexts of the same length'
context_radius = int((list(lengths)[0] - 1) / 2)
for context in contexts_to_capture:
assert context[context_radius]=='C', 'Supply only contexts with a C in the middle'
assert args.reference is not None, 'Please supply supply a fasta file using -reference'
if args.stranded:
assert args.mirror_cpg_dyad is False, '--mirror_cpg_dyad can only be used in non-strand specific mode '
assert args.wig_beta is None, '-wig_beta cannot be used in stranded mode as WIG files are not strand specific'
assert args.wig_n_samples is None, '-wig_n_samples cannot be used in stranded mode as WIG files are not strand specific'
if args.distmat_plot is not None and not args.distmat_plot.endswith('.png'):
args.distmat_plot += '.png'
print('Obtaining counts ', end="")
counts, read_dictionary = get_methylation_count_matrix(bam_path=args.bamfile,
bin_size=args.bin_size,
bp_per_job=args.bp_per_job,
min_samples=args.min_samples,
min_variance=args.min_variance,
known_variants=args.known_variants,
skip_contigs=args.skip_contigs.split(','),
min_mapping_qual=args.min_mapping_qual,
head=args.head,
threads=args.threads,
single_location=(args.bin_size==1),
contexts_to_capture=contexts_to_capture,
context_radius=context_radius,
reference_path=args.reference,
dyad_mode=args.mirror_cpg_dyad,
stranded=args.stranded,
count_reads=args.capture_read_depth is not None
)
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
if args.capture_read_depth:
print('Writing read depth file ', end="")
pd.DataFrame(read_dictionary).sort_index().sort_index(1).to_csv(args.capture_read_depth)
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
print(counts)
if args.betas is not None:
print('Writing counts ', end="")
if args.betas.endswith('.pickle.gz'):
counts.to_pickle(args.betas)
else:
counts.get_frame('beta').to_csv(args.betas)
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
if args.wig_beta is not None:
# Calculate mean beta value and write to wig:
print('Writing WIG beta', end="")
dataframe_to_wig(counts.get_bulk_frame()[['beta']], args.wig_beta, span=args.bin_size)
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
if args.wig_n_samples is not None:
# Calculate mean beta value and write to wig:
print('Writing WIG n_samples', end="")
dataframe_to_wig(counts.get_bulk_frame()[['n_samples']], args.wig_n_samples, span=args.bin_size)
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
if args.distmat is not None or args.distmat_plot is not None:
print('Calculating distance matrix', end="")
dmat = counts.get_sample_distance_matrix()
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
if args.distmat_plot is not None:
print('Writing distmat_image', end="")
try:
sns.clustermap(dmat)
plt.tight_layout()
plt.savefig(args.distmat_plot)
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
except Exception as e:
print(e)
print(f" [ {Fore.RED}FAIL{Style.RESET_ALL} ] ")
if args.distmat is not None:
print('Writing distance matrix', end="")
if args.distmat.endswith('.pickle.gz'):
dmat.to_pickle(args.distmat)
else:
dmat.to_csv(args.distmat, sep='\t')
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
if args.bismark_tabfile is not None:
print('Writing bismark_tabfile matrix', end="")
bf = counts.get_bulk_frame()
bf.index.set_names(['chr','start', 'end'] + (['strand'] if args.stranded else []), inplace=True)
bf[['unmethylated', 'methylated', 'beta']].to_csv(args.bismark_tabfile, sep='\t')
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
if args.tabfile is not None:
print('Writing tabfile', end="")
if args.tabfile.endswith('.pickle.gz'):
counts.get_bulk_frame().to_pickle(args.tabfile)
else:
counts.get_bulk_frame().to_csv(args.tabfile, sep='\t')
print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")