Switch to unified view

a b/singlecellmultiomics/bamProcessing/bamToMethylationCalls.py
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
4
import matplotlib
5
matplotlib.rcParams['figure.dpi'] = 160
6
matplotlib.use('Agg')
7
import matplotlib.pyplot as plt
8
import pandas as pd
9
import seaborn as sns
10
import multiprocessing
11
from singlecellmultiomics.bamProcessing.bamBinCounts import generate_commands, count_methylation_binned
12
import argparse
13
from colorama import Fore, Style
14
from singlecellmultiomics.utils.export import dataframe_to_wig
15
from singlecellmultiomics.methylation import MethylationCountMatrix
16
from singlecellmultiomics.bamProcessing.bamFunctions import get_reference_from_pysam_alignmentFile
17
from colorama import Fore,Style
18
19
20
def prefilter(counts, cell_names, min_measurements, min_variance):
21
    if min_measurements>0:
22
        counts = counts.loc[:, (counts >= 0).sum() > min_measurements]
23
    if min_variance>0:
24
        return counts.loc[:, counts.var() >= min_variance].reindex(cell_names)
25
    else:
26
        return counts.reindex(cell_names)
27
28
29
def panda_and_prefilter(args):
30
    d, args = args # counts_dict ,(cell_names, min_measurements, min_variance)
31
    return prefilter(pd.DataFrame(d), *args)
32
33
34
def get_methylation_count_matrix(bam_path,
35
                                 bin_size: int,
36
                                 bp_per_job: int,
37
                                 min_samples: int = None,
38
                                 min_variance: int = None,
39
                                 min_mapping_qual: int = None,
40
                                 skip_contigs: set = None,
41
                                 known_variants: str = None,
42
                                 maxtime: int = None,
43
                                 head: int=None,
44
                                 threads: int = None,
45
                                 count_reads: bool = True,
46
                                 **kwargs
47
                                 ):
48
49
50
    all_kwargs = {'known': known_variants,
51
            'maxtime': maxtime,
52
            'single_location':bin_size==1,
53
            'min_samples':min_samples,
54
            'min_variance':min_variance,
55
            'threads':threads,
56
            'count_reads':count_reads
57
            }
58
59
    all_kwargs.update(kwargs)
60
    commands = generate_commands(
61
        alignments_path=bam_path,
62
        bin_size=bin_size if bin_size!=1 else bp_per_job,
63
        key_tags=None,
64
        max_fragment_size=0,
65
        dedup=True,
66
        head=head,
67
        bins_per_job= int(bp_per_job / bin_size) if bin_size!=1 else 1, min_mq=min_mapping_qual,
68
        kwargs=all_kwargs,
69
        skip_contigs=skip_contigs
70
    )
71
72
73
    count_mat = MethylationCountMatrix()
74
    read_count_mat = dict()
75
76
    if threads==1:
77
        for command in commands:
78
            result, result_read_counts = count_methylation_binned(command)
79
            #result.prune(min_samples=min_samples, min_variance=min_variance)
80
            count_mat.update( result )
81
            if count_reads:
82
                read_count_mat.update(result_read_counts)
83
84
    else:
85
        with multiprocessing.Pool(threads) as workers:
86
87
            for result,result_read_counts in workers.imap_unordered(count_methylation_binned, commands):
88
                #result.prune(min_samples=min_samples, min_variance=min_variance)
89
                count_mat.update(result)
90
                if count_reads:
91
                    read_count_mat.update(result_read_counts)
92
93
    return count_mat, read_count_mat
94
95
96
97
if __name__ == '__main__':
98
    argparser = argparse.ArgumentParser(
99
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
100
        description="""Extract methylation calls from bam file
101
    """)
102
    argparser.add_argument('bamfile', metavar='bamfile', type=str)
103
    argparser.add_argument('-bin_size', default=500, type=int, help='bin size, set to 1 for single CpG')
104
    argparser.add_argument('-bp_per_job', default=1_000_000, type=int, help='Amount of basepairs to be processed per thread per chunk')
105
    argparser.add_argument('-threads', default=None, type=int, help='Amount of threads to use for counting, None to use the amount of available threads')
106
    argparser.add_argument('-threads_agg', default=1, type=int, help='Amount of threads to use for aggregation. Aggregation is very memory intensive, so this amount of threads should probably be lower than -threads')
107
    argparser.add_argument('-reference', default=None, type=str, help='Path to reference fasta file')
108
109
    argparser.add_argument('--mirror_cpg_dyad',action='store_true', help='Count CpG methylation of a single dyad as one position')
110
    argparser.add_argument('--stranded',action='store_true', help='Perform strand specific methylation calls')
111
112
113
    fi = argparser.add_argument_group("Filters")
114
    fi.add_argument('-min_variance', default=None, type=float)
115
    fi.add_argument('-min_mapping_qual', default=40, type=int)
116
    fi.add_argument('-head', default=None, type=int, help='Process the first n bins')
117
    fi.add_argument('-min_samples', default=1, type=int)
118
    fi.add_argument('-skip_contigs', type=str, help='Comma separated contigs to skip', default='MT,chrM')
119
    fi.add_argument('-known_variants',
120
                           help='VCF file with known variants, will be not taken into account as methylated/unmethylated',
121
                           type=str)
122
123
    fi.add_argument('-contexts_to_capture', default=None, type=str, help='Contexts to capture, comma separated')\
124
125
    og = argparser.add_argument_group("Output")
126
    #og.add_argument('-bed', type=str, help='Bed file to write methylation calls to')
127
    og.add_argument('-default_sample_name', type=str, help='Default sample name to use when SM tag is not available in the read')
128
    og.add_argument('-wig_beta', type=str, help='WIG file to write mean methylation per bin to')
129
    og.add_argument('-wig_n_samples', type=str, help='WIG file to write amount of samples covering to')
130
    og.add_argument('-capture_read_depth', default=None, type=str, help='Capture read depth into this csv file')
131
132
    og.add_argument('-betas', type=str, help='CSV or pickle file to write single cell methylation betas to')
133
    #og.add_argument('-mets', type=str, help='CSV or pickle file to write single cell methylation unmethylated frame to')
134
    #og.add_argument('-unmets', type=str, help='CSV or pickle file to write single cell methylation frame to')
135
136
    og.add_argument('-bismark_tabfile', type=str, help='Tabulated file to write to, contains: chr | start | end | unmethylated_counts | methylated_counts | beta_value')
137
    og.add_argument('-tabfile', type=str,
138
                    help='Tabulated file to write to, contains: chr | start | end | unmethylated_counts | methylated_counts | beta_value | variance | n_samples')
139
    og.add_argument('-distmat', type=str, help='CSV or pickle file to write single cell distance matrix to')
140
    og.add_argument('-distmat_plot', type=str, help='.PNG file to write distance matrix image to')
141
142
    args = argparser.parse_args()
143
144
    if args.contexts_to_capture is None:
145
        contexts_to_capture = None
146
        context_radius = None
147
    else:
148
        assert args.mirror_cpg_dyad is False, '--mirror_cpg_dyad can only be used when no contexts_to_capture are supplied. (CpG mode) '
149
        contexts_to_capture = args.contexts_to_capture.split(',')
150
        lengths = set( (len(context) for context in contexts_to_capture ) )
151
        assert len(lengths)==1, 'Please only supply contexts of the same length'
152
        context_radius = int((list(lengths)[0] - 1) / 2)
153
        for context in contexts_to_capture:
154
            assert context[context_radius]=='C', 'Supply only contexts with a C in the middle'
155
156
        assert args.reference is not None, 'Please supply supply a fasta file using -reference'
157
158
159
    if args.stranded:
160
        assert args.mirror_cpg_dyad is False, '--mirror_cpg_dyad can only be used in non-strand specific mode '
161
        assert args.wig_beta is None, '-wig_beta cannot be used in stranded mode as WIG files are not strand specific'
162
        assert args.wig_n_samples is None, '-wig_n_samples cannot be used in stranded mode as WIG files are not strand specific'
163
164
    if args.distmat_plot is not None and not args.distmat_plot.endswith('.png'):
165
        args.distmat_plot += '.png'
166
167
    print('Obtaining counts ', end="")
168
    counts, read_dictionary = get_methylation_count_matrix(bam_path=args.bamfile,
169
                                          bin_size=args.bin_size,
170
                                          bp_per_job=args.bp_per_job,
171
                                          min_samples=args.min_samples,
172
                                          min_variance=args.min_variance,
173
                                          known_variants=args.known_variants,
174
                                          skip_contigs=args.skip_contigs.split(','),
175
                                          min_mapping_qual=args.min_mapping_qual,
176
                                          head=args.head,
177
                                          threads=args.threads,
178
                                          single_location=(args.bin_size==1),
179
                                          contexts_to_capture=contexts_to_capture,
180
                                          context_radius=context_radius,
181
                                          reference_path=args.reference,
182
                                          dyad_mode=args.mirror_cpg_dyad,
183
                                          stranded=args.stranded,
184
                                          count_reads=args.capture_read_depth is not None
185
186
    )
187
    print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
188
189
    if args.capture_read_depth:
190
        print('Writing read depth file ', end="")
191
        pd.DataFrame(read_dictionary).sort_index().sort_index(1).to_csv(args.capture_read_depth)
192
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
193
194
195
    print(counts)
196
197
    if args.betas is not None:
198
        print('Writing counts ', end="")
199
        if args.betas.endswith('.pickle.gz'):
200
            counts.to_pickle(args.betas)
201
        else:
202
            counts.get_frame('beta').to_csv(args.betas)
203
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
204
205
    if args.wig_beta is not None:
206
        # Calculate mean beta value and write to wig:
207
        print('Writing WIG beta', end="")
208
        dataframe_to_wig(counts.get_bulk_frame()[['beta']], args.wig_beta, span=args.bin_size)
209
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
210
211
    if args.wig_n_samples is not None:
212
        # Calculate mean beta value and write to wig:
213
        print('Writing WIG n_samples', end="")
214
        dataframe_to_wig(counts.get_bulk_frame()[['n_samples']], args.wig_n_samples, span=args.bin_size)
215
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
216
217
    if args.distmat is not None or args.distmat_plot is not None:
218
        print('Calculating distance matrix', end="")
219
        dmat = counts.get_sample_distance_matrix()
220
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
221
        if args.distmat_plot is not None:
222
            print('Writing distmat_image', end="")
223
            try:
224
                sns.clustermap(dmat)
225
                plt.tight_layout()
226
                plt.savefig(args.distmat_plot)
227
                print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
228
            except Exception as e:
229
                print(e)
230
                print(f" [ {Fore.RED}FAIL{Style.RESET_ALL} ] ")
231
232
        if args.distmat is not None:
233
            print('Writing distance matrix', end="")
234
            if args.distmat.endswith('.pickle.gz'):
235
                dmat.to_pickle(args.distmat)
236
            else:
237
                dmat.to_csv(args.distmat, sep='\t')
238
            print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
239
240
    if args.bismark_tabfile is not None:
241
        print('Writing bismark_tabfile matrix', end="")
242
        bf = counts.get_bulk_frame()
243
244
        bf.index.set_names(['chr','start', 'end'] + (['strand'] if args.stranded else []), inplace=True)
245
        bf[['unmethylated', 'methylated', 'beta']].to_csv(args.bismark_tabfile, sep='\t')
246
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")
247
248
    if args.tabfile is not None:
249
        print('Writing tabfile', end="")
250
        if args.tabfile.endswith('.pickle.gz'):
251
            counts.get_bulk_frame().to_pickle(args.tabfile)
252
        else:
253
            counts.get_bulk_frame().to_csv(args.tabfile, sep='\t')
254
        print(f" [ {Fore.GREEN}OK{Style.RESET_ALL} ] ")