Switch to unified view

a b/singlecellmultiomics/methylation/methylation.py
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
import pysam
4
import pandas as pd
5
import numpy as np
6
from multiprocessing import Pool, Manager
7
from collections import defaultdict
8
from singlecellmultiomics.bamProcessing import get_reference_path_from_bam
9
from singlecellmultiomics.molecule import MoleculeIterator,TAPS
10
import gzip
11
from singlecellmultiomics.utils import invert_strand_f, is_autosome
12
import os
13
import matplotlib.pyplot as plt
14
import pyBigWig
15
16
def get_methylation_calls_from_tabfile(path: str):
17
    """
18
    Reading routine, for reading the default taps-tabulator output files
19
20
    Args:
21
        path (str), path to the taps tabulator file to read
22
23
    Yields:
24
        contig, cpg_location, strand, methylation_stat (tuple). The cpg location is zero indexed
25
    """
26
    with (gzip.open(path,'rt') if path.endswith('.gz') else open(path)) as f:
27
        for i,line in enumerate(f):
28
            parts = line.strip().split('\t',4)
29
            meta, contig, cpg_location, methylation_stat, ligation_motif_and_others = parts
30
31
            cpg_location = int(cpg_location)-1
32
33
            cell, molecule_id, cut_pos, frag_size ,umi,strand =  meta.split(':')
34
35
            yield contig, cpg_location, strand, methylation_stat
36
37
38
def get_single_cpg_calls_from_tabfile(path: str):
39
    """
40
    Obtain single CpG calls from taps-tabulator file
41
42
    Args:
43
        path (str), path to the taps tabulator file to read, needs to be sorted in order to work correctly
44
45
    Yields:
46
        (contig, cpg_location, strand), methylated, unmethylated. The cpg location is zero indexed
47
    """
48
    prev = None
49
    met,unmet = 0,0
50
    for contig, cpg_location, strand, methylation_stat in get_methylation_calls_from_tabfile(path):
51
52
        current = (contig, cpg_location,strand)
53
        if prev is not None and current!=prev:
54
            yield prev,met,unmet
55
56
            met,unmet = 0,0
57
        if methylation_stat.isupper():
58
            met+=1
59
        else:
60
            unmet+=1
61
62
        prev= current
63
64
    if met>0 or unmet>0:
65
        yield prev,met,unmet
66
67
def sort_methylation_tabfile(path, pathout,threads=4):
68
    """
69
    Sort methylation tab file. Sorts first on the chromosome, then the position, then the cell/umi
70
    """
71
    cmd = f"""/bin/bash -c "zcat {path} | sort -k2,2 -k3,3n -k1,1 --parallel={threads} | gzip -1 > {pathout}" """
72
    os.system(cmd)
73
74
def methylation_tabfile_to_bed(tabpath: str, bedpath: str, invert_strand=False):
75
    """ Convert methylation tabfile at tabpath to a methylation bedfile at bedpath """
76
    cmap = plt.get_cmap('bwr')
77
    with open(bedpath, 'w') as o:
78
        for call in get_single_cpg_calls_from_tabfile(tabpath):
79
            (contig,pos,strand),met,unmet = call
80
            beta = (met/(unmet+met))
81
            rgb = cmap(beta)
82
            o.write(f'{contig}\t{pos}\t{pos+1}\t.\t{min(1000,unmet+met)}\t{invert_strand_f(strand) if invert_strand else strand}\t{pos}\t{pos+1}\t{int(rgb[0]*255)},{int(rgb[1]*255)},{int(255*rgb[2])}\t{unmet+met}\t{int(100*beta)}\n')
83
84
85
def iter_methylation_calls_from_bigbed(path: str, MINCOV :int=0, autosomes_only: bool=False):
86
    with pyBigWig.open(path) as f:
87
88
        # Iterate over all contigs, exclude scaffolds and only include autosomes
89
        for chrom,l in f.chroms().items():
90
            if autosomes_only and not is_autosome(chrom):
91
                continue
92
93
            for entry in f.entries(chrom,0,l):
94
                name, score, strandedness, _, __, ___, coverage, obs_beta = entry[2].split()
95
                if int(coverage)>=MINCOV:
96
                    yield (chrom, entry[0],entry), (float(obs_beta),score,strandedness,int(coverage))
97
98
99
def methylation_calls_from_bigbed_to_dict(path: str, MINCOV :int=0, autosomes_only: bool=False):
100
    """Obtain all methylation calls from the specified bigbed file
101
102
    Args:
103
        path : path to the methylation bigbed file
104
105
        MINCOV: minimum amount of reads covering the position to be included
106
107
    Returns:
108
        reference_betas (dict) : {chrom : {position : value (float)}}
109
    """
110
    betas = defaultdict(dict)
111
112
    for (chrom,pos,entry),(beta,score,strandedness,coverage) in iter_methylation_calls_from_bigbed(path, MINCOV, autosomes_only):
113
        betas[chrom][pos] = beta
114
115
    return betas
116
117
def get_bulk_vector(args):
118
    obj, samples, location = args
119
    return obj.get_bulk_column(samples, location)
120
121
class MethylationCountMatrix:
122
123
    def __init__(self, counts: dict = None, threads=None):
124
        # Sample->(contig,bin_start,bin_end)-> [methylated_counts, unmethylated]
125
        self.counts = {} if counts is None else counts
126
127
        # { (contig, bin_start, bin_end), (contig, bin_start, bin_end) .. }
128
        #or
129
        # { (contig, bin_start, bin_end,strand), (contig, bin_start, bin_end, strand) .. }
130
        self.sites = set()
131
        self.threads = threads
132
133
    def __getitem__(self, key: tuple):
134
        sample, location = key
135
        if not sample in self.counts:
136
            self.counts[sample] = {}
137
        if not location in self.counts[sample]:
138
            self.sites.add(location)
139
            self.counts[sample][location] = [0, 0]
140
        return self.counts[sample][location]
141
142
    def get_without_init(self, key: tuple):
143
        # Obtain a key without setting it
144
        # sample, location = key
145
        try:
146
            return self.counts[key[0]][key[1]]
147
        except KeyError:
148
            return (0,0)
149
150
    def __setitem__(self, key: tuple, value: list):
151
        sample, location = key
152
        if not sample in self.counts:
153
            self.counts[sample] = {}
154
        self.counts[sample][location] = value
155
156
    def update(self, other):
157
        # This does not work for regions with overlap! Those will be overwritten
158
        for sample, counts in other.counts.items():
159
            if sample not in self.counts:
160
                self.counts[sample] = {}
161
            self.counts[sample].update(counts)
162
        self.sites.update(other.sites)
163
164
    def get_sample_list(self):
165
        return sorted(list(self.counts.keys()))
166
167
    def __repr__(self):
168
        return f'Methylation call matrix containing {len(self.counts)} samples and {len(self.sites)} locations'
169
170
    def prune(self, min_samples: int = 0, min_variance: float = None):
171
        if len(self.sites)==0 or len(self.counts) == 0 or min_samples == 0 and min_variance is None:
172
            return
173
174
        for location, row in self.get_bulk_frame(use_multi=False).iterrows():
175
            if row.n_samples < min_samples:
176
                self.delete_location(location)
177
            elif min_variance is not None and (np.isnan(row.variance) or row.variance < min_variance):
178
                self.delete_location(location)
179
180
    def delete_location(self, location):
181
182
        drop_samples = []
183
        for sample in self.counts:
184
            if location in self.counts[sample]:
185
                del self.counts[sample][location]
186
                if len(self.counts[sample]) == 0:
187
                    drop_samples.append(sample)
188
        self.sites.remove(location)
189
190
        # Remove samples without any data left:
191
        for d in drop_samples:
192
            del self.counts[d]
193
194
    def get_sample_distance_matrix(self):
195
        self.check_integrity()
196
        def distance(row, matrix):
197
            # Amount of differences / total comparisons
198
            return np.nansum(np.abs((matrix - row)), axis=1) / (np.isfinite(matrix - row).sum(axis=1))
199
200
        def get_dmat(df):
201
            dmat = np.apply_along_axis(distance, 1, df.values, matrix=df.values)
202
            return pd.DataFrame(dmat, columns=df.index, index=df.index)
203
204
        with np.errstate(divide='ignore', invalid='ignore'):
205
            dmat = get_dmat(self.get_frame('beta'))
206
207
            while dmat.isna().sum().sum() > 0:
208
                sample = dmat.isna().sum().idxmax()
209
                dmat.drop(sample, 0, inplace=True)
210
                dmat.drop(sample, 1, inplace=True)
211
212
        return dmat
213
214
215
216
217
    def get_frame(self, dtype: str):
218
        """
219
        Get pandas dataframe containing the selected column
220
221
        Args:
222
            dtype: either 'methylated', 'unmethylated' or 'beta'
223
224
        Returns:
225
            df(pd.DataFrame) : Dataframe containing the selected column, rows are samples, columns are locations
226
227
        """
228
        self.check_integrity()
229
        # Fix columns
230
        columns = list(sorted(self.sites))
231
        # Create column to index mapping:
232
        column_to_index = {c: i for i, c in enumerate(columns)}
233
234
        samples = self.get_sample_list()
235
236
        mat = np.zeros((len(samples), len(columns)))
237
        mat[:] = np.nan
238
239
        for i, sample in enumerate(samples):
240
            for location, (unmethylated, methylated) in self.counts[sample].items():
241
                if dtype == 'methylated':
242
                    value = methylated
243
                elif dtype == 'unmethylated':
244
                    value = unmethylated
245
                elif dtype == 'beta':
246
                    value = methylated / (methylated + unmethylated)
247
                else:
248
                    raise ValueError
249
                mat[i, [column_to_index[location]]] = value
250
251
        return pd.DataFrame(mat, index=samples, columns=pd.MultiIndex.from_tuples(columns))
252
253
    def check_integrity(self):
254
        if len(self.sites) == 0 or len(self.counts) == 0:
255
            print(self)
256
            raise ValueError('The count matrix contains no data, verify if the input data was empty or filtered to stringently')
257
258
259
    def get_bulk_column(self, samples, location):
260
261
        total_un, total_met = 0, 0
262
        betas = []
263
        n_samples = 0
264
        for sample in samples:
265
            unmethylated, methylated = self.get_without_init((sample, location))
266
            total_un += unmethylated
267
            total_met += methylated
268
269
            if methylated + unmethylated > 0:
270
                n_samples += 1
271
                betas.append(methylated / (methylated + unmethylated))
272
273
        empty = (total_met+total_un) == 0
274
        return [ total_un, total_met, np.nan if empty else total_met/(total_un+total_met), np.var(betas) if len(betas) else np.nan, n_samples]
275
276
277
    def get_bulk_frame(self, dtype='pd', use_multi=True):
278
        """
279
        Get pandas dataframe containing the selected columns
280
281
282
        Returns:
283
            df(pd.DataFrame) : Dataframe containing the selected column, rows are locations,
284
285
        """
286
        self.check_integrity()
287
        # Fix columns
288
        columns = list(sorted(self.sites))
289
        # Create column to index mapping:
290
        column_to_index = {c: i for i, c in enumerate(columns)}
291
292
        samples = self.get_sample_list()
293
294
        mat = np.zeros((len(columns), 5))
295
        mat[:] = np.nan
296
297
298
        if use_multi and (self.threads is not None and self.threads>1):
299
            with Pool(self.threads) as workers:
300
                for index,column in enumerate(
301
                                                    workers.imap( get_bulk_vector,
302
                                                    ( (self, samples, location)
303
                                                    for index, location in enumerate(columns) ), chunksize=100_000)):
304
                    mat[index, :] =  column
305
        else:
306
            for index, location in enumerate(columns):
307
                mat[index, :] = self.get_bulk_column(samples, location)
308
309
        if dtype == 'pd':
310
            return pd.DataFrame(mat, index=pd.MultiIndex.from_tuples(columns),
311
                                columns=('unmethylated', 'methylated', 'beta', 'variance', 'n_samples'))
312
        elif dtype == 'np':
313
            return mat
314
        else:
315
            raise ValueError('dtype should be pd or np')
316
317
318
def methylation_dict_to_location_values(methylation_per_location_per_cell: dict, select_samples=None)->tuple:
319
    """
320
    Convert a dictionary
321
    { location -> cell -> [0,0] }
322
    into
323
    { contig : [ locations (list) ] }
324
    { contig : [ values (list) ] }
325
    """
326
    write_locations = defaultdict(list) # contig -> locations
327
    write_values = defaultdict(dict) # contig -> location -> value
328
329
    for location, cell_info_for_location in methylation_per_location_per_cell.items():
330
        # Calculate beta value:
331
        unmet = 0
332
        met = 0
333
334
        for cell, (c_unmet, c_met) in cell_info_for_location.items():
335
            if select_samples is not None and not cell in select_samples:
336
                continue
337
            unmet+=c_unmet
338
            met+=c_met
339
340
        support = unmet+met
341
        if support == 0:
342
            continue
343
        contig = location[0]
344
        position = location[1]
345
346
        write_locations[contig].append(position)
347
        write_values[contig][position] = met / support
348
349
    return write_locations, write_values
350
351
352
353
def twolist():
354
    return [0,0]
355
356
def defdict():
357
    return defaultdict(twolist)
358
359
360
def met_unmet_dict_to_betas(methylation_per_cell_per_cpg: dict, bin_size=None) -> dict:
361
    """
362
    Convert dictionary of count form to beta form:
363
364
    cell -> location -> [unmet, met]
365
366
    to
367
368
    cell -> location -> beta
369
    """
370
    export_table = defaultdict(dict) #location->cell->beta
371
    for (contig, start), data_per_cell in methylation_per_cell_per_cpg.items():
372
         for cell,(met,unmet) in data_per_cell.items():
373
                if type(start)==int and bin_size is not None:
374
                    export_table[cell][contig, start, start+bin_size] = met/ (unmet+met)
375
                else:
376
                    export_table[cell][contig, start] = met/ (unmet+met)
377
    return export_table
378
379
380
def extract_cpgs(bam,
381
                 contig,
382
                 fragment_class,
383
                 molecule_class,
384
                 start = None,
385
                 end = None,
386
                 fetch_start = None,
387
                 fetch_end = None,
388
                 context='Z',
389
                 stranded=False,
390
                 mirror_cpg = False,
391
                 allelic=False,
392
                 select_samples=None,
393
                 pool_alias = None,
394
                 reference_path = None,
395
                 methylation_consensus_kwargs= {},
396
                 bin_size=None):
397
398
    methylation_per_cell_per_cpg = defaultdict(defdict) # location -> cell -> [0,0]
399
400
    taps = TAPS()
401
    with pysam.AlignmentFile(bam) as al,\
402
         pysam.FastaFile((get_reference_path_from_bam(bam) if reference_path is None else reference_path)) as reference:
403
404
            for molecule in MoleculeIterator(
405
                al,
406
                fragment_class=fragment_class,
407
                molecule_class=molecule_class,
408
                molecule_class_args={
409
                     'reference':reference,
410
                     'taps':taps,
411
                     'taps_strand':'R',
412
413
                     'methylation_consensus_kwargs':methylation_consensus_kwargs,
414
                },
415
                fragment_class_args={},
416
                contig = contig,
417
                start=fetch_start,
418
                end=fetch_end
419
            ):
420
                if allelic:
421
                    allele =  molecule.allele
422
423
                if select_samples is not None and not molecule.sample in select_samples:
424
                    continue
425
426
                for (cnt, pos), call in molecule.methylation_call_dict.items():
427
428
                    if (start is not None and pos<start) or (end is not None and pos>=end):
429
                        continue
430
431
                    ctx = call['context']
432
                    if ctx.upper()!=context:
433
                        continue
434
435
                    if mirror_cpg and context=='Z' and not molecule.strand:
436
                        pos-=1
437
438
                    if pool_alias:
439
                        location_key = pool_alias
440
                    else:
441
                        if bin_size is not None:
442
                            location_key = [cnt, int(bin_size*int(pos/bin_size))]
443
                        else:
444
                            location_key = [cnt,pos]
445
                        if allelic:
446
                            location_key += [allele]
447
448
                        if stranded:
449
                            location_key += [molecule.strand]
450
451
                    methylation_per_cell_per_cpg[tuple(location_key)][molecule.sample][int(ctx.isupper())]+=1
452
453
    return methylation_per_cell_per_cpg