|
a |
|
b/singlecellmultiomics/bamProcessing/bamToMethylationAndCopyNumber.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
|
|
|
4 |
import argparse |
|
|
5 |
import pysam |
|
|
6 |
import collections |
|
|
7 |
import pandas as pd |
|
|
8 |
import numpy as np |
|
|
9 |
import os |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
def obtain_approximate_reference_cut_position(site, contig, alt_spans): |
|
|
13 |
#contig, cut_start, strand = molecule.get_cut_site() |
|
|
14 |
alt_contig, alt_start, alt_end = alt_spans[contig] |
|
|
15 |
return contig, site + alt_start |
|
|
16 |
|
|
|
17 |
|
|
|
18 |
if __name__ == '__main__': |
|
|
19 |
argparser = argparse.ArgumentParser( |
|
|
20 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
|
|
21 |
description='Create methylation and copy number tables from bam file, ALT contig aware') |
|
|
22 |
|
|
|
23 |
argparser.add_argument('alignmentfile', type=str) |
|
|
24 |
argparser.add_argument('-head', type=int, default=None) |
|
|
25 |
argparser.add_argument('-bin_size', type=int, default=250_000) |
|
|
26 |
argparser.add_argument( |
|
|
27 |
'-min_mq', |
|
|
28 |
type=int, |
|
|
29 |
default=30, |
|
|
30 |
help='Minimum mapping quality') |
|
|
31 |
argparser.add_argument('--met', action='store_true') |
|
|
32 |
argparser.add_argument('--cnv', action='store_true') |
|
|
33 |
argparser.add_argument('-ref', type=str, required=True) |
|
|
34 |
|
|
|
35 |
args = argparser.parse_args() |
|
|
36 |
|
|
|
37 |
#chromosomes = [f'chr{x}' for x in range(1,23)] + ['chrX'] |
|
|
38 |
|
|
|
39 |
output_alias = args.alignmentfile + f'.table.bs_{args.bin_size}' |
|
|
40 |
# Load alternative contigs if available: |
|
|
41 |
alt_path = f'{args.ref}.64.alt' |
|
|
42 |
alt_spans = None |
|
|
43 |
if os.path.exists(alt_path): |
|
|
44 |
print(f'Loading ALT data from {alt_path}') |
|
|
45 |
with pysam.AlignmentFile(alt_path) as alts: |
|
|
46 |
alt_spans = {} |
|
|
47 |
for alt in alts: |
|
|
48 |
alt_spans[alt.query_name] = ( |
|
|
49 |
alt.reference_name, alt.reference_start, alt.reference_end) |
|
|
50 |
|
|
|
51 |
# cell -> (allele, chromosome, bin) -> umi_count |
|
|
52 |
fragment_abundance = collections.defaultdict(collections.Counter) |
|
|
53 |
# cell -> (context, chromosome, bin) -> methylation |
|
|
54 |
methylation_pos = collections.defaultdict(collections.Counter) |
|
|
55 |
# cell -> (context, chromosome, bin) -> methylation |
|
|
56 |
methylation_neg = collections.defaultdict(collections.Counter) |
|
|
57 |
# cell -> (allele, chromosome, bin) -> raw_count |
|
|
58 |
fragment_read_abundance = collections.defaultdict(collections.Counter) |
|
|
59 |
|
|
|
60 |
with pysam.AlignmentFile(args.alignmentfile, threads=4) as alignments: |
|
|
61 |
|
|
|
62 |
min_mq = 30 |
|
|
63 |
wrote = 0 |
|
|
64 |
# for chromosome in chromosomes: |
|
|
65 |
# for i,read in enumerate(alignments.fetch(chromosome)): |
|
|
66 |
for read in alignments: |
|
|
67 |
if read.is_duplicate: |
|
|
68 |
continue |
|
|
69 |
if read.has_tag('mp') and read.get_tag('mp') != 'unique': |
|
|
70 |
continue |
|
|
71 |
if read.mapping_quality < args.min_mq or not read.has_tag('DS'): |
|
|
72 |
continue |
|
|
73 |
if args.head is not None and wrote >= (args.head - 1): |
|
|
74 |
break |
|
|
75 |
|
|
|
76 |
if read.has_tag('RZ') and read.get_tag('RZ') != 'CATG': |
|
|
77 |
continue |
|
|
78 |
|
|
|
79 |
contig = read.reference_name |
|
|
80 |
site = int(read.get_tag('DS')) |
|
|
81 |
if alt_spans is not None and contig in alt_spans: |
|
|
82 |
contig, site = obtain_approximate_reference_cut_position( |
|
|
83 |
site, contig, alt_spans) |
|
|
84 |
|
|
|
85 |
wrote += 1 |
|
|
86 |
bin_i = int(site / args.bin_size) |
|
|
87 |
|
|
|
88 |
allele = 'unk' |
|
|
89 |
if read.has_tag('DA'): |
|
|
90 |
allele = read.get_tag('DA') |
|
|
91 |
|
|
|
92 |
if args.met: |
|
|
93 |
for context in 'xhz': |
|
|
94 |
if read.has_tag(f's{context}'): |
|
|
95 |
methylation_neg[read.get_tag('SM')][( |
|
|
96 |
context, allele, contig, bin_i)] += int(read.get_tag(f's{context}')) |
|
|
97 |
if read.has_tag(f's{context.upper()}'): |
|
|
98 |
methylation_pos[read.get_tag('SM')][( |
|
|
99 |
context, allele, contig, bin_i)] += int(read.get_tag(f's{context.upper()}')) |
|
|
100 |
|
|
|
101 |
fragment_abundance[read.get_tag( |
|
|
102 |
'SM')][(allele, contig, bin_i)] += 1 |
|
|
103 |
fragment_read_abundance[read.get_tag('SM')][( |
|
|
104 |
allele, contig, bin_i)] += read.get_tag('af') |
|
|
105 |
|
|
|
106 |
if args.cnv: |
|
|
107 |
print('writing count table') |
|
|
108 |
pd.DataFrame(fragment_abundance).to_pickle( |
|
|
109 |
f'{output_alias}.CNV_umis.pickle.gz') |
|
|
110 |
|
|
|
111 |
pd.DataFrame(fragment_read_abundance).to_pickle( |
|
|
112 |
f'{output_alias}.CNV_reads.pickle.gz') |
|
|
113 |
|
|
|
114 |
if args.met: |
|
|
115 |
print('writing count table for methylation status') |
|
|
116 |
methylation_pos = pd.DataFrame( |
|
|
117 |
methylation_pos).T.fillna(0).sort_index() |
|
|
118 |
methylation_neg = pd.DataFrame( |
|
|
119 |
methylation_neg).T.fillna(0).sort_index() |
|
|
120 |
|
|
|
121 |
for context in 'xhz': |
|
|
122 |
methylation_pos[context].T.to_pickle( |
|
|
123 |
f'{output_alias}.{context.upper()}.pickle.gz') |
|
|
124 |
methylation_neg[context].T.to_pickle( |
|
|
125 |
f'{output_alias}.{context}.pickle.gz') |
|
|
126 |
|
|
|
127 |
(methylation_pos[context] / (methylation_neg[context] + methylation_pos[context]) |
|
|
128 |
).T.to_pickle(f'{output_alias}.{context}.beta.pickle.gz') |