Switch to unified view

a b/singlecellmultiomics/bamProcessing/bamToMethylationAndCopyNumber.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
4
import argparse
5
import pysam
6
import collections
7
import pandas as pd
8
import numpy as np
9
import os
10
11
12
def obtain_approximate_reference_cut_position(site, contig, alt_spans):
13
    #contig, cut_start, strand = molecule.get_cut_site()
14
    alt_contig, alt_start, alt_end = alt_spans[contig]
15
    return contig, site + alt_start
16
17
18
if __name__ == '__main__':
19
    argparser = argparse.ArgumentParser(
20
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
21
        description='Create methylation and copy number tables from bam file, ALT contig aware')
22
23
    argparser.add_argument('alignmentfile', type=str)
24
    argparser.add_argument('-head', type=int, default=None)
25
    argparser.add_argument('-bin_size', type=int, default=250_000)
26
    argparser.add_argument(
27
        '-min_mq',
28
        type=int,
29
        default=30,
30
        help='Minimum mapping quality')
31
    argparser.add_argument('--met', action='store_true')
32
    argparser.add_argument('--cnv', action='store_true')
33
    argparser.add_argument('-ref', type=str, required=True)
34
35
    args = argparser.parse_args()
36
37
    #chromosomes = [f'chr{x}' for x in range(1,23)] + ['chrX']
38
39
    output_alias = args.alignmentfile + f'.table.bs_{args.bin_size}'
40
    # Load alternative contigs if available:
41
    alt_path = f'{args.ref}.64.alt'
42
    alt_spans = None
43
    if os.path.exists(alt_path):
44
        print(f'Loading ALT data from {alt_path}')
45
        with pysam.AlignmentFile(alt_path) as alts:
46
            alt_spans = {}
47
            for alt in alts:
48
                alt_spans[alt.query_name] = (
49
                    alt.reference_name, alt.reference_start, alt.reference_end)
50
51
    # cell -> (allele, chromosome,   bin) -> umi_count
52
    fragment_abundance = collections.defaultdict(collections.Counter)
53
    # cell -> (context, chromosome, bin) -> methylation
54
    methylation_pos = collections.defaultdict(collections.Counter)
55
    # cell -> (context, chromosome, bin) -> methylation
56
    methylation_neg = collections.defaultdict(collections.Counter)
57
    # cell -> (allele, chromosome,   bin) -> raw_count
58
    fragment_read_abundance = collections.defaultdict(collections.Counter)
59
60
    with pysam.AlignmentFile(args.alignmentfile, threads=4) as alignments:
61
62
        min_mq = 30
63
        wrote = 0
64
        # for chromosome in chromosomes:
65
        #    for i,read in enumerate(alignments.fetch(chromosome)):
66
        for read in alignments:
67
            if read.is_duplicate:
68
                continue
69
            if read.has_tag('mp') and read.get_tag('mp') != 'unique':
70
                continue
71
            if read.mapping_quality < args.min_mq or not read.has_tag('DS'):
72
                continue
73
            if args.head is not None and wrote >= (args.head - 1):
74
                break
75
76
            if read.has_tag('RZ') and read.get_tag('RZ') != 'CATG':
77
                continue
78
79
            contig = read.reference_name
80
            site = int(read.get_tag('DS'))
81
            if alt_spans is not None and contig in alt_spans:
82
                contig, site = obtain_approximate_reference_cut_position(
83
                    site, contig, alt_spans)
84
85
            wrote += 1
86
            bin_i = int(site / args.bin_size)
87
88
            allele = 'unk'
89
            if read.has_tag('DA'):
90
                allele = read.get_tag('DA')
91
92
            if args.met:
93
                for context in 'xhz':
94
                    if read.has_tag(f's{context}'):
95
                        methylation_neg[read.get_tag('SM')][(
96
                            context, allele, contig, bin_i)] += int(read.get_tag(f's{context}'))
97
                    if read.has_tag(f's{context.upper()}'):
98
                        methylation_pos[read.get_tag('SM')][(
99
                            context, allele, contig, bin_i)] += int(read.get_tag(f's{context.upper()}'))
100
101
            fragment_abundance[read.get_tag(
102
                'SM')][(allele, contig, bin_i)] += 1
103
            fragment_read_abundance[read.get_tag('SM')][(
104
                allele, contig, bin_i)] += read.get_tag('af')
105
106
    if args.cnv:
107
        print('writing count table')
108
        pd.DataFrame(fragment_abundance).to_pickle(
109
            f'{output_alias}.CNV_umis.pickle.gz')
110
111
        pd.DataFrame(fragment_read_abundance).to_pickle(
112
            f'{output_alias}.CNV_reads.pickle.gz')
113
114
    if args.met:
115
        print('writing count table for methylation status')
116
        methylation_pos = pd.DataFrame(
117
            methylation_pos).T.fillna(0).sort_index()
118
        methylation_neg = pd.DataFrame(
119
            methylation_neg).T.fillna(0).sort_index()
120
121
        for context in 'xhz':
122
            methylation_pos[context].T.to_pickle(
123
                f'{output_alias}.{context.upper()}.pickle.gz')
124
            methylation_neg[context].T.to_pickle(
125
                f'{output_alias}.{context}.pickle.gz')
126
127
            (methylation_pos[context] / (methylation_neg[context] + methylation_pos[context])
128
             ).T.to_pickle(f'{output_alias}.{context}.beta.pickle.gz')