b/singlecellmultiomics/bamProcessing/bamOverseq.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pysam
from collections import Counter, defaultdict
import pandas as pd
from multiprocessing import Pool
from singlecellmultiomics.bamProcessing.bamBinCounts import blacklisted_binning_contigs
from more_itertools import grouper
import argparse
import os


def overseq_dict():
    """Return a nested mapping: sample -> Counter of reads-per-molecule."""
    return defaultdict(Counter)


def merge_overseq_dicts(into, source):
    """Merge the nested overseq dictionary `source` into `into`, in place."""
    for location, overseq_per_sample in source.items():
        for sample, overseq_for_sample in overseq_per_sample.items():
            into[location][sample] += overseq_for_sample
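
# A minimal sketch of the merge semantics above; the bin key, sample name and
# counts are hypothetical. Counter addition sums the per-duplicate-count tallies:
#
#   a = defaultdict(overseq_dict)
#   a[('chr1', 0, 500_000)]['cellA'] = Counter({1: 10, 2: 3})
#   b = defaultdict(overseq_dict)
#   b[('chr1', 0, 500_000)]['cellA'] = Counter({2: 1, 5: 2})
#   merge_overseq_dicts(a, b)
#   assert a[('chr1', 0, 500_000)]['cellA'] == Counter({1: 10, 2: 4, 5: 2})
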
def create_overseq_distribution(args):
    """Count reads-per-molecule histograms for a list of genomic bins.

    All arguments are packed into a single tuple so this function can be
    mapped over a multiprocessing Pool.
    """
    locations, bin_size, bam_path, min_mq, allelic = args
    overseq = defaultdict(overseq_dict)
    with pysam.AlignmentFile(bam_path) as alignments:
        for location in locations:
            # grouper() pads the last chunk with None:
            if location is None:
                continue

            # Location is a tuple (contig, start, end, fetch_start, fetch_end).
            # Reads are fetched from (contig, start, end); the last two
            # coordinates delimit the region a molecule's site must fall in,
            # which prevents counting the same fragment in multiple bins.
            start, end = location[-2:]
            for read in alignments.fetch(*location[:3]):
                if read.is_duplicate or read.mapping_quality < min_mq or read.is_read2 or \
                        read.is_qcfail or (allelic and not read.has_tag('DA')):
                    continue

                if not read.has_tag('DS'):
                    # @todo: implement for reads without a DS tag
                    continue

                site = read.get_tag('DS')
                if site < start or site > end:
                    continue

                bin_index = site // bin_size
                location_key = (
                    read.reference_name,
                    bin_index * bin_size,
                    (bin_index + 1) * bin_size
                )

                if allelic:
                    # Prepend the allele to the bin identifier:
                    location_key = (read.get_tag('DA'), *location_key)

                # SM: sample name, af: number of reads associated with this molecule
                overseq[location_key][read.get_tag('SM')][read.get_tag('af')] += 1

    return overseq
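
# A minimal sketch of calling the worker directly; the bam path and the single
# bin are hypothetical:
#
#   result = create_overseq_distribution(
#       ([('chr1', 0, 500_000, 0, 500_000)], 500_000, 'tagged.bam', 10, False))
#   # result[('chr1', 0, 500_000)] maps sample -> Counter(reads_per_molecule -> count)
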
def obtain_overseq_dictionary(bam_path, bin_size, min_mq=10, allelic=False, blacklist_path=None):
    """
    Create a molecule duplicate-counting dictionary

    Args:
        bam_path: path to the bam file to obtain duplicate counts from (requires the af tag to be set)
        bin_size: bin size for the output dict
        min_mq: minimum mapping quality
        allelic: whether to split bins into multiple alleles based on the DA tag value
        blacklist_path: path to a blacklist bed file (optional)

    Returns:
        overseq (dict) : {(location): {sample (str): {reads_per_molecule (int): count (int)}}}
    """
    overseq = defaultdict(overseq_dict)

    # Chunk the bins into groups of 50; each worker task processes one group:
    worker_bins = list(grouper(blacklisted_binning_contigs(contig_length_resource=bam_path,
                                                           blacklist_path=blacklist_path,
                                                           bin_size=bin_size,
                                                           fragment_size=0), 50))

    with Pool() as workers:
        for i, overseq_for_bin in enumerate(workers.imap_unordered(create_overseq_distribution, (
                (
                    locations,
                    bin_size,
                    bam_path,
                    min_mq,
                    allelic
                )
                for locations in worker_bins))):

            merge_overseq_dicts(overseq, overseq_for_bin)
            print(f'{(((i + 1) / len(worker_bins)) * 100):.2f} % ({i + 1}/{len(worker_bins)})       ', end='\r')

    return overseq
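
# A hypothetical entry of the returned structure (values for illustration only):
#
#   overseq[('chr1', 0, 500_000)]['cellA'] == Counter({1: 950, 2: 40, 3: 5})
#
# i.e. in bin chr1:0-500000, cell 'cellA' has 950 molecules sequenced once,
# 40 sequenced twice and 5 sequenced three times. With allelic=True the key
# becomes (allele, contig, start, end).
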
def write_overseq_dict_to_single_sample_files(overseq, target_dir):
    """Write one duplication-histogram CSV per cell to `target_dir`."""
    # Total reads per cell (each molecule weighted by its duplicate count):
    reads_per_cell = Counter()
    locations = sorted(overseq.keys())

    for overseq_per_cell in overseq.values():
        for sample, overseq_for_sample in overseq_per_cell.items():
            reads_per_cell[sample] += sum(copies * seen for copies, seen in overseq_for_sample.items())

    reads_per_cell = pd.DataFrame({'reads': reads_per_cell})
    # Only write tables for cells with more than 100 reads:
    selected_cells = sorted(reads_per_cell[reads_per_cell['reads'] > 100].index)

    for sample in selected_cells:
        cell_hist = {location: overseq[location][sample] for location in locations}
        print(sample)
        pd.DataFrame(cell_hist).sort_index().sort_index(axis=1).to_csv(f'{target_dir}/cell_hist_{sample}.csv')
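
# A minimal sketch of how a per-cell histogram could be summarised downstream,
# e.g. to estimate a duplication rate; the bin key and sample are hypothetical:
#
#   hist = overseq[('chr1', 0, 500_000)]['cellA']  # Counter: copies -> molecules
#   molecules = sum(hist.values())                 # unique molecules
#   reads = sum(copies * seen for copies, seen in hist.items())
#   duplication_rate = 1 - molecules / reads if reads else 0.0
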
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Create oversequencing tables for single cells""")
    argparser.add_argument('bamfile', metavar='bamfile', type=str,
                           help='Bam file in which duplicates have been flagged and the number of reads '
                                'associated with each duplicate is stored in the sam tag "af"')
    argparser.add_argument('-output_folder', default='./overseq_tables', help='Folder to write sample CSV files to')
    argparser.add_argument('--allelic', action='store_true', help='Split bins into alleles (based on the DA tag)')
    argparser.add_argument('-bin_size', type=int, default=500_000, help='Bin size for the output files')
    argparser.add_argument('-blacklist', type=str, help='Bed file containing regions to exclude from counting')

    fi = argparser.add_argument_group("Filters")
    fi.add_argument('-min_mapping_qual', default=40, type=int, help='Minimum mapping quality')

    args = argparser.parse_args()

    # Create the output folder if it does not exist:
    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    # overseq: (location) : cell : duplicates : count
    overseq = obtain_overseq_dictionary(args.bamfile, args.bin_size, min_mq=args.min_mapping_qual,
                                        allelic=args.allelic, blacklist_path=args.blacklist)
    write_overseq_dict_to_single_sample_files(overseq, args.output_folder)
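
# Example invocation (file names are hypothetical):
#
#   bamOverseq.py tagged.bam -output_folder ./overseq_tables --allelic \
#       -bin_size 500000 -blacklist blacklist.bed -min_mapping_qual 40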