|
a |
|
b/singlecellmultiomics/bamProcessing/bamOverseq.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
import pysam |
|
|
4 |
from collections import Counter, defaultdict |
|
|
5 |
import pandas as pd |
|
|
6 |
import seaborn as sns |
|
|
7 |
from multiprocessing import Pool |
|
|
8 |
from singlecellmultiomics.bamProcessing.bamBinCounts import blacklisted_binning_contigs |
|
|
9 |
from more_itertools import grouper |
|
|
10 |
import argparse |
|
|
11 |
import os |
|
|
12 |
|
|
|
13 |
def overseq_dict():
    """Return a fresh {sample: Counter} mapping.

    Defined at module level (instead of a lambda) so that nested
    defaultdicts built from it remain picklable for multiprocessing.
    """
    fresh = defaultdict(Counter)
    return fresh
|
|
15 |
|
|
|
16 |
def merge_overseq_dicts(into, source):
    """Accumulate the counts of *source* into *into*, in place.

    Both arguments are nested mappings shaped like
    {location: {sample: Counter(reads_per_molecule -> molecule count)}}.
    *into* is expected to auto-create missing keys
    (defaultdict(overseq_dict)); *source* is left untouched.
    """
    for location, per_sample in source.items():
        for sample, molecule_counts in per_sample.items():
            into[location][sample] += molecule_counts
|
|
20 |
|
|
|
21 |
def create_overseq_distribution(args):
    """Count, per genomic bin and sample, how often each molecule was sequenced.

    Worker function for multiprocessing: takes a single packed argument tuple.

    Args:
        args: tuple (locations, bin_size, bam_path, min_mq, allelic) where
            locations is an iterable of fetch tuples
            (contig, start, end, fetch_start, fetch_end), possibly padded
            with None entries (produced by more_itertools.grouper);
            bin_size is the output bin width in bp; bam_path the alignment
            file to read; min_mq the minimum mapping quality; allelic
            whether to split bins by the DA (allele) tag.

    Returns:
        dict: {location_key: {sample (SM tag):
               Counter(reads_per_molecule (af tag) -> molecule count)}}
    """
    locations, bin_size, bam_path, min_mq, allelic = args
    overseq = defaultdict(overseq_dict)
    with pysam.AlignmentFile(bam_path) as alignments:
        for location in locations:
            if location is None:
                # Padding element added by grouper(); nothing to fetch.
                continue

            # Location is a tuple (contig, start, end, fetch_start, fetch_end).
            # Only (contig, start, end) is needed to fetch reads in the target
            # region; start/end are loop-invariant, so hoist them out of the
            # per-read loop.
            start, end = location[-2:]
            for read in alignments.fetch(*location[:3]):
                if read.is_duplicate or read.mapping_quality < min_mq or read.is_read2 or read.is_qcfail or (
                        allelic and not read.has_tag('DA')):
                    continue

                # Prevent double counting of fragments: only count a read when
                # its assigned site (DS tag) lies inside the current region.
                # @todo ; implement for reads without DS tag
                if not read.has_tag('DS'):
                    # Without DS the read cannot be assigned to a bin; the
                    # unconditional get_tag('DS') below would raise KeyError.
                    continue
                site = read.get_tag('DS')
                if site < start or site > end:
                    continue

                # Floor division: exact integer binning (no float rounding).
                bin_index = site // bin_size

                location_key = (
                    read.reference_name,
                    bin_index * bin_size,
                    (bin_index + 1) * bin_size
                )

                if allelic:
                    # Prepend allele to bin identifier:
                    location_key = (read.get_tag('DA'), *location_key)

                overseq[location_key][read.get_tag('SM')][read.get_tag('af')] += 1

    return overseq
|
|
62 |
|
|
|
63 |
def obtain_overseq_dictionary(bam_path, bin_size, min_mq=10, allelic=False, blacklist_path=None):
    """
    Create molecule duplicate counting dictionary

    Args:
        bam_path: path to bam file to obtain duplicate counts from (requires af tag to be set)
        bin_size: bin size for output dict
        min_mq: minimum mapping quality
        allelic: whether to split bins into multiple alleles based on the DA tag value
        blacklist_path: path to blacklist bed file (optional)

    Returns:
        overseq (dict) : {(location):{sample(str):{reads_per_molecule(int):count (int)}}}

    """
    overseq = defaultdict(overseq_dict)

    # Hand every worker a batch of 50 genomic bins:
    genomic_bins = blacklisted_binning_contigs(
        contig_length_resource=bam_path,
        blacklist_path=blacklist_path,
        bin_size=bin_size,
        fragment_size=0)
    worker_bins = list(grouper(genomic_bins, 50))
    total = len(worker_bins)

    # Pack one argument tuple per worker batch:
    job_args = (
        (locations, bin_size, bam_path, min_mq, allelic)
        for locations in worker_bins
    )

    with Pool() as workers:
        for done, partial_overseq in enumerate(
                workers.imap_unordered(create_overseq_distribution, job_args)):
            merge_overseq_dicts(overseq, partial_overseq)
            print(f'{((done / total) * 100):.2f} % ({done}/{total}) ', end='\r')

    return overseq
|
|
105 |
|
|
|
106 |
def write_overseq_dict_to_single_sample_files(overseq, target_dir):
    """Write one duplicate-rate histogram CSV per selected sample.

    Args:
        overseq (dict): {location: {sample: Counter(reads_per_molecule -> molecule count)}}
        target_dir (str): existing directory; one cell_hist_<sample>.csv is
            written per sample, with one column per genomic bin and one row
            per reads-per-molecule value.

    Samples with 100 or fewer total reads are skipped.
    """
    locations = sorted(overseq.keys())

    # Total sequenced reads per cell: copies * molecules seen at that copy number.
    reads_per_cell = Counter()
    for overseq_per_cell in overseq.values():
        for sample, molecule_counts in overseq_per_cell.items():
            reads_per_cell[sample] += sum(
                copies * seen for copies, seen in molecule_counts.items())

    reads_per_cell = pd.DataFrame({'reads': reads_per_cell})
    selected_cells = sorted(reads_per_cell[reads_per_cell['reads'] > 100].index)

    for sample in selected_cells:
        # One histogram column per genomic bin for this cell:
        cell_hist = {k: overseq[k][sample] for k in locations}
        print(sample)
        # axis must be passed by keyword: positional sort_index(1) was
        # removed in pandas 2.0.
        pd.DataFrame(cell_hist).sort_index().sort_index(axis=1).to_csv(
            f'{target_dir}/cell_hist_{sample}.csv')
|
|
129 |
|
|
|
130 |
|
|
|
131 |
|
|
|
132 |
|
|
|
133 |
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Create oversequencing tables for single cells""")
    argparser.add_argument(
        'bamfile', metavar='bamfile', type=str,
        help='Bam file where duplicates have been flagged, and the amount of '
             'reads associated to the duplicate stored in the sam tag "af"')
    argparser.add_argument('-output_folder', default='./overseq_tables', help='Folder to write sample CSV files to')
    argparser.add_argument('--allelic', action='store_true', help='Split bins into alleles (based on DA tag)')
    argparser.add_argument('-bin_size', type=int, default=500_000, help='Bin size for the output files')
    argparser.add_argument('-blacklist', type=str, help='Bed file containing regions to ignore from counting')

    fi = argparser.add_argument_group("Filters")
    fi.add_argument('-min_mapping_qual', default=40, type=int)

    args = argparser.parse_args()

    # Create the output folder if it does not exist; exist_ok=True avoids the
    # check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(args.output_folder, exist_ok=True)

    # overseq: (location) : cell : duplicates : count
    overseq = obtain_overseq_dictionary(args.bamfile, args.bin_size, min_mq=args.min_mapping_qual,
                                        allelic=args.allelic, blacklist_path=args.blacklist)
    write_overseq_dict_to_single_sample_files(overseq, args.output_folder)