#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import os
from collections import Counter, defaultdict
from multiprocessing import Pool

import pandas as pd
import pysam
from more_itertools import grouper

from singlecellmultiomics.bamProcessing.bamBinCounts import blacklisted_binning_contigs


def overseq_dict():
    # Module-level factory (rather than a lambda) so the nested defaultdict
    # stays picklable for multiprocessing:
    return defaultdict(Counter)


def merge_overseq_dicts(into, source):
    # Merge the per-sample duplicate-count Counters of source into `into`, in place:
    for k, overseq_per_sample in source.items():
        for sample, overseq_for_sample in overseq_per_sample.items():
            into[k][sample] += overseq_for_sample
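
# A minimal sketch of the merge semantics, with hypothetical bin and sample names:
#   a = defaultdict(overseq_dict); a[('1', 0, 500_000)]['cellA'] = Counter({1: 10, 2: 3})
#   b = defaultdict(overseq_dict); b[('1', 0, 500_000)]['cellA'] = Counter({1: 5})
#   merge_overseq_dicts(a, b)  # now a[('1', 0, 500_000)]['cellA'] == Counter({1: 15, 2: 3})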


def create_overseq_distribution(args):
    locations, bin_size, bam_path, min_mq, allelic = args
    overseq = defaultdict(overseq_dict)
    with pysam.AlignmentFile(bam_path) as alignments:
        for location in locations:
            if location is None:  # grouper pads the last chunk with None
                continue
            # location is a tuple (contig, start, end, fetch_start, fetch_end);
            # only (contig, start, end) is needed to obtain reads in the target region
            start, end = location[-2:]
            for i, read in enumerate(alignments.fetch(*location[:3])):
                if read.is_duplicate or read.mapping_quality < min_mq or read.is_read2 or \
                        read.is_qcfail or (allelic and not read.has_tag('DA')):
                    continue
                # Prevent double counting of fragments: count a read only when its
                # DS (site) tag falls inside the bin boundaries
                if not read.has_tag('DS'):
                    # @todo ; implement for reads without DS tag
                    continue
                site = read.get_tag('DS')
                if site < start or site > end:
                    continue
                bin_index = site // bin_size
                location_key = (
                    read.reference_name,
                    bin_index * bin_size,
                    (bin_index + 1) * bin_size
                )
                if allelic:
                    # Prepend the allele to the bin identifier:
                    location_key = (read.get_tag('DA'), *location_key)
                overseq[location_key][read.get_tag('SM')][read.get_tag('af')] += 1
                # if i % 1_000_000 == 0:
                #     print(i, read.reference_name, read.reference_start, end='\r')
    return overseq
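
# The worker returns a nested mapping: bin -> sample -> reads_per_molecule -> molecule count.
# A sketch with hypothetical values: {('1', 0, 500_000): {'cellA': Counter({1: 120, 2: 30})}}
# means that in bin 1:0-500,000 cellA had 120 molecules sequenced once and 30 sequenced twice.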


def obtain_overseq_dictionary(bam_path, bin_size, min_mq=10, allelic=False, blacklist_path=None):
    """
    Create the molecule duplicate-counting dictionary

    Args:
        bam_path: path to the bam file to obtain duplicate counts from (requires the af tag to be set)
        bin_size: bin size for the output dict
        min_mq: minimum mapping quality
        allelic: whether to split bins into multiple alleles based on the DA tag value
        blacklist_path: path to a blacklist bed file (optional)

    Returns:
        overseq (dict): {(location): {sample (str): {reads_per_molecule (int): count (int)}}}
    """
    overseq = defaultdict(overseq_dict)
    # Group the genomic bins into chunks of 50, so every worker processes 50 bins per job:
    worker_bins = list(grouper(blacklisted_binning_contigs(contig_length_resource=bam_path,
                                                           blacklist_path=blacklist_path,
                                                           bin_size=bin_size,
                                                           fragment_size=0), 50))
    with Pool() as workers:
        for i, overseq_for_bin in enumerate(workers.imap_unordered(
                create_overseq_distribution,
                ((locations, bin_size, bam_path, min_mq, allelic) for locations in worker_bins))):
            merge_overseq_dicts(overseq, overseq_for_bin)
            print(f'{((i + 1) / len(worker_bins)) * 100:.2f} % ({i + 1}/{len(worker_bins)})', end='\r')
    return overseq
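
# A minimal sketch of standalone use, assuming an indexed bam carrying af/DS/SM tags at ./tagged.bam:
#   overseq = obtain_overseq_dictionary('./tagged.bam', bin_size=500_000, min_mq=40)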


def write_overseq_dict_to_single_sample_files(overseq, target_dir):
    # Total number of reads per cell, summed over all bins:
    reads_per_cell = Counter()
    locations = sorted(overseq.keys())
    for k, overseq_per_cell in overseq.items():
        for s in overseq_per_cell:
            reads_per_cell[s] += sum(copies * seen for copies, seen in overseq_per_cell[s].items())

    reads_per_cell = pd.DataFrame({'reads': reads_per_cell})
    # Keep only cells with more than 100 reads:
    selected_cells = sorted(reads_per_cell[reads_per_cell['reads'] > 100].index)

    for sample in selected_cells:
        # Duplicate-count histogram per bin for this cell:
        cell_hist = {k: overseq[k][sample] for k in locations}
        print(sample)
        pd.DataFrame(cell_hist).sort_index().sort_index(axis=1).to_csv(f'{target_dir}/cell_hist_{sample}.csv')


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Create oversequencing tables for single cells""")
    argparser.add_argument('bamfile', metavar='bamfile', type=str,
                           help='Bam file in which duplicates have been flagged and the number of reads '
                                'associated with each duplicate is stored in the sam tag "af"')
    argparser.add_argument('-output_folder', default='./overseq_tables', help='Folder to write sample CSV files to')
    argparser.add_argument('--allelic', action='store_true', help='Split bins into alleles (based on the DA tag)')
    argparser.add_argument('-bin_size', type=int, default=500_000, help='Bin size for the output files')
    argparser.add_argument('-blacklist', type=str, help='Bed file containing regions to exclude from counting')

    fi = argparser.add_argument_group("Filters")
    fi.add_argument('-min_mapping_qual', default=40, type=int)

    args = argparser.parse_args()

    # Create the output folder if it does not exist:
    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    # overseq structure: (location) : cell : duplicates : count
    overseq = obtain_overseq_dictionary(args.bamfile, args.bin_size, min_mq=args.min_mapping_qual,
                                        allelic=args.allelic, blacklist_path=args.blacklist)
    write_overseq_dict_to_single_sample_files(overseq, args.output_folder)
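
# Example invocation (hypothetical file names; the bam must be indexed and carry the af, DS and SM tags):
#   python3 create_overseq_tables.py tagged.bam -output_folder ./overseq_tables -bin_size 500000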