--- /dev/null
+++ b/singlecellmultiomics/bamProcessing/bamOverseq.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import pysam
+from collections import Counter, defaultdict
+import pandas as pd
+from multiprocessing import Pool
+from singlecellmultiomics.bamProcessing.bamBinCounts import blacklisted_binning_contigs
+from more_itertools import grouper
+import argparse
+import os
+
+
+def overseq_dict():
+    return defaultdict(Counter)
+
+
+def merge_overseq_dicts(into, source):
+    # Merge the per-worker dictionaries {location: {sample: Counter}} into one
+    for k, overseq_per_sample in source.items():
+        for sample, overseq_for_sample in overseq_per_sample.items():
+            into[k][sample] += overseq_for_sample
+
+
+def create_overseq_distribution(args):
+    locations, bin_size, bam_path, min_mq, allelic = args
+    overseq = defaultdict(overseq_dict)
+    with pysam.AlignmentFile(bam_path) as alignments:
+        for location in locations:
+            if location is None:
+                continue
+
+            # location is a tuple (contig, start, end, fetch_start, fetch_end);
+            # only (contig, start, end) is needed to obtain reads in the target region
+            start, end = location[-2:]
+            for read in alignments.fetch(*location[:3]):
+
+                if read.is_duplicate or read.mapping_quality < min_mq or read.is_read2 or \
+                        read.is_qcfail or (allelic and not read.has_tag('DA')):
+                    continue
+
+                # Prevent double counting of fragments: only count a read when its
+                # site (DS tag) falls inside the target region
+                if not read.has_tag('DS'):
+                    # @todo: implement for reads without a DS tag
+                    continue
+
+                site = read.get_tag('DS')
+                if site < start or site > end:
+                    continue
+
+                bin_index = site // bin_size
+                location_key = (
+                    read.reference_name,
+                    bin_index * bin_size,
+                    (bin_index + 1) * bin_size
+                )
+
+                if allelic:
+                    # Prepend the allele to the bin identifier:
+                    location_key = (read.get_tag('DA'), *location_key)
+
+                overseq[location_key][read.get_tag('SM')][read.get_tag('af')] += 1
+
+    return overseq
+
+
+def obtain_overseq_dictionary(bam_path, bin_size, min_mq=10, allelic=False, blacklist_path=None):
+    """
+    Create molecule duplicate counting dictionary
+
+    Args:
+        bam_path: path to bam file to obtain duplicate counts from (requires the af tag to be set)
+        bin_size: bin size for the output dict
+        min_mq: minimum mapping quality
+        allelic: whether to split bins into multiple alleles based on the DA tag value
+        blacklist_path: path to blacklist bed file (optional)
+
+    Returns:
+        overseq (dict): {(location): {sample (str): {reads_per_molecule (int): count (int)}}}
+    """
+    overseq = defaultdict(overseq_dict)
+
+    # Group the genomic bins into chunks of 50, one chunk per worker job;
+    # grouper pads the last chunk with None
+    worker_bins = list(grouper(blacklisted_binning_contigs(contig_length_resource=bam_path,
+                                                           blacklist_path=blacklist_path,
+                                                           bin_size=bin_size,
+                                                           fragment_size=0), 50))
+
+    with Pool() as workers:
+        for i, overseq_for_bin in enumerate(workers.imap_unordered(
+                create_overseq_distribution,
+                ((locations, bin_size, bam_path, min_mq, allelic)
+                 for locations in worker_bins))):
+
+            merge_overseq_dicts(overseq, overseq_for_bin)
+            print(f'{((i + 1) / len(worker_bins)) * 100:.2f} % ({i + 1}/{len(worker_bins)})   ', end='\r')
+
+    return overseq
+
+
+def write_overseq_dict_to_single_sample_files(overseq, target_dir):
+    # Total reads per cell:
+    reads_per_cell = Counter()
+    locations = sorted(overseq.keys())
+
+    for k, overseq_per_cell in overseq.items():
+        for s in overseq_per_cell:
+            reads_per_cell[s] += sum(copies * seen for copies, seen in overseq_per_cell[s].items())
+    reads_per_cell = pd.DataFrame({'reads': reads_per_cell})
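+    # Only write tables for cells with more than 100 reads in total,
+    # to skip empty or near-empty barcodes (hardcoded threshold):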
+    selected_cells = sorted(reads_per_cell[reads_per_cell['reads'] > 100].index)
+
+    for sample in selected_cells:
+        # One column per genomic bin, one row per reads-per-molecule value:
+        cell_hist = {k: overseq[k][sample] for k in locations}
+
+        print(sample)
+        pd.DataFrame(cell_hist).sort_index().sort_index(axis=1).to_csv(f'{target_dir}/cell_hist_{sample}.csv')
+
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="""Create oversequencing tables for single cells""")
+    argparser.add_argument('bamfile', metavar='bamfile', type=str,
+                           help='Bam file in which duplicates have been flagged and the number of reads associated '
+                                'with each duplicate is stored in the sam tag "af"')
+    argparser.add_argument('-output_folder', default='./overseq_tables', help='Folder to write sample CSV files to')
+    argparser.add_argument('--allelic', action='store_true', help='Split bins into alleles (based on the DA tag)')
+    argparser.add_argument('-bin_size', type=int, default=500_000, help='Bin size for the output files')
+    argparser.add_argument('-blacklist', type=str, help='Bed file containing regions to exclude from counting')
+
+    fi = argparser.add_argument_group("Filters")
+    fi.add_argument('-min_mapping_qual', default=40, type=int)
+
+    args = argparser.parse_args()
+
+    # Create the output folder if it does not exist:
+    if not os.path.exists(args.output_folder):
+        os.makedirs(args.output_folder)
+
+    overseq = obtain_overseq_dictionary(args.bamfile, args.bin_size, min_mq=args.min_mapping_qual,
+                                        allelic=args.allelic, blacklist_path=args.blacklist)
+    # overseq structure: (location) : cell : duplicates : count
+    write_overseq_dict_to_single_sample_files(overseq, args.output_folder)
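
For context (not part of this patch): because each per-cell Counter maps reads-per-molecule (the 'af' value) to the number of molecules observed that many times, these tables can be used to estimate library saturation. Under binomial thinning, a molecule sequenced c times is still observed after keeping each read with probability p with probability 1 - (1 - p)^c. A minimal sketch of that calculation on the in-memory dictionary; the bam path is hypothetical:

from singlecellmultiomics.bamProcessing.bamOverseq import obtain_overseq_dictionary

def expected_unique_molecules(hist, fraction):
    # hist: {reads_per_molecule: number_of_molecules_seen_that_often}
    # Expected number of molecules still observed when every read is
    # kept independently with probability `fraction`:
    return sum(count * (1 - (1 - fraction) ** copies)
               for copies, count in hist.items())

overseq = obtain_overseq_dictionary('example.bam', bin_size=500_000)  # hypothetical path
for location, per_sample in overseq.items():
    for sample, hist in per_sample.items():
        print(location, sample, expected_unique_molecules(hist, 0.5))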