SingleCellMultiOmics / Git / [45ad7e] /singlecellmultiomics/bamProcessing/bamOverseq.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[45ad7e]: / singlecellmultiomics / bamProcessing / bamOverseq.py
History
Download this file
156 lines (119 with data), 6.3 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pysam
from collections import Counter, defaultdict
import pandas as pd
import seaborn as sns
from multiprocessing import Pool
from singlecellmultiomics.bamProcessing.bamBinCounts import blacklisted_binning_contigs
from more_itertools import grouper
import argparse
import os

def overseq_dict():
    return defaultdict(Counter)

def merge_overseq_dicts(into, source):
    for k, overseq_per_sample in source.items():
        for sample, overseq_for_sample in overseq_per_sample.items():
            into[k][sample] += overseq_for_sample

def create_overseq_distribution(args):
    locations, bin_size, bam_path, min_mq, allelic = args
    overseq = defaultdict(overseq_dict)
    with pysam.AlignmentFile(bam_path) as alignments:
        for location in locations:
            if location is None:
                continue

            # Location is a tuple (contig, start, end, fetch_start, fetch_end)
            # We only need (contig, start, end) to obtain reads in the target region
            for i, read in enumerate(alignments.fetch(*location[:3])):

                start, end = location[-2:]
                if read.is_duplicate or read.mapping_quality < min_mq or read.is_read2 or read.is_qcfail or (allelic and not read.has_tag(
                        'DA')):
                    continue

                # Prevent double counting of fragments
                if read.has_tag('DS'):
                    site = read.get_tag('DS')
                    if site < start or site > end:
                        continue
                    # @todo ; implement for reads without DS tag

                bin_index = int(read.get_tag('DS') / bin_size)

                location_key = (
                        read.reference_name,
                        bin_index * bin_size,
                        (bin_index + 1) * bin_size
                    )

                if allelic:
                    # Prepend allele to bin identifier:
                    location_key = (read.get_tag('DA'), *location_key)

                overseq[location_key ][read.get_tag('SM')][read.get_tag('af')] += 1
                # if i%1_000_000==0:
                #    print(i,read.reference_name, read.reference_start,end='\r')

    return overseq

def obtain_overseq_dictionary(bam_path, bin_size, min_mq=10, allelic=False, blacklist_path=None):
    """
    Create molecule duplicate counting dictionary

    Args:
        bam_path: path to bam file to obtain duplicate counts from (requires af tag to be set)
        bin_size: bin size for output dict
        min_mq: minimum mapping quality
        allelic: wether to split bins into multiple alleles based on the DA tag value
        blacklist_path: path to blacklist bed file (optional)

    Returns:
        overseq (dict) : {(location):{sample(str):{reads_per_molecule(int):count (int)}}}

    """



    overseq = defaultdict(overseq_dict)

    worker_bins = list(grouper(blacklisted_binning_contigs(contig_length_resource=bam_path,
                                                           blacklist_path=blacklist_path,
                                                           bin_size=bin_size,
                                                           fragment_size=0), 50))  # take N bins for every worker

    with Pool() as workers:
        for i, overseq_for_bin in enumerate(workers.imap_unordered(create_overseq_distribution, (
                (
                        locations,
                        bin_size,
                        bam_path,
                        min_mq,
                        allelic
                )

                for locations in worker_bins))):

            merge_overseq_dicts(overseq, overseq_for_bin)

            print(f'{((i / len(worker_bins)) * 100):.2f} % ({i}/{len(worker_bins)})       ', end='\r')

    return overseq

def write_overseq_dict_to_single_sample_files(overseq, target_dir):
    # reads per cell:
    reads_per_cell = Counter()
    locations = sorted(list(overseq.keys()))

    for k, overseq_per_cell in overseq.items():
        for s in overseq_per_cell:
            reads_per_cell[s] += sum([copies * seen for copies, seen in overseq_per_cell[s].items()])
    reads_per_cell_dict = reads_per_cell
    reads_per_cell = pd.DataFrame({'reads': reads_per_cell})
    selected_cells = reads_per_cell[reads_per_cell['reads'] > 100].index
    selected_cells = sorted(selected_cells)

    for sample in selected_cells:
        cell_hist = {}
        cell_map = defaultdict(dict)
        for ki, k in enumerate(locations):
            overseq_per_cell = overseq[k]
            overseq_counter_for_cell = overseq_per_cell[sample]
            cell_hist[k] = overseq_counter_for_cell

        print(sample)
        pd.DataFrame(cell_hist).sort_index().sort_index(1).to_csv(f'{target_dir}/cell_hist_{sample}.csv')




if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Create oversequencing tables for single cells""")
    argparser.add_argument('bamfile', metavar='bamfile', type=str, help='Bam file where duplicates have been flagged \
    , and the amount of reads associated to the duplicate stored in the sam tag "af"')
    argparser.add_argument('-output_folder', default='./overseq_tables', help='Folder to write sample CSV files to')
    argparser.add_argument('--allelic', action='store_true', help='Split bins into alleles (based on DA tag)')
    argparser.add_argument('-bin_size', type=int, default=500_000, help='Bin size for the output files')
    argparser.add_argument('-blacklist', type=str, help='Bed file containing regions to ignore from counting')

    fi = argparser.add_argument_group("Filters")
    fi.add_argument('-min_mapping_qual', default=40, type=int)

    args = argparser.parse_args()

    # Create output folder if it does not exist:
    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    overseq = obtain_overseq_dictionary(args.bamfile, args.bin_size, min_mq=args.min_mapping_qual, allelic=args.allelic, blacklist_path=args.blacklist)
    # (location) : cell : duplicates : count
    write_overseq_dict_to_single_sample_files(overseq, args.output_folder)