SingleCellMultiOmics / Git / [45ad7e] /singlecellmultiomics/bamProcessing/bamExtractSamples.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[45ad7e]: / singlecellmultiomics / bamProcessing / bamExtractSamples.py
History
Download this file
136 lines (111 with data), 4.9 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pysam
import argparse
import sys
from singlecellmultiomics.bamProcessing import get_sample_to_read_group_dict
import collections
from singlecellmultiomics.utils.path import get_valid_filename

def extract_samples( input_bam_path, output_path, capture_samples, head=None, write_group_rg=False, rg_group_prefix=None ):
    """
    Extract samples from a bam file

    Args:
        input_bam_path(str) : path to bam file from which to extract data from specified samples

        output_path(str) : prefix of path to write bam files to

        capture_samples(dict) : dictionary of lists {group: [sample, sample, sample], }

        head(int) : write this amount of reads, then exit

    """
    bamFile = pysam.AlignmentFile(input_bam_path, "rb")

    # Write to multiple files:
    output_handles = {}
    print('Groups:')
    for group in capture_samples:

        header = bamFile.header.copy()

        if write_group_rg:

            if rg_group_prefix:
                rg = rg_group_prefix + group
            else:
                rg = group
            header = header.to_dict()
            header['RG'] = [{'ID': rg,
            'LB': rg,
            'PL': 'ILLUMINA',
            'SM': rg,
            'PU': '1'}] # reset read group header

        output_handles[group] = pysam.AlignmentFile(output_path.replace('.bam',f'{group}.bam'), "wb", header=header)
        print(f'\t{group}\t{output_handles[group].filename.decode()}')

    sample2handle = {}
    sample2group = {}
    for group,samples in capture_samples.items():
        for sample in samples:
            if sample in sample2handle:
                raise ValueError(f'The sample {sample} is assigned to more than one group at least:{group} and {sample2group[sample]}')
            print(f'\t{sample}\t->{group}')
            sample2handle[sample] = output_handles[group]
            sample2group[sample] = group

    written = 0
    for r in bamFile:
        sm = r.get_tag('SM')
        try:
            handle = sample2handle[sm]
        except KeyError:
            continue

        if write_group_rg:
            if rg_group_prefix:
                r.set_tag('RG', rg_group_prefix + sample2group[sm] )
            else:
                r.set_tag('RG', sample2group[sm] )


        handle.write(r)
        written += 1
        if head is not None and written >= head:
            break

    print(f'Filtering finished, {written} reads')
    for group,handle in output_handles.items():
        handle.close()
        try:
            os.system(f'samtools index {handle.filename.decode()}')
        except Exception as e:
            print(e)
            pass


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Extract samples/cells from bam file, writes to multiple bam files if multiple groups are supplied
    """)
    argparser.add_argument('bamfile', metavar='bamfile', type=str)
    argparser.add_argument('extract', help="""Sample file, a list of samples to extract. First column: sample name, second column (optional) group name""")
    argparser.add_argument(
        '-o',
        type=str,
        required=True,
        help='output bam path, to the end of the file name the group name is appended')
    argparser.add_argument(
        '--f',
        action='store_true',
        help='Force overwrite of existing files')
    argparser.add_argument('-head', type=int)
    argparser.add_argument('--write_group_rg', action='store_true', help='Set the group label as read-group, requires second GROUP column in the input file')
    argparser.add_argument('-rg_group_prefix', type=str, help='Prepend this string tot the read group. Requires --write_group_rg ')
    args = argparser.parse_args()

    if os.path.exists(args.o) and not args.f:
        raise ValueError(
            f'The output file {args.o} already exists! Use --f or remove the file')

    assert args.o.endswith('.bam'), 'output file path should end with .bam'

    capture_samples = collections.defaultdict( set ) # group -> set( samples )
    with open(args.extract) as e:
        for line in e:
            parts = line.strip().split(None, 1)
            if len(parts)==0:
                continue

            if len(parts)==1:
                group = ''
                sample = parts[0]
            elif len(parts)==2:
                sample, group = parts
                group = get_valid_filename(group)
            else:
                print(f'Line with issue: {line} :: {parts}')
                raise ValueError("Please supply a file with one or two columns: [SAMPLE], or [SAMPLE]{tab}[GROUP]")
            capture_samples[group].add( parts[0] )

    extract_samples(args.bamfile, args.o, capture_samples, head=args.head, write_group_rg=args.write_group_rg, rg_group_prefix=args.rg_group_prefix )