SingleCellMultiOmics / Git / [45ad7e] /singlecellmultiomics/bamProcessing/bamMatchGATKBQSRReport.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
[45ad7e]: / singlecellmultiomics / bamProcessing / bamMatchGATKBQSRReport.py
History
Download this file
112 lines (96 with data), 4.0 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pysam
import argparse
import sys
import collections
import pysam
import pandas as pd
from uuid import uuid4


def tables_gen(path,verbose=False):
    with open(path) as i:
        iterable=iter(i)
        header = next(i)
        table_data=None
        n_columns = 0
        n_rows = 0
        formatting=None
        table_name=None
        table_header=None
        for line in iterable:
            line = line.strip()

            if line.startswith('#'):
                if table_data is not None:
                    yield table_generator(formatting, table_name, table_header, table_data, n_columns,n_rows)
                formatting = line.split(':')
                n_columns,n_rows = int(formatting[2]), int(formatting[3])
                line = next(i).strip()
                table_name = line.split(':')
                table_header =  next(i).strip().split()
                if verbose:
                    print('FMT',formatting)
                    print('table_name',table_name)
                    print('table_header',table_header)
                table_data = []
                continue
            # fill the table:
            parts = line.strip().split()
            if len(parts)==n_columns:
                table_data.append(parts)
        if formatting is not None and table_name is not None and table_header is not None:
            yield table_generator(formatting, table_name, table_header, table_data, n_columns,n_rows)

def table_generator(formatting, table_name, table_header, table_data, n_columns,n_rows):
    """
    Generates pandas dataframes from GATK formatted tables
    """
    df = pd.DataFrame(table_data)
    try:
        df.columns = table_header[-n_columns:]
    except Exception as e:
        #print(e)
        pass
    return [df,
            {
                'formatting':formatting,
                'table_name':table_name,
                'table_header':table_header,
                'n_columns':n_columns,
                'n_rows':n_rows

            }]

def match_bam_with_GATK_BQSR_report(input_bam_path, output_bam_path, bqsr_report_path ):

    # Identify all read groups present in the bam:
    with pysam.AlignmentFile(input_bam_path) as a:
        read_groups_in_bam = set(rg['ID'] for rg in a.header['RG'] )

    print( f'{len(read_groups_in_bam)} read groups present in bam file')

    print('Now reading BQSR report...')
    # Find which read groups are missing in the bqsr report file:
    total_missing= set()
    for i,(table, meta) in enumerate(tables_gen(path=bqsr_report_path)):
        if 'ReadGroup' in table.columns:
            read_groups_in_gatk_table = set( table['ReadGroup'].unique() )
            missing = read_groups_in_bam.difference(read_groups_in_gatk_table)
            total_missing.update(missing)
        del table
        del meta

    rg_file_path = f'./{str( uuid4() )}.rg.txt'
    total_present = read_groups_in_bam.difference(total_missing)
    print(f'{len(total_missing)} read groups are missing in the report. Keeping {len(total_present)}')
    # Write bam file with only selected read groups
    with open(rg_file_path,'w') as out:
        for rg in total_present:
            out.write(f'{rg}\n')

    print('Extracting alignments ...')

    os.system( f'samtools view -h {input_bam_path} -R {rg_file_path}  -O BAM -o {output_bam_path} -@ 8' )
    os.remove(rg_file_path)
    print('All done.')


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Extract samples/cells from bam file, writes to multiple bam files if multiple groups are supplied
    """)
    argparser.add_argument('bamfile', metavar='bam_input_file', type=str)
    argparser.add_argument('bqsr', metavar='bqsr_file', type=str)
    argparser.add_argument('-o', type=str, help='output path bam', required=True)
    args = argparser.parse_args()

    match_bam_with_GATK_BQSR_report(args.bamfile,args.o, args.bqsr )