[45ad7e]: / singlecellmultiomics / bamProcessing / bamReadGroupFormat.py

Download this file

80 lines (63 with data), 2.8 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import pysam
import sys
from datetime import datetime
import singlecellmultiomics
from singlecellmultiomics.fragment import Fragment
from singlecellmultiomics.bamProcessing.bamFunctions import get_read_group_from_read, sorted_bam_file, write_program_tag
def set_read_group_format( in_bam_path, out_bam_path, format, threads=4 ):
"""
Set read group format of bam file
Args:
in_bam_path(str) : bam file to read
in_bam_path(str) : bam file to write
format(int) : formatting mode, see `singlecellmultiomics.fragment.Fragment.get_read_group`
threads(int) : Amount of decompression threads for reading
"""
read_groups = dict() # Store unique read groups in this set
"""
read_groups(set/dict) : set or dictionary which contains read groups. The dictionary should have the format { read_group_id (str)
{ 'ID': ID, 'LB':library,
'PL':platform,
'SM':sampleLib,
'PU':readGroup }
"""
with pysam.AlignmentFile(in_bam_path, threads = threads) as input_bam:
input_header = input_bam.header.as_dict()
# Write provenance information to BAM header
write_program_tag(
input_header,
program_name='bamReadGroupFormat',
command_line=" ".join(
sys.argv),
version=singlecellmultiomics.__version__,
description=f'SingleCellMultiOmics read group formatting, executed at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')
with sorted_bam_file(out_bam_path, header=input_header, read_groups=read_groups, input_is_sorted=True) as out:
print('Started writing')
for read in input_bam:
rg_id = get_read_group_from_read(read, format=format, with_attr_dict=False )
if not rg_id in read_groups:
rg_id, rg_data = get_read_group_from_read(read, format=format, with_attr_dict=True )
read_groups[rg_id] = rg_data
read.set_tag('RG',rg_id)
out.write(read)
if __name__=='__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Change format of read groups')
argparser.add_argument('bamin', type=str)
argparser.add_argument(
'-format',
type=int,
default=0,
help="Read group format code")
argparser.add_argument(
'-t',
type=int,
default=4,
help="Threads")
argparser.add_argument('-o', type=str, help="output bam file", required=True)
args = argparser.parse_args()
set_read_group_format(args.bamin, args.o, args.format, args.t)