|
a |
|
b/singlecellmultiomics/bamProcessing/bamSetReadGroup.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
|
|
|
4 |
import argparse |
|
|
5 |
import pysam |
|
|
6 |
import sys |
|
|
7 |
from datetime import datetime |
|
|
8 |
import singlecellmultiomics |
|
|
9 |
from singlecellmultiomics.fragment import Fragment |
|
|
10 |
from singlecellmultiomics.bamProcessing.bamFunctions import get_read_group_from_read, sorted_bam_file, write_program_tag |
|
|
11 |
|
|
|
12 |
def set_read_group( in_bam_path, out_bam_path, id:str, pl:str, lb:str, sm:str, pu:str, threads=4, sm2reads=False ): |
|
|
13 |
""" |
|
|
14 |
Set read group format of bam file |
|
|
15 |
|
|
|
16 |
Args: |
|
|
17 |
in_bam_path(str) : bam file to read |
|
|
18 |
|
|
|
19 |
in_bam_path(str) : bam file to write |
|
|
20 |
|
|
|
21 |
format(int) : formatting mode, see `singlecellmultiomics.fragment.Fragment.get_read_group` |
|
|
22 |
|
|
|
23 |
threads(int) : Amount of decompression threads for reading |
|
|
24 |
|
|
|
25 |
""" |
|
|
26 |
|
|
|
27 |
""" |
|
|
28 |
read_groups(set/dict) : set or dictionary which contains read groups. The dictionary should have the format { read_group_id (str) |
|
|
29 |
{ 'ID': ID, 'LB':library, |
|
|
30 |
'PL':platform, |
|
|
31 |
'SM':sampleLib, |
|
|
32 |
'PU':readGroup } |
|
|
33 |
""" |
|
|
34 |
|
|
|
35 |
read_groups = {id:{ 'ID': id, 'LB':lb, |
|
|
36 |
'PL':pl, |
|
|
37 |
'SM':sm, |
|
|
38 |
'PU':pu }} |
|
|
39 |
|
|
|
40 |
with pysam.AlignmentFile(in_bam_path, threads = threads) as input_bam: |
|
|
41 |
|
|
|
42 |
input_header = input_bam.header.as_dict() |
|
|
43 |
|
|
|
44 |
# Write provenance information to BAM header |
|
|
45 |
write_program_tag( |
|
|
46 |
input_header, |
|
|
47 |
program_name='bamReadGroupFormat', |
|
|
48 |
command_line=" ".join( |
|
|
49 |
sys.argv), |
|
|
50 |
version=singlecellmultiomics.__version__, |
|
|
51 |
description=f'SingleCellMultiOmics read group formatting, executed at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}') |
|
|
52 |
|
|
|
53 |
with sorted_bam_file(out_bam_path, header=input_header, read_groups=read_groups, input_is_sorted=True) as out: |
|
|
54 |
print('Started writing') |
|
|
55 |
for read in input_bam: |
|
|
56 |
rg_id = id |
|
|
57 |
read.set_tag('RG',rg_id) |
|
|
58 |
if sm2reads: |
|
|
59 |
read.set_tag('SM',sm) |
|
|
60 |
out.write(read) |
|
|
61 |
|
|
|
62 |
|
|
|
63 |
if __name__=='__main__': |
|
|
64 |
argparser = argparse.ArgumentParser( |
|
|
65 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
|
|
66 |
description='Set read group sample platform unit etc.') |
|
|
67 |
argparser.add_argument('bamin', type=str) |
|
|
68 |
|
|
|
69 |
|
|
|
70 |
argparser.add_argument( |
|
|
71 |
'-id', |
|
|
72 |
type=str, |
|
|
73 |
required = True, |
|
|
74 |
help="Read group id") |
|
|
75 |
|
|
|
76 |
argparser.add_argument( |
|
|
77 |
'-pl', |
|
|
78 |
type=str, |
|
|
79 |
default='ILLUMINA', |
|
|
80 |
help="Read group platform") |
|
|
81 |
|
|
|
82 |
argparser.add_argument( |
|
|
83 |
'-pu', |
|
|
84 |
type=str, |
|
|
85 |
default='LANE', |
|
|
86 |
help="Read group platform unit") |
|
|
87 |
|
|
|
88 |
argparser.add_argument( |
|
|
89 |
'-sm', |
|
|
90 |
type=str, |
|
|
91 |
required=True, |
|
|
92 |
help="Read group sample name") |
|
|
93 |
|
|
|
94 |
argparser.add_argument( |
|
|
95 |
'-lb', |
|
|
96 |
type=str, |
|
|
97 |
default='mix', |
|
|
98 |
help="Library") |
|
|
99 |
|
|
|
100 |
|
|
|
101 |
argparser.add_argument( |
|
|
102 |
'-t', |
|
|
103 |
type=int, |
|
|
104 |
default=4, |
|
|
105 |
help="Threads") |
|
|
106 |
|
|
|
107 |
|
|
|
108 |
argparser.add_argument( |
|
|
109 |
'--sm2reads', |
|
|
110 |
action='store_true', |
|
|
111 |
help="Also write the supplied SM tag to all reads") |
|
|
112 |
|
|
|
113 |
argparser.add_argument('-o', type=str, help="output bam file", required=True) |
|
|
114 |
args = argparser.parse_args() |
|
|
115 |
set_read_group( |
|
|
116 |
args.bamin, |
|
|
117 |
args.o, |
|
|
118 |
id = args.id, |
|
|
119 |
pl = args.pl, |
|
|
120 |
lb = args.lb, |
|
|
121 |
sm = args.sm, |
|
|
122 |
pu = args.pu, |
|
|
123 |
sm2reads= args.sm2reads, |
|
|
124 |
threads = args.t) |