Switch to unified view

a b/singlecellmultiomics/bamProcessing/bamSetReadGroup.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
4
import argparse
5
import pysam
6
import sys
7
from datetime import datetime
8
import singlecellmultiomics
9
from singlecellmultiomics.fragment import Fragment
10
from singlecellmultiomics.bamProcessing.bamFunctions import get_read_group_from_read, sorted_bam_file, write_program_tag
11
12
def set_read_group( in_bam_path, out_bam_path, id:str, pl:str, lb:str, sm:str, pu:str, threads=4, sm2reads=False  ):
13
    """
14
    Set read group format of bam file
15
16
    Args:
17
        in_bam_path(str) : bam file to read
18
19
        in_bam_path(str) : bam file to write
20
21
        format(int) : formatting mode, see `singlecellmultiomics.fragment.Fragment.get_read_group`
22
23
        threads(int) : Amount of decompression threads for reading
24
25
    """
26
27
    """
28
    read_groups(set/dict) : set or dictionary which contains read groups. The dictionary should have the format { read_group_id (str)
29
            { 'ID': ID, 'LB':library,
30
            'PL':platform,
31
            'SM':sampleLib,
32
            'PU':readGroup }
33
    """
34
35
    read_groups =     {id:{ 'ID': id, 'LB':lb,
36
        'PL':pl,
37
        'SM':sm,
38
        'PU':pu }}
39
40
    with pysam.AlignmentFile(in_bam_path, threads = threads) as input_bam:
41
42
        input_header = input_bam.header.as_dict()
43
44
        # Write provenance information to BAM header
45
        write_program_tag(
46
            input_header,
47
            program_name='bamReadGroupFormat',
48
            command_line=" ".join(
49
                sys.argv),
50
            version=singlecellmultiomics.__version__,
51
            description=f'SingleCellMultiOmics read group formatting, executed at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')
52
53
        with sorted_bam_file(out_bam_path, header=input_header, read_groups=read_groups, input_is_sorted=True) as out:
54
            print('Started writing')
55
            for read in input_bam:
56
                rg_id = id
57
                read.set_tag('RG',rg_id)
58
                if sm2reads:
59
                    read.set_tag('SM',sm)
60
                out.write(read)
61
62
63
if __name__=='__main__':
64
    argparser = argparse.ArgumentParser(
65
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
66
        description='Set read group sample platform unit etc.')
67
    argparser.add_argument('bamin', type=str)
68
69
70
    argparser.add_argument(
71
        '-id',
72
        type=str,
73
        required = True,
74
        help="Read group id")
75
76
    argparser.add_argument(
77
        '-pl',
78
        type=str,
79
        default='ILLUMINA',
80
        help="Read group platform")
81
82
    argparser.add_argument(
83
        '-pu',
84
        type=str,
85
        default='LANE',
86
        help="Read group platform unit")
87
88
    argparser.add_argument(
89
        '-sm',
90
        type=str,
91
        required=True,
92
        help="Read group sample name")
93
94
    argparser.add_argument(
95
        '-lb',
96
        type=str,
97
        default='mix',
98
        help="Library")
99
100
101
    argparser.add_argument(
102
        '-t',
103
        type=int,
104
        default=4,
105
        help="Threads")
106
107
108
    argparser.add_argument(
109
        '--sm2reads',
110
        action='store_true',
111
        help="Also write the supplied SM tag to all reads")
112
113
    argparser.add_argument('-o', type=str, help="output bam file", required=True)
114
    args = argparser.parse_args()
115
    set_read_group(
116
        args.bamin,
117
        args.o,
118
        id = args.id,
119
        pl = args.pl,
120
        lb = args.lb,
121
        sm = args.sm,
122
        pu = args.pu,
123
        sm2reads= args.sm2reads,
124
        threads = args.t)