SingleCellMultiOmics / Git / Diff of /singlecellmultiomics/bamProcessing/bamSetReadGroup.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
Diff of /singlecellmultiomics/bamProcessing/bamSetReadGroup.py [000000] .. [2c420a]
Switch to side-by-side view

--- a
+++ b/singlecellmultiomics/bamProcessing/bamSetReadGroup.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import pysam
+import sys
+from datetime import datetime
+import singlecellmultiomics
+from singlecellmultiomics.fragment import Fragment
+from singlecellmultiomics.bamProcessing.bamFunctions import get_read_group_from_read, sorted_bam_file, write_program_tag
+
+def set_read_group( in_bam_path, out_bam_path, id:str, pl:str, lb:str, sm:str, pu:str, threads=4, sm2reads=False  ):
+    """
+    Assign every read of a bam file to one read group and write the result
+
+    The RG tag of each read is set to `id`, and a matching read group entry
+    (ID, LB, PL, SM, PU) is handed to the output writer.
+
+    Args:
+        in_bam_path(str) : bam file to read
+
+        out_bam_path(str) : bam file to write
+
+        id(str) : read group identifier, stored as 'ID' and as the RG tag of every read
+
+        pl(str) : value of the 'PL' field of the read group
+
+        lb(str) : value of the 'LB' field of the read group
+
+        sm(str) : value of the 'SM' field of the read group
+
+        pu(str) : value of the 'PU' field of the read group
+
+        threads(int) : Amount of decompression threads for reading
+
+        sm2reads(bool) : when set, every read additionally receives an SM tag holding `sm`
+    """
+
+    # Single entry mapping: read group id -> fields of that read group
+    read_groups = {id: dict(ID=id, LB=lb, PL=pl, SM=sm, PU=pu)}
+
+    with pysam.AlignmentFile(in_bam_path, threads=threads) as source_bam:
+        header = source_bam.header.as_dict()
+
+        # Provenance: record the invoking command line, version and time in the header
+        write_program_tag(
+            header,
+            program_name='bamReadGroupFormat',
+            command_line=" ".join(sys.argv),
+            version=singlecellmultiomics.__version__,
+            description=f'SingleCellMultiOmics read group formatting, executed at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')
+
+        # The read group mapping is handed to the writer together with the header
+        with sorted_bam_file(out_bam_path, header=header, read_groups=read_groups, input_is_sorted=True) as sink:
+            print('Started writing')
+            for record in source_bam:
+                record.set_tag('RG', id)
+                if sm2reads:
+                    record.set_tag('SM', sm)
+                sink.write(record)
+
+
+if __name__=='__main__':
+    # Command line entry point: gather the read group fields, then rewrite the bam file
+    parser = argparse.ArgumentParser(
+        description='Set read group sample platform unit etc.',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    # Positional: the bam file to read
+    parser.add_argument('bamin', type=str)
+
+    # Read group fields; only the id and the sample name are mandatory
+    parser.add_argument(
+        '-id',
+        help="Read group id",
+        required=True,
+        type=str)
+
+    parser.add_argument(
+        '-pl',
+        help="Read group platform",
+        default='ILLUMINA',
+        type=str)
+
+    parser.add_argument(
+        '-pu',
+        help="Read group platform unit",
+        default='LANE',
+        type=str)
+
+    parser.add_argument(
+        '-sm',
+        help="Read group sample name",
+        required=True,
+        type=str)
+
+    parser.add_argument(
+        '-lb',
+        help="Library",
+        default='mix',
+        type=str)
+
+    # Processing options
+    parser.add_argument(
+        '-t',
+        help="Threads",
+        default=4,
+        type=int)
+
+    parser.add_argument(
+        '--sm2reads',
+        help="Also write the supplied SM tag to all reads",
+        action='store_true')
+
+    # Destination of the rewritten bam file
+    parser.add_argument('-o', type=str, help="output bam file", required=True)
+
+    args = parser.parse_args()
+    # Forward the parsed values unchanged
+    set_read_group(
+        args.bamin, args.o,
+        id=args.id, pl=args.pl, lb=args.lb, sm=args.sm, pu=args.pu,
+        sm2reads=args.sm2reads,
+        threads=args.t)