[45ad7e]: / singlecellmultiomics / bamProcessing / bamExtractSamples.py

Download this file

136 lines (111 with data), 4.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pysam
import argparse
import sys
from singlecellmultiomics.bamProcessing import get_sample_to_read_group_dict
import collections
from singlecellmultiomics.utils.path import get_valid_filename
def extract_samples( input_bam_path, output_path, capture_samples, head=None, write_group_rg=False, rg_group_prefix=None ):
"""
Extract samples from a bam file
Args:
input_bam_path(str) : path to bam file from which to extract data from specified samples
output_path(str) : prefix of path to write bam files to
capture_samples(dict) : dictionary of lists {group: [sample, sample, sample], }
head(int) : write this amount of reads, then exit
"""
bamFile = pysam.AlignmentFile(input_bam_path, "rb")
# Write to multiple files:
output_handles = {}
print('Groups:')
for group in capture_samples:
header = bamFile.header.copy()
if write_group_rg:
if rg_group_prefix:
rg = rg_group_prefix + group
else:
rg = group
header = header.to_dict()
header['RG'] = [{'ID': rg,
'LB': rg,
'PL': 'ILLUMINA',
'SM': rg,
'PU': '1'}] # reset read group header
output_handles[group] = pysam.AlignmentFile(output_path.replace('.bam',f'{group}.bam'), "wb", header=header)
print(f'\t{group}\t{output_handles[group].filename.decode()}')
sample2handle = {}
sample2group = {}
for group,samples in capture_samples.items():
for sample in samples:
if sample in sample2handle:
raise ValueError(f'The sample {sample} is assigned to more than one group at least:{group} and {sample2group[sample]}')
print(f'\t{sample}\t->{group}')
sample2handle[sample] = output_handles[group]
sample2group[sample] = group
written = 0
for r in bamFile:
sm = r.get_tag('SM')
try:
handle = sample2handle[sm]
except KeyError:
continue
if write_group_rg:
if rg_group_prefix:
r.set_tag('RG', rg_group_prefix + sample2group[sm] )
else:
r.set_tag('RG', sample2group[sm] )
handle.write(r)
written += 1
if head is not None and written >= head:
break
print(f'Filtering finished, {written} reads')
for group,handle in output_handles.items():
handle.close()
try:
os.system(f'samtools index {handle.filename.decode()}')
except Exception as e:
print(e)
pass
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="""Extract samples/cells from bam file, writes to multiple bam files if multiple groups are supplied
""")
argparser.add_argument('bamfile', metavar='bamfile', type=str)
argparser.add_argument('extract', help="""Sample file, a list of samples to extract. First column: sample name, second column (optional) group name""")
argparser.add_argument(
'-o',
type=str,
required=True,
help='output bam path, to the end of the file name the group name is appended')
argparser.add_argument(
'--f',
action='store_true',
help='Force overwrite of existing files')
argparser.add_argument('-head', type=int)
argparser.add_argument('--write_group_rg', action='store_true', help='Set the group label as read-group, requires second GROUP column in the input file')
argparser.add_argument('-rg_group_prefix', type=str, help='Prepend this string tot the read group. Requires --write_group_rg ')
args = argparser.parse_args()
if os.path.exists(args.o) and not args.f:
raise ValueError(
f'The output file {args.o} already exists! Use --f or remove the file')
assert args.o.endswith('.bam'), 'output file path should end with .bam'
capture_samples = collections.defaultdict( set ) # group -> set( samples )
with open(args.extract) as e:
for line in e:
parts = line.strip().split(None, 1)
if len(parts)==0:
continue
if len(parts)==1:
group = ''
sample = parts[0]
elif len(parts)==2:
sample, group = parts
group = get_valid_filename(group)
else:
print(f'Line with issue: {line} :: {parts}')
raise ValueError("Please supply a file with one or two columns: [SAMPLE], or [SAMPLE]{tab}[GROUP]")
capture_samples[group].add( parts[0] )
extract_samples(args.bamfile, args.o, capture_samples, head=args.head, write_group_rg=args.write_group_rg, rg_group_prefix=args.rg_group_prefix )