singlecellmultiomics/bamProcessing/split_bam_by_cluster.py

#!/usr/bin/env python
'''
DESCRIPTION
    Split bams into pseudobulks based on metadata
FOR HELP
    python split_bam_by_cluster.py --help
AUTHOR: Jake Yeung (j.yeung@hubrecht.eu)
LAB: Quantitative Biology Lab (https://www.hubrecht.eu/research-groups/van-oudenaarden-group/)
CREATED ON: 2020-01-09
LAST CHANGE: see git log
LICENSE: MIT License (see: http://opensource.org/licenses/MIT)
'''
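
# Example invocation (a sketch; the bam, annotation file and output directory
# names below are hypothetical):
#
#   python split_bam_by_cluster.py \
#       -infile merged.tagged.bam \
#       -annotfile cluster_annotations.txt \
#       -outdir split_bams \
#       -mapq 40
#
# With clusters "cluster1" and "cluster2" in the annotation file, this writes
# split_bams/merged.tagged.cluster1.sorted.bam and
# split_bams/merged.tagged.cluster2.sorted.bam, each with a .bai index.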
import sys, argparse, datetime
import pysam
import csv
import os


def reformat_header(header, add_prefix="chr"):
    new_header = header.to_dict()
    contigs = new_header['SQ']
    new_contigs = []
    for contig in contigs:
        contig['SN'] = ''.join([add_prefix, contig['SN']])
        new_contigs.append(contig)
    new_header['SQ'] = new_contigs
    return pysam.AlignmentHeader.from_dict(new_header)
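
# Usage sketch for reformat_header (mirrors what main() does when
# --add_chr_prefix is set): contig names such as '1', '2', 'X' in the input
# header become 'chr1', 'chr2', 'chrX' in the headers of the output bams.
#
#   new_header = reformat_header(inbam.header, add_prefix="chr")
#   outbam = pysam.AlignmentFile("out.bam", "wb", header=new_header)  # "out.bam" is a placeholder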


def scan_bam_get_sampnames(infile, tag="SM"):
    '''
    Read bam file and get sampnames
    '''
    tags = set()
    with pysam.AlignmentFile(infile, "rb") as inbam:
        for read in inbam:
            tags.add(readTag(read, tag))
    return tags
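
# Note: scan_bam_get_sampnames is only referenced from the commented-out
# sanity check in main() below; it scans every read in the bam, so it can be
# slow on large files.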


def readTag(read, tag, missing='Missing', defective='Defective'):
    '''From /hpc/hub_oudenaarden/bdebarbanson/internalTools/modularDemultiplexer/taggedBamFileToCountTable.py'''
    value = None
    if tag == 'chrom':
        return str(read.reference_name)
    if not read.has_tag(tag):
        return missing
    else:
        try:
            value = read.get_tag(tag)
        except Exception:
            value = defective
        return value
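
# The -annotfile read in main() is a tab-separated table. A minimal example
# (the cell names and cluster labels here are made up):
#
#   cell                    cluster
#   BM-celltag-AACGTTGC     HSCs
#   BM-celltag-GGCATAAC     Bcells
#
# The first column must match the bam tag selected with -tagid (SM by
# default); the second column decides which output bam each read is written to.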


def main():
    parser = argparse.ArgumentParser(description='Split bams into pseudobulks based on metadata')
    parser.add_argument('-infile', metavar='INFILE', required=True,
                        help='Merged bam file')
    parser.add_argument('-annotfile', metavar='INFILE', required=True,
                        help='Annotations of merged clusters. Expects the first row to be column names (skipped), '
                             'the first column to be cell names (matching the SM tag in the bam) and the second '
                             'column to be the cluster name, which is used to name the output bam files')
    parser.add_argument('-outdir', metavar='OUTDIR', required=True,
                        help='Output directory')
    parser.add_argument('-tagid', metavar='TAG NAME', required=False, default="SM",
                        help='Tag name to match the first column in the annot file (default is SM)')
    parser.add_argument('--annot_no_colnames', action='store_true', help='Set if the annot file has no column names')
    parser.add_argument('-mapq', metavar="MAPQ value", required=True, default=40, type=int)
    parser.add_argument('--add_chr_prefix', action='store_true', help="Add chr prefix to chromosome names")
    parser.add_argument('--overwrite', action='store_true', help="Do not check whether output bam files already exist, just write to them")
    parser.add_argument('--quiet', '-q', action='store_true',
                        help='Suppress some print statements')
    parser.add_argument('-logfile', '-l', metavar='LOGFILE', default=None,
                        help='Write arguments to logfile')
    args = parser.parse_args()

    # store command line arguments for reproducibility
    CMD_INPUTS = ' '.join(['python'] + sys.argv)  # easy printing later
    # store argparse inputs for reproducibility / debugging purposes
    args_dic = vars(args)
    # ARG_INPUTS = ['%s=%s' % (key, val) for key, val in args_dic.iteritems()]  # for python2
    ARG_INPUTS = ['%s=%s' % (key, val) for key, val in args_dic.items()]  # for python3
    ARG_INPUTS = ' '.join(ARG_INPUTS)

    # Print arguments supplied by user
    if not args.quiet:
        if args.logfile is not None:
            sys.stdout = open(args.logfile, "w+")
        print(datetime.datetime.now().strftime('Code output on %c'))
        print('Command line inputs:')
        print(CMD_INPUTS)
        print('Argparse variables:')
        print(ARG_INPUTS)

    samp_tag_id = args.tagid
    samp_cname_indx = 0
    samp_cluster_indx = 1

    # initialize outbam objects
    bname = os.path.splitext(os.path.basename(args.infile))[0]
    writebamdic = {}  # bam write objs
    sorteddic = {}  # output files for sorted bam files
    unsorteddic = {}  # tmp files unsorted, will be deleted afterwards

    # # get sampnames_all by reading the bam file
    # sampnames_all = scan_bam_get_sampnames(args.infile, tag=samp_tag_id)  # sampnames_all are sample names
    # print(str(len(sampnames_all)) + " sampnames_all found in bam file")
    # # samp_ignored = set()

    # read annot file
    print("Reading annot file...")
    samp2clstr_dic = {}
    samps_good = []
    clstrs_lst = []
    clstrs = set()
    with open(args.annotfile, "r") as annotfile:
        jreader = csv.reader(annotfile, delimiter="\t")
        if not args.annot_no_colnames:
            jheader = next(jreader)
            print("header: %s" % jheader)
        for row in jreader:
            samp, clstr = row[samp_cname_indx], row[samp_cluster_indx]
            if samp not in samp2clstr_dic:
                samp2clstr_dic[samp] = clstr
            else:
                raise Exception('Samp %s is duplicated' % samp)
            samps_good.append(samp)
            clstrs_lst.append(clstr)
            clstrs.add(clstr)
    print("Reading annot file... DONE")

    # # Check that every sample is in sampnames_all
    # print("Checking samps are found in bam...")
    # for samp in samps_good:
    #     assert samp in sampnames_all
    # print("Checking samps are found in bam... DONE")

    # check that no output bam files will clash with existing bams, exit unless --overwrite is set
    if not args.overwrite:
        for clstr in clstrs:
            checktmppath = os.path.join(args.outdir, '.'.join([bname, clstr, "unsorted", "bam"]))
            checkoutpath = os.path.join(args.outdir, '.'.join([bname, clstr, "sorted", "bam"]))
            assert not os.path.exists(checktmppath)
            assert not os.path.exists(checkoutpath)

    dup_count = 0
    assign_count = 0
    unassign_count = 0
print("Splitting bams into clusters ...")
with pysam.AlignmentFile(args.infile, "rb") as inbam:
if args.add_chr_prefix:
new_header = reformat_header(inbam.header, add_prefix = "chr")
for clstr in clstrs:
tmppath = os.path.join(args.outdir, '.'.join([bname, clstr, "unsorted", "bam"]))
outpath = os.path.join(args.outdir, '.'.join([bname, clstr, "sorted", "bam"]))
unsorteddic[clstr] = tmppath
sorteddic[clstr] = outpath
if not args.add_chr_prefix:
writebamdic[clstr] = pysam.AlignmentFile(tmppath, "wb", template = inbam)
else:
writebamdic[clstr] = pysam.AlignmentFile(tmppath, "wb", header = new_header)
for total_count, read in enumerate(inbam):
if read.is_duplicate:
dup_count += 1
continue
readsamp = readTag(read, samp_tag_id)
if readsamp in samp2clstr_dic:
# write to outpuet
clstr = samp2clstr_dic[readsamp]
writebamdic[clstr].write(read)
assign_count += 1
else:
unassign_count += 1

    # close bam files, sort, index and clean up
    for clstr in clstrs:
        writebamdic[clstr].close()
        pysam.sort('-o', sorteddic[clstr], unsorteddic[clstr])
        pysam.index(sorteddic[clstr])
    # clean up temporary unsorted bams
    for clstr in clstrs:
        os.remove(unsorteddic[clstr])
    print("Splitting bams into clusters ... DONE")
    # summarize how reads were distributed across the output bams
    print("Reads assigned: %d, unassigned: %d, duplicates skipped: %d" % (assign_count, unassign_count, dup_count))

    # Print finish time
    if not args.quiet:
        if args.logfile is not None:
            # append so the arguments logged at startup are not overwritten
            sys.stdout = open(args.logfile, "a")
        print(datetime.datetime.now().strftime('Code finished on: %c'))


if __name__ == '__main__':
    main()