[45ad7e]: / singlecellmultiomics / bamProcessing / split_double_BAM.py

Download this file

173 lines (132 with data), 6.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python
import sys, argparse, datetime
import collections
import os
import singlecellmultiomics
import collections
import itertools
import numpy as np
import random
import pysam
import pysamiterators
import matplotlib.colors
from importlib import reload
import pandas as pd
from scipy.interpolate import interp1d
from more_itertools import chunked
from more_itertools import windowed
import numpy as np
from singlecellmultiomics.bamProcessing import sorted_bam_file
from singlecellmultiomics.bamProcessing.bamToCountTable import coordinate_to_bins
def main():
parser = argparse.ArgumentParser(description='Dual signal unmixing, through a probablity matrix for each cell across bins probability a read is assigned to signal 1. Use this prob matrix with bam file to split a bam file into signal 1 and signal 2')
parser.add_argument('-inbam', metavar='INFILE',
help='Input bam file')
parser.add_argument('-inprobmat', metavar='INFILE',
help='Tab sep matrix file. Columns are cell names (first fcolumn is ""). Rows are genomic bins. Values are probability of reads in bin assigned to mark1.')
parser.add_argument('-outdir', metavar='OUTDIR',
help='Output directory for bams. Full name to be specified in script')
parser.add_argument('-mapq', metavar='INTEGER 0 to 60', default=0, type=int,
help='Minimum quality of read to be considered')
parser.add_argument('-binsize', metavar='Genomic binsize', default=50000, type=int,
help='Binsize of genomic bins to consider (assumes row names are defined by nearest 50kb bins)')
parser.add_argument('--interpolation', action='store_true',
help='Makes a linear interpolation of the bins in your probability matrix (no interpolation across chromosomes).')
parser.add_argument('--quiet', '-q', action='store_true',
help='Suppress some print statements')
parser.add_argument('--logfile', '-l', metavar='LOGFILE', default=None,
help='Write arguments to logfile')
args = parser.parse_args()
# store command line arguments for reproducibility
CMD_INPUTS = ' '.join(['python'] + sys.argv) # easy printing later
# store argparse inputs for reproducibility / debugging purposes
args_dic = vars(args)
# ARG_INPUTS = ['%s=%s' % (key, val) for key, val in args_dic.iteritems()] # for python2
ARG_INPUTS = ['%s=%s' % (key, val) for key, val in args_dic.items()] # for python3
ARG_INPUTS = ' '.join(ARG_INPUTS)
# Print arguments supplied by user
if not args.quiet:
if args.logfile is not None:
sys.stdout = open(args.logfile, "w+")
print(datetime.datetime.now().strftime('Code output on %c'))
print('Command line inputs:')
print(CMD_INPUTS)
print('Argparse variables:')
print(ARG_INPUTS)
p = pd.read_csv(args.inprobmat, sep="\t", index_col=0)
def parse_bin_name(binname):
chrname, coords = binname.split(':')
start, end = coords.split('-')
return chrname, int(start), int(end)
if not args.interpolation:
prob = p
if args.interpolation:
def interpolate_prob_mat(p):
new_rows = []
for index, (binA_orign, binB_orign) in enumerate(windowed(p.index, 2)):
binA = binA_orign #parse_bin_name(binA_orign)
binB = binB_orign #parse_bin_name(binB_orign)
if binA[0] != binB[0]:
continue
if binA[2] > binB[1]:
raise ValueError('The input is not sorted')
contig = binA[0]
binSize = binA[2] - binA[1]
new_rows.append(p.loc[binA_orign, :])
start, end = binA[2], binB[1]
for new_bin_start in range(binA[2], binB[1], binSize):
new_bin_end = new_bin_start + binSize
new_bin_centroid = new_bin_start + binSize*0.5
# for every cell do interpolation
dx = end-start
d = (new_bin_centroid-start)
dy = p.loc[binB_orign, :] - p.loc[binA_orign, :]
interpolated = (dy/dx)*d + p.loc[binA_orign, :]
interpolated.name = (contig, new_bin_start, new_bin_end)
new_rows.append(interpolated)
prob = pd.DataFrame(new_rows)
indexNames = [f'{chromosomes}:{starts}-{ends}' for chromosomes, starts, ends in prob.index]
prob.index = indexNames
return prob
p.index = pd.MultiIndex.from_tuples([parse_bin_name(t) for t in p.index])
p = p.sort_index(0)
prob = interpolate_prob_mat(p)
prob.to_csv(os.path.join(args.outdir, "probabilityMatrix_linearInterpolated.csv"), sep='\t')
#==========End interpolation============================================
prob.index = pd.MultiIndex.from_tuples([parse_bin_name(t.replace('chr', '')) for t in prob.index])
prob.index.set_names(["chr", "start", "end"], inplace=True)
bamFile = args.inbam
wrote = 0
infboth = os.path.join(args.outdir, "both.bam")
infA = os.path.join(args.outdir, "splitted_A.bam")
infB = os.path.join(args.outdir, "splitted_B.bam")
with pysam.AlignmentFile(bamFile) as f:
with sorted_bam_file(infboth, f) as both, sorted_bam_file(infA, origin_bam=f) as a, sorted_bam_file(infB, origin_bam=f) as b:
for readId, (R1, R2) in enumerate(pysamiterators.MatePairIterator(f)):
if R1.mapping_quality < args.mapq & R2.mapping_quality < args.mapq:
continue # one of two reads should have sufficient MAPQ. Less stringent. Should be OK?
if R1.is_duplicate:
continue
bin_start, bin_end = coordinate_to_bins(R1.get_tag('DS'), args.binsize, args.binsize)[0]
# Obtain prob:
bin_name = (R1.reference_name, bin_start, bin_end)
if not bin_name in prob.index:
continue
if R1.get_tag('SM') not in prob.columns:
continue
p = prob.loc[bin_name, R1.get_tag('SM')]
wrote += 1
group = 'A' if np.random.random() <= p else 'B'
R1.set_tag('Gr', group)
R2.set_tag('Gr', group)
if group == 'A':
a.write(R1)
a.write(R2)
else:
b.write(R1)
b.write(R2)
both.write(R1)
both.write(R2)
print("Number of reads written:" + str(wrote))
if __name__ == '__main__':
main()