|
singlecellmultiomics/bamProcessing/bamToBigWig.py
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse

import numpy as np
import pandas as pd
import pyBigWig
import pysam

from singlecellmultiomics.bamProcessing import get_contig_sizes
from singlecellmultiomics.bamProcessing.bamBinCounts import get_binned_counts
from singlecellmultiomics.utils.path import get_valid_filename


def bam_to_wig(bam_paths, write_path, bin_size, method='sum', verbose=False, n_threads=None, sample_mapping_function=None):
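    """Write binned read counts from one or more BAM files to bigWig track(s).

    bam_paths: list of BAM file paths passed to get_binned_counts; the first file
        is also used to obtain contig names and sizes for the bigWig header.
    write_path: output path ending in .bw. When sample_mapping_function is given,
        one bigWig per group is written, with the group name inserted before the
        .bw extension.
    bin_size: genomic bin size in base pairs.
    method: aggregation method; only summing over samples is currently performed.
    n_threads: number of threads used for counting (None uses the default).
    sample_mapping_function: optional callable mapping a sample name (SM tag) to a
        pseudobulk group name; samples mapping to None are skipped.
    """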
|
|
    if verbose:
        print('Counting ...')
    counts = get_binned_counts(bam_paths, bin_size=bin_size, n_threads=n_threads)
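    # counts is expected to be a DataFrame with a (contig, bin start) MultiIndex
    # and one column per sample (SM tag); the code below relies on that layout.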
|
|
    if verbose:
        print('Writing ...')

    if sample_mapping_function is not None:
        with pysam.AlignmentFile(bam_paths[0]) as alignments:
            cs = get_contig_sizes(alignments)
|
|
            targets = dict(zip(counts.columns, map(sample_mapping_function, counts.columns)))
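            # targets maps each sample column in counts to its pseudobulk group;
            # samples for which the mapping returns None are skipped below.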
|
|
            for target in set(targets.values()):
                if target is None:
                    continue
                # Select only cells which have the current target label:
                subset = counts[[cell for cell, t in targets.items() if t == target]]
                # And write:
                values = subset.sum(1).sort_index()

                # Write values
                with pyBigWig.open(write_path.replace('.bw', f'_{str(target)}.bw'), 'w') as out:
                    out.addHeader(list(zip(alignments.references, alignments.lengths)))

                    for contig in alignments.references:
                        if contig not in values.index.get_level_values(0):
                            continue

                        print(f'Writing data for {contig}, for {target}')
                        v = values.loc[[contig]]
                        out.addEntries(
                            list(v.index.get_level_values(0).values),  # Contig
                            list(v.index.get_level_values(1).values),  # Start
                            ends=list(np.clip((v.index.get_level_values(1) + bin_size).values, 0, cs[contig] - 1)),  # End
                            values=np.array(v.values, dtype=np.float32))
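                        # Each bigWig entry covers one bin: (contig, bin start, bin end, summed count),
                        # with the end coordinate clipped so it does not run past the contig size.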
|
|
    else:
        with pysam.AlignmentFile(bam_paths[0]) as alignments, pyBigWig.open(write_path, 'w') as out:

            cs = get_contig_sizes(alignments)
            # Write header
            out.addHeader(list(zip(alignments.references, alignments.lengths)))
            values = counts.sum(1).sort_index()
            print(values)
            # Write values
            for contig in alignments.references:
                if contig not in values.index.get_level_values(0):
                    continue
                print(f'Writing data for {contig}')
                v = values.loc[[contig]]
                out.addEntries(
                    list(v.index.get_level_values(0).values),  # Contig
                    list(v.index.get_level_values(1).values),  # Start
                    ends=list(np.clip((v.index.get_level_values(1) + bin_size).values, 0, cs[contig] - 1)),  # End
                    values=np.array(v.values, dtype=np.float32))
|
|
if __name__ == '__main__':

    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Bam file to bigwig. Counts are based on the DS tag by default, falling back to the start coordinate of R1 when the tag is not available. Reads flagged as qcfail or duplicate are not counted.')
|
|
    argparser.add_argument('alignmentfiles', type=str, nargs='+')
    argparser.add_argument('-o', type=str, required=True, help='Output path (.bw)')
    argparser.add_argument('-t', type=int, help='Number of threads. Uses all available threads if not supplied')

    argparser.add_argument('-bin_size', type=int, required=True, help='Bin size in base pairs')
|
|
    pseudobulk_gr = argparser.add_argument_group('Pseudobulk settings')
    pseudobulk_gr.add_argument(
        '-pseudobulk_SM_csv',
        type=str,
        help="""Path to a CSV file that maps every barcode index (SM tag) to the group it belongs to.
        The CSV file has no header and two columns: the first column contains the barcode index (SM tag),
        the second the target sample name. Multiple barcode indices can share the same target sample name,
        which creates a pseudobulk signal. Expects a comma as delimiter."""
    )
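    # Example -pseudobulk_SM_csv contents (hypothetical barcode and group names):
    #
    #   cell-001,cluster1
    #   cell-002,cluster1
    #   cell-003,cluster2
    #
    # The first two barcodes would be summed into one pseudobulk track for
    # cluster1, the third into a separate track for cluster2.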
|
|
    args = argparser.parse_args()
    assert args.o.endswith('.bw'), 'The output path (-o) should end with .bw'
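    # The .bw extension is required: in pseudobulk mode bam_to_wig derives the
    # per-group output names by inserting the group name before this extension.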
|
|
    sample_mapping_function = None
    if args.pseudobulk_SM_csv is not None:
        sm_sample_map = {str(sm): get_valid_filename(str(sample))
                         for sm, sample in pd.read_csv(args.pseudobulk_SM_csv, header=None, index_col=0).iloc[:, 0].to_dict().items()}

        def sample_mapping_function(s):
            return sm_sample_map.get(s)
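        # SM tags that do not appear in the CSV map to None and are therefore
        # excluded from every pseudobulk track by bam_to_wig.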
|
|
    bam_to_wig(args.alignmentfiles, args.o, args.bin_size, verbose=True, sample_mapping_function=sample_mapping_function, n_threads=args.t)
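
# Example invocations (hypothetical file names and bin size):
#
#   bamToBigWig.py sample.bam -o coverage.bw -bin_size 1000
#       writes a single track with counts summed per 1000 bp bin.
#
#   bamToBigWig.py sample.bam -o coverage.bw -bin_size 1000 -pseudobulk_SM_csv groups.csv
#       writes one track per group listed in groups.csv, e.g. coverage_cluster1.bw.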