b/singlecellmultiomics/bamProcessing/bamToBigWig.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pyBigWig
import pysam
from singlecellmultiomics.bamProcessing.bamBinCounts import get_binned_counts
import pandas as pd
import argparse
from singlecellmultiomics.bamProcessing import get_contig_sizes
import numpy as np
from singlecellmultiomics.utils.path import get_valid_filename

def bam_to_wig(bam_paths, write_path, bin_size, method='sum', verbose=False, n_threads=None, sample_mapping_function=None):
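    """Aggregate binned read counts from one or more BAM files into bigWig track(s).

    Args:
        bam_paths (list): paths to the input BAM files.
        write_path (str): output bigWig path, should end in '.bw'.
        bin_size (int): bin size in base pairs.
        method (str): aggregation method; counts are currently always summed.
        verbose (bool): print progress messages.
        n_threads (int): number of threads used for counting; None uses all available.
        sample_mapping_function (callable): optional function that maps a sample name
            (SM tag) to a pseudobulk group label. When supplied, one bigWig is written
            per group (suffix '_<group>.bw'); samples mapping to None are skipped.
            When not supplied, a single bigWig with the summed counts of all samples
            is written to write_path.
    """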
    if verbose:
        print('Counting ...')
    counts = get_binned_counts(bam_paths, bin_size=bin_size, n_threads=n_threads)
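    # Note: the indexing below assumes that `counts` has a (contig, bin start)
    # MultiIndex on the rows and one column per sample (SM tag).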

    if verbose:
        print('Writing ...')

    if sample_mapping_function is not None:
        # Write one bigWig per pseudobulk group:
        with pysam.AlignmentFile(bam_paths[0]) as alignments:
            cs = get_contig_sizes(alignments)

            # Map every sample (SM tag) to its pseudobulk group:
            targets = dict(zip(counts.columns, map(sample_mapping_function, counts.columns)))
            for target in set(targets.values()):
                if target is None:
                    continue
                # Select only cells which have the current target label:
                subset = counts[[cell for cell, t in targets.items() if t == target]]
                # And write:
                values = subset.sum(1).sort_index()

                # Write values
                with pyBigWig.open(write_path.replace('.bw', f'_{str(target)}.bw'), 'w') as out:
                    out.addHeader(list(zip(alignments.references, alignments.lengths)))

                    for contig in alignments.references:

                        if contig not in values.index.get_level_values(0):
                            continue

                        print(f'Writing data for {contig}, for {target}')
                        v = values.loc[[contig]]
                        out.addEntries(
                            list(v.index.get_level_values(0).values),  # Contig
                            list(v.index.get_level_values(1).values),  # Start
                            ends=list(np.clip((v.index.get_level_values(1) + bin_size).values, 0, cs[contig] - 1)),  # End
                            values=np.array(v.values, dtype=np.float32))

    else:
        # No pseudobulk mapping: write a single bigWig with the summed counts of all samples.
        with pysam.AlignmentFile(bam_paths[0]) as alignments, pyBigWig.open(write_path, 'w') as out:

            cs = get_contig_sizes(alignments)
            # Write header
            out.addHeader(list(zip(alignments.references, alignments.lengths)))
            values = counts.sum(1).sort_index()
            if verbose:
                print(values)
            # Write values
            for contig in alignments.references:

                if contig not in values.index.get_level_values(0):
                    continue
                print(f'Writing data for {contig}')
                v = values.loc[[contig]]
                out.addEntries(
                    list(v.index.get_level_values(0).values),  # Contig
                    list(v.index.get_level_values(1).values),  # Start
                    ends=list(np.clip((v.index.get_level_values(1) + bin_size).values, 0, cs[contig] - 1)),  # End
                    values=np.array(v.values, dtype=np.float32))


if __name__ == '__main__':

    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Bam file to bigwig. Counts are based on the DS tag by default; otherwise the start coordinate of R1 is used. Reads flagged as qcfail or duplicate are not counted.')

    argparser.add_argument('alignmentfiles', type=str, nargs='+')
    argparser.add_argument('-o', type=str, required=True, help='Output path (.bw)')
    argparser.add_argument('-t', type=int, help='Number of threads. Uses all available threads if not supplied')

    argparser.add_argument('-bin_size', type=int, required=True, help='Bin size in base pairs')

    pseudobulk_gr = argparser.add_argument_group('Pseudobulk settings')
    pseudobulk_gr.add_argument(
        '-pseudobulk_SM_csv',
        type=str,
        help="""Path to a CSV file that maps every barcode index (SM tag) to the group it belongs to.
        The CSV file has no header and two columns: the first column contains the sample name (SM tag),
        the second the target sample name. Multiple barcode indices can share the same target name,
        which creates a pseudobulk signal. Expects a comma as delimiter."""
        )
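    # Illustrative example of the expected -pseudobulk_SM_csv layout (sample and group
    # names below are hypothetical): no header, two comma-separated columns mapping
    # each SM tag to a pseudobulk group:
    #   sample_A1,cluster_1
    #   sample_A2,cluster_1
    #   sample_B1,cluster_2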

    args = argparser.parse_args()
    assert args.o.endswith('.bw'), 'The output path (-o) should end with .bw'

    sample_mapping_function = None
    if args.pseudobulk_SM_csv is not None:
        # Map every SM tag to a filesystem-safe group name:
        sm_sample_map = {str(sm): get_valid_filename(str(sample))
                         for sm, sample in pd.read_csv(args.pseudobulk_SM_csv, header=None, index_col=0).iloc[:, 0].to_dict().items()}

        def sample_mapping_function(s):
            return sm_sample_map.get(s)

    bam_to_wig(args.alignmentfiles, args.o, args.bin_size, verbose=True,
               sample_mapping_function=sample_mapping_function, n_threads=args.t)
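
# Example invocation (file names and bin size are illustrative):
#   python bamToBigWig.py sample1.bam sample2.bam -o counts.bw -bin_size 10000 \
#       -pseudobulk_SM_csv sm_to_group.csv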