# singlecellmultiomics/utils/export.py
import pandas as pd
from singlecellmultiomics.bamProcessing import get_contig_sizes
from collections import defaultdict
import pysam
import pyBigWig
import numpy as np
import gzip
import pickle

def dataframe_to_wig(df: pd.DataFrame, wig_path: str, span: int = 1, stepper: str = "variableStep",
                     graphType: str = "points", offset: int = 1, extra_header=''):
    """
    Export a dataframe indexed by (chrom, position) to a WIG file.

    Args:
        df: pandas.DataFrame to export to wig, expected format: multi_index (chrom,position)
            with columns: value,
        wig_path(str) : write WIG file to this location
        span(int) : span attribute written into every chromosome header line
        stepper(str) : WIG stepper type, e.g. "variableStep"
        graphType(str) : bar/points/line
        offset(int) : value added to every written position (WIG is 1-based)
        extra_header : extra text appended to every chromosome header line

    Returns:

    """
    with open(wig_path, 'w') as handle:
        for chrom, per_chrom in df.sort_index().groupby(level=0):
            # One header line per chromosome:
            handle.write(f'{stepper} chrom={chrom} graphType={graphType} span={span} {extra_header}\n')
            for index_key, row in per_chrom.iterrows():
                n_levels = len(index_key)
                if n_levels == 2:
                    chrom, pos = index_key
                elif n_levels == 3:
                    # Third index level (e.g. strand) is ignored:
                    chrom, pos, _ = index_key
                else:
                    raise ValueError('Supply a dataframe with a multi index in the format (chrom, pos) ')
                # Prefer an explicit 'value' column, otherwise take the first column:
                value = row['value'] if 'value' in row else row.iloc[0]
                handle.write(f"{pos + offset}\t{value}\n")
def series_to_bigwig(series,  source_bam:str, write_path:str,bin_size: int=None):
    """
    Write pd.Series to a bigwig file

    Args:
        series (pd.Series) : Series to write to the bigwig, keys are multi-dimensional (contig, start)

        source_bam(str): path to source bam file to extract contig ordering from

        write_path(str): write bigwig file here

        bin_size(int) : bin_size, set to None to use a bin size of 1

    """
    # Sort a copy rather than sorting in place: the previous implementation
    # used sort_index(inplace=True), which mutated the caller's Series as a
    # hidden side effect.
    series = series.sort_index()
    with pysam.AlignmentFile(source_bam) as alignments, pyBigWig.open(write_path,'w') as out:

        cs = get_contig_sizes(alignments)
        # Write header (contig names and lengths, in bam header order):
        out.addHeader(list(zip(alignments.references, alignments.lengths)))

        for contig in series.index.get_level_values(0).unique():

            start = series.loc[contig].index.astype(int)
            contig_list = [contig]*len(start)

            if bin_size is None:
                end = np.array(start)+1
            else:
                # Clip bin ends so they never run past the contig boundary:
                end = np.clip( np.array(start)+bin_size, 0, cs[contig] )

            values_to_write = np.array(series.loc[contig].values,
                                       dtype=np.float64)

            out.addEntries(
                contig_list, #Contig
                list(start), #Start
                ends= list(end), #end
                values= values_to_write)
def write_pickle(obj, output_path: str ):
    """Serialize obj with pickle and write it gzip-compressed to output_path."""
    with gzip.open(output_path, 'wb') as handle:
        pickle.dump(obj, handle)
def read_pickle(path: str):
    """Load and return an object from a gzip-compressed pickle file at path.

    NOTE: unpickling executes arbitrary code; only read trusted files.
    """
    with gzip.open(path, 'rb') as handle:
        return pickle.load(handle)
def write_bigwig(write_locations:dict, write_values:dict,  source_bam:str, write_path:str,bin_size: int=None):
    """
    Write (location / values dictionaries) to a bigwig file

    Args:
        write_locations(dict) : Locations to write to, grouped by contig, {contig:[location (int), ..]}

        write_values(dict) : Value to write, grouped by contig {contig:{location: value (float), ..}}

        source_bam(str): path to source bam file to extract contig ordering from

        write_path(str): write bigwig file here

        bin_size(int) : bin_size, set to None to use a bin size of 1

    Raises:
        KeyError : when a location in write_locations has no entry in write_values

    """
    with pysam.AlignmentFile(source_bam) as alignments, pyBigWig.open(write_path,'w') as out:

        cs = get_contig_sizes(alignments)
        # Write header (contig names and lengths, in bam header order):
        out.addHeader(list(zip(alignments.references, alignments.lengths)))

        # Iterate contigs in bam header order; bigwig entries must be sorted.
        for contig in alignments.references:
            if contig not in write_locations:
                continue

            contig_list = [contig]*len(write_locations[contig])
            start = sorted(write_locations[contig])
            if bin_size is None:
                end = np.array(start)+1
            else:
                # Clip bin ends to the contig size (as series_to_bigwig does);
                # entries extending past the contig end are invalid in a bigwig.
                end = np.clip( np.array(start)+bin_size, 0, cs[contig] )

            values_to_write = np.array( [write_values[contig][p] for p in start] , dtype=np.float32)

            out.addEntries(
                contig_list, #Contig
                list(start), #Start
                ends= list(end), #end
                values= values_to_write)