|
a |
|
b/singlecellmultiomics/utils/export.py |
|
|
1 |
import pandas as pd |
|
|
2 |
from singlecellmultiomics.bamProcessing import get_contig_sizes |
|
|
3 |
from collections import defaultdict |
|
|
4 |
import pysam |
|
|
5 |
import pyBigWig |
|
|
6 |
import numpy as np |
|
|
7 |
import gzip |
|
|
8 |
import pickle |
|
|
9 |
|
|
|
10 |
def dataframe_to_wig(df: pd.DataFrame, wig_path: str, span: int = 1, stepper: str = "variableStep", |
|
|
11 |
graphType: str = "points", offset: int = 1, extra_header=''): |
|
|
12 |
""" |
|
|
13 |
Args: |
|
|
14 |
df: pandas.DataFrame to export to wig, expected format: multi_index (chrom,position) |
|
|
15 |
with columns: value, |
|
|
16 |
wig_path(str) : write WIG file to this location |
|
|
17 |
graphType: bar/points/line |
|
|
18 |
stepper: |
|
|
19 |
span: |
|
|
20 |
Returns: |
|
|
21 |
|
|
|
22 |
""" |
|
|
23 |
with open(wig_path, 'w') as out: |
|
|
24 |
for chrom, chrom_data in df.sort_index().groupby(level=0): |
|
|
25 |
# Write the header: |
|
|
26 |
out.write(f'{stepper} chrom={chrom} graphType={graphType} span={span} {extra_header}\n') |
|
|
27 |
for k, row in chrom_data.iterrows(): |
|
|
28 |
if len(k) == 2: |
|
|
29 |
(chrom, pos) = k |
|
|
30 |
elif len(k) == 3: |
|
|
31 |
(chrom, pos, _) = k |
|
|
32 |
else: |
|
|
33 |
raise ValueError('Supply a dataframe with a multi index in the format (chrom, pos) ') |
|
|
34 |
out.write(f"{pos+offset}\t{row.iloc[0] if 'value' not in row else row['value']}\n") |
|
|
35 |
|
|
|
36 |
|
|
|
37 |
|
|
|
38 |
def series_to_bigwig(series, source_bam:str, write_path:str,bin_size: int=None): |
|
|
39 |
""" |
|
|
40 |
Write pd.Series to a bigwig file |
|
|
41 |
|
|
|
42 |
Args: |
|
|
43 |
series (pd.Series) : Series to write to the bigwig, keys are multi-dimensional (contig, start) |
|
|
44 |
|
|
|
45 |
source_bam(str): path to source bam file to extract contig ordering from |
|
|
46 |
|
|
|
47 |
write_path(str): write bigwig file here |
|
|
48 |
|
|
|
49 |
bin_size(int) : bin_size, set to None to use a bin size of 1 |
|
|
50 |
|
|
|
51 |
""" |
|
|
52 |
series.sort_index(inplace=True) |
|
|
53 |
with pysam.AlignmentFile(source_bam) as alignments, pyBigWig.open(write_path,'w') as out: |
|
|
54 |
|
|
|
55 |
cs = get_contig_sizes(alignments) |
|
|
56 |
# Write header |
|
|
57 |
out.addHeader(list(zip(alignments.references, alignments.lengths))) |
|
|
58 |
|
|
|
59 |
for contig in series.index.get_level_values(0).unique(): |
|
|
60 |
|
|
|
61 |
start = series.loc[contig].index.astype(int) |
|
|
62 |
contig_list = [contig]*len(start) |
|
|
63 |
|
|
|
64 |
|
|
|
65 |
if bin_size is None: |
|
|
66 |
end = np.array(start)+1 |
|
|
67 |
else: |
|
|
68 |
end = np.clip( np.array(start)+bin_size, 0, cs[contig] ) |
|
|
69 |
|
|
|
70 |
values_to_write = np.array(series.loc[contig].values , |
|
|
71 |
dtype=np.float64) |
|
|
72 |
|
|
|
73 |
out.addEntries( |
|
|
74 |
contig_list, #Contig |
|
|
75 |
list(start), #Start |
|
|
76 |
ends= list(end), #end |
|
|
77 |
values= values_to_write) |
|
|
78 |
|
|
|
79 |
|
|
|
80 |
def write_pickle(obj, output_path: str ): |
|
|
81 |
with gzip.open(output_path,'wb') as out: |
|
|
82 |
pickle.dump(obj, out) |
|
|
83 |
|
|
|
84 |
def read_pickle(path: str): |
|
|
85 |
with gzip.open(path,'rb') as out: |
|
|
86 |
return pickle.load(out) |
|
|
87 |
|
|
|
88 |
|
|
|
89 |
def write_bigwig(write_locations:dict, write_values:dict, source_bam:str, write_path:str,bin_size: int=None): |
|
|
90 |
""" |
|
|
91 |
Write (location / values dictionaries) to a bigwig file |
|
|
92 |
|
|
|
93 |
Args: |
|
|
94 |
write_locations(dict) : Locations to write to, grouped by contig, {contig:[location (int), ..] |
|
|
95 |
|
|
|
96 |
write_values(dict) : Value to write, grouped by contig {contig:{location: value (float), ..}}, |
|
|
97 |
|
|
|
98 |
source_bam(str): path to source bam file to extract contig ordering from |
|
|
99 |
|
|
|
100 |
write_path(str): write bigwig file here |
|
|
101 |
|
|
|
102 |
bin_size(int) : bin_size, set to None to use a bin size of 1 |
|
|
103 |
|
|
|
104 |
""" |
|
|
105 |
with pysam.AlignmentFile(source_bam) as alignments, pyBigWig.open(write_path,'w') as out: |
|
|
106 |
|
|
|
107 |
cs = get_contig_sizes(alignments) |
|
|
108 |
# Write header |
|
|
109 |
out.addHeader(list(zip(alignments.references, alignments.lengths))) |
|
|
110 |
|
|
|
111 |
for contig in alignments.references: |
|
|
112 |
if not contig in write_locations: |
|
|
113 |
continue |
|
|
114 |
|
|
|
115 |
contig_list = [contig]*len(write_locations[contig]) |
|
|
116 |
start = sorted(list(write_locations[contig])) |
|
|
117 |
if bin_size is None: |
|
|
118 |
end = np.array(start)+1 |
|
|
119 |
else: |
|
|
120 |
end = np.array(start)+bin_size |
|
|
121 |
|
|
|
122 |
values_to_write = np.array( [write_values[contig][p] for p in start] , dtype=np.float32) |
|
|
123 |
|
|
|
124 |
out.addEntries( |
|
|
125 |
contig_list, #Contig |
|
|
126 |
list(start), #Start |
|
|
127 |
ends= list(end), #end |
|
|
128 |
values= values_to_write) |