[45ad7e]: / singlecellmultiomics / utils / blockzip.py

Download this file

163 lines (131 with data), 5.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from Bio import bgzf
import os
class BlockZip():
def verify(self):
prev_contig = None
prev_pos = None
for line in self.bgzf_handle:
if len(line) == 0:
continue
line_contig, line_pos, line_strand, rest = self.read_file_line(line)
if prev_pos is not None and line_pos<prev_pos and line_contig==prev_contig:
raise ValueError('Corrupted mapfile')
prev_contig = line_contig
prev_pos = line_pos
def __init__(self, path, mode='r', read_all=False):
"""
Store tabular information tied to genomic locations in a bgzipped file
Args:
path (str) : path to file
mode (str) : mode, r: read, w: write
read_all(bool) : when enabled all data is read from the file and the handles are closed
"""
self.path = path
self.index_path = f'{path}.idx'
self.prev_contig = None
self.mode = mode
self.index = {}
self.cache = {}
if self.mode == 'w':
self.bgzf_handle = bgzf.BgzfWriter(self.path, 'w')
self.index_handle = open(self.index_path, 'wt')
elif self.mode == 'r':
if not os.path.exists(self.path):
raise ValueError(f'BGZIP index file missing at {self.path}')
self.bgzf_handle = bgzf.BgzfReader(self.path, 'rt')
if not os.path.exists(self.index_path):
raise ValueError(
f'BGZIP index file missing at {self.index_path}')
self.index_handle = open(self.index_path, 'rt')
for line in self.index_handle:
contig, start = line.strip().split()
self.index[contig] = int(start)
if read_all:
for line in self.bgzf_handle:
if len(line) == 0:
continue
line_contig, line_pos, line_strand, rest = self.read_file_line(line)
#print((line_pos, line_strand,rest))
if not line_contig in self.cache:
self.cache[line_contig] = {}
self.cache[line_contig][(line_pos, line_strand)] = rest
cpos = line_pos
self.bgzf_handle.close()
self.bgzf_handle=None
self.index_handle.close()
self.index_handle=None
else:
raise ValueError('Mode can be r or w')
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
self.bgzf_handle.close()
self.index_handle.close()
def write(self, contig, position, strand, data):
"""
Write information for location contig/postion/strand
!! Write the contig data per contig, random mixing of contigs will
result in a corrupted file
Args:
contig(str)
postion(int)
strand(bool)
data(str)
"""
assert(self.mode == 'w')
if self.prev_contig is None or self.prev_contig != contig:
self.index_handle.write(
f'{contig}\t{int(self.bgzf_handle.tell())}\n')
self.bgzf_handle.write(
f'{contig}\t{position}\t{"+-"[strand]}\t{data}\n')
self.prev_contig = contig
def __iter__(self):
"Get iterator going over all lines in the file"
assert(self.mode == 'r')
yield from iter(self.bgzf_handle)
def read_file_line(self, line):
line_contig, line_pos, line_strand, rest = line.strip().split(None, 3)
return line_contig, int(line_pos), line_strand == '-', rest
def __getitem__(self, contig_position_strand):
"""Obtain data at the supplied contig position and strand
Args:
contig_position_strand : tuple of (
contig(str)
postion(int)
strand(bool))
Returns:
result (str) : data stored for the genomic location, returns None
when no data is available
"""
contig, position, strand = contig_position_strand
if contig not in self.cache and contig in self.index:
self.read_contig_to_cache(contig)
if contig in self.cache:
return self.cache[contig].get((position, strand), None)
def read_contig_to_cache(self, contig, region_start=None, region_end=None):
if contig not in self.index:
return
self.cache[contig] = {}
# Seek to the start:
self.bgzf_handle.seek(self.index[contig])
while True:
try:
line = self.bgzf_handle.readline()
if len(line) == 0:
break
line_contig, line_pos, line_strand, rest = self.read_file_line(
line)
if line_contig != contig:
break
#print((line_pos, line_strand,rest))
if region_start is not None and line_pos<region_start:
continue
if region_end is not None and line_pos>region_end:
break
self.cache[contig][(line_pos, line_strand)] = rest
except Exception as e:
raise
if line_contig != contig:
break