[41c1e8]: / exseek / scripts / web_server.py

Download this file

369 lines (338 with data), 14.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
#! /usr/bin/env python
"""Web server for the exSeek pipeline.

Serves IGV track files (bigWig, BAM, BED domains, genome references) from
the pipeline output tree and exposes a /locus endpoint that looks up
gene/transcript coordinates in feature databases built from annotation
files (GTF/GFF3/BED/genePred/refGene) or loaded from pickle files.
"""
import os
import sys
import argparse
import logging
import pickle
import json
from flask import Flask, request, send_from_directory, send_file
# timestamped log format shared by all loggers in this process
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
logger = logging.getLogger('web_server')
app = Flask(__name__)
def read_gtf(filename):
    """Parse a GTF file, yielding one tuple per data line.

    Yields (fields, attrs, line, lineno) where fields is the list of
    tab-separated GTF columns, attrs is a dict parsed from column 9
    (``key "value";`` pairs with surrounding quotes stripped), line is the
    raw text and lineno is the 1-based line number.  Comment lines
    (starting with '#') and blank lines are skipped.
    """
    with open(filename, 'r') as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            # skip blank lines (previously crashed with IndexError on c[8])
            # and comment/header lines
            if (not c[0]) or c[0].startswith('#'):
                continue
            attrs = {}
            # column 9 ends with ';', so the last split element is empty
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            yield (c, attrs, line, lineno)
def read_gff3(filename):
    """Parse a GFF3 file, yielding one tuple per data line.

    Yields (fields, attrs, line, lineno) where fields is the list of
    tab-separated GFF3 columns, attrs is a dict parsed from column 9
    (``key=value`` pairs separated by ';'), line is the raw text and
    lineno is the 1-based line number.  Comment lines (starting with '#')
    and blank lines are skipped.
    """
    with open(filename, 'r') as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            # skip blank lines (previously crashed with IndexError on c[8])
            # and comment/directive lines
            if (not c[0]) or c[0].startswith('#'):
                continue
            attrs = {}
            for a in c[8].split(';'):
                a = a.strip()
                # guard against empty segments from trailing ';'
                if not a:
                    continue
                i = a.find('=')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            yield (c, attrs, line, lineno)
def build_feature_db_from_gtf(filename, source=None):
    """Build a name -> [chrom, start, end, strand, source] lookup from a GTF.

    Gene and transcript extents are accumulated over their exon records.
    The returned dict is keyed by gene_id and transcript_id (both with and
    without a trailing '.version' suffix), plus gene_name and
    transcript_name where present.
    """
    genes = {}             # keyed by gene_id attribute
    transcripts = {}       # keyed by transcript_id attribute
    gene_names = {}        # keyed by gene_name attribute
    transcript_names = {}  # keyed by transcript_name attribute
    for fields, attrs, _line, lineno in read_gtf(filename):
        if fields[2] != 'exon':
            continue
        # GTF coordinates are 1-based inclusive; store 0-based starts
        start = int(fields[3]) - 1
        end = int(fields[4])
        gene_id = attrs.get('gene_id')
        if gene_id is None:
            raise ValueError('gene_id attribute not found in GTF file {}:{}'.format(filename, lineno))
        gene = genes.get(gene_id)
        if gene is None:
            # [chrom, start, end, strand, source]
            gene = [fields[0], start, end, fields[6], fields[1]]
            genes[gene_id] = gene
        else:
            gene[1] = min(gene[1], start)
            gene[2] = max(gene[2], end)
        transcript_id = attrs.get('transcript_id')
        if transcript_id is None:
            raise ValueError('transcript_id attribute not found in GTF file {}:{}'.format(filename, lineno))
        transcript = transcripts.get(transcript_id)
        if transcript is None:
            # [chrom, start, end, strand, source]
            transcript = [fields[0], start, end, fields[6], fields[1]]
            transcripts[transcript_id] = transcript
        else:
            transcript[1] = min(transcript[1], start)
            transcript[2] = max(transcript[2], end)
        # name keys share the same mutable feature lists
        gene_name = attrs.get('gene_name')
        if gene_name is not None:
            gene_names[gene_name] = gene
        transcript_name = attrs.get('transcript_name')
        if transcript_name is not None:
            transcript_names[transcript_name] = transcript
    feature_db = dict(genes)
    # also index genes by id with the version suffix stripped
    for key, val in genes.items():
        unversioned = key.split('.')[0]
        if unversioned != key:
            feature_db[unversioned] = val
    feature_db.update(transcripts)
    # also index transcripts by id with the version suffix stripped
    for key, val in transcripts.items():
        unversioned = key.split('.')[0]
        if unversioned != key:
            feature_db[unversioned] = val
    feature_db.update(gene_names)
    feature_db.update(transcript_names)
    return feature_db
def build_feature_db_from_gff3(filename, source=None):
    """Build a name -> [chrom, start, end, strand, source] lookup from a GFF3.

    Uses the 'gene' (or, failing that, 'Name') attribute as the gene key
    and 'transcript_id' as the transcript key; 'Alias' attributes add
    extra keys for the same feature.  Gene/transcript extents are
    accumulated over all matching records.  Extra keys with the
    '.version' suffix stripped are added for gene and transcript ids.
    """
    transcripts = {}
    genes = {}
    aliases = {}
    # only these record types contribute to feature extents
    valid_features = {'exon', 'transcript', 'gene', 'primary_transcript', 'miRNA', 'miRNA_primary_transcript'}
    for c, attrs, line, lineno in read_gff3(filename):
        if c[2] not in valid_features:
            continue
        name = attrs.get('Name')
        gene = attrs.get('gene')
        # try to use gene or Name attribute as gene_id
        if gene is not None:
            gene_id = gene
        elif name is not None:
            gene_id = name
        else:
            # logger.warn is deprecated; also fixed the double negative
            logger.warning('neither Name nor gene attribute found in GFF3 file {}:{}'.format(filename, lineno))
            continue
        # update gene range (GFF3 is 1-based; store 0-based starts)
        feature = genes.get(gene_id)
        if feature is None:
            feature = [c[0], int(c[3]) - 1, int(c[4]), c[6], c[1]]
            genes[gene_id] = feature
        else:
            feature[1] = min(feature[1], int(c[3]) - 1)
            feature[2] = max(feature[2], int(c[4]))
        # BUG FIX: previously read aliases.get('Alias') -- looked up the
        # literal key 'Alias' in the (initially empty) aliases dict, so no
        # alias was ever registered.  Read the record's attributes instead.
        alias = attrs.get('Alias')
        if alias is not None:
            aliases[alias] = feature
        # add transcript
        transcript_id = attrs.get('transcript_id')
        if transcript_id is not None:
            feature = transcripts.get(transcript_id)
            # update transcript range
            if feature is None:
                feature = [c[0], int(c[3]) - 1, int(c[4]), c[6], c[1]]
                transcripts[transcript_id] = feature
            else:
                feature[1] = min(feature[1], int(c[3]) - 1)
                feature[2] = max(feature[2], int(c[4]))
    feature_db = {}
    feature_db.update(genes)
    # strip versions from gene ids
    for key, val in genes.items():
        new_key = key.split('.')[0]
        if new_key != key:
            feature_db[new_key] = val
    feature_db.update(transcripts)
    # strip versions from transcript ids
    for key, val in transcripts.items():
        new_key = key.split('.')[0]
        if new_key != key:
            feature_db[new_key] = val
    feature_db.update(aliases)
    return feature_db
def build_feature_db_from_bed(filename, source='BED'):
    """Build a name -> [chrom, start, end, strand, source] lookup from a BED file.

    Skips blank lines, comment lines and 'track'/'browser' header lines
    (previously these crashed with IndexError).  Records need at least 4
    columns (chrom, start, end, name); when the strand column (6) is
    missing, '.' is stored.
    """
    feature_db = {}
    with open(filename, 'r') as f:
        for line in f:
            c = line.strip().split('\t')
            # skip blanks and BED header/comment lines
            if (not c[0]) or c[0].startswith(('#', 'track', 'browser')):
                continue
            strand = c[5] if len(c) > 5 else '.'
            feature_db[c[3]] = [c[0], int(c[1]), int(c[2]), strand, source]
    return feature_db
def build_feature_db_from_genepred(filename, source='GENEPRED'):
    """Build a name -> [chrom, txStart, txEnd, strand, source] lookup
    from a genePred file (columns: name, chrom, strand, txStart, txEnd, ...).
    """
    with open(filename, 'r') as f:
        rows = (line.strip().split('\t') for line in f)
        return {row[0]: [row[1], int(row[3]), int(row[4]), row[2], source]
                for row in rows}
def build_feature_db_from_genepredext(filename, source='REFGENE'):
    """Build a feature lookup from a genePredExt file.

    genePredExt columns: name(0), chrom(1), strand(2), txStart(3),
    txEnd(4), ..., name2(11).  Each record is stored as
    [chrom, txStart, txEnd, strand, source] under both its name and its
    name2 (gene symbol), sharing one list object.
    """
    feature_db = {}
    with open(filename, 'r') as f:
        for line in f:
            c = line.strip().split('\t')
            feature = [c[1], int(c[3]), int(c[4]), c[2], source]
            # BUG FIX: previously keyed by c[1] (the chromosome), so every
            # record on the same chromosome overwrote the previous one.
            # Key by the record name (column 0) as in the refgene builder.
            feature_db[c[0]] = feature
            feature_db[c[11]] = feature
    return feature_db
def build_feature_db_from_refgene(filename, source='REFGENE'):
    """Build a feature lookup from a UCSC refGene table dump.

    refGene columns: bin(0), name(1), chrom(2), strand(3), txStart(4),
    txEnd(5), ..., name2(12).  Each record is stored as
    [chrom, txStart, txEnd, strand, source] under both name and name2,
    sharing one list object.
    """
    feature_db = {}
    with open(filename, 'r') as f:
        for line in f:
            cols = line.strip().split('\t')
            record = [cols[2], int(cols[4]), int(cols[5]), cols[3], source]
            # index by transcript accession and by gene symbol
            feature_db[cols[1]] = record
            feature_db[cols[12]] = record
    return feature_db
def build_feature_db(filename, source=None, format=None):
if format is None:
format = filename.split('.')[-1].lower()
if format == 'bed':
return build_feature_db_from_bed(filename, source=source)
elif format == 'gtf':
return build_feature_db_from_gtf(filename, source=source)
elif format == 'gff3':
return build_feature_db_from_gff3(filename, source=source)
elif format == 'genepred':
return build_feature_db_from_genepred(filename, source=source)
elif format == 'genepredext':
return build_feature_db_from_genepredext(filename, source=source)
elif format == 'refgene':
return build_feature_db_from_refgene(filename, source=source)
elif format == 'pkl':
return pickle.load(open(filename, 'rb'))
else:
raise ValueError('unknown database file format: {}'.format(format))
def read_feature_db(filename):
    """Load a tab-separated feature database.

    The first column of each line is the key; the remaining columns
    (kept as strings) form the value list.
    """
    with open(filename, 'r') as f:
        rows = (line.strip().split('\t') for line in f)
        return {row[0]: row[1:] for row in rows}
def merge_feature_db(filenames):
    """Merge several pickled feature databases into a single dict.

    Files are applied in order, so later files win on key collisions.
    """
    merged = {}
    for path in filenames:
        with open(path, 'rb') as handle:
            merged.update(pickle.load(handle))
    return merged
@app.route('/locus', methods=['GET'])
def query_locus():
    """Look up a feature by name in a genome's feature database.

    Query parameters: 'genome' selects the database, 'name' the feature.
    Returns a JSON list with one locus record, or '[]' when the genome
    or name is missing/unknown.
    """
    global feature_db
    genome = request.args.get('genome')
    name = request.args.get('name')
    # guard clauses: any missing piece yields an empty result list
    if genome is None or name is None:
        return '[]'
    db = feature_db.get(genome)
    if db is None:
        return '[]'
    feature = db.get(name)
    if feature is None:
        return '[]'
    record = {
        'gene': name,
        'chromosome': feature[0],
        'start': feature[1],
        'end': feature[2],
        'type': 'gene',
    }
    return json.dumps([record])
#@app.route('/igv/<path:filename>')
#def serve_static_file(filename):
# return send_from_directory(os.path.join(root_dir, 'igv', 'html'), filename)
@app.route('/bigwig/<dataset>/<filename>')
def serve_bigwig_file(dataset, filename):
    """Serve a bigWig track; circRNA tracks live in the pseudo-genome directory."""
    # filename format: <sample_id>.<map_step>.<strand>.<ext>
    _sample_id, map_step, _strand, _ext = filename.split('.')
    subdir = 'bigwig_pseudo_genome' if map_step.startswith('circRNA') else 'bigwig'
    return send_file(os.path.join(root_dir, 'output', dataset, subdir, filename), conditional=True)
@app.route('/bigwig_normalized/<dataset>/<filename>')
def serve_bigwig_normalized_file(dataset, filename):
    """Serve a normalized bigWig track for a dataset."""
    track_path = os.path.join(root_dir, 'output', dataset, 'bigwig_normalized', filename)
    return send_file(track_path, conditional=True)
@app.route('/bigwig_normalized_log2/<dataset>/<filename>')
def serve_bigwig_normalized_log2_file(dataset, filename):
    """Serve a log2-scaled normalized bigWig track for a dataset."""
    track_path = os.path.join(root_dir, 'output', dataset, 'bigwig_normalized_log2', filename)
    return send_file(track_path, conditional=True)
@app.route('/bam/<dataset>/<sample_id>/<filename>')
def serve_bam_file(dataset, sample_id, filename):
    """Serve a BAM (or index) file for a sample.

    circRNA alignments live under bam_pseudo_genome; otherwise the first
    existing directory among the known BAM layouts is used.  Returns 404
    when no BAM directory exists (previously fell off the end returning
    None, which made Flask raise a 500).
    """
    if filename.startswith('circRNA'):
        return send_file(os.path.join(root_dir, 'output', dataset, 'bam_pseudo_genome', sample_id, filename), conditional=True)
    for bam_search_dir in ('bam_sorted_by_coord', 'gbam_sorted'):
        bam_dir = os.path.join(root_dir, 'output', dataset, bam_search_dir)
        if os.path.isdir(bam_dir):
            return send_file(os.path.join(bam_dir, sample_id, filename), conditional=True)
    return ('file not found', 404)
@app.route('/domains/<dataset>.bed')
def serve_domains(dataset):
    """Serve the domains BED file for a dataset."""
    bed_path = os.path.join(root_dir, 'output', dataset, 'domains_genome', '20', '05.bed')
    return send_file(bed_path, conditional=True)
@app.route('/refined_domains/<dataset>.bed')
def serve_refined_domains(dataset):
    """Serve the refined domains BED file for a dataset."""
    bed_path = os.path.join(root_dir, 'output', dataset, 'refined_domains_genome', '20', '05.bed')
    return send_file(bed_path, conditional=True)
@app.route('/domains_localmax/<dataset>.bed')
def serve_domains_localmax(dataset):
    """Serve the local-max domains BED file for a dataset."""
    bed_path = os.path.join(root_dir, 'output', dataset, 'domains_localmax_genome', 'domains.bed')
    return send_file(bed_path, conditional=True)
@app.route('/domains_localmax_by_sample/<dataset>/<sample_id>.bed')
def serve_domains_localmax_by_sample(dataset, sample_id):
    """Serve the per-sample local-max domains BED file."""
    bed_path = os.path.join(root_dir, 'output', dataset,
                            'domains_localmax_by_sample_genome', sample_id + '.bed')
    return send_file(bed_path, conditional=True)
@app.route('/genome/hg38/<path:filename>')
def serve_genome_dir(filename):
    """Serve static files from the hg38 genome directory."""
    genome_dir = os.path.join(root_dir, 'genome', 'hg38')
    return send_from_directory(genome_dir, filename, conditional=True)
@app.route('/igv_reference/<dataset>/<genome>/<filename>')
def serve_igv_reference(dataset, genome, filename):
    """Serve IGV reference files for a dataset and genome."""
    reference_dir = os.path.join(output_dir, dataset, 'igv_reference', genome)
    return send_from_directory(reference_dir, filename, conditional=True)
if __name__ == "__main__":
    parser = argparse.ArgumentParser('IGV web server')
    parser.add_argument('--database', '-i', type=str, action='append', help='database files')
    parser.add_argument('--host', type=str, default='0.0.0.0')
    parser.add_argument('--port', type=int, default=5000)
    parser.add_argument('--output-file', '-o', type=str, help='output feature database file')
    parser.add_argument('--build-database', action='store_true', default=False)
    parser.add_argument('--genome', type=str)
    parser.add_argument('--root-dir', type=str,
        help='root directory to serve files. Default is current directory')
    parser.add_argument('--output-dir', type=str,
        help='output base directory. Default is {root_dir}/output')
    parser.add_argument('--debug', action='store_true', default=False)
    args = parser.parse_args()
    if args.build_database:
        if (args.genome is None) or (args.database is None):
            print('Error: argument --genome and --database is required for --build-database')
            parser.print_help()
            sys.exit(1)
    feature_db = {}
    if args.database is not None:
        for database in args.database:
            if args.build_database:
                # parse annotation files into one flat database
                logger.info('build feature database from {}'.format(database))
                db = build_feature_db(database)
                logger.info('added {} features'.format(len(db)))
                feature_db.update(db)
                del db  # release before loading the next file
            else:
                # serve mode: one pickled database per genome, keyed by
                # the database file's base name
                with open(database, 'rb') as f:
                    genome = os.path.splitext(os.path.basename(database))[0]
                    logger.info('load feature database {} from {}'.format(genome, database))
                    # NOTE(review): pickle.load on user-supplied files can
                    # execute arbitrary code -- only load trusted databases
                    feature_db[genome] = pickle.load(f)
                    logger.info('finished loading {} features'.format(len(feature_db[genome])))
    if args.output_file is not None:
        logger.info('dump database to output file: ' + args.output_file)
        with open(args.output_file, 'wb') as f:
            pickle.dump(feature_db, f)
    # resolve serving directories; --root-dir defaults to the cwd
    root_dir = args.root_dir if args.root_dir is not None else os.getcwd()
    # BUG FIX: previously joined args.root_dir here, which is None (and
    # crashes os.path.join) whenever --root-dir is omitted
    output_dir = args.output_dir if args.output_dir is not None else os.path.join(root_dir, 'output')
    if not args.build_database:
        import flask_silk  # noqa: F401 -- required by flask_autoindex for icons
        from flask_autoindex import AutoIndex
        # BUG FIX: use the resolved root_dir, not args.root_dir (see above)
        AutoIndex(app, browse_root=os.path.join(root_dir, 'igv', 'html'))
        app.run(host=args.host, port=args.port, debug=args.debug, threaded=True)