#! /usr/bin/env python
from __future__ import print_function
import argparse, sys, os, errno
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')

command_handlers = {}
def command_handler(f):
    command_handlers[f.__name__] = f
    return f
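# Each function decorated with @command_handler is registered in
# command_handlers under its own function name; the argparse dispatcher at
# the bottom of this script looks the chosen subcommand up in this dict and
# calls the handler with the parsed arguments.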

def read_gtf(filename):
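    """Iterate over records of a GTF file (or standard input for '-').

    Yields a tuple (c, attrs, line) for every non-comment line: the
    tab-separated columns as a list, the column-9 attributes as a dict and
    the raw line itself. Raises ValueError if a record has no gene_id.
    """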
    from ioutils import open_file_or_stdin

    with open_file_or_stdin(filename) as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
            yield (c, attrs, line)

@command_handler
def extract_gene(args):
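    """Merge all features of each gene_id in a GTF file into a single
    region and write it in BED format: chrom, start (0-based), end,
    gene_id, score (0) and strand.
    """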
    from ioutils import open_file_or_stdout

    feature = args.feature
    genes = {}
    logger.info('read GTF file: ' + args.input_file)
    for c, attrs, line in read_gtf(args.input_file):
        if (feature is not None) and (c[2] != feature):
            continue
        gene_id = attrs.get('gene_id')
        gene = genes.get(gene_id)
        if gene is None:
            gene = [c[0], int(c[3]) - 1, int(c[4]), gene_id, 0, c[6]]
            genes[gene_id] = gene
        else:
            gene[1] = min(gene[1], int(c[3]) - 1)
            gene[2] = max(gene[2], int(c[4]))

    logger.info('number of genes: {}'.format(len(genes)))
    logger.info('write BED file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for gene_id, gene in genes.items():
            fout.write('\t'.join(map(str, gene)))
            fout.write('\n')


@command_handler
def fix_gtf(args):
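    """Remove invalid transcripts from a GTF file.

    A transcript is invalid if its exons have no strand information ('.')
    or lie on different strands. Only 'transcript' and 'exon' records are
    written to the output.
    """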
    from ioutils import open_file_or_stdout
    from collections import defaultdict
    # strand of exons grouped by transcript_id
    strands = defaultdict(list)

    lines = []
    logger.info('read GTF file: ' + args.input_file)
    for lineno, (c, attrs, line) in enumerate(read_gtf(args.input_file), start=1):
        if c[2] not in ('transcript', 'exon'):
            continue
        transcript_id = attrs.get('transcript_id')
        if transcript_id is None:
            raise ValueError('transcript_id not found in GTF file at record {}'.format(lineno))
        lines.append((transcript_id, line))
        if c[2] == 'exon':
            strands[transcript_id].append(c[6])
    invalid_transcripts = set()
    for transcript_id, strands_tx in strands.items():
        strands_tx = set(strands_tx)
        # remove transcripts without strand information
        if '.' in strands_tx:
            invalid_transcripts.add(transcript_id)
        # remove transcripts with exons on different strands
        elif len(strands_tx) != 1:
            invalid_transcripts.add(transcript_id)

    logger.info('number of transcripts: {}'.format(len(strands)))
    logger.info('number of invalid transcripts: {}'.format(len(invalid_transcripts)))
    logger.info('write GTF file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for transcript_id, line in lines:
            if transcript_id not in invalid_transcripts:
                fout.write(line)


@command_handler
def transcript_counts(args):
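    """Count reads per reference sequence (transcript) in a BAM file and
    write a two-column table: reference name, read count.
    """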
    import pysam
    from ioutils import open_file_or_stdout
    from collections import defaultdict

    logger.info('read input transcript BAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    counts = defaultdict(int)
    for read in sam:
        # unmapped reads have no reference sequence to count
        if read.is_unmapped:
            continue
        counts[read.reference_name] += 1

    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for key, val in counts.items():
            fout.write('{}\t{}\n'.format(key, val))

@command_handler
def extract_longest_transcript(args):
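    """Keep only the longest transcript of each gene in a GTF file.

    Transcript length is the sum of its --feature (default: exon) record
    lengths. Records of other feature types are passed through unchanged.
    """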
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import defaultdict
    from functools import partial

    feature = args.feature
    genes = defaultdict(partial(defaultdict, int))
    lines = []
    logger.info('read gtf file: ' + args.input_file)
    with open_file_or_stdin(args.input_file) as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                lines.append(('#other#', line))
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError('transcript_id not found in GTF file at line {}'.format(lineno))
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
            lines.append((transcript_id, line))
            genes[gene_id][transcript_id] += int(c[4]) - int(c[3]) + 1
    kept_transcripts = set()
    kept_transcripts.add('#other#')
    for gene_id, gene in genes.items():
        max_length = 0
        max_transcript = None
        for transcript_id, length in gene.items():
            if length > max_length:
                max_length = length
                max_transcript = transcript_id
        # keep only the longest transcript of the gene
        kept_transcripts.add(max_transcript)

    logger.info('number of genes: {}'.format(len(genes)))
    logger.info('number of transcripts: {}'.format(sum(map(len, genes.values()))))
    logger.info('number of longest transcripts: {}'.format(len(kept_transcripts) - 1))
    logger.info('write output gtf file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for transcript_id, line in lines:
            if transcript_id in kept_transcripts:
                fout.write(line)

@command_handler
def gtf_to_transcript_table(args):
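    """Convert GTF records of a given feature type into a transcript table.

    The output is tab-separated with the columns listed in `header` below;
    transcript boundaries are expanded to cover all exons of the transcript.
    """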
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict

    feature = args.feature
    default_transcript_type = args.transcript_type
    default_gene_type = args.gene_type

    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        transcripts = OrderedDict()
        for line in fin:
            # get GTF columns
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                continue
            # GTF attributes
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            # mitranscriptome
            if c[1] == 'mitranscriptome':
                if attrs['tcat'] == 'lncrna':
                    attrs['gene_type'] = 'lncRNA'
                    attrs['transcript_type'] = 'lncRNA'
                elif attrs['tcat'] == 'tucp':
                    attrs['gene_type'] = 'tucpRNA'
                    attrs['transcript_type'] = 'tucpRNA'
            # use transcript_id as transcript_name
            if 'transcript_name' not in attrs:
                attrs['transcript_name'] = attrs['transcript_id']
            # use gene_id as gene_name
            if 'gene_name' not in attrs:
                attrs['gene_name'] = attrs['gene_id']
            # set transcript_type if given
            if default_transcript_type is not None:
                attrs['transcript_type'] = default_transcript_type
            else:
                if 'transcript_type' not in attrs:
                    attrs['transcript_type'] = 'unknown'
            # set gene_type if given
            if default_gene_type is not None:
                attrs['gene_type'] = default_gene_type
            else:
                if 'gene_type' not in attrs:
                    attrs['gene_type'] = 'unknown'
            exon = [c[0], int(c[3]) - 1, int(c[4]), attrs['gene_id'], 0, c[6],
                    attrs['gene_id'], attrs['transcript_id'],
                    attrs['gene_name'], attrs['transcript_name'],
                    attrs['gene_type'], attrs['transcript_type'], c[1]]
            transcript = transcripts.get(attrs['transcript_id'])
            if transcript is None:
                transcripts[attrs['transcript_id']] = exon
            else:
                if c[2] == 'exon':
                    transcript[1] = min(transcript[1], exon[1])
                    transcript[2] = max(transcript[2], exon[2])
    header = ['chrom', 'start', 'end', 'name', 'score', 'strand',
              'gene_id', 'transcript_id',
              'gene_name', 'transcript_name',
              'gene_type', 'transcript_type', 'source'
              ]
    print('\t'.join(header), file=fout)
    for transcript in transcripts.values():
        print('\t'.join(str(a) for a in transcript), file=fout)
    fout.close()

@command_handler
def extract_circrna_junction(args):
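    """Extract back-splice junction sequences from spliced circRNA sequences.

    For each input sequence, the last and first --anchor-size nucleotides
    are concatenated to form the junction sequence.
    """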
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    anchor_size = args.anchor_size
    logger.info('read sequence file: ' + args.input_file)
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            seq = str(record.seq)
            if len(seq) < args.min_length:
                continue
            s = min(len(seq), anchor_size)
            seq_id = record.id.split('|')[0]
            fout.write('>{}\n'.format(seq_id))
            fout.write(seq[-s:] + seq[:s])
            fout.write('\n')
    fout.close()

@command_handler
def filter_circrna_reads(args):
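    """Keep only reads that span a circRNA junction.

    The junction is assumed to lie at the middle of each reference sequence
    (as produced by extract_circrna_junction); unmapped reads, reverse-strand
    reads and reads not crossing the junction are filtered out.
    """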
    import pysam
    from ioutils import open_file_or_stdout, open_file_or_stdin

    logger.info('read input SAM file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    sam_in = pysam.AlignmentFile(fin, "r")
    if sam_in.header is None:
        raise ValueError('requires SAM header to get junction positions')
    # get junction positions (middle of the sequences)
    junction_positions = {}
    for sq in sam_in.header['SQ']:
        junction_positions[sq['SN']] = sq['LN']//2

    logger.info('create output SAM file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    sam_out = pysam.AlignmentFile(fout, 'w', template=sam_in)

    sam_filtered = None
    if args.filtered_file is not None:
        logger.info('create filtered SAM file: ' + args.filtered_file)
        sam_filtered = pysam.AlignmentFile(args.filtered_file, 'w', template=sam_in)

    for read in sam_in:
        filtered = False
        if read.is_unmapped:
            filtered = True
        elif read.is_reverse:
            filtered = True
        else:
            pos = junction_positions[read.reference_name]
            if not (read.reference_start < pos <= read.reference_end):
                filtered = True
        if not filtered:
            sam_out.write(read)
        elif sam_filtered is not None:
            sam_filtered.write(read)

    fin.close()
    fout.close()
    if sam_filtered is not None:
        sam_filtered.close()

@command_handler
def chrom_sizes(args):
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            fout.write('{}\t{}\n'.format(record.id, len(record.seq)))
    fout.close()

@command_handler
def extract_domain_sequence(args):
    from pyfaidx import Fasta
    from Bio.Seq import Seq
    from ioutils import open_file_or_stdout

    fout = open_file_or_stdout(args.output_file)
    fastas = {}
    with open(args.input_file, 'r') as fin:
        for lineno, line in enumerate(fin):
            feature = line.split('\t')[0]
            gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
            start = int(start)
            end = int(end)
            if gene_type == 'genomic':
                gene_type = 'genome'
            if gene_type not in fastas:
                fastas[gene_type] = Fasta(os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
            if gene_type == 'genome':
                chrom, gstart, gend, strand = gene_id.split('_')
                gstart = int(gstart)
                gend = int(gend)
                seq = fastas[gene_type][chrom][gstart:gend].seq
                if strand == '-':
                    seq = str(Seq(seq).reverse_complement())
            else:
                seq = fastas[gene_type][transcript_id][start:end].seq
            seq = seq.upper()
            fout.write('>{}\n'.format(feature))
            fout.write(seq)
            fout.write('\n')
    fout.close()

@command_handler
def extract_feature_sequence(args):
    from pyfaidx import Fasta
    from Bio.Seq import Seq
    from ioutils import open_file_or_stdout

    def pad_range(start, end, chrom_size, max_length):
        # returns None if the range is already at least max_length long
        if (end - start) >= max_length:
            return
        padding_left = (max_length - (end - start))//2
        new_start = max(0, start - padding_left)
        new_end = min(new_start + max_length, chrom_size)
        return new_start, new_end

    fout = open_file_or_stdout(args.output_file)
    fastas = {}
    with open(args.input_file, 'r') as fin:
        for lineno, line in enumerate(fin):
            # skip the header line
            if lineno == 0:
                continue
            feature = line.split('\t')[0]
            gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
            start = int(start)
            end = int(end)
            if gene_type == 'genomic':
                gene_type = 'genome'
            # load FASTA file
            if gene_type not in fastas:
                fastas[gene_type] = Fasta(os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
            if gene_type == 'genome':
                chrom, gstart, gend, strand = gene_id.split('_')
                gstart = int(gstart)
                gend = int(gend)
                # extend short features on both ends up to --padding in length
                padded = pad_range(gstart, gend, len(fastas[gene_type][chrom]), args.padding)
                if padded is not None:
                    gstart, gend = padded
                seq = fastas[gene_type][chrom][gstart:gend].seq
                if strand == '-':
                    seq = str(Seq(seq).reverse_complement())
            else:
                padded = pad_range(start, end, len(fastas[gene_type][transcript_id]), args.padding)
                if padded is not None:
                    start, end = padded
                seq = fastas[gene_type][transcript_id][start:end].seq
            seq = seq.upper()
            fout.write('>{}\n'.format(feature))
            fout.write(seq)
            fout.write('\n')
    fout.close()

@command_handler
def calculate_star_parameters(args):
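    """Compute genome-size-dependent STAR index parameters from a FASTA
    index (.fai) file, following the formulas recommended in the STAR
    manual for small genomes:

        genomeSAindexNbases = min(14, log2(genome_length)/2 - 1)
        genomeChrBinNbits   = min(18, log2(genome_length/n_seqs))

    For example, a 3.1 Gb genome gives min(14, int(31.5)//2 - 1) = 14.
    """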
    from math import log2

    genome_length = 0
    n_seqs = 0
    with open(args.input_file, 'r') as f:
        for line in f:
            genome_length += int(line.split('\t')[1])
            n_seqs += 1
    if args.parameter == 'genomeSAindexNbases':
        print(min(14, int(log2(genome_length)//2) - 1))
    elif args.parameter == 'genomeChrBinNbits':
        print(min(18, int(log2(genome_length/n_seqs))))

@command_handler
def highlight_mature_mirna_location(args):
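    """Print hairpin sequences with mature miRNA regions highlighted.

    The 5p arm is wrapped in red ANSI escape codes and the 3p arm in green,
    for viewing in a terminal.
    """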
    from Bio import SeqIO
    from collections import defaultdict
    from ioutils import open_file_or_stdout

    logger.info('read hairpin sequences: ' + args.hairpin)
    hairpin_seqs = {}
    with open(args.hairpin, 'r') as f:
        for record in SeqIO.parse(f, 'fasta'):
            if (args.species is None) or (args.species == record.id.split('-')[0]):
                hairpin_seqs[record.id] = str(record.seq)

    logger.info('read mature sequences: ' + args.mature)
    fout = open_file_or_stdout(args.output_file)
    mature_positions = defaultdict(dict)
    with open(args.mature, 'r') as f:
        for record in SeqIO.parse(f, 'fasta'):
            if not ((args.species is None) or (args.species == record.id.split('-')[0])):
                continue
            if record.id.endswith('-5p') or record.id.endswith('-3p'):
                end_type = record.id.split('-')[-1]
                hairpin_id = record.id[:-3].lower()
                hairpin_seq = hairpin_seqs.get(hairpin_id)
                if hairpin_seq is None:
                    logger.warning('hairpin sequence id {} is not found in hairpin file'.format(hairpin_id))
                    continue
                mature_id = record.id
                mature_seq = str(record.seq)
                pos = hairpin_seq.find(mature_seq)
                if pos < 0:
                    logger.warning('mature sequence {} is not found in its hairpin sequence'.format(mature_id))
                    continue
                mature_positions[hairpin_id][end_type] = (pos, pos + len(mature_seq))

    for hairpin_id, hairpin_seq in hairpin_seqs.items():
        mature_position = mature_positions.get(hairpin_id)
        if mature_position is None:
            mature_position = {}
        fout.write('>{}\n'.format(hairpin_id))
        offset = 0
        if '5p' in mature_position:
            start, end = mature_position['5p']
            fout.write('{0}\x1B[31;1m{1}\x1B[0m'.format(hairpin_seq[offset:start], hairpin_seq[start:end]))
            offset = end
        if '3p' in mature_position:
            start, end = mature_position['3p']
            fout.write('{0}\x1B[32;1m{1}\x1B[0m'.format(hairpin_seq[offset:start], hairpin_seq[start:end]))
            offset = end
        fout.write('{}\n'.format(hairpin_seq[offset:]))
    fout.close()

@command_handler
def extract_mature_mirna_location(args):
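    """Write the location of each mature miRNA within its precursor as BED.

    Coordinates are relative to the precursor; for precursors on the '-'
    strand the mature coordinates are flipped accordingly.
    """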
    from utils import read_gff, GFFRecord
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict, defaultdict

    logger.info('read input GFF file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    logger.info('open output BED file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    # key: precursor_id, value: precursor record
    precursors = OrderedDict()
    # key: precursor_id, value: list of mature records
    matures = defaultdict(list)
    # read features from GFF file
    for record in read_gff(fin):
        if record.feature == 'miRNA_primary_transcript':
            precursors[record.attr['ID']] = record
        elif record.feature == 'miRNA':
            matures[record.attr['Derives_from']].append(record)
    # get locations of mature miRNAs
    for precursor_id, precursor in precursors.items():
        for mature in matures[precursor_id]:
            if mature.strand == '+':
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'],
                    mature.start - precursor.start,
                    mature.end - precursor.start + 1,
                    mature.attr['Name']))
            else:
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'],
                    precursor.end - mature.end,
                    precursor.end - mature.start + 1,
                    mature.attr['Name']
                ))
    fin.close()
    fout.close()

@command_handler
def gtf_to_bed(args):
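    """Convert exons of each transcript in a GTF file to BED12 format,
    with one record per transcript_id and block fields built from the
    sorted exons.
    """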
    from ioutils import open_file_or_stdin, open_file_or_stdout

    exon_feature = 'exon'
    # use transcript_id attribute as key
    transcripts = {}
    logger.info('read input GTF file: ' + args.input_file)
    for lineno, record in enumerate(read_gtf(args.input_file)):
        c, attrs, line = record
        if c[2] == exon_feature:
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError('gene_id attribute not found in GTF file {}:{}'.format(args.input_file, lineno))
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError('transcript_id attribute not found in GTF file {}:{}'.format(args.input_file, lineno))
            transcript = transcripts.get(transcript_id)
            if transcript is None:
                # new transcript
                transcript = {
                    'chrom': c[0],
                    'strand': c[6],
                    'gene_id': gene_id,
                    'gene_name': attrs.get('gene_name', gene_id),
                    'transcript_name': attrs.get('transcript_name', transcript_id),
                    'exons': []
                }
                transcripts[transcript_id] = transcript
            # add a new exon
            transcript['exons'].append((int(c[3]) - 1, int(c[4])))

    fout = open_file_or_stdout(args.output_file)
    bed_template = '{chrom}\t{start}\t{end}\t{name}\t0\t{strand}\t0\t0\t0\t{n_exons}\t{exon_sizes}\t{exon_starts}\n'
    for transcript_id, transcript in transcripts.items():
        # sort exons by start position
        transcript['exons'] = sorted(transcript['exons'], key=lambda x: x[0])
        transcript['n_exons'] = len(transcript['exons'])
        transcript['start'] = transcript['exons'][0][0]
        transcript['end'] = transcript['exons'][-1][1]
        transcript['exon_starts'] = ','.join(str(e[0] - transcript['start']) for e in transcript['exons'])
        transcript['exon_sizes'] = ','.join(str(e[1] - e[0]) for e in transcript['exons'])
        transcript['name'] = '{gene_id}'.format(**transcript)
        fout.write(bed_template.format(**transcript))


@command_handler
def calculate_gene_length(args):
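    """Calculate an effective length for every gene in a GTF/GFF file.

    A sketch of the --method choices (taken from the argument parser):
    the mean, median or maximum of the isoform lengths (sum of exon
    lengths per transcript), or the length of the union of all exons.
    """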
    import HTSeq
    from collections import defaultdict
    from functools import partial
    import numpy as np
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from tqdm import tqdm

    fin = open_file_or_stdin(args.input_file)
    gff = HTSeq.GFF_Reader(fin)
    # sum of exon lengths per transcript, grouped by gene
    exons = defaultdict(partial(defaultdict, int))
    # exon intervals per gene, for merged_exon_length
    intervals = defaultdict(list)
    for feature in tqdm(gff, unit='feature'):
        if feature.type == 'exon':
            exons[feature.attr['gene_id']][feature.attr['transcript_id']] += feature.iv.length
            intervals[feature.attr['gene_id']].append((feature.iv.start, feature.iv.end))
    with open_file_or_stdout(args.output_file) as fout:
        for gene_id, isoforms in exons.items():
            lengths = list(isoforms.values())
            if args.method == 'merged_exon_length':
                # total length of the union of all exon intervals
                length, merged_end = 0, 0
                for start, end in sorted(intervals[gene_id]):
                    length += max(0, end - max(start, merged_end))
                    merged_end = max(merged_end, end)
            elif args.method == 'isoform_length_median':
                length = int(np.median(lengths))
            elif args.method == 'isoform_length_max':
                length = max(lengths)
            else:
                # isoform_length_mean (default)
                length = int(np.mean(lengths))
            fout.write('{}\t{}\n'.format(gene_id, length))

@command_handler
def merge_data_frames(args):
    import pandas as pd
    from ioutils import open_file_or_stdout

    if (not args.on_index) and (args.on is None):
        raise ValueError('argument --on is required if --on-index is not specified')
    merged = None
    for input_file in args.input_file:
        logger.info('read input file: ' + input_file)
        # use the first column as index when joining on index
        df = pd.read_table(input_file, sep=args.sep, index_col=0 if args.on_index else None)
        if merged is None:
            merged = df
        else:
            if args.on_index:
                merged = pd.merge(merged, df, how=args.how, left_index=True, right_index=True)
            else:
                merged = pd.merge(merged, df, how=args.how, on=args.on)
    if args.fillna is not None:
        merged.fillna(args.fillna, inplace=True)
    logger.info('open output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as f:
        merged.to_csv(f, sep=args.sep, header=True, index=args.on_index)

@command_handler
def genomecov(args):
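    """Compute per-base read coverage of each reference sequence in a
    BAM/SAM file and store one int32 dataset per sequence in an HDF5 file.
    """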
    import pysam
    import h5py
    import numpy as np
    from tqdm import tqdm

    logger.info('read input BAM/SAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, 'r')
    # one coverage array per reference sequence
    coverage = {sq['SN']: np.zeros(sq['LN'], dtype=np.int32) for sq in sam.header['SQ']}
    for read in tqdm(sam, unit='read'):
        if read.is_unmapped:
            continue
        # increment every reference base covered by an aligned block
        for start, end in read.get_blocks():
            coverage[read.reference_name][start:end] += 1
    logger.info('open output HDF5 file: ' + args.output_file)
    fout = h5py.File(args.output_file, 'w')
    for name, data in coverage.items():
        fout.create_dataset(name, data=data, compression='gzip')
    fout.close()

@command_handler
def calc_rpkm(args):
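    """Convert an RPM expression matrix to RPKM by dividing each row by the
    feature length in kb, taking the length from the '|'-separated feature
    name in the matrix index.
    """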
    import pandas as pd
    from ioutils import open_file_or_stdin, open_file_or_stdout

    matrix = pd.read_table(open_file_or_stdin(args.input_file), index_col=0, sep='\t')
    feature_info = matrix.index.to_series().str.split('|', expand=True)
    feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'feature_id', 'transcript_id', 'start', 'end']
    feature_info['start'] = feature_info['start'].astype('int')
    feature_info['end'] = feature_info['end'].astype('int')
    feature_info['length'] = feature_info['end'] - feature_info['start']
    matrix = 1000.0*matrix.div(feature_info['length'], axis=0)
    matrix.to_csv(open_file_or_stdout(args.output_file), index=True, header=True, sep='\t', na_rep='NA')

@command_handler
def create_pseudo_genome(args):
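    """Concatenate transcript sequences into a pseudo-genome.

    Sequences are separated by --padding N bases and packed into
    pseudo-chromosomes of at most --max-chrom-size bases; cumulative
    lengths determine each sequence's chromosome index and start offset.
    Also writes chrom sizes, a cytoband file and a BED annotation mapping
    each original sequence onto the pseudo-genome.
    """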
    from pyfaidx import Fasta, Faidx
    import numpy as np

    src = Fasta(args.input_file)
    lengths = np.array([len(r) for r in src])
    padded_lengths = lengths + args.padding
    padded_seq = 'N'*args.padding
    cum_lengths = np.cumsum(padded_lengths)
    chrom_indices = cum_lengths//args.max_chrom_size
    chrom_starts = cum_lengths%args.max_chrom_size - padded_lengths
    prev_chrom_index = -1

    # write FASTA file
    logger.info('write FASTA file: ' + args.output_fasta)
    with open(args.output_fasta, 'w') as fout:
        for i, record in enumerate(src):
            # new chromosome
            if chrom_indices[i] != prev_chrom_index:
                if i > 0:
                    fout.write('\n')
                fout.write('>c{}\n'.format(chrom_indices[i] + 1))
                prev_chrom_index = chrom_indices[i]
            # write sequence
            fout.write(str(record))
            fout.write(padded_seq)
        fout.write('\n')
    # build fasta index
    logger.info('build FASTA index')
    dst_index = Faidx(args.output_fasta)
    dst_index.build_index()
    # chrom sizes
    logger.info('write chrom sizes: ' + args.output_chrom_sizes)
    fout = open(args.output_chrom_sizes, 'w')
    with open(args.output_fasta + '.fai', 'r') as f:
        for line in f:
            c = line.strip().split('\t')
            fout.write(c[0] + '\t' + c[1] + '\n')
    fout.close()
    # cytoband file
    logger.info('write cytoband file: ' + args.output_cytoband)
    fout = open(args.output_cytoband, 'w')
    with open(args.output_fasta + '.fai', 'r') as f:
        for i, line in enumerate(f):
            c = line.strip().split('\t')
            fout.write('{0}\t0\t{1}\tp{2}.1\tgneg\n'.format(c[0], c[1], i + 1))
    fout.close()

    # annotation file
    logger.info('write annotation file: ' + args.output_annotation)
    with open(args.output_annotation, 'w') as fout:
        for i, record in enumerate(src):
            row = ['c%d'%(chrom_indices[i] + 1),
                   str(chrom_starts[i]),
                   str(chrom_starts[i] + lengths[i]),
                   record.name,
                   '0',
                   '+']
            fout.write('\t'.join(row))
            fout.write('\n')
            # create junction annotation
            if args.circular_rna:
                junction_pos = (int(row[1]) + int(row[2]))//2
                row[1] = str(junction_pos - 1)
                row[2] = str(junction_pos + 1)
                row[3] = row[3] + '|junction'
                fout.write('\t'.join(row))
                fout.write('\n')

@command_handler
def map_bam_to_pseudo_genome(args):
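    """Translate alignments against individual sequences into pseudo-genome
    coordinates, using the chrom sizes and BED annotation produced by
    create_pseudo_genome.
    """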
    import pysam

    def as_paired_reads(sam):
        read1 = None
        for read in sam:
            if read.is_read1:
                read1 = read
            elif read.is_read2:
                yield (read1, read)
            else:
                raise ValueError('input SAM is not paired-end')

    chrom_sizes = {}
    logger.info('read chrom sizes: ' + args.chrom_sizes)
    with open(args.chrom_sizes, 'r') as fin:
        for line in fin:
            c = line.strip().split('\t')
            chrom_sizes[c[0]] = int(c[1])

    chrom_starts = {}
    chrom_names = {}
    with open(args.bed, 'r') as fin:
        for line in fin:
            c = line.strip().split('\t')
            c[1] = int(c[1])
            c[2] = int(c[2])
            chrom_names[c[3]] = c[0]
            chrom_starts[c[3]] = c[1]

    logger.info('read input BAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, 'rb')

    if args.circular_rna:
        junction_positions = {sq['SN']: sq['LN']//2 for sq in sam.header['SQ']}

    output_header = {
        'HD': sam.header['HD'],
        'SQ': [{'SN': chrom, 'LN': size} for chrom, size in chrom_sizes.items()],
        'PG': [{'ID': 'map_bam_to_pseudo_genome',
                'PN': 'map_bam_to_pseudo_genome',
                'VN': '1.0',
                'CL': ' '.join(sys.argv)}
               ]
    }
    logger.info('create output BAM file: ' + args.output_file)
    output_sam = pysam.AlignmentFile(args.output_file, 'wb', header=output_header)

    def map_read(read):
        d = read.to_dict()
        ref_name = d['ref_name']
        d['ref_name'] = chrom_names[ref_name]
        d['ref_pos'] = str(int(d['ref_pos']) + chrom_starts[ref_name])
        return pysam.AlignedSegment.from_dict(d, output_sam.header)

    strandness = args.strandness
    is_circular_rna = args.circular_rna

    n_reads = 0
    if args.paired_end:
        for read1, read2 in as_paired_reads(sam):
            if (read1.is_unmapped) or (read2.is_unmapped):
                continue
            if read1.reference_name != read2.reference_name:
                continue
            if is_circular_rna:
                if (strandness == 'forward') and ((not read1.is_reverse) and read2.is_reverse):
                    continue
                if (strandness == 'reverse') and ((not read2.is_reverse) and read1.is_reverse):
                    continue
                pos = junction_positions[read1.reference_name]
                if not (read1.reference_start < pos <= read2.reference_end):
                    continue
            output_sam.write(map_read(read1))
            output_sam.write(map_read(read2))
            n_reads += 1
    else:
        for read in sam:
            if read.is_unmapped:
                continue
            output_sam.write(map_read(read))
            n_reads += 1
    logger.info('number of written reads: {}'.format(n_reads))

    output_sam.close()


if __name__ == '__main__':
    main_parser = argparse.ArgumentParser(description='Preprocessing module')
    subparsers = main_parser.add_subparsers(dest='command')

    parser = subparsers.add_parser('transcript_counts',
        help='count reads that belong to a transcript')
    parser.add_argument('--input-file', '-i', type=str, required=True,
        help='input transcript BAM file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output transcript counts file')
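
    # Example invocation (hypothetical file names):
    #   preprocess.py transcript_counts -i sample.transcript.bam -o counts.txt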

    parser = subparsers.add_parser('gtf_to_transcript_table')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='input GTF file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output table file')
    parser.add_argument('--feature', type=str,
        help='feature to use in input GTF file (Column 3)')
    parser.add_argument('--gene-type', type=str,
        help='gene type to set if "gene_type" attribute is not available in GTF file')
    parser.add_argument('--transcript-type', type=str,
        help='transcript type to set if "transcript_type" attribute is not available in GTF file')

    parser = subparsers.add_parser('extract_longest_transcript')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='input GTF file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output table file')
    parser.add_argument('--feature', type=str, default='exon',
        help='feature to use in input GTF file (Column 3)')

    parser = subparsers.add_parser('fix_gtf',
        help='fix problems in a GTF file by removing invalid transcripts')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='input GTF file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output GTF file')
    parser.add_argument('--feature', type=str, default='exon',
        help='feature to use in input GTF file (Column 3)')

    parser = subparsers.add_parser('extract_gene',
        help='extract gene regions from a GTF file')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='input GTF file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output BED file')
    parser.add_argument('--feature', type=str,
        help='feature to use in input GTF file (Column 3)')

    parser = subparsers.add_parser('gtf_to_bed',
        help='convert transcripts from GTF to BED')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='input GTF file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output BED file')
    parser.add_argument('--feature', type=str,
        help='features to treat as exons in input GTF file (Column 3)')

    parser = subparsers.add_parser('extract_circrna_junction',
        help='extract circular RNA junction sequences from spliced sequences')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='spliced sequence file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='junction sequence file')
    parser.add_argument('--anchor-size', '-s', type=int, default=50,
        help='anchor length from both ends')
    parser.add_argument('--min-length', type=int, default=50,
        help='minimum length required to keep a spliced sequence')

    parser = subparsers.add_parser('filter_circrna_reads',
        help='filter out reads not crossing circRNA junctions')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='input SAM file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output SAM file')
    parser.add_argument('--filtered-file', '-u', type=str,
        help='write filtered SAM records to file')

    parser = subparsers.add_parser('chrom_sizes',
        help='create chrom sizes file from FASTA file')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='input FASTA')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output chrom sizes file')

    parser = subparsers.add_parser('extract_feature_sequence',
        help='extract sequences using feature names')
    parser.add_argument('--input-file', '-i', type=str, required=True,
        help='feature file with feature names in the first column')
    parser.add_argument('--genome-dir', '-g', type=str, required=True,
        help='genome directory where fasta/${rna_type}.fa contains sequences')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output file in FASTA format')
    parser.add_argument('--padding', type=int, default=200,
        help='extend short features on both ends up to this length')

    parser = subparsers.add_parser('extract_domain_sequence',
        help='extract sequences using feature names')
    parser.add_argument('--input-file', '-i', type=str, required=True,
        help='feature file with feature names in the first column')
    parser.add_argument('--genome-dir', '-g', type=str, required=True,
        help='genome directory where fasta/${rna_type}.fa contains sequences')
    parser.add_argument('--flanking', type=int, default=100)
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output file in FASTA format')

    parser = subparsers.add_parser('calculate_star_parameters',
        help='calculate --genomeSAindexNbases and --genomeChrBinNbits for STAR')
    parser.add_argument('--input-file', '-i', type=str, required=True,
        help='FASTA index (.fai) file')
    parser.add_argument('--parameter', '-p', type=str, required=True,
        choices=('genomeSAindexNbases', 'genomeChrBinNbits'),
        help='parameter to calculate')

    parser = subparsers.add_parser('highlight_mature_mirna_location',
        help='get mature miRNA location in precursor miRNA in miRBase')
    parser.add_argument('--mature', type=str, required=True,
        help='FASTA file of mature sequences')
    parser.add_argument('--hairpin', type=str, required=True,
        help='FASTA file of hairpin sequences')
    parser.add_argument('--output-file', '-o', type=str, default='-')
    parser.add_argument('--species', type=str)

    parser = subparsers.add_parser('extract_mature_mirna_location',
        help='extract mature miRNA location in precursor miRNA in miRBase')
    parser.add_argument('--input-file', '-i', type=str, required=True,
        help='miRBase GFF3 file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='BED file with 6 columns: pre-miRNA, start, end, mature miRNA, 0, +')

    parser = subparsers.add_parser('calculate_gene_length',
        help='calculate effective gene length')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='GTF/GFF file')
    parser.add_argument('--method', '-m', type=str, default='isoform_length_mean',
        choices=('isoform_length_mean', 'isoform_length_median', 'isoform_length_max', 'merged_exon_length'))
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output tab-delimited file with two columns: gene_id, length')

    parser = subparsers.add_parser('merge_data_frames')
    parser.add_argument('--input-file', '-i', type=str, action='append', required=True,
        help='input data matrix')
    parser.add_argument('--sep', type=str, default='\t', help='column delimiter')
    parser.add_argument('--how', type=str, default='inner',
        choices=('inner', 'outer', 'left', 'right'))
    parser.add_argument('--on', type=str, help='column name to join on')
    parser.add_argument('--on-index', action='store_true', default=False, help='join on index')
    parser.add_argument('--fillna', type=str)
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output merged data frame')

    parser = subparsers.add_parser('genomecov')
    parser.add_argument('--input-file', '-i', type=str, required=True,
        help='input BAM/SAM file')
    parser.add_argument('--output-file', '-o', type=str, required=True,
        help='output HDF5 file')

    parser = subparsers.add_parser('calc_rpkm')
    parser.add_argument('--input-file', '-i', type=str, default='-',
        help='input expression (RPM) matrix file')
    parser.add_argument('--output-file', '-o', type=str, default='-',
        help='output expression (RPKM) matrix file')

    parser = subparsers.add_parser('create_pseudo_genome')
    parser.add_argument('--input-file', '-i', type=str, required=True,
        help='input transcript FASTA file (with FASTA index)')
    parser.add_argument('--max-chrom-size', type=int, default=100000000,
        help='maximum size for each chromosome')
    parser.add_argument('--padding', type=int, default=50,
        help='number of N bases padded after each sequence')
    parser.add_argument('--circular-rna', action='store_true',
        help='write circular RNA junction to annotation file')
    parser.add_argument('--output-fasta', type=str, required=True,
        help='output FASTA and index file')
    parser.add_argument('--output-chrom-sizes', type=str, required=True,
        help='output chrom sizes file')
    parser.add_argument('--output-annotation', type=str, required=True,
        help='output annotation BED file')
    parser.add_argument('--output-cytoband', type=str, required=True,
        help='output cytoband file')

    parser = subparsers.add_parser('map_bam_to_pseudo_genome')
    parser.add_argument('--input-file', '-i', type=str, required=True,
        help='input BAM file')
    parser.add_argument('--bed', type=str, required=True,
        help='input BED file of genes in the pseudo-genome')
    parser.add_argument('--chrom-sizes', type=str, required=True,
        help='input chrom sizes of the pseudo-genome')
    parser.add_argument('--strandness', '-s', type=str, default='forward',
        help='strandness for circular RNA')
    parser.add_argument('--paired-end', action='store_true',
        help='input is paired-end reads')
    parser.add_argument('--circular-rna', action='store_true',
        help='requires the read to be mapped to circular RNA junctions')
    parser.add_argument('--output-file', '-o', type=str, required=True,
        help='output BAM file')

    args = main_parser.parse_args()
    if args.command is None:
        main_parser.print_help()
        sys.exit(1)
    logger = logging.getLogger('preprocess.' + args.command)

    try:
        command_handlers.get(args.command)(args)
    except BrokenPipeError:
        pass
    except KeyboardInterrupt:
        pass