|
a |
|
b/singlecellmultiomics/modularDemultiplexer/demultiplexedFastqConversion.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
import os |
|
|
4 |
import sys |
|
|
5 |
import collections |
|
|
6 |
import argparse |
|
|
7 |
import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods |
|
|
8 |
from singlecellmultiomics.fastqProcessing.fastqIterator import FastqIterator |
|
|
9 |
|
|
|
10 |
if __name__ == '__main__': |
|
|
11 |
argparser = argparse.ArgumentParser( |
|
|
12 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
|
|
13 |
description="""Extract features from a demultiplexed fastq file. |
|
|
14 |
Example usage: -seqfeatures BC,RX,SEQ -qualityfeatures QT,RQ,QUAL | gzip > bc_umi.fastq.gz |
|
|
15 |
|
|
|
16 |
""") # -o ./bc_umi.fastq.gz |
|
|
17 |
argparser.add_argument( |
|
|
18 |
'-seqfeatures', |
|
|
19 |
type=str, |
|
|
20 |
help="Features to put into the ", |
|
|
21 |
required=True) |
|
|
22 |
argparser.add_argument( |
|
|
23 |
'-qualityfeatures', |
|
|
24 |
type=str, |
|
|
25 |
help="Features to put into ", |
|
|
26 |
required=True) |
|
|
27 |
argparser.add_argument( |
|
|
28 |
'fastqfiles', |
|
|
29 |
nargs='*', |
|
|
30 |
type=str, |
|
|
31 |
help="Input demultiplexed fastq files") |
|
|
32 |
|
|
|
33 |
argparser.add_argument('-format', type=str, help="Output format: fq, tsv") |
|
|
34 |
#argparser.add_argument('-o', type=str, help="output file path", required=True) |
|
|
35 |
args = argparser.parse_args() |
|
|
36 |
|
|
|
37 |
seqfeatures = args.seqfeatures.split(',') |
|
|
38 |
qualityfeatures = args.qualityfeatures.split(',') |
|
|
39 |
if len(seqfeatures) != len(qualityfeatures): |
|
|
40 |
raise ValueError('Supply a quality feature for every sequence feature') |
|
|
41 |
|
|
|
42 |
TagDefinitions = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions |
|
|
43 |
for reads in FastqIterator(*args.fastqfiles): |
|
|
44 |
for readIndex, record in enumerate(reads): |
|
|
45 |
tr = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TaggedRecord( |
|
|
46 |
TagDefinitions) |
|
|
47 |
tr.fromTaggedFastq(record) |
|
|
48 |
sequence = ''.join( |
|
|
49 |
(record.sequence if tag == 'SEQ' else tr.tags.get(tag) for tag in seqfeatures)) |
|
|
50 |
qualities = ''.join( |
|
|
51 |
(record.qual if tag == 'QUAL' else tr.tags.get(tag) for tag in qualityfeatures)) |
|
|
52 |
if len(sequence) != len(qualities): |
|
|
53 |
raise ValueError( |
|
|
54 |
'Length of base qualities does not match the length of the selected sequence features') |
|
|
55 |
|
|
|
56 |
if args.format == 'fq': |
|
|
57 |
print(f'{record.header}\n{sequence}\n+\n{qualities}') |
|
|
58 |
elif args.format == 'tsv': |
|
|
59 |
print() |