--- a +++ b/singlecellmultiomics/modularDemultiplexer/demultiplexedFastqConversion.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os +import sys +import collections +import argparse +import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods +from singlecellmultiomics.fastqProcessing.fastqIterator import FastqIterator + +if __name__ == '__main__': + argparser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="""Extract features from a demultiplexed fastq file. + Example usage: -seqfeatures BC,RX,SEQ -qualityfeatures QT,RQ,QUAL | gzip > bc_umi.fastq.gz + + """) # -o ./bc_umi.fastq.gz + argparser.add_argument( + '-seqfeatures', + type=str, + help="Features to put into the ", + required=True) + argparser.add_argument( + '-qualityfeatures', + type=str, + help="Features to put into ", + required=True) + argparser.add_argument( + 'fastqfiles', + nargs='*', + type=str, + help="Input demultiplexed fastq files") + + argparser.add_argument('-format', type=str, help="Output format: fq, tsv") + #argparser.add_argument('-o', type=str, help="output file path", required=True) + args = argparser.parse_args() + + seqfeatures = args.seqfeatures.split(',') + qualityfeatures = args.qualityfeatures.split(',') + if len(seqfeatures) != len(qualityfeatures): + raise ValueError('Supply a quality feature for every sequence feature') + + TagDefinitions = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions + for reads in FastqIterator(*args.fastqfiles): + for readIndex, record in enumerate(reads): + tr = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TaggedRecord( + TagDefinitions) + tr.fromTaggedFastq(record) + sequence = ''.join( + (record.sequence if tag == 'SEQ' else tr.tags.get(tag) for tag in seqfeatures)) + qualities = ''.join( + (record.qual if tag == 'QUAL' else tr.tags.get(tag) for tag in qualityfeatures)) + if len(sequence) != len(qualities): + raise ValueError( + 'Length of base qualities does not match the length of the selected sequence features') + + if args.format == 'fq': + print(f'{record.header}\n{sequence}\n+\n{qualities}') + elif args.format == 'tsv': + print()