--- a +++ b/singlecellmultiomics/libraryDetection/archivestats.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse + +import pkg_resources +import singlecellmultiomics.barcodeFileParser.barcodeFileParser as barcodeFileParser +from singlecellmultiomics.modularDemultiplexer.demultiplexingStrategyLoader import DemultiplexingStrategyLoader +import singlecellmultiomics.libraryDetection.sequencingLibraryListing as sequencingLibraryListing +import glob +import fnmatch +import os +from types import SimpleNamespace + +if __name__ == '__main__': + barcode_dir = pkg_resources.resource_filename( + 'singlecellmultiomics', 'modularDemultiplexer/barcodes/') + index_dir = pkg_resources.resource_filename( + 'singlecellmultiomics', 'modularDemultiplexer/indices/') + + barcode_parser = barcodeFileParser.BarcodeParser( + hammingDistanceExpansion=0, barcodeDirectory=barcode_dir) + index_parser = barcodeFileParser.BarcodeParser( + hammingDistanceExpansion=1, barcodeDirectory=index_dir) + + dmx = DemultiplexingStrategyLoader(barcodeParser=barcode_parser, + indexParser=index_parser, + indexFileAlias=None) + + argparser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="""Check multiplexability of many fastq files""") + + argparser.add_argument('-locations', default='.') + + arguments = argparser.parse_args() + sequencing_dirs = arguments.locations.split(',') + dmxes = sorted([x.shortName for x in dmx.demultiplexingStrategies]) + #print('\t'.join(['RUN', 'SEQ', 'AVO', 'INDEX', 'LIBRARY'] + dmxes)) + + matches = [] + for sdir in sequencing_dirs: + for root, dirnames, filenames in os.walk(sdir): + for d in dirnames: + try: + fp = os.path.join(root, d) + #print(' ' + fp) + if len(list(glob.glob(fp + '/*.fastq.gz')) + ) and d != 'BaseCalls': + matches.append(fp) + fastqfiles = list(glob.glob(fp + '/*.fastq.gz')) + + args = SimpleNamespace( + replace=None, + fastqfiles=fastqfiles, + slib=None, + merge='_', + dsize=10000, + se=False, + ignore=True, + maxAutoDetectMethods=100, + minAutoDetectPct=1) + libraries = sequencingLibraryListing.SequencingLibraryLister( + verbose=False).detect(fastqfiles, args=args) + + processedReadPairs, strategyYieldsForAllLibraries = dmx.detectLibYields( + libraries, testReads=args.dsize, maxAutoDetectMethods=args.maxAutoDetectMethods, minAutoDetectPct=args.minAutoDetectPct, verbose=False) + + #print(strategyYieldsForAllLibraries) + + if False: + for library, associated_fastqs_lane in libraries.items(): + # Obtain run id + run_id = '?' + seqid = '?' + index = '?' + avo_id = '?' + i = None + dpos = None + for lane, reads in associated_fastqs_lane.items(): + parts = os.path.dirname( + reads['R1'][0]).split('/') + + if 'Data' in parts: + try: + i = parts.index('Data') + except Exception as e: + i = -2 + + try: + dpos = parts.index('BaseCalls') + except Exception as e: + dpos = -1 + pass + + try: + run_id = parts[i - 2] + except Exception as e: + pass + + try: + seqid = parts[i - 1] + except Exception as e: + pass + + try: + index = parts[dpos + 2] + except Exception as e: + pass + + try: + avo_id = parts[dpos + 1] + except Exception as e: + pass + + else: + try: + avo_id = parts[-2] + except Exception as e: + pass + + try: + index = parts[-1] + except Exception as e: + pass + + break + + + + for library in libraries: + processedReadPairs = strategyYieldsForAllLibraries[library]['processedReadPairs'] + strategyYieldForLibrary = strategyYieldsForAllLibraries[library]['strategyYields'] + selectedStrategies = dmx.selectedStrategiesBasedOnYield( + processedReadPairs, + strategyYieldForLibrary, + maxAutoDetectMethods=args.maxAutoDetectMethods, + minAutoDetectPct=args.minAutoDetectPct) + selectedStrategies = dmx.getSelectedStrategiesFromStringList( + selectedStrategies, verbose=False) + + print(library,selectedStrategies[0].shortName) + + #print('\t'.join([run_id, seqid, avo_id, index, library] + [str( + # strategyYieldsForAllLibraries[library]['strategyYields'].get(x, 0) ) for x in dmxes])) + except Exception as e: + raise + #print(e)