Switch to side-by-side view

--- a
+++ b/singlecellmultiomics/libraryProcessing/libraryStatistics.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from singlecellmultiomics.statistic import *
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+import sys
+import pysam
+import collections
+import argparse
+
+from singlecellmultiomics.bamProcessing import bam_is_processed_by_program
+
+from colorama import Fore
+from colorama import Back
+from colorama import Style
+import singlecellmultiomics.pyutils as pyutils
+from singlecellmultiomics.tagtools import tagtools
+from pysamiterators import MatePairIteratorIncludingNonProper
+import gzip
+import pickle
+import subprocess
+from glob import glob
+
+import matplotlib
+matplotlib.rcParams['figure.dpi'] = 160
+matplotlib.use('Agg')
+
+
+def select_bam_file(lookup):
+    for l in lookup:
+        if os.path.exists(l):
+            print(f'Found file at {l}')
+            return l
+
+    return None
+
+
+def select_fastq_file(lookup):
+    for paths in lookup:
+        if isinstance(paths, tuple):
+            for path in paths:
+                if os.path.exists(path):
+                    return paths
+        elif os.path.exists(paths):
+            return (paths,)
+    return None
+
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description='Obtain statistics from your libraries')
+    argparser.add_argument(
+        'libraries',
+        type=str,
+        nargs='*',
+        help="either a library structured folder, or the tagged BAM file")
+    argparser.add_argument('-t', type=str, default='all-stats',
+                           help="type of staistics to produce. options are \
+            'demult-stats', 'meth-stats', 'chic-stats' and 'all-stats'")
+    argparser.add_argument('-o', type=str, help="output file prefix")
+    argparser.add_argument(
+        '--plotsOnly',
+        action='store_true',
+        help="only make plots")
+    argparser.add_argument(
+        '--fatal',
+        action='store_true',
+        help="Fatal error on any issue")
+    argparser.add_argument(
+        '--sl',
+        action='store_true',
+        help="Show lookup paths")
+
+
+    argparser.add_argument(
+        '--tablesOnly',
+        action='store_true',
+        help="only make tables")
+
+    argparser.add_argument('-head', type=int)
+    argparser.add_argument(
+        '-tagged_bam',
+        type=str,
+        help='Alias of subpath to tagged bam file. For example /tagged/sorted.bam')
+    argparser.add_argument('--v', action='store_true')
+    argparser.add_argument('--nort', action='store_true')
+    argparser.add_argument('--nolorenz', action='store_true')
+    args = argparser.parse_args()
+
+    for library in args.libraries:
+        if library.endswith('.bam'):
+            # the library is a bam file..
+            bamFile = os.path.abspath(library)
+            library = os.path.dirname(os.path.abspath(bamFile))
+            library_name = os.path.basename(os.path.abspath(bamFile))
+            print("Bam file was supplied:")
+            print(bamFile)
+        else:
+            print("A library was supplied, automatically detecting files ..")
+            bamFile = None
+            library_name = os.path.basename(library)
+        rc = ReadCount(args)  # Is also mappability
+
+        statistics = [
+            rc,
+            FragmentSizeHistogram(args),
+            RejectionReasonHistogram(args),
+            MappingQualityHistogram(args),
+            OversequencingHistogram(args),
+            CellReadCount(args)
+        ]
+
+
+        full_file_statistics = []
+
+        if not args.nolorenz:
+            full_file_statistics.append( Lorenz(args) )
+
+        if(args.t in ['meth-stats', 'all-stats']):
+            statistics.extend([
+                MethylationContextHistogram(args),
+                ConversionMatrix(args)
+            ])
+
+        if(args.t in ['chic-stats', 'all-stats']):
+            statistics.extend([ScCHICLigation(args)])
+
+        if(args.t in ['demult-stats', 'all-stats']):
+            statistics.extend([
+                TrimmingStats(args),
+                AlleleHistogram(args),
+                DataTypeHistogram(args),
+                TagHistogram(args),
+                PlateStatistic(args),
+                PlateStatistic2(args)
+            ])
+
+        demuxFastqFilesLookup = [
+            (f'{library}/demultiplexedR1.fastq.gz',
+             f'{library}/demultiplexedR2.fastq.gz'),
+            (f'{library}/demultiplexedR1_val_1.fq.gz',
+             f'{library}/demultiplexedR2_val_2.fq.gz'),
+            (f'{library}/demultiplexedR1_val_1.fq',
+             f'{library}/demultiplexedR2_val_2.fq')]
+
+        rejectFilesLookup = [
+            (f'{library}/rejectsR1.fastq.gz', f'{library}/rejectsR2.fastq.gz'),
+            (f'{library}/rejectsR1.fastq', f'{library}/rejectsR2.fastq'),
+            (f'{library}/rejectsR1.fq.gz', f'{library}/rejectsR2.fq.gz'),
+            (f'{library}/rejectsR1.fq', f'{library}/rejectsR2.fq'),
+        ]
+
+        taggedFilesLookup = [
+            f'{library}/tagged.bam',
+            f'{library}/tagged/tagged.bam',
+            f'{library}/tagged/marked_duplicates.bam',
+            f'{library}/tagged/resorted.featureCounts.bam',
+            f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.featureCounts.bam',
+            f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.bam',
+            f'{library}/tagged/sorted.bam']
+        if args.tagged_bam:
+            taggedFilesLookup = [
+                library + '/' + args.tagged_bam] + taggedFilesLookup
+
+        if args.sl:
+            print(f'{Style.BRIGHT}Demux file lookup paths{Style.RESET_ALL}')
+            print(demuxFastqFilesLookup)
+            print(f'{Style.BRIGHT}Reject fastq file lookup paths{Style.RESET_ALL}')
+            print(rejectFilesLookup)
+            print(f'{Style.BRIGHT}Tagged bam lookup paths{Style.RESET_ALL}')
+            print(taggedFilesLookup)
+
+        if 'cluster' in library:
+            continue
+        print(f'{Style.BRIGHT}Library {library}{Style.RESET_ALL}')
+        # Check if the bam file is present
+        if bamFile is None:
+            bamFile = select_bam_file(taggedFilesLookup)
+
+        if bamFile is None:
+            # Perform glob expansion
+            bams = list(glob(f'{library}/*.bam'))+list(glob(f'{library}/*/*.bam'))
+            for bam_path in bams:
+                print(f"Trying {bam_path}",end="\t")
+                try:
+                    with pysam.AlignmentFile(bam_path) as a:
+                        if bam_is_processed_by_program(a, program='bamtagmultiome'):
+                            bamFile = bam_path
+                            print("[TAGGED]")
+                            break
+                        else:
+                            print("[NOT TAGGED]")
+                except Exception as e:
+                    print(f"[ERROR] {e}")
+
+        if bamFile is None:
+            print(f'{Fore.RED}BAM FILE MISSING {library}{Style.RESET_ALL}')
+            exit()
+        else:
+            print(f'{Fore.GREEN}Bam file at {bamFile}{Style.RESET_ALL}')
+
+        demuxFastqFiles = select_fastq_file(demuxFastqFilesLookup)
+        rejectFastqFiles = select_fastq_file(rejectFilesLookup)
+
+        print("Selected files:")
+        print(f'demultiplexed reads: {demuxFastqFiles}')
+        print(f'rejected reads: {rejectFastqFiles}')
+        print(f'tagged bam: {bamFile}')
+
+        demuxReads = None
+        rejectedReads = None
+        if demuxFastqFiles is not None:
+            firstMate = demuxFastqFiles[0]
+            print(f'\tDemuxed > {firstMate}')
+            if firstMate.endswith('.gz'):
+                demuxReads = pyutils.wccountgz(firstMate) / 4
+            else:
+                demuxReads = pyutils.wccount(firstMate) / 4
+
+        if rejectFastqFiles is not None:
+            firstMate = rejectFastqFiles[0]
+            print(f'\tRejects > {firstMate}')
+            if firstMate.endswith('.gz'):
+                rejectedReads = pyutils.wccountgz(firstMate) / 4
+            else:
+                rejectedReads = pyutils.wccount(firstMate) / 4
+
+        if demuxReads is not None:
+            rc.setRawDemuxCount(demuxReads, paired=True)
+
+            if rejectedReads is not None:
+                rc.setRawReadCount(rejectedReads + demuxReads, paired=True)
+
+        if bamFile is not None and os.path.exists(bamFile):
+            print(f'\tTagged > {bamFile}')
+            with pysam.AlignmentFile(bamFile) as f:
+
+                for i, (R1,R2) in enumerate(MatePairIteratorIncludingNonProper(f)):
+                    for statistic in statistics:
+                        statistic.processRead(R1,R2)
+                    if args.head is not None and i >= (args.head - 1):
+                        break
+        else:
+            print(f'Did not find a bam file at {bamFile}')
+
+        statDict = {}
+
+        if os.path.exists(
+                f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.featureCounts.bam'):
+            cmd = f'samtools view {bamFile} -F 4 -f 64 | cut -f 1 | sort | uniq | wc -l'
+            out = subprocess.Popen(cmd,
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.STDOUT,
+                                   shell=True
+                                   ).communicate()[0]
+            read1mapped = int(out.partition(b' ')[0])
+            cmd = f'samtools view {bamFile} -F 4 -f 128 | cut -f 1 | sort | uniq | wc -l'
+            out = subprocess.Popen(cmd,
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.STDOUT,
+                                   shell=True
+                                   ).communicate()[0]
+            read2mapped = int(out.partition(b' ')[0])
+
+            rc.totalMappedReads['R1'] = read1mapped
+            rc.totalMappedReads['R2'] = read2mapped
+            # Deduplicated reads have RC:i:1 set, -f 64 selects for R2
+            cmd = f'samtools view {bamFile} -F 4 -f 64 | grep RC:i:1 | cut -f 1 | sort | uniq | wc -l'
+            out = subprocess.Popen(cmd,
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.STDOUT,
+                                   shell=True
+                                   ).communicate()[0]
+            read1mappeddedup = int(out.partition(b' ')[0])
+            # Deduplicated reads have RC:i:1 set, -f 128 selects for R2
+            cmd = f'samtools view {bamFile} -F 4 -f 128 | grep RC:i:1 |  cut -f 1 | sort | uniq | wc -l'
+            out = subprocess.Popen(cmd,
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.STDOUT,
+                                   shell=True
+                                   ).communicate()[0]
+            read2mappeddedup = int(out.partition(b' ')[0])
+
+            rc.totalDedupReads['R1'] = read1mappeddedup
+            rc.totalDedupReads['R2'] = read2mappeddedup
+
+        for statistic in statistics:
+            try:
+                print(f'\t{statistic.__class__.__name__}')
+                print(f'\t\t{statistic}\n')
+                statDict[statistic.__class__.__name__] = dict(statistic)
+                print(dict(statistic))
+            except Exception as e:
+                if args.v:
+                    print(e)
+                if args.fatal:
+                    raise
+
+        if bamFile is not None:
+            for statistic in full_file_statistics:
+                try:
+                    with pysam.AlignmentFile(bamFile ) as alignments:
+                        statistic.process_file(alignments)
+                except Exception as e:
+                    if args.v:
+                        print(e)
+                    if args.fatal:
+                        raise
+
+
+
+        # Make plots:
+        if args.o is None:
+            plot_dir = f'{library}/plots'
+            table_dir = f'{library}/tables'
+            statFile = f'{library}/statistics.pickle.gz'
+        else:
+            plot_dir = f'{args.o}/plots'
+            table_dir = f'{args.o}/tables'
+            statFile = f'{args.o}/statistics.pickle.gz'
+
+        if not args.tablesOnly:
+
+            if not os.path.exists(plot_dir):
+                os.makedirs(plot_dir)
+            for statistic in statistics + full_file_statistics:
+                if not hasattr(statistic, 'plot'):
+                    print(
+                        f'Not making a plot for {statistic.__class__.__name__} as no plot method is defined')
+                    continue
+                try:
+                    statistic.plot(
+                        f'{plot_dir}/{statistic.__class__.__name__}.png',
+                        title=library_name)
+                except Exception as e:
+                    if args.v:
+                        import traceback
+                        traceback.print_exc()
+
+        # Make tables:
+        if not args.plotsOnly:
+            with gzip.open(statFile, 'wb') as f:
+                pickle.dump(statDict, f)
+            if os.path.exists(statFile):
+                with gzip.open(statFile, 'rb') as f:
+                    try:
+                        statDict.update(pickle.load(f))
+                    except Exception as e:
+                        if args.fatal:
+                            raise
+
+
+            if not os.path.exists(table_dir):
+                os.makedirs(table_dir)
+            for statistic in statistics + full_file_statistics:
+                if not hasattr(statistic, 'to_csv'):
+                    print(
+                        f'Not making a table for {statistic.__class__.__name__} as to_csv method is not defined')
+                    continue
+                try:
+                    statistic.to_csv(
+                        f'{table_dir}/{statistic.__class__.__name__}_{library_name}.csv')
+                except Exception as e:
+                    if args.fatal:
+                        raise
+                    if args.v:
+                        import traceback
+                        traceback.print_exc()
+
+
+        # Make RT reaction plot:
+        if bamFile is not None and os.path.exists(bamFile):
+            if not args.nort:
+                os.system(
+                    f"bamPlotRTstats.py {bamFile} -head 2_000_000 --notstrict -o {plot_dir}/RT_")