Switch to unified view

a b/singlecellmultiomics/libraryProcessing/libraryStatistics.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
from singlecellmultiomics.statistic import *
4
import pandas as pd
5
import matplotlib.pyplot as plt
6
import os
7
import sys
8
import pysam
9
import collections
10
import argparse
11
12
from singlecellmultiomics.bamProcessing import bam_is_processed_by_program
13
14
from colorama import Fore
15
from colorama import Back
16
from colorama import Style
17
import singlecellmultiomics.pyutils as pyutils
18
from singlecellmultiomics.tagtools import tagtools
19
from pysamiterators import MatePairIteratorIncludingNonProper
20
import gzip
21
import pickle
22
import subprocess
23
from glob import glob
24
25
import matplotlib
26
matplotlib.rcParams['figure.dpi'] = 160
27
matplotlib.use('Agg')
28
29
30
def select_bam_file(lookup):
31
    for l in lookup:
32
        if os.path.exists(l):
33
            print(f'Found file at {l}')
34
            return l
35
36
    return None
37
38
39
def select_fastq_file(lookup):
40
    for paths in lookup:
41
        if isinstance(paths, tuple):
42
            for path in paths:
43
                if os.path.exists(path):
44
                    return paths
45
        elif os.path.exists(paths):
46
            return (paths,)
47
    return None
48
49
50
if __name__ == '__main__':
51
    argparser = argparse.ArgumentParser(
52
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
53
        description='Obtain statistics from your libraries')
54
    argparser.add_argument(
55
        'libraries',
56
        type=str,
57
        nargs='*',
58
        help="either a library structured folder, or the tagged BAM file")
59
    argparser.add_argument('-t', type=str, default='all-stats',
60
                           help="type of staistics to produce. options are \
61
            'demult-stats', 'meth-stats', 'chic-stats' and 'all-stats'")
62
    argparser.add_argument('-o', type=str, help="output file prefix")
63
    argparser.add_argument(
64
        '--plotsOnly',
65
        action='store_true',
66
        help="only make plots")
67
    argparser.add_argument(
68
        '--fatal',
69
        action='store_true',
70
        help="Fatal error on any issue")
71
    argparser.add_argument(
72
        '--sl',
73
        action='store_true',
74
        help="Show lookup paths")
75
76
77
    argparser.add_argument(
78
        '--tablesOnly',
79
        action='store_true',
80
        help="only make tables")
81
82
    argparser.add_argument('-head', type=int)
83
    argparser.add_argument(
84
        '-tagged_bam',
85
        type=str,
86
        help='Alias of subpath to tagged bam file. For example /tagged/sorted.bam')
87
    argparser.add_argument('--v', action='store_true')
88
    argparser.add_argument('--nort', action='store_true')
89
    argparser.add_argument('--nolorenz', action='store_true')
90
    args = argparser.parse_args()
91
92
    for library in args.libraries:
93
        if library.endswith('.bam'):
94
            # the library is a bam file..
95
            bamFile = os.path.abspath(library)
96
            library = os.path.dirname(os.path.abspath(bamFile))
97
            library_name = os.path.basename(os.path.abspath(bamFile))
98
            print("Bam file was supplied:")
99
            print(bamFile)
100
        else:
101
            print("A library was supplied, automatically detecting files ..")
102
            bamFile = None
103
            library_name = os.path.basename(library)
104
        rc = ReadCount(args)  # Is also mappability
105
106
        statistics = [
107
            rc,
108
            FragmentSizeHistogram(args),
109
            RejectionReasonHistogram(args),
110
            MappingQualityHistogram(args),
111
            OversequencingHistogram(args),
112
            CellReadCount(args)
113
        ]
114
115
116
        full_file_statistics = []
117
118
        if not args.nolorenz:
119
            full_file_statistics.append( Lorenz(args) )
120
121
        if(args.t in ['meth-stats', 'all-stats']):
122
            statistics.extend([
123
                MethylationContextHistogram(args),
124
                ConversionMatrix(args)
125
            ])
126
127
        if(args.t in ['chic-stats', 'all-stats']):
128
            statistics.extend([ScCHICLigation(args)])
129
130
        if(args.t in ['demult-stats', 'all-stats']):
131
            statistics.extend([
132
                TrimmingStats(args),
133
                AlleleHistogram(args),
134
                DataTypeHistogram(args),
135
                TagHistogram(args),
136
                PlateStatistic(args),
137
                PlateStatistic2(args)
138
            ])
139
140
        demuxFastqFilesLookup = [
141
            (f'{library}/demultiplexedR1.fastq.gz',
142
             f'{library}/demultiplexedR2.fastq.gz'),
143
            (f'{library}/demultiplexedR1_val_1.fq.gz',
144
             f'{library}/demultiplexedR2_val_2.fq.gz'),
145
            (f'{library}/demultiplexedR1_val_1.fq',
146
             f'{library}/demultiplexedR2_val_2.fq')]
147
148
        rejectFilesLookup = [
149
            (f'{library}/rejectsR1.fastq.gz', f'{library}/rejectsR2.fastq.gz'),
150
            (f'{library}/rejectsR1.fastq', f'{library}/rejectsR2.fastq'),
151
            (f'{library}/rejectsR1.fq.gz', f'{library}/rejectsR2.fq.gz'),
152
            (f'{library}/rejectsR1.fq', f'{library}/rejectsR2.fq'),
153
        ]
154
155
        taggedFilesLookup = [
156
            f'{library}/tagged.bam',
157
            f'{library}/tagged/tagged.bam',
158
            f'{library}/tagged/marked_duplicates.bam',
159
            f'{library}/tagged/resorted.featureCounts.bam',
160
            f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.featureCounts.bam',
161
            f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.bam',
162
            f'{library}/tagged/sorted.bam']
163
        if args.tagged_bam:
164
            taggedFilesLookup = [
165
                library + '/' + args.tagged_bam] + taggedFilesLookup
166
167
        if args.sl:
168
            print(f'{Style.BRIGHT}Demux file lookup paths{Style.RESET_ALL}')
169
            print(demuxFastqFilesLookup)
170
            print(f'{Style.BRIGHT}Reject fastq file lookup paths{Style.RESET_ALL}')
171
            print(rejectFilesLookup)
172
            print(f'{Style.BRIGHT}Tagged bam lookup paths{Style.RESET_ALL}')
173
            print(taggedFilesLookup)
174
175
        if 'cluster' in library:
176
            continue
177
        print(f'{Style.BRIGHT}Library {library}{Style.RESET_ALL}')
178
        # Check if the bam file is present
179
        if bamFile is None:
180
            bamFile = select_bam_file(taggedFilesLookup)
181
182
        if bamFile is None:
183
            # Perform glob expansion
184
            bams = list(glob(f'{library}/*.bam'))+list(glob(f'{library}/*/*.bam'))
185
            for bam_path in bams:
186
                print(f"Trying {bam_path}",end="\t")
187
                try:
188
                    with pysam.AlignmentFile(bam_path) as a:
189
                        if bam_is_processed_by_program(a, program='bamtagmultiome'):
190
                            bamFile = bam_path
191
                            print("[TAGGED]")
192
                            break
193
                        else:
194
                            print("[NOT TAGGED]")
195
                except Exception as e:
196
                    print(f"[ERROR] {e}")
197
198
        if bamFile is None:
199
            print(f'{Fore.RED}BAM FILE MISSING {library}{Style.RESET_ALL}')
200
            exit()
201
        else:
202
            print(f'{Fore.GREEN}Bam file at {bamFile}{Style.RESET_ALL}')
203
204
        demuxFastqFiles = select_fastq_file(demuxFastqFilesLookup)
205
        rejectFastqFiles = select_fastq_file(rejectFilesLookup)
206
207
        print("Selected files:")
208
        print(f'demultiplexed reads: {demuxFastqFiles}')
209
        print(f'rejected reads: {rejectFastqFiles}')
210
        print(f'tagged bam: {bamFile}')
211
212
        demuxReads = None
213
        rejectedReads = None
214
        if demuxFastqFiles is not None:
215
            firstMate = demuxFastqFiles[0]
216
            print(f'\tDemuxed > {firstMate}')
217
            if firstMate.endswith('.gz'):
218
                demuxReads = pyutils.wccountgz(firstMate) / 4
219
            else:
220
                demuxReads = pyutils.wccount(firstMate) / 4
221
222
        if rejectFastqFiles is not None:
223
            firstMate = rejectFastqFiles[0]
224
            print(f'\tRejects > {firstMate}')
225
            if firstMate.endswith('.gz'):
226
                rejectedReads = pyutils.wccountgz(firstMate) / 4
227
            else:
228
                rejectedReads = pyutils.wccount(firstMate) / 4
229
230
        if demuxReads is not None:
231
            rc.setRawDemuxCount(demuxReads, paired=True)
232
233
            if rejectedReads is not None:
234
                rc.setRawReadCount(rejectedReads + demuxReads, paired=True)
235
236
        if bamFile is not None and os.path.exists(bamFile):
237
            print(f'\tTagged > {bamFile}')
238
            with pysam.AlignmentFile(bamFile) as f:
239
240
                for i, (R1,R2) in enumerate(MatePairIteratorIncludingNonProper(f)):
241
                    for statistic in statistics:
242
                        statistic.processRead(R1,R2)
243
                    if args.head is not None and i >= (args.head - 1):
244
                        break
245
        else:
246
            print(f'Did not find a bam file at {bamFile}')
247
248
        statDict = {}
249
250
        if os.path.exists(
251
                f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.featureCounts.bam'):
252
            cmd = f'samtools view {bamFile} -F 4 -f 64 | cut -f 1 | sort | uniq | wc -l'
253
            out = subprocess.Popen(cmd,
254
                                   stdout=subprocess.PIPE,
255
                                   stderr=subprocess.STDOUT,
256
                                   shell=True
257
                                   ).communicate()[0]
258
            read1mapped = int(out.partition(b' ')[0])
259
            cmd = f'samtools view {bamFile} -F 4 -f 128 | cut -f 1 | sort | uniq | wc -l'
260
            out = subprocess.Popen(cmd,
261
                                   stdout=subprocess.PIPE,
262
                                   stderr=subprocess.STDOUT,
263
                                   shell=True
264
                                   ).communicate()[0]
265
            read2mapped = int(out.partition(b' ')[0])
266
267
            rc.totalMappedReads['R1'] = read1mapped
268
            rc.totalMappedReads['R2'] = read2mapped
269
            # Deduplicated reads have RC:i:1 set, -f 64 selects for R2
270
            cmd = f'samtools view {bamFile} -F 4 -f 64 | grep RC:i:1 | cut -f 1 | sort | uniq | wc -l'
271
            out = subprocess.Popen(cmd,
272
                                   stdout=subprocess.PIPE,
273
                                   stderr=subprocess.STDOUT,
274
                                   shell=True
275
                                   ).communicate()[0]
276
            read1mappeddedup = int(out.partition(b' ')[0])
277
            # Deduplicated reads have RC:i:1 set, -f 128 selects for R2
278
            cmd = f'samtools view {bamFile} -F 4 -f 128 | grep RC:i:1 |  cut -f 1 | sort | uniq | wc -l'
279
            out = subprocess.Popen(cmd,
280
                                   stdout=subprocess.PIPE,
281
                                   stderr=subprocess.STDOUT,
282
                                   shell=True
283
                                   ).communicate()[0]
284
            read2mappeddedup = int(out.partition(b' ')[0])
285
286
            rc.totalDedupReads['R1'] = read1mappeddedup
287
            rc.totalDedupReads['R2'] = read2mappeddedup
288
289
        for statistic in statistics:
290
            try:
291
                print(f'\t{statistic.__class__.__name__}')
292
                print(f'\t\t{statistic}\n')
293
                statDict[statistic.__class__.__name__] = dict(statistic)
294
                print(dict(statistic))
295
            except Exception as e:
296
                if args.v:
297
                    print(e)
298
                if args.fatal:
299
                    raise
300
301
        if bamFile is not None:
302
            for statistic in full_file_statistics:
303
                try:
304
                    with pysam.AlignmentFile(bamFile ) as alignments:
305
                        statistic.process_file(alignments)
306
                except Exception as e:
307
                    if args.v:
308
                        print(e)
309
                    if args.fatal:
310
                        raise
311
312
313
314
        # Make plots:
315
        if args.o is None:
316
            plot_dir = f'{library}/plots'
317
            table_dir = f'{library}/tables'
318
            statFile = f'{library}/statistics.pickle.gz'
319
        else:
320
            plot_dir = f'{args.o}/plots'
321
            table_dir = f'{args.o}/tables'
322
            statFile = f'{args.o}/statistics.pickle.gz'
323
324
        if not args.tablesOnly:
325
326
            if not os.path.exists(plot_dir):
327
                os.makedirs(plot_dir)
328
            for statistic in statistics + full_file_statistics:
329
                if not hasattr(statistic, 'plot'):
330
                    print(
331
                        f'Not making a plot for {statistic.__class__.__name__} as no plot method is defined')
332
                    continue
333
                try:
334
                    statistic.plot(
335
                        f'{plot_dir}/{statistic.__class__.__name__}.png',
336
                        title=library_name)
337
                except Exception as e:
338
                    if args.v:
339
                        import traceback
340
                        traceback.print_exc()
341
342
        # Make tables:
343
        if not args.plotsOnly:
344
            with gzip.open(statFile, 'wb') as f:
345
                pickle.dump(statDict, f)
346
            if os.path.exists(statFile):
347
                with gzip.open(statFile, 'rb') as f:
348
                    try:
349
                        statDict.update(pickle.load(f))
350
                    except Exception as e:
351
                        if args.fatal:
352
                            raise
353
354
355
            if not os.path.exists(table_dir):
356
                os.makedirs(table_dir)
357
            for statistic in statistics + full_file_statistics:
358
                if not hasattr(statistic, 'to_csv'):
359
                    print(
360
                        f'Not making a table for {statistic.__class__.__name__} as to_csv method is not defined')
361
                    continue
362
                try:
363
                    statistic.to_csv(
364
                        f'{table_dir}/{statistic.__class__.__name__}_{library_name}.csv')
365
                except Exception as e:
366
                    if args.fatal:
367
                        raise
368
                    if args.v:
369
                        import traceback
370
                        traceback.print_exc()
371
372
373
        # Make RT reaction plot:
374
        if bamFile is not None and os.path.exists(bamFile):
375
            if not args.nort:
376
                os.system(
377
                    f"bamPlotRTstats.py {bamFile} -head 2_000_000 --notstrict -o {plot_dir}/RT_")