Switch to unified view

a b/singlecellmultiomics/statistic/readcount.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
4
import numpy as np
5
import pandas as pd
6
import collections
7
import matplotlib.pyplot as plt
8
from .statistic import Statistic
9
import singlecellmultiomics.pyutils as pyutils
10
import matplotlib
11
matplotlib.rcParams['figure.dpi'] = 160
12
matplotlib.use('Agg')
13
14
15
class ReadCount(Statistic):
16
    def __init__(self, args):
17
        Statistic.__init__(self, args)
18
        self.totalMappedReads = collections.Counter()
19
        self.rawReadCount = None
20
        self.unmappedReads = collections.Counter()
21
        self.totalDedupReads = collections.Counter()
22
        self.totalAssignedSiteReads = collections.Counter({'R1': 0, 'R2': 0})
23
        self.rejectionReasons = collections.Counter()
24
        self.demuxReadCount = 0
25
        self.rawReadCount = 0
26
27
    def get_df(self):
28
        df = pd.DataFrame.from_dict(dict(self))
29
        # They are paired and only present one time in the dict but are
30
        # expanded by pandas
31
        df['Raw reads'] /= 2
32
        df['Demultiplexed reads'] /= 2  # same
33
34
        df = df[['Raw reads',
35
                 'Demultiplexed reads',
36
                 'Mapped reads',
37
                 'AssignedSiteReads',
38
                 'Deduplicated reads']]
39
        return df
40
41
    def to_csv(self, path):
42
        self.get_df().to_csv(path)
43
44
    def plot(self, target_path, title=None):
45
        df = self.get_df()  # ,'UnmappedReads']]
46
47
        print(df)
48
49
        df.plot.bar(figsize=(10, 4)).legend(bbox_to_anchor=(1, 0.98))
50
        if title is not None:
51
            plt.title(title)
52
53
        plt.subplots_adjust(right=0.6)
54
        plt.tight_layout()
55
        plt.savefig(target_path)
56
57
        ax = plt.gca()
58
        ax.set_ylim(1, None)
59
        ax.set_ylabel('Frequency')
60
        ax.set_yscale('log')
61
        plt.tight_layout()
62
        plt.savefig(target_path.replace('.png', '.log.png'))
63
64
        plt.close()
65
66
    def processRead(self, R1,R2):
67
68
        for read in [R1,R2]:
69
            if read is None:
70
                continue
71
            # Count every read only once
72
            if read.is_supplementary or read.is_secondary:
73
                return
74
75
            if not read.is_unmapped:
76
77
                if read.is_read1:
78
                    self.totalMappedReads['R1'] += 1
79
                elif read.is_read2:
80
                    self.totalMappedReads['R2'] += 1
81
                else:
82
                    self.totalMappedReads['R?'] += 1
83
            else:
84
                if read.is_read1:
85
                    self.unmappedReads['R1'] += 1
86
                elif read.is_read2:
87
                    self.unmappedReads['R2'] += 1
88
                else:
89
                    self.unmappedReads['R?'] += 1
90
91
            # Compatibility with picard --TAG_DUPLICATE_SET_MEMBERS
92
            """
93
            java -jar `which picard.jar` MarkDuplicates I=sorted.bam O=marked_duplicates.bam M=marked_dup_metrics.txt TAG_DUPLICATE_SET_MEMBERS=1
94
            """
95
            if not read.has_tag('MX'):  # Bulk sample
96
                if not read.is_unmapped:
97
                    if read.is_duplicate:
98
                        pass
99
                    else:
100
                        if read.is_read1:
101
                            self.totalDedupReads['R1'] += 1
102
                        else:
103
                            self.totalDedupReads['R2'] += 1
104
105
            else:
106
                if not read.is_unmapped:
107
                    if (read.has_tag('DS') or read.has_tag('GN')) and not read.is_qcfail:
108
                        if read.is_read1:
109
                            self.totalAssignedSiteReads['R1'] += 1
110
                        elif read.is_read2:
111
                            self.totalAssignedSiteReads['R2'] += 1
112
                        else:
113
                            self.totalAssignedSiteReads['R?'] += 1
114
115
                    if not read.is_duplicate:
116
117
                        if read.is_read1:
118
                            self.totalDedupReads['R1'] += 1
119
                        elif read.is_read2:
120
                            self.totalDedupReads['R2'] += 1
121
                        else:
122
                            self.totalDedupReads['R?'] += 1
123
124
    def setRawReadCount(self, readCount, paired=True):
125
        self.rawReadCount = readCount * (2 if paired else 1)
126
127
    def setRawDemuxCount(self, readCount, paired=True):
128
        self.demuxReadCount = readCount * (2 if paired else 1)
129
130
    def mappability(self):
131
        if self.rawReadCount > 0:
132
            return sum(self.totalMappedReads.values()) / self.rawReadCount
133
        return None
134
135
    def __repr__(self):
136
        return f"Input:{int(self.rawReadCount)} raw reads, demultiplexed:{self.demuxReadCount}, final mapped: {self.totalMappedReads['R1']} R1 reads, {self.totalMappedReads['R2']} R2 reads, {self.totalMappedReads['R?']} unknown pairing reads, mappability:{self.mappability() if self.mappability() else 'Cannot be determined'}"
137
138
    def __iter__(self):
139
        yield 'Raw reads', self.rawReadCount
140
        yield 'Demultiplexed reads', self.demuxReadCount
141
        yield 'Mapped reads', self.totalMappedReads
142
        yield 'UnmappedReads', self.unmappedReads
143
        # if self.totalAssignedSiteReads['R1']>0 or
144
        # self.totalAssignedSiteReads['R2']>0 or
145
        # self.totalAssignedSiteReads['R?']>0:
146
        yield 'AssignedSiteReads', self.totalAssignedSiteReads
147
        yield 'Deduplicated reads', self.totalDedupReads