SingleCellMultiOmics / Git / Diff of /singlecellmultiomics/statistic/readcount.py

Models:
AlyssaS/
SingleCellMultiOmics
Downloads: 1
Diff of /singlecellmultiomics/statistic/readcount.py [000000] .. [2c420a]
Switch to side-by-side view

--- a
+++ b/singlecellmultiomics/statistic/readcount.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import pandas as pd
+import collections
+import matplotlib.pyplot as plt
+from .statistic import Statistic
+import singlecellmultiomics.pyutils as pyutils
+import matplotlib
+matplotlib.rcParams['figure.dpi'] = 160
+matplotlib.use('Agg')
+
+
+class ReadCount(Statistic):
+    def __init__(self, args):
+        Statistic.__init__(self, args)
+        self.totalMappedReads = collections.Counter()
+        self.rawReadCount = None
+        self.unmappedReads = collections.Counter()
+        self.totalDedupReads = collections.Counter()
+        self.totalAssignedSiteReads = collections.Counter({'R1': 0, 'R2': 0})
+        self.rejectionReasons = collections.Counter()
+        self.demuxReadCount = 0
+        self.rawReadCount = 0
+
+    def get_df(self):
+        df = pd.DataFrame.from_dict(dict(self))
+        # They are paired and only present one time in the dict but are
+        # expanded by pandas
+        df['Raw reads'] /= 2
+        df['Demultiplexed reads'] /= 2  # same
+
+        df = df[['Raw reads',
+                 'Demultiplexed reads',
+                 'Mapped reads',
+                 'AssignedSiteReads',
+                 'Deduplicated reads']]
+        return df
+
+    def to_csv(self, path):
+        self.get_df().to_csv(path)
+
+    def plot(self, target_path, title=None):
+        df = self.get_df()  # ,'UnmappedReads']]
+
+        print(df)
+
+        df.plot.bar(figsize=(10, 4)).legend(bbox_to_anchor=(1, 0.98))
+        if title is not None:
+            plt.title(title)
+
+        plt.subplots_adjust(right=0.6)
+        plt.tight_layout()
+        plt.savefig(target_path)
+
+        ax = plt.gca()
+        ax.set_ylim(1, None)
+        ax.set_ylabel('Frequency')
+        ax.set_yscale('log')
+        plt.tight_layout()
+        plt.savefig(target_path.replace('.png', '.log.png'))
+
+        plt.close()
+
+    def processRead(self, R1,R2):
+
+        for read in [R1,R2]:
+            if read is None:
+                continue
+            # Count every read only once
+            if read.is_supplementary or read.is_secondary:
+                return
+
+            if not read.is_unmapped:
+
+                if read.is_read1:
+                    self.totalMappedReads['R1'] += 1
+                elif read.is_read2:
+                    self.totalMappedReads['R2'] += 1
+                else:
+                    self.totalMappedReads['R?'] += 1
+            else:
+                if read.is_read1:
+                    self.unmappedReads['R1'] += 1
+                elif read.is_read2:
+                    self.unmappedReads['R2'] += 1
+                else:
+                    self.unmappedReads['R?'] += 1
+
+            # Compatibility with picard --TAG_DUPLICATE_SET_MEMBERS
+            """
+            java -jar `which picard.jar` MarkDuplicates I=sorted.bam O=marked_duplicates.bam M=marked_dup_metrics.txt TAG_DUPLICATE_SET_MEMBERS=1
+            """
+            if not read.has_tag('MX'):  # Bulk sample
+                if not read.is_unmapped:
+                    if read.is_duplicate:
+                        pass
+                    else:
+                        if read.is_read1:
+                            self.totalDedupReads['R1'] += 1
+                        else:
+                            self.totalDedupReads['R2'] += 1
+
+            else:
+                if not read.is_unmapped:
+                    if (read.has_tag('DS') or read.has_tag('GN')) and not read.is_qcfail:
+                        if read.is_read1:
+                            self.totalAssignedSiteReads['R1'] += 1
+                        elif read.is_read2:
+                            self.totalAssignedSiteReads['R2'] += 1
+                        else:
+                            self.totalAssignedSiteReads['R?'] += 1
+
+                    if not read.is_duplicate:
+
+                        if read.is_read1:
+                            self.totalDedupReads['R1'] += 1
+                        elif read.is_read2:
+                            self.totalDedupReads['R2'] += 1
+                        else:
+                            self.totalDedupReads['R?'] += 1
+
+    def setRawReadCount(self, readCount, paired=True):
+        self.rawReadCount = readCount * (2 if paired else 1)
+
+    def setRawDemuxCount(self, readCount, paired=True):
+        self.demuxReadCount = readCount * (2 if paired else 1)
+
+    def mappability(self):
+        if self.rawReadCount > 0:
+            return sum(self.totalMappedReads.values()) / self.rawReadCount
+        return None
+
+    def __repr__(self):
+        return f"Input:{int(self.rawReadCount)} raw reads, demultiplexed:{self.demuxReadCount}, final mapped: {self.totalMappedReads['R1']} R1 reads, {self.totalMappedReads['R2']} R2 reads, {self.totalMappedReads['R?']} unknown pairing reads, mappability:{self.mappability() if self.mappability() else 'Cannot be determined'}"
+
+    def __iter__(self):
+        yield 'Raw reads', self.rawReadCount
+        yield 'Demultiplexed reads', self.demuxReadCount
+        yield 'Mapped reads', self.totalMappedReads
+        yield 'UnmappedReads', self.unmappedReads
+        # if self.totalAssignedSiteReads['R1']>0 or
+        # self.totalAssignedSiteReads['R2']>0 or
+        # self.totalAssignedSiteReads['R?']>0:
+        yield 'AssignedSiteReads', self.totalAssignedSiteReads
+        yield 'Deduplicated reads', self.totalDedupReads