|
a |
|
b/singlecellmultiomics/statistic/readcount.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
|
|
|
4 |
import numpy as np |
|
|
5 |
import pandas as pd |
|
|
6 |
import collections |
|
|
7 |
import matplotlib.pyplot as plt |
|
|
8 |
from .statistic import Statistic |
|
|
9 |
import singlecellmultiomics.pyutils as pyutils |
|
|
10 |
import matplotlib |
|
|
11 |
matplotlib.rcParams['figure.dpi'] = 160 |
|
|
12 |
matplotlib.use('Agg') |
|
|
13 |
|
|
|
14 |
|
|
|
15 |
class ReadCount(Statistic): |
|
|
16 |
def __init__(self, args): |
|
|
17 |
Statistic.__init__(self, args) |
|
|
18 |
self.totalMappedReads = collections.Counter() |
|
|
19 |
self.rawReadCount = None |
|
|
20 |
self.unmappedReads = collections.Counter() |
|
|
21 |
self.totalDedupReads = collections.Counter() |
|
|
22 |
self.totalAssignedSiteReads = collections.Counter({'R1': 0, 'R2': 0}) |
|
|
23 |
self.rejectionReasons = collections.Counter() |
|
|
24 |
self.demuxReadCount = 0 |
|
|
25 |
self.rawReadCount = 0 |
|
|
26 |
|
|
|
27 |
def get_df(self): |
|
|
28 |
df = pd.DataFrame.from_dict(dict(self)) |
|
|
29 |
# They are paired and only present one time in the dict but are |
|
|
30 |
# expanded by pandas |
|
|
31 |
df['Raw reads'] /= 2 |
|
|
32 |
df['Demultiplexed reads'] /= 2 # same |
|
|
33 |
|
|
|
34 |
df = df[['Raw reads', |
|
|
35 |
'Demultiplexed reads', |
|
|
36 |
'Mapped reads', |
|
|
37 |
'AssignedSiteReads', |
|
|
38 |
'Deduplicated reads']] |
|
|
39 |
return df |
|
|
40 |
|
|
|
41 |
def to_csv(self, path): |
|
|
42 |
self.get_df().to_csv(path) |
|
|
43 |
|
|
|
44 |
def plot(self, target_path, title=None): |
|
|
45 |
df = self.get_df() # ,'UnmappedReads']] |
|
|
46 |
|
|
|
47 |
print(df) |
|
|
48 |
|
|
|
49 |
df.plot.bar(figsize=(10, 4)).legend(bbox_to_anchor=(1, 0.98)) |
|
|
50 |
if title is not None: |
|
|
51 |
plt.title(title) |
|
|
52 |
|
|
|
53 |
plt.subplots_adjust(right=0.6) |
|
|
54 |
plt.tight_layout() |
|
|
55 |
plt.savefig(target_path) |
|
|
56 |
|
|
|
57 |
ax = plt.gca() |
|
|
58 |
ax.set_ylim(1, None) |
|
|
59 |
ax.set_ylabel('Frequency') |
|
|
60 |
ax.set_yscale('log') |
|
|
61 |
plt.tight_layout() |
|
|
62 |
plt.savefig(target_path.replace('.png', '.log.png')) |
|
|
63 |
|
|
|
64 |
plt.close() |
|
|
65 |
|
|
|
66 |
def processRead(self, R1,R2): |
|
|
67 |
|
|
|
68 |
for read in [R1,R2]: |
|
|
69 |
if read is None: |
|
|
70 |
continue |
|
|
71 |
# Count every read only once |
|
|
72 |
if read.is_supplementary or read.is_secondary: |
|
|
73 |
return |
|
|
74 |
|
|
|
75 |
if not read.is_unmapped: |
|
|
76 |
|
|
|
77 |
if read.is_read1: |
|
|
78 |
self.totalMappedReads['R1'] += 1 |
|
|
79 |
elif read.is_read2: |
|
|
80 |
self.totalMappedReads['R2'] += 1 |
|
|
81 |
else: |
|
|
82 |
self.totalMappedReads['R?'] += 1 |
|
|
83 |
else: |
|
|
84 |
if read.is_read1: |
|
|
85 |
self.unmappedReads['R1'] += 1 |
|
|
86 |
elif read.is_read2: |
|
|
87 |
self.unmappedReads['R2'] += 1 |
|
|
88 |
else: |
|
|
89 |
self.unmappedReads['R?'] += 1 |
|
|
90 |
|
|
|
91 |
# Compatibility with picard --TAG_DUPLICATE_SET_MEMBERS |
|
|
92 |
""" |
|
|
93 |
java -jar `which picard.jar` MarkDuplicates I=sorted.bam O=marked_duplicates.bam M=marked_dup_metrics.txt TAG_DUPLICATE_SET_MEMBERS=1 |
|
|
94 |
""" |
|
|
95 |
if not read.has_tag('MX'): # Bulk sample |
|
|
96 |
if not read.is_unmapped: |
|
|
97 |
if read.is_duplicate: |
|
|
98 |
pass |
|
|
99 |
else: |
|
|
100 |
if read.is_read1: |
|
|
101 |
self.totalDedupReads['R1'] += 1 |
|
|
102 |
else: |
|
|
103 |
self.totalDedupReads['R2'] += 1 |
|
|
104 |
|
|
|
105 |
else: |
|
|
106 |
if not read.is_unmapped: |
|
|
107 |
if (read.has_tag('DS') or read.has_tag('GN')) and not read.is_qcfail: |
|
|
108 |
if read.is_read1: |
|
|
109 |
self.totalAssignedSiteReads['R1'] += 1 |
|
|
110 |
elif read.is_read2: |
|
|
111 |
self.totalAssignedSiteReads['R2'] += 1 |
|
|
112 |
else: |
|
|
113 |
self.totalAssignedSiteReads['R?'] += 1 |
|
|
114 |
|
|
|
115 |
if not read.is_duplicate: |
|
|
116 |
|
|
|
117 |
if read.is_read1: |
|
|
118 |
self.totalDedupReads['R1'] += 1 |
|
|
119 |
elif read.is_read2: |
|
|
120 |
self.totalDedupReads['R2'] += 1 |
|
|
121 |
else: |
|
|
122 |
self.totalDedupReads['R?'] += 1 |
|
|
123 |
|
|
|
124 |
def setRawReadCount(self, readCount, paired=True): |
|
|
125 |
self.rawReadCount = readCount * (2 if paired else 1) |
|
|
126 |
|
|
|
127 |
def setRawDemuxCount(self, readCount, paired=True): |
|
|
128 |
self.demuxReadCount = readCount * (2 if paired else 1) |
|
|
129 |
|
|
|
130 |
def mappability(self): |
|
|
131 |
if self.rawReadCount > 0: |
|
|
132 |
return sum(self.totalMappedReads.values()) / self.rawReadCount |
|
|
133 |
return None |
|
|
134 |
|
|
|
135 |
def __repr__(self): |
|
|
136 |
return f"Input:{int(self.rawReadCount)} raw reads, demultiplexed:{self.demuxReadCount}, final mapped: {self.totalMappedReads['R1']} R1 reads, {self.totalMappedReads['R2']} R2 reads, {self.totalMappedReads['R?']} unknown pairing reads, mappability:{self.mappability() if self.mappability() else 'Cannot be determined'}" |
|
|
137 |
|
|
|
138 |
def __iter__(self): |
|
|
139 |
yield 'Raw reads', self.rawReadCount |
|
|
140 |
yield 'Demultiplexed reads', self.demuxReadCount |
|
|
141 |
yield 'Mapped reads', self.totalMappedReads |
|
|
142 |
yield 'UnmappedReads', self.unmappedReads |
|
|
143 |
# if self.totalAssignedSiteReads['R1']>0 or |
|
|
144 |
# self.totalAssignedSiteReads['R2']>0 or |
|
|
145 |
# self.totalAssignedSiteReads['R?']>0: |
|
|
146 |
yield 'AssignedSiteReads', self.totalAssignedSiteReads |
|
|
147 |
yield 'Deduplicated reads', self.totalDedupReads |