|
a |
|
b/singlecellmultiomics/statistic/mappingquality.py |
|
|
1 |
from .statistic import StatisticHistogram |
|
|
2 |
import singlecellmultiomics.pyutils as pyutils |
|
|
3 |
import collections |
|
|
4 |
import pandas as pd |
|
|
5 |
import matplotlib.pyplot as plt |
|
|
6 |
import seaborn as sns |
|
|
7 |
|
|
|
8 |
|
|
|
9 |
class MappingQualityHistogram(StatisticHistogram):
    """Histogram of the mapping qualities of the reads passed to it.

    ``self.histogram`` is a ``collections.Counter`` keyed by mapping quality;
    each value is the number of reads observed with that quality.
    """

    def __init__(self, args):
        """Initialise the base statistic and start with an empty histogram."""
        super().__init__(args)
        self.histogram = collections.Counter()

    def processRead(self, R1, R2):
        """Count the mapping quality of every mate that is present.

        Args:
            R1: First mate, or ``None`` when it is absent.
            R2: Second mate, or ``None`` when it is absent.

        Each mate that is not ``None`` must expose a ``mapping_quality``
        attribute (presumably a pysam aligned segment; verify against the
        caller).
        """
        for read in (R1, R2):
            if read is None:
                continue
            self.histogram[read.mapping_quality] += 1

    def __repr__(self):
        """Summarise the histogram as its mean and standard deviation."""
        if not self.histogram:
            # A mean and a standard deviation are undefined without any
            # observation, so report that instead of calling the helpers.
            return 'No mapping qualities recorded'

        mean = pyutils.meanOfCounter(self.histogram)
        # The message is labelled "SD", so report the square root of the
        # variance rather than the variance itself.
        # NOTE(review): assumes varianceOfCounter returns the variance, as
        # its name states -- confirm in pyutils.
        standard_deviation = pyutils.varianceOfCounter(self.histogram) ** 0.5
        return f'The average mapping quality is {mean}, SD:{standard_deviation}'

    def get_df(self):
        """Return the histogram as a DataFrame with a single ``mq`` column.

        The index holds the mapping qualities, the column the read counts.
        """
        return pd.DataFrame.from_dict({'mq': self.histogram})

    def to_csv(self, path):
        """Write the table produced by :meth:`get_df` to ``path`` as CSV."""
        self.get_df().to_csv(path)

    @staticmethod
    def _log_scale_path(target_path):
        """Return the file name used for the log-scaled variant of a plot.

        ``.log`` is inserted in front of the file extension, so
        ``plot.png`` becomes ``plot.log.png`` and ``plot.pdf`` becomes
        ``plot.log.pdf``. Only the final extension is touched; directory
        names containing ``.png`` are left alone.
        """
        # Local import keeps this block independent of the file's import
        # section.
        import os

        root, extension = os.path.splitext(target_path)
        if not extension:
            # Without an extension matplotlib would take ".log" as the
            # output format; fall back to PNG.
            extension = '.png'
        return f'{root}.log{extension}'

    def plot(self, target_path, title=None):
        """Save the histogram as a bar chart, linear and log-scaled.

        Args:
            target_path: Destination of the linear-scale figure. The
                log-scale figure is written next to it with ``.log``
                inserted before the extension.
            title: Optional figure title.
        """
        # Sort so the bars run from the lowest to the highest mapping
        # quality instead of following the order in which they were seen.
        counts = self.get_df()['mq'].sort_index()

        ax = counts.plot.bar(figsize=(10, 4))
        try:
            ax.set_xlabel('Mapping quality')
            ax.set_ylabel('Frequency (reads)')

            if title is not None:
                plt.title(title)
            plt.tight_layout()
            plt.savefig(target_path)

            # Same figure again, now with a logarithmic y axis.
            ax.set_yscale('log')
            sns.despine()
            plt.tight_layout()
            plt.savefig(self._log_scale_path(target_path))
        finally:
            # Release the figure even when labelling or saving fails.
            plt.close(ax.get_figure())