|
a |
|
b/singlecellmultiomics/statistic/methylation.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
import matplotlib.pyplot as plt |
|
|
4 |
from .statistic import StatisticHistogram |
|
|
5 |
import singlecellmultiomics.pyutils as pyutils |
|
|
6 |
import collections |
|
|
7 |
import pandas as pd |
|
|
8 |
import matplotlib |
|
|
9 |
matplotlib.rcParams['figure.dpi'] = 160 |
|
|
10 |
matplotlib.use('Agg') |
|
|
11 |
|
|
|
12 |
|
|
|
13 |
class MethylationContextHistogram(StatisticHistogram): |
|
|
14 |
def __init__(self, args): |
|
|
15 |
StatisticHistogram.__init__(self, args) |
|
|
16 |
self.context_obs = collections.Counter() # (bismark_call_tag)=> observations |
|
|
17 |
self.hasdata = False |
|
|
18 |
def processRead(self, R1,R2): |
|
|
19 |
|
|
|
20 |
for read in [R2,R2]: |
|
|
21 |
if read is None: |
|
|
22 |
continue |
|
|
23 |
|
|
|
24 |
if not read.is_paired or not read.has_tag( |
|
|
25 |
'XM') or not read.is_read1 or read.is_duplicate or read.has_tag('RR'): |
|
|
26 |
return |
|
|
27 |
self.hasdata = True |
|
|
28 |
tags = dict(read.tags) |
|
|
29 |
for tag in 'zhx': |
|
|
30 |
self.context_obs[tag] += tags.get(f's{tag}', 0) |
|
|
31 |
self.context_obs[tag.upper()] += tags.get(f's{tag.upper()}', 0) |
|
|
32 |
|
|
|
33 |
def __repr__(self): |
|
|
34 |
return f'Methylation status.' |
|
|
35 |
|
|
|
36 |
def get_df(self): |
|
|
37 |
|
|
|
38 |
x = self.context_obs |
|
|
39 |
|
|
|
40 |
return pd.DataFrame( |
|
|
41 |
{'ratio': |
|
|
42 |
{ |
|
|
43 |
'CpG': (x['Z'] / (x['z'] + x['Z'])) if x['z'] + x['Z'] > 0 else 0 , |
|
|
44 |
'CHG': (x['X'] / (x['x'] + x['X'])) if x['x'] + x['X'] >0 else 0, |
|
|
45 |
'CHH': (x['H'] / (x['h'] + x['H'])) if x['h'] + x['H'] > 0 else 0}, |
|
|
46 |
'methylated': { |
|
|
47 |
'CpG': x['Z'], |
|
|
48 |
'CHG': x['X'], |
|
|
49 |
'CHH': x['H'] |
|
|
50 |
}, |
|
|
51 |
'unmethylated': { |
|
|
52 |
'CpG': x['z'], |
|
|
53 |
'CHG': x['x'], |
|
|
54 |
'CHH': x['h'] |
|
|
55 |
} |
|
|
56 |
}) |
|
|
57 |
|
|
|
58 |
def plot(self, target_path, title=None): |
|
|
59 |
|
|
|
60 |
df = self.get_df() |
|
|
61 |
# Methylation ratio plot: |
|
|
62 |
name = 'methylation_pct' |
|
|
63 |
|
|
|
64 |
(df['ratio'] * 100).plot.bar() |
|
|
65 |
ax = plt.gca() |
|
|
66 |
for p in ax.patches: |
|
|
67 |
ax.annotate( |
|
|
68 |
f'{p.get_height():.1f}%', |
|
|
69 |
(p.get_x() + p.get_width() / 2, p.get_height() * 1.005), |
|
|
70 |
va='bottom', ha='center') |
|
|
71 |
ax.set_ylim(0, 110) |
|
|
72 |
ax.set_xlabel('Methylation context') |
|
|
73 |
ax.set_ylabel('% methylated') |
|
|
74 |
ax.spines['right'].set_visible(False) |
|
|
75 |
ax.spines['top'].set_visible(False) |
|
|
76 |
plt.show() |
|
|
77 |
if title is not None: |
|
|
78 |
plt.title(title) |
|
|
79 |
plt.tight_layout() |
|
|
80 |
plt.savefig(target_path.replace('.png', f'.{name}.png')) |
|
|
81 |
|
|
|
82 |
ax.set_yscale('log') |
|
|
83 |
ax.set_ylim(0.01, 110) |
|
|
84 |
|
|
|
85 |
plt.tight_layout() |
|
|
86 |
plt.savefig(target_path.replace('.png', f'.{name}.log.png')) |
|
|
87 |
plt.close() |
|
|
88 |
######## |
|
|
89 |
name = 'methylation_absolute' |
|
|
90 |
|
|
|
91 |
(df[['methylated', 'unmethylated']]).plot.bar() |
|
|
92 |
ax = plt.gca() |
|
|
93 |
for p in ax.patches: |
|
|
94 |
ax.annotate( |
|
|
95 |
f'{p.get_height()}', |
|
|
96 |
(p.get_x() + p.get_width() / 2, p.get_height() * 1.005), |
|
|
97 |
va='bottom', ha='center') |
|
|
98 |
|
|
|
99 |
ax.set_xlabel('Methylation context') |
|
|
100 |
ax.set_ylabel('Bases total') |
|
|
101 |
plt.tight_layout() |
|
|
102 |
ax.spines['right'].set_visible(False) |
|
|
103 |
ax.spines['top'].set_visible(False) |
|
|
104 |
if title is not None: |
|
|
105 |
plt.title(title) |
|
|
106 |
plt.tight_layout() |
|
|
107 |
plt.savefig(target_path.replace('.png', f'.{name}.png')) |
|
|
108 |
|
|
|
109 |
ax.set_yscale('log') |
|
|
110 |
plt.tight_layout() |
|
|
111 |
plt.savefig(target_path.replace('.png', f'.{name}.log.png')) |
|
|
112 |
plt.close() |
|
|
113 |
|
|
|
114 |
def to_csv(self, path): |
|
|
115 |
|
|
|
116 |
self.get_df().to_csv(path) |