[45ad7e]: / singlecellmultiomics / statistic / methylation.py

Download this file

117 lines (98 with data), 3.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
from .statistic import StatisticHistogram
import singlecellmultiomics.pyutils as pyutils
import collections
import pandas as pd
import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')
class MethylationContextHistogram(StatisticHistogram):
def __init__(self, args):
StatisticHistogram.__init__(self, args)
self.context_obs = collections.Counter() # (bismark_call_tag)=> observations
self.hasdata = False
def processRead(self, R1,R2):
for read in [R2,R2]:
if read is None:
continue
if not read.is_paired or not read.has_tag(
'XM') or not read.is_read1 or read.is_duplicate or read.has_tag('RR'):
return
self.hasdata = True
tags = dict(read.tags)
for tag in 'zhx':
self.context_obs[tag] += tags.get(f's{tag}', 0)
self.context_obs[tag.upper()] += tags.get(f's{tag.upper()}', 0)
def __repr__(self):
return f'Methylation status.'
def get_df(self):
x = self.context_obs
return pd.DataFrame(
{'ratio':
{
'CpG': (x['Z'] / (x['z'] + x['Z'])) if x['z'] + x['Z'] > 0 else 0 ,
'CHG': (x['X'] / (x['x'] + x['X'])) if x['x'] + x['X'] >0 else 0,
'CHH': (x['H'] / (x['h'] + x['H'])) if x['h'] + x['H'] > 0 else 0},
'methylated': {
'CpG': x['Z'],
'CHG': x['X'],
'CHH': x['H']
},
'unmethylated': {
'CpG': x['z'],
'CHG': x['x'],
'CHH': x['h']
}
})
def plot(self, target_path, title=None):
df = self.get_df()
# Methylation ratio plot:
name = 'methylation_pct'
(df['ratio'] * 100).plot.bar()
ax = plt.gca()
for p in ax.patches:
ax.annotate(
f'{p.get_height():.1f}%',
(p.get_x() + p.get_width() / 2, p.get_height() * 1.005),
va='bottom', ha='center')
ax.set_ylim(0, 110)
ax.set_xlabel('Methylation context')
ax.set_ylabel('% methylated')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.show()
if title is not None:
plt.title(title)
plt.tight_layout()
plt.savefig(target_path.replace('.png', f'.{name}.png'))
ax.set_yscale('log')
ax.set_ylim(0.01, 110)
plt.tight_layout()
plt.savefig(target_path.replace('.png', f'.{name}.log.png'))
plt.close()
########
name = 'methylation_absolute'
(df[['methylated', 'unmethylated']]).plot.bar()
ax = plt.gca()
for p in ax.patches:
ax.annotate(
f'{p.get_height()}',
(p.get_x() + p.get_width() / 2, p.get_height() * 1.005),
va='bottom', ha='center')
ax.set_xlabel('Methylation context')
ax.set_ylabel('Bases total')
plt.tight_layout()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
if title is not None:
plt.title(title)
plt.tight_layout()
plt.savefig(target_path.replace('.png', f'.{name}.png'))
ax.set_yscale('log')
plt.tight_layout()
plt.savefig(target_path.replace('.png', f'.{name}.log.png'))
plt.close()
def to_csv(self, path):
self.get_df().to_csv(path)