[45ad7e]: / singlecellmultiomics / statistic / fragmentsize.py

Download this file

134 lines (107 with data), 4.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
from .statistic import StatisticHistogram
import singlecellmultiomics.pyutils as pyutils
import collections
import seaborn as sns
import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')
def readIsDuplicate(read):
return (read.has_tag('RC') and read.get_tag('RC') > 1) or read.is_duplicate
class FragmentSizeHistogram(StatisticHistogram):
def __init__(self, args):
StatisticHistogram.__init__(self, args)
self.histogram = collections.Counter()
self.histogramReject = collections.Counter()
self.histogramAccept = collections.Counter()
self.histogramMolecule = collections.Counter()
def processRead(self, R1,R2=None):
fragmentSize = None
moleculeSize = None
qcfail =False
for read in [R1,R2]:
if read is None:
continue
if read.is_qcfail:
qcfail = True
if read.has_tag('ms') and not read.is_duplicate:
moleculeSize = read.get_tag('ms')
if read.has_tag('fS'):
fragmentSize = read.get_tag('fS')
break
if fragmentSize is None:
for read in [R1,R2]:
if read is None:
continue
if read.is_qcfail:
qcfail = True
if read.isize !=0:
fragmentSize = abs(read.isize)
break
if fragmentSize is None or fragmentSize > 1_000:
return
#print(fragmentSize, read.reference_start, read.reference_end,mateStart,readLen )
self.histogram[fragmentSize] += 1
if qcfail:
self.histogramReject[fragmentSize] += 1
else:
if moleculeSize is not None:
self.histogramMolecule[fragmentSize] += 1
self.histogramAccept[fragmentSize] += 1
def __repr__(self):
return f'The average fragment size is {pyutils.meanOfCounter(self.histogram)}, SD:{pyutils.varianceOfCounter(self.histogram)}'
def plot(self, target_path, title=None):
fig, ax = plt.subplots()
ax.bar(
list(
self.histogram.keys()), list(
self.histogram.values()), width=1)
if title is not None:
ax.set_title(title)
ax.set_xlabel("Fragment size [bp]")
ax.set_ylabel("# Fragments")
plt.tight_layout()
plt.savefig(target_path)
sns.despine()
plt.close()
fig, ax = plt.subplots()
plt.bar(
list(
self.histogramReject.keys()), list(
self.histogramReject.values()), width=1, color='r')
if title is not None:
plt.title(title)
ax.set_xlabel("Rejected fragment size [bp]")
ax.set_ylabel("# Fragments")
plt.tight_layout()
sns.despine()
plt.savefig(target_path.replace('.png', '.rejected.png'))
plt.close()
fig, ax = plt.subplots()
ax.bar(
list(
self.histogramAccept.keys()), list(
self.histogramAccept.values()), width=1, color='g')
if title is not None:
plt.title(title)
ax.set_xlabel("Accepted fragment size [bp]")
ax.set_ylabel("# Fragments")
plt.tight_layout()
sns.despine()
plt.savefig(target_path.replace('.png', '.accepted.png'))
plt.close()
fig, ax = plt.subplots()
ax.bar(
list(
self.histogramMolecule.keys()), list(
self.histogramMolecule.values()), width=1, color='g')
if title is not None:
plt.title(title)
sns.despine()
ax.set_xlabel("Molecule size [bp]")
ax.set_ylabel("# Molecules")
plt.tight_layout()
plt.savefig(target_path.replace('.png', '.molecules.png'))
plt.close()