|
a |
|
b/singlecellmultiomics/statistic/oversequencing.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
from matplotlib.ticker import MaxNLocator |
|
|
4 |
import matplotlib.pyplot as plt |
|
|
5 |
from .statistic import StatisticHistogram |
|
|
6 |
import singlecellmultiomics.pyutils as pyutils |
|
|
7 |
import collections |
|
|
8 |
|
|
|
9 |
import matplotlib |
|
|
10 |
matplotlib.rcParams['figure.dpi'] = 160 |
|
|
11 |
matplotlib.use('Agg') |
|
|
12 |
|
|
|
13 |
|
|
|
14 |
class OversequencingHistogram(StatisticHistogram): |
|
|
15 |
def __init__(self, args): |
|
|
16 |
StatisticHistogram.__init__(self, args) |
|
|
17 |
self.histogram = collections.Counter() |
|
|
18 |
|
|
|
19 |
def processRead(self, R1,R2=None): |
|
|
20 |
|
|
|
21 |
for read in [R1, R2]: |
|
|
22 |
if read is None: |
|
|
23 |
continue |
|
|
24 |
|
|
|
25 |
if read.has_tag('RC'): |
|
|
26 |
overseq = read.get_tag('RC') |
|
|
27 |
self.histogram[overseq] += 1 |
|
|
28 |
if overseq > 1: |
|
|
29 |
self.histogram[overseq - 1] -= 1 |
|
|
30 |
|
|
|
31 |
# Compatibility with picard --TAG_DUPLICATE_SET_MEMBERS |
|
|
32 |
""" |
|
|
33 |
java -jar `which picard.jar` MarkDuplicates I=sorted.bam O=marked_duplicates.bam M=marked_dup_metrics.txt TAG_DUPLICATE_SET_MEMBERS=1 |
|
|
34 |
""" |
|
|
35 |
elif read.has_tag('PG'): |
|
|
36 |
if read.has_tag('DS') and not read.is_duplicate: |
|
|
37 |
self.histogram[read.get_tag('DS')] += 1 |
|
|
38 |
else: |
|
|
39 |
self.histogram[1] += 1 |
|
|
40 |
|
|
|
41 |
break |
|
|
42 |
|
|
|
43 |
def __repr__(self): |
|
|
44 |
return f'The average oversequencing is {pyutils.meanOfCounter(self.histogram)}, SD:{pyutils.varianceOfCounter(self.histogram)}' |
|
|
45 |
|
|
|
46 |
def plot(self, target_path, title=None): |
|
|
47 |
fig, ax = plt.subplots() |
|
|
48 |
overseqRange = list(range(1, 20)) |
|
|
49 |
|
|
|
50 |
ax.scatter(overseqRange, [self.histogram[x] for x in overseqRange]) |
|
|
51 |
if title is not None: |
|
|
52 |
ax.set_title(title) |
|
|
53 |
ax.xaxis.set_major_locator(MaxNLocator(integer=True)) |
|
|
54 |
ax.set_xlabel("Amount of fragments associated with molecule") |
|
|
55 |
ax.set_ylabel("# Molecules") |
|
|
56 |
plt.tight_layout() |
|
|
57 |
plt.savefig(target_path) |
|
|
58 |
|
|
|
59 |
ax = plt.gca() |
|
|
60 |
ax.set_ylim(1, None) |
|
|
61 |
ax.set_yscale('log') |
|
|
62 |
plt.savefig(target_path.replace('.png', '.log.png')) |
|
|
63 |
|
|
|
64 |
plt.close() |