|
a |
|
b/singlecellmultiomics/statistic/scchicligation.py |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
from matplotlib.ticker import MaxNLocator |
|
|
4 |
import matplotlib.pyplot as plt |
|
|
5 |
from .statistic import StatisticHistogram |
|
|
6 |
import singlecellmultiomics.pyutils as pyutils |
|
|
7 |
import collections |
|
|
8 |
import pandas as pd |
|
|
9 |
import seaborn as sns |
|
|
10 |
import matplotlib |
|
|
11 |
matplotlib.rcParams['figure.dpi'] = 160 |
|
|
12 |
matplotlib.use('Agg') |
|
|
13 |
|
|
|
14 |
|
|
|
15 |
class ScCHICLigation(): |
|
|
16 |
def __init__(self, args): |
|
|
17 |
# cell -> { A_start: count, total_cuts: count } |
|
|
18 |
self.per_cell_a_obs = collections.defaultdict(collections.Counter) |
|
|
19 |
# cell -> { TA_start: count, total_cuts: count } |
|
|
20 |
self.per_cell_ta_obs = collections.defaultdict(collections.Counter) |
|
|
21 |
|
|
|
22 |
def processRead(self, R1,R2): |
|
|
23 |
|
|
|
24 |
if R1 is None: |
|
|
25 |
return |
|
|
26 |
read = R1 |
|
|
27 |
if read.has_tag('RZ') and not read.is_duplicate and read.is_read1: |
|
|
28 |
sample = read.get_tag('SM') |
|
|
29 |
first = read.get_tag('RZ')[0] |
|
|
30 |
if read.get_tag('RZ') == 'TA': |
|
|
31 |
self.per_cell_ta_obs[sample]['TA_start'] += 1 |
|
|
32 |
if first == 'A': |
|
|
33 |
self.per_cell_a_obs[sample]['A_start'] += 1 |
|
|
34 |
self.per_cell_ta_obs[sample]['total'] += 1 |
|
|
35 |
self.per_cell_a_obs[sample]['total'] += 1 |
|
|
36 |
|
|
|
37 |
def __repr__(self): |
|
|
38 |
return 'ScCHICLigation: no description' |
|
|
39 |
|
|
|
40 |
|
|
|
41 |
|
|
|
42 |
def __iter__(self): |
|
|
43 |
for cell, cell_data in self.per_cell_ta_obs.items(): |
|
|
44 |
yield cell_data['total'], cell_data['TA_start'] / cell_data['total'] |
|
|
45 |
|
|
|
46 |
def plot(self, target_path, title=None): |
|
|
47 |
|
|
|
48 |
########### TA ########### |
|
|
49 |
fig, ax = plt.subplots(figsize=(4, 4)) |
|
|
50 |
|
|
|
51 |
x = [] |
|
|
52 |
y = [] |
|
|
53 |
for cell, cell_data in self.per_cell_ta_obs.items(): |
|
|
54 |
x.append(cell_data['total']) |
|
|
55 |
y.append(cell_data['TA_start'] / cell_data['total']) |
|
|
56 |
|
|
|
57 |
ax.scatter(x, y, s=3,c='k') |
|
|
58 |
ax.set_xscale('log') |
|
|
59 |
if title is not None: |
|
|
60 |
ax.set_title(title) |
|
|
61 |
|
|
|
62 |
ax.set_ylabel("Fraction unique cuts starting with TA") |
|
|
63 |
ax.set_xlabel("# Molecules") |
|
|
64 |
ax.set_xlim(1, None) |
|
|
65 |
ax.set_ylim(-0.1, 1.05) |
|
|
66 |
sns.despine() |
|
|
67 |
plt.tight_layout() |
|
|
68 |
plt.savefig(target_path.replace('.png', '.TA.png')) |
|
|
69 |
plt.close() |
|
|
70 |
|
|
|
71 |
########### A ########### |
|
|
72 |
fig, ax = plt.subplots(figsize=(4, 4)) |
|
|
73 |
|
|
|
74 |
x = [] |
|
|
75 |
y = [] |
|
|
76 |
for cell, cell_data in self.per_cell_ta_obs.items(): |
|
|
77 |
x.append(cell_data['total']) |
|
|
78 |
y.append(cell_data['A_start'] / cell_data['total']) |
|
|
79 |
|
|
|
80 |
ax.scatter(x, y, s=3,c='k') |
|
|
81 |
ax.set_xscale('log') |
|
|
82 |
if title is not None: |
|
|
83 |
ax.set_title(title) |
|
|
84 |
|
|
|
85 |
ax.set_ylabel("Fraction unique cuts starting with A") |
|
|
86 |
ax.set_xlabel("# Molecules") |
|
|
87 |
ax.set_xlim(1, None) |
|
|
88 |
ax.set_ylim(-0.1, 1.05) |
|
|
89 |
plt.tight_layout() |
|
|
90 |
sns.despine() |
|
|
91 |
plt.savefig(target_path.replace('.png', '.A.png')) |
|
|
92 |
plt.close() |
|
|
93 |
|
|
|
94 |
def to_csv(self, path): |
|
|
95 |
pd.DataFrame( |
|
|
96 |
self.per_cell_ta_obs).sort_index().to_csv( |
|
|
97 |
path.replace( |
|
|
98 |
'.csv', |
|
|
99 |
'TA_obs_per_cell.csv')) |