|
a |
|
b/singlecellmultiomics/statistic/plate2.py |
|
|
1 |
import matplotlib.patheffects as path_effects |
|
|
2 |
import math |
|
|
3 |
import string |
|
|
4 |
import numpy as np |
|
|
5 |
import pandas as pd |
|
|
6 |
import matplotlib.pyplot as plt |
|
|
7 |
from .statistic import StatisticHistogram |
|
|
8 |
import singlecellmultiomics.pyutils as pyutils |
|
|
9 |
from collections import defaultdict, Counter |
|
|
10 |
from singlecellmultiomics.utils.plotting import plot_plate |
|
|
11 |
import matplotlib |
|
|
12 |
matplotlib.rcParams['figure.dpi'] = 160 |
|
|
13 |
matplotlib.use('Agg') |
|
|
14 |
|
|
|
15 |
|
|
|
16 |
def counter_defaultdict(): |
|
|
17 |
return defaultdict(Counter) |
|
|
18 |
|
|
|
19 |
class PlateStatistic2(object): |
|
|
20 |
|
|
|
21 |
def __init__(self, args): |
|
|
22 |
self.args = args |
|
|
23 |
|
|
|
24 |
self.stats = defaultdict(counter_defaultdict) |
|
|
25 |
|
|
|
26 |
def to_csv(self, path): |
|
|
27 |
|
|
|
28 |
export = {} |
|
|
29 |
for library, libd in self.stats.items(): |
|
|
30 |
for (stat,metric), data in libd.items(): |
|
|
31 |
print(data) |
|
|
32 |
export[library,stat,metric] = data |
|
|
33 |
|
|
|
34 |
|
|
|
35 |
pd.DataFrame( |
|
|
36 |
export).to_csv( |
|
|
37 |
path.replace( |
|
|
38 |
'.csv', |
|
|
39 |
f'plate_metrics.csv')) |
|
|
40 |
|
|
|
41 |
def processRead(self, R1,R2=None, default_lib=None): |
|
|
42 |
|
|
|
43 |
for read in [R1,R2]: |
|
|
44 |
|
|
|
45 |
if read is None: |
|
|
46 |
continue |
|
|
47 |
|
|
|
48 |
cell_index = int(read.get_tag('bi')) |
|
|
49 |
|
|
|
50 |
if read.has_tag('MX'): |
|
|
51 |
mux= read.get_tag('MX') |
|
|
52 |
format = 384 if ('384' in mux or mux.startswith('CS2')) else 96 |
|
|
53 |
if format==384: |
|
|
54 |
n_rows = 16 |
|
|
55 |
n_cols = 24 |
|
|
56 |
else: |
|
|
57 |
n_rows = 8 |
|
|
58 |
n_cols = 12 |
|
|
59 |
else: |
|
|
60 |
n_rows = 16 |
|
|
61 |
n_cols = 24 |
|
|
62 |
|
|
|
63 |
if default_lib is None: |
|
|
64 |
if read.has_tag('LY'): |
|
|
65 |
library = read.get_tag('LY') |
|
|
66 |
|
|
|
67 |
|
|
|
68 |
else: |
|
|
69 |
library = '_' |
|
|
70 |
else: |
|
|
71 |
library = default_lib |
|
|
72 |
|
|
|
73 |
cell_index = int(cell_index)-1 |
|
|
74 |
row = int(cell_index/n_cols) |
|
|
75 |
col = cell_index - row*n_cols |
|
|
76 |
|
|
|
77 |
cell= read.get_tag('SM') |
|
|
78 |
self.stats[library][('total mapped','# reads')][(row,col, cell)] += 1 |
|
|
79 |
|
|
|
80 |
if read.is_qcfail: |
|
|
81 |
self.stats[library]['qcfail', '# reads'][(row,col,cell)] += 1 |
|
|
82 |
continue |
|
|
83 |
|
|
|
84 |
if read.is_duplicate: |
|
|
85 |
continue |
|
|
86 |
|
|
|
87 |
self.stats[library][('total mapped','# molecules')][(row,col,cell)] += 1 |
|
|
88 |
|
|
|
89 |
if read.has_tag('lh') and read.get_tag('lh') in ('TA','TT','AA'): |
|
|
90 |
self.stats[library][read.get_tag('lh'), 'ligated molecules'][(row,col,cell)] += 1 |
|
|
91 |
|
|
|
92 |
break |
|
|
93 |
|
|
|
94 |
|
|
|
95 |
def __repr__(self): |
|
|
96 |
return 'Plate statistic' |
|
|
97 |
|
|
|
98 |
|
|
|
99 |
def __iter__(self): |
|
|
100 |
for library, libd in self.stats.items(): |
|
|
101 |
for name, data in libd.items(): |
|
|
102 |
for k,v in data.items(): |
|
|
103 |
yield (library,name,k),v |
|
|
104 |
|
|
|
105 |
|
|
|
106 |
def plot(self, target_path, title=None): |
|
|
107 |
for library, libd in self.stats.items(): |
|
|
108 |
for (stat,metric), data in libd.items(): |
|
|
109 |
for log in [True,False]: |
|
|
110 |
|
|
|
111 |
fig, ax, cax = plot_plate(data, log=log, cmap_name='viridis',vmin=1 if log else 0) |
|
|
112 |
cax.set_ylabel(metric) |
|
|
113 |
ax.set_title(f'{stat}, {metric}') |
|
|
114 |
plt.savefig( |
|
|
115 |
target_path.replace( |
|
|
116 |
'.png', |
|
|
117 |
'') + |
|
|
118 |
f'_{library}_{stat}_{metric.replace("# ","n_")}.{"log" if log else "linear"}.png', bbox_inches='tight', dpi=250) |
|
|
119 |
plt.close() |
|
|
120 |
# Create oversequencing plot: (reads / molecule) |
|
|
121 |
overseq = {} |
|
|
122 |
for coordinate, n_molecules in libd[('total mapped','# molecules')].items(): |
|
|
123 |
n_reads = libd[('total mapped','# reads')].get(coordinate,0) |
|
|
124 |
overseq[coordinate] = (n_reads/n_molecules) if n_reads>0 else 0 |
|
|
125 |
|
|
|
126 |
fig, ax, cax = plot_plate(overseq, log=False, cmap_name='viridis', vmin=np.percentile(list(overseq.values()),5)) |
|
|
127 |
cax.set_ylabel('reads / molecule') |
|
|
128 |
ax.set_title(f'Oversequencing') |
|
|
129 |
plt.savefig( |
|
|
130 |
target_path.replace( |
|
|
131 |
'.png', |
|
|
132 |
'') + |
|
|
133 |
f'_{library}_oversequencing.png', bbox_inches='tight', dpi=250) |
|
|
134 |
plt.close() |
|
|
135 |
|
|
|
136 |
# Create TA ratio plot: (ta molecules / total molecules) |
|
|
137 |
ta_ratio = {} |
|
|
138 |
for coordinate, n_ta_molecules in libd[('TA','ligated molecules')].items(): |
|
|
139 |
n_molecules = libd[('total mapped','# molecules')].get(coordinate,0) |
|
|
140 |
ta_ratio[coordinate] = (n_ta_molecules/n_molecules) if n_molecules>0 else 0 |
|
|
141 |
|
|
|
142 |
fig, ax, cax = plot_plate(ta_ratio, log=False, cmap_name='viridis', vmin=np.percentile(list(ta_ratio.values()),5)) |
|
|
143 |
cax.set_ylabel('TA molecules / molecule') |
|
|
144 |
ax.set_title(f'TA fraction') |
|
|
145 |
plt.savefig( |
|
|
146 |
target_path.replace( |
|
|
147 |
'.png', |
|
|
148 |
'') + |
|
|
149 |
f'_{library}_ta_fraction.png', bbox_inches='tight', dpi=250) |
|
|
150 |
plt.close() |