a b/singlecellmultiomics/statistic/scchicligation.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
from matplotlib.ticker import MaxNLocator
4
import matplotlib.pyplot as plt
5
from .statistic import StatisticHistogram
6
import singlecellmultiomics.pyutils as pyutils
7
import collections
8
import pandas as pd
9
import seaborn as sns
10
import matplotlib
11
matplotlib.rcParams['figure.dpi'] = 160
12
matplotlib.use('Agg')
13
14
15
class ScCHICLigation():
16
    def __init__(self, args):
17
        # cell -> { A_start: count, total_cuts: count }
18
        self.per_cell_a_obs = collections.defaultdict(collections.Counter)
19
        # cell -> { TA_start: count, total_cuts: count }
20
        self.per_cell_ta_obs = collections.defaultdict(collections.Counter)
21
22
    def processRead(self, R1,R2):
23
24
        if R1 is None:
25
            return
26
        read = R1
27
        if read.has_tag('RZ') and not read.is_duplicate and read.is_read1:
28
            sample = read.get_tag('SM')
29
            first = read.get_tag('RZ')[0]
30
            if read.get_tag('RZ') == 'TA':
31
                self.per_cell_ta_obs[sample]['TA_start'] += 1
32
            if first == 'A':
33
                self.per_cell_a_obs[sample]['A_start'] += 1
34
            self.per_cell_ta_obs[sample]['total'] += 1
35
            self.per_cell_a_obs[sample]['total'] += 1
36
37
    def __repr__(self):
38
        return 'ScCHICLigation: no description'
39
40
41
42
    def __iter__(self):
43
        for cell, cell_data in self.per_cell_ta_obs.items():
44
            yield cell_data['total'],  cell_data['TA_start'] / cell_data['total']
45
46
    def plot(self, target_path, title=None):
47
48
        ########### TA ###########
49
        fig, ax = plt.subplots(figsize=(4, 4))
50
51
        x = []
52
        y = []
53
        for cell, cell_data in self.per_cell_ta_obs.items():
54
            x.append(cell_data['total'])
55
            y.append(cell_data['TA_start'] / cell_data['total'])
56
57
        ax.scatter(x, y, s=3,c='k')
58
        ax.set_xscale('log')
59
        if title is not None:
60
            ax.set_title(title)
61
62
        ax.set_ylabel("Fraction unique cuts starting with TA")
63
        ax.set_xlabel("# Molecules")
64
        ax.set_xlim(1, None)
65
        ax.set_ylim(-0.1, 1.05)
66
        sns.despine()
67
        plt.tight_layout()
68
        plt.savefig(target_path.replace('.png', '.TA.png'))
69
        plt.close()
70
71
        ########### A ###########
72
        fig, ax = plt.subplots(figsize=(4, 4))
73
74
        x = []
75
        y = []
76
        for cell, cell_data in self.per_cell_ta_obs.items():
77
            x.append(cell_data['total'])
78
            y.append(cell_data['A_start'] / cell_data['total'])
79
80
        ax.scatter(x, y, s=3,c='k')
81
        ax.set_xscale('log')
82
        if title is not None:
83
            ax.set_title(title)
84
85
        ax.set_ylabel("Fraction unique cuts starting with A")
86
        ax.set_xlabel("# Molecules")
87
        ax.set_xlim(1, None)
88
        ax.set_ylim(-0.1, 1.05)
89
        plt.tight_layout()
90
        sns.despine()
91
        plt.savefig(target_path.replace('.png', '.A.png'))
92
        plt.close()
93
94
    def to_csv(self, path):
95
        pd.DataFrame(
96
            self.per_cell_ta_obs).sort_index().to_csv(
97
            path.replace(
98
                '.csv',
99
                'TA_obs_per_cell.csv'))