Switch to unified view

a b/singlecellmultiomics/statistic/conversions.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
import seaborn as sns
4
import matplotlib.pyplot as plt
5
from .statistic import StatisticHistogram
6
import singlecellmultiomics.pyutils as pyutils
7
import collections
8
import pandas as pd
9
import matplotlib
10
matplotlib.rcParams['figure.dpi'] = 160
11
matplotlib.use('Agg')
12
13
14
class ConversionMatrix(StatisticHistogram):
15
    def __init__(self, args, process_reads=200_000):
16
        StatisticHistogram.__init__(self, args)
17
        self.conversion_obs = collections.defaultdict(collections.Counter)
18
        self.base_obs = collections.defaultdict(collections.Counter)
19
        self.stranded_base_conversions = collections.defaultdict(
20
            collections.Counter)
21
        self.processed_reads = 0
22
        self.process_reads = process_reads
23
24
    def processRead(self, R1,R2):
25
26
        for read in [R1,R2]:
27
            if read is None:
28
                continue
29
30
            if self.processed_reads >= self.process_reads or read is None or read.is_unmapped or read.mapping_quality < 30:
31
                return
32
33
            self.processed_reads += 1
34
            try:
35
                for index, reference_pos, reference_base in read.get_aligned_pairs(
36
                        with_seq=True, matches_only=True):
37
                    query_base = read.seq[index]
38
                    reference_base = reference_base.upper()
39
                    if reference_base != 'N' and query_base != 'N':
40
                        k = (
41
                            query_base, 'R1' if read.is_read1 else (
42
                                'R2' if read.is_read2 else 'R?'), ('forward', 'reverse')[
43
                                read.is_reverse])
44
45
                        self.base_obs[reference_base][k] += 1
46
                        if reference_base != query_base:
47
                            self.conversion_obs[reference_base][k] += 1
48
49
                            if read.has_tag('RS'):
50
                                k = (
51
                                    query_base,
52
                                    'R1' if read.is_read1 else (
53
                                        'R2' if read.is_read2 else 'R?'),
54
                                    read.get_tag('RS'))
55
                                self.stranded_base_conversions[reference_base][k] += 1
56
            except ValueError: # Fails when the MD tag is not present
57
                continue
58
59
    def __repr__(self):
60
        return f'Observed base conversions'
61
62
    def get_df(self):
63
        return pd.DataFrame(
64
            self.base_obs).fillna(0).sort_index(
65
            axis=0).sort_index(
66
            axis=1)
67
68
    def plot(self, target_path, title=None):
69
70
        df = pd.DataFrame(
71
            self.conversion_obs).fillna(0).sort_index(
72
            axis=0).sort_index(
73
            axis=1)
74
        cm = sns.clustermap(df, row_cluster=True, col_cluster=True)
75
        ax = cm.ax_heatmap
76
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
77
        cm.ax_heatmap.set_title('Raw conversions')
78
        cm.ax_heatmap.set_xlabel('reference base')
79
        cm.ax_heatmap.set_ylabel('sequenced base')
80
        plt.subplots_adjust(left=0, right=0.8)
81
        if title is not None:
82
            cm.ax_heatmap.set_title(title)
83
        #
84
        plt.savefig(target_path.replace('.png', f'.conversions.png'))
85
        plt.close()
86
87
        df = self.get_df()
88
        cm = sns.clustermap(df, row_cluster=False, col_cluster=False)
89
        ax = cm.ax_heatmap
90
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
91
        cm.ax_heatmap.set_title('Raw conversions')
92
        cm.ax_heatmap.set_xlabel('reference base')
93
        cm.ax_heatmap.set_ylabel('sequenced base')
94
        #plt.tight_layout(pad=0.4, w_pad=0.8, h_pad=1.0)
95
        plt.subplots_adjust(left=0, right=0.8)
96
        if title is not None:
97
            cm.ax_heatmap.set_title(title)
98
99
        plt.savefig(target_path.replace('.png', f'.base_obs.png'))
100
        plt.close()
101
102
        if len(self.stranded_base_conversions) == 0:
103
            return
104
105
        df = pd.DataFrame(
106
            self.stranded_base_conversions).fillna(0).sort_index(
107
            axis=0).sort_index(
108
            axis=1)
109
        cm = sns.clustermap(df, row_cluster=False, col_cluster=False)
110
        ax = cm.ax_heatmap
111
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
112
        cm.ax_heatmap.set_title('Raw conversions')
113
        cm.ax_heatmap.set_xlabel('reference base')
114
        cm.ax_heatmap.set_ylabel('sequenced base')
115
        #plt.tight_layout(pad=0.4, w_pad=0.8, h_pad=1.0)
116
        plt.subplots_adjust(left=0, right=0.8)
117
        if title is not None:
118
            cm.ax_heatmap.set_title(title)
119
120
        plt.savefig(
121
            target_path.replace(
122
                '.png',
123
                f'.RS_TAG_strand_conversions.png'))
124
        plt.close()
125
126
    def to_csv(self, path):
127
128
        self.get_df().to_csv(path)