
singlecellmultiomics/bamProcessing/bamPlotRTstats.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('Agg')  # Select the non-interactive backend before pyplot is imported
matplotlib.rcParams['figure.dpi'] = 160
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np
import itertools
import pandas as pd
import importlib
import os
import sys
import pysam
import collections
import argparse
import gzip
import pickle
import singlecellmultiomics.features
import singlecellmultiomics.universalBamTagger.universalBamTagger as ut
import pysamiterators.iterators as pyts


def nlaIII_molecule_acceptance_function(molecule):
    """Return True when the molecule passes the mapping quality and
    alternative-alignment filters, False when it should be discarded."""
    first_read = molecule[0][0]
    if first_read.mapping_quality < 60:
        return False
    if first_read.has_tag('XA'):
        for alt_align in first_read.get_tag('XA').split(';'):
            if len(alt_align) == 0:  # Sometimes this tag is empty for some reason
                continue
            hchrom, hpos, hcigar, hflag = alt_align.split(',')
            if not hchrom.endswith('_alt'):
                # A valid alternative alignment outside the _alt contigs makes
                # the molecule ambiguous: reject it.
                return False
    return True

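
# The XA tag parsed above is assumed to follow bwa's alternative-hit format,
# a ';'-separated list of 'chrom,pos,CIGAR,NM' entries, e.g. (illustrative):
#   XA:Z:chr1_KI270762v1_alt,+1234,100M,0;chr2,-5678,100M,1;
# Here the 'chr2' entry would reject the molecule, as it is a valid
# alternative location outside the '_alt' contigs.
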
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Visualise the fragment size and RT reaction statistics of a bam file')
    argparser.add_argument('bamFile', type=str)
    argparser.add_argument(
        '-head',
        type=int,
        help='Process at most this many molecules')
    argparser.add_argument(
        '-binSize',
        type=int,
        default=30,
        help='Bin size (bp) of the fragment size histogram')
    argparser.add_argument(
        '-maxfs',
        type=int,
        default=900,
        help='X axis limit of fragment size plot')
    argparser.add_argument(
        '-maxOverseq',
        type=int,
        default=4,
        help='Maximum amount of read fragments per molecule to plot')
    argparser.add_argument(
        '--notstrict',
        action='store_true',
        help='Do not filter molecules with the NlaIII acceptance function')
    argparser.add_argument('-o', type=str, default='RT_dist', help='Output prefix')

    args = argparser.parse_args()
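
    # Example invocation (bam path hypothetical):
    #   bamPlotRTstats.py ./tagged/sample.bam -head 100000 -o RT_dist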

    def dd():
        # A named factory (rather than a lambda) keeps the nested
        # defaultdict picklable for the raw-data dump below.
        return collections.defaultdict(collections.Counter)

    fragment_distribution = collections.Counter()
    fragment_distribution_raw = collections.defaultdict(
        collections.Counter)  # overseq -> fragment size -> counts
    # Read size fragments:
    fragment_distribution_raw_rf = collections.defaultdict(
        dd)  # lib -> overseq -> fragment size (span) -> counts
    gc_distribution = collections.Counter()
    gc_frag_distribution = collections.defaultdict(
        collections.Counter)  # fragment size -> observed gc/(at+gc) ratio

    rt_frag_distribution = collections.defaultdict(collections.Counter)
    # fragment size -> amount of RT reactions -> count

    observed_cuts = collections.defaultdict()  # No default factory: behaves like a plain dict (not filled below)
    used = 0
    used_reads = 0
    gc_capture = False
    with pysam.AlignmentFile(args.bamFile) as a:
        for i, molecule in enumerate(
                ut.MoleculeIterator_OLD(
                    a, umi_hamming_distance=1)):

            if not args.notstrict and not nlaIII_molecule_acceptance_function(
                    molecule):
                continue

            if args.head is not None and used >= args.head:
                print('Stopping, saw enough molecules')
                break

            rt_reactions = ut.molecule_to_random_primer_dict(molecule)
            amount_of_rt_reactions = len(rt_reactions)
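
            # rt_reactions maps a random-primer key (as produced by
            # molecule_to_random_primer_dict) to the read fragments of that RT
            # event; illustrative shape (coordinates and hexamers hypothetical):
            #   {(101337, 'ACGTCA'): [(R1a, R2a)], (101512, 'TTGACA'): [(R1b, R2b)]}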

            # This obtains the maximum fragment size:
            frag_chrom, frag_start, frag_end = pyts.getListSpanningCoordinates(
                [v for v in itertools.chain.from_iterable(molecule) if v is not None])

            # Obtain the fragment sizes of all RT reactions:
            rt_sizes = []
            for (rt_primer_end, hexamer), fragment in rt_reactions.items():
                rt_chrom, rt_start, rt_end = pyts.getListSpanningCoordinates(
                    itertools.chain.from_iterable(fragment))
                rt_sizes.append(rt_end - rt_start)

            first_read = molecule[0][0]
            site = first_read.get_tag('DS')
            strand = first_read.get_tag('RS')
            library = first_read.get_tag('LY')
            # if gc_capture:
            #     sequence = reference.fetch(frag_chrom, frag_start, frag_end)
            #     gc = sequence.count('C') + sequence.count('G')
            #     length = len(sequence)
            #     gc_distribution[gc / length] += 1
            #     gc_frag_distribution[fragment_size][gc / length] += 1
            #     fragment_distribution_raw[len(molecule)][fragment_size] += 1
            used += 1

            # Mean RT fragment size of this molecule, 0 when no RT reaction
            # could be assigned:
            if len(rt_sizes) == 0:
                mean_rt_size = 0
            else:
                mean_rt_size = int(np.mean(rt_sizes))
            fragment_distribution_raw_rf[library][len(molecule)][mean_rt_size] += 1
            rt_frag_distribution[mean_rt_size][len(rt_reactions)] += 1
            used_reads += len(molecule)

    bin_size = args.binSize
    m_overseq = args.maxOverseq

    with gzip.open(f'{args.o}_raw_data.pickle.gz', 'wb') as fo:
        pickle.dump(fragment_distribution_raw_rf, fo)
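
    # The dump can be reloaded for custom analyses, e.g. (sketch; a compatible
    # 'dd' factory must be defined in __main__ when unpickling):
    #   with gzip.open('RT_dist_raw_data.pickle.gz', 'rb') as fh:
    #       dist = pickle.load(fh)  # dist[library][overseq][mean_rt_size] -> count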

    for library in fragment_distribution_raw_rf:
        fig, ax = plt.subplots(1, 1, figsize=(10, 7))
        ax.set_title(
            f'Read fragment size distribution\n{used} molecules / {used_reads} fragments analysed\n{library}')
        table = {}

        for overseq in range(1, m_overseq):
            try:
                # Rebin into bins of bin_size bp, e.g. with bin_size=30 the
                # fragment sizes 0..29 map to bin 0 and 30..59 to bin 30:
                rebinned = collections.Counter()
                for f_size, obs in fragment_distribution_raw_rf[library][overseq].most_common():
                    rebinned[int(f_size / bin_size) * bin_size] += obs
                obs_dist_fsize = np.array(list(rebinned.keys()))
                obs_dist_freq = np.array(list(rebinned.values()))
                sorting_order = np.argsort(obs_dist_fsize)
                obs_dist_fsize = obs_dist_fsize[sorting_order]
                obs_dist_freq = obs_dist_freq[sorting_order]
                obs_dist_density = obs_dist_freq / np.sum(obs_dist_freq)

                for i, x in enumerate(obs_dist_fsize):
                    table[(overseq, x)] = {'obs_dist_freq': obs_dist_freq[i],
                                           'obs_dist_density': obs_dist_density[i]}

                ax.plot(
                    obs_dist_fsize,
                    obs_dist_density,
                    c=(overseq / m_overseq, 0, 0),
                    label=f'{overseq} read fragments / umi')
            except Exception as e:
                print(e)

        ax.set_xlim(-10, args.maxfs)
        ax.legend()
        ax.set_xlabel('fragment size')
        ax.set_ylabel('density')
        plt.tight_layout()
        plt.savefig(f'{args.o}_{library}.png')
        pd.DataFrame(table).to_csv(f'{args.o}_{library}.csv')
        try:
            plt.close()
        except Exception:
            pass
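
    # Each library yields {args.o}_{library}.png (density plot) and
    # {args.o}_{library}.csv (rebinned table), alongside the
    # {args.o}_raw_data.pickle.gz dump written above.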