--- /dev/null
+++ b/singlecellmultiomics/bamProcessing/bamPlotRTstats.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import argparse
+import collections
+import gzip
+import itertools
+import pickle
+
+import matplotlib
+matplotlib.use('Agg')  # Select a non-interactive backend before pyplot is imported
+matplotlib.rcParams['figure.dpi'] = 160
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import pysam
+import pysamiterators.iterators as pyts
+import singlecellmultiomics.universalBamTagger.universalBamTagger as ut
+
+
+def nlaIII_molecule_acceptance_function(molecule):
+    """Accept a molecule when its first read maps uniquely (MQ >= 60) and
+    has no alternative alignments to non-ALT contigs."""
+    first_read = molecule[0][0]
+    if first_read.mapping_quality < 60:
+        return False
+    if first_read.has_tag('XA'):
+        for alt_align in first_read.get_tag('XA').split(';'):
+            if len(alt_align) == 0:  # The XA tag ends in ';', which yields one empty entry
+                continue
+
+            hchrom, hpos, hcigar, hflag = alt_align.split(',')
+            if not hchrom.endswith('_alt'):
+                # Alternative alignment to a primary contig: reject the molecule
+                return False
+    return True
+
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description='Plot fragment size and reverse-transcription (RT) reaction statistics of a bam file')
+    argparser.add_argument('bamFile', type=str)
+    argparser.add_argument(
+        '-head',
+        type=int,
+        help='Process at most this many molecules')
+    argparser.add_argument('-binSize', type=int, default=30)
+    argparser.add_argument(
+        '-maxfs',
+        type=int,
+        default=900,
+        help='X axis limit of the fragment size plot')
+    argparser.add_argument('-maxOverseq', type=int, default=4)
+    argparser.add_argument('--notstrict', action='store_true')
+    argparser.add_argument('-o', type=str, default='RT_dist')
+
+    args = argparser.parse_args()
+
+    def dd():
+        return collections.defaultdict(collections.Counter)
+
+    fragment_distribution = collections.Counter()
+    # overseq -> fragment size -> counts
+    fragment_distribution_raw = collections.defaultdict(collections.Counter)
+    # library -> overseq -> fragment size (span) -> counts
+    fragment_distribution_raw_rf = collections.defaultdict(dd)
+    gc_distribution = collections.Counter()
+    # fragment size -> observed gc/(at+gc) ratio
+    gc_frag_distribution = collections.defaultdict(collections.Counter)
+    # mean RT fragment size -> amount of RT reactions -> count
+    rt_frag_distribution = collections.defaultdict(collections.Counter)
+
+    used = 0
+    used_reads = 0
+    gc_capture = False
+    with pysam.AlignmentFile(args.bamFile) as a:
+        for molecule in ut.MoleculeIterator_OLD(a, umi_hamming_distance=1):
+
+            if not args.notstrict and not nlaIII_molecule_acceptance_function(molecule):
+                continue
+
+            if args.head is not None and used >= args.head:
+                print('Stopping, saw enough molecules')
+                break
+
+            rt_reactions = ut.molecule_to_random_primer_dict(molecule)
+            amount_of_rt_reactions = len(rt_reactions)
+
+            # This obtains the maximum fragment size:
+            frag_chrom, frag_start, frag_end = pyts.getListSpanningCoordinates(
+                [v for v in itertools.chain.from_iterable(molecule) if v is not None])
+
+            # Obtain the fragment sizes of all RT reactions:
+            rt_sizes = []
+            for (rt_end, hexamer), fragment in rt_reactions.items():
+                rt_chrom, rt_start, rt_end = pyts.getListSpanningCoordinates(
+                    itertools.chain.from_iterable(fragment))
+                rt_sizes.append(rt_end - rt_start)
+
+            first_read = molecule[0][0]
+            site = first_read.get_tag('DS')
+            strand = first_read.get_tag('RS')
+            library = first_read.get_tag('LY')
+            # if gc_capture:
+            #     sequence = reference.fetch(frag_chrom, frag_start, frag_end)
+            #     gc = sequence.count('C') + sequence.count('G')
+            #     length = len(sequence)
+            used += 1
+
+            # if gc_capture:
+            #     gc_distribution[gc/length] += 1
+            #     gc_frag_distribution[fragment_size][gc/length] += 1
+            #     fragment_distribution_raw[len(molecule)][fragment_size] += 1
+
+            if len(rt_sizes) == 0:
+                mean_rt_size = 0
+            else:
+                mean_rt_size = int(np.mean(rt_sizes))
+            fragment_distribution_raw_rf[library][len(molecule)][mean_rt_size] += 1
+            rt_frag_distribution[mean_rt_size][len(rt_reactions)] += 1
+            used_reads += len(molecule)
+
+    bin_size = args.binSize
+    m_overseq = args.maxOverseq
+
+    # Export the raw data:
+    with gzip.open(f'{args.o}_raw_data.pickle.gz', 'wb') as fo:
+        pickle.dump(fragment_distribution_raw_rf, fo)
+
+    for library in fragment_distribution_raw_rf:
+        fig, ax = plt.subplots(1, 1, figsize=(10, 7))
+        ax.set_title(
+            f'Read fragment size distribution\n{used} molecules / {used_reads} fragments analysed\n{library}')
+        table = {}
+
+        for overseq in range(1, m_overseq + 1):
+            try:
+                # Rebin into bins of bin_size base pairs:
+                rebinned = collections.Counter()
+                for f_size, obs in fragment_distribution_raw_rf[library][overseq].most_common():
+                    rebinned[int(f_size / bin_size) * bin_size] += obs
+                obs_dist_fsize = np.array(list(rebinned.keys()))
+                obs_dist_freq = np.array(list(rebinned.values()))
+                sorting_order = np.argsort(obs_dist_fsize)
+                obs_dist_fsize = obs_dist_fsize[sorting_order]
+                obs_dist_freq = obs_dist_freq[sorting_order]
+                obs_dist_density = obs_dist_freq / np.sum(obs_dist_freq)
+
+                for i, x in enumerate(obs_dist_fsize):
+                    table[(overseq, x)] = {'obs_dist_freq': obs_dist_freq[i],
+                                           'obs_dist_density': obs_dist_density[i]}
+
+                ax.plot(
+                    obs_dist_fsize,
+                    obs_dist_density,
+                    c=(overseq / m_overseq, 0, 0),
+                    label=f'{overseq} read fragments / umi')
+                ax.set_xlim(-10, args.maxfs)
+                ax.legend()
+                ax.set_xlabel('fragment size')
+                ax.set_ylabel('density')
+            except Exception as e:
+                print(e)
+        plt.tight_layout()
+        plt.savefig(f'{args.o}_{library}.png')
+        # Export the per-library table:
+        pd.DataFrame(table).to_csv(f'{args.o}_{library}.csv')
+        plt.close()
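Note on reusing the exported raw data: `pickle.dump` stores the nested `defaultdict`s together with a reference to their factory function, which pickle records as `__main__.dd`. Any script that reloads `{prefix}_raw_data.pickle.gz` therefore has to define a matching `dd` in its own `__main__` before calling `pickle.load`. A minimal sketch, assuming the default `-o RT_dist` output prefix (the file name below is derived from that default, not fixed by the script):

```python
# Minimal sketch: reload the raw data exported by bamPlotRTstats.py.
# Assumes the default '-o RT_dist' prefix was used.
import collections
import gzip
import pickle


def dd():
    # Must mirror the factory defined in bamPlotRTstats.py: the pickled
    # defaultdicts reference it as '__main__.dd', so it has to exist here.
    return collections.defaultdict(collections.Counter)


if __name__ == '__main__':
    with gzip.open('RT_dist_raw_data.pickle.gz', 'rb') as fh:
        # Nesting: library -> overseq -> mean RT fragment size -> molecule count
        data = pickle.load(fh)

    for library, per_overseq in data.items():
        for overseq, size_counts in sorted(per_overseq.items()):
            print(f'{library}\toverseq={overseq}\tmolecules={sum(size_counts.values())}')
```

Running this as a script (so that `dd` lives in `__main__`) prints one line per library and oversequencing level, which is a quick way to sanity-check the pickle before re-plotting.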