[4f4f20]: / singlecellmultiomics / bamProcessing / bamDinucleotideDistribution.py

Download this file

210 lines (156 with data), 7.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import pandas as pd
from collections import Counter
from singlecellmultiomics.utils.sequtils import reverse_complement
from singlecellmultiomics.bamProcessing.bamAnalyzeCutDistances import get_sc_cut_dictionary
from singlecellmultiomics.bamProcessing import get_reference_path_from_bam, get_contigs_with_reads
from singlecellmultiomics.utils import is_main_chromosome
import os
from singlecellmultiomics.features import FeatureContainer
from itertools import product
import numpy as np
import pysam
from multiprocessing import Pool
mpl.rcParams['figure.dpi'] = 300
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Investigate dinucleotide usage')
argparser.add_argument(
'-o',
type=str,
help="output folder",
default='./dinuc_usage/')
argparser.add_argument(
'-cpgislands',
type=str,
help="path to cpg island tsv file",
required=False)
argparser.add_argument(
'-offset',
type=float,
help="Offset margin for the beta values, increase to get a larger y-axis",
default=0.05)
argparser.add_argument(
'--tafilter',
action='store_true',
help="Enable TA filter (checks for TA at cut location)")
argparser.add_argument('alignmentfiles', type=str, nargs='+')
args = argparser.parse_args()
bams = args.alignmentfiles
perform_ta_filter = args.tafilter
reference_path = get_reference_path_from_bam(args.alignmentfiles[0])
# We need to ignore cpg islands for these stats:
if args.cpgislands is None:
cpg_islands = None
else:
cpg_islands = FeatureContainer()
for idx,row in pd.read_csv(args.cpgislands,delimiter='\t').iterrows():
cpg_islands.addFeature(chromosome=row['chrom'].replace('chr',''), start=row.chromStart,end= row.chromEnd, name=row.name)
cpg_islands.sort()
sc_cut_dict_stranded = get_sc_cut_dictionary(bams,strand_specific=True,)
extraction_radius = 1500
nucs = 'ACGT'
dinucs =list(product(nucs,nucs))
dinuc_map = dict( (d,i) for i,d in enumerate(dinucs))
#reference = CachedFasta(reference_handle)
def go(contig):
nucobs = np.zeros([extraction_radius*2 + 1,4])
dinucobs = np.zeros([extraction_radius*2 + 1,len(dinucs)])
with pysam.FastaFile(reference_path) as reference:
cut_obs = Counter()
for cell, cellobs in sc_cut_dict_stranded[contig].items():
cut_obs += Counter( list(cellobs.keys()) )
for k, total_seen in cut_obs.most_common():
if total_seen<=3: # Only use more common cuts, these are more likely relaxed
continue
# Extract the genomic region:
strand = k[0]
position = k[1]
if cpg_islands is not None:
hits = cpg_islands.findFeaturesAt(contig, position)
if len(hits):
continue
if position-extraction_radius < 0:
continue
seq = reference.fetch(contig, position-extraction_radius, position+extraction_radius)
if strand:
seq = reverse_complement(seq)
if perform_ta_filter and not 'TA' in seq[extraction_radius-1:extraction_radius+1]: #or not 'TA' in seq[extraction_radius+145:extraction_radius+148]:
continue
prevbase = None
for i,base in enumerate(seq):
try:
j = nucs.index(base)
except ValueError:
prevbase = None
continue
try:
nucobs[i,j] += 1
k = dinuc_map.get((prevbase,base))
if k is not None:
dinucobs[i,k] += 1
prevbase= base
except IndexError:
prevbase = None
continue
return nucobs, dinucobs
if not os.path.exists(args.o):
os.makedirs(args.o)
nucobs = np.zeros([extraction_radius*2 + 1,4])
dinucobs = np.zeros([extraction_radius*2 + 1,len(dinucs)])
contigs = [contig for contig in
get_contigs_with_reads(bam_path=bams[0])
if contig != 'Y' and contig[0]!='J' and contig!='MT' and is_main_chromosome(contig)]
with Pool(16) as workers:
for rn, rd in workers.imap(go, contigs):
nucobs += rn
dinucobs += rd
nuc_obs_df = pd.DataFrame( nucobs, columns= list(nucs ))
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
def create_dinuc_plot(s=-250, e=170):
things_for_legend = []
fig,gcax = plt.subplots(figsize=(6,4))
atax = gcax.twinx()
dinuc_df = pd.DataFrame(dinucobs, columns= list(dinucs), index=list(range(-extraction_radius, extraction_radius+1)) )
dinuc_df = dinuc_df.rolling(window=3,center=True).mean()
dinuc_df = (dinuc_df.T / dinuc_df.sum(1)).loc[:,s:e].T
gc = (dinuc_df[('C','C')] + dinuc_df[('G','G')] + dinuc_df[('G','C')])
gcax.xaxis.label.set_color('orange')
gcax.tick_params(axis='y', colors='orange')
things_for_legend += gcax.plot(gc.index, gc.values, label='CC+GG+GC',c='orange')
at = (dinuc_df[('A','A')] + dinuc_df[('T','T')] + dinuc_df[('T','A')])
atax.xaxis.label.set_color('purple')
atax.tick_params(axis='y', colors='purple')
things_for_legend += atax.plot( at.index, at.values, label='AA+TT+TA',c='purple')
extra = args.offset
atax.set_ylim(at.mean()-extra, at.mean()+extra)
gcax.set_ylim(gc.mean()-extra, gc.mean()+extra)
if e-s < 2000:
for x in range(s,e,10):
atax.axvline(x+5,c='grey',lw=0.5, alpha=0.5)
gcax.set_ylabel('Fraction CC+GG+GC')
atax.set_ylabel('Fraction AA+AT+TA')
gcax.set_xlabel('< into uncovered, Distance from cut (bp), into molecule >',c='k')
atax.legend(things_for_legend, [t.get_label() for t in things_for_legend])
plt.title('dinucleotide abundance')
plt.axvline(0,c='k')
plt.axvline(147/2,c='k')
plt.axvline(147,c='k')
create_dinuc_plot()
plt.savefig(args.o + 'dinuc_usage_narrow.png', bbox_inches='tight')
create_dinuc_plot(-500,500)
plt.savefig(args.o + 'dinuc_usage.png', bbox_inches='tight')
create_dinuc_plot(-1500,1500)
plt.savefig(args.o + 'dinuc_usage_wide.png', bbox_inches='tight')
dinuc_df = pd.DataFrame(dinucobs, columns= list(dinucs), index=list(range(-extraction_radius, extraction_radius+1)) )
dinuc_df = dinuc_df.rolling(window=3,center=True).mean()
dinuc_df = (dinuc_df.T / dinuc_df.sum(1)).T
dinuc_df.to_pickle(args.o + 'dinucs.pickle.gz' )
dinuc_df.to_csv(args.o + 'dinucs.csv' )