|
a |
|
b/BioAid/variantTK/variants.py |
|
|
1 |
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa |
|
|
2 |
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com |
|
|
3 |
|
|
|
4 |
## Contents: |
|
|
5 |
## qualitySNPFilter |
|
|
6 |
## filterByAD |
|
|
7 |
## findDominantAF |
|
|
8 |
## findType |
|
|
9 |
## findZygosity |
|
|
10 |
## filterFromClones |
|
|
11 |
## drawSNPMap |
|
|
12 |
## findGenomeLength |
|
|
13 |
## generateRandomLoci |
|
|
14 |
## findChrInLinearGenome |
|
|
15 |
## findSpectraStrandwise |
|
|
16 |
## renameChrToRoman |
|
|
17 |
## draw_random_SNPMap |
|
|
18 |
## drawCombinedSNPMap |
|
|
19 |
|
|
|
20 |
import os |
|
|
21 |
import random |
|
|
22 |
import numpy as np |
|
|
23 |
import pandas as pd |
|
|
24 |
import matplotlib.pyplot as plt |
|
|
25 |
from natsort import index_natsorted |
|
|
26 |
import seaborn as sns |
|
|
27 |
from numpy.core.numeric import NaN |
|
|
28 |
|
|
|
29 |
|
|
|
30 |
def qualitySNPFilter(frames_list): |
|
|
31 |
|
|
|
32 |
for i in range(len(frames_list)): |
|
|
33 |
frames_list[i] = frames_list[i][frames_list[i].TYPE =='SNP'] |
|
|
34 |
frames_list[i] = frames_list[i].rename(columns=lambda c: 'AF' if 'AF' in c else c) |
|
|
35 |
frames_list[i] = frames_list[i].rename(columns=lambda c: 'AD' if 'AD' in c else c) |
|
|
36 |
|
|
|
37 |
#this line may be problematic for a good result and if not needed comment out |
|
|
38 |
frames_list[i]['AF'] = frames_list[i].apply(lambda row: findDominantAF(row), axis=1) |
|
|
39 |
|
|
|
40 |
frames_list[i] = frames_list[i].astype({'AF': float}) |
|
|
41 |
frames_list[i] = frames_list[i][frames_list[i].AF >= 0.35] |
|
|
42 |
#also filter out 1.5n variants? |
|
|
43 |
|
|
|
44 |
frames_list[i]['EVAL_AD'] = frames_list[i].apply(lambda row: filterByAD(row), axis=1) |
|
|
45 |
frames_list[i] = frames_list[i][frames_list[i].EVAL_AD == 'True'] |
|
|
46 |
frames_list[i]['SPECTRA'] = frames_list[i].apply(lambda row: findType(row.REF, row.ALT), axis=1) |
|
|
47 |
return frames_list |
|
|
48 |
|
|
|
49 |
def filterByAD(row): |
|
|
50 |
reads=row['AD'].split(",") |
|
|
51 |
if int(reads[1]) < 5: |
|
|
52 |
return "False" |
|
|
53 |
if int(reads[0])+int(reads[1]) > 9: |
|
|
54 |
return "True" |
|
|
55 |
else: |
|
|
56 |
return "False" |
|
|
57 |
|
|
|
58 |
def findDominantAF(row): |
|
|
59 |
reads=row['AF'].split(",") |
|
|
60 |
return(max(reads)) |
|
|
61 |
|
|
|
62 |
def findType(colA, colB): |
|
|
63 |
if ((colA == 'C') & (colB == 'T') | (colA == 'G') & (colB == 'A')): |
|
|
64 |
return('C_to_T') |
|
|
65 |
if ((colA == 'C') & (colB == 'A') | (colA == 'G') & (colB == 'T')): |
|
|
66 |
return('C_to_A') |
|
|
67 |
if ((colA == 'C') & (colB == 'G') | (colA == 'G') & (colB == 'C')): |
|
|
68 |
return('C_to_G') |
|
|
69 |
if ((colA == 'T') & (colB == 'C') | (colA == 'A') & (colB == 'G')): |
|
|
70 |
return('T_to_C') |
|
|
71 |
if ((colA == 'T') & (colB == 'G') | (colA == 'A') & (colB == 'C')): |
|
|
72 |
return('T_to_G') |
|
|
73 |
if ((colA == 'T') & (colB == 'A') | (colA == 'A') & (colB == 'T')): |
|
|
74 |
return('T_to_A') |
|
|
75 |
|
|
|
76 |
def findZygosity(row): |
|
|
77 |
if (row["AF"] >= 0.85): |
|
|
78 |
return('Homozygous') |
|
|
79 |
else: |
|
|
80 |
return('Heterozygous') |
|
|
81 |
|
|
|
82 |
def filterFromClones(frames_list): |
|
|
83 |
#this will subtract all common rows between normal/control and experimental |
|
|
84 |
#sample. I might need to revist to filter variants by position only not all columns |
|
|
85 |
subtracted_df_list = [] |
|
|
86 |
for df in frames_list: |
|
|
87 |
temp_df = df.drop(columns=['AD','AF']) |
|
|
88 |
len_original=len(temp_df) |
|
|
89 |
for df_2 in frames_list: |
|
|
90 |
if not df.equals(df_2): |
|
|
91 |
df_2=df_2.drop(columns=['AD','AF']) |
|
|
92 |
temp_df = pd.merge(temp_df,df_2, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1) |
|
|
93 |
|
|
|
94 |
len_final=len(temp_df) |
|
|
95 |
print(f"removed {len_original-len_final} rows from df") |
|
|
96 |
subtracted_df_list.append(temp_df) |
|
|
97 |
return subtracted_df_list |
|
|
98 |
|
|
|
99 |
def drawSNPMap(pd_df, df_chr_lengths, chr_starts_df, title, sample_names, saveMap=True): |
|
|
100 |
|
|
|
101 |
pd_df = pd_df.append(chr_starts_df, ignore_index=True) |
|
|
102 |
|
|
|
103 |
fig, ax = plt.subplots() |
|
|
104 |
|
|
|
105 |
df_chr_lengths.plot(kind='barh',legend=False, ax=ax, x="chromosome", y="end_position", fontsize=15, figsize=(20,6), edgecolor="black", linewidth=1, color="beige", zorder=0, title=title, label="Position") |
|
|
106 |
|
|
|
107 |
pd_df = pd_df.sort_values(by="CHROM", key=lambda x: np.argsort(index_natsorted(pd_df["CHROM"]))) |
|
|
108 |
|
|
|
109 |
#To rename legend elements, change plot markers/colors, modify here |
|
|
110 |
label_dict = { "G": "G->N", "C": "C->N", "A": "A->N", "T": "T->N", "REF": "Reference Allele"} |
|
|
111 |
markers = {"Homozygous": "x", "Heterozygous": "|"} |
|
|
112 |
palette = {"G": "green", "C": "red", "A": "gray", "T": "gray"} |
|
|
113 |
|
|
|
114 |
sns.scatterplot(ax=ax,data=pd_df, x="POS", y="CHROM", hue="REF", palette=palette, style="ZYGOSITY", markers=markers, s=120, alpha=0.7,zorder=1, linewidth=1.5) |
|
|
115 |
|
|
|
116 |
ax.set_xlabel("POSITION") |
|
|
117 |
ax.set_ylabel("CHROMOSOME") |
|
|
118 |
|
|
|
119 |
legend_texts = plt.legend().get_texts() |
|
|
120 |
for i in range(len(legend_texts)): |
|
|
121 |
label_str = legend_texts[i].get_text() |
|
|
122 |
if label_str in label_dict: |
|
|
123 |
new_label = label_dict.get(label_str) |
|
|
124 |
|
|
|
125 |
legend_texts[i].set_text(new_label) |
|
|
126 |
|
|
|
127 |
if saveMap: |
|
|
128 |
plt.savefig('figures/'+title+'.png', transparent=False) |
|
|
129 |
|
|
|
130 |
def findGenomeLength(chromosomes): |
|
|
131 |
|
|
|
132 |
genome_length = 0 |
|
|
133 |
for chr in chromosomes: |
|
|
134 |
genome_length+=chr[1] |
|
|
135 |
|
|
|
136 |
return(genome_length) |
|
|
137 |
|
|
|
138 |
def generateRandomLoci(n, chromosomes): |
|
|
139 |
|
|
|
140 |
genome_length=findGenomeLength(chromosomes) |
|
|
141 |
return(random.sample(range(0, genome_length), n)) |
|
|
142 |
|
|
|
143 |
def findChrInLinearGenome(chromosomes, random_loci): |
|
|
144 |
|
|
|
145 |
output_list=[] |
|
|
146 |
for locus in random_loci: |
|
|
147 |
linear_genome_slider=0 |
|
|
148 |
linear_genome_slider_previous=0 |
|
|
149 |
for chromosome in chromosomes: |
|
|
150 |
linear_genome_slider+=chromosome[1] |
|
|
151 |
|
|
|
152 |
if locus < linear_genome_slider: |
|
|
153 |
#print(f'slider:{linear_genome_slider}, chr:{chromosome[0]}, locus:{locus}') |
|
|
154 |
output_list.append([chromosome[0], locus-linear_genome_slider_previous]) |
|
|
155 |
linear_genome_slider_previous+=chromosome[1] |
|
|
156 |
break |
|
|
157 |
linear_genome_slider_previous+=chromosome[1] |
|
|
158 |
|
|
|
159 |
return(output_list) |
|
|
160 |
|
|
|
161 |
def findSpectraStrandwise(colA, colB): |
|
|
162 |
if ((colA == 'C') & (colB == 'T')): |
|
|
163 |
return('C_to_T') |
|
|
164 |
elif ((colA == 'C') & (colB == 'A')): |
|
|
165 |
return('C_to_A') |
|
|
166 |
elif ((colA == 'C') & (colB == 'G')): |
|
|
167 |
return('C_to_G') |
|
|
168 |
elif ((colA == 'T') & (colB == 'C')): |
|
|
169 |
return('T_to_C') |
|
|
170 |
elif ((colA == 'T') & (colB == 'G')): |
|
|
171 |
return('T_to_G') |
|
|
172 |
elif ((colA == 'T') & (colB == 'A')): |
|
|
173 |
return('T_to_A') |
|
|
174 |
|
|
|
175 |
elif ((colA == 'G') & (colB == 'A')): |
|
|
176 |
return('G_to_A') |
|
|
177 |
elif ((colA == 'G') & (colB == 'T')): |
|
|
178 |
return('G_to_T') |
|
|
179 |
elif ((colA == 'G') & (colB == 'C')): |
|
|
180 |
return('G_to_C') |
|
|
181 |
elif ((colA == 'A') & (colB == 'G')): |
|
|
182 |
return('A_to_G') |
|
|
183 |
elif ((colA == 'A') & (colB == 'C')): |
|
|
184 |
return('A_to_C') |
|
|
185 |
elif ((colA == 'A') & (colB == 'T')): |
|
|
186 |
return('A_to_T') |
|
|
187 |
|
|
|
188 |
def renameChrToRoman(df): |
|
|
189 |
roman_names={"chr1": "I", |
|
|
190 |
"chr2": "II", |
|
|
191 |
"chr3": "III", |
|
|
192 |
"chr4": "IV", |
|
|
193 |
"chr5": "V", |
|
|
194 |
"chr6": "VI", |
|
|
195 |
"chr7": "VII", |
|
|
196 |
"chr8": "VIII", |
|
|
197 |
"chr9": "IX", |
|
|
198 |
"chr10": "X", |
|
|
199 |
"chr11": "XI", |
|
|
200 |
"chr12": "XII", |
|
|
201 |
"chr13": "XIII", |
|
|
202 |
"chr14": "XIV", |
|
|
203 |
"chr15": "XV", |
|
|
204 |
"chr16": "XVI"} |
|
|
205 |
df.replace({"chromosome": roman_names}, inplace=True) |
|
|
206 |
return(df) |
|
|
207 |
|
|
|
208 |
def draw_random_SNPMap(pd_df, df_chr_lengths, chr_starts_df, title, saveMap=True): |
|
|
209 |
|
|
|
210 |
#To rename legend elements, change plot markers/colors, modify here |
|
|
211 |
|
|
|
212 |
label_dict = {"cen": 'Centromere'} |
|
|
213 |
markers = {"random": "$|$", "cen": "o"} |
|
|
214 |
palette = {"random": "black", "cen": "white"} |
|
|
215 |
|
|
|
216 |
|
|
|
217 |
pd_df = pd_df.append(chr_starts_df, ignore_index=True) |
|
|
218 |
pd_df = pd_df.sort_values(by="CHROM", key=lambda x: np.argsort(index_natsorted(pd_df["CHROM"]))) |
|
|
219 |
renameChrToRoman(pd_df) |
|
|
220 |
|
|
|
221 |
|
|
|
222 |
fig, ax = plt.subplots() |
|
|
223 |
|
|
|
224 |
renameChrToRoman(df_chr_lengths) |
|
|
225 |
|
|
|
226 |
df_chr_lengths.plot(kind='barh',legend=False, ax=ax, x="chromosome", y="end_position", fontsize=40, figsize=(80,24), edgecolor="black", linewidth=1, color="lightgray", zorder=0, label="Position") |
|
|
227 |
sns.scatterplot(ax=ax, data=pd_df, x="POS", y="CHROM", style="REF", markers=markers, hue="REF", palette=palette, s=1500, alpha=1,zorder=1, linewidth=1, edgecolor="black", legend=False) |
|
|
228 |
|
|
|
229 |
ax.set_xlabel("POSITION", fontsize=40) |
|
|
230 |
ax.set_ylabel("CHROMOSOME", fontsize=40) |
|
|
231 |
fig.suptitle(title, fontsize=60) |
|
|
232 |
|
|
|
233 |
legend_texts = plt.legend().get_texts() |
|
|
234 |
for i in range(len(legend_texts)): |
|
|
235 |
label_str = legend_texts[i].get_text() |
|
|
236 |
if label_str in label_dict: |
|
|
237 |
new_label = label_dict.get(label_str) |
|
|
238 |
|
|
|
239 |
legend_texts[i].set_text(new_label) |
|
|
240 |
|
|
|
241 |
if saveMap: |
|
|
242 |
plt.savefig('figures/'+title+'.png', transparent=False, dpi=200) |
|
|
243 |
|
|
|
244 |
def drawCombinedSNPMap(pd_df, df_chr_lengths, chr_starts_df, title, sample_names, saveMap=True): |
|
|
245 |
|
|
|
246 |
#To rename legend elements, change plot markers/colors, modify here |
|
|
247 |
label_dict = { "G": "G->N", |
|
|
248 |
"C": "C->N", |
|
|
249 |
"A": "A->N", |
|
|
250 |
"T": "T->N", |
|
|
251 |
"SPECTRA_STRANDWISE": "Reference Allele", |
|
|
252 |
"cen": 'Centromere'} |
|
|
253 |
|
|
|
254 |
markers = {"C_to_T": "$|$", |
|
|
255 |
"C_to_A": "$|$", |
|
|
256 |
"C_to_G": "$|$", |
|
|
257 |
"G_to_A": "$|$", |
|
|
258 |
"G_to_C": "$|$", |
|
|
259 |
"G_to_T": "$|$", |
|
|
260 |
'T_to_A': "$|$", |
|
|
261 |
'T_to_C': "$|$", |
|
|
262 |
'A_to_C': "$|$", |
|
|
263 |
'A_to_G': "$|$", |
|
|
264 |
'A_to_T': "$|$", |
|
|
265 |
'T_to_G': "$|$", |
|
|
266 |
'cen': "o"} |
|
|
267 |
|
|
|
268 |
palette = {"C_to_T": "black", |
|
|
269 |
"C_to_A": "black", |
|
|
270 |
"C_to_G": "black", |
|
|
271 |
"G_to_A": "red", |
|
|
272 |
"G_to_C": "blue", |
|
|
273 |
"G_to_T": "olive", |
|
|
274 |
'T_to_A': "gray", |
|
|
275 |
'T_to_C': "gray", |
|
|
276 |
'A_to_C': "gray", |
|
|
277 |
'A_to_G': "gray", |
|
|
278 |
'A_to_T': "gray", |
|
|
279 |
'T_to_G': "gray", |
|
|
280 |
'cen': "white"} |
|
|
281 |
|
|
|
282 |
pd_df = pd_df.append(chr_starts_df, ignore_index=True) |
|
|
283 |
pd_df = pd_df.sort_values(by="CHROM", key=lambda x: np.argsort(index_natsorted(pd_df["CHROM"]))) |
|
|
284 |
renameChrToRoman(pd_df) |
|
|
285 |
|
|
|
286 |
fig, ax = plt.subplots() |
|
|
287 |
|
|
|
288 |
renameChrToRoman(df_chr_lengths) |
|
|
289 |
df_chr_lengths.plot(kind='barh',legend=False, ax=ax, x="chromosome", y="end_position", fontsize=10, figsize=(20,6), edgecolor="black", linewidth=1, color="lightgray", zorder=0, label="Position") |
|
|
290 |
|
|
|
291 |
sns.scatterplot(ax=ax, data=pd_df[pd_df["REF"].isin(["C",'cen'])], x="POS", y="CHROM", hue="SPECTRA_STRANDWISE", palette=palette, style="SPECTRA_STRANDWISE", markers=markers, s=100, alpha=.9,zorder=1, linewidth=.05, edgecolor="black", legend = False) |
|
|
292 |
sns.scatterplot(ax=ax, data=pd_df[pd_df["REF"].isin(["G",'cen'])], x="POS", y="CHROM", hue="SPECTRA_STRANDWISE", palette=palette, style="SPECTRA_STRANDWISE", markers=markers, s=100, alpha=.9,zorder=2, linewidth=.15, edgecolor="black") |
|
|
293 |
|
|
|
294 |
ax.set_xlabel("POSITION", fontsize=10) |
|
|
295 |
ax.set_ylabel("CHROMOSOME", fontsize=10) |
|
|
296 |
fig.suptitle(title, fontsize=15) |
|
|
297 |
|
|
|
298 |
legend_texts = plt.legend().get_texts() |
|
|
299 |
for i in range(len(legend_texts)): |
|
|
300 |
label_str = legend_texts[i].get_text() |
|
|
301 |
if label_str in label_dict: |
|
|
302 |
new_label = label_dict.get(label_str) |
|
|
303 |
legend_texts[i].set_text(new_label) |
|
|
304 |
|
|
|
305 |
if saveMap: |
|
|
306 |
plt.savefig('figures/'+title+'.png', transparent=False, dpi=600) |