Switch to unified view

a b/BioAid/variantTK/variants.py
1
# This code was developed and authored by Jerzy Twarowski in Malkova Lab at the University of Iowa 
2
# Contact: jerzymateusz-twarowski@uiowa.edu, tvarovski1@gmail.com
3
4
## Contents:
5
## qualitySNPFilter
6
## filterByAD
7
## findDominantAF
8
## findType
9
## findZygosity
10
## filterFromClones
11
## drawSNPMap
12
## findGenomeLength
13
## generateRandomLoci
14
## findChrInLinearGenome
15
## findSpectraStrandwise
16
## renameChrToRoman
17
## draw_random_SNPMap
18
## drawCombinedSNPMap
19
20
import os
21
import random
22
import numpy as np
23
import pandas as pd
24
import matplotlib.pyplot as plt
25
from natsort import index_natsorted
26
import seaborn as sns
27
from numpy.core.numeric import NaN
28
29
30
def qualitySNPFilter(frames_list):
31
32
  for i in range(len(frames_list)):
33
    frames_list[i] = frames_list[i][frames_list[i].TYPE =='SNP']
34
    frames_list[i] = frames_list[i].rename(columns=lambda c: 'AF' if 'AF' in c else c)
35
    frames_list[i] = frames_list[i].rename(columns=lambda c: 'AD' if 'AD' in c else c)
36
    
37
    #this line may be problematic for a good result and if not needed comment out
38
    frames_list[i]['AF'] = frames_list[i].apply(lambda row: findDominantAF(row), axis=1)
39
40
    frames_list[i] = frames_list[i].astype({'AF': float})
41
    frames_list[i] = frames_list[i][frames_list[i].AF >= 0.35]
42
    #also filter out 1.5n variants?
43
44
    frames_list[i]['EVAL_AD'] = frames_list[i].apply(lambda row: filterByAD(row), axis=1)
45
    frames_list[i] = frames_list[i][frames_list[i].EVAL_AD == 'True']
46
    frames_list[i]['SPECTRA'] = frames_list[i].apply(lambda row: findType(row.REF, row.ALT), axis=1)
47
  return frames_list
48
49
def filterByAD(row):
50
  reads=row['AD'].split(",")
51
  if int(reads[1]) < 5:
52
    return "False"
53
  if int(reads[0])+int(reads[1]) > 9:
54
    return "True"
55
  else:
56
    return "False"
57
58
def findDominantAF(row):
59
  reads=row['AF'].split(",")
60
  return(max(reads))
61
62
def findType(colA, colB):
63
  if ((colA == 'C') & (colB == 'T') | (colA == 'G') & (colB == 'A')):
64
    return('C_to_T')
65
  if ((colA == 'C') & (colB == 'A') | (colA == 'G') & (colB == 'T')):
66
    return('C_to_A')
67
  if ((colA == 'C') & (colB == 'G') | (colA == 'G') & (colB == 'C')):
68
    return('C_to_G')
69
  if ((colA == 'T') & (colB == 'C') | (colA == 'A') & (colB == 'G')):
70
    return('T_to_C')
71
  if ((colA == 'T') & (colB == 'G') | (colA == 'A') & (colB == 'C')):
72
    return('T_to_G')
73
  if ((colA == 'T') & (colB == 'A') | (colA == 'A') & (colB == 'T')):
74
    return('T_to_A')
75
76
def findZygosity(row):
77
  if (row["AF"] >= 0.85):
78
    return('Homozygous')
79
  else:
80
    return('Heterozygous')
81
82
def filterFromClones(frames_list):
83
  #this will subtract all common rows between normal/control and experimental
84
  #sample. I might need to revist to filter variants by position only not all columns
85
  subtracted_df_list = []
86
  for df in frames_list:
87
    temp_df = df.drop(columns=['AD','AF'])
88
    len_original=len(temp_df)
89
    for df_2 in frames_list:
90
      if not df.equals(df_2):
91
        df_2=df_2.drop(columns=['AD','AF'])
92
        temp_df = pd.merge(temp_df,df_2, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
93
94
    len_final=len(temp_df)
95
    print(f"removed {len_original-len_final} rows from df")
96
    subtracted_df_list.append(temp_df)
97
  return subtracted_df_list
98
99
def drawSNPMap(pd_df, df_chr_lengths, chr_starts_df, title, sample_names, saveMap=True):
100
101
  pd_df = pd_df.append(chr_starts_df, ignore_index=True)
102
103
  fig, ax = plt.subplots()
104
105
  df_chr_lengths.plot(kind='barh',legend=False, ax=ax, x="chromosome", y="end_position", fontsize=15, figsize=(20,6), edgecolor="black", linewidth=1, color="beige", zorder=0, title=title, label="Position")
106
  
107
  pd_df = pd_df.sort_values(by="CHROM", key=lambda x: np.argsort(index_natsorted(pd_df["CHROM"])))
108
  
109
  #To rename legend elements, change plot markers/colors, modify here
110
  label_dict = { "G": "G->N", "C": "C->N", "A": "A->N", "T": "T->N", "REF": "Reference Allele"}
111
  markers = {"Homozygous": "x", "Heterozygous": "|"}
112
  palette = {"G": "green", "C": "red", "A": "gray", "T": "gray"}
113
114
  sns.scatterplot(ax=ax,data=pd_df, x="POS", y="CHROM", hue="REF", palette=palette, style="ZYGOSITY", markers=markers, s=120, alpha=0.7,zorder=1, linewidth=1.5)
115
  
116
  ax.set_xlabel("POSITION")
117
  ax.set_ylabel("CHROMOSOME")
118
119
  legend_texts = plt.legend().get_texts()
120
  for i in range(len(legend_texts)):
121
    label_str = legend_texts[i].get_text()
122
    if label_str in label_dict:
123
      new_label = label_dict.get(label_str)
124
125
      legend_texts[i].set_text(new_label)
126
127
  if saveMap:
128
    plt.savefig('figures/'+title+'.png', transparent=False)
129
130
def findGenomeLength(chromosomes):
131
132
  genome_length = 0
133
  for chr in chromosomes:
134
    genome_length+=chr[1]
135
136
  return(genome_length)
137
138
def generateRandomLoci(n, chromosomes):
139
140
  genome_length=findGenomeLength(chromosomes)
141
  return(random.sample(range(0, genome_length), n))
142
143
def findChrInLinearGenome(chromosomes, random_loci):
144
145
  output_list=[]
146
  for locus in random_loci:
147
    linear_genome_slider=0
148
    linear_genome_slider_previous=0
149
    for chromosome in chromosomes:
150
      linear_genome_slider+=chromosome[1]
151
152
      if locus < linear_genome_slider:
153
        #print(f'slider:{linear_genome_slider}, chr:{chromosome[0]}, locus:{locus}')
154
        output_list.append([chromosome[0], locus-linear_genome_slider_previous])
155
        linear_genome_slider_previous+=chromosome[1]
156
        break
157
      linear_genome_slider_previous+=chromosome[1]
158
159
  return(output_list)
160
161
def findSpectraStrandwise(colA, colB):
162
  if ((colA == 'C') & (colB == 'T')):
163
    return('C_to_T')
164
  elif ((colA == 'C') & (colB == 'A')):
165
    return('C_to_A')
166
  elif ((colA == 'C') & (colB == 'G')):
167
    return('C_to_G')
168
  elif ((colA == 'T') & (colB == 'C')):
169
    return('T_to_C')
170
  elif ((colA == 'T') & (colB == 'G')):
171
    return('T_to_G')
172
  elif ((colA == 'T') & (colB == 'A')):
173
    return('T_to_A')
174
175
  elif ((colA == 'G') & (colB == 'A')):
176
    return('G_to_A')
177
  elif ((colA == 'G') & (colB == 'T')):
178
    return('G_to_T')
179
  elif ((colA == 'G') & (colB == 'C')):
180
    return('G_to_C')
181
  elif ((colA == 'A') & (colB == 'G')):
182
    return('A_to_G')
183
  elif ((colA == 'A') & (colB == 'C')):
184
    return('A_to_C')
185
  elif ((colA == 'A') & (colB == 'T')):
186
    return('A_to_T')
187
188
def renameChrToRoman(df):
189
  roman_names={"chr1": "I",
190
               "chr2": "II",
191
               "chr3": "III",
192
               "chr4": "IV",
193
               "chr5": "V",
194
               "chr6": "VI",
195
               "chr7": "VII",
196
               "chr8": "VIII",
197
               "chr9": "IX",
198
               "chr10": "X",
199
               "chr11": "XI",
200
               "chr12": "XII",
201
               "chr13": "XIII",
202
               "chr14": "XIV",
203
               "chr15": "XV",
204
               "chr16": "XVI"}
205
  df.replace({"chromosome": roman_names}, inplace=True)
206
  return(df)
207
208
def draw_random_SNPMap(pd_df, df_chr_lengths, chr_starts_df, title, saveMap=True):
209
210
  #To rename legend elements, change plot markers/colors, modify here
211
212
  label_dict = {"cen": 'Centromere'}
213
  markers = {"random": "$|$", "cen": "o"}
214
  palette = {"random": "black", "cen": "white"}
215
216
  
217
  pd_df = pd_df.append(chr_starts_df, ignore_index=True)
218
  pd_df = pd_df.sort_values(by="CHROM", key=lambda x: np.argsort(index_natsorted(pd_df["CHROM"])))
219
  renameChrToRoman(pd_df)
220
221
222
  fig, ax = plt.subplots()
223
224
  renameChrToRoman(df_chr_lengths)
225
226
  df_chr_lengths.plot(kind='barh',legend=False, ax=ax, x="chromosome", y="end_position", fontsize=40, figsize=(80,24), edgecolor="black", linewidth=1, color="lightgray", zorder=0, label="Position")
227
  sns.scatterplot(ax=ax, data=pd_df, x="POS", y="CHROM", style="REF", markers=markers, hue="REF", palette=palette, s=1500, alpha=1,zorder=1, linewidth=1, edgecolor="black", legend=False)
228
229
  ax.set_xlabel("POSITION", fontsize=40)
230
  ax.set_ylabel("CHROMOSOME", fontsize=40)
231
  fig.suptitle(title, fontsize=60)
232
233
  legend_texts = plt.legend().get_texts()
234
  for i in range(len(legend_texts)):
235
    label_str = legend_texts[i].get_text()
236
    if label_str in label_dict:
237
      new_label = label_dict.get(label_str)
238
239
      legend_texts[i].set_text(new_label)
240
241
  if saveMap:
242
    plt.savefig('figures/'+title+'.png', transparent=False, dpi=200)
243
244
def drawCombinedSNPMap(pd_df, df_chr_lengths, chr_starts_df, title, sample_names, saveMap=True):
245
246
  #To rename legend elements, change plot markers/colors, modify here
247
  label_dict = { "G": "G->N",
248
                 "C": "C->N",
249
                 "A": "A->N",
250
                 "T": "T->N",
251
                 "SPECTRA_STRANDWISE": "Reference Allele",
252
                 "cen": 'Centromere'}
253
                
254
  markers = {"C_to_T": "$|$",
255
             "C_to_A": "$|$",
256
             "C_to_G": "$|$",
257
             "G_to_A": "$|$",
258
             "G_to_C": "$|$",
259
             "G_to_T": "$|$",
260
             'T_to_A': "$|$",
261
             'T_to_C': "$|$",
262
             'A_to_C': "$|$",
263
             'A_to_G': "$|$",
264
             'A_to_T': "$|$",
265
             'T_to_G': "$|$",
266
             'cen': "o"}
267
268
  palette = {"C_to_T": "black",
269
             "C_to_A": "black",
270
             "C_to_G": "black",
271
             "G_to_A": "red",
272
             "G_to_C": "blue",
273
             "G_to_T": "olive",
274
             'T_to_A': "gray",
275
             'T_to_C': "gray",
276
             'A_to_C': "gray",
277
             'A_to_G': "gray",
278
             'A_to_T': "gray",
279
             'T_to_G': "gray",
280
             'cen': "white"}
281
282
  pd_df = pd_df.append(chr_starts_df, ignore_index=True)
283
  pd_df = pd_df.sort_values(by="CHROM", key=lambda x: np.argsort(index_natsorted(pd_df["CHROM"])))
284
  renameChrToRoman(pd_df)
285
  
286
  fig, ax = plt.subplots()
287
288
  renameChrToRoman(df_chr_lengths)
289
  df_chr_lengths.plot(kind='barh',legend=False, ax=ax, x="chromosome", y="end_position", fontsize=10, figsize=(20,6), edgecolor="black", linewidth=1, color="lightgray", zorder=0, label="Position")
290
291
  sns.scatterplot(ax=ax, data=pd_df[pd_df["REF"].isin(["C",'cen'])], x="POS", y="CHROM", hue="SPECTRA_STRANDWISE", palette=palette, style="SPECTRA_STRANDWISE", markers=markers, s=100, alpha=.9,zorder=1, linewidth=.05, edgecolor="black", legend = False)
292
  sns.scatterplot(ax=ax, data=pd_df[pd_df["REF"].isin(["G",'cen'])], x="POS", y="CHROM", hue="SPECTRA_STRANDWISE", palette=palette, style="SPECTRA_STRANDWISE", markers=markers, s=100, alpha=.9,zorder=2, linewidth=.15, edgecolor="black")
293
294
  ax.set_xlabel("POSITION", fontsize=10)
295
  ax.set_ylabel("CHROMOSOME", fontsize=10)
296
  fig.suptitle(title, fontsize=15)
297
298
  legend_texts = plt.legend().get_texts()
299
  for i in range(len(legend_texts)):
300
    label_str = legend_texts[i].get_text()
301
    if label_str in label_dict:
302
      new_label = label_dict.get(label_str)
303
      legend_texts[i].set_text(new_label)
304
305
  if saveMap:
306
    plt.savefig('figures/'+title+'.png', transparent=False, dpi=600)