functions_.py

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import normalized_mutual_info_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import SGDClassifier
from collections import Counter, defaultdict
from sklearn.metrics import confusion_matrix, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
# algorithm
# ----------
# Consider every unique value of the given feature and the number of occurrences of each in the train dataframe.
# Build a (1*9) vector: the k-th element = (number of times the value occurred in class k + 10*alpha) / (number of times it occurred in the whole data + 90*alpha)
# gv_dict is a look-up table: for every gene/variation it stores this (1*9) representation
# for each value of the feature in df:
#     if it is present in the train data:
#         append the vector stored in the 'gv_dict' look-up table to 'gv_fea'
#     if it is not present in the train data:
#         append [1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9] to 'gv_fea'
# return 'gv_fea'
# ----------------------
# alpha  : used for Laplace smoothing
# feature: one of ['Gene', 'Variation']
# df     : one of [train_df, test_df, cv_df]
def get_gv_fea_dict(alpha, feature, df):
    # value_count: a Series mapping each feature value to its count, e.g.
    # print(train_df['Gene'].value_counts())
    # output:
    # {BRCA1     174
    #  TP53      106
    #  EGFR       86
    #  BRCA2      75
    #  PTEN       69
    #  KIT        61
    #  BRAF       60
    #  ERBB2      47
    #  PDGFRA     46
    #  ...}
    # print(train_df['Variation'].value_counts())
    # output:
    # {Truncating_Mutations    63
    #  Deletion                43
    #  Amplification           43
    #  Fusions                 22
    #  Overexpression           3
    #  E17K                     3
    #  Q61L                     3
    #  S222D                    2
    #  P130S                    2
    #  ...}
    value_count = df[feature].value_counts()
    # gv_dict : Gene/Variation dict, which contains the probability array for each gene/variation
    gv_dict = dict()
    # denominator is the number of times that particular feature value occurred in the whole data
    for i, denominator in value_count.items():
        # vec will contain p(Class==k | feature==i), the probability that this gene/variation belongs to each class
        # vec is a 9-dimensional vector
        vec = []
        for k in range(1, 10):
            # print(train_df.loc[(train_df['Class']==1) & (train_df['Gene']=='BRCA1')])
            #           ID   Gene  Variation  Class
            # 2470    2470  BRCA1     S1715C      1
            # 2486    2486  BRCA1     S1841R      1
            # 2614    2614  BRCA1        M1R      1
            # 2432    2432  BRCA1     L1657P      1
            # 2567    2567  BRCA1     T1685A      1
            # 2583    2583  BRCA1     E1660G      1
            # 2634    2634  BRCA1     W1718L      1
            # cls_cnt.shape[0] will return the number of rows
            cls_cnt = df.loc[(df['Class'] == k) & (df[feature] == i)]
            # cls_cnt.shape[0] (the numerator) is the number of times this feature value occurred in class k
            vec.append((cls_cnt.shape[0] + alpha * 10) / (denominator + 90 * alpha))
        # add the gene/variation to the dict as key and vec as value
        gv_dict[i] = vec
    return gv_dict
# Get Gene variation feature
def get_gv_feature(alpha, feature, df):
    # print(gv_dict)
    # {'BRCA1': [0.20075757575757575, 0.03787878787878788, 0.068181818181818177, 0.13636363636363635, 0.25, 0.19318181818181818, 0.03787878787878788, 0.03787878787878788, 0.03787878787878788],
    #  'TP53':  [0.32142857142857145, 0.061224489795918366, 0.061224489795918366, 0.27040816326530615, 0.061224489795918366, 0.066326530612244902, 0.051020408163265307, 0.051020408163265307, 0.056122448979591837],
    #  'EGFR':  [0.056818181818181816, 0.21590909090909091, 0.0625, 0.068181818181818177, 0.068181818181818177, 0.0625, 0.34659090909090912, 0.0625, 0.056818181818181816],
    #  'BRCA2': [0.13333333333333333, 0.060606060606060608, 0.060606060606060608, 0.078787878787878782, 0.1393939393939394, 0.34545454545454546, 0.060606060606060608, 0.060606060606060608, 0.060606060606060608],
    #  'PTEN':  [0.069182389937106917, 0.062893081761006289, 0.069182389937106917, 0.46540880503144655, 0.075471698113207544, 0.062893081761006289, 0.069182389937106917, 0.062893081761006289, 0.062893081761006289],
    #  'KIT':   [0.066225165562913912, 0.25165562913907286, 0.072847682119205295, 0.072847682119205295, 0.066225165562913912, 0.066225165562913912, 0.27152317880794702, 0.066225165562913912, 0.066225165562913912],
    #  'BRAF':  [0.066666666666666666, 0.17999999999999999, 0.073333333333333334, 0.073333333333333334, 0.093333333333333338, 0.080000000000000002, 0.29999999999999999, 0.066666666666666666, 0.066666666666666666],
    #  ...
    # }
    gv_dict = get_gv_fea_dict(alpha, feature, df)
    # value_count is the same as in get_gv_fea_dict
    value_count = df[feature].value_counts()
    # gv_fea: Gene/Variation feature; it will contain the response-coded vector for every row in the data
    gv_fea = []
    # for every feature value in the given dataframe: if it is present in the train data we append its
    # vector to gv_fea, otherwise we append [1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9]
    for index, row in df.iterrows():
        if row[feature] in dict(value_count).keys():
            gv_fea.append(gv_dict[row[feature]])
        else:
            gv_fea.append([1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9])
            # gv_fea.append([-1,-1,-1,-1,-1,-1,-1,-1,-1])
    return gv_fea
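# Example usage (a sketch, not from the original pipeline): assuming train_df, test_df and cv_df are
# dataframes with 'Gene' and 'Class' columns, response-code the 'Gene' feature with alpha = 1.
# train_gene_feature_responseCoding = np.array(get_gv_feature(1, "Gene", train_df))
# test_gene_feature_responseCoding = np.array(get_gv_feature(1, "Gene", test_df))
# cv_gene_feature_responseCoding = np.array(get_gv_feature(1, "Gene", cv_df))
# train_gene_feature_responseCoding.shape  # (number of train rows, 9): one smoothed probability per class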
# cls_text is a dataframe
# for every row in the dataframe consider the 'TEXT' column,
# split the text on whitespace, and build a dict of word counts,
# incrementing a word's count each time it is seen
def extract_dictionary_paddle(cls_text):
    dictionary = defaultdict(int)
    for index, row in cls_text.iterrows():
        for word in row['TEXT'].split():
            dictionary[word] += 1
    return dictionary
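# Example (a sketch, hypothetical names): build one word-count dict per class plus one for the full train data,
# which is the input that get_text_responsecoding below expects.
# dict_list = [extract_dictionary_paddle(train_df[train_df['Class'] == k]) for k in range(1, 10)]
# total_dict = extract_dictionary_paddle(train_df)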
# https://stackoverflow.com/a/1602964
def get_text_responsecoding(df, dict_list, total_dict):
    text_feature_responseCoding = np.zeros((df.shape[0], 9))
    for i in range(0, 9):
        row_index = 0
        for index, row in df.iterrows():
            sum_prob = 0
            for word in row['TEXT'].split():
                # Laplace-smoothed probability of the word under class i+1; summing logs avoids underflow
                sum_prob += math.log(((dict_list[i].get(word, 0) + 10) / (total_dict.get(word, 0) + 90)))
            # exp of the mean log-probability = geometric mean of the per-word probabilities for this row and class
            text_feature_responseCoding[row_index][i] = math.exp(sum_prob / len(row['TEXT'].split()))
            row_index += 1
    return text_feature_responseCoding
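# Example usage (a sketch): with dict_list and total_dict as built above, response-code the text and
# normalise each row so the nine class values sum to 1 (the normalisation step is an assumption, not part of this file).
# train_text_feature_responseCoding = get_text_responsecoding(train_df, dict_list, total_dict)
# train_text_feature_responseCoding = (train_text_feature_responseCoding.T / train_text_feature_responseCoding.sum(axis=1)).T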
def get_intersec_text(df, train_text_features):
    # fit a CountVectorizer on df['TEXT'] and count how many of its features also appear in the train text features
    # note: newer scikit-learn versions replace get_feature_names() with get_feature_names_out()
    df_text_vec = CountVectorizer(min_df=3)
    df_text_fea = df_text_vec.fit_transform(df['TEXT'])
    df_text_features = df_text_vec.get_feature_names()
    df_text_fea_counts = df_text_fea.sum(axis=0).A1
    df_text_fea_dict = dict(zip(list(df_text_features), df_text_fea_counts))
    len1 = len(set(df_text_features))
    len2 = len(set(train_text_features) & set(df_text_features))
    return len1, len2
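# Example usage (a sketch; train_text_features is assumed to come from a CountVectorizer fitted on the train text):
# len1, len2 = get_intersec_text(cv_df, train_text_features)
# print(np.round((len2 / len1) * 100, 3), "% of words of cv data appeared in train data")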
def report_log_loss(train_x, train_y, test_x, test_y, clf):
    # fit the base classifier, calibrate its probabilities with Platt scaling, and report log loss on the test split
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    sig_clf_probs = sig_clf.predict_proba(test_x)
    return log_loss(test_y, sig_clf_probs, eps=1e-15)
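# Example usage (a sketch; train_x_onehotCoding, test_x_onehotCoding, y_train and y_test are hypothetical
# stacked one-hot features and integer class labels built elsewhere):
# clf = SGDClassifier(alpha=0.001, penalty='l2', loss='log', random_state=42)
# print("test log loss:", report_log_loss(train_x_onehotCoding, y_train, test_x_onehotCoding, y_test, clf))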
# This function plots the confusion, precision and recall matrices given y_i, y_i_hat.
def plot_confusion_matrix(test_y, predict_y):
    C = confusion_matrix(test_y, predict_y)
    # A: divide each row by its sum -> recall matrix (each row sums to 1)
    A = (((C.T) / (C.sum(axis=1))).T)
    # B: divide each column by its sum -> precision matrix (each column sums to 1)
    B = (C / C.sum(axis=0))
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    # representing C in heatmap format
    print("-"*20, "Confusion matrix", "-"*20)
    plt.figure(figsize=(20, 7))
    sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    # representing B in heatmap format
    print("-"*20, "Precision matrix (Column Sum=1)", "-"*20)
    plt.figure(figsize=(20, 7))
    sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    # representing A in heatmap format
    print("-"*20, "Recall matrix (Row sum=1)", "-"*20)
    plt.figure(figsize=(20, 7))
    sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
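# Worked example (a sketch): for a 2x2 toy matrix C = [[8, 2], [1, 9]],
# the recall matrix A divides each row by its sum       -> [[0.8, 0.2], [0.1, 0.9]]
# the precision matrix B divides each column by its sum -> [[8/9, 2/11], [1/9, 9/11]]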
def predict_and_plot_confusion_matrix(train_x, train_y, test_x, test_y, clf):
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    pred_y = sig_clf.predict(test_x)
    # for calculating log_loss we provide the array of probabilities for each class
    print("Log loss :", log_loss(test_y, sig_clf.predict_proba(test_x)))
    # fraction of data points that are misclassified
    print("Fraction of mis-classified points :", np.count_nonzero((pred_y - test_y)) / test_y.shape[0])
    plot_confusion_matrix(test_y, pred_y)
# this function will be used just for naive bayes
# for the given indices, we will print the name of the features
# and check whether the feature is present in the test point's text or not
def get_impfeature_names(indices, text, gene, var, no_features, df):
    gene_count_vec = CountVectorizer()
    var_count_vec = CountVectorizer()
    text_count_vec = CountVectorizer(min_df=3)
    gene_vec = gene_count_vec.fit(df['Gene'])
    var_vec = var_count_vec.fit(df['Variation'])
    text_vec = text_count_vec.fit(df['TEXT'])
    fea1_len = len(gene_vec.get_feature_names())
    fea2_len = len(var_count_vec.get_feature_names())
    word_present = 0
    for i, v in enumerate(indices):
        if (v < fea1_len):
            word = gene_vec.get_feature_names()[v]
            yes_no = True if word == gene else False
            if yes_no:
                word_present += 1
            print(i, "Gene feature [{}] present in test data point [{}]".format(word, yes_no))
        elif (v < fea1_len + fea2_len):
            word = var_vec.get_feature_names()[v - (fea1_len)]
            yes_no = True if word == var else False
            if yes_no:
                word_present += 1
            print(i, "Variation feature [{}] present in test data point [{}]".format(word, yes_no))
        else:
            word = text_vec.get_feature_names()[v - (fea1_len + fea2_len)]
            yes_no = True if word in text.split() else False
            if yes_no:
                word_present += 1
            print(i, "Text feature [{}] present in test data point [{}]".format(word, yes_no))
    print("Out of the top", no_features, "features,", word_present, "are present in the query point")
def apply_response_coding(train_df, test_df, cv_df, y_train, y_cv, y_test, col_name, train_feature_onehotcoding, test_feature_onehotcoding, cv_feature_onehotcoding):
    # response coding with Laplace smoothing (alpha = 1)
    alpha = 1
    train_feature_responseCoding = np.array(get_gv_feature(alpha, col_name, train_df))
    test_feature_responseCoding = np.array(get_gv_feature(alpha, col_name, test_df))
    cv_feature_responseCoding = np.array(get_gv_feature(alpha, col_name, cv_df))
    # hyperparameter for the SGD classifier
    alpha = [10 ** x for x in range(-5, 1)]
    cv_log_error_array = []
    for i in alpha:
        clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
        clf.fit(train_feature_onehotcoding, y_train)
        sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
        sig_clf.fit(train_feature_onehotcoding, y_train)
        predict_y = sig_clf.predict_proba(cv_feature_onehotcoding)
        cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
        print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
    # plot the cross-validation error to check the best alpha value
    fig, ax = plt.subplots()
    ax.plot(alpha, cv_log_error_array, c='g')
    for i, txt in enumerate(np.round(cv_log_error_array, 3)):
        ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
    plt.grid()
    plt.title("Cross Validation Error for each alpha")
    plt.xlabel("Alpha i's")
    plt.ylabel("Error measure")
    plt.show()
    best_alpha = np.argmin(cv_log_error_array)
    clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
    clf.fit(train_feature_onehotcoding, y_train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_feature_onehotcoding, y_train)
    predict_y = sig_clf.predict_proba(train_feature_onehotcoding)
    print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
    predict_y = sig_clf.predict_proba(cv_feature_onehotcoding)
    print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
    predict_y = sig_clf.predict_proba(test_feature_onehotcoding)
    print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
    # coverage: how many test/cv values of this column were already seen in the train data
    test_coverage = test_df[test_df[col_name].isin(list(set(train_df[col_name])))].shape[0]
    cv_coverage = cv_df[cv_df[col_name].isin(list(set(train_df[col_name])))].shape[0]
    print('1. In test data {} out of {} : {:.2f}%'.format(test_coverage, test_df.shape[0], (test_coverage / test_df.shape[0]) * 100))
    print('2. In cross validation data {} out of {} : {:.2f}%'.format(cv_coverage, cv_df.shape[0], (cv_coverage / cv_df.shape[0]) * 100))
    return train_feature_responseCoding, test_feature_responseCoding, cv_feature_responseCoding
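# Example usage (a sketch; the *_onehotCoding matrices are hypothetical one-hot encodings built elsewhere):
# train_gene_rc, test_gene_rc, cv_gene_rc = apply_response_coding(
#     train_df, test_df, cv_df, y_train, y_cv, y_test, "Gene",
#     train_gene_feature_onehotCoding, test_gene_feature_onehotCoding, cv_gene_feature_onehotCoding)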