# b/run.py

import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from scipy.sparse import hstack
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from collections import Counter, defaultdict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import math
from sklearn.metrics import normalized_mutual_info_score
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings("ignore")

from mlxtend.classifier import StackingClassifier

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Loading training_variants: a comma-separated file
data_variants = pd.read_csv('training/training_variants')
# Loading training_text: separated by ||
data_text = pd.read_csv("training/training_text", sep=r"\|\|", engine="python", names=["ID", "TEXT"], skiprows=1)

# a first look at data_variants

data_variants.head()
# to classify cancer, we need the gene, the variation and the research paper (text)
# ID --> common key in both data_variants & data_text, used for linking (merging the dataframes)
# Gene --> the gene where the genetic mutation is located
# Variation --> the amino-acid change for the respective mutation
# Class --> 1-9, the target class the genetic mutation has been classified into
# in data_variants the text is missing; it lives in the data_text dataframe

data_variants.info()

data_variants.describe()

data_variants.columns

data_variants.shape

# a first look at data_text

data_text.head()
# TEXT --> huge, since each entry is a research paper

data_text.info()
# missing values exist

data_text.columns

data_text.shape

# target classes
data_variants.Class.unique()
# multiclass classification problem

'''
important things before modelling:

1. Since this is a medical problem, errors are costly,
   so the model has to return its outcome as a probability.
   p(y=2|x1) = 0.99 --> OK
   p(y=2|x1) = 0.5  --> NOT OK, because the probability is ambiguous; in this case we have to look for more evidence

2. Interpretability:

   why 0.5? why 0.99? the model should provide evidence for the probability value

   * some ML models are very good at interpretability, e.g. naive Bayes, decision trees, logistic regression, linear SVM
   * some ML models are not good at interpretability, e.g. RBF-kernel SVM

3. Latency:

   * OK to take a few seconds/minutes for an output
   * NOT OK to take hours for an output
'''
# data_text holds raw text (the research papers); we cannot feed raw text to a model,
# so we convert it into a format the model understands

# numbers, stopwords, multiple spaces and special characters are not relevant, so let's remove them

stop_words = set(stopwords.words('english'))

def data_text_preprocess(total_text, ind, col):
    # skip int values in the text column, as they are likely not important
    if type(total_text) is not int:
        string = ""
        # replace every special character with a space
        total_text = re.sub(r'[^a-zA-Z0-9\n]', ' ', str(total_text))
        # replace multiple spaces with a single space
        total_text = re.sub(r'\s+', ' ', str(total_text))
        # bring the whole text to lower case
        total_text = total_text.lower()

        for word in total_text.split():
            # if the word is not a stop word, retain it
            if not word in stop_words:
                string += word + " "

        data_text.loc[ind, col] = string

for index, row in data_text.iterrows():
    if type(row['TEXT']) is str:
        data_text_preprocess(row['TEXT'], index, 'TEXT')

# data preprocessing is completed
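
# A quick illustration of what the cleaning steps above do, on a made-up sample
# string (hypothetical, not from the dataset):
sample = "The EGFR L858R mutation, seen in ~10% of cases!"
sample = re.sub(r'[^a-zA-Z0-9\n]', ' ', sample)   # special characters -> spaces
sample = re.sub(r'\s+', ' ', sample).lower()      # collapse whitespace, lower-case
print(' '.join(w for w in sample.split() if w not in stop_words))
# expected output: "egfr l858r mutation seen 10 cases"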

# let's merge the dataframes on ID
result = pd.merge(data_variants, data_text, on='ID', how='left')
result.head()

# check for missing values
result[result.isnull().any(axis=1)]
# 5 rows have missing TEXT, so let's impute by concatenating the Gene and Variation values

result.loc[result['TEXT'].isnull(), 'TEXT'] = result['Gene'] + ' ' + result['Variation']

# cross check
result[result.isnull().any(axis=1)]

# Split the dataset into training, test & validation data

y_true = result['Class'].values
result.Gene = result.Gene.str.replace(r'\s+', '_', regex=True)
result.Variation = result.Variation.str.replace(r'\s+', '_', regex=True)

# splitting the data into train and test sets
X_train, test_df, y_train, y_test = train_test_split(result, y_true, stratify=y_true, test_size=0.2)
# split the train data further into train and cross-validation sets
train_df, cv_df, y_train, y_cv = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)

print('Number of data points in train data:', train_df.shape[0])
print('Number of data points in test data:', test_df.shape[0])
print('Number of data points in cross validation data:', cv_df.shape[0])

# is the data distributed properly across the splits?

train_class_distribution = train_df['Class'].value_counts()
test_class_distribution = test_df['Class'].value_counts()
cv_class_distribution = cv_df['Class'].value_counts()

# let's visualize

# viz for the train class distribution

my_colors = 'rgbkymc'
train_class_distribution.plot(kind='bar', color=list(my_colors))
plt.xlabel('Class')
plt.ylabel('Number of Data points per Class')
plt.title('Distribution of yi in train data')
plt.grid()
plt.show()

# let's look at it in percentage form

sorted_yi = np.argsort(-train_class_distribution.values)
for i in sorted_yi:
    print('Number of data points in class', i+1, ':', train_class_distribution.values[i], '(', np.round((train_class_distribution.values[i]/train_df.shape[0]*100), 3), '%)')

# viz for the test class distribution

my_colors = 'rgbkymc'
test_class_distribution.plot(kind='bar', color=list(my_colors))
plt.xlabel('Class')
plt.ylabel('Number of Data points per Class')
plt.title('Distribution of yi in test data')
plt.grid()
plt.show()

# let's look at it in percentage form

sorted_yi = np.argsort(-test_class_distribution.values)
for i in sorted_yi:
    print('Number of data points in class', i+1, ':', test_class_distribution.values[i], '(', np.round((test_class_distribution.values[i]/test_df.shape[0]*100), 3), '%)')

# viz for the cross validation set

my_colors = 'rgbkymc'
cv_class_distribution.plot(kind='bar', color=list(my_colors))
plt.xlabel('Class')
plt.ylabel('Data points per Class')
plt.title('Distribution of yi in cross validation data')
plt.grid()
plt.show()

# let's look at it in percentage form

sorted_yi = np.argsort(-cv_class_distribution.values)
for i in sorted_yi:
    print('Number of data points in class', i+1, ':', cv_class_distribution.values[i], '(', np.round((cv_class_distribution.values[i]/cv_df.shape[0]*100), 3), '%)')

# all three splits should follow roughly the same class distribution, which stratified splitting ensures

'''
what is the evaluation metric?
from the Kaggle page, multi-class log loss is the evaluation metric

log loss?

* the higher the predicted probability of the true class, the lower the log loss
* if the prediction is ambiguous, log loss penalizes it
* log loss = 0 --> a perfect model, which is not the case in the real world
* log loss lies between zero and infinity: [0, infinity)

one problem:

* for the accuracy metric: the range is 0 - 1, where 1 is the best and 0 is the worst
* for the AUC metric: the range is 0 - 1, where 1 is the best and 0 is the worst

BUT

log loss values range between 0 and infinity

how do we know whether a given value is good or bad?

one method:

random model --> worst reasonable model --> compute log loss (ex: 3)
given model  --> compute log loss (ex: 1.5)

the given model with 1.5 is better than the random model with 3

how do we generate a random model?

* we are working with probabilities here,
* so the predicted probabilities for each data point must sum to 1
'''
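
# A minimal numeric sketch of multi-class log loss (toy numbers, not from the
# dataset): log loss = -(1/N) * sum_i log(p_i), where p_i is the probability the
# model assigned to the true class of point i.
toy_true = [1, 2]
toy_probs = np.array([[0.7, 0.2, 0.1],    # true class 1 got p = 0.7
                      [0.1, 0.6, 0.3]])   # true class 2 got p = 0.6
manual = -(np.log(0.7) + np.log(0.6)) / 2
print(manual, log_loss(toy_true, toy_probs, labels=[1, 2, 3]))  # both ~0.434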

# build a random model

# 9 classes exist, so for each data point create 9 random numbers whose sum equals 1,
# because the probabilities of all 9 classes must sum to 1

test_data_len = test_df.shape[0]
cv_data_len = cv_df.shape[0]

# CV set error
cv_predicted_y = np.zeros((cv_data_len, 9))
for i in range(cv_data_len):
    rand_probs = np.random.rand(1, 9)  # 9 random numbers drawn uniformly from [0, 1)
    cv_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])  # normalize so they sum to 1
print("Log loss on Cross Validation Data using Random Model", log_loss(y_cv, cv_predicted_y, eps=1e-15))
# Log loss on Cross Validation Data using Random Model 2.491130932423048

# test-set error
test_predicted_y = np.zeros((test_data_len, 9))
for i in range(test_data_len):
    rand_probs = np.random.rand(1, 9)
    test_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
print("Log loss on Test Data using Random Model", log_loss(y_test, test_predicted_y, eps=1e-15))
# Log loss on Test Data using Random Model 2.531934856356048

# let's get the index of the max probability
predicted_y = np.argmax(test_predicted_y, axis=1)

# index values range from 0-8, but our classes run from 1-9, so shift by one
predicted_y = predicted_y + 1

# confusion matrix
C = confusion_matrix(y_test, predicted_y)

labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
plt.figure(figsize=(20, 7))
sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Class')
plt.ylabel('Original Class')
plt.show()

# precision matrix (each column sums to 1)

B = (C/C.sum(axis=0))

plt.figure(figsize=(20, 7))
sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Class')
plt.ylabel('Original Class')
plt.show()

# recall matrix (each row sums to 1)

A = (((C.T)/(C.sum(axis=1))).T)

plt.figure(figsize=(20, 7))
sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Class')
plt.ylabel('Original Class')
plt.show()
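
# A small numeric sketch of the two normalizations above (toy 2x2 confusion
# matrix, purely illustrative):
toy_C = np.array([[8, 2],
                  [4, 6]])
print(toy_C / toy_C.sum(axis=0))        # precision matrix: each column sums to 1
print((toy_C.T / toy_C.sum(axis=1)).T)  # recall matrix: each row sums to 1
# column j of the precision matrix answers: of everything predicted as class j,
# what fraction truly belongs to each class; row i of the recall matrix answers:
# of everything truly in class i, what fraction went to each predicted class.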

# evaluate the gene column

# is the gene column important for the target feature?
# if so, convert the category to a numeric representation

unique_genes = train_df['Gene'].value_counts()
print('Number of Unique Genes :', unique_genes.shape[0])
# the top 10 genes that occurred most
print(unique_genes.head(10))

unique_genes.shape[0]

# viz of the cumulative distribution of unique gene values

s = sum(unique_genes.values)
h = unique_genes.values/s
c = np.cumsum(h)
plt.plot(c, label='Cumulative distribution of Genes')
plt.grid()
plt.legend()
plt.show()

# the top ~50 genes contribute about 75% of the data

'''
let's encode:
1. one-hot encoding: creates a lot of features --> a problem in some cases
2. response encoding (mean imputation, ex: the average height of Indians is 5.5 ft)

response encoding in this case would be:

* for each row, we create 9 columns
* the 9 columns hold the probability of each class given that row's gene value
'''
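
# A toy sketch of response encoding (hypothetical counts, no smoothing, and 3
# classes instead of 9 to keep it readable):
toy = pd.DataFrame({'Gene': ['BRCA1']*4 + ['TP53']*2,
                    'Class': [1, 1, 2, 3, 2, 2]})
enc = (toy.groupby('Gene')['Class']
          .value_counts(normalize=True)   # P(class | gene)
          .unstack(fill_value=0))
print(enc)
# BRCA1 -> [0.50, 0.25, 0.25]; TP53 -> [0.00, 1.00, 0.00]
# each gene value is replaced by its row of per-class probabilities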

# let's use both encoding techniques and check which one works better

# one-hot encoding of the gene feature

gene_vectorizer = CountVectorizer()
train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(train_df['Gene'])
test_gene_feature_onehotCoding = gene_vectorizer.transform(test_df['Gene'])
cv_gene_feature_onehotCoding = gene_vectorizer.transform(cv_df['Gene'])

train_gene_feature_onehotCoding.shape
# (2124, 234)

# response encoding of the gene feature

# code for response coding with Laplace smoothing
# alpha : used for Laplace smoothing
# feature: ['Gene', 'Variation']
# df: ['train_df', 'test_df', 'cv_df']
# algorithm
# ----------
# consider all unique values and the number of occurrences of the given feature in the train dataframe
# build a (1*9) vector: the first element = (number of times the value occurred in class 1 + 10*alpha) / (number of times it occurred in the whole train data + 90*alpha)
# gv_dict is a lookup table: for every gene/variation it stores a (1*9) representation
# for each value of the feature in df:
#     if it is present in the train data:
#         we append the vector stored in the 'gv_dict' lookup table to 'gv_fea'
#     if it is not present in the train data:
#         we append [1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9] to 'gv_fea'
# return 'gv_fea'
# ----------------------

# get_gv_fea_dict: Get Gene/Variation Feature Dict
def get_gv_fea_dict(alpha, feature, df):
    # value_count contains a dict like
    # print(train_df['Gene'].value_counts())
    # output:
    #        {BRCA1      174
    #         TP53       106
    #         EGFR        86
    #         BRCA2       75
    #         PTEN        69
    #         KIT         61
    #         BRAF        60
    #         ERBB2       47
    #         PDGFRA      46
    #         ...}
    # print(train_df['Variation'].value_counts())
    # output:
    # {
    # Truncating_Mutations    63
    # Deletion                43
    # Amplification           43
    # Fusions                 22
    # Overexpression           3
    # E17K                     3
    # Q61L                     3
    # S222D                    2
    # P130S                    2
    # ...
    # }
    value_count = train_df[feature].value_counts()

    # gv_dict : Gene/Variation dict, which contains the probability array for each gene/variation
    gv_dict = dict()

    # denominator will contain the number of times that particular feature value occurred in the whole train data
    for i, denominator in value_count.items():
        # vec will contain p(yi==k|Gi): the probability that the gene/variation belongs to a particular class
        # vec is a 9-dimensional vector
        vec = []
        for k in range(1, 10):
            # print(train_df.loc[(train_df['Class']==1) & (train_df['Gene']=='BRCA1')])
            #          ID   Gene  Variation  Class
            # 2470  2470  BRCA1     S1715C      1
            # 2486  2486  BRCA1     S1841R      1
            # 2614  2614  BRCA1        M1R      1
            # 2432  2432  BRCA1     L1657P      1
            # 2567  2567  BRCA1     T1685A      1
            # 2583  2583  BRCA1     E1660G      1
            # 2634  2634  BRCA1     W1718L      1
            # cls_cnt.shape[0] will return the number of rows

            cls_cnt = train_df.loc[(train_df['Class']==k) & (train_df[feature]==i)]

            # cls_cnt.shape[0] (the numerator) is the number of times the feature value occurred within class k
            vec.append((cls_cnt.shape[0] + alpha*10) / (denominator + 90*alpha))

        # we add the gene/variation to the dict as key and vec as value
        gv_dict[i] = vec
    return gv_dict

# Get Gene/Variation feature
def get_gv_feature(alpha, feature, df):
    # print(gv_dict)
    # {'BRCA1': [0.20075757575757575, 0.03787878787878788, 0.068181818181818177, 0.13636363636363635, 0.25, 0.19318181818181818, 0.03787878787878788, 0.03787878787878788, 0.03787878787878788],
    #  'TP53': [0.32142857142857145, 0.061224489795918366, 0.061224489795918366, 0.27040816326530615, 0.061224489795918366, 0.066326530612244902, 0.051020408163265307, 0.051020408163265307, 0.056122448979591837],
    #  'EGFR': [0.056818181818181816, 0.21590909090909091, 0.0625, 0.068181818181818177, 0.068181818181818177, 0.0625, 0.34659090909090912, 0.0625, 0.056818181818181816],
    #  'BRCA2': [0.13333333333333333, 0.060606060606060608, 0.060606060606060608, 0.078787878787878782, 0.1393939393939394, 0.34545454545454546, 0.060606060606060608, 0.060606060606060608, 0.060606060606060608],
    #  'PTEN': [0.069182389937106917, 0.062893081761006289, 0.069182389937106917, 0.46540880503144655, 0.075471698113207544, 0.062893081761006289, 0.069182389937106917, 0.062893081761006289, 0.062893081761006289],
    #  'KIT': [0.066225165562913912, 0.25165562913907286, 0.072847682119205295, 0.072847682119205295, 0.066225165562913912, 0.066225165562913912, 0.27152317880794702, 0.066225165562913912, 0.066225165562913912],
    #  'BRAF': [0.066666666666666666, 0.17999999999999999, 0.073333333333333334, 0.073333333333333334, 0.093333333333333338, 0.080000000000000002, 0.29999999999999999, 0.066666666666666666, 0.066666666666666666],
    #  ...
    # }
    gv_dict = get_gv_fea_dict(alpha, feature, df)
    # value_count is the same as in get_gv_fea_dict
    value_count = train_df[feature].value_counts()

    # gv_fea: the gene/variation feature; it will contain the encoded vector for each feature value in the data
    gv_fea = []
    # for every feature value in the given dataframe we check whether it is present in the train data;
    # if yes, we append its vector to gv_fea,
    # if not, we append [1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9] to gv_fea
    for index, row in df.iterrows():
        if row[feature] in dict(value_count).keys():
            gv_fea.append(gv_dict[row[feature]])
        else:
            gv_fea.append([1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9])
            # gv_fea.append([-1, -1, -1, -1, -1, -1, -1, -1, -1])
    return gv_fea

# response coding of the Gene feature
# alpha is used for Laplace smoothing
alpha = 1
# train gene feature
train_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", train_df))
# test gene feature
test_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", test_df))
# cross validation gene feature
cv_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", cv_df))

'''
how do we test whether Gene is a good feature for the model?
by comparing against the random model's log loss:
build a logistic regression model with only the gene feature and check its log loss;
if it is lower than the random model's, Gene is a good feature for the model

in order to do that:

Laplace smoothing

* in naive Bayes we use Laplace smoothing when a test query contains a word not present in the training set
* what do we do with the new word?
* can we drop it? yes, but dropping a factor is equivalent to giving it probability 1, which should not be the case
* can we assign it probability 0? then the whole product of probabilities becomes zero

* so we use Laplace (additive) smoothing, i.e. we add alpha (a hyperparameter) to the numerator and alpha*K (K = number of classes) to the denominator
* alpha = 1 in general, but it can change since it is a hyperparameter
* alpha should be neither too small nor too large --> bias-variance trade-off

CALIBRATED CLASSIFIER --> for getting well-calibrated probability outputs to be used with log loss

SGD --> for optimization
'''
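
# A minimal sketch of the calibration wrapper used throughout below: fit a base
# SGD model, then fit CalibratedClassifierCV on top so predict_proba returns
# calibrated probabilities (synthetic data; the shapes and parameters here are
# arbitrary, not from the pipeline):
from sklearn.datasets import make_classification
Xd, yd = make_classification(n_samples=300, n_features=20, n_informative=10, n_classes=3)
base = SGDClassifier(loss='log', alpha=1e-3, random_state=42)
calibrated = CalibratedClassifierCV(base, method="sigmoid")
calibrated.fit(Xd, yd)
probs = calibrated.predict_proba(Xd[:2])
print(probs, probs.sum(axis=1))  # each row sums to 1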

# using one-hot encoding, because higher dimensionality is fine for the logistic regression algorithm

# We need a hyperparameter for the SGD classifier.
alpha = [10 ** x for x in range(-5, 1)]

# SGD classifier
cv_log_error_array = []
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(train_gene_feature_onehotCoding, y_train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_gene_feature_onehotCoding, y_train)
    predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))

# alpha = 0.001 gives the minimal log loss

# let's plot the same to check the best alpha value
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# let's use the best alpha value from the graph above and compute the log loss
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(train_gene_feature_onehotCoding, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_gene_feature_onehotCoding, y_train)

predict_y = sig_clf.predict_proba(train_gene_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_gene_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))

# the log loss is lower than the random model's
# how stable is the model?
# "overlap" = how many gene values seen in test/CV also appear in train;
# if a gene is rare in train but frequent in test, the overlap is poor
# overlap should be high: Laplace smoothing covers unseen values, but it is not a great substitute
# the data and the model are stable only if we have good overlap

# check how many values overlap between train and test, and between train and CV

test_coverage = test_df[test_df['Gene'].isin(list(set(train_df['Gene'])))].shape[0]
cv_coverage = cv_df[cv_df['Gene'].isin(list(set(train_df['Gene'])))].shape[0]

print('1. In test data', test_coverage, 'out of', test_df.shape[0], ":", (test_coverage/test_df.shape[0])*100)
print('2. In cross validation data', cv_coverage, 'out of ', cv_df.shape[0], ":", (cv_coverage/cv_df.shape[0])*100)

# EVALUATING THE VARIATION FEATURE

# Variation is also a categorical variable, so we treat it the same way as the Gene column.
# We again build both the one-hot encoded and the response encoded variables for the variation column.

unique_variations = train_df['Variation'].value_counts()
print('Number of Unique Variations :', unique_variations.shape[0])
# the top 10 variations that occurred most
print(unique_variations.head(10))

# cumulative distribution of unique variation values

s = sum(unique_variations.values)
h = unique_variations.values/s
c = np.cumsum(h)
print(c)
plt.plot(c, label='Cumulative distribution of Variations')
plt.grid()
plt.legend()
plt.show()

# one-hot encoding of the variation feature
variation_vectorizer = CountVectorizer()
train_variation_feature_onehotCoding = variation_vectorizer.fit_transform(train_df['Variation'])
test_variation_feature_onehotCoding = variation_vectorizer.transform(test_df['Variation'])
cv_variation_feature_onehotCoding = variation_vectorizer.transform(cv_df['Variation'])

train_variation_feature_onehotCoding.shape

# alpha is used for Laplace smoothing
alpha = 1
# train variation feature
train_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", train_df))
# test variation feature
test_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", test_df))
# cross validation variation feature
cv_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", cv_df))

train_variation_feature_responseCoding.shape

# We need a hyperparameter for the SGD classifier.
alpha = [10 ** x for x in range(-5, 1)]

# SGD classifier
cv_log_error_array = []
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(train_variation_feature_onehotCoding, y_train)

    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_variation_feature_onehotCoding, y_train)
    predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)

    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))

# let's plot the same to check the best alpha value
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(train_variation_feature_onehotCoding, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_variation_feature_onehotCoding, y_train)

predict_y = sig_clf.predict_proba(train_variation_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_variation_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))

# at alpha = 0.0001 the test log loss is lower than the random model's, so the variation feature is worth having

test_coverage = test_df[test_df['Variation'].isin(list(set(train_df['Variation'])))].shape[0]
cv_coverage = cv_df[cv_df['Variation'].isin(list(set(train_df['Variation'])))].shape[0]

print('1. In test data', test_coverage, 'out of', test_df.shape[0], ":", (test_coverage/test_df.shape[0])*100)
print('2. In cross validation data', cv_coverage, 'out of ', cv_df.shape[0], ":", (cv_coverage/cv_df.shape[0])*100)

# stability (overlap) is low for the variation feature, but since its log loss is low, let's keep it

# EVALUATING THE TEXT COLUMN

# cls_text is a dataframe
# for every row in the dataframe consider the 'TEXT'
# split the words by space
# make a dict with those words
# increment a word's count whenever we see it

# to get word counts
def extract_dictionary_paddle(cls_text):
    dictionary = defaultdict(int)
    for index, row in cls_text.iterrows():
        for word in row['TEXT'].split():
            dictionary[word] += 1
    return dictionary

import math
# https://stackoverflow.com/a/1602964
def get_text_responsecoding(df):
    text_feature_responseCoding = np.zeros((df.shape[0], 9))
    for i in range(0, 9):
        row_index = 0
        for index, row in df.iterrows():
            sum_prob = 0
            for word in row['TEXT'].split():
                sum_prob += math.log(((dict_list[i].get(word, 0)+10) / (total_dict.get(word, 0)+90)))
            text_feature_responseCoding[row_index][i] = math.exp(sum_prob/len(row['TEXT'].split()))
            row_index += 1
    return text_feature_responseCoding
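
# The loop above scores each document against class i as the geometric mean of
# the Laplace-smoothed per-class word likelihoods:
# exp(mean over words of log((count_in_class_i(w)+10)/(count_total(w)+90))).
# A tiny numeric check of the smoothed ratio for a single word (toy counts, not
# from the corpus):
toy_class_count, toy_total_count = 5, 20
print((toy_class_count + 10) / (toy_total_count + 90))  # -> 15/110 ~ 0.136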

# building a CountVectorizer with all the words that occurred at least 3 times in the train data
text_vectorizer = CountVectorizer(min_df=3)
train_text_feature_onehotCoding = text_vectorizer.fit_transform(train_df['TEXT'])
# getting all the feature names (words)
train_text_features = text_vectorizer.get_feature_names()  # renamed get_feature_names_out() in scikit-learn >= 1.0

# train_text_feature_onehotCoding.sum(axis=0).A1 sums over the rows and returns a (1*number_of_features) vector of word counts
train_text_fea_counts = train_text_feature_onehotCoding.sum(axis=0).A1

# zip(list(train_text_features), train_text_fea_counts) pairs each word with the number of times it occurred
text_fea_dict = dict(zip(list(train_text_features), train_text_fea_counts))

print("Total number of unique words in train data :", len(train_text_features))

dict_list = []
# dict_list contains 9 dictionaries, each corresponding to a class
for i in range(1, 10):
    cls_text = train_df[train_df['Class']==i]
    # build a word dict based on the words in that class
    dict_list.append(extract_dictionary_paddle(cls_text))
    # and append it to dict_list

# dict_list[i] is built on the i'th class's text data
# total_dict is built on the whole training text data
total_dict = extract_dictionary_paddle(train_df)

confuse_array = []
for i in train_text_features:
    ratios = []
    for j in range(0, 9):
        ratios.append((dict_list[j][i]+10) / (total_dict[i]+90))
    confuse_array.append(ratios)
confuse_array = np.array(confuse_array)

# response coding of the text features
train_text_feature_responseCoding = get_text_responsecoding(train_df)
test_text_feature_responseCoding = get_text_responsecoding(test_df)
cv_text_feature_responseCoding = get_text_responsecoding(cv_df)

# https://stackoverflow.com/a/16202486
# we convert each row's values so that they sum to 1
train_text_feature_responseCoding = (train_text_feature_responseCoding.T/train_text_feature_responseCoding.sum(axis=1)).T
test_text_feature_responseCoding = (test_text_feature_responseCoding.T/test_text_feature_responseCoding.sum(axis=1)).T
cv_text_feature_responseCoding = (cv_text_feature_responseCoding.T/cv_text_feature_responseCoding.sum(axis=1)).T

# don't forget to normalize every feature
train_text_feature_onehotCoding = normalize(train_text_feature_onehotCoding, axis=0)

# we use the same vectorizer that was fitted on the train data
test_text_feature_onehotCoding = text_vectorizer.transform(test_df['TEXT'])
# don't forget to normalize every feature
test_text_feature_onehotCoding = normalize(test_text_feature_onehotCoding, axis=0)

# we use the same vectorizer that was fitted on the train data
cv_text_feature_onehotCoding = text_vectorizer.transform(cv_df['TEXT'])
# don't forget to normalize every feature
cv_text_feature_onehotCoding = normalize(cv_text_feature_onehotCoding, axis=0)
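
# A quick sketch of what normalize(..., axis=0) does vs axis=1 (toy dense
# matrix; the features above are sparse but the semantics are the same):
toy_M = np.array([[3.0, 0.0],
                  [4.0, 2.0]])
print(normalize(toy_M, axis=0))  # each COLUMN scaled to unit L2 norm: col 0 -> [0.6, 0.8]
print(normalize(toy_M, axis=1))  # each ROW scaled to unit L2 norm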

# https://stackoverflow.com/a/2258273/4084039
sorted_text_fea_dict = dict(sorted(text_fea_dict.items(), key=lambda x: x[1], reverse=True))
sorted_text_occur = np.array(list(sorted_text_fea_dict.values()))

# number of words for a given frequency
print(Counter(sorted_text_occur))

# build the model with only the text column

cv_log_error_array = []
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(train_text_feature_onehotCoding, y_train)

    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_text_feature_onehotCoding, y_train)
    predict_y = sig_clf.predict_proba(cv_text_feature_onehotCoding)
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))

# the log loss is lower than the random model's

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(train_text_feature_onehotCoding, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_text_feature_onehotCoding, y_train)

predict_y = sig_clf.predict_proba(train_text_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_text_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_text_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))

# check the overlap of the text data

def get_intersec_text(df):
    df_text_vec = CountVectorizer(min_df=3)
    df_text_fea = df_text_vec.fit_transform(df['TEXT'])
    df_text_features = df_text_vec.get_feature_names()

    df_text_fea_counts = df_text_fea.sum(axis=0).A1
    df_text_fea_dict = dict(zip(list(df_text_features), df_text_fea_counts))
    len1 = len(set(df_text_features))
    len2 = len(set(train_text_features) & set(df_text_features))
    return len1, len2

len1, len2 = get_intersec_text(test_df)
print(np.round((len2/len1)*100, 3), "% of the words in the test data appeared in the train data")
len1, len2 = get_intersec_text(cv_df)
print(np.round((len2/len1)*100, 3), "% of the words in the cross validation data appeared in the train data")

# the text column is quite stable (high word overlap)

# therefore, all 3 columns are important for model building

# Data preparation for machine learning models

# a few utility functions

def report_log_loss(train_x, train_y, test_x, test_y, clf):
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    sig_clf_probs = sig_clf.predict_proba(test_x)
    return log_loss(test_y, sig_clf_probs, eps=1e-15)

# This function plots the confusion matrices given y_i, y_i_hat.
def plot_confusion_matrix(test_y, predict_y):
    C = confusion_matrix(test_y, predict_y)

    A = (((C.T)/(C.sum(axis=1))).T)

    B = (C/C.sum(axis=0))
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    # representing C in heatmap format
    print("-"*20, "Confusion matrix", "-"*20)
    plt.figure(figsize=(20, 7))
    sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()

    # representing B in heatmap format
    print("-"*20, "Precision matrix (Column Sum=1)", "-"*20)
    plt.figure(figsize=(20, 7))
    sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()

    # representing A in heatmap format
    print("-"*20, "Recall matrix (Row sum=1)", "-"*20)
    plt.figure(figsize=(20, 7))
    sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()


def predict_and_plot_confusion_matrix(train_x, train_y, test_x, test_y, clf):
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    pred_y = sig_clf.predict(test_x)

    # for calculating log loss we provide the array of probabilities for each class
    print("Log loss :", log_loss(test_y, sig_clf.predict_proba(test_x)))
    # calculating the fraction of data points that are misclassified
    print("Number of mis-classified points :", np.count_nonzero((pred_y - test_y))/test_y.shape[0])
    plot_confusion_matrix(test_y, pred_y)


# this function will be used just for naive Bayes
# for the given indices, we print the names of the features
# and check whether each feature is present in the test point's text or not
def get_impfeature_names(indices, text, gene, var, no_features):
    gene_count_vec = CountVectorizer()
    var_count_vec = CountVectorizer()
    text_count_vec = CountVectorizer(min_df=3)

    gene_vec = gene_count_vec.fit(train_df['Gene'])
    var_vec = var_count_vec.fit(train_df['Variation'])
    text_vec = text_count_vec.fit(train_df['TEXT'])

    fea1_len = len(gene_vec.get_feature_names())
    fea2_len = len(var_count_vec.get_feature_names())

    word_present = 0
    for i, v in enumerate(indices):
        if (v < fea1_len):
            word = gene_vec.get_feature_names()[v]
            yes_no = True if word == gene else False
            if yes_no:
                word_present += 1
                print(i, "Gene feature [{}] present in test data point [{}]".format(word, yes_no))
        elif (v < fea1_len+fea2_len):
            word = var_vec.get_feature_names()[v-(fea1_len)]
            yes_no = True if word == var else False
            if yes_no:
                word_present += 1
                print(i, "variation feature [{}] present in test data point [{}]".format(word, yes_no))
        else:
            word = text_vec.get_feature_names()[v-(fea1_len+fea2_len)]
            yes_no = True if word in text.split() else False
            if yes_no:
                word_present += 1
                print(i, "Text feature [{}] present in test data point [{}]".format(word, yes_no))

    print("Out of the top ", no_features, " features ", word_present, "are present in the query point")

## Combining all 3 features together

# merging the gene, variation and text features

# building train, test and cross validation data sets
# a = [[1, 2],
#      [3, 4]]
# b = [[4, 5],
#      [6, 7]]
# hstack((a, b)) = [[1, 2, 4, 5],
#                   [3, 4, 6, 7]]
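
# A runnable version of the sketch above (note scipy's hstack takes one
# sequence of blocks and returns a sparse matrix here):
from scipy.sparse import csr_matrix
toy_a = csr_matrix([[1, 2], [3, 4]])
toy_b = csr_matrix([[4, 5], [6, 7]])
print(hstack((toy_a, toy_b)).toarray())
# [[1 2 4 5]
#  [3 4 6 7]]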

train_gene_var_onehotCoding = hstack((train_gene_feature_onehotCoding, train_variation_feature_onehotCoding))
test_gene_var_onehotCoding = hstack((test_gene_feature_onehotCoding, test_variation_feature_onehotCoding))
cv_gene_var_onehotCoding = hstack((cv_gene_feature_onehotCoding, cv_variation_feature_onehotCoding))

train_x_onehotCoding = hstack((train_gene_var_onehotCoding, train_text_feature_onehotCoding)).tocsr()
train_y = np.array(list(train_df['Class']))

test_x_onehotCoding = hstack((test_gene_var_onehotCoding, test_text_feature_onehotCoding)).tocsr()
test_y = np.array(list(test_df['Class']))

cv_x_onehotCoding = hstack((cv_gene_var_onehotCoding, cv_text_feature_onehotCoding)).tocsr()
cv_y = np.array(list(cv_df['Class']))


train_gene_var_responseCoding = np.hstack((train_gene_feature_responseCoding, train_variation_feature_responseCoding))
test_gene_var_responseCoding = np.hstack((test_gene_feature_responseCoding, test_variation_feature_responseCoding))
cv_gene_var_responseCoding = np.hstack((cv_gene_feature_responseCoding, cv_variation_feature_responseCoding))

train_x_responseCoding = np.hstack((train_gene_var_responseCoding, train_text_feature_responseCoding))
test_x_responseCoding = np.hstack((test_gene_var_responseCoding, test_text_feature_responseCoding))
cv_x_responseCoding = np.hstack((cv_gene_var_responseCoding, cv_text_feature_responseCoding))


print("One hot encoding features :")
print("(number of data points * number of features) in train data = ", train_x_onehotCoding.shape)
print("(number of data points * number of features) in test data = ", test_x_onehotCoding.shape)
print("(number of data points * number of features) in cross validation data =", cv_x_onehotCoding.shape)

print("Response encoding features :")
print("(number of data points * number of features) in train data = ", train_x_responseCoding.shape)
print("(number of data points * number of features) in test data = ", test_x_responseCoding.shape)
print("(number of data points * number of features) in cross validation data =", cv_x_responseCoding.shape)

# Building machine learning models

# http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = MultinomialNB(alpha=i)
    clf.fit(train_x_onehotCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_onehotCoding, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
    print("Log Loss :", log_loss(cv_y, sig_clf_probs))

fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], str(txt)), (np.log10(alpha[i]), cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = np.argmin(cv_log_error_array)
clf = MultinomialNB(alpha=alpha[best_alpha])
clf.fit(train_x_onehotCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_onehotCoding, train_y)

predict_y = sig_clf.predict_proba(train_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))

# Testing our naive Bayes model with the best found value of alpha (on the CV data)
clf = MultinomialNB(alpha=alpha[best_alpha])
clf.fit(train_x_onehotCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_onehotCoding, train_y)
sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
# to avoid rounding errors while multiplying probabilities we use log-probability estimates
print("Log Loss :", log_loss(cv_y, sig_clf_probs))
print("Number of misclassified points :", np.count_nonzero((sig_clf.predict(cv_x_onehotCoding) - cv_y))/cv_y.shape[0])
plot_confusion_matrix(cv_y, sig_clf.predict(cv_x_onehotCoding.toarray()))

# interpretability of the model

test_point_index = 1
no_feature = 100
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
print("Predicted Class :", predicted_cls[0])
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
print("Actual Class :", test_y[test_point_index])
indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
print("-"*50)
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)

# one more example

test_point_index = 100
no_feature = 100
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
print("Predicted Class :", predicted_cls[0])
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
print("Actual Class :", test_y[test_point_index])
indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
print("-"*50)
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)

# So naive Bayes is not performing badly, but let's look at other models

# Logistic Regression

# balancing all classes

alpha = [10 ** x for x in range(-6, 3)]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(train_x_onehotCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_onehotCoding, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
    print("Log Loss :", log_loss(cv_y, sig_clf_probs))

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], str(txt)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(train_x_onehotCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_onehotCoding, train_y)

predict_y = sig_clf.predict_proba(train_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))

# let's test it on the CV data using the best alpha value

clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
predict_and_plot_confusion_matrix(train_x_onehotCoding, train_y, cv_x_onehotCoding, cv_y, clf)


# feature importance

def get_imp_feature_names(text, indices, removed_ind=[]):
    word_present = 0
    tabulate_list = []
    increasingorder_ind = 0
    # offsets into the combined [gene | variation | text] one-hot feature space
    gene_len = train_gene_feature_onehotCoding.shape[1]
    var_len = train_variation_feature_onehotCoding.shape[1]
    for i in indices:
        if i < gene_len:
            tabulate_list.append([increasingorder_ind, "Gene", "Yes"])
        elif i < gene_len + var_len:
            tabulate_list.append([increasingorder_ind, "Variation", "Yes"])
        if ((i >= gene_len + var_len) & (i not in removed_ind)):
            word = train_text_features[i - (gene_len + var_len)]
            yes_no = True if word in text.split() else False
            if yes_no:
                word_present += 1
            tabulate_list.append([increasingorder_ind, word, yes_no])
        increasingorder_ind += 1
    print(word_present, "most important features are present in our query point")
    print("-"*50)
    print("The features that are most important for the ", predicted_cls[0], " class:")
    print(tabulate(tabulate_list, headers=["Index", 'Feature name', 'Present or Not']))

# testing a query point and doing interpretability

from tabulate import tabulate
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(train_x_onehotCoding, train_y)
test_point_index = 1
no_feature = 500
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
print("Predicted Class :", predicted_cls[0])
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
print("Actual Class :", test_y[test_point_index])
indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
print("-"*50)
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)

test_point_index = 100
no_feature = 500
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
print("Predicted Class :", predicted_cls[0])
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
print("Actual Class :", test_y[test_point_index])
indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
print("-"*50)
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)

# K Nearest Neighbour classification

alpha = [5, 11, 15, 21, 31, 41, 51, 99]  # here "alpha" is the number of neighbours k
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(train_x_responseCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_responseCoding, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_responseCoding)
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
    print("Log Loss :", log_loss(cv_y, sig_clf_probs))

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], str(txt)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = np.argmin(cv_log_error_array)
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
clf.fit(train_x_responseCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_responseCoding, train_y)

predict_y = sig_clf.predict_proba(train_x_responseCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_x_responseCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x_responseCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))

clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
predict_and_plot_confusion_matrix(train_x_responseCoding, train_y, cv_x_responseCoding, cv_y, clf)

# let's look at a few test points
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
clf.fit(train_x_responseCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_responseCoding, train_y)

test_point_index = 1
predicted_cls = sig_clf.predict(test_x_responseCoding[test_point_index].reshape(1, -1))
print("Predicted Class :", predicted_cls[0])
print("Actual Class :", test_y[test_point_index])
neighbors = clf.kneighbors(test_x_responseCoding[test_point_index].reshape(1, -1), alpha[best_alpha])
print("The ", alpha[best_alpha], " nearest neighbours of the test point belong to classes", train_y[neighbors[1][0]])
print("Frequency of nearest points :", Counter(train_y[neighbors[1][0]]))

clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
clf.fit(train_x_responseCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_responseCoding, train_y)

test_point_index = 100

predicted_cls = sig_clf.predict(test_x_responseCoding[test_point_index].reshape(1, -1))
print("Predicted Class :", predicted_cls[0])
print("Actual Class :", test_y[test_point_index])
neighbors = clf.kneighbors(test_x_responseCoding[test_point_index].reshape(1, -1), alpha[best_alpha])
print("the k value for knn is", alpha[best_alpha], "and the nearest neighbours of the test point belong to classes", train_y[neighbors[1][0]])
print("Frequency of nearest points :", Counter(train_y[neighbors[1][0]]))

# without class balancing, logistic regression's log loss differed by only about 0.01

# linear SVM did not reduce the log loss

# a random forest classifier with one-hot encoding, and with response encoding (which was worse than one-hot), did not reduce the log loss

# a stacking model did not reduce the log loss

# logistic regression with class balancing and one-hot encoding achieved the lowest log loss