Diff of /run.py [000000] .. [44995b]

Switch to unified view

a b/run.py
1
import pandas as pd
2
import matplotlib.pyplot as plt
3
import re
4
import time
5
import warnings
6
import numpy as np
7
from nltk.corpus import stopwords
8
from sklearn.decomposition import TruncatedSVD
9
from sklearn.preprocessing import normalize
10
from sklearn.feature_extraction.text import CountVectorizer
11
from sklearn.manifold import TSNE
12
import seaborn as sns
13
from sklearn.neighbors import KNeighborsClassifier
14
from sklearn.metrics import confusion_matrix
15
from sklearn.metrics import accuracy_score, log_loss
16
from sklearn.feature_extraction.text import TfidfVectorizer
17
from sklearn.linear_model import SGDClassifier
18
from imblearn.over_sampling import SMOTE
19
from collections import Counter
20
from scipy.sparse import hstack
21
from sklearn.multiclass import OneVsRestClassifier
22
from sklearn.svm import SVC
23
from sklearn import StratifiedKFold 
24
from collections import Counter, defaultdict
25
from sklearn.calibration import CalibratedClassifierCV
26
from sklearn.naive_bayes import MultinomialNB
27
from sklearn.naive_bayes import GaussianNB
28
from sklearn.model_selection import train_test_split
29
from sklearn.model_selection import GridSearchCV
30
import math
31
from sklearn.metrics import normalized_mutual_info_score
32
from sklearn.ensemble import RandomForestClassifier
33
warnings.filterwarnings("ignore")
34
35
from mlxtend.classifier import StackingClassifier
36
37
from sklearn import model_selection
38
from sklearn.linear_model import LogisticRegression
39
40
# Loading training_variants. A comma seperated file
41
data_variants = pd.read_csv('training/training_variants')
42
# Loading training_text. Is seperated by ||
43
data_text =pd.read_csv("training/training_text",sep="\|\|",engine="python",names=["ID","TEXT"],skiprows=1)
44
45
# basic look ahead for data_variants
46
47
data_variants.head()
48
# to classify cancer, we need gene, variants and research paper (text)
49
# ID --> common, same value in both data_variants & data_text for linking (merging dataframe)
50
# gene --> genetic mutation location
51
# variation --> aminoacid change for respective mutations
52
# class --> 1-9, target classes i,e genetic mutation has been classified on
53
# in data_variants, text is missing. text is present in data_text dataframe
54
55
data_variants.info()
56
57
data_variants.describe()
58
59
data_variants.columns
60
61
data_variants.shape
62
63
# basic look ahead for data_text
64
65
data_text.head()
66
# text --> huge text because research paper
67
68
data_text.info()
69
# missing value exists
70
71
data_text.columns
72
73
data_text.shape
74
75
# target classes
76
data_variants.Class.unique()
77
# multiclass classification problem
78
79
'''
80
important things before modelling:
81
82
1. Since it is medical problem, errors are costly
83
    so have to return outcome in probability. 
84
    p(y=2|a1) = 0.99 --> OK
85
    p(y=2|a1) = 0.5 --> NOT OK because probability is ambigous. In this case, have to look for more evidences or reason
86
87
2. Interpretability: 
88
89
    why 0.5 ? why 0.99 ? provide evidences for probability value
90
91
    * few ML models are very good at interpretability such as Naive bayes, decision tree, logistic regression, linear SVM
92
    * few ML models are not good at interpretability such as RBF and SVM
93
94
3. Latency: 
95
96
    * OK to take few sec/min for output
97
    * NOT OK to take in hours for output
98
99
'''
100
# data_text has text (i,e research paper), cannot feed raw text to model
101
# so convert to a format that model understands
102
103
# numbers, stopwords, multiple spaces, special chars are not relevant so lets remove
104
105
stop_words = set(stopwords.words('english'))
106
107
def data_text_preprocess(total_text, ind, col):
108
    # Remove int values from text data as that might not be imp
109
    if type(total_text) is not int:
110
        string = ""
111
        # replacing all special char with space
112
        total_text = re.sub('[^a-zA-Z0-9\n]', ' ', str(total_text))
113
        # replacing multiple spaces with single space
114
        total_text = re.sub('\s+',' ', str(total_text))
115
        # bring whole text to same lower-case scale.
116
        total_text = total_text.lower()
117
        
118
        for word in total_text.split():
119
        # if the word is a not a stop word then retain that word from text
120
            if not word in stop_words:
121
                string += word + " "
122
        
123
        data_text[col][ind] = string
124
125
for index, row in data_text.iterrows():
126
    if type(row['TEXT']) is str:
127
        data_text_preprocess(row['TEXT'], index, 'TEXT')
128
129
# data preprocessing is completed
130
131
# lets merge dataframes based on ID
132
result = pd.merge(data_variants, data_text,on='ID', how='left')
133
result.head()
134
135
# check for missing values
136
result[result.isnull().any(axis=1)]
137
# 5 rows of missing values, so lets impute by conacatinating gene + variant values
138
139
result.loc[result['TEXT'].isnull(),'TEXT'] = result['Gene'] +' '+result['Variation']
140
141
# cross check 
142
result[result.isnull().any(axis=1)]
143
144
# Split dataset into training, test & validation data
145
146
y_true = result['Class'].values
147
result.Gene      = result.Gene.str.replace('\s+', '_')
148
result.Variation = result.Variation.str.replace('\s+', '_')
149
150
# Splitting the data into train and test set 
151
X_train, test_df, y_train, y_test = train_test_split(result, y_true, stratify=y_true, test_size=0.2)
152
# split the train data now into train validation and cross validation
153
train_df, cv_df, y_train, y_cv = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)
154
155
print('Number of data points in train data:', train_df.shape[0])
156
print('Number of data points in test data:', test_df.shape[0])
157
print('Number of data points in cross validation data:', cv_df.shape[0])
158
159
# has data distributed properly ??
160
161
train_class_distribution = train_df['Class'].value_counts()
162
test_class_distribution = test_df['Class'].value_counts()
163
cv_class_distribution = cv_df['Class'].value_counts()
164
165
# lets viz
166
167
# viz for train class distribution
168
169
my_colors = 'rgbkymc'
170
train_class_distribution.plot(kind='bar')
171
plt.xlabel('Class')
172
plt.ylabel(' Number of Data points per Class')
173
plt.title('Distribution of yi in train data')
174
plt.grid()
175
plt.show()
176
177
# lets look in percentage form
178
179
sorted_yi = np.argsort(-train_class_distribution.values)
180
for i in sorted_yi:
181
    print('Number of data points in class', i+1, ':',train_class_distribution.values[i], '(', np.round((train_class_distribution.values[i]/train_df.shape[0]*100), 3), '%)')
182
183
# viz for test class distribution
184
185
my_colors = 'rgbkymc'
186
test_class_distribution.plot(kind='bar')
187
plt.xlabel('Class')
188
plt.ylabel('Number of Data points per Class')
189
plt.title('Distribution of yi in test data')
190
plt.grid()
191
plt.show()
192
193
# lets look in percentage form
194
195
sorted_yi = np.argsort(-test_class_distribution.values)
196
for i in sorted_yi:
197
    print('Number of data points in class', i+1, ':',test_class_distribution.values[i], '(', np.round((test_class_distribution.values[i]/test_df.shape[0]*100), 3), '%)')
198
199
# viz for cross validation set
200
201
my_colors = 'rgbkymc'
202
cv_class_distribution.plot(kind='bar')
203
plt.xlabel('Class')
204
plt.ylabel('Data points per Class')
205
plt.title('Distribution of yi in cross validation data')
206
plt.grid()
207
plt.show()
208
209
# lets look in percentage form
210
211
sorted_yi = np.argsort(-train_class_distribution.values)
212
for i in sorted_yi:
213
    print('Number of data points in class', i+1, ':',cv_class_distribution.values[i], '(', np.round((cv_class_distribution.values[i]/cv_df.shape[0]*100), 3), '%)')
214
215
# distribution must happen in right distribution
216
217
'''
218
what is the evaluation metric ?
219
from kaggle page, multi class log loss is the evaluation metric
220
221
log loss ?
222
223
* more the prediction value (prediction probability), log loss is lower
224
* if the prediction is ambigious, log loss penalizes
225
* log loss = 0 --> perfect model, which is not the case in real world
226
* log loss lies between zero to infinity [0,infinity]
227
228
1 problem:
229
230
* in accuracy score evaluation metrics: 0 - 1 is the range where 1 is the best and 0 is the worst
231
* in AUC evaluation metrics: 0 - 1 is the range where 1 is the best and 0 is the worst
232
233
BUT 
234
235
log loss values range between 0 to infinity
236
237
how do we know the infinity value is good or bad ??
238
239
1 method is: 
240
241
random model --> worst model --> compute log loss (ex: 3)
242
random model --> compute log loss (ex: 1.5)
243
244
1.5 given model is better than 3
245
246
how do we generate random model ?
247
248
* here working with probability.
249
* summazation all the probability would be 1
250
251
'''
252
253
# build random model 
254
255
# 9 classes exists, so create 9 random numbers such that thier sum is equivalent to 1 
256
# because sum of probability of all the 9 classes must be equivalent to 1
257
258
test_data_len = test_df.shape[0]
259
cv_data_len = cv_df.shape[0]
260
261
# CV set error
262
cv_predicted_y = np.zeros((cv_data_len,9))
263
for i in range(cv_data_len):
264
    rand_probs = np.random.rand(1,9) # randomly generate numbers between 1-9
265
    cv_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0]) # summazation equals to 1
266
print("Log loss on Cross Validation Data using Random Model",log_loss(y_cv,cv_predicted_y, eps=1e-15))
267
# Log loss on Cross Validation Data using Random Model 2.491130932423048
268
269
# test-set error
270
test_predicted_y = np.zeros((test_data_len,9))
271
for i in range(test_data_len):
272
    rand_probs = np.random.rand(1,9)
273
    test_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
274
print("Log loss on Test Data using Random Model",log_loss(y_test,test_predicted_y, eps=1e-15))
275
# Log loss on Test Data using Random Model 2.531934856356048
276
277
# Lets get the index of max probablity
278
predicted_y =np.argmax(test_predicted_y, axis=1)
279
280
# index value ranges from 0-8, our problem statement has from 1-9 so lets update it 
281
predicted_y = predicted_y + 1
282
283
# confusion matrix
284
C = confusion_matrix(y_test, predicted_y)
285
286
labels = [1,2,3,4,5,6,7,8,9]
287
plt.figure(figsize=(20,7))
288
sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
289
plt.xlabel('Predicted Class')
290
plt.ylabel('Original Class')
291
plt.show()
292
293
# precision matrix
294
295
B =(C/C.sum(axis=0))
296
297
plt.figure(figsize=(20,7))
298
sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
299
plt.xlabel('Predicted Class')
300
plt.ylabel('Original Class')
301
plt.show()
302
303
# recall matrix
304
305
A =(((C.T)/(C.sum(axis=1))).T)
306
307
plt.figure(figsize=(20,7))
308
sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
309
plt.xlabel('Predicted Class')
310
plt.ylabel('Original Class')
311
plt.show()
312
313
# evaluate gene column
314
315
# is gene column important for target feature ?
316
# if so then convert category to numeric conversions
317
318
unique_genes = train_df['Gene'].value_counts()
319
print('Number of Unique Genes :', unique_genes.shape[0])
320
# the top 10 genes that occured most
321
print(unique_genes.head(10))
322
323
unique_genes.shape[0]
324
325
# viz of cumulative distribution of unique gene values
326
327
s = sum(unique_genes.values);
328
h = unique_genes.values/s;
329
c = np.cumsum(h)
330
plt.plot(c,label='Cumulative distribution of Genes')
331
plt.grid()
332
plt.legend()
333
plt.show()
334
335
# top 50 values are contributing to 75% of data
336
337
'''
338
lets encode
339
1. one-hot encoding. creates lot of features --> problem in some cases 
340
2. response encoding (mean imputation, ex: average height of indians are 5.5)
341
342
same way response encoding in this case would be:
343
344
* for 1 row, we'll create 9 columns
345
* in 9 columns, the probability of gene occurance w.r.t that row would be saved 
346
'''
347
# let's use both encoding techniques and check which one works better
348
349
# 1-hot encoding on gene feature
350
351
gene_vectorizer = CountVectorizer()
352
train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(train_df['Gene'])
353
test_gene_feature_onehotCoding = gene_vectorizer.transform(test_df['Gene'])
354
cv_gene_feature_onehotCoding = gene_vectorizer.transform(cv_df['Gene'])
355
356
train_gene_feature_onehotCoding.shape
357
# (2124,234)
358
359
# response encoding on gene feature
360
361
# code for response coding with Laplace smoothing.
362
# alpha : used for laplace smoothing
363
# feature: ['gene', 'variation']
364
# df: ['train_df', 'test_df', 'cv_df']
365
# algorithm
366
# ----------
367
# Consider all unique values and the number of occurances of given feature in train data dataframe
368
# build a vector (1*9) , the first element = (number of times it occured in class1 + 10*alpha / number of time it occurred in total data+90*alpha)
369
# gv_dict is like a look up table, for every gene it store a (1*9) representation of it
370
# for a value of feature in df:
371
# if it is in train data:
372
# we add the vector that was stored in 'gv_dict' look up table to 'gv_fea'
373
# if it is not there is train:
374
# we add [1/9, 1/9, 1/9, 1/9,1/9, 1/9, 1/9, 1/9, 1/9] to 'gv_fea'
375
# return 'gv_fea'
376
# ----------------------
377
378
# get_gv_fea_dict: Get Gene varaition Feature Dict
379
def get_gv_fea_dict(alpha, feature, df):
380
    # value_count: it contains a dict like
381
    # print(train_df['Gene'].value_counts())
382
    # output:
383
    #        {BRCA1      174
384
    #         TP53       106
385
    #         EGFR        86
386
    #         BRCA2       75
387
    #         PTEN        69
388
    #         KIT         61
389
    #         BRAF        60
390
    #         ERBB2       47
391
    #         PDGFRA      46
392
    #         ...}
393
    # print(train_df['Variation'].value_counts())
394
    # output:
395
    # {
396
    # Truncating_Mutations                     63
397
    # Deletion                                 43
398
    # Amplification                            43
399
    # Fusions                                  22
400
    # Overexpression                            3
401
    # E17K                                      3
402
    # Q61L                                      3
403
    # S222D                                     2
404
    # P130S                                     2
405
    # ...
406
    # }
407
    value_count = train_df[feature].value_counts()
408
    
409
    # gv_dict : Gene Variation Dict, which contains the probability array for each gene/variation
410
    gv_dict = dict()
411
    
412
    # denominator will contain the number of time that particular feature occured in whole data
413
    for i, denominator in value_count.items():
414
        # vec will contain (p(yi==1/Gi) probability of gene/variation belongs to perticular class
415
        # vec is 9 diamensional vector
416
        vec = []
417
        for k in range(1,10):
418
            # print(train_df.loc[(train_df['Class']==1) & (train_df['Gene']=='BRCA1')])
419
            #         ID   Gene             Variation  Class  
420
            # 2470  2470  BRCA1                S1715C      1   
421
            # 2486  2486  BRCA1                S1841R      1   
422
            # 2614  2614  BRCA1                   M1R      1   
423
            # 2432  2432  BRCA1                L1657P      1   
424
            # 2567  2567  BRCA1                T1685A      1   
425
            # 2583  2583  BRCA1                E1660G      1   
426
            # 2634  2634  BRCA1                W1718L      1   
427
            # cls_cnt.shape[0] will return the number of rows
428
429
            cls_cnt = train_df.loc[(train_df['Class']==k) & (train_df[feature]==i)]
430
            
431
            # cls_cnt.shape[0](numerator) will contain the number of time that particular feature occured in whole data
432
            vec.append((cls_cnt.shape[0] + alpha*10)/ (denominator + 90*alpha))
433
434
        # we are adding the gene/variation to the dict as key and vec as value
435
        gv_dict[i]=vec
436
    return gv_dict
437
438
# Get Gene variation feature
439
def get_gv_feature(alpha, feature, df):
440
    # print(gv_dict)
441
    #     {'BRCA1': [0.20075757575757575, 0.03787878787878788, 0.068181818181818177, 0.13636363636363635, 0.25, 0.19318181818181818, 0.03787878787878788, 0.03787878787878788, 0.03787878787878788], 
442
    #      'TP53': [0.32142857142857145, 0.061224489795918366, 0.061224489795918366, 0.27040816326530615, 0.061224489795918366, 0.066326530612244902, 0.051020408163265307, 0.051020408163265307, 0.056122448979591837], 
443
    #      'EGFR': [0.056818181818181816, 0.21590909090909091, 0.0625, 0.068181818181818177, 0.068181818181818177, 0.0625, 0.34659090909090912, 0.0625, 0.056818181818181816], 
444
    #      'BRCA2': [0.13333333333333333, 0.060606060606060608, 0.060606060606060608, 0.078787878787878782, 0.1393939393939394, 0.34545454545454546, 0.060606060606060608, 0.060606060606060608, 0.060606060606060608], 
445
    #      'PTEN': [0.069182389937106917, 0.062893081761006289, 0.069182389937106917, 0.46540880503144655, 0.075471698113207544, 0.062893081761006289, 0.069182389937106917, 0.062893081761006289, 0.062893081761006289], 
446
    #      'KIT': [0.066225165562913912, 0.25165562913907286, 0.072847682119205295, 0.072847682119205295, 0.066225165562913912, 0.066225165562913912, 0.27152317880794702, 0.066225165562913912, 0.066225165562913912], 
447
    #      'BRAF': [0.066666666666666666, 0.17999999999999999, 0.073333333333333334, 0.073333333333333334, 0.093333333333333338, 0.080000000000000002, 0.29999999999999999, 0.066666666666666666, 0.066666666666666666],
448
    #      ...
449
    #     }
450
    gv_dict = get_gv_fea_dict(alpha, feature, df)
451
    # value_count is similar in get_gv_fea_dict
452
    value_count = train_df[feature].value_counts()
453
    
454
    # gv_fea: Gene_variation feature, it will contain the feature for each feature value in the data
455
    gv_fea = []
456
    # for every feature values in the given data frame we will check if it is there in the train data then we will add the feature to gv_fea
457
    # if not we will add [1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9] to gv_fea
458
    for index, row in df.iterrows():
459
        if row[feature] in dict(value_count).keys():
460
            gv_fea.append(gv_dict[row[feature]])
461
        else:
462
            gv_fea.append([1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9])
463
#             gv_fea.append([-1,-1,-1,-1,-1,-1,-1,-1,-1])
464
    return gv_fea
465
466
#response-coding of the Gene feature
467
# alpha is used for laplace smoothing
468
alpha = 1
469
# train gene feature
470
train_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", train_df))
471
# test gene feature
472
test_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", test_df))
473
# cross validation gene feature
474
cv_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", cv_df))
475
476
'''
477
how to test whether gene is a good feature for model ?
478
by using random model log loss
479
create logistic regression model with only gene feature and check the model log loss
480
if > random model then gene is a good feature for the model 
481
482
in order to do that:
483
484
laplace smooting
485
486
* in NB we use LS i,e in test query there is a word that is not present in training test
487
* what to do with new word ? 
488
* can we drop ? yes but we're saying it has value 1 if we're dropping. that should not be the case
489
* can we assign value 0 ? WHole multiplication becomes zero
490
491
* so we use laplace smooting or editing smoothing i,e we add alpha (hyper parameter) and alpha K (k=no of classes)
492
* alpha = 1, in general. can change since it is hyperparameter
493
* alpha cannot be small or too large --> bias-variance trade off
494
495
CALLIBIRATED CLASSIFIER --> for getting results in probability format to be used for log loss
496
497
SGD --> for optimization
498
499
'''
500
501
# using 1-hot encoding because more dimension is fine with logistic regression algo
502
503
# We need a hyperparemeter for SGD classifier.
504
alpha = [10 ** x for x in range(-5, 1)]
505
506
# SGD classifier
507
cv_log_error_array=[]
508
for i in alpha:
509
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
510
    clf.fit(train_gene_feature_onehotCoding, y_train)
511
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
512
    sig_clf.fit(train_gene_feature_onehotCoding, y_train)
513
    predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
514
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
515
    print('For values of alpha = ', i, "The log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
516
517
# alpha 0.001 log loss is minimal
518
519
# Lets plot the same to check the best Alpha value
520
fig, ax = plt.subplots()
521
ax.plot(alpha, cv_log_error_array,c='g')
522
for i, txt in enumerate(np.round(cv_log_error_array,3)):
523
    ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
524
plt.grid()
525
plt.title("Cross Validation Error for each alpha")
526
plt.xlabel("Alpha i's")
527
plt.ylabel("Error measure")
528
plt.show()
529
530
# Lets use best alpha value as we can see from above graph and compute log loss
531
best_alpha = np.argmin(cv_log_error_array)
532
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
533
clf.fit(train_gene_feature_onehotCoding, y_train)
534
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
535
sig_clf.fit(train_gene_feature_onehotCoding, y_train)
536
537
predict_y = sig_clf.predict_proba(train_gene_feature_onehotCoding)
538
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
539
predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
540
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
541
predict_y = sig_clf.predict_proba(test_gene_feature_onehotCoding)
542
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
543
544
# log loss is less than random model 
545
# how stable is model ?
546
# in train there is less for gene and in test there is more. is called as overlap
547
# overlap should be good
548
# though we're using laplace smoothing, its not a great idea
549
# data and model will be stable if we've good overlap
550
551
# check how many values are overlapping between train, test or between CV and train
552
553
test_coverage=test_df[test_df['Gene'].isin(list(set(train_df['Gene'])))].shape[0]
554
cv_coverage=cv_df[cv_df['Gene'].isin(list(set(train_df['Gene'])))].shape[0]
555
556
print('1. In test data',test_coverage, 'out of',test_df.shape[0], ":",(test_coverage/test_df.shape[0])*100)
557
print('2. In cross validation data',cv_coverage, 'out of ',cv_df.shape[0],":" ,(cv_coverage/cv_df.shape[0])*100)
558
559
# EVALUATING VARIATION FEATURE
560
561
# Variation is also a categorical variable so we have to deal in same way like we have done for Gene column. 
562
# We will again get the one hot encoder and response enoding variable for variation column.
563
564
unique_variations = train_df['Variation'].value_counts()
565
print('Number of Unique Variations :', unique_variations.shape[0])
566
# the top 10 variations that occured most
567
print(unique_variations.head(10))
568
569
# comulative distribution of unique variation values
570
571
s = sum(unique_variations.values);
572
h = unique_variations.values/s;
573
c = np.cumsum(h)
574
print(c)
575
plt.plot(c,label='Cumulative distribution of Variations')
576
plt.grid()
577
plt.legend()
578
plt.show()
579
580
# one-hot encoding of variation feature.
581
variation_vectorizer = CountVectorizer()
582
train_variation_feature_onehotCoding = variation_vectorizer.fit_transform(train_df['Variation'])
583
test_variation_feature_onehotCoding = variation_vectorizer.transform(test_df['Variation'])
584
cv_variation_feature_onehotCoding = variation_vectorizer.transform(cv_df['Variation'])
585
586
train_variation_feature_onehotCoding.shape
587
588
# alpha is used for laplace smoothing
589
alpha = 1
590
# train gene feature
591
train_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", train_df))
592
# test gene feature
593
test_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", test_df))
594
# cross validation gene feature
595
cv_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", cv_df))
596
597
train_variation_feature_responseCoding.shape
598
599
# We need a hyperparemeter for SGD classifier.
600
alpha = [10 ** x for x in range(-5, 1)]
601
602
# SGD classifier
603
cv_log_error_array=[]
604
for i in alpha:
605
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
606
    clf.fit(train_variation_feature_onehotCoding, y_train)
607
    
608
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
609
    sig_clf.fit(train_variation_feature_onehotCoding, y_train)
610
    predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)
611
    
612
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
613
    print('For values of alpha = ', i, "The log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
614
615
# Lets plot the same to check the best Alpha value
616
fig, ax = plt.subplots()
617
ax.plot(alpha, cv_log_error_array,c='g')
618
for i, txt in enumerate(np.round(cv_log_error_array,3)):
619
    ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
620
plt.grid()
621
plt.title("Cross Validation Error for each alpha")
622
plt.xlabel("Alpha i's")
623
plt.ylabel("Error measure")
624
plt.show()
625
626
best_alpha = np.argmin(cv_log_error_array)
627
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
628
clf.fit(train_variation_feature_onehotCoding, y_train)
629
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
630
sig_clf.fit(train_variation_feature_onehotCoding, y_train)
631
632
predict_y = sig_clf.predict_proba(train_variation_feature_onehotCoding)
633
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
634
predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)
635
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
636
predict_y = sig_clf.predict_proba(test_variation_feature_onehotCoding)
637
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
638
639
# alpha 0.0001, test log loss is less compared to random model so good have variation feature 
640
641
test_coverage=test_df[test_df['Variation'].isin(list(set(train_df['Variation'])))].shape[0]
642
cv_coverage=cv_df[cv_df['Variation'].isin(list(set(train_df['Variation'])))].shape[0]
643
644
print('1. In test data',test_coverage, 'out of',test_df.shape[0], ":",(test_coverage/test_df.shape[0])*100)
645
print('2. In cross validation data',cv_coverage, 'out of ',cv_df.shape[0],":" ,(cv_coverage/cv_df.shape[0])*100)
646
647
# stability is low for variation feature. since log loss is less, lets keep it
648
649
# EVALUATING TEXT COLUMN
650
651
# cls_text is a data frame
652
# for every row in data frame consider the 'TEXT'
653
# split the words by space
654
# make a dict with those words
655
# increment its count whenever we see that word
656
657
# to get word count
658
def extract_dictionary_paddle(cls_text):
659
    dictionary = defaultdict(int)
660
    for index, row in cls_text.iterrows():
661
        for word in row['TEXT'].split():
662
            dictionary[word] +=1
663
    return dictionary
664
665
import math
666
#https://stackoverflow.com/a/1602964
667
def get_text_responsecoding(df):
668
    text_feature_responseCoding = np.zeros((df.shape[0],9))
669
    for i in range(0,9):
670
        row_index = 0
671
        for index, row in df.iterrows():
672
            sum_prob = 0
673
            for word in row['TEXT'].split():
674
                sum_prob += math.log(((dict_list[i].get(word,0)+10 )/(total_dict.get(word,0)+90)))
675
            text_feature_responseCoding[row_index][i] = math.exp(sum_prob/len(row['TEXT'].split()))
676
            row_index += 1
677
    return text_feature_responseCoding
678
679
# building a CountVectorizer with all the words that occured minimum 3 times in train data
680
text_vectorizer = CountVectorizer(min_df=3)
681
train_text_feature_onehotCoding = text_vectorizer.fit_transform(train_df['TEXT'])
682
# getting all the feature names (words)
683
train_text_features= text_vectorizer.get_feature_names()
684
685
# train_text_feature_onehotCoding.sum(axis=0).A1 will sum every row and returns (1*number of features) vector
686
train_text_fea_counts = train_text_feature_onehotCoding.sum(axis=0).A1
687
688
# zip(list(text_features),text_fea_counts) will zip a word with its number of times it occured
689
text_fea_dict = dict(zip(list(train_text_features),train_text_fea_counts))
690
691
print("Total number of unique words in train data :", len(train_text_features))
692
693
694
dict_list = []
695
# dict_list =[] contains 9 dictoinaries each corresponds to a class
696
for i in range(1,10):
697
    cls_text = train_df[train_df['Class']==i]
698
    # build a word dict based on the words in that class
699
    dict_list.append(extract_dictionary_paddle(cls_text))
700
    # append it to dict_list
701
702
# dict_list[i] is build on i'th  class text data
703
# total_dict is buid on whole training text data
704
total_dict = extract_dictionary_paddle(train_df)
705
706
707
confuse_array = []
708
for i in train_text_features:
709
    ratios = []
710
    max_val = -1
711
    for j in range(0,9):
712
        ratios.append((dict_list[j][i]+10 )/(total_dict[i]+90))
713
    confuse_array.append(ratios)
714
confuse_array = np.array(confuse_array)
715
716
#response coding of text features
717
train_text_feature_responseCoding  = get_text_responsecoding(train_df)
718
test_text_feature_responseCoding  = get_text_responsecoding(test_df)
719
cv_text_feature_responseCoding  = get_text_responsecoding(cv_df)
720
721
# https://stackoverflow.com/a/16202486
722
# we convert each row values such that they sum to 1  
723
train_text_feature_responseCoding = (train_text_feature_responseCoding.T/train_text_feature_responseCoding.sum(axis=1)).T
724
test_text_feature_responseCoding = (test_text_feature_responseCoding.T/test_text_feature_responseCoding.sum(axis=1)).T
725
cv_text_feature_responseCoding = (cv_text_feature_responseCoding.T/cv_text_feature_responseCoding.sum(axis=1)).T
726
727
# don't forget to normalize every feature
728
train_text_feature_onehotCoding = normalize(train_text_feature_onehotCoding, axis=0)
729
730
# we use the same vectorizer that was trained on train data
731
test_text_feature_onehotCoding = text_vectorizer.transform(test_df['TEXT'])
732
# don't forget to normalize every feature
733
test_text_feature_onehotCoding = normalize(test_text_feature_onehotCoding, axis=0)
734
735
# we use the same vectorizer that was trained on train data
736
cv_text_feature_onehotCoding = text_vectorizer.transform(cv_df['TEXT'])
737
# don't forget to normalize every feature
738
cv_text_feature_onehotCoding = normalize(cv_text_feature_onehotCoding, axis=0)
739
740
#https://stackoverflow.com/a/2258273/4084039
741
sorted_text_fea_dict = dict(sorted(text_fea_dict.items(), key=lambda x: x[1] , reverse=True))
742
sorted_text_occur = np.array(list(sorted_text_fea_dict.values()))
743
744
# Number of words for a given frequency.
745
print(Counter(sorted_text_occur))
746
747
# build the model with only text column
748
749
cv_log_error_array=[]
750
for i in alpha:
751
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
752
    clf.fit(train_text_feature_onehotCoding, y_train)
753
    
754
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
755
    sig_clf.fit(train_text_feature_onehotCoding, y_train)
756
    predict_y = sig_clf.predict_proba(cv_text_feature_onehotCoding)
757
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
758
    print('For values of alpha = ', i, "The log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
759
760
# log loss is less than random model
761
762
fig, ax = plt.subplots()
763
ax.plot(alpha, cv_log_error_array,c='g')
764
for i, txt in enumerate(np.round(cv_log_error_array,3)):
765
    ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
766
plt.grid()
767
plt.title("Cross Validation Error for each alpha")
768
plt.xlabel("Alpha i's")
769
plt.ylabel("Error measure")
770
plt.show()
771
772
best_alpha = np.argmin(cv_log_error_array)
773
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
774
clf.fit(train_text_feature_onehotCoding, y_train)
775
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
776
sig_clf.fit(train_text_feature_onehotCoding, y_train)
777
778
predict_y = sig_clf.predict_proba(train_text_feature_onehotCoding)
779
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
780
predict_y = sig_clf.predict_proba(cv_text_feature_onehotCoding)
781
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
782
predict_y = sig_clf.predict_proba(test_text_feature_onehotCoding)
783
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
784
785
# check the overlap of text data
786
787
def get_intersec_text(df):
788
    df_text_vec = CountVectorizer(min_df=3)
789
    df_text_fea = df_text_vec.fit_transform(df['TEXT'])
790
    df_text_features = df_text_vec.get_feature_names()
791
792
    df_text_fea_counts = df_text_fea.sum(axis=0).A1
793
    df_text_fea_dict = dict(zip(list(df_text_features),df_text_fea_counts))
794
    len1 = len(set(df_text_features))
795
    len2 = len(set(train_text_features) & set(df_text_features))
796
    return len1,len2
797
798
len1,len2 = get_intersec_text(test_df)
799
print(np.round((len2/len1)*100, 3), "% of word of test data appeared in train data")
800
len1,len2 = get_intersec_text(cv_df)
801
print(np.round((len2/len1)*100, 3), "% of word of Cross Validation appeared in train data")
802
803
# text column as good stable
804
805
# therefore, all 3 columns are going important for model buidling
806
807
# Data prepration for Machine Learning models
808
809
# few utility functions
810
811
def report_log_loss(train_x, train_y, test_x, test_y,  clf):
812
    clf.fit(train_x, train_y)
813
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
814
    sig_clf.fit(train_x, train_y)
815
    sig_clf_probs = sig_clf.predict_proba(test_x)
816
    return log_loss(test_y, sig_clf_probs, eps=1e-15)
817
818
# This function plots the confusion matrices given y_i, y_i_hat.
819
def plot_confusion_matrix(test_y, predict_y):
820
    C = confusion_matrix(test_y, predict_y)
821
    
822
    A =(((C.T)/(C.sum(axis=1))).T)
823
    
824
    B =(C/C.sum(axis=0)) 
825
    labels = [1,2,3,4,5,6,7,8,9]
826
    # representing A in heatmap format
827
    print("-"*20, "Confusion matrix", "-"*20)
828
    plt.figure(figsize=(20,7))
829
    sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
830
    plt.xlabel('Predicted Class')
831
    plt.ylabel('Original Class')
832
    plt.show()
833
834
    print("-"*20, "Precision matrix (Columm Sum=1)", "-"*20)
835
    plt.figure(figsize=(20,7))
836
    sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
837
    plt.xlabel('Predicted Class')
838
    plt.ylabel('Original Class')
839
    plt.show()
840
    
841
    # representing B in heatmap format
842
    print("-"*20, "Recall matrix (Row sum=1)", "-"*20)
843
    plt.figure(figsize=(20,7))
844
    sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
845
    plt.xlabel('Predicted Class')
846
    plt.ylabel('Original Class')
847
    plt.show()
848
849
850
def predict_and_plot_confusion_matrix(train_x, train_y,test_x, test_y, clf):
851
    clf.fit(train_x, train_y)
852
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
853
    sig_clf.fit(train_x, train_y)
854
    pred_y = sig_clf.predict(test_x)
855
856
    # for calculating log_loss we willl provide the array of probabilities belongs to each class
857
    print("Log loss :",log_loss(test_y, sig_clf.predict_proba(test_x)))
858
    # calculating the number of data points that are misclassified
859
    print("Number of mis-classified points :", np.count_nonzero((pred_y- test_y))/test_y.shape[0])
860
    plot_confusion_matrix(test_y, pred_y)
861
862
863
# this function will be used just for naive bayes
864
# for the given indices, we will print the name of the features
865
# and we will check whether the feature present in the test point text or not
866
def get_impfeature_names(indices, text, gene, var, no_features):
867
    gene_count_vec = CountVectorizer()
868
    var_count_vec = CountVectorizer()
869
    text_count_vec = CountVectorizer(min_df=3)
870
    
871
    gene_vec = gene_count_vec.fit(train_df['Gene'])
872
    var_vec  = var_count_vec.fit(train_df['Variation'])
873
    text_vec = text_count_vec.fit(train_df['TEXT'])
874
    
875
    fea1_len = len(gene_vec.get_feature_names())
876
    fea2_len = len(var_count_vec.get_feature_names())
877
    
878
    word_present = 0
879
    for i,v in enumerate(indices):
880
        if (v < fea1_len):
881
            word = gene_vec.get_feature_names()[v]
882
            yes_no = True if word == gene else False
883
            if yes_no:
884
                word_present += 1
885
                print(i, "Gene feature [{}] present in test data point [{}]".format(word,yes_no))
886
        elif (v < fea1_len+fea2_len):
887
            word = var_vec.get_feature_names()[v-(fea1_len)]
888
            yes_no = True if word == var else False
889
            if yes_no:
890
                word_present += 1
891
                print(i, "variation feature [{}] present in test data point [{}]".format(word,yes_no))
892
        else:
893
            word = text_vec.get_feature_names()[v-(fea1_len+fea2_len)]
894
            yes_no = True if word in text.split() else False
895
            if yes_no:
896
                word_present += 1
897
                print(i, "Text feature [{}] present in test data point [{}]".format(word,yes_no))
898
899
    print("Out of the top ",no_features," features ", word_present, "are present in query point")
900
901
## Combining all 3 features together
902
903
# merging gene, variance and text features
904
905
# building train, test and cross validation data sets
906
# a = [[1, 2], 
907
#      [3, 4]]
908
# b = [[4, 5], 
909
#      [6, 7]]
910
# hstack(a, b) = [[1, 2, 4, 5],
911
#                [ 3, 4, 6, 7]]
912
913
train_gene_var_onehotCoding = hstack((train_gene_feature_onehotCoding,train_variation_feature_onehotCoding))
914
test_gene_var_onehotCoding = hstack((test_gene_feature_onehotCoding,test_variation_feature_onehotCoding))
915
cv_gene_var_onehotCoding = hstack((cv_gene_feature_onehotCoding,cv_variation_feature_onehotCoding))
916
917
train_x_onehotCoding = hstack((train_gene_var_onehotCoding, train_text_feature_onehotCoding)).tocsr()
918
train_y = np.array(list(train_df['Class']))
919
920
test_x_onehotCoding = hstack((test_gene_var_onehotCoding, test_text_feature_onehotCoding)).tocsr()
921
test_y = np.array(list(test_df['Class']))
922
923
cv_x_onehotCoding = hstack((cv_gene_var_onehotCoding, cv_text_feature_onehotCoding)).tocsr()
924
cv_y = np.array(list(cv_df['Class']))
925
926
927
train_gene_var_responseCoding = np.hstack((train_gene_feature_responseCoding,train_variation_feature_responseCoding))
928
test_gene_var_responseCoding = np.hstack((test_gene_feature_responseCoding,test_variation_feature_responseCoding))
929
cv_gene_var_responseCoding = np.hstack((cv_gene_feature_responseCoding,cv_variation_feature_responseCoding))
930
931
train_x_responseCoding = np.hstack((train_gene_var_responseCoding, train_text_feature_responseCoding))
932
test_x_responseCoding = np.hstack((test_gene_var_responseCoding, test_text_feature_responseCoding))
933
cv_x_responseCoding = np.hstack((cv_gene_var_responseCoding, cv_text_feature_responseCoding))
934
935
936
print("One hot encoding features :")
937
print("(number of data points * number of features) in train data = ", train_x_onehotCoding.shape)
938
print("(number of data points * number of features) in test data = ", test_x_onehotCoding.shape)
939
print("(number of data points * number of features) in cross validation data =", cv_x_onehotCoding.shape)
940
941
print(" Response encoding features :")
942
print("(number of data points * number of features) in train data = ", train_x_responseCoding.shape)
943
print("(number of data points * number of features) in test data = ", test_x_responseCoding.shape)
944
print("(number of data points * number of features) in cross validation data =", cv_x_responseCoding.shape)
945
946
# Building Machine Learning model
947
948
# http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
949
alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]
950
cv_log_error_array = []
951
for i in alpha:
952
    print("for alpha =", i)
953
    clf = MultinomialNB(alpha=i)
954
    clf.fit(train_x_onehotCoding, train_y)
955
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
956
    sig_clf.fit(train_x_onehotCoding, train_y)
957
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
958
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
959
    # to avoid rounding error while multiplying probabilites we use log-probability estimates
960
    print("Log Loss :",log_loss(cv_y, sig_clf_probs)) 
961
962
fig, ax = plt.subplots()
963
ax.plot(np.log10(alpha), cv_log_error_array,c='g')
964
for i, txt in enumerate(np.round(cv_log_error_array,3)):
965
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error_array[i]))
966
plt.grid()
967
plt.xticks(np.log10(alpha))
968
plt.title("Cross Validation Error for each alpha")
969
plt.xlabel("Alpha i's")
970
plt.ylabel("Error measure")
971
plt.show()
972
973
best_alpha = np.argmin(cv_log_error_array)
974
clf = MultinomialNB(alpha=alpha[best_alpha])
975
clf.fit(train_x_onehotCoding, train_y)
976
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
977
sig_clf.fit(train_x_onehotCoding, train_y)
978
979
predict_y = sig_clf.predict_proba(train_x_onehotCoding)
980
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
981
predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
982
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
983
predict_y = sig_clf.predict_proba(test_x_onehotCoding)
984
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
985
986
# Testing our Naive Bayes model with best found value of alpha on testing data
987
clf = MultinomialNB(alpha=alpha[best_alpha])
988
clf.fit(train_x_onehotCoding, train_y)
989
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
990
sig_clf.fit(train_x_onehotCoding, train_y)
991
sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
992
# to avoid rounding error while multiplying probabilites we use log-probability estimates
993
print("Log Loss :",log_loss(cv_y, sig_clf_probs))
994
print("Number of missclassified point :", np.count_nonzero((sig_clf.predict(cv_x_onehotCoding)- cv_y))/cv_y.shape[0])
995
plot_confusion_matrix(cv_y, sig_clf.predict(cv_x_onehotCoding.toarray()))
996
997
# interpretability of model
998
999
test_point_index = 1
1000
no_feature = 100
1001
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
1002
print("Predicted Class :", predicted_cls[0])
1003
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]),4))
1004
print("Actual Class :", test_y[test_point_index])
1005
indices = np.argsort(-clf.coef_)[predicted_cls-1][:,:no_feature]
1006
print("-"*50)
1007
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index],test_df['Gene'].iloc[test_point_index],test_df['Variation'].iloc[test_point_index], no_feature)
1008
1009
# one more ex
1010
1011
test_point_index = 100
1012
no_feature = 100
1013
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
1014
print("Predicted Class :", predicted_cls[0])
1015
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]),4))
1016
print("Actual Class :", test_y[test_point_index])
1017
indices = np.argsort(-clf.coef_)[predicted_cls-1][:,:no_feature]
1018
print("-"*50)
1019
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index],test_df['Gene'].iloc[test_point_index],test_df['Variation'].iloc[test_point_index], no_feature)
1020
1021
# So Naive Bayes not performing very badly but lets look at other models
1022
1023
# Logistic Regression
1024
1025
# Balancing all classes
1026
1027
alpha = [10 ** x for x in range(-6, 3)]
1028
cv_log_error_array = []
1029
for i in alpha:
1030
    print("for alpha =", i)
1031
    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='log', random_state=42)
1032
    clf.fit(train_x_onehotCoding, train_y)
1033
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
1034
    sig_clf.fit(train_x_onehotCoding, train_y)
1035
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
1036
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
1037
    # to avoid rounding error while multiplying probabilites we use log-probability estimates
1038
    print("Log Loss :",log_loss(cv_y, sig_clf_probs)) 
1039
1040
fig, ax = plt.subplots()
1041
ax.plot(alpha, cv_log_error_array,c='g')
1042
for i, txt in enumerate(np.round(cv_log_error_array,3)):
1043
    ax.annotate((alpha[i],str(txt)), (alpha[i],cv_log_error_array[i]))
1044
plt.grid()
1045
plt.title("Cross Validation Error for each alpha")
1046
plt.xlabel("Alpha i's")
1047
plt.ylabel("Error measure")
1048
plt.show()
1049
1050
best_alpha = np.argmin(cv_log_error_array)
1051
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
1052
clf.fit(train_x_onehotCoding, train_y)
1053
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
1054
sig_clf.fit(train_x_onehotCoding, train_y)
1055
1056
predict_y = sig_clf.predict_proba(train_x_onehotCoding)
1057
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
1058
predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
1059
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
1060
predict_y = sig_clf.predict_proba(test_x_onehotCoding)
1061
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
1062
1063
# Lets test it on testing data using best alpha value
1064
1065
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
1066
predict_and_plot_confusion_matrix(train_x_onehotCoding, train_y, cv_x_onehotCoding, cv_y, clf)
1067
1068
1069
# feature importance
1070
1071
def get_imp_feature_names(text, indices, removed_ind = []):
1072
    word_present = 0
1073
    tabulte_list = []
1074
    incresingorder_ind = 0
1075
    for i in indices:
1076
        if i < train_gene_feature_onehotCoding.shape[1]:
1077
            tabulte_list.append([incresingorder_ind, "Gene", "Yes"])
1078
        elif i< 18:
1079
            tabulte_list.append([incresingorder_ind,"Variation", "Yes"])
1080
        if ((i > 17) & (i not in removed_ind)) :
1081
            word = train_text_features[i]
1082
            yes_no = True if word in text.split() else False
1083
            if yes_no:
1084
                word_present += 1
1085
            tabulte_list.append([incresingorder_ind,train_text_features[i], yes_no])
1086
        incresingorder_ind += 1
1087
    print(word_present, "most importent features are present in our query point")
1088
    print("-"*50)
1089
    print("The features that are most importent of the ",predicted_cls[0]," class:")
1090
    print (tabulate(tabulte_list, headers=["Index",'Feature name', 'Present or Not']))
1091
1092
# testing query point and doing interpretability
1093
1094
# from tabulate import tabulate
1095
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
1096
clf.fit(train_x_onehotCoding,train_y)
1097
test_point_index = 1
1098
no_feature = 500
1099
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
1100
print("Predicted Class :", predicted_cls[0])
1101
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]),4))
1102
print("Actual Class :", test_y[test_point_index])
1103
indices = np.argsort(-clf.coef_)[predicted_cls-1][:,:no_feature]
1104
print("-"*50)
1105
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index],test_df['Gene'].iloc[test_point_index],test_df['Variation'].iloc[test_point_index], no_feature)
1106
1107
test_point_index = 100
1108
no_feature = 500
1109
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
1110
print("Predicted Class :", predicted_cls[0])
1111
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]),4))
1112
print("Actual Class :", test_y[test_point_index])
1113
indices = np.argsort(-clf.coef_)[predicted_cls-1][:,:no_feature]
1114
print("-"*50)
1115
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index],test_df['Gene'].iloc[test_point_index],test_df['Variation'].iloc[test_point_index], no_feature)
1116
1117
# K Nearest Neighbour Classification
1118
1119
alpha = [5, 11, 15, 21, 31, 41, 51, 99]
1120
cv_log_error_array = []
1121
for i in alpha:
1122
    print("for alpha =", i)
1123
    clf = KNeighborsClassifier(n_neighbors=i)
1124
    clf.fit(train_x_responseCoding, train_y)
1125
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
1126
    sig_clf.fit(train_x_responseCoding, train_y)
1127
    sig_clf_probs = sig_clf.predict_proba(cv_x_responseCoding)
1128
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
1129
    # to avoid rounding error while multiplying probabilites we use log-probability estimates
1130
    print("Log Loss :",log_loss(cv_y, sig_clf_probs))
1131
1132
fig, ax = plt.subplots()
1133
ax.plot(alpha, cv_log_error_array,c='g')
1134
for i, txt in enumerate(np.round(cv_log_error_array,3)):
1135
    ax.annotate((alpha[i],str(txt)), (alpha[i],cv_log_error_array[i]))
1136
plt.grid()
1137
plt.title("Cross Validation Error for each alpha")
1138
plt.xlabel("Alpha i's")
1139
plt.ylabel("Error measure")
1140
plt.show()
1141
1142
best_alpha = np.argmin(cv_log_error_array)
1143
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
1144
clf.fit(train_x_responseCoding, train_y)
1145
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
1146
sig_clf.fit(train_x_responseCoding, train_y)
1147
1148
predict_y = sig_clf.predict_proba(train_x_responseCoding)
1149
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
1150
predict_y = sig_clf.predict_proba(cv_x_responseCoding)
1151
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
1152
predict_y = sig_clf.predict_proba(test_x_responseCoding)
1153
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
1154
1155
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
1156
predict_and_plot_confusion_matrix(train_x_responseCoding, train_y, cv_x_responseCoding, cv_y, clf)
1157
1158
# Lets look at few test points
1159
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
1160
clf.fit(train_x_responseCoding, train_y)
1161
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
1162
sig_clf.fit(train_x_responseCoding, train_y)
1163
1164
test_point_index = 1
1165
predicted_cls = sig_clf.predict(test_x_responseCoding[0].reshape(1,-1))
1166
print("Predicted Class :", predicted_cls[0])
1167
print("Actual Class :", test_y[test_point_index])
1168
neighbors = clf.kneighbors(test_x_responseCoding[test_point_index].reshape(1, -1), alpha[best_alpha])
1169
print("The ",alpha[best_alpha]," nearest neighbours of the test points belongs to classes",train_y[neighbors[1][0]])
1170
print("Fequency of nearest points :",Counter(train_y[neighbors[1][0]]))
1171
1172
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
1173
clf.fit(train_x_responseCoding, train_y)
1174
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
1175
sig_clf.fit(train_x_responseCoding, train_y)
1176
1177
test_point_index = 100
1178
1179
predicted_cls = sig_clf.predict(test_x_responseCoding[test_point_index].reshape(1,-1))
1180
print("Predicted Class :", predicted_cls[0])
1181
print("Actual Class :", test_y[test_point_index])
1182
neighbors = clf.kneighbors(test_x_responseCoding[test_point_index].reshape(1, -1), alpha[best_alpha])
1183
print("the k value for knn is",alpha[best_alpha],"and the nearest neighbours of the test points belongs to classes",train_y[neighbors[1][0]])
1184
print("Fequency of nearest points :",Counter(train_y[neighbors[1][0]]))
1185
1186
# without balancing data for logistic regression .01 difference in log loss
1187
1188
# linear SVM dint reduce w.r.t log loss
1189
1190
# random forest classifier with 1-hot encoding and response encoding (was worse than 1-hot encoding) dint reduce w.r.t log loss
1191
1192
# stacking model dint reduce w.r.t log loss
1193
1194
# logistic regression with balancing and 1-hot encoding has got the lowest log loss
1195
1196
1197