--- a/run.py
+++ b/run.py
@@ -0,0 +1,1197 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import time
+import math
+import warnings
+import numpy as np
+import seaborn as sns
+from collections import Counter, defaultdict
+from nltk.corpus import stopwords
+from scipy.sparse import hstack
+from imblearn.over_sampling import SMOTE
+from sklearn.decomposition import TruncatedSVD
+from sklearn.preprocessing import normalize
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.manifold import TSNE
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.metrics import confusion_matrix, accuracy_score, log_loss, normalized_mutual_info_score
+from sklearn.linear_model import SGDClassifier, LogisticRegression
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.svm import SVC
+from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
+from sklearn import model_selection
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.naive_bayes import MultinomialNB, GaussianNB
+from sklearn.ensemble import RandomForestClassifier
+from mlxtend.classifier import StackingClassifier
+warnings.filterwarnings("ignore")
+
+# Loading training_variants: a comma-separated file
+data_variants = pd.read_csv('training/training_variants')
+# Loading training_text: fields are separated by ||
+data_text = pd.read_csv("training/training_text", sep="\|\|", engine="python", names=["ID", "TEXT"], skiprows=1)
+
+# basic look at data_variants
+
+data_variants.head()
+# to classify cancer, we need the gene, the variation and the research paper (text)
+# ID --> common key in both data_variants & data_text, used for linking (merging the dataframes)
+# Gene --> genetic mutation location
+# Variation --> amino-acid change for the respective mutation
+# Class --> 1-9, the target classes the genetic mutation has been classified into
+# in data_variants the text is missing; the text is present in the data_text dataframe
+
+data_variants.info()
+
+data_variants.describe()
+
+data_variants.columns
+
+data_variants.shape
+
+# basic look at data_text
+
+data_text.head()
+# TEXT --> huge text, because each entry is a research paper
+
+data_text.info()
+# missing values exist
+
+data_text.columns
+
+data_text.shape
+
+# target classes
+data_variants.Class.unique()
+# multiclass classification problem
+
+'''
+important things before modelling:
+
+1. Since this is a medical problem, errors are costly,
+   so the model has to return its outcome as a probability.
+   p(y=2|a1) = 0.99 --> OK
+   p(y=2|a1) = 0.5  --> NOT OK, because the probability is ambiguous. In this case we have to look for more evidence or reasons.
+
+2. Interpretability:
+
+   why 0.5? why 0.99? provide evidence for the probability value
+
+   * a few ML models are very good at interpretability, such as naive Bayes, decision trees, logistic regression and linear SVM
+   * a few ML models are not good at interpretability, such as RBF-kernel SVM
+
+3. Latency:
+
+   * OK to take a few seconds/minutes for the output
+   * NOT OK to take hours for the output
+
+'''
+# data_text contains the text (i.e. research papers); we cannot feed raw text to the model,
+# so convert it to a format the model understands
+
+# numbers, stopwords, multiple spaces and special chars are not relevant, so let's remove them
+
+stop_words = set(stopwords.words('english'))
+
+def data_text_preprocess(total_text, ind, col):
+    # Remove int values from text data as that might not be imp
+    if type(total_text) is not int:
+        string = ""
+        # replacing all special chars with space
+        total_text = re.sub('[^a-zA-Z0-9\n]', ' ', str(total_text))
+        # replacing multiple spaces with a single space
+        total_text = re.sub('\s+', ' ', str(total_text))
+        # bring the whole text to the same lower-case scale
+        total_text = total_text.lower()
+
+        for word in total_text.split():
+            # if the word is not a stop word, retain it
+            if not word in stop_words:
+                string += word + " "
+
+        data_text[col][ind] = string
+
+for index, row in data_text.iterrows():
+    if type(row['TEXT']) is str:
+        data_text_preprocess(row['TEXT'], index, 'TEXT')
+
+# data preprocessing is completed
+
+# let's merge the dataframes based on ID
+result = pd.merge(data_variants, data_text, on='ID', how='left')
+result.head()
+
+# check for missing values
+result[result.isnull().any(axis=1)]
+# 5 rows have missing values, so let's impute by concatenating the Gene + Variation values
+
+result.loc[result['TEXT'].isnull(), 'TEXT'] = result['Gene'] + ' ' + result['Variation']
+
+# cross check
+result[result.isnull().any(axis=1)]
+
+# Split the dataset into training, test & cross-validation data
+
+y_true = result['Class'].values
+result.Gene = result.Gene.str.replace('\s+', '_')
+result.Variation = result.Variation.str.replace('\s+', '_')
+
+# Splitting the data into train and test sets
+X_train, test_df, y_train, y_test = train_test_split(result, y_true, stratify=y_true, test_size=0.2)
+# split the train data now into train and cross-validation sets
+train_df, cv_df, y_train, y_cv = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)
+
+print('Number of data points in train data:', train_df.shape[0])
+print('Number of data points in test data:', test_df.shape[0])
+print('Number of data points in cross validation data:', cv_df.shape[0])
+
+# has the data been distributed properly?
+
+train_class_distribution = train_df['Class'].value_counts()
+test_class_distribution = test_df['Class'].value_counts()
+cv_class_distribution = cv_df['Class'].value_counts()
+
+# let's visualize
+
+# viz for the train class distribution
+
+my_colors = 'rgbkymc'
+train_class_distribution.plot(kind='bar')
+plt.xlabel('Class')
+plt.ylabel('Number of Data points per Class')
+plt.title('Distribution of yi in train data')
+plt.grid()
+plt.show()
+
+# let's look at it in percentage form
+
+sorted_yi = np.argsort(-train_class_distribution.values)
+for i in sorted_yi:
+    print('Number of data points in class', i+1, ':', train_class_distribution.values[i], '(', np.round((train_class_distribution.values[i]/train_df.shape[0]*100), 3), '%)')
+
+# viz for the test class distribution
+
+my_colors = 'rgbkymc'
+test_class_distribution.plot(kind='bar')
+plt.xlabel('Class')
+plt.ylabel('Number of Data points per Class')
+plt.title('Distribution of yi in test data')
+plt.grid()
+plt.show()
+
+# let's look at it in percentage form
+
+sorted_yi = np.argsort(-test_class_distribution.values)
+for i in sorted_yi:
+    print('Number of data points in class', i+1, ':', test_class_distribution.values[i], '(', np.round((test_class_distribution.values[i]/test_df.shape[0]*100), 3), '%)')
+
+# viz for the cross-validation set
+
+my_colors = 'rgbkymc'
+cv_class_distribution.plot(kind='bar')
+plt.xlabel('Class')
+plt.ylabel('Data points per Class')
+plt.title('Distribution of yi in cross validation data')
+plt.grid()
+plt.show()
+
+# let's look at it in percentage form
+
+sorted_yi = np.argsort(-cv_class_distribution.values)
+for i in sorted_yi:
+    print('Number of data points in class', i+1, ':', cv_class_distribution.values[i], '(', np.round((cv_class_distribution.values[i]/cv_df.shape[0]*100), 3), '%)')
+
+# all three splits must follow the same class distribution
+
+'''
+what is the evaluation metric?
+from the Kaggle page, multi-class log loss is the evaluation metric
+
+log loss?
+
+* the higher the predicted probability of the true class, the lower the log loss
+* if the prediction is ambiguous, log loss penalizes it
+* log loss = 0 --> perfect model, which is not the case in the real world
+* log loss lies in the range [0, infinity)
+
+1 problem:
+
+* for the accuracy metric: the range is 0 - 1, where 1 is the best and 0 is the worst
+* for the AUC metric: the range is 0 - 1, where 1 is the best and 0 is the worst
+
+BUT
+
+log loss values range between 0 and infinity
+
+how do we know whether a given value is good or bad?
+
+one method is:
+
+random model --> worst model --> compute log loss (ex: 3)
+our model                     --> compute log loss (ex: 1.5)
+
+a model at 1.5 is better than the random model at 3
+
+how do we generate a random model?
+
+* here we are working with probabilities
+* the summation of all class probabilities for a data point must be 1
+
+'''
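+# Worked example (illustrative numbers, not computed from this dataset):
+# if the true class gets probability 0.9, the per-point loss is -ln(0.9) ~= 0.105;
+# a uniform random model assigns ~1/9 to each of the 9 classes, so its per-point
+# loss is -ln(1/9) ~= 2.197 -- the same ballpark as the ~2.5 measured for the
+# random model below.
+print("confident correct prediction:", -np.log(0.9))   # ~0.105
+print("uniform 9-class prediction  :", -np.log(1/9))   # ~2.197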
+
+# build the random model
+
+# 9 classes exist, so create 9 random numbers such that their sum is equal to 1,
+# because the sum of the probabilities of all 9 classes must be equal to 1
+
+test_data_len = test_df.shape[0]
+cv_data_len = cv_df.shape[0]
+
+# CV-set error
+cv_predicted_y = np.zeros((cv_data_len, 9))
+for i in range(cv_data_len):
+    rand_probs = np.random.rand(1, 9)  # 9 random numbers in [0, 1)
+    cv_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])  # normalize so they sum to 1
+print("Log loss on Cross Validation Data using Random Model", log_loss(y_cv, cv_predicted_y, eps=1e-15))
+# Log loss on Cross Validation Data using Random Model 2.491130932423048
+
+# test-set error
+test_predicted_y = np.zeros((test_data_len, 9))
+for i in range(test_data_len):
+    rand_probs = np.random.rand(1, 9)
+    test_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
+print("Log loss on Test Data using Random Model", log_loss(y_test, test_predicted_y, eps=1e-15))
+# Log loss on Test Data using Random Model 2.531934856356048
+
+# Let's get the index of the max probability
+predicted_y = np.argmax(test_predicted_y, axis=1)
+
+# index values range from 0-8; our problem statement has classes 1-9, so shift by one
+predicted_y = predicted_y + 1
+
+# confusion matrix
+C = confusion_matrix(y_test, predicted_y)
+
+labels = [1,2,3,4,5,6,7,8,9]
+plt.figure(figsize=(20,7))
+sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+plt.xlabel('Predicted Class')
+plt.ylabel('Original Class')
+plt.show()
+
+# precision matrix (each column sums to 1)
+
+B = (C/C.sum(axis=0))
+
+plt.figure(figsize=(20,7))
+sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+plt.xlabel('Predicted Class')
+plt.ylabel('Original Class')
+plt.show()
+
+# recall matrix (each row sums to 1)
+
+A = (((C.T)/(C.sum(axis=1))).T)
+
+plt.figure(figsize=(20,7))
+sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+plt.xlabel('Predicted Class')
+plt.ylabel('Original Class')
+plt.show()
+
+# evaluate the Gene column
+
+# is the Gene column important for the target feature?
+# if so, convert the categories to numeric values
+
+unique_genes = train_df['Gene'].value_counts()
+print('Number of Unique Genes :', unique_genes.shape[0])
+# the top 10 genes that occurred most
+print(unique_genes.head(10))
+
+unique_genes.shape[0]
+
+# viz of the cumulative distribution of unique gene values
+
+s = sum(unique_genes.values)
+h = unique_genes.values/s
+c = np.cumsum(h)
+plt.plot(c, label='Cumulative distribution of Genes')
+plt.grid()
+plt.legend()
+plt.show()
+
+# the top ~50 genes contribute about 75% of the data
+
+'''
+let's encode:
+1. one-hot encoding: creates a lot of features --> a problem in some cases
+2. response encoding (mean imputation, ex: the average height of Indians is 5.5 ft)
+
+response encoding in this case would be:
+
+* for each row, we'll create 9 columns
+* the 9 columns hold the probability of each class given the gene value of that row
+'''
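+# Worked example of response coding (toy numbers, not from this dataset):
+# suppose a hypothetical gene "GENE_X" appears 10 times in the train data,
+# 4 of them in class 1 and 6 in class 4. Ignoring smoothing, its 9-dim vector
+# would be [0.4, 0, 0, 0.6, 0, 0, 0, 0, 0]; every row whose Gene is "GENE_X"
+# gets this same vector as its 9 features. The implementation below
+# additionally applies Laplace smoothing, so no entry is exactly 0.
+toy_counts = np.array([4, 0, 0, 6, 0, 0, 0, 0, 0])
+print(toy_counts / toy_counts.sum())  # [0.4 0. 0. 0.6 0. 0. 0. 0. 0.]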
+# let's use both encoding techniques and check which one works better
+
+# one-hot encoding of the Gene feature
+
+gene_vectorizer = CountVectorizer()
+train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(train_df['Gene'])
+test_gene_feature_onehotCoding = gene_vectorizer.transform(test_df['Gene'])
+cv_gene_feature_onehotCoding = gene_vectorizer.transform(cv_df['Gene'])
+
+train_gene_feature_onehotCoding.shape
+# (2124, 234)
+
+# response encoding of the Gene feature
+
+# code for response coding with Laplace smoothing.
+# alpha : used for Laplace smoothing
+# feature: ['Gene', 'Variation']
+# df: ['train_df', 'test_df', 'cv_df']
+# algorithm
+# ----------
+# consider all unique values and the number of occurrences of the given feature in the train dataframe
+# build a vector (1*9); the first element = (number of times it occurred in class1 + 10*alpha) / (number of times it occurred in the whole data + 90*alpha)
+# gv_dict is like a look-up table; for every gene it stores a (1*9) representation of it
+# for a value of the feature in df:
+#     if it is in the train data:
+#         we add the vector that was stored in the 'gv_dict' look-up table to 'gv_fea'
+#     if it is not there in train:
+#         we add [1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9] to 'gv_fea'
+# return 'gv_fea'
+# ----------------------
+
+# get_gv_fea_dict: Get Gene variation Feature Dict
+def get_gv_fea_dict(alpha, feature, df):
+    # value_count: it contains a dict like
+    # print(train_df['Gene'].value_counts())
+    # output:
+    # {BRCA1    174
+    #  TP53     106
+    #  EGFR      86
+    #  BRCA2     75
+    #  PTEN      69
+    #  KIT       61
+    #  BRAF      60
+    #  ERBB2     47
+    #  PDGFRA    46
+    #  ...}
+    # print(train_df['Variation'].value_counts())
+    # output:
+    # {
+    # Truncating_Mutations    63
+    # Deletion                43
+    # Amplification           43
+    # Fusions                 22
+    # Overexpression           3
+    # E17K                     3
+    # Q61L                     3
+    # S222D                    2
+    # P130S                    2
+    # ...
+    # }
+    value_count = train_df[feature].value_counts()
+
+    # gv_dict : Gene Variation Dict, which contains the probability array for each gene/variation
+    gv_dict = dict()
+
+    # denominator will contain the number of times that particular feature value occurred in the whole (train) data
+    for i, denominator in value_count.items():
+        # vec will contain p(yi==k|Gi), the probability that the gene/variation belongs to a particular class
+        # vec is a 9-dimensional vector
+        vec = []
+        for k in range(1, 10):
+            # print(train_df.loc[(train_df['Class']==1) & (train_df['Gene']=='BRCA1')])
+            #        ID   Gene  Variation  Class
+            # 2470  2470  BRCA1  S1715C     1
+            # 2486  2486  BRCA1  S1841R     1
+            # 2614  2614  BRCA1  M1R        1
+            # 2432  2432  BRCA1  L1657P     1
+            # 2567  2567  BRCA1  T1685A     1
+            # 2583  2583  BRCA1  E1660G     1
+            # 2634  2634  BRCA1  W1718L     1
+            # cls_cnt.shape[0] will return the number of rows
+
+            cls_cnt = train_df.loc[(train_df['Class']==k) & (train_df[feature]==i)]
+
+            # cls_cnt.shape[0] (the numerator) is the number of times this feature value occurred in class k
+            vec.append((cls_cnt.shape[0] + alpha*10) / (denominator + 90*alpha))
+
+        # we add the gene/variation to the dict as key and vec as value
+        gv_dict[i] = vec
+    return gv_dict
+
+# Get Gene variation feature
+def get_gv_feature(alpha, feature, df):
+    # print(gv_dict)
+    # {'BRCA1': [0.20075757575757575, 0.03787878787878788, 0.068181818181818177, 0.13636363636363635, 0.25, 0.19318181818181818, 0.03787878787878788, 0.03787878787878788, 0.03787878787878788],
+    #  'TP53': [0.32142857142857145, 0.061224489795918366, 0.061224489795918366, 0.27040816326530615, 0.061224489795918366, 0.066326530612244902, 0.051020408163265307, 0.051020408163265307, 0.056122448979591837],
+    #  'EGFR': [0.056818181818181816, 0.21590909090909091, 0.0625, 0.068181818181818177, 0.068181818181818177, 0.0625, 0.34659090909090912, 0.0625, 0.056818181818181816],
+    #  'BRCA2': [0.13333333333333333, 0.060606060606060608, 0.060606060606060608, 0.078787878787878782, 0.1393939393939394, 0.34545454545454546, 0.060606060606060608, 0.060606060606060608, 0.060606060606060608],
+    #  'PTEN': [0.069182389937106917, 0.062893081761006289, 0.069182389937106917, 0.46540880503144655, 0.075471698113207544, 0.062893081761006289, 0.069182389937106917, 0.062893081761006289, 0.062893081761006289],
+    #  'KIT': [0.066225165562913912, 0.25165562913907286, 0.072847682119205295, 0.072847682119205295, 0.066225165562913912, 0.066225165562913912, 0.27152317880794702, 0.066225165562913912, 0.066225165562913912],
+    #  'BRAF': [0.066666666666666666, 0.17999999999999999, 0.073333333333333334, 0.073333333333333334, 0.093333333333333338, 0.080000000000000002, 0.29999999999999999, 0.066666666666666666, 0.066666666666666666],
+    #  ...
+    # }
+    gv_dict = get_gv_fea_dict(alpha, feature, df)
+    # value_count is the same as in get_gv_fea_dict
+    value_count = train_df[feature].value_counts()
+
+    # gv_fea: Gene_variation feature; it will contain the feature vector for each feature value in the data
+    gv_fea = []
+    # for every feature value in the given dataframe, if it is present in the train data we add its vector to gv_fea;
+    # if not, we add [1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9] to gv_fea
+    for index, row in df.iterrows():
+        if row[feature] in dict(value_count).keys():
+            gv_fea.append(gv_dict[row[feature]])
+        else:
+            gv_fea.append([1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9])
+#           gv_fea.append([-1,-1,-1,-1,-1,-1,-1,-1,-1])
+    return gv_fea
+
+# response coding of the Gene feature
+# alpha is used for Laplace smoothing
+alpha = 1
+# train gene feature
+train_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", train_df))
+# test gene feature
+test_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", test_df))
+# cross validation gene feature
+cv_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", cv_df))
+
+'''
+how do we test whether Gene is a good feature for the model?
+by comparing against the random-model log loss:
+build a logistic regression model with only the Gene feature and check its log loss;
+if it is lower than the random model's, then Gene is a good feature for the model.
+
+in order to do that:
+
+Laplace smoothing
+
+* in naive Bayes we use Laplace smoothing, e.g. when a test query contains a word that is not present in the training set
+* what do we do with the new word?
+* can we drop it? we could, but dropping effectively treats its likelihood as 1, which should not be the case
+* can we assign it the value 0? then the whole product of probabilities becomes zero
+
+* so we use Laplace smoothing (also called additive smoothing), i.e. we add alpha (a hyperparameter) to the numerator and alpha*K (K = number of classes) to the denominator
+* alpha = 1 in general; it can change, since it is a hyperparameter
+* alpha should be neither too small nor too large --> bias-variance trade-off
+
+CALIBRATED CLASSIFIER --> for getting results in probability form, to be used for log loss
+
+SGD --> for optimization
+
+'''
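+# Worked smoothing example (the same toy gene as earlier): with alpha = 1,
+# a gene seen 10 times in train, 4 of them in class 1, gets
+# p(class 1) = (4 + 10*1) / (10 + 90*1) = 14/100 = 0.14 instead of the
+# unsmoothed 0.4, and a class with zero counts gets 10/100 = 0.1 instead of 0.
+# A gene never seen in train falls back to the uniform 1/9.
+print((4 + 10*1) / (10 + 90*1), (0 + 10*1) / (10 + 90*1))  # 0.14 0.1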
+# using 1-hot encoding, because high dimensionality is fine for a logistic-regression-style algorithm
+
+# We need a hyperparameter for the SGD classifier.
+alpha = [10 ** x for x in range(-5, 1)]
+
+# SGD classifier
+cv_log_error_array = []
+for i in alpha:
+    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
+    clf.fit(train_gene_feature_onehotCoding, y_train)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_gene_feature_onehotCoding, y_train)
+    predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
+    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+    print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+
+# alpha = 0.001 gives the minimal log loss
+
+# Let's plot the same to check the best alpha value
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+# Let's use the best alpha value as seen from the above graph and compute the log loss
+best_alpha = np.argmin(cv_log_error_array)
+clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_gene_feature_onehotCoding, y_train)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_gene_feature_onehotCoding, y_train)
+
+predict_y = sig_clf.predict_proba(train_gene_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_gene_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# the log loss is lower than the random model's
+# how stable is the model?
+# a gene may occur rarely in train but often in test; the fraction of shared values is called the overlap
+# the overlap should be high
+# though we're using Laplace smoothing for unseen values, relying on it is not a great idea
+# the data and model will be stable if we have good overlap
+
+# check how many values overlap between train and test, and between train and CV
+
+test_coverage = test_df[test_df['Gene'].isin(list(set(train_df['Gene'])))].shape[0]
+cv_coverage = cv_df[cv_df['Gene'].isin(list(set(train_df['Gene'])))].shape[0]
+
+print('1. In test data', test_coverage, 'out of', test_df.shape[0], ":", (test_coverage/test_df.shape[0])*100)
+print('2. In cross validation data', cv_coverage, 'out of ', cv_df.shape[0], ":", (cv_coverage/cv_df.shape[0])*100)
+
+# EVALUATING THE VARIATION FEATURE
+
+# Variation is also a categorical variable, so we have to treat it the same way as the Gene column.
+# We will again get the one-hot encoded and response-encoded variables for the Variation column.
+
+unique_variations = train_df['Variation'].value_counts()
+print('Number of Unique Variations :', unique_variations.shape[0])
+# the top 10 variations that occurred most
+print(unique_variations.head(10))
+
+# cumulative distribution of unique variation values
+
+s = sum(unique_variations.values)
+h = unique_variations.values/s
+c = np.cumsum(h)
+print(c)
+plt.plot(c, label='Cumulative distribution of Variations')
+plt.grid()
+plt.legend()
+plt.show()
+
+# one-hot encoding of the Variation feature
+variation_vectorizer = CountVectorizer()
+train_variation_feature_onehotCoding = variation_vectorizer.fit_transform(train_df['Variation'])
+test_variation_feature_onehotCoding = variation_vectorizer.transform(test_df['Variation'])
+cv_variation_feature_onehotCoding = variation_vectorizer.transform(cv_df['Variation'])
+
+train_variation_feature_onehotCoding.shape
+
+# response encoding of the Variation feature
+# alpha is used for Laplace smoothing
+alpha = 1
+# train variation feature
+train_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", train_df))
+# test variation feature
+test_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", test_df))
+# cross validation variation feature
+cv_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", cv_df))
+
+train_variation_feature_responseCoding.shape
+
+# We need a hyperparameter for the SGD classifier.
+alpha = [10 ** x for x in range(-5, 1)]
+
+# SGD classifier
+cv_log_error_array = []
+for i in alpha:
+    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
+    clf.fit(train_variation_feature_onehotCoding, y_train)
+
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_variation_feature_onehotCoding, y_train)
+    predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)
+
+    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+    print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+
+# Let's plot the same to check the best alpha value
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_variation_feature_onehotCoding, y_train)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_variation_feature_onehotCoding, y_train)
+
+predict_y = sig_clf.predict_proba(train_variation_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_variation_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# at alpha = 0.0001 the test log loss is lower than the random model's, so it is good to have the Variation feature
+
+test_coverage = test_df[test_df['Variation'].isin(list(set(train_df['Variation'])))].shape[0]
+cv_coverage = cv_df[cv_df['Variation'].isin(list(set(train_df['Variation'])))].shape[0]
+
+print('1. In test data', test_coverage, 'out of', test_df.shape[0], ":", (test_coverage/test_df.shape[0])*100)
+print('2. In cross validation data', cv_coverage, 'out of ', cv_df.shape[0], ":", (cv_coverage/cv_df.shape[0])*100)
+
+# stability (overlap) is low for the Variation feature; since the log loss is low, let's keep it
+
+# EVALUATING THE TEXT COLUMN
+
+# cls_text is a dataframe
+# for every row in the dataframe, consider the 'TEXT'
+# split the words by space
+# make a dict with those words
+# increment the count whenever we see a word
+
+# to get word counts
+def extract_dictionary_paddle(cls_text):
+    dictionary = defaultdict(int)
+    for index, row in cls_text.iterrows():
+        for word in row['TEXT'].split():
+            dictionary[word] += 1
+    return dictionary
+
+# https://stackoverflow.com/a/1602964
+def get_text_responsecoding(df):
+    text_feature_responseCoding = np.zeros((df.shape[0], 9))
+    for i in range(0, 9):
+        row_index = 0
+        for index, row in df.iterrows():
+            sum_prob = 0
+            for word in row['TEXT'].split():
+                sum_prob += math.log(((dict_list[i].get(word, 0)+10) / (total_dict.get(word, 0)+90)))
+            text_feature_responseCoding[row_index][i] = math.exp(sum_prob/len(row['TEXT'].split()))
+            row_index += 1
+    return text_feature_responseCoding
+
+# building a CountVectorizer with all the words that occurred at least 3 times in the train data
+text_vectorizer = CountVectorizer(min_df=3)
+train_text_feature_onehotCoding = text_vectorizer.fit_transform(train_df['TEXT'])
+# getting all the feature names (words)
+train_text_features = text_vectorizer.get_feature_names()
+
+# train_text_feature_onehotCoding.sum(axis=0).A1 sums each column (word) and returns a (1*number of features) vector
+train_text_fea_counts = train_text_feature_onehotCoding.sum(axis=0).A1
+
+# zip(list(train_text_features), train_text_fea_counts) will zip each word with the number of times it occurred
+text_fea_dict = dict(zip(list(train_text_features), train_text_fea_counts))
+
+print("Total number of unique words in train data :", len(train_text_features))
+
+
+dict_list = []
+# dict_list contains 9 dictionaries, each corresponding to a class
+for i in range(1, 10):
+    cls_text = train_df[train_df['Class']==i]
+    # build a word dict based on the words in that class
+    dict_list.append(extract_dictionary_paddle(cls_text))
+    # append it to dict_list
+
+# dict_list[i] is built on the i'th class's text data
+# total_dict is built on the whole training text data
+total_dict = extract_dictionary_paddle(train_df)
+
+
+confuse_array = []
+for i in train_text_features:
+    ratios = []
+    max_val = -1
+    for j in range(0, 9):
+        ratios.append((dict_list[j][i]+10) / (total_dict[i]+90))
+    confuse_array.append(ratios)
+confuse_array = np.array(confuse_array)
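+# What get_text_responsecoding computes, written out as a formula (a sketch of
+# the intent, matching the code above): for class i and a text with words
+# w_1..w_n, using the smoothed per-class word likelihood
+#     p(w|class_i) = (count of w in class_i texts + 10) / (count of w overall + 90)
+# the feature is the geometric mean of the word likelihoods:
+#     feature_i = exp( (1/n) * sum_j log p(w_j|class_i) )
+# e.g. a word seen 5 times in class-3 texts and 50 times overall contributes
+# p = (5+10)/(50+90) to class 3's average:
+print((5+10)/(50+90))  # ~0.107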
+
+# response coding of the text features
+train_text_feature_responseCoding = get_text_responsecoding(train_df)
+test_text_feature_responseCoding = get_text_responsecoding(test_df)
+cv_text_feature_responseCoding = get_text_responsecoding(cv_df)
+
+# https://stackoverflow.com/a/16202486
+# we convert each row's values such that they sum to 1
+train_text_feature_responseCoding = (train_text_feature_responseCoding.T/train_text_feature_responseCoding.sum(axis=1)).T
+test_text_feature_responseCoding = (test_text_feature_responseCoding.T/test_text_feature_responseCoding.sum(axis=1)).T
+cv_text_feature_responseCoding = (cv_text_feature_responseCoding.T/cv_text_feature_responseCoding.sum(axis=1)).T
+
+# don't forget to normalize every feature
+train_text_feature_onehotCoding = normalize(train_text_feature_onehotCoding, axis=0)
+
+# we use the same vectorizer that was trained on the train data
+test_text_feature_onehotCoding = text_vectorizer.transform(test_df['TEXT'])
+# don't forget to normalize every feature
+test_text_feature_onehotCoding = normalize(test_text_feature_onehotCoding, axis=0)
+
+# we use the same vectorizer that was trained on the train data
+cv_text_feature_onehotCoding = text_vectorizer.transform(cv_df['TEXT'])
+# don't forget to normalize every feature
+cv_text_feature_onehotCoding = normalize(cv_text_feature_onehotCoding, axis=0)
+
+# https://stackoverflow.com/a/2258273/4084039
+sorted_text_fea_dict = dict(sorted(text_fea_dict.items(), key=lambda x: x[1], reverse=True))
+sorted_text_occur = np.array(list(sorted_text_fea_dict.values()))
+
+# Number of words for a given frequency.
+print(Counter(sorted_text_occur))
+
+# build the model with only the text column
+
+cv_log_error_array = []
+for i in alpha:
+    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
+    clf.fit(train_text_feature_onehotCoding, y_train)
+
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_text_feature_onehotCoding, y_train)
+    predict_y = sig_clf.predict_proba(cv_text_feature_onehotCoding)
+    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+    print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+
+# the log loss is lower than the random model's
+
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_text_feature_onehotCoding, y_train)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_text_feature_onehotCoding, y_train)
+
+predict_y = sig_clf.predict_proba(train_text_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_text_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_text_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# check the overlap of the text data
+
+def get_intersec_text(df):
+    df_text_vec = CountVectorizer(min_df=3)
+    df_text_fea = df_text_vec.fit_transform(df['TEXT'])
+    df_text_features = df_text_vec.get_feature_names()
+
+    df_text_fea_counts = df_text_fea.sum(axis=0).A1
+    df_text_fea_dict = dict(zip(list(df_text_features), df_text_fea_counts))
+    len1 = len(set(df_text_features))
+    len2 = len(set(train_text_features) & set(df_text_features))
+    return len1, len2
+
+len1, len2 = get_intersec_text(test_df)
+print(np.round((len2/len1)*100, 3), "% of the words in the test data appeared in the train data")
+len1, len2 = get_intersec_text(cv_df)
+print(np.round((len2/len1)*100, 3), "% of the words in the CV data appeared in the train data")
+
+# the text column has good stability (overlap)
+
+# therefore, all 3 features are important for model building
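+# Side note: TfidfVectorizer is imported at the top but never used. If we
+# wanted tf-idf weighting instead of raw counts for the text feature, a
+# minimal sketch (assuming the same min_df=3 cutoff) would be:
+#
+# tfidf_vectorizer = TfidfVectorizer(min_df=3)
+# train_text_feature_tfidf = tfidf_vectorizer.fit_transform(train_df['TEXT'])
+# test_text_feature_tfidf = tfidf_vectorizer.transform(test_df['TEXT'])
+# cv_text_feature_tfidf = tfidf_vectorizer.transform(cv_df['TEXT'])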
+
+# Data preparation for the machine learning models
+
+# a few utility functions
+
+def report_log_loss(train_x, train_y, test_x, test_y, clf):
+    clf.fit(train_x, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x, train_y)
+    sig_clf_probs = sig_clf.predict_proba(test_x)
+    return log_loss(test_y, sig_clf_probs, eps=1e-15)
+
+# This function plots the confusion matrices given y_i, y_i_hat.
+def plot_confusion_matrix(test_y, predict_y):
+    C = confusion_matrix(test_y, predict_y)
+
+    # recall matrix: divide each element of the confusion matrix by the sum of its row
+    A = (((C.T)/(C.sum(axis=1))).T)
+
+    # precision matrix: divide each element of the confusion matrix by the sum of its column
+    B = (C/C.sum(axis=0))
+    labels = [1,2,3,4,5,6,7,8,9]
+    # representing C in heatmap format
+    print("-"*20, "Confusion matrix", "-"*20)
+    plt.figure(figsize=(20,7))
+    sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+    plt.xlabel('Predicted Class')
+    plt.ylabel('Original Class')
+    plt.show()
+
+    # representing B in heatmap format
+    print("-"*20, "Precision matrix (Column Sum=1)", "-"*20)
+    plt.figure(figsize=(20,7))
+    sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+    plt.xlabel('Predicted Class')
+    plt.ylabel('Original Class')
+    plt.show()
+
+    # representing A in heatmap format
+    print("-"*20, "Recall matrix (Row sum=1)", "-"*20)
+    plt.figure(figsize=(20,7))
+    sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+    plt.xlabel('Predicted Class')
+    plt.ylabel('Original Class')
+    plt.show()
+
+
+def predict_and_plot_confusion_matrix(train_x, train_y, test_x, test_y, clf):
+    clf.fit(train_x, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x, train_y)
+    pred_y = sig_clf.predict(test_x)
+
+    # for calculating log_loss we will provide the array of probabilities for each class
+    print("Log loss :", log_loss(test_y, sig_clf.predict_proba(test_x)))
+    # calculating the fraction of data points that are misclassified
+    print("Fraction of mis-classified points :", np.count_nonzero((pred_y - test_y))/test_y.shape[0])
+    plot_confusion_matrix(test_y, pred_y)
+
+
+# this function will be used just for naive Bayes
+# for the given indices, we print the names of the features
+# and we check whether each feature is present in the test point's text or not
+def get_impfeature_names(indices, text, gene, var, no_features):
+    gene_count_vec = CountVectorizer()
+    var_count_vec = CountVectorizer()
+    text_count_vec = CountVectorizer(min_df=3)
+
+    gene_vec = gene_count_vec.fit(train_df['Gene'])
+    var_vec = var_count_vec.fit(train_df['Variation'])
+    text_vec = text_count_vec.fit(train_df['TEXT'])
+
+    fea1_len = len(gene_vec.get_feature_names())
+    fea2_len = len(var_count_vec.get_feature_names())
+
+    word_present = 0
+    for i, v in enumerate(indices):
+        if (v < fea1_len):
+            word = gene_vec.get_feature_names()[v]
+            yes_no = True if word == gene else False
+            if yes_no:
+                word_present += 1
+                print(i, "Gene feature [{}] present in test data point [{}]".format(word, yes_no))
+        elif (v < fea1_len+fea2_len):
+            word = var_vec.get_feature_names()[v-(fea1_len)]
+            yes_no = True if word == var else False
+            if yes_no:
+                word_present += 1
+                print(i, "variation feature [{}] present in test data point [{}]".format(word, yes_no))
+        else:
+            word = text_vec.get_feature_names()[v-(fea1_len+fea2_len)]
+            yes_no = True if word in text.split() else False
+            if yes_no:
+                word_present += 1
+                print(i, "Text feature [{}] present in test data point [{}]".format(word, yes_no))
+
+    print("Out of the top ", no_features, " features ", word_present, "are present in the query point")
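+# Quick sanity check of the precision/recall normalizations above, with a toy
+# 2-class confusion matrix (illustrative only; the real matrices are 9x9):
+# C = [[8, 2],   rows = original class, columns = predicted class
+#      [4, 6]]
+C_toy = np.array([[8, 2], [4, 6]])
+print(C_toy / C_toy.sum(axis=0))        # columns sum to 1 (precision matrix)
+print((C_toy.T / C_toy.sum(axis=1)).T)  # rows sum to 1 (recall matrix)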
+
+## Combining all 3 features together
+
+# merging the gene, variation and text features
+
+# building the train, test and cross-validation datasets
+# a = [[1, 2],
+#      [3, 4]]
+# b = [[4, 5],
+#      [6, 7]]
+# hstack(a, b) = [[1, 2, 4, 5],
+#                 [3, 4, 6, 7]]
+
+train_gene_var_onehotCoding = hstack((train_gene_feature_onehotCoding, train_variation_feature_onehotCoding))
+test_gene_var_onehotCoding = hstack((test_gene_feature_onehotCoding, test_variation_feature_onehotCoding))
+cv_gene_var_onehotCoding = hstack((cv_gene_feature_onehotCoding, cv_variation_feature_onehotCoding))
+
+train_x_onehotCoding = hstack((train_gene_var_onehotCoding, train_text_feature_onehotCoding)).tocsr()
+train_y = np.array(list(train_df['Class']))
+
+test_x_onehotCoding = hstack((test_gene_var_onehotCoding, test_text_feature_onehotCoding)).tocsr()
+test_y = np.array(list(test_df['Class']))
+
+cv_x_onehotCoding = hstack((cv_gene_var_onehotCoding, cv_text_feature_onehotCoding)).tocsr()
+cv_y = np.array(list(cv_df['Class']))
+
+
+train_gene_var_responseCoding = np.hstack((train_gene_feature_responseCoding, train_variation_feature_responseCoding))
+test_gene_var_responseCoding = np.hstack((test_gene_feature_responseCoding, test_variation_feature_responseCoding))
+cv_gene_var_responseCoding = np.hstack((cv_gene_feature_responseCoding, cv_variation_feature_responseCoding))
+
+train_x_responseCoding = np.hstack((train_gene_var_responseCoding, train_text_feature_responseCoding))
+test_x_responseCoding = np.hstack((test_gene_var_responseCoding, test_text_feature_responseCoding))
+cv_x_responseCoding = np.hstack((cv_gene_var_responseCoding, cv_text_feature_responseCoding))
+
+
+print("One hot encoding features :")
+print("(number of data points * number of features) in train data = ", train_x_onehotCoding.shape)
+print("(number of data points * number of features) in test data = ", test_x_onehotCoding.shape)
+print("(number of data points * number of features) in cross validation data =", cv_x_onehotCoding.shape)
+
+print(" Response encoding features :")
+print("(number of data points * number of features) in train data = ", train_x_responseCoding.shape)
+print("(number of data points * number of features) in test data = ", test_x_responseCoding.shape)
+print("(number of data points * number of features) in cross validation data =", cv_x_responseCoding.shape)
+
+# Building the machine learning models
+
+# Naive Bayes
+# http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
+alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
+cv_log_error_array = []
+for i in alpha:
+    print("for alpha =", i)
+    clf = MultinomialNB(alpha=i)
+    clf.fit(train_x_onehotCoding, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x_onehotCoding, train_y)
+    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
+    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
+    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
+    print("Log Loss :", log_loss(cv_y, sig_clf_probs))
+
+fig, ax = plt.subplots()
+ax.plot(np.log10(alpha), cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], str(txt)), (np.log10(alpha[i]), cv_log_error_array[i]))
+plt.grid()
+plt.xticks(np.log10(alpha))
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = MultinomialNB(alpha=alpha[best_alpha])
+clf.fit(train_x_onehotCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_onehotCoding, train_y)
+
+predict_y = sig_clf.predict_proba(train_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# Testing our naive Bayes model with the best found value of alpha on the cross-validation data
+clf = MultinomialNB(alpha=alpha[best_alpha])
+clf.fit(train_x_onehotCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_onehotCoding, train_y)
+sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
+# to avoid rounding errors while multiplying probabilities we use log-probability estimates
+print("Log Loss :", log_loss(cv_y, sig_clf_probs))
+print("Fraction of misclassified points :", np.count_nonzero((sig_clf.predict(cv_x_onehotCoding) - cv_y))/cv_y.shape[0])
+plot_confusion_matrix(cv_y, sig_clf.predict(cv_x_onehotCoding.toarray()))
+
+# interpretability of the model
+
+test_point_index = 1
+no_feature = 100
+predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
+print("Predicted Class :", predicted_cls[0])
+print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
+print("Actual Class :", test_y[test_point_index])
+indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
+print("-"*50)
+get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)
+
+# one more example
+
+test_point_index = 100
+no_feature = 100
+predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
+print("Predicted Class :", predicted_cls[0])
+print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
+print("Actual Class :", test_y[test_point_index])
+indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
+print("-"*50)
+get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)
+
+# So naive Bayes is not performing very badly, but let's look at other models
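+# A note on the feature-importance idiom used above: clf.coef_ has shape
+# (9, n_features) -- one row of coefficients per class. np.argsort(-clf.coef_)
+# sorts each row's feature indices in descending order of coefficient,
+# [predicted_cls-1] picks the row of the predicted class (classes are 1-9,
+# rows are 0-8), and [:, :no_feature] keeps the top no_feature indices.
+# Toy illustration:
+toy_coef = np.array([[0.1, 0.9, 0.5]])  # 1 class, 3 features
+print(np.argsort(-toy_coef)[:, :2])     # [[1 2]] -> top-2 feature indices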
+
+# Logistic Regression
+
+# balancing all classes
+
+alpha = [10 ** x for x in range(-6, 3)]
+cv_log_error_array = []
+for i in alpha:
+    print("for alpha =", i)
+    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='log', random_state=42)
+    clf.fit(train_x_onehotCoding, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x_onehotCoding, train_y)
+    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
+    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
+    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
+    print("Log Loss :", log_loss(cv_y, sig_clf_probs))
+
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], str(txt)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_x_onehotCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_onehotCoding, train_y)
+
+predict_y = sig_clf.predict_proba(train_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# Let's test it on the cross-validation data using the best alpha value
+
+clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+predict_and_plot_confusion_matrix(train_x_onehotCoding, train_y, cv_x_onehotCoding, cv_y, clf)
+
+
+# feature importance
+
+def get_imp_feature_names(text, indices, removed_ind=[]):
+    word_present = 0
+    tabulte_list = []
+    incresingorder_ind = 0
+    for i in indices:
+        if i < train_gene_feature_onehotCoding.shape[1]:
+            tabulte_list.append([incresingorder_ind, "Gene", "Yes"])
+        elif i < 18:  # note: 17/18 are hard-coded offsets for the variation features here
+            tabulte_list.append([incresingorder_ind, "Variation", "Yes"])
+        if ((i > 17) & (i not in removed_ind)):
+            word = train_text_features[i]
+            yes_no = True if word in text.split() else False
+            if yes_no:
+                word_present += 1
+            tabulte_list.append([incresingorder_ind, train_text_features[i], yes_no])
+        incresingorder_ind += 1
+    print(word_present, "most important features are present in our query point")
+    print("-"*50)
+    print("The features that are most important for the ", predicted_cls[0], " class:")
+    print(tabulate(tabulte_list, headers=["Index", 'Feature name', 'Present or Not']))
+
+# testing a query point and doing interpretability
+
+from tabulate import tabulate
+clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_x_onehotCoding, train_y)
+test_point_index = 1
+no_feature = 500
+predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
+print("Predicted Class :", predicted_cls[0])
+print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
+print("Actual Class :", test_y[test_point_index])
+indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
+print("-"*50)
+get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)
+
+test_point_index = 100
+no_feature = 500
+predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
+print("Predicted Class :", predicted_cls[0])
+print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
+print("Actual Class :", test_y[test_point_index])
+indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
+print("-"*50)
+get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)
+
+# K Nearest Neighbour Classification
+
+# here 'alpha' holds the candidate values of k (the number of neighbours)
+alpha = [5, 11, 15, 21, 31, 41, 51, 99]
+cv_log_error_array = []
+for i in alpha:
+    print("for alpha =", i)
+    clf = KNeighborsClassifier(n_neighbors=i)
+    clf.fit(train_x_responseCoding, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x_responseCoding, train_y)
+    sig_clf_probs = sig_clf.predict_proba(cv_x_responseCoding)
+    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
+    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
+    print("Log Loss :", log_loss(cv_y, sig_clf_probs))
+
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], str(txt)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
+clf.fit(train_x_responseCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_responseCoding, train_y)
+
+predict_y = sig_clf.predict_proba(train_x_responseCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_x_responseCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_x_responseCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
+predict_and_plot_confusion_matrix(train_x_responseCoding, train_y, cv_x_responseCoding, cv_y, clf)
+
+# Let's look at a few test points
+clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
+clf.fit(train_x_responseCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_responseCoding, train_y)
+
+test_point_index = 1
+predicted_cls = sig_clf.predict(test_x_responseCoding[test_point_index].reshape(1, -1))
+print("Predicted Class :", predicted_cls[0])
+print("Actual Class :", test_y[test_point_index])
+# clf.kneighbors returns (distances, indices); neighbors[1][0] holds the train indices of the k nearest points
+neighbors = clf.kneighbors(test_x_responseCoding[test_point_index].reshape(1, -1), alpha[best_alpha])
+print("The ", alpha[best_alpha], " nearest neighbours of the test point belong to classes", train_y[neighbors[1][0]])
+print("Frequency of nearest points :", Counter(train_y[neighbors[1][0]]))
+
+clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
+clf.fit(train_x_responseCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_responseCoding, train_y)
+
+test_point_index = 100
+
+predicted_cls = sig_clf.predict(test_x_responseCoding[test_point_index].reshape(1, -1))
+print("Predicted Class :", predicted_cls[0])
+print("Actual Class :", test_y[test_point_index])
+neighbors = clf.kneighbors(test_x_responseCoding[test_point_index].reshape(1, -1), alpha[best_alpha])
+print("the k value for knn is", alpha[best_alpha], "and the nearest neighbours of the test point belong to classes", train_y[neighbors[1][0]])
+print("Fequency of nearest points :",Counter(train_y[neighbors[1][0]])) + +# without balancing data for logistic regression .01 difference in log loss + +# linear SVM dint reduce w.r.t log loss + +# random forest classifier with 1-hot encoding and response encoding (was worse than 1-hot encoding) dint reduce w.r.t log loss + +# stacking model dint reduce w.r.t log loss + +# logistic regression with balancing and 1-hot encoding has got the lowest log loss + + +