--- a/run.py
+++ b/run.py
@@ -0,0 +1,1197 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import time
+import math
+import warnings
+import numpy as np
+import seaborn as sns
+from collections import Counter, defaultdict
+from nltk.corpus import stopwords
+from scipy.sparse import hstack
+from imblearn.over_sampling import SMOTE
+from sklearn.decomposition import TruncatedSVD
+from sklearn.preprocessing import normalize
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.manifold import TSNE
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.metrics import confusion_matrix, accuracy_score, log_loss, normalized_mutual_info_score
+from sklearn.linear_model import SGDClassifier, LogisticRegression
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.svm import SVC
+from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
+from sklearn import model_selection
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.naive_bayes import MultinomialNB, GaussianNB
+from sklearn.ensemble import RandomForestClassifier
+from mlxtend.classifier import StackingClassifier
+warnings.filterwarnings("ignore")
+
+# Loading training_variants: a comma-separated file
+data_variants = pd.read_csv('training/training_variants')
+# Loading training_text: fields are separated by ||
+data_text = pd.read_csv("training/training_text", sep="\|\|", engine="python", names=["ID", "TEXT"], skiprows=1)
+
+# basic look at data_variants
+
+data_variants.head()
+# to classify cancer, we need the gene, the variation and the research paper (text)
+# ID --> common key in both data_variants & data_text, used for linking (merging the dataframes)
+# Gene --> genetic mutation location
+# Variation --> amino-acid change for the respective mutation
+# Class --> 1-9, the target classes the genetic mutation has been classified into
+# in data_variants the text is missing; the text is present in the data_text dataframe
+
+data_variants.info()
+
+data_variants.describe()
+
+data_variants.columns
+
+data_variants.shape
+
+# basic look at data_text
+
+data_text.head()
+# TEXT --> huge text, because each entry is a research paper
+
+data_text.info()
+# missing values exist
+
+data_text.columns
+
+data_text.shape
+
+# target classes
+data_variants.Class.unique()
+# multiclass classification problem
+
+'''
+important things before modelling:
+
+1. Since this is a medical problem, errors are costly,
+   so the model has to return its outcome as a probability.
+   p(y=2|a1) = 0.99 --> OK
+   p(y=2|a1) = 0.5  --> NOT OK, because the probability is ambiguous. In this case we have to look for more evidence or reasons.
+
+2. Interpretability:
+
+   why 0.5? why 0.99? provide evidence for the probability value
+
+   * a few ML models are very good at interpretability, such as naive Bayes, decision trees, logistic regression and linear SVM
+   * a few ML models are not good at interpretability, such as RBF-kernel SVM
+
+3. Latency:
+
+   * OK to take a few seconds/minutes for the output
+   * NOT OK to take hours for the output
+
+'''
+# data_text contains the text (i.e. research papers); we cannot feed raw text to the model,
+# so convert it to a format the model understands
+
+# numbers, stopwords, multiple spaces and special chars are not relevant, so let's remove them
+
+stop_words = set(stopwords.words('english'))
+
+def data_text_preprocess(total_text, ind, col):
+    # Remove int values from text data as that might not be imp
+    if type(total_text) is not int:
+        string = ""
+        # replacing all special chars with space
+        total_text = re.sub('[^a-zA-Z0-9\n]', ' ', str(total_text))
+        # replacing multiple spaces with a single space
+        total_text = re.sub('\s+', ' ', str(total_text))
+        # bring the whole text to the same lower-case scale
+        total_text = total_text.lower()
+
+        for word in total_text.split():
+            # if the word is not a stop word, retain it
+            if not word in stop_words:
+                string += word + " "
+
+        data_text[col][ind] = string
+
+for index, row in data_text.iterrows():
+    if type(row['TEXT']) is str:
+        data_text_preprocess(row['TEXT'], index, 'TEXT')
+
+# data preprocessing is completed
+
+# let's merge the dataframes based on ID
+result = pd.merge(data_variants, data_text, on='ID', how='left')
+result.head()
+
+# check for missing values
+result[result.isnull().any(axis=1)]
+# 5 rows have missing values, so let's impute by concatenating the Gene + Variation values
+
+result.loc[result['TEXT'].isnull(), 'TEXT'] = result['Gene'] + ' ' + result['Variation']
+
+# cross check
+result[result.isnull().any(axis=1)]
+
+# Split the dataset into training, test & cross-validation data
+
+y_true = result['Class'].values
+result.Gene = result.Gene.str.replace('\s+', '_')
+result.Variation = result.Variation.str.replace('\s+', '_')
+
+# Splitting the data into train and test sets
+X_train, test_df, y_train, y_test = train_test_split(result, y_true, stratify=y_true, test_size=0.2)
+# split the train data now into train and cross-validation sets
+train_df, cv_df, y_train, y_cv = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)
+
+print('Number of data points in train data:', train_df.shape[0])
+print('Number of data points in test data:', test_df.shape[0])
+print('Number of data points in cross validation data:', cv_df.shape[0])
+
+# has the data been distributed properly?
+
+train_class_distribution = train_df['Class'].value_counts()
+test_class_distribution = test_df['Class'].value_counts()
+cv_class_distribution = cv_df['Class'].value_counts()
+
+# let's visualize
+
+# viz for the train class distribution
+
+my_colors = 'rgbkymc'
+train_class_distribution.plot(kind='bar')
+plt.xlabel('Class')
+plt.ylabel('Number of Data points per Class')
+plt.title('Distribution of yi in train data')
+plt.grid()
+plt.show()
+
+# let's look at it in percentage form
+
+sorted_yi = np.argsort(-train_class_distribution.values)
+for i in sorted_yi:
+    print('Number of data points in class', i+1, ':', train_class_distribution.values[i], '(', np.round((train_class_distribution.values[i]/train_df.shape[0]*100), 3), '%)')
+
+# viz for the test class distribution
+
+my_colors = 'rgbkymc'
+test_class_distribution.plot(kind='bar')
+plt.xlabel('Class')
+plt.ylabel('Number of Data points per Class')
+plt.title('Distribution of yi in test data')
+plt.grid()
+plt.show()
+
+# let's look at it in percentage form
+
+sorted_yi = np.argsort(-test_class_distribution.values)
+for i in sorted_yi:
+    print('Number of data points in class', i+1, ':', test_class_distribution.values[i], '(', np.round((test_class_distribution.values[i]/test_df.shape[0]*100), 3), '%)')
+
+# viz for the cross-validation set
+
+my_colors = 'rgbkymc'
+cv_class_distribution.plot(kind='bar')
+plt.xlabel('Class')
+plt.ylabel('Data points per Class')
+plt.title('Distribution of yi in cross validation data')
+plt.grid()
+plt.show()
+
+# let's look at it in percentage form
+
+sorted_yi = np.argsort(-cv_class_distribution.values)
+for i in sorted_yi:
+    print('Number of data points in class', i+1, ':', cv_class_distribution.values[i], '(', np.round((cv_class_distribution.values[i]/cv_df.shape[0]*100), 3), '%)')
+
+# all three splits must follow the same class distribution
+
+'''
+what is the evaluation metric?
+from the Kaggle page, multi-class log loss is the evaluation metric
+
+log loss?
+
+* the higher the predicted probability of the true class, the lower the log loss
+* if the prediction is ambiguous, log loss penalizes it
+* log loss = 0 --> perfect model, which is not the case in the real world
+* log loss lies in the range [0, infinity)
+
+1 problem:
+
+* for the accuracy metric: the range is 0 - 1, where 1 is the best and 0 is the worst
+* for the AUC metric: the range is 0 - 1, where 1 is the best and 0 is the worst
+
+BUT
+
+log loss values range between 0 and infinity
+
+how do we know whether a given value is good or bad?
+
+one method is:
+
+random model --> worst model --> compute log loss (ex: 3)
+our model                     --> compute log loss (ex: 1.5)
+
+a model at 1.5 is better than the random model at 3
+
+how do we generate a random model?
+
+* here we are working with probabilities
+* the summation of all class probabilities for a data point must be 1
+
+'''
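+# Worked example (illustrative numbers, not computed from this dataset):
+# if the true class gets probability 0.9, the per-point loss is -ln(0.9) ~= 0.105;
+# a uniform random model assigns ~1/9 to each of the 9 classes, so its per-point
+# loss is -ln(1/9) ~= 2.197 -- the same ballpark as the ~2.5 measured for the
+# random model below.
+print("confident correct prediction:", -np.log(0.9))   # ~0.105
+print("uniform 9-class prediction  :", -np.log(1/9))   # ~2.197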
+
+# build the random model
+
+# 9 classes exist, so create 9 random numbers such that their sum is equal to 1,
+# because the sum of the probabilities of all 9 classes must be equal to 1
+
+test_data_len = test_df.shape[0]
+cv_data_len = cv_df.shape[0]
+
+# CV-set error
+cv_predicted_y = np.zeros((cv_data_len, 9))
+for i in range(cv_data_len):
+    rand_probs = np.random.rand(1, 9)  # 9 random numbers in [0, 1)
+    cv_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])  # normalize so they sum to 1
+print("Log loss on Cross Validation Data using Random Model", log_loss(y_cv, cv_predicted_y, eps=1e-15))
+# Log loss on Cross Validation Data using Random Model 2.491130932423048
+
+# test-set error
+test_predicted_y = np.zeros((test_data_len, 9))
+for i in range(test_data_len):
+    rand_probs = np.random.rand(1, 9)
+    test_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
+print("Log loss on Test Data using Random Model", log_loss(y_test, test_predicted_y, eps=1e-15))
+# Log loss on Test Data using Random Model 2.531934856356048
+
+# Let's get the index of the max probability
+predicted_y = np.argmax(test_predicted_y, axis=1)
+
+# index values range from 0-8; our problem statement has classes 1-9, so shift by one
+predicted_y = predicted_y + 1
+
+# confusion matrix
+C = confusion_matrix(y_test, predicted_y)
+
+labels = [1,2,3,4,5,6,7,8,9]
+plt.figure(figsize=(20,7))
+sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+plt.xlabel('Predicted Class')
+plt.ylabel('Original Class')
+plt.show()
+
+# precision matrix (each column sums to 1)
+
+B = (C/C.sum(axis=0))
+
+plt.figure(figsize=(20,7))
+sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+plt.xlabel('Predicted Class')
+plt.ylabel('Original Class')
+plt.show()
+
+# recall matrix (each row sums to 1)
+
+A = (((C.T)/(C.sum(axis=1))).T)
+
+plt.figure(figsize=(20,7))
+sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+plt.xlabel('Predicted Class')
+plt.ylabel('Original Class')
+plt.show()
+
+# evaluate the Gene column
+
+# is the Gene column important for the target feature?
+# if so, convert the categories to numeric values
+
+unique_genes = train_df['Gene'].value_counts()
+print('Number of Unique Genes :', unique_genes.shape[0])
+# the top 10 genes that occurred most
+print(unique_genes.head(10))
+
+unique_genes.shape[0]
+
+# viz of the cumulative distribution of unique gene values
+
+s = sum(unique_genes.values)
+h = unique_genes.values/s
+c = np.cumsum(h)
+plt.plot(c, label='Cumulative distribution of Genes')
+plt.grid()
+plt.legend()
+plt.show()
+
+# the top ~50 genes contribute about 75% of the data
+
+'''
+let's encode:
+1. one-hot encoding: creates a lot of features --> a problem in some cases
+2. response encoding (mean imputation, ex: the average height of Indians is 5.5 ft)
+
+response encoding in this case would be:
+
+* for each row, we'll create 9 columns
+* the 9 columns hold the probability of each class given the gene value of that row
+'''
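+# Worked example of response coding (toy numbers, not from this dataset):
+# suppose a hypothetical gene "GENE_X" appears 10 times in the train data,
+# 4 of them in class 1 and 6 in class 4. Ignoring smoothing, its 9-dim vector
+# would be [0.4, 0, 0, 0.6, 0, 0, 0, 0, 0]; every row whose Gene is "GENE_X"
+# gets this same vector as its 9 features. The implementation below
+# additionally applies Laplace smoothing, so no entry is exactly 0.
+toy_counts = np.array([4, 0, 0, 6, 0, 0, 0, 0, 0])
+print(toy_counts / toy_counts.sum())  # [0.4 0. 0. 0.6 0. 0. 0. 0. 0.]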
+# let's use both encoding techniques and check which one works better
+
+# one-hot encoding of the Gene feature
+
+gene_vectorizer = CountVectorizer()
+train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(train_df['Gene'])
+test_gene_feature_onehotCoding = gene_vectorizer.transform(test_df['Gene'])
+cv_gene_feature_onehotCoding = gene_vectorizer.transform(cv_df['Gene'])
+
+train_gene_feature_onehotCoding.shape
+# (2124, 234)
+
+# response encoding of the Gene feature
+
+# code for response coding with Laplace smoothing.
+# alpha : used for Laplace smoothing
+# feature: ['Gene', 'Variation']
+# df: ['train_df', 'test_df', 'cv_df']
+# algorithm
+# ----------
+# consider all unique values and the number of occurrences of the given feature in the train dataframe
+# build a vector (1*9); the first element = (number of times it occurred in class1 + 10*alpha) / (number of times it occurred in the whole data + 90*alpha)
+# gv_dict is like a look-up table; for every gene it stores a (1*9) representation of it
+# for a value of the feature in df:
+#     if it is in the train data:
+#         we add the vector that was stored in the 'gv_dict' look-up table to 'gv_fea'
+#     if it is not there in train:
+#         we add [1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9] to 'gv_fea'
+# return 'gv_fea'
+# ----------------------
+
+# get_gv_fea_dict: Get Gene variation Feature Dict
+def get_gv_fea_dict(alpha, feature, df):
+    # value_count: it contains a dict like
+    # print(train_df['Gene'].value_counts())
+    # output:
+    # {BRCA1    174
+    #  TP53     106
+    #  EGFR      86
+    #  BRCA2     75
+    #  PTEN      69
+    #  KIT       61
+    #  BRAF      60
+    #  ERBB2     47
+    #  PDGFRA    46
+    #  ...}
+    # print(train_df['Variation'].value_counts())
+    # output:
+    # {
+    # Truncating_Mutations    63
+    # Deletion                43
+    # Amplification           43
+    # Fusions                 22
+    # Overexpression           3
+    # E17K                     3
+    # Q61L                     3
+    # S222D                    2
+    # P130S                    2
+    # ...
+    # }
+    value_count = train_df[feature].value_counts()
+
+    # gv_dict : Gene Variation Dict, which contains the probability array for each gene/variation
+    gv_dict = dict()
+
+    # denominator will contain the number of times that particular feature value occurred in the whole (train) data
+    for i, denominator in value_count.items():
+        # vec will contain p(yi==k|Gi), the probability that the gene/variation belongs to a particular class
+        # vec is a 9-dimensional vector
+        vec = []
+        for k in range(1, 10):
+            # print(train_df.loc[(train_df['Class']==1) & (train_df['Gene']=='BRCA1')])
+            #        ID   Gene  Variation  Class
+            # 2470  2470  BRCA1  S1715C     1
+            # 2486  2486  BRCA1  S1841R     1
+            # 2614  2614  BRCA1  M1R        1
+            # 2432  2432  BRCA1  L1657P     1
+            # 2567  2567  BRCA1  T1685A     1
+            # 2583  2583  BRCA1  E1660G     1
+            # 2634  2634  BRCA1  W1718L     1
+            # cls_cnt.shape[0] will return the number of rows
+
+            cls_cnt = train_df.loc[(train_df['Class']==k) & (train_df[feature]==i)]
+
+            # cls_cnt.shape[0] (the numerator) is the number of times this feature value occurred in class k
+            vec.append((cls_cnt.shape[0] + alpha*10) / (denominator + 90*alpha))
+
+        # we add the gene/variation to the dict as key and vec as value
+        gv_dict[i] = vec
+    return gv_dict
+
+# Get Gene variation feature
+def get_gv_feature(alpha, feature, df):
+    # print(gv_dict)
+    # {'BRCA1': [0.20075757575757575, 0.03787878787878788, 0.068181818181818177, 0.13636363636363635, 0.25, 0.19318181818181818, 0.03787878787878788, 0.03787878787878788, 0.03787878787878788],
+    #  'TP53': [0.32142857142857145, 0.061224489795918366, 0.061224489795918366, 0.27040816326530615, 0.061224489795918366, 0.066326530612244902, 0.051020408163265307, 0.051020408163265307, 0.056122448979591837],
+    #  'EGFR': [0.056818181818181816, 0.21590909090909091, 0.0625, 0.068181818181818177, 0.068181818181818177, 0.0625, 0.34659090909090912, 0.0625, 0.056818181818181816],
+    #  'BRCA2': [0.13333333333333333, 0.060606060606060608, 0.060606060606060608, 0.078787878787878782, 0.1393939393939394, 0.34545454545454546, 0.060606060606060608, 0.060606060606060608, 0.060606060606060608],
+    #  'PTEN': [0.069182389937106917, 0.062893081761006289, 0.069182389937106917, 0.46540880503144655, 0.075471698113207544, 0.062893081761006289, 0.069182389937106917, 0.062893081761006289, 0.062893081761006289],
+    #  'KIT': [0.066225165562913912, 0.25165562913907286, 0.072847682119205295, 0.072847682119205295, 0.066225165562913912, 0.066225165562913912, 0.27152317880794702, 0.066225165562913912, 0.066225165562913912],
+    #  'BRAF': [0.066666666666666666, 0.17999999999999999, 0.073333333333333334, 0.073333333333333334, 0.093333333333333338, 0.080000000000000002, 0.29999999999999999, 0.066666666666666666, 0.066666666666666666],
+    #  ...
+    # }
+    gv_dict = get_gv_fea_dict(alpha, feature, df)
+    # value_count is the same as in get_gv_fea_dict
+    value_count = train_df[feature].value_counts()
+
+    # gv_fea: Gene_variation feature; it will contain the feature vector for each feature value in the data
+    gv_fea = []
+    # for every feature value in the given dataframe, if it is present in the train data we add its vector to gv_fea;
+    # if not, we add [1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9] to gv_fea
+    for index, row in df.iterrows():
+        if row[feature] in dict(value_count).keys():
+            gv_fea.append(gv_dict[row[feature]])
+        else:
+            gv_fea.append([1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9])
+#           gv_fea.append([-1,-1,-1,-1,-1,-1,-1,-1,-1])
+    return gv_fea
+
+# response coding of the Gene feature
+# alpha is used for Laplace smoothing
+alpha = 1
+# train gene feature
+train_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", train_df))
+# test gene feature
+test_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", test_df))
+# cross validation gene feature
+cv_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", cv_df))
+
+'''
+how do we test whether Gene is a good feature for the model?
+by comparing against the random-model log loss:
+build a logistic regression model with only the Gene feature and check its log loss;
+if it is lower than the random model's, then Gene is a good feature for the model.
+
+in order to do that:
+
+Laplace smoothing
+
+* in naive Bayes we use Laplace smoothing, e.g. when a test query contains a word that is not present in the training set
+* what do we do with the new word?
+* can we drop it? we could, but dropping effectively treats its likelihood as 1, which should not be the case
+* can we assign it the value 0? then the whole product of probabilities becomes zero
+
+* so we use Laplace smoothing (also called additive smoothing), i.e. we add alpha (a hyperparameter) to the numerator and alpha*K (K = number of classes) to the denominator
+* alpha = 1 in general; it can change, since it is a hyperparameter
+* alpha should be neither too small nor too large --> bias-variance trade-off
+
+CALIBRATED CLASSIFIER --> for getting results in probability form, to be used for log loss
+
+SGD --> for optimization
+
+'''
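+# Worked smoothing example (the same toy gene as earlier): with alpha = 1,
+# a gene seen 10 times in train, 4 of them in class 1, gets
+# p(class 1) = (4 + 10*1) / (10 + 90*1) = 14/100 = 0.14 instead of the
+# unsmoothed 0.4, and a class with zero counts gets 10/100 = 0.1 instead of 0.
+# A gene never seen in train falls back to the uniform 1/9.
+print((4 + 10*1) / (10 + 90*1), (0 + 10*1) / (10 + 90*1))  # 0.14 0.1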
+# using 1-hot encoding, because high dimensionality is fine for a logistic-regression-style algorithm
+
+# We need a hyperparameter for the SGD classifier.
+alpha = [10 ** x for x in range(-5, 1)]
+
+# SGD classifier
+cv_log_error_array = []
+for i in alpha:
+    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
+    clf.fit(train_gene_feature_onehotCoding, y_train)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_gene_feature_onehotCoding, y_train)
+    predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
+    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+    print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+
+# alpha = 0.001 gives the minimal log loss
+
+# Let's plot the same to check the best alpha value
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+# Let's use the best alpha value as seen from the above graph and compute the log loss
+best_alpha = np.argmin(cv_log_error_array)
+clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_gene_feature_onehotCoding, y_train)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_gene_feature_onehotCoding, y_train)
+
+predict_y = sig_clf.predict_proba(train_gene_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_gene_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# the log loss is lower than the random model's
+# how stable is the model?
+# a gene may occur rarely in train but often in test; the fraction of shared values is called the overlap
+# the overlap should be high
+# though we're using Laplace smoothing for unseen values, relying on it is not a great idea
+# the data and model will be stable if we have good overlap
+
+# check how many values overlap between train and test, and between train and CV
+
+test_coverage = test_df[test_df['Gene'].isin(list(set(train_df['Gene'])))].shape[0]
+cv_coverage = cv_df[cv_df['Gene'].isin(list(set(train_df['Gene'])))].shape[0]
+
+print('1. In test data', test_coverage, 'out of', test_df.shape[0], ":", (test_coverage/test_df.shape[0])*100)
+print('2. In cross validation data', cv_coverage, 'out of ', cv_df.shape[0], ":", (cv_coverage/cv_df.shape[0])*100)
+
+# EVALUATING THE VARIATION FEATURE
+
+# Variation is also a categorical variable, so we have to treat it the same way as the Gene column.
+# We will again get the one-hot encoded and response-encoded variables for the Variation column.
+
+unique_variations = train_df['Variation'].value_counts()
+print('Number of Unique Variations :', unique_variations.shape[0])
+# the top 10 variations that occurred most
+print(unique_variations.head(10))
+
+# cumulative distribution of unique variation values
+
+s = sum(unique_variations.values)
+h = unique_variations.values/s
+c = np.cumsum(h)
+print(c)
+plt.plot(c, label='Cumulative distribution of Variations')
+plt.grid()
+plt.legend()
+plt.show()
+
+# one-hot encoding of the Variation feature
+variation_vectorizer = CountVectorizer()
+train_variation_feature_onehotCoding = variation_vectorizer.fit_transform(train_df['Variation'])
+test_variation_feature_onehotCoding = variation_vectorizer.transform(test_df['Variation'])
+cv_variation_feature_onehotCoding = variation_vectorizer.transform(cv_df['Variation'])
+
+train_variation_feature_onehotCoding.shape
+
+# response encoding of the Variation feature
+# alpha is used for Laplace smoothing
+alpha = 1
+# train variation feature
+train_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", train_df))
+# test variation feature
+test_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", test_df))
+# cross validation variation feature
+cv_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", cv_df))
+
+train_variation_feature_responseCoding.shape
+
+# We need a hyperparameter for the SGD classifier.
+alpha = [10 ** x for x in range(-5, 1)]
+
+# SGD classifier
+cv_log_error_array = []
+for i in alpha:
+    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
+    clf.fit(train_variation_feature_onehotCoding, y_train)
+
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_variation_feature_onehotCoding, y_train)
+    predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)
+
+    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+    print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+
+# Let's plot the same to check the best alpha value
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_variation_feature_onehotCoding, y_train)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_variation_feature_onehotCoding, y_train)
+
+predict_y = sig_clf.predict_proba(train_variation_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_variation_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# at alpha = 0.0001 the test log loss is lower than the random model's, so it is good to have the Variation feature
+
+test_coverage = test_df[test_df['Variation'].isin(list(set(train_df['Variation'])))].shape[0]
+cv_coverage = cv_df[cv_df['Variation'].isin(list(set(train_df['Variation'])))].shape[0]
+
+print('1. In test data', test_coverage, 'out of', test_df.shape[0], ":", (test_coverage/test_df.shape[0])*100)
+print('2. In cross validation data', cv_coverage, 'out of ', cv_df.shape[0], ":", (cv_coverage/cv_df.shape[0])*100)
+
+# stability (overlap) is low for the Variation feature; since the log loss is low, let's keep it
+
+# EVALUATING THE TEXT COLUMN
+
+# cls_text is a dataframe
+# for every row in the dataframe, consider the 'TEXT'
+# split the words by space
+# make a dict with those words
+# increment the count whenever we see a word
+
+# to get word counts
+def extract_dictionary_paddle(cls_text):
+    dictionary = defaultdict(int)
+    for index, row in cls_text.iterrows():
+        for word in row['TEXT'].split():
+            dictionary[word] += 1
+    return dictionary
+
+# https://stackoverflow.com/a/1602964
+def get_text_responsecoding(df):
+    text_feature_responseCoding = np.zeros((df.shape[0], 9))
+    for i in range(0, 9):
+        row_index = 0
+        for index, row in df.iterrows():
+            sum_prob = 0
+            for word in row['TEXT'].split():
+                sum_prob += math.log(((dict_list[i].get(word, 0)+10) / (total_dict.get(word, 0)+90)))
+            text_feature_responseCoding[row_index][i] = math.exp(sum_prob/len(row['TEXT'].split()))
+            row_index += 1
+    return text_feature_responseCoding
+
+# building a CountVectorizer with all the words that occurred at least 3 times in the train data
+text_vectorizer = CountVectorizer(min_df=3)
+train_text_feature_onehotCoding = text_vectorizer.fit_transform(train_df['TEXT'])
+# getting all the feature names (words)
+train_text_features = text_vectorizer.get_feature_names()
+
+# train_text_feature_onehotCoding.sum(axis=0).A1 sums each column (word) and returns a (1*number of features) vector
+train_text_fea_counts = train_text_feature_onehotCoding.sum(axis=0).A1
+
+# zip(list(train_text_features), train_text_fea_counts) will zip each word with the number of times it occurred
+text_fea_dict = dict(zip(list(train_text_features), train_text_fea_counts))
+
+print("Total number of unique words in train data :", len(train_text_features))
+
+
+dict_list = []
+# dict_list contains 9 dictionaries, each corresponding to a class
+for i in range(1, 10):
+    cls_text = train_df[train_df['Class']==i]
+    # build a word dict based on the words in that class
+    dict_list.append(extract_dictionary_paddle(cls_text))
+    # append it to dict_list
+
+# dict_list[i] is built on the i'th class's text data
+# total_dict is built on the whole training text data
+total_dict = extract_dictionary_paddle(train_df)
+
+
+confuse_array = []
+for i in train_text_features:
+    ratios = []
+    max_val = -1
+    for j in range(0, 9):
+        ratios.append((dict_list[j][i]+10) / (total_dict[i]+90))
+    confuse_array.append(ratios)
+confuse_array = np.array(confuse_array)
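+# What get_text_responsecoding computes, written out as a formula (a sketch of
+# the intent, matching the code above): for class i and a text with words
+# w_1..w_n, using the smoothed per-class word likelihood
+#     p(w|class_i) = (count of w in class_i texts + 10) / (count of w overall + 90)
+# the feature is the geometric mean of the word likelihoods:
+#     feature_i = exp( (1/n) * sum_j log p(w_j|class_i) )
+# e.g. a word seen 5 times in class-3 texts and 50 times overall contributes
+# p = (5+10)/(50+90) to class 3's average:
+print((5+10)/(50+90))  # ~0.107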
+
+# response coding of the text features
+train_text_feature_responseCoding = get_text_responsecoding(train_df)
+test_text_feature_responseCoding = get_text_responsecoding(test_df)
+cv_text_feature_responseCoding = get_text_responsecoding(cv_df)
+
+# https://stackoverflow.com/a/16202486
+# we convert each row's values such that they sum to 1
+train_text_feature_responseCoding = (train_text_feature_responseCoding.T/train_text_feature_responseCoding.sum(axis=1)).T
+test_text_feature_responseCoding = (test_text_feature_responseCoding.T/test_text_feature_responseCoding.sum(axis=1)).T
+cv_text_feature_responseCoding = (cv_text_feature_responseCoding.T/cv_text_feature_responseCoding.sum(axis=1)).T
+
+# don't forget to normalize every feature
+train_text_feature_onehotCoding = normalize(train_text_feature_onehotCoding, axis=0)
+
+# we use the same vectorizer that was trained on the train data
+test_text_feature_onehotCoding = text_vectorizer.transform(test_df['TEXT'])
+# don't forget to normalize every feature
+test_text_feature_onehotCoding = normalize(test_text_feature_onehotCoding, axis=0)
+
+# we use the same vectorizer that was trained on the train data
+cv_text_feature_onehotCoding = text_vectorizer.transform(cv_df['TEXT'])
+# don't forget to normalize every feature
+cv_text_feature_onehotCoding = normalize(cv_text_feature_onehotCoding, axis=0)
+
+# https://stackoverflow.com/a/2258273/4084039
+sorted_text_fea_dict = dict(sorted(text_fea_dict.items(), key=lambda x: x[1], reverse=True))
+sorted_text_occur = np.array(list(sorted_text_fea_dict.values()))
+
+# Number of words for a given frequency.
+print(Counter(sorted_text_occur))
+
+# build the model with only the text column
+
+cv_log_error_array = []
+for i in alpha:
+    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
+    clf.fit(train_text_feature_onehotCoding, y_train)
+
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_text_feature_onehotCoding, y_train)
+    predict_y = sig_clf.predict_proba(cv_text_feature_onehotCoding)
+    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+    print('For values of alpha = ', i, "The log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+
+# the log loss is lower than the random model's
+
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_text_feature_onehotCoding, y_train)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_text_feature_onehotCoding, y_train)
+
+predict_y = sig_clf.predict_proba(train_text_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_text_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_text_feature_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# check the overlap of the text data
+
+def get_intersec_text(df):
+    df_text_vec = CountVectorizer(min_df=3)
+    df_text_fea = df_text_vec.fit_transform(df['TEXT'])
+    df_text_features = df_text_vec.get_feature_names()
+
+    df_text_fea_counts = df_text_fea.sum(axis=0).A1
+    df_text_fea_dict = dict(zip(list(df_text_features), df_text_fea_counts))
+    len1 = len(set(df_text_features))
+    len2 = len(set(train_text_features) & set(df_text_features))
+    return len1, len2
+
+len1, len2 = get_intersec_text(test_df)
+print(np.round((len2/len1)*100, 3), "% of the words in the test data appeared in the train data")
+len1, len2 = get_intersec_text(cv_df)
+print(np.round((len2/len1)*100, 3), "% of the words in the CV data appeared in the train data")
+
+# the text column has good stability (overlap)
+
+# therefore, all 3 features are important for model building
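+# Side note: TfidfVectorizer is imported at the top but never used. If we
+# wanted tf-idf weighting instead of raw counts for the text feature, a
+# minimal sketch (assuming the same min_df=3 cutoff) would be:
+#
+# tfidf_vectorizer = TfidfVectorizer(min_df=3)
+# train_text_feature_tfidf = tfidf_vectorizer.fit_transform(train_df['TEXT'])
+# test_text_feature_tfidf = tfidf_vectorizer.transform(test_df['TEXT'])
+# cv_text_feature_tfidf = tfidf_vectorizer.transform(cv_df['TEXT'])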
+
+# Data preparation for the machine learning models
+
+# a few utility functions
+
+def report_log_loss(train_x, train_y, test_x, test_y, clf):
+    clf.fit(train_x, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x, train_y)
+    sig_clf_probs = sig_clf.predict_proba(test_x)
+    return log_loss(test_y, sig_clf_probs, eps=1e-15)
+
+# This function plots the confusion matrices given y_i, y_i_hat.
+def plot_confusion_matrix(test_y, predict_y):
+    C = confusion_matrix(test_y, predict_y)
+
+    # recall matrix: divide each element of the confusion matrix by the sum of its row
+    A = (((C.T)/(C.sum(axis=1))).T)
+
+    # precision matrix: divide each element of the confusion matrix by the sum of its column
+    B = (C/C.sum(axis=0))
+    labels = [1,2,3,4,5,6,7,8,9]
+    # representing C in heatmap format
+    print("-"*20, "Confusion matrix", "-"*20)
+    plt.figure(figsize=(20,7))
+    sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+    plt.xlabel('Predicted Class')
+    plt.ylabel('Original Class')
+    plt.show()
+
+    # representing B in heatmap format
+    print("-"*20, "Precision matrix (Column Sum=1)", "-"*20)
+    plt.figure(figsize=(20,7))
+    sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+    plt.xlabel('Predicted Class')
+    plt.ylabel('Original Class')
+    plt.show()
+
+    # representing A in heatmap format
+    print("-"*20, "Recall matrix (Row sum=1)", "-"*20)
+    plt.figure(figsize=(20,7))
+    sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
+    plt.xlabel('Predicted Class')
+    plt.ylabel('Original Class')
+    plt.show()
+
+
+def predict_and_plot_confusion_matrix(train_x, train_y, test_x, test_y, clf):
+    clf.fit(train_x, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x, train_y)
+    pred_y = sig_clf.predict(test_x)
+
+    # for calculating log_loss we will provide the array of probabilities for each class
+    print("Log loss :", log_loss(test_y, sig_clf.predict_proba(test_x)))
+    # calculating the fraction of data points that are misclassified
+    print("Fraction of mis-classified points :", np.count_nonzero((pred_y - test_y))/test_y.shape[0])
+    plot_confusion_matrix(test_y, pred_y)
+
+
+# this function will be used just for naive Bayes
+# for the given indices, we print the names of the features
+# and we check whether each feature is present in the test point's text or not
+def get_impfeature_names(indices, text, gene, var, no_features):
+    gene_count_vec = CountVectorizer()
+    var_count_vec = CountVectorizer()
+    text_count_vec = CountVectorizer(min_df=3)
+
+    gene_vec = gene_count_vec.fit(train_df['Gene'])
+    var_vec = var_count_vec.fit(train_df['Variation'])
+    text_vec = text_count_vec.fit(train_df['TEXT'])
+
+    fea1_len = len(gene_vec.get_feature_names())
+    fea2_len = len(var_count_vec.get_feature_names())
+
+    word_present = 0
+    for i, v in enumerate(indices):
+        if (v < fea1_len):
+            word = gene_vec.get_feature_names()[v]
+            yes_no = True if word == gene else False
+            if yes_no:
+                word_present += 1
+                print(i, "Gene feature [{}] present in test data point [{}]".format(word, yes_no))
+        elif (v < fea1_len+fea2_len):
+            word = var_vec.get_feature_names()[v-(fea1_len)]
+            yes_no = True if word == var else False
+            if yes_no:
+                word_present += 1
+                print(i, "variation feature [{}] present in test data point [{}]".format(word, yes_no))
+        else:
+            word = text_vec.get_feature_names()[v-(fea1_len+fea2_len)]
+            yes_no = True if word in text.split() else False
+            if yes_no:
+                word_present += 1
+                print(i, "Text feature [{}] present in test data point [{}]".format(word, yes_no))
+
+    print("Out of the top ", no_features, " features ", word_present, "are present in the query point")
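+# Quick sanity check of the precision/recall normalizations above, with a toy
+# 2-class confusion matrix (illustrative only; the real matrices are 9x9):
+# C = [[8, 2],   rows = original class, columns = predicted class
+#      [4, 6]]
+C_toy = np.array([[8, 2], [4, 6]])
+print(C_toy / C_toy.sum(axis=0))        # columns sum to 1 (precision matrix)
+print((C_toy.T / C_toy.sum(axis=1)).T)  # rows sum to 1 (recall matrix)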
+
+## Combining all 3 features together
+
+# merging the gene, variation and text features
+
+# building the train, test and cross-validation datasets
+# a = [[1, 2],
+#      [3, 4]]
+# b = [[4, 5],
+#      [6, 7]]
+# hstack(a, b) = [[1, 2, 4, 5],
+#                 [3, 4, 6, 7]]
+
+train_gene_var_onehotCoding = hstack((train_gene_feature_onehotCoding, train_variation_feature_onehotCoding))
+test_gene_var_onehotCoding = hstack((test_gene_feature_onehotCoding, test_variation_feature_onehotCoding))
+cv_gene_var_onehotCoding = hstack((cv_gene_feature_onehotCoding, cv_variation_feature_onehotCoding))
+
+train_x_onehotCoding = hstack((train_gene_var_onehotCoding, train_text_feature_onehotCoding)).tocsr()
+train_y = np.array(list(train_df['Class']))
+
+test_x_onehotCoding = hstack((test_gene_var_onehotCoding, test_text_feature_onehotCoding)).tocsr()
+test_y = np.array(list(test_df['Class']))
+
+cv_x_onehotCoding = hstack((cv_gene_var_onehotCoding, cv_text_feature_onehotCoding)).tocsr()
+cv_y = np.array(list(cv_df['Class']))
+
+
+train_gene_var_responseCoding = np.hstack((train_gene_feature_responseCoding, train_variation_feature_responseCoding))
+test_gene_var_responseCoding = np.hstack((test_gene_feature_responseCoding, test_variation_feature_responseCoding))
+cv_gene_var_responseCoding = np.hstack((cv_gene_feature_responseCoding, cv_variation_feature_responseCoding))
+
+train_x_responseCoding = np.hstack((train_gene_var_responseCoding, train_text_feature_responseCoding))
+test_x_responseCoding = np.hstack((test_gene_var_responseCoding, test_text_feature_responseCoding))
+cv_x_responseCoding = np.hstack((cv_gene_var_responseCoding, cv_text_feature_responseCoding))
+
+
+print("One hot encoding features :")
+print("(number of data points * number of features) in train data = ", train_x_onehotCoding.shape)
+print("(number of data points * number of features) in test data = ", test_x_onehotCoding.shape)
+print("(number of data points * number of features) in cross validation data =", cv_x_onehotCoding.shape)
+
+print(" Response encoding features :")
+print("(number of data points * number of features) in train data = ", train_x_responseCoding.shape)
+print("(number of data points * number of features) in test data = ", test_x_responseCoding.shape)
+print("(number of data points * number of features) in cross validation data =", cv_x_responseCoding.shape)
+
+# Building the machine learning models
+
+# Naive Bayes
+# http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
+alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
+cv_log_error_array = []
+for i in alpha:
+    print("for alpha =", i)
+    clf = MultinomialNB(alpha=i)
+    clf.fit(train_x_onehotCoding, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x_onehotCoding, train_y)
+    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
+    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
+    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
+    print("Log Loss :", log_loss(cv_y, sig_clf_probs))
+
+fig, ax = plt.subplots()
+ax.plot(np.log10(alpha), cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], str(txt)), (np.log10(alpha[i]), cv_log_error_array[i]))
+plt.grid()
+plt.xticks(np.log10(alpha))
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = MultinomialNB(alpha=alpha[best_alpha])
+clf.fit(train_x_onehotCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_onehotCoding, train_y)
+
+predict_y = sig_clf.predict_proba(train_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# Testing our naive Bayes model with the best found value of alpha on the cross-validation data
+clf = MultinomialNB(alpha=alpha[best_alpha])
+clf.fit(train_x_onehotCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_onehotCoding, train_y)
+sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
+# to avoid rounding errors while multiplying probabilities we use log-probability estimates
+print("Log Loss :", log_loss(cv_y, sig_clf_probs))
+print("Fraction of misclassified points :", np.count_nonzero((sig_clf.predict(cv_x_onehotCoding) - cv_y))/cv_y.shape[0])
+plot_confusion_matrix(cv_y, sig_clf.predict(cv_x_onehotCoding.toarray()))
+
+# interpretability of the model
+
+test_point_index = 1
+no_feature = 100
+predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
+print("Predicted Class :", predicted_cls[0])
+print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
+print("Actual Class :", test_y[test_point_index])
+indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
+print("-"*50)
+get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)
+
+# one more example
+
+test_point_index = 100
+no_feature = 100
+predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
+print("Predicted Class :", predicted_cls[0])
+print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
+print("Actual Class :", test_y[test_point_index])
+indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
+print("-"*50)
+get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)
+
+# So naive Bayes is not performing very badly, but let's look at other models
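+# A note on the feature-importance idiom used above: clf.coef_ has shape
+# (9, n_features) -- one row of coefficients per class. np.argsort(-clf.coef_)
+# sorts each row's feature indices in descending order of coefficient,
+# [predicted_cls-1] picks the row of the predicted class (classes are 1-9,
+# rows are 0-8), and [:, :no_feature] keeps the top no_feature indices.
+# Toy illustration:
+toy_coef = np.array([[0.1, 0.9, 0.5]])  # 1 class, 3 features
+print(np.argsort(-toy_coef)[:, :2])     # [[1 2]] -> top-2 feature indices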
+
+# Logistic Regression
+
+# balancing all classes
+
+alpha = [10 ** x for x in range(-6, 3)]
+cv_log_error_array = []
+for i in alpha:
+    print("for alpha =", i)
+    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='log', random_state=42)
+    clf.fit(train_x_onehotCoding, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x_onehotCoding, train_y)
+    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
+    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
+    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
+    print("Log Loss :", log_loss(cv_y, sig_clf_probs))
+
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], str(txt)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_x_onehotCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_onehotCoding, train_y)
+
+predict_y = sig_clf.predict_proba(train_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_x_onehotCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+# Let's test it on the cross-validation data using the best alpha value
+
+clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+predict_and_plot_confusion_matrix(train_x_onehotCoding, train_y, cv_x_onehotCoding, cv_y, clf)
+
+
+# feature importance
+
+def get_imp_feature_names(text, indices, removed_ind=[]):
+    word_present = 0
+    tabulte_list = []
+    incresingorder_ind = 0
+    for i in indices:
+        if i < train_gene_feature_onehotCoding.shape[1]:
+            tabulte_list.append([incresingorder_ind, "Gene", "Yes"])
+        elif i < 18:  # note: 17/18 are hard-coded offsets for the variation features here
+            tabulte_list.append([incresingorder_ind, "Variation", "Yes"])
+        if ((i > 17) & (i not in removed_ind)):
+            word = train_text_features[i]
+            yes_no = True if word in text.split() else False
+            if yes_no:
+                word_present += 1
+            tabulte_list.append([incresingorder_ind, train_text_features[i], yes_no])
+        incresingorder_ind += 1
+    print(word_present, "most important features are present in our query point")
+    print("-"*50)
+    print("The features that are most important for the ", predicted_cls[0], " class:")
+    print(tabulate(tabulte_list, headers=["Index", 'Feature name', 'Present or Not']))
+
+# testing a query point and doing interpretability
+
+from tabulate import tabulate
+clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
+clf.fit(train_x_onehotCoding, train_y)
+test_point_index = 1
+no_feature = 500
+predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
+print("Predicted Class :", predicted_cls[0])
+print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
+print("Actual Class :", test_y[test_point_index])
+indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
+print("-"*50)
+get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)
+
+test_point_index = 100
+no_feature = 500
+predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
+print("Predicted Class :", predicted_cls[0])
+print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]), 4))
+print("Actual Class :", test_y[test_point_index])
+indices = np.argsort(-clf.coef_)[predicted_cls-1][:, :no_feature]
+print("-"*50)
+get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index], test_df['Gene'].iloc[test_point_index], test_df['Variation'].iloc[test_point_index], no_feature)
+
+# K Nearest Neighbour Classification
+
+# here 'alpha' holds the candidate values of k (the number of neighbours)
+alpha = [5, 11, 15, 21, 31, 41, 51, 99]
+cv_log_error_array = []
+for i in alpha:
+    print("for alpha =", i)
+    clf = KNeighborsClassifier(n_neighbors=i)
+    clf.fit(train_x_responseCoding, train_y)
+    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+    sig_clf.fit(train_x_responseCoding, train_y)
+    sig_clf_probs = sig_clf.predict_proba(cv_x_responseCoding)
+    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
+    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
+    print("Log Loss :", log_loss(cv_y, sig_clf_probs))
+
+fig, ax = plt.subplots()
+ax.plot(alpha, cv_log_error_array, c='g')
+for i, txt in enumerate(np.round(cv_log_error_array, 3)):
+    ax.annotate((alpha[i], str(txt)), (alpha[i], cv_log_error_array[i]))
+plt.grid()
+plt.title("Cross Validation Error for each alpha")
+plt.xlabel("Alpha i's")
+plt.ylabel("Error measure")
+plt.show()
+
+best_alpha = np.argmin(cv_log_error_array)
+clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
+clf.fit(train_x_responseCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_responseCoding, train_y)
+
+predict_y = sig_clf.predict_proba(train_x_responseCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(cv_x_responseCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
+predict_y = sig_clf.predict_proba(test_x_responseCoding)
+print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
+
+clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
+predict_and_plot_confusion_matrix(train_x_responseCoding, train_y, cv_x_responseCoding, cv_y, clf)
+
+# Let's look at a few test points
+clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
+clf.fit(train_x_responseCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_responseCoding, train_y)
+
+test_point_index = 1
+predicted_cls = sig_clf.predict(test_x_responseCoding[test_point_index].reshape(1, -1))
+print("Predicted Class :", predicted_cls[0])
+print("Actual Class :", test_y[test_point_index])
+# clf.kneighbors returns (distances, indices); neighbors[1][0] holds the train indices of the k nearest points
+neighbors = clf.kneighbors(test_x_responseCoding[test_point_index].reshape(1, -1), alpha[best_alpha])
+print("The ", alpha[best_alpha], " nearest neighbours of the test point belong to classes", train_y[neighbors[1][0]])
+print("Frequency of nearest points :", Counter(train_y[neighbors[1][0]]))
+
+clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
+clf.fit(train_x_responseCoding, train_y)
+sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
+sig_clf.fit(train_x_responseCoding, train_y)
+
+test_point_index = 100
+
+predicted_cls = sig_clf.predict(test_x_responseCoding[test_point_index].reshape(1, -1))
+print("Predicted Class :", predicted_cls[0])
+print("Actual Class :", test_y[test_point_index])
+neighbors = clf.kneighbors(test_x_responseCoding[test_point_index].reshape(1, -1), alpha[best_alpha])
+print("the k value for knn is", alpha[best_alpha], "and the nearest neighbours of the test point belong to classes", train_y[neighbors[1][0]])
+print("Fequency of nearest points :",Counter(train_y[neighbors[1][0]])) + +# without balancing data for logistic regression .01 difference in log loss + +# linear SVM dint reduce w.r.t log loss + +# random forest classifier with 1-hot encoding and response encoding (was worse than 1-hot encoding) dint reduce w.r.t log loss + +# stacking model dint reduce w.r.t log loss + +# logistic regression with balancing and 1-hot encoding has got the lowest log loss + + +