Diff of /DataMiningProject.py [000000] .. [0bd3e5]

Switch to side-by-side view

--- a
+++ b/DataMiningProject.py
@@ -0,0 +1,147 @@
+#####################################
+# Group Members                     #
+# Bhoopalsinh Musale	002269332   #
+# Syed Malik Muzaffar	002269955   #
+#####################################
+
+
+# Imports
+import pandas as pd
+import numpy as np
+
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from sklearn.utils.multiclass import unique_labels
+import scipy.stats as st
+
+from sklearn import tree
+from sklearn import naive_bayes
+from sklearn.neural_network import MLPClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import KFold
+from tempfile import mkdtemp
+from TScoreSelection import TScoreSelection
+import os
+import warnings
+warnings.filterwarnings("ignore")
+
+
def load_data(train_path='pp5i_train.gr.csv',
              class_path='pp5i_train_class.txt',
              random_seed=72):
    '''
        Load the gene-expression training data and class labels.

        The training CSV has one row per gene (indexed by 'SNO') and one
        column per sample; it is transposed so rows become samples and
        columns become gene features.  Labels are joined column-wise and
        the combined frame is shuffled once with a fixed seed so the
        row order is reproducible.

        Parameters
        ----------
        train_path : str
            Path to the expression matrix CSV (default matches the
            original hard-coded filename).
        class_path : str
            Path to the class-label file with a 'Class' column.
        random_seed : int
            Seed used for the one-time shuffle; also returned so
            downstream CV splits can reuse it.

        Returns
        -------
        (X, y, random_seed) : feature DataFrame, label Series, seed.
    '''
    dataFrame = pd.read_csv(train_path)
    dataFrame.set_index('SNO', inplace=True)
    # Transpose: samples become rows, genes become feature columns.
    dataFrame = dataFrame.transpose()
    dataFrame.reset_index(drop=True, inplace=True)

    y = pd.read_csv(class_path)
    dataFrame = pd.concat([dataFrame, y], axis=1)
    # Shuffle rows once (frac=1 keeps every row) for reproducibility.
    dataFrame = dataFrame.sample(
        frac=1, random_state=random_seed).reset_index(drop=True)
    print(dataFrame.shape)
    print(dataFrame.head())

    X = dataFrame.drop('Class', axis=1)
    y = dataFrame['Class']

    return X, y, random_seed
+
+
def clean_data(X):
    '''
        Threshold expression values to the range [20, 16000] and drop
        near-constant genes (fold-change filter: max - min must exceed 2).

        Parameters
        ----------
        X : pandas.DataFrame
            Samples x genes expression matrix.

        Returns
        -------
        pandas.DataFrame
            A new, cleaned frame.  Unlike the original implementation
            (which used clip(..., inplace=True)), the caller's frame is
            left unmodified -- the function already returned a frame, so
            the in-place mutation was a surprising hidden side effect.
    '''
    # clip() without inplace returns a clipped copy.
    X = X.clip(lower=20, upper=16000)
    print(X.shape)
    # Keep only columns with some spread after clipping.
    X = X.loc[:, X.max() - X.min() > 2]
    print(X.shape)
    return X
+
+
if __name__ == "__main__":
    # Loading Dataset (features X, labels y, and the shuffle seed).
    X, y, myRndSeeds = load_data()

    # Cleaning Dataset: clip to [20, 16000] and drop near-constant genes.
    X = clean_data(X)

    # Feature selection using Ttest.
    # NOTE(review): mkdtemp() creates a cache dir for the pipeline that is
    # never removed afterwards -- consider shutil.rmtree(cachedir) at exit.
    cachedir = mkdtemp()
    # Two-stage pipeline: T-score feature selection, then a classifier.
    # The 'classify' step here is just a placeholder; GridSearchCV swaps
    # in each candidate estimator from tuned_parameters below.
    pipe = Pipeline([('featureSelection', TScoreSelection(w=10)),
                     ('classify', KNeighborsClassifier(n_neighbors=1))],
                    memory=cachedir)

    # Top Gene Selection: candidate values for the number of selected genes
    # and for the MLP hidden-layer sizes.
    N_GENES = [2, 4, 6, 8, 10, 12, 15, 20, 25, 30]
    N_LAYERS = [(32,), (64,), (128,)]

    # Hyperparameter Optimization: each dict is a separate grid; the
    # 'classify' key replaces the pipeline's final estimator.
    tuned_parameters = [
        # KNN Classifier(2,3,4)
        {'featureSelection__w': N_GENES,
         'classify': [KNeighborsClassifier()],
         'classify__n_neighbors': [2, 3, 4]
         },
        # Decision Tree Classifier(J48 algorithm)
        # NOTE(review): 'presort' was deprecated in scikit-learn 0.22 and
        # removed in 0.24 -- this grid entry raises on newer versions.
        {'featureSelection__w': N_GENES,
         'classify': [tree.DecisionTreeClassifier()],
         'classify__criterion':['gini', 'entropy'],
         'classify__min_samples_leaf': [1, 3, 5],
         'classify__max_depth': [3, 6, 9],
         'classify__presort': [True]
         },
        # Neural Network Multi-label Classifier
        {'featureSelection__w': N_GENES,
         'classify': [MLPClassifier()],
         'classify__hidden_layer_sizes': N_LAYERS,
         'classify__activation': ['logistic'],
         'classify__alpha':[0.05, 0.01, 0.005, 0.001],
         'classify__max_iter':[1000],
         'classify__solver': ['lbfgs'],
         'classify__verbose': [True]
         },
        # Naïve Bayes Classifier (no hyperparameters tuned)
        {'featureSelection__w': N_GENES,
         'classify': [naive_bayes.GaussianNB()]
         },
        # AdaBoost Classifier (default hyperparameters)
        {'featureSelection__w': N_GENES,
         'classify': [AdaBoostClassifier()]
         }
    ]

    # Model Selection using Pipeline and Cross validation.
    # Reuse the load_data seed so the 5-fold split is reproducible.
    kfolds = KFold(n_splits=5, shuffle=True, random_state=myRndSeeds)
    model = GridSearchCV(pipe, tuned_parameters, cv=kfolds,
                         return_train_score=True)
    model.fit(X, y)
    results = pd.DataFrame(model.cv_results_)

    # Show the top-scoring parameter combinations.
    print(results.sort_values(by='mean_test_score', ascending=False).head())

    # Best Model (refit on the full training set by GridSearchCV).
    best_estimator_ = model.best_estimator_
    print(best_estimator_)

    # Running best model on Test dataset: same gene-rows-to-sample-rows
    # transpose as in load_data.
    # NOTE(review): the test frame is not clipped with clean_data; the
    # fitted TScoreSelection step selects columns, but clipping differs
    # from training preprocessing -- confirm this is intended.
    testDataFrame = pd.read_csv('pp5i_test.gr.csv')
    testDataFrame.set_index('SNO', inplace=True)
    X_test = testDataFrame.transpose()
    X_test.reset_index(drop=True, inplace=True)

    # Generating output Y for given Test Dataset.
    Y = pd.DataFrame()
    Y['predicted'] = model.predict(X_test)
    finalResult = Y

    # Final Output.
    # NOTE(review): predictions are only printed, never written to disk;
    # consider finalResult.to_csv(...) if a deliverable file is expected.
    print(finalResult)