Diff of /DataMiningProject.py [000000] .. [0bd3e5]

Switch to unified view

a b/DataMiningProject.py
1
#####################################
2
# Group Members                     #
3
# Bhoopalsinh Musale    002269332   #
4
# Syed Malik Muzaffar   002269955   #
5
#####################################
6
7
8
# Imports
9
import pandas as pd
10
import numpy as np
11
12
from sklearn.base import BaseEstimator, TransformerMixin
13
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
14
from sklearn.utils.multiclass import unique_labels
15
import scipy.stats as st
16
17
from sklearn import tree
18
from sklearn import naive_bayes
19
from sklearn.neural_network import MLPClassifier
20
from sklearn.neighbors import KNeighborsClassifier
21
from sklearn.ensemble import AdaBoostClassifier
22
from sklearn.pipeline import Pipeline
23
from sklearn.model_selection import GridSearchCV
24
from sklearn.model_selection import cross_val_score
25
from sklearn.model_selection import KFold
26
from tempfile import mkdtemp
27
from TScoreSelection import TScoreSelection
28
import os
29
import warnings
30
warnings.filterwarnings("ignore")
31
32
33
def load_data(train_path='pp5i_train.gr.csv',
              class_path='pp5i_train_class.txt',
              random_seed=72):
    '''
        Load the training expression matrix and class labels from CSV files.

        The gene-expression file stores genes as rows (indexed by 'SNO')
        and samples as columns, so it is transposed into the conventional
        samples-by-features layout before the class labels are attached.

        Parameters
        ----------
        train_path : str
            Path to the gene-expression CSV; must contain a 'SNO' column.
        class_path : str
            Path to the class-label file; must contain a 'Class' column.
        random_seed : int
            Seed used to shuffle the samples reproducibly.

        Returns
        -------
        (X, y, random_seed) : feature DataFrame, label Series, and the
        seed that was used for shuffling.
    '''
    dataFrame = pd.read_csv(train_path)
    dataFrame.set_index('SNO', inplace=True)
    # Genes are rows in the raw file; transpose to samples x genes.
    dataFrame = dataFrame.transpose()
    dataFrame.reset_index(drop=True, inplace=True)

    labels = pd.read_csv(class_path)
    # Labels are row-aligned with the transposed samples.
    dataFrame = pd.concat([dataFrame, labels], axis=1)

    # Shuffle the samples once, reproducibly, before model selection.
    dataFrame = dataFrame.sample(
        frac=1, random_state=random_seed).reset_index(drop=True)
    print(dataFrame.shape)
    print(dataFrame.head())

    X = dataFrame.drop('Class', axis=1)
    y = dataFrame['Class']

    return X, y, random_seed
56
57
58
def clean_data(X):
    '''
        Threshold expression values and drop near-constant genes.

        Values are clipped into the range [20, 16000] (a standard
        floor/ceiling for microarray intensities), then columns whose
        clipped range (max - min) is <= 2 are removed, since they carry
        almost no discriminative signal.

        Parameters
        ----------
        X : DataFrame
            Expression values, samples x genes.

        Returns
        -------
        A new DataFrame; the caller's frame is left unmodified.
    '''
    # Clip on a copy rather than with inplace=True: the original
    # silently mutated the DataFrame the caller passed in.
    X = X.clip(lower=20, upper=16000)
    print(X.shape)
    # Keep only genes that still vary after clipping.
    X = X.loc[:, X.max() - X.min() > 2]
    print(X.shape)
    return X
68
69
70
if __name__ == "__main__":
    # Loading Dataset: shuffled samples-x-genes matrix plus labels.
    X, y, myRndSeeds = load_data()

    # Cleaning Dataset: clip intensities, drop near-constant genes.
    X = clean_data(X)

    # Feature selection using Ttest: the pipeline first keeps the top-w
    # genes by t-score (project-local TScoreSelection), then classifies.
    # Fitted transformers are cached in a temp dir so GridSearchCV does
    # not redo feature selection for every parameter combination.
    cachedir = mkdtemp()
    pipe = Pipeline([('featureSelection', TScoreSelection(w=10)),
                     ('classify', KNeighborsClassifier(n_neighbors=1))],
                    memory=cachedir)

    # Top Gene Selection: candidate values for the number of genes kept.
    N_GENES = [2, 4, 6, 8, 10, 12, 15, 20, 25, 30]
    # Candidate hidden-layer sizes for the MLP classifier.
    N_LAYERS = [(32,), (64,), (128,)]

    # Hyperparameter Optimization: one grid per candidate classifier;
    # GridSearchCV swaps the pipeline's 'classify' step for each entry.
    tuned_parameters = [
        # KNN Classifier(2,3,4)
        {'featureSelection__w': N_GENES,
         'classify': [KNeighborsClassifier()],
         'classify__n_neighbors': [2, 3, 4]
         },
        # Decision Tree Classifier(J48 algorithm)
        # NOTE(review): 'presort' was deprecated in scikit-learn 0.22 and
        # removed in 0.24 — confirm the pinned sklearn version accepts it.
        {'featureSelection__w': N_GENES,
         'classify': [tree.DecisionTreeClassifier()],
         'classify__criterion':['gini', 'entropy'],
         'classify__min_samples_leaf': [1, 3, 5],
         'classify__max_depth': [3, 6, 9],
         'classify__presort': [True]
         },
        # Neural Network Multi-label Classifier
        {'featureSelection__w': N_GENES,
         'classify': [MLPClassifier()],
         'classify__hidden_layer_sizes': N_LAYERS,
         'classify__activation': ['logistic'],
         'classify__alpha':[0.05, 0.01, 0.005, 0.001],
         'classify__max_iter':[1000],
         'classify__solver': ['lbfgs'],
         'classify__verbose': [True]
         },
        # Naïve Bayes Classifier
        {'featureSelection__w': N_GENES,
         'classify': [naive_bayes.GaussianNB()]
         },
        # AdaBoost Classifier
        {'featureSelection__w': N_GENES,
         'classify': [AdaBoostClassifier()]
         }
    ]

    # Model Selection using Pipeline and Cross validation: 5-fold,
    # shuffled with the same seed used when shuffling the samples.
    kfolds = KFold(n_splits=5, shuffle=True, random_state=myRndSeeds)
    model = GridSearchCV(pipe, tuned_parameters, cv=kfolds,
                         return_train_score=True)
    model.fit(X, y)
    results = pd.DataFrame(model.cv_results_)

    # Show the five best (classifier, parameters) combinations.
    print(results.sort_values(by='mean_test_score', ascending=False).head())

    # Best Model: refit by GridSearchCV on the full training data.
    best_estimator_ = model.best_estimator_
    print(best_estimator_)

    # Running best model on Test dataset (same genes-as-rows layout,
    # so it gets the same SNO-index + transpose treatment as training).
    testDataFrame = pd.read_csv('pp5i_test.gr.csv')
    testDataFrame.set_index('SNO', inplace=True)
    X_test = testDataFrame.transpose()
    X_test.reset_index(drop=True, inplace=True)

    # Generating output Y for given Test Dataset.
    # NOTE(review): X_test is NOT passed through clean_data(), so it is
    # neither clipped nor column-filtered like the training data —
    # verify TScoreSelection selects features in a way that tolerates
    # the extra/unscaled columns (clean_data's own docstring says both
    # train and test should be thresholded).
    Y = pd.DataFrame()
    Y['predicted'] = model.predict(X_test)
    finalResult = Y

    # Final Output
    print(finalResult)