DataMiningProject.py

#####################################
#  Group Members                    #
#  Bhoopalsinh Musale    002269332  #
#  Syed Malik Muzaffar   002269955  #
#####################################
# Imports
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
import scipy.stats as st
from sklearn import tree
from sklearn import naive_bayes
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from tempfile import mkdtemp
from TScoreSelection import TScoreSelection
import os
import warnings
warnings.filterwarnings("ignore")
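

# TScoreSelection is a separate local module (not shown in this file). As a
# rough sketch of what a t-score based selector with a `w` parameter might
# look like (an assumption about its interface, not the project's actual
# implementation), using only the imports above:
#
# class TScoreSelection(BaseEstimator, TransformerMixin):
#     '''Keep the w genes with the largest absolute t statistic.'''
#
#     def __init__(self, w=10):
#         self.w = w
#
#     def fit(self, X, y):
#         X, y = check_X_y(X, y)
#         # Score each gene by the largest absolute one-vs-rest t statistic
#         # over all classes, then remember the indices of the top w genes.
#         scores = np.zeros(X.shape[1])
#         for label in unique_labels(y):
#             mask = (y == label)
#             t, _ = st.ttest_ind(X[mask], X[~mask], equal_var=False)
#             scores = np.maximum(scores, np.abs(t))
#         self.top_indices_ = np.argsort(scores)[::-1][:self.w]
#         return self
#
#     def transform(self, X):
#         check_is_fitted(self, 'top_indices_')
#         return check_array(X)[:, self.top_indices_]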


def load_data():
    '''
    Load the training data and class labels from CSV files, shuffle the
    samples, and return X, y, and the random seed used for shuffling.
    '''
    dataFrame = pd.read_csv('pp5i_train.gr.csv')
    dataFrame.set_index('SNO', inplace=True)
    # Samples are stored as columns in the raw file; transpose so that each
    # row is a sample and each column is a gene.
    dataFrame = dataFrame.transpose()
    dataFrame.reset_index(drop=True, inplace=True)
    y = pd.read_csv('pp5i_train_class.txt')
    dataFrame = pd.concat([dataFrame, y], axis=1)
    # Shuffle the samples with a fixed seed for reproducibility.
    myRndSeeds = 72
    dataFrame = dataFrame.sample(
        frac=1, random_state=myRndSeeds).reset_index(drop=True)
    print(dataFrame.shape)
    print(dataFrame.head())
    X = dataFrame.drop('Class', axis=1)
    y = dataFrame['Class']
    return X, y, myRndSeeds


def clean_data(X):
    '''
    Clip expression values to the range [20, 16000] and drop genes whose
    value range across samples (max - min) is 2 or less.
    '''
    X.clip(upper=16000, lower=20, inplace=True)
    print(X.shape)
    # Keep only genes that still vary after clipping.
    X = X.loc[:, X.max() - X.min() > 2]
    print(X.shape)
    return X


if __name__ == "__main__":
    # Loading Dataset
    X, y, myRndSeeds = load_data()
    # Cleaning Dataset
    X = clean_data(X)
    # Feature selection using t-test scores, chained with a classifier in a
    # cached Pipeline; the cache lets GridSearchCV reuse fitted transformers
    # across parameter combinations.
    cachedir = mkdtemp()
    pipe = Pipeline([('featureSelection', TScoreSelection(w=10)),
                     ('classify', KNeighborsClassifier(n_neighbors=1))],
                    memory=cachedir)
    # Numbers of top genes to select
    N_GENES = [2, 4, 6, 8, 10, 12, 15, 20, 25, 30]
    # Candidate hidden-layer sizes for the neural network
    N_LAYERS = [(32,), (64,), (128,)]
    # Hyperparameter Optimization
    tuned_parameters = [
        # KNN Classifier (k = 2, 3, 4)
        {'featureSelection__w': N_GENES,
         'classify': [KNeighborsClassifier()],
         'classify__n_neighbors': [2, 3, 4]
         },
        # Decision Tree Classifier (J48-style algorithm)
        {'featureSelection__w': N_GENES,
         'classify': [tree.DecisionTreeClassifier()],
         'classify__criterion': ['gini', 'entropy'],
         'classify__min_samples_leaf': [1, 3, 5],
         'classify__max_depth': [3, 6, 9],
         # Note: 'presort' exists only in older scikit-learn releases
         # (deprecated in 0.22, removed in 0.24).
         'classify__presort': [True]
         },
        # Neural Network (multi-layer perceptron) Classifier
        {'featureSelection__w': N_GENES,
         'classify': [MLPClassifier()],
         'classify__hidden_layer_sizes': N_LAYERS,
         'classify__activation': ['logistic'],
         'classify__alpha': [0.05, 0.01, 0.005, 0.001],
         'classify__max_iter': [1000],
         'classify__solver': ['lbfgs'],
         'classify__verbose': [True]
         },
        # Naïve Bayes Classifier
        {'featureSelection__w': N_GENES,
         'classify': [naive_bayes.GaussianNB()]
         },
        # AdaBoost Classifier
        {'featureSelection__w': N_GENES,
         'classify': [AdaBoostClassifier()]
         }
    ]
    # Model Selection using Pipeline and Cross-validation
    kfolds = KFold(n_splits=5, shuffle=True, random_state=myRndSeeds)
    model = GridSearchCV(pipe, tuned_parameters, cv=kfolds,
                         return_train_score=True)
    model.fit(X, y)
    results = pd.DataFrame(model.cv_results_)
    print(results.sort_values(by='mean_test_score', ascending=False).head())
    # Best Model
    best_estimator_ = model.best_estimator_
    print(best_estimator_)
    # Running the best model on the Test dataset
    testDataFrame = pd.read_csv('pp5i_test.gr.csv')
    testDataFrame.set_index('SNO', inplace=True)
    X_test = testDataFrame.transpose()
    X_test.reset_index(drop=True, inplace=True)
    # Generating output Y for the given Test Dataset
    Y = pd.DataFrame()
    Y['predicted'] = model.predict(X_test)
    finalResult = Y
    # Final Output
    print(finalResult)
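    # Optionally, the predictions could also be written to disk, e.g.
    # (filename is illustrative only, not part of the original script):
    # finalResult.to_csv('pp5i_test_predictions.csv', index=False)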