algorithm_comparison.py

# import all necessary libraries
import pandas
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

# load the dataset (local path)
url = "data.csv"
# feature names
features = ["MDVP:Fo(Hz)","MDVP:Fhi(Hz)","MDVP:Flo(Hz)","MDVP:Jitter(%)","MDVP:Jitter(Abs)","MDVP:RAP","MDVP:PPQ","Jitter:DDP","MDVP:Shimmer","MDVP:Shimmer(dB)","Shimmer:APQ3","Shimmer:APQ5","MDVP:APQ","Shimmer:DDA","NHR","HNR","RPDE","DFA","spread1","spread2","D2","PPE","status"]
dataset = pandas.read_csv(url, names = features)

# store the dataset as an array for easier processing
array = dataset.values
# X stores the feature values (columns 0-21)
X = array[:,0:22]
# Y stores the "answers", i.e. the class labels (the 'status' column, index 22)
Y = array[:,22]
validation_size = 0.3
# randomize which part of the data is training and which part is validation
seed = 7
# split the dataset into a training set (70%) and a validation set (30%)
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size = validation_size, random_state = seed)

# 10-fold cross validation to estimate accuracy (split the data into 10 parts; train on 9 parts and test on the remaining 1)
num_folds = 10
num_instances = len(X_train)
seed = 7
# use the 'accuracy' metric to evaluate models (correct / total)
scoring = 'accuracy'

# algorithms / models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('NN', MLPClassifier(solver='lbfgs')))
models.append(('NB', GaussianNB()))
models.append(('GB', GradientBoostingClassifier(n_estimators=10000)))

# evaluate each algorithm / model
results = []
names = []
print("Scores for each algorithm:")
for name, model in models:
    kfold = KFold(n_splits = num_folds, shuffle = True, random_state = seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv = kfold, scoring = scoring)
    results.append(cv_results)
    names.append(name)
    # fit on the training set and score on the held-out validation set
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    print(name, accuracy_score(Y_validation, predictions)*100)
    print(matthews_corrcoef(Y_validation, predictions))
    print()
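
# Optional follow-up (a minimal sketch, not from the original script):
# summarize the cross-validation scores collected in `results` above,
# printing the mean and standard deviation per algorithm for easier comparison.
for name, cv_results in zip(names, results):
    print("%s: mean CV accuracy %.3f (std %.3f)" % (name, cv_results.mean(), cv_results.std()))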