# rescaled_data_algorithm_comparison.py
# Trains and evaluates classifiers on min-max rescaled data loaded from data.csv.
# import all necessary libraries
import pandas
from pandas.tools.plotting import scatter_matrix
from sklearn import cross_validation
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
# Load the dataset (local path).
url = "data.csv"

# Column names: 22 feature columns followed by the "status" column.
# Passing them via `names` means the file is read as having no header row.
features = [
    "MDVP:Fo(Hz)", "MDVP:Fhi(Hz)", "MDVP:Flo(Hz)",
    "MDVP:Jitter(%)", "MDVP:Jitter(Abs)", "MDVP:RAP", "MDVP:PPQ", "Jitter:DDP",
    "MDVP:Shimmer", "MDVP:Shimmer(dB)", "Shimmer:APQ3", "Shimmer:APQ5",
    "MDVP:APQ", "Shimmer:DDA",
    "NHR", "HNR", "RPDE", "DFA", "spread1", "spread2", "D2", "PPE",
    "status",
]
dataset = pandas.read_csv(url, names=features)

# Work on the raw values and rescale every column into the range [0, 1].
# NOTE(review): the scaler is fitted on all rows, and on the "status" column
# too, before the train/validation split -- confirm this is intended.
array = dataset.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(array)

# X holds the 22 feature columns; Y holds column 22 ("status"), the class
# label each model is trained to predict.
X, Y = scaled[:, 0:22], scaled[:, 22]

# Hold out 25% of the rows for validation; the fixed seed makes the split
# reproducible between runs.
validation_size = 0.25
seed = 7
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)
# Cross-validation settings: 10 folds over the training split (train on 9
# parts, score on the remaining one), evaluated with plain accuracy
# (correct predictions / total predictions).
num_folds = 10
num_instances = len(X_train)
seed = 7  # same value as used for the train/validation split
scoring = 'accuracy'

# Candidate algorithms as (label, estimator) pairs. Uncomment an entry to
# include that model in the comparison.
models = [
    # ('LR', LogisticRegression()),
    # ('LDA', LinearDiscriminantAnalysis()),
    # ('KNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier()),
    # ('NN', MLPClassifier(solver='lbfgs')),
    # ('NB', GaussianNB()),
    # ('GB', GradientBoostingClassifier(n_estimators=10000)),
]
# Evaluate each algorithm / model.
#
# For every (name, model) pair this section:
#   1. estimates accuracy with k-fold cross-validation on the training split
#      and stores the per-fold scores in `results` (labels go in `names`);
#   2. refits the model on the whole training split and prints its accuracy
#      (as a percentage) and Matthews correlation coefficient on the
#      held-out validation split;
#   3. writes a Graphviz description of the fitted tree to "tree.dot" when
#      the model is a decision tree.
#
# NOTE(review): `sklearn.cross_validation` was deprecated in scikit-learn 0.18
# in favour of `sklearn.model_selection`; migrate these calls together with
# the file's imports.
results = []
names = []
print("Scores for each algorithm:")

# The fold layout depends only on the training-set size and fold count, so
# build it once and reuse it for every model.
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)

for name, model in models:
    # Cross-validated scores on the training split, kept per model.
    cv_results = cross_validation.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)

    # Refit on the full training split, then score on the validation rows.
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    print(name, accuracy_score(Y_validation, predictions)*100)
    print(matthews_corrcoef(Y_validation, predictions))

    # export_graphviz expects a fitted decision tree; skip it for any other
    # estimator type so enabling the other models does not crash the run.
    if isinstance(model, DecisionTreeClassifier):
        tree.export_graphviz(model, out_file="tree.dot")