b/knn.py

# import all necessary libraries
import pandas
import sklearn
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

# load the dataset (local path)
url = "data.csv"
# feature names: 22 voice-measurement columns plus the 'status' class label
features = ["MDVP:Fo(Hz)","MDVP:Fhi(Hz)","MDVP:Flo(Hz)","MDVP:Jitter(%)","MDVP:Jitter(Abs)","MDVP:RAP","MDVP:PPQ","Jitter:DDP","MDVP:Shimmer","MDVP:Shimmer(dB)","Shimmer:APQ3","Shimmer:APQ5","MDVP:APQ","Shimmer:DDA","NHR","HNR","RPDE","DFA","spread1","spread2","D2","PPE","status"]
dataset = pandas.read_csv(url, names=features)

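# optional sanity check (illustrative): all 23 named columns should load as numeric
# values so MinMaxScaler can transform them below; this assumes data.csv has no
# header row, which is how the read_csv call above treats the file
print(dataset.shape)
print(dataset.head())
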
# store the dataset as an array for easier processing
array = dataset.values
# rescale every column to the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(array)
# X stores the feature values (every row, first 22 columns)
X = scaled[:, 0:22]
# Y stores the "answers": the class labels from the 'status' column (every row, last column)
Y = scaled[:, 22]
validation_size = 0.25
# randomize which part of the data is training and which part is validation
seed = 7
# split the dataset into a training set (75%) and a validation set (25%)
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)
print(X_train)
# 10-fold cross-validation to estimate accuracy (split the training data into 10 parts; train on 9 and test on the remaining 1)
num_folds = 10
num_instances = len(X_train)
seed = 7
# use the 'accuracy' metric to evaluate models (correct / total)
scoring = 'accuracy'

results = []
clf = KNeighborsClassifier()
# build the 10-fold splitter; shuffle so random_state takes effect
kfold = sklearn.model_selection.KFold(n_splits=num_folds, shuffle=True, random_state=seed)
cv_results = cross_val_score(clf, X_train, Y_train, cv=kfold, scoring=scoring)
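# summarize the cross-validation estimate (illustrative reporting step): cv_results
# holds the 10 per-fold accuracies returned by cross_val_score above
results.append(cv_results)
print("10-fold CV accuracy: %f (%f)" % (cv_results.mean(), cv_results.std()))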
clf.fit(X_train, Y_train)
predictions = clf.predict(X_validation)
print("KNN")
# accuracy on the held-out validation set, as a percentage
print(accuracy_score(Y_validation, predictions)*100)
# Matthews correlation coefficient of the validation predictions
print(matthews_corrcoef(Y_validation, predictions))
# per-class precision, recall, and F1 scores
print(classification_report(Y_validation, predictions))
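# confusion matrix of the validation predictions, using the confusion_matrix imported above
# (rows: true classes, columns: predicted classes)
print(confusion_matrix(Y_validation, predictions))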