Diff of /scratch.py [000000] .. [5c6b9a]

Switch to unified view

a b/scratch.py
1
"""
2
import pandas as pd
3
from sklearn import svm
4
file = 'data/train.csv'
5
6
train_data = pd.read_csv(file)
7
8
print(train_data.head())
9
10
print(train_data.columns)
11
12
#features = Sex, Age, Pclass, Cabin, SibSp, Parch, Embarked, Name, Ticket
13
#label = Survived
14
15
#'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp','Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'
16
17
#SVM
18
#Bayesian logistic regression
19
kernel = 'rbf'
20
svm.SVC()
21
"""
22
23
# Extract features using sliding window and form the training dataset, test dataset
24
25
from sklearn.ensemble import RandomForestClassifier
26
from sklearn.datasets import make_classification
27
from sklearn.model_selection import train_test_split
28
from sklearn.mixture import GaussianMixture
29
30
import numpy as np
31
32
# Build a synthetic classification dataset: 10,000 samples with 6 features,
# 3 of them informative and none redundant. The fixed random_state makes the
# generated data reproducible between runs.
X, y = make_classification(
    n_samples=10000,
    n_features=6,
    n_informative=3,
    n_redundant=0,
    random_state=0,
    shuffle=True,
)

# Sanity-check the generated array shapes.
print(X.shape)  # 10000x6
print(y.shape)  # 10000
38
39
# TODO: Feature extraction using sliding window
40
41
# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
train_features, test_features, train_labels, test_labels = train_test_split(
    X, y, test_size=0.25, random_state=42
)
# TODO: K-fold cross validation

# Report the size of each partition.
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
49
50
# Shallow random forest: 100 trees, each limited to depth 3. oob_score=True
# additionally records an out-of-bag score computed from the bootstrap samples
# each tree did not train on.
clf = RandomForestClassifier(n_estimators=100, max_depth=3, oob_score=True)

# Fit on the training split only. Fitting on the full X, y would expose the
# model to the held-out rows and invalidate the test evaluation below.
clf.fit(train_features, train_labels)

print(clf.feature_importances_)
#print(clf.oob_decision_function_)
print(clf.oob_score_)

# Evaluate on the held-out split. The labels are class indices, so with the
# default two classes (0/1) this mean absolute difference is the
# misclassification rate rather than a regression-style error.
predictions = clf.predict(test_features)
errors = np.abs(predictions - test_labels)
print("M A E: ", round(np.mean(errors), 2))
62
63
64
# Visualization: export one tree of the fitted forest in Graphviz DOT format.
from sklearn.tree import export_graphviz
import pydot

# Labels for the six feature columns, in column order.
feature_list = [1, 2, 3, 4, 5, 6]

# Pick a single estimator (index 5) out of the forest to inspect.
tree = clf.estimators_[5]

# Dump that tree to 'tree.dot' with rounded node boxes and values shown
# to one decimal place.
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    rounded=True,
    precision=1,
)

# Load the DOT file back in; the unpacking requires exactly one graph.
[graph] = pydot.graph_from_dot_file('tree.dot')

# Write graph to a png file
#graph.write_png('tree_.png')
76
77
# TODO: Confusion matrix, Accuracy
78
79
80
# GMM
81
82
# Fit a 3-component Gaussian mixture, each component with its own full
# covariance matrix, on the complete feature matrix.
# NOTE(review): per the scikit-learn docs GaussianMixture.fit ignores y;
# it is still passed here unchanged -- confirm whether labels were meant
# to play a role.
gmm = GaussianMixture(n_components=3, covariance_type='full').fit(X, y)