Diff of /classification.py [000000] .. [2d53aa]

Switch to unified view

a b/classification.py
1
import numpy as np
2
import pandas as pd
3
from sklearn.model_selection import train_test_split, StratifiedKFold
4
from sklearn import svm
5
from sklearn import metrics
6
7
8
def classification(latent_code, random_seed=42, ten_fold=False,
                   tumour_type_path='data/PANCAN/GDC-PANCAN_both_samples_tumour_type.tsv'):
    """Score an SVM that predicts tumour type from latent codes.

    The tumour-type table is read from ``tumour_type_path`` (tab-separated,
    first column used as the index) and joined to ``latent_code`` on the
    index, so only samples present in both tables are evaluated.

    Args:
        latent_code: pandas DataFrame of latent variables, one row per
            sample, whose index matches the index of the tumour-type table.
        random_seed: Seed passed to the data splitters and to the SVM.
        ten_fold: If true, run stratified 10-fold cross-validation and
            report mean and standard deviation per metric; otherwise score
            a single stratified hold-out split.
        tumour_type_path: Path of the TSV file that contains a
            ``tumour_type`` column.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` as
        fractions (precision, recall and F1 are weighted averages). With
        ``ten_fold=True`` these are means over the folds and the dict also
        holds ``accuracy_std``, ``precision_std``, ``recall_std`` and
        ``f1_std``. The same metrics are printed to stdout.

    Raises:
        ValueError: If ``latent_code`` and the tumour-type table share no
            sample identifiers.
    """
    tumour_type = pd.read_csv(tumour_type_path, sep='\t', index_col=0)

    # Merge only the label column: any additional annotation columns in the
    # TSV must never end up in the feature matrix.
    latent_code_label = pd.merge(latent_code, tumour_type[['tumour_type']],
                                 left_index=True, right_index=True)
    if latent_code_label.empty:
        raise ValueError('No samples in latent_code match the index of {}'.format(tumour_type_path))

    # separate latent variables and targets (the label is the last column)
    X = latent_code_label.iloc[:, :-1].values
    y = latent_code_label['tumour_type'].values

    if ten_fold:
        # 10-fold cross-validation
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed)
        fold_scores = [
            _fit_and_score(X[train_index], y[train_index], X[test_index], y[test_index], random_seed)
            for train_index, test_index in skf.split(X, y)
        ]
        # One 1-D array per metric, each holding one value per fold
        accuracy_array, precision_array, recall_array, f1_array = (
            np.array(per_fold) for per_fold in zip(*fold_scores)
        )

        accuracy_average = np.mean(accuracy_array)
        precision_average = np.mean(precision_array)
        recall_average = np.mean(recall_array)
        f1_average = np.mean(f1_array)

        accuracy_std = accuracy_array.std()
        precision_std = precision_array.std()
        recall_std = recall_array.std()
        f1_std = f1_array.std()

        print('{:.2f}±{:.2f}%'.format(accuracy_average * 100, accuracy_std * 100))
        print('{:.3f}±{:.3f}'.format(precision_average, precision_std))
        print('{:.3f}±{:.3f}'.format(recall_average, recall_std))
        print('{:.3f}±{:.3f}'.format(f1_average, f1_std))

        return {
            'accuracy': float(accuracy_average),
            'precision': float(precision_average),
            'recall': float(recall_average),
            'f1': float(f1_average),
            'accuracy_std': float(accuracy_std),
            'precision_std': float(precision_std),
            'recall_std': float(recall_std),
            'f1_std': float(f1_std),
        }

    testset_ratio = 0.2
    valset_ratio = 0.5

    # Just one separation
    X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=testset_ratio,
                                                              random_state=random_seed, stratify=y)
    # The held-out part is halved again and only the second half is scored.
    # The first half is not used here; the split is kept so the scored test
    # samples stay the same as before.
    _, X_test, _, y_test = train_test_split(X_holdout, y_holdout, test_size=valset_ratio,
                                            random_state=random_seed, stratify=y_holdout)

    accuracy, precision, recall, f1 = _fit_and_score(X_train, y_train, X_test, y_test, random_seed)

    print('{:.2f}'.format(accuracy * 100))
    print('{:.2f}'.format(precision * 100))
    print('{:.2f}'.format(recall * 100))
    print('{:.2f}'.format(f1 * 100))

    return {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1': float(f1),
    }


def _fit_and_score(X_train, y_train, X_test, y_test, random_seed):
    """Fit an SVM on the training data and score it on the test data.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``, where precision,
        recall and F1 use ``average='weighted'``.
    """
    # Use SVM as classifier
    clf = svm.SVC(gamma='scale', random_state=random_seed)
    clf.fit(X_train, y_train)

    # Test the classifier using the testing set
    y_pred = clf.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average='weighted')
    recall = metrics.recall_score(y_test, y_pred, average='weighted')
    f1 = metrics.f1_score(y_test, y_pred, average='weighted')
    return accuracy, precision, recall, f1