--- a
+++ b/classification.py
@@ -0,0 +1,85 @@
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn import svm
+from sklearn import metrics
+
+
+def _fit_and_score(X_train, y_train, X_test, y_test, random_seed):
+    """Fit an SVM on the training split and score its predictions on the test split.
+
+    Returns a tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1 use
+    ``average='weighted'``.
+    """
+    clf = svm.SVC(gamma='scale', random_state=random_seed)
+    clf.fit(X_train, y_train)
+    y_pred = clf.predict(X_test)
+    return (
+        metrics.accuracy_score(y_test, y_pred),
+        metrics.precision_score(y_test, y_pred, average='weighted'),
+        metrics.recall_score(y_test, y_pred, average='weighted'),
+        metrics.f1_score(y_test, y_pred, average='weighted'),
+    )
+
+
+def classification(latent_code, random_seed=42, ten_fold=False):
+    """Train an SVM to predict tumour type from latent codes and print test metrics.
+
+    ``latent_code`` is inner-joined on its index with the ``tumour_type`` column read from
+    ``data/PANCAN/GDC-PANCAN_both_samples_tumour_type.tsv``, so only samples present in
+    both tables are used. Results are printed; nothing is returned.
+
+    Args:
+        latent_code: DataFrame of latent features whose index matches the first column
+            of the tumour-type TSV (presumably sample IDs; confirm against the data).
+        random_seed: Seed passed to the splitters and to the SVM.
+        ten_fold: If True, run stratified 10-fold cross-validation and print mean±std
+            for each metric; otherwise score one stratified hold-out split.
+
+    Raises:
+        ValueError: If no row of ``latent_code`` has a matching tumour-type label.
+    """
+    tumour_type = pd.read_csv('data/PANCAN/GDC-PANCAN_both_samples_tumour_type.tsv', sep='\t', index_col=0)
+    # Join only the label column so any other columns in the TSV cannot end up among the features.
+    latent_code_label = pd.merge(latent_code, tumour_type[['tumour_type']], left_index=True, right_index=True)
+    if latent_code_label.empty:
+        raise ValueError('No samples in latent_code match the index of the tumour type table')
+
+    # Separate latent variables and targets by column name rather than by position
+    X = latent_code_label.drop(columns='tumour_type').values
+    y = latent_code_label['tumour_type'].values
+
+    if ten_fold:
+        # 10-fold cross-validation
+        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed)
+        fold_scores = [
+            _fit_and_score(X[train_index], y[train_index], X[test_index], y[test_index], random_seed)
+            for train_index, test_index in skf.split(X, y)
+        ]
+        # Transpose the per-fold tuples into one array per metric
+        accuracy_array, precision_array, recall_array, f1_array = (
+            np.array(values) for values in zip(*fold_scores)
+        )
+
+        print('{:.2f}±{:.2f}%'.format(accuracy_array.mean() * 100, accuracy_array.std() * 100))
+        print('{:.3f}±{:.3f}'.format(precision_array.mean(), precision_array.std()))
+        print('{:.3f}±{:.3f}'.format(recall_array.mean(), recall_array.std()))
+        print('{:.3f}±{:.3f}'.format(f1_array.mean(), f1_array.std()))
+
+    else:
+        testset_ratio = 0.2
+        valset_ratio = 0.5
+
+        # Hold out 20% of the samples, stratified by tumour type
+        X_train, X_holdout, y_train, y_holdout = train_test_split(
+            X, y, test_size=testset_ratio, random_state=random_seed, stratify=y)
+        # Halve the hold-out set; the first half (validation) is not used here and only the second half is scored
+        _, X_test, _, y_test = train_test_split(
+            X_holdout, y_holdout, test_size=valset_ratio, random_state=random_seed, stratify=y_holdout)
+
+        accuracy, precision, recall, f1 = _fit_and_score(X_train, y_train, X_test, y_test, random_seed)
+
+        print('{:.2f}'.format(accuracy * 100))
+        print('{:.2f}'.format(precision * 100))
+        print('{:.2f}'.format(recall * 100))
+        print('{:.2f}'.format(f1 * 100))