a b/src/pca_cancer.py
1
import time
2
from datetime import datetime
3
import csv
4
import numpy as np
5
from sklearn.cluster import KMeans
6
from sklearn.decomposition import PCA
7
from sklearn.preprocessing import StandardScaler
8
from sklearn.pipeline import Pipeline
9
from sklearn.cross_validation import train_test_split
10
from sklearn.cross_validation import StratifiedShuffleSplit
11
from sklearn.grid_search import GridSearchCV
12
from sklearn.cross_validation import StratifiedKFold
13
14
15
# Announce the start time. The message is formatted into one string so the
# line prints the same text under Python 2 and Python 3.
print("%s %s" % ("Script start at ", datetime.now().isoformat()))

# Load the combined matrix and split it column-wise: the first three columns
# are metadata, everything after them is the feature matrix.
# NOTE(review): the path is an absolute, machine-specific location.
X = np.load('F:/NYU/Hackathon/numpy_array.npy')
Y = X[:, :3]  # patient_id cancer_type tissue_type
X = X[:, 3:]  # rpm

# Shuffle the rows with a fixed seed so every run sees the same order.
# The permutation length comes from the data itself, so every row is kept
# and no out-of-range index can occur if the sample count changes.
RS = np.random.RandomState(90)
perm = RS.permutation(X.shape[0])

# Apply the same permutation to both arrays to keep rows aligned.
Y = Y[perm]
X = X[perm]

# Hold out 25% of the samples, stratified on metadata column 1 (cancer_type
# according to the column note above), which is also used as the target.
# NOTE(review): train_test_split is imported from sklearn.cross_validation,
# which newer scikit-learn releases replaced with sklearn.model_selection.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y[:, 1], test_size=0.25, random_state=30, stratify=Y[:, 1])

# A fractional n_components asks PCA for a variance fraction rather than a
# fixed component count; fit on the training rows only and print the
# variance of each retained component.
p = PCA(n_components=0.5).fit(X_train)
print(p.explained_variance_)