[785f18]: / coxnet / coxnet_baseline.py

Download this file

98 lines (88 with data), 3.4 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Coxnet: Cox proportional hazards model with an elastic-net (Lasso/ridge) penalty, applied to the Owkin dataset.
Leon Zheng
"""
import preprocessing
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sksurv.util import Surv
import numpy as np
import pandas as pd
# Features
# features = ['original_shape_Compactness2',
# 'original_shape_SphericalDisproportion',
# 'original_shape_SurfaceVolumeRatio',
# 'original_firstorder_Kurtosis',
# 'original_firstorder_MeanAbsoluteDeviation',
# 'original_firstorder_Minimum',
# 'original_glcm_ClusterProminence',
# 'original_glcm_Contrast',
# 'original_glcm_DifferenceEntropy',
# 'original_glcm_DifferenceAverage',
# 'original_glcm_JointEnergy',
# 'original_glcm_Id',
# 'original_glcm_Idm',
# 'original_glcm_Imc1',
# 'original_glcm_Imc2',
# 'original_glcm_Idmn',
# 'original_glcm_Idn',
# 'original_glrlm_ShortRunEmphasis',
# 'original_glrlm_LongRunEmphasis',
# 'original_glrlm_GrayLevelNonUniformity',
# 'original_glrlm_RunPercentage',
# 'original_glrlm_ShortRunLowGrayLevelEmphasis',
# 'original_glrlm_LongRunLowGrayLevelEmphasis',
# 'original_glrlm_LongRunHighGrayLevelEmphasis',
# 'Nstage',
# 'age',
# 'SourceDataset']
# radiomics_features = ['original_shape_Sphericity', 'original_shape_SurfaceVolumeRatio',
# 'original_shape_Maximum3DDiameter', 'original_glcm_JointEntropy', 'original_glcm_Id',
# 'original_glcm_Idm']
# clinical_features = ['SourceDataset', 'Nstage']
# features = radiomics_features + clinical_features
# Columns selected from the Owkin inputs and fed to the model.
features = [
    # Non-radiomics columns (staging, source dataset, age).
    'Mstage', 'Nstage', 'SourceDataset', 'age',
    # Radiomics descriptors: shape and first-order statistics.
    'original_shape_VoxelVolume',
    'original_firstorder_Maximum',
    'original_firstorder_Mean',
    # Radiomics descriptors: texture (GLCM / GLRLM).
    'original_glcm_ClusterProminence',
    'original_glcm_Idm',
    'original_glcm_Idn',
    'original_glrlm_RunPercentage',
]
# Load the Owkin train/test tables, keep only the selected feature columns,
# and normalise both sets together via the project helper.
input_train, output_train, input_test = preprocessing.load_owkin_data()
input_train, input_test = preprocessing.normalizing_input(
    input_train[features],
    input_test[features],
)
# Structured (event, time) target built from the 'Event' and 'SurvivalTime'
# columns of the training outputs, as expected by scikit-survival estimators.
structured_y = Surv.from_dataframe('Event', 'SurvivalTime', output_train)
# Coxnet
# coxnet = CoxnetSurvivalAnalysis()
# print(cross_validate(coxnet, input_train, structured_y, cv=5))
# Randomized hyper-parameter search: n_iter candidates are sampled from the
# value sets below and scored with 5-fold cross-validation (this is not an
# exhaustive grid).
tuned_params = dict(
    l1_ratio=np.linspace(0.01, 0.02, 100),
    n_alphas=range(140, 160),
)
grid_search = RandomizedSearchCV(
    estimator=CoxnetSurvivalAnalysis(),
    param_distributions=tuned_params,
    n_iter=1000,
    cv=5,
    n_jobs=4,
)
grid_search.fit(input_train, structured_y)
# Keep the winning parameter set for the final model fitted later on.
best_params = grid_search.best_params_
print(grid_search.best_score_)
print(best_params)
# Prediction
def predict(model, X, threshold=0.9):
    """Predict a survival time per row of *X* from a survival-function model.

    For each row, the predicted time is the first time point at which the
    estimated survival probability is no longer strictly above *threshold*.
    If the probability never drops that far, the last time point is used.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_survival_function(X)``, which
        must yield one object per row with ``.x`` (time points) and ``.y``
        (survival probabilities at those time points).
    X : pandas.DataFrame
        Input features; its index becomes the index of the result.
    threshold : float, default 0.9
        Survival-probability level that defines the predicted time.

    Returns
    -------
    pandas.DataFrame
        Indexed like *X*, with columns ``'SurvivalTime'`` (predicted time)
        and ``'Event'`` (always NaN). An empty *X* gives an empty frame with
        the same two columns.
    """
    y_pred = []
    for step_fn in model.predict_survival_function(X):
        times = np.asarray(step_fn.x)
        probs = np.asarray(step_fn.y)
        # `~(probs > threshold)` rather than `probs <= threshold`, so that a
        # NaN probability also stops the search, exactly as a `while p >
        # threshold` scan would.
        crossed = np.flatnonzero(~(probs > threshold))
        # Fall back to the last time point when the curve stays above the
        # threshold over the whole time grid.
        idx = crossed[0] if crossed.size else len(probs) - 1
        y_pred.append(times[idx])

    # Allocate the (n_rows, 2) block explicitly so that zero rows still give
    # a well-formed two-column frame; the 'Event' column stays NaN.
    values = np.full((len(y_pred), 2), np.nan)
    values[:, 0] = y_pred
    return pd.DataFrame(values, index=X.index, columns=['SurvivalTime', 'Event'])
# Refit on the full training set with the selected hyper-parameters.
# fit_baseline_model=True is what makes predict_survival_function available
# on the fitted Coxnet model.
coxph = CoxnetSurvivalAnalysis(fit_baseline_model=True, **best_params)
coxph.fit(X=input_train, y=structured_y)
# Predict survival times for the test set and write them out as a CSV file.
y_pred = predict(model=coxph, X=input_test)
y_pred.to_csv(path_or_buf='submission.csv')