[53737a]: / examples / load_3_omics_model.py

Download this file

132 lines (102 with data), 3.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Load the 3-omics data and perform subtype detection on the HCC dataset.
The tsv files used in the original study are available in the ./data folder of this project.
However, these files must first be decompressed with the following Linux command:
gzip -d *.gz
"""
# Python import needed
from simdeep.simdeep_boosting import SimDeepBoosting
from simdeep.config import PATH_THIS_FILE
from collections import OrderedDict
from os.path import isfile
from sys import exit
def main():
    """Run the DeepProg boosting example on the HCC 3-omics dataset.

    Steps performed:

    1. Check that ``meth.tsv``, ``rna.tsv`` and ``mir.tsv`` exist in the data
       folder (i.e. that the ``.gz`` archives were decompressed).
    2. Start Ray, build a ``SimDeepBoosting`` instance on the three omics
       files and fit it.
    3. Run the post-fit analysis calls on the training data.
    4. Load ``rna.tsv`` as a test dataset, predict its labels and plot it.
    5. Shut Ray down, whether or not the previous steps succeeded.

    Prints a message and exits with status 1 if a required tsv file is
    missing.
    """
    path_data = PATH_THIS_FILE + "/../data/"

    # Check that the files were decompressed in the expected folder.
    # An explicit test is used instead of `assert`, because assertions are
    # removed when Python runs with -O and the check would silently vanish.
    required_files = ("meth.tsv", "rna.tsv", "mir.tsv")

    if not all(isfile(path_data + fname) for fname in required_files):
        print('gz files in {0} must be decompressed !\n exiting...'.format(path_data))
        exit(1)

    # Tsv files used in the original study, in the appropriate order
    tsv_files = OrderedDict([
        ('MIR', 'mir.tsv'),
        ('METH', 'meth.tsv'),
        ('RNA', 'rna.tsv'),
    ])

    # File with survival event
    survival_tsv = 'survival.tsv'

    # As test dataset we will use the rna.tsv only
    tsv_test = {'RNA': 'rna.tsv'}
    # Because it is the same data, we should use the same survival file
    test_survival = 'survival.tsv'

    PROJECT_NAME = 'HCC_dataset'
    EPOCHS = 10
    SEED = 10045
    nb_it = 3
    nb_threads = 2

    # Column names passed to SimDeepBoosting as `survival_flag`
    survival_flag = {
        'patient_id': 'Samples',
        'survival': 'days',
        'event': 'event'}

    normalization = {
        # Number of features kept by variance selection; 0 keeps all features
        'NB_FEATURES_TO_KEEP': 100,
        'TRAIN_RANK_NORM': True,
        'TRAIN_CORR_REDUCTION': True,
        'TRAIN_CORR_RANK_NORM': True,
        'TRAIN_ROBUST_SCALE': False,
    }

    # Imported here, after the data check, so a missing data file is reported
    # before Ray is loaded.
    import ray
    ray.init(num_cpus=3)

    # try/finally guarantees that Ray is shut down even if one of the
    # fitting / prediction / plotting steps raises.
    try:
        # Instantiate a DeepProg instance
        boosting = SimDeepBoosting(
            nb_threads=nb_threads,
            nb_it=nb_it,
            split_n_fold=3,
            survival_tsv=survival_tsv,
            training_tsv=tsv_files,
            path_data=path_data,
            project_name=PROJECT_NAME,
            path_results=path_data,
            epochs=EPOCHS,
            survival_flag=survival_flag,
            distribute=True,
            cluster_method="mixture",
            use_autoencoders=True,
            feature_surv_analysis=True,
            normalization=normalization,
            seed=SEED)

        boosting.fit()

        # Predict labels of the training set
        boosting.predict_labels_on_full_dataset()
        boosting.compute_clusters_consistency_for_full_labels()
        boosting.evalutate_cluster_performance()
        boosting.collect_cindex_for_test_fold()
        boosting.collect_cindex_for_full_dataset()
        boosting.compute_pvalue_for_merged_test_fold()
        boosting.compute_feature_scores_per_cluster()
        boosting.write_feature_score_per_cluster()

        # Finally, load the test set
        boosting.load_new_test_dataset(
            tsv_test,
            'test_RNA_only',
            test_survival,
        )

        boosting.predict_labels_on_test_dataset()
        boosting.compute_c_indexes_for_test_dataset()
        boosting.compute_clusters_consistency_for_test_labels()

        # Experimental method to plot the test dataset amongst the class
        # kernel densities
        boosting.plot_supervised_kernel_for_test_sets()
        boosting.plot_supervised_predicted_labels_for_test_sets()

        # All the parameters are attributes of the SimDeep instance:
        # boosting.labels
        # boosting.test_labels
        # boosting.test_labels_proba
        # ... etc...
    finally:
        # Close clusters and free memory
        ray.shutdown()
# Run the example only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()