Switch to unified view

a b/examples/load_3_omics_model.py
1
"""
2
Load the 3-omics model and perform subtype detection on the HCC dataset
3
4
tsv files used in the original study are available in the ./data folder of this project.
5
However, these files must first be decompressed with the following command on Linux:
6
gzip -d *.gz.
7
8
"""
9
10
# Python import needed
11
from simdeep.simdeep_boosting import SimDeepBoosting
12
from simdeep.config import PATH_THIS_FILE
13
14
from collections import OrderedDict
15
16
from os.path import isfile
17
18
from sys import exit
19
20
21
def main():
    """Run the 3-omics example end to end on the HCC dataset.

    Checks that the decompressed tsv matrices are present in the data
    folder, fits a ``SimDeepBoosting`` model on them, runs the evaluation
    and feature-scoring calls on the training data, then loads ``rna.tsv``
    as an RNA-only test dataset and runs the test-set prediction and
    plotting calls.

    Exits the interpreter with status 1 when one of the input matrices is
    missing.
    """
    path_data = PATH_THIS_FILE + "/../data/"

    # Tsv files used in the original study in the appropriate order
    tsv_files = OrderedDict([
        ('MIR', 'mir.tsv'),
        ('METH', 'meth.tsv'),
        ('RNA', 'rna.tsv'),
    ])

    # Testing if the files were decompressed in the good repository.
    # An explicit check is used instead of assert statements because
    # asserts are removed when Python runs with -O.
    if not all(isfile(path_data + fname) for fname in tsv_files.values()):
        print('gz files in {0} must be decompressed !\n exiting...'.format(path_data))
        exit(1)

    # File with survival event
    survival_tsv = 'survival.tsv'

    # As test dataset we will use the rna.tsv only
    tsv_test = {'RNA': 'rna.tsv'}
    # because it is the same data, we should use the same survival file
    test_survival = 'survival.tsv'

    PROJECT_NAME = 'HCC_dataset'
    EPOCHS = 10
    SEED = 10045
    nb_it = 3
    nb_threads = 2

    # Column names passed to SimDeepBoosting as ``survival_flag``
    survival_flag = {
        'patient_id': 'Samples',
        'survival': 'days',
        'event': 'event'}

    normalization = {
        'NB_FEATURES_TO_KEEP': 100,  # variance selection features. 0 is all the feature
        'TRAIN_RANK_NORM': True,
        'TRAIN_CORR_REDUCTION': True,
        'TRAIN_CORR_RANK_NORM': True,
        'TRAIN_ROBUST_SCALE': False,
    }

    # ray is imported here, as in the original example, so that it is only
    # required once the input files have been validated.
    import ray
    ray.init(num_cpus=3)

    # try/finally guarantees that ray is shut down even if a step fails
    try:
        # Instantiate a DeepProg instance
        boosting = SimDeepBoosting(
            nb_threads=nb_threads,
            nb_it=nb_it,
            split_n_fold=3,
            survival_tsv=survival_tsv,
            training_tsv=tsv_files,
            path_data=path_data,
            project_name=PROJECT_NAME,
            path_results=path_data,
            epochs=EPOCHS,
            survival_flag=survival_flag,
            distribute=True,
            cluster_method="mixture",
            use_autoencoders=True,
            feature_surv_analysis=True,
            normalization=normalization,
            seed=SEED)

        boosting.fit()

        # predict labels of the training
        boosting.predict_labels_on_full_dataset()
        boosting.compute_clusters_consistency_for_full_labels()
        boosting.evalutate_cluster_performance()
        boosting.collect_cindex_for_test_fold()
        boosting.collect_cindex_for_full_dataset()
        boosting.compute_pvalue_for_merged_test_fold()

        boosting.compute_feature_scores_per_cluster()
        boosting.write_feature_score_per_cluster()

        # Finally, load test set
        boosting.load_new_test_dataset(
            tsv_test,
            'test_RNA_only',
            test_survival,
        )

        boosting.predict_labels_on_test_dataset()
        boosting.compute_c_indexes_for_test_dataset()
        boosting.compute_clusters_consistency_for_test_labels()

        # Experimental method to plot the test dataset amongst the class kernel densities
        boosting.plot_supervised_kernel_for_test_sets()
        boosting.plot_supervised_predicted_labels_for_test_sets()

        # All the parameters are attributes of the SimDeep instance:

        # boosting.labels
        # boosting.test_labels
        # boosting.test_labels_proba
        # ... etc...
    finally:
        # Close clusters and free memory
        ray.shutdown()
128
129
130
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()