DeepProg / Git / Diff of /examples/load_3_omics

Models:

AlyssaS/

DeepProg

Downloads: 1

Diff of /examples/load_3_omics_model.py [000000] .. [53737a]

Switch to unified view

 b/examples/load_3_omics_model.py
+"""
+Load the 3-omics and perform subtype detecion from the HCC dataset
+tsv files used in the original study are available in the ./data folder of this project.
+However, theses files must be decompressed using this function in linux:
+gzip -d *.gz.
+"""
+# Python import needed
+from simdeep.simdeep_boosting import SimDeepBoosting
+from simdeep.config import PATH_THIS_FILE
+from collections import OrderedDict
+from os.path import isfile
+from sys import exit
+def main():
+    """ Main function excecuted """
+    path_data = PATH_THIS_FILE + "/../data/"
+    # Testing if the files were decompressed in the good repository
+    try:
+        assert(isfile(path_data + "meth.tsv"))
+        assert(isfile(path_data + "rna.tsv"))
+        assert(isfile(path_data + "mir.tsv"))
+    except AssertionError:
+        print('gz files in {0} must be decompressed !\n exiting...'.format(path_data))
+        exit(1)
+    # Tsv files used in the original study in the appropriate order
+    tsv_files = OrderedDict([
+        ('MIR', 'mir.tsv'),
+        ('METH', 'meth.tsv'),
+        ('RNA', 'rna.tsv'),
+    ])
+    # File with survival event
+    survival_tsv = 'survival.tsv'
+    # As test dataset we will use the rna.tsv only
+    tsv_test = {'RNA': 'rna.tsv'}
+    # because it is the same data, we should use the same survival file
+    test_survival = 'survival.tsv'
+    PROJECT_NAME = 'HCC_dataset'
+    EPOCHS = 10
+    SEED = 10045
+    nb_it = 3
+    nb_threads = 2
+    survival_flag = {
+        'patient_id': 'Samples',
+        'survival': 'days',
+        'event': 'event'}
+    import ray
+    ray.init(num_cpus=3)
+    normalization = {
+        'NB_FEATURES_TO_KEEP': 100, # variance selection features. 0 is all the feature
+        'TRAIN_RANK_NORM': True,
+        'TRAIN_CORR_REDUCTION': True,
+        'TRAIN_CORR_RANK_NORM': True,
+        'TRAIN_ROBUST_SCALE': False,
+    }
+    # Instanciate a DeepProg instance
+    boosting = SimDeepBoosting(
+        nb_threads=nb_threads,
+        nb_it=nb_it,
+        split_n_fold=3,
+        survival_tsv=survival_tsv,
+        training_tsv=tsv_files,
+        path_data=path_data,
+        project_name=PROJECT_NAME,
+        path_results=path_data,
+        epochs=EPOCHS,
+        survival_flag=survival_flag,
+        distribute=True,
+        cluster_method="mixture",
+        use_autoencoders=True,
+        feature_surv_analysis=True,
+        normalization=normalization,
+        seed=SEED)
+    boosting.fit()
+    # predict labels of the training
+    boosting.predict_labels_on_full_dataset()
+    boosting.compute_clusters_consistency_for_full_labels()
+    boosting.evalutate_cluster_performance()
+    boosting.collect_cindex_for_test_fold()
+    boosting.collect_cindex_for_full_dataset()
+    boosting.compute_pvalue_for_merged_test_fold()
+    boosting.compute_feature_scores_per_cluster()
+    boosting.write_feature_score_per_cluster()
+    # Finally, load test set
+    boosting.load_new_test_dataset(
+        tsv_test,
+        'test_RNA_only',
+        test_survival,
+    )
+    boosting.predict_labels_on_test_dataset()
+    boosting.compute_c_indexes_for_test_dataset()
+    boosting.compute_clusters_consistency_for_test_labels()
+    # Experimental method to plot the test dataset amongst the class kernel densities
+    boosting.plot_supervised_kernel_for_test_sets()
+    boosting.plot_supervised_predicted_labels_for_test_sets()
+    #All the parameters are attributes of the SimDeep instance:
+    # boosting.labels
+    # boosting.test_labels
+    # boosting.test_labels_proba
+    # ... etc...
+    # Close clusters and free memory
+    ray.shutdown()
+if __name__ == "__main__":
+    main()