Diff of /datasets/c_dataset.py [000000] .. [03464c]

Switch to unified view

a b/datasets/c_dataset.py
1
import os.path
2
from datasets import load_file
3
from datasets import get_survival_y_true
4
from datasets.basic_dataset import BasicDataset
5
import numpy as np
6
import pandas as pd
7
import torch
8
9
10
class CDataset(BasicDataset):
    """
    A dataset class for miRNA expression dataset.
    File should be prepared as '/path/to/data/C.tsv'.
    For each omics file, each column should be a sample and each row should be a molecular feature.

    Only the C omics type is loaded by this class. The A and B positions of
    ``omics_dims`` and of the returned ``input_omics`` list are filled with
    placeholders so that the positional layout [A, B, C] is kept.
    """

    def __init__(self, param):
        """
        Initialize this dataset class.

        Parameters:
            param -- experiment options. Attributes read here: data_root,
                     use_sample_list, use_feature_lists, downstream_task and,
                     depending on the task, survival_loss, stratify_label and
                     task_num. add_channel is read through self.param, which
                     is expected to be set by BasicDataset.__init__.
        """
        BasicDataset.__init__(self, param)
        # First dimension is for gene expression (A), second is for DNA
        # methylation (B); neither is loaded here, so both are None.
        self.omics_dims = [None, None]

        # Load data for C
        C_df = load_file(param, 'C')

        # Get the sample list
        if param.use_sample_list:
            sample_list_path = os.path.join(param.data_root, 'sample_list.tsv')
            # NOTE(review): dtype '<U32' keeps at most 32 characters per entry.
            self.sample_list = np.loadtxt(sample_list_path, delimiter='\t', dtype='<U32')
        else:
            self.sample_list = C_df.columns

        # Get the feature list for C
        if param.use_feature_lists:
            feature_list_C_path = os.path.join(param.data_root, 'feature_list_C.tsv')
            feature_list_C = np.loadtxt(feature_list_C_path, delimiter='\t', dtype='<U32')
        else:
            feature_list_C = C_df.index

        # Restrict (and order) rows/columns by the selected features/samples.
        C_df = C_df.loc[feature_list_C, self.sample_list]
        self.C_dim = C_df.shape[0]
        self.sample_num = C_df.shape[1]
        C_array = C_df.values
        if self.param.add_channel:
            # Add one dimension for the channel
            C_array = C_array[np.newaxis, :, :]
        self.C_tensor_all = torch.Tensor(C_array)
        self.omics_dims.append(self.C_dim)

        self.class_num = 0
        # Stays None unless survival targets are built for the MTLR loss, so
        # that __getitem__ can test for it instead of raising AttributeError.
        self.y_true_tensor = None

        task = param.downstream_task
        if task == 'classification':
            self.labels_array, self.class_num = self._load_labels(param.data_root)
        elif task == 'regression':
            self._load_values(param.data_root)
        elif task == 'survival':
            self._load_survival(param)
            if param.stratify_label:
                # Labels are loaded here without updating class_num.
                self.labels_array, _ = self._load_labels(param.data_root)
        elif task == 'multitask':
            self.labels_array, self.class_num = self._load_labels(param.data_root)
            self._load_values(param.data_root)
            self._load_survival(param)
        elif task == 'alltask':
            # One label file per classification task: labels_1.tsv, labels_2.tsv, ...
            # The remaining two tasks are regression and survival.
            self.labels_array = []
            self.class_num = []
            for i in range(param.task_num - 2):
                labels, class_num = self._load_labels(param.data_root, 'labels_' + str(i + 1) + '.tsv')
                self.labels_array.append(labels)
                self.class_num.append(class_num)
            self._load_values(param.data_root)
            self._load_survival(param)

    def _read_sample_table(self, data_root, file_name):
        """Read a TSV from data_root indexed by its first column and keep the rows in self.sample_list."""
        table_path = os.path.join(data_root, file_name)
        return pd.read_csv(table_path, sep='\t', header=0, index_col=0).loc[self.sample_list, :]

    def _load_labels(self, data_root, file_name='labels.tsv'):
        """Return (label values, number of distinct labels) taken from the last column of a label file."""
        label_column = self._read_sample_table(data_root, file_name).iloc[:, -1]
        return label_column.values, len(label_column.unique())

    def _load_values(self, data_root):
        """Load regression targets from the last column of 'values.tsv' and record their range."""
        values_df = self._read_sample_table(data_root, 'values.tsv')
        self.values_array = values_df.iloc[:, -1].astype(float).values
        self.values_max = self.values_array.max()
        self.values_min = self.values_array.min()

    def _load_survival(self, param):
        """Load survival T (second-to-last column) and E (last column) from 'survival.tsv'."""
        survival_df = self._read_sample_table(param.data_root, 'survival.tsv')
        self.survival_T_array = survival_df.iloc[:, -2].astype(float).values
        self.survival_E_array = survival_df.iloc[:, -1].values
        self.survival_T_max = self.survival_T_array.max()
        self.survival_T_min = self.survival_T_array.min()
        if param.survival_loss == 'MTLR':
            self.y_true_tensor = get_survival_y_true(param, self.survival_T_array, self.survival_E_array)

    def __getitem__(self, index):
        """
        Return a data point and its metadata information.

        Parameters:
            index (int)                     -- the index of the sample to fetch

        Returns a dictionary whose keys depend on the downstream task
            input_omics (list)              -- [A placeholder, B placeholder, C_tensor]
            label                           -- label of the sample (a list of labels for 'alltask')
            value                           -- regression target of the sample
            survival_T, survival_E          -- survival time and event of the sample
            y_true                          -- row of the survival target tensor; only present
                                               when it was built (survival_loss == 'MTLR')
            index (int)                     -- the index of this data point
        """
        # Get the tensor of C (samples are on the last axis)
        if self.param.add_channel:
            C_tensor = self.C_tensor_all[:, :, index]
        else:
            C_tensor = self.C_tensor_all[:, index]

        # A is not loaded by this dataset: scalar placeholder
        A_tensor = 0

        # B is not loaded by this dataset: placeholder
        if self.param.ch_separate:
            # presumably one entry per chromosome group expected for B -- TODO confirm
            B_tensor = list(np.zeros(23))
        else:
            B_tensor = 0

        task = self.param.downstream_task
        # Keys are inserted in the order: input_omics, label, value,
        # survival_T, survival_E, y_true, index.
        sample = {'input_omics': [A_tensor, B_tensor, C_tensor]}

        # Get label
        if task in ('classification', 'multitask'):
            sample['label'] = self.labels_array[index]
        elif task == 'alltask':
            sample['label'] = [self.labels_array[i][index] for i in range(self.param.task_num - 2)]

        # Get target value
        if task in ('regression', 'multitask', 'alltask'):
            sample['value'] = self.values_array[index]

        # Get survival T and E
        if task in ('survival', 'multitask', 'alltask'):
            sample['survival_T'] = self.survival_T_array[index]
            sample['survival_E'] = self.survival_E_array[index]
            # The target tensor only exists for the MTLR loss; skip it
            # otherwise rather than failing on a missing attribute.
            if self.y_true_tensor is not None:
                sample['y_true'] = self.y_true_tensor[index, :]

        sample['index'] = index
        return sample

    def __len__(self):
        """
        Return the number of data points in the dataset.
        """
        return self.sample_num
202