Diff of /datasets/abc_dataset.py [000000] .. [03464c]

Switch to unified view

a b/datasets/abc_dataset.py
1
import os.path
2
from datasets import load_file
3
from datasets import get_survival_y_true
4
from datasets.basic_dataset import BasicDataset
5
from util import preprocess
6
import numpy as np
7
import pandas as pd
8
import torch
9
10
11
class ABCDataset(BasicDataset):
    """
    A dataset class for multi-omics data made of three omics types (A, B and C).

    For gene expression data, file should be prepared as '/path/to/data/A.tsv'.
    For DNA methylation data, file should be prepared as '/path/to/data/B.tsv'.
    For miRNA expression data, file should be prepared as '/path/to/data/C.tsv'.
    For each omics file, each column should be a sample and each row should be a molecular feature.
    """

    def __init__(self, param):
        """
        Initialize this dataset class.

        Loads the three omics matrices, restricts them to the selected features and samples,
        converts them to tensors and then loads the targets required by param.downstream_task.

        Parameters:
            param -- the experiment options. The attributes read here are data_root, use_sample_list,
                     use_feature_lists, ch_separate, downstream_task and, depending on the task,
                     survival_loss, stratify_label and task_num. add_channel is read through
                     self.param, which is expected to be set by BasicDataset.__init__.
        """
        BasicDataset.__init__(self, param)
        self.omics_dims = []
        # Stays None unless a survival-type task is loaded with the 'MTLR' survival loss, so that
        # __getitem__ can tell whether the MTLR targets exist instead of raising AttributeError.
        self.y_true_tensor = None

        # Load data for A
        A_df = load_file(param, 'A')
        # Get the sample list (the columns of A are used when no explicit sample list is requested)
        if param.use_sample_list:
            self.sample_list = self._read_name_list(param.data_root, 'sample_list.tsv')
        else:
            self.sample_list = A_df.columns
        A_df = self._select_omics(param, A_df, 'A')
        self.A_dim = A_df.shape[0]
        self.sample_num = A_df.shape[1]
        self.A_tensor_all = self._to_tensor(A_df)
        self.omics_dims.append(self.A_dim)

        # Load data for B
        B_df = self._select_omics(param, load_file(param, 'B'), 'B')
        if param.ch_separate:
            B_df_list, self.B_dim = preprocess.separate_B(B_df)
            # One tensor per part returned by separate_B; the number of parts is taken from
            # the returned collection rather than being hard-coded.
            self.B_tensor_all = [self._to_tensor(B_df_list[i]) for i in range(len(B_df_list))]
        else:
            self.B_dim = B_df.shape[0]
            self.B_tensor_all = self._to_tensor(B_df)
        self.omics_dims.append(self.B_dim)

        # Load data for C
        C_df = self._select_omics(param, load_file(param, 'C'), 'C')
        self.C_dim = C_df.shape[0]
        self.C_tensor_all = self._to_tensor(C_df)
        self.omics_dims.append(self.C_dim)

        # Load the targets of the downstream task
        self.class_num = 0
        task = param.downstream_task
        if task == 'classification':
            self.labels_array, self.class_num = self._load_labels(param, 'labels.tsv')
        elif task == 'regression':
            self._load_values(param)
        elif task == 'survival':
            self._load_survival(param)
            if param.stratify_label:
                # Labels are loaded as well, but class_num is deliberately left at 0 here
                self.labels_array, _ = self._load_labels(param, 'labels.tsv')
        elif task == 'multitask':
            self.labels_array, self.class_num = self._load_labels(param, 'labels.tsv')
            self._load_values(param)
            self._load_survival(param)
        elif task == 'alltask':
            # One label file per classification task: labels_1.tsv, labels_2.tsv, ...
            self.labels_array = []
            self.class_num = []
            for i in range(param.task_num - 2):
                labels, class_num = self._load_labels(param, 'labels_' + str(i + 1) + '.tsv')
                self.labels_array.append(labels)
                self.class_num.append(class_num)
            self._load_values(param)
            self._load_survival(param)

    @staticmethod
    def _read_name_list(data_root, file_name):
        """
        Read a list of names (sample or feature IDs) from a tab-separated file under data_root.
        """
        # NOTE(review): '<U32' is a fixed-width dtype, so IDs longer than 32 characters would not
        # be kept intact -- confirm that all sample / feature IDs fit.
        return np.loadtxt(os.path.join(data_root, file_name), delimiter='\t', dtype='<U32')

    def _select_omics(self, param, omics_df, omics_name):
        """
        Restrict one omics matrix to the selected features (rows) and to self.sample_list (columns).
        """
        if param.use_feature_lists:
            feature_list = self._read_name_list(param.data_root, 'feature_list_' + omics_name + '.tsv')
        else:
            feature_list = omics_df.index
        return omics_df.loc[feature_list, self.sample_list]

    def _to_tensor(self, omics_df):
        """
        Convert a (feature x sample) data frame to a tensor, adding a leading channel axis if requested.
        """
        omics_array = omics_df.values
        if self.param.add_channel:
            # Add one dimension for the channel
            omics_array = omics_array[np.newaxis, :, :]
        return torch.Tensor(omics_array)

    def _read_target_table(self, param, file_name):
        """
        Read a tab-separated target file under param.data_root, with its rows ordered as self.sample_list.
        """
        target_path = os.path.join(param.data_root, file_name)
        return pd.read_csv(target_path, sep='\t', header=0, index_col=0).loc[self.sample_list, :]

    def _load_labels(self, param, file_name):
        """
        Return the labels stored in the last column of file_name and the number of distinct labels.
        """
        label_column = self._read_target_table(param, file_name).iloc[:, -1]
        return label_column.values, len(label_column.unique())

    def _load_values(self, param):
        """
        Load the regression targets (last column of values.tsv) together with their max and min.
        """
        values_df = self._read_target_table(param, 'values.tsv')
        self.values_array = values_df.iloc[:, -1].astype(float).values
        self.values_max = self.values_array.max()
        self.values_min = self.values_array.min()

    def _load_survival(self, param):
        """
        Load survival T (second-to-last column) and E (last column) from survival.tsv.

        The MTLR target tensor is only built when param.survival_loss is 'MTLR'.
        """
        survival_df = self._read_target_table(param, 'survival.tsv')
        self.survival_T_array = survival_df.iloc[:, -2].astype(float).values
        self.survival_E_array = survival_df.iloc[:, -1].values
        self.survival_T_max = self.survival_T_array.max()
        self.survival_T_min = self.survival_T_array.min()
        if param.survival_loss == 'MTLR':
            self.y_true_tensor = get_survival_y_true(param, self.survival_T_array, self.survival_E_array)

    def _sample_slice(self, tensor_all, index):
        """
        Pick the data of one sample; the sample axis is the last axis of the stored tensor.
        """
        if self.param.add_channel:
            return tensor_all[:, :, index]
        return tensor_all[:, index]

    def __getitem__(self, index):
        """
        Return a data point and its metadata information.

        Returns a dictionary whose content depends on param.downstream_task
            input_omics (list)              -- [A_tensor, B_tensor, C_tensor]; B_tensor is itself a list
                                               of tensors when param.ch_separate is set
            label                           -- label of the sample (classification, multitask), or a
                                               list with one label per label file (alltask)
            value                           -- target value (regression, multitask, alltask)
            survival_T, survival_E          -- survival data (survival, multitask, alltask)
            y_true                          -- row of the MTLR target tensor; only present for the
                                               survival-type tasks when that tensor was built
            index (int)                     -- the index of this data point
        """
        # Get the tensor of A
        A_tensor = self._sample_slice(self.A_tensor_all, index)

        # Get the tensor of B
        if self.param.ch_separate:
            # A list of tensors, one per separated part
            B_tensor = [self._sample_slice(B_part, index) for B_part in self.B_tensor_all]
        else:
            # A single tensor
            B_tensor = self._sample_slice(self.B_tensor_all, index)

        # Get the tensor of C
        C_tensor = self._sample_slice(self.C_tensor_all, index)

        task = self.param.downstream_task
        # The keys are inserted in a fixed order: input_omics, label, value, survival_*, y_true, index
        sample = {'input_omics': [A_tensor, B_tensor, C_tensor]}

        if task in ('classification', 'multitask'):
            sample['label'] = self.labels_array[index]
        elif task == 'alltask':
            sample['label'] = [labels[index] for labels in self.labels_array]

        if task in ('regression', 'multitask', 'alltask'):
            sample['value'] = self.values_array[index]

        if task in ('survival', 'multitask', 'alltask'):
            sample['survival_T'] = self.survival_T_array[index]
            sample['survival_E'] = self.survival_E_array[index]
            # y_true_tensor only exists for the MTLR loss; skip the key otherwise instead of failing
            if self.y_true_tensor is not None:
                sample['y_true'] = self.y_true_tensor[index, :]

        sample['index'] = index
        return sample

    def __len__(self):
        """
        Return the number of data points in the dataset.
        """
        return self.sample_num
261