Diff of /datasets/ab_dataset.py [000000] .. [03464c]

Switch to unified view

a b/datasets/ab_dataset.py
1
import os.path
2
from datasets import load_file
3
from datasets import get_survival_y_true
4
from datasets.basic_dataset import BasicDataset
5
from util import preprocess
6
import numpy as np
7
import pandas as pd
8
import torch
9
10
11
class ABDataset(BasicDataset):
    """
    A dataset class for a multi-omics dataset made of two omics types (A and B).

    For gene expression data, the file should be prepared as '/path/to/data/A.tsv'.
    For DNA methylation data, the file should be prepared as '/path/to/data/B.tsv'.
    In each omics file, every column is one sample and every row is one molecular feature.
    """

    def __init__(self, param):
        """
        Initialize this dataset class.

        Parameters:
            param -- experiment options; this class reads data_root, use_sample_list, use_feature_lists,
                     add_channel, ch_separate, downstream_task and, depending on the task,
                     survival_loss, stratify_label and task_num
        """
        BasicDataset.__init__(self, param)
        self.omics_dims = []

        # Load data for A
        A_df = load_file(param, 'A')
        # Get the sample list
        if param.use_sample_list:
            sample_list_path = os.path.join(param.data_root, 'sample_list.tsv')       # get the path of sample list
            # NOTE: dtype '<U32' stores at most 32 characters per entry
            self.sample_list = np.loadtxt(sample_list_path, delimiter='\t', dtype='<U32')
        else:
            self.sample_list = A_df.columns
        # Keep only the selected features (rows) and samples (columns) of A
        A_df = A_df.loc[self._get_feature_list(param, 'A', A_df), self.sample_list]
        self.A_dim = A_df.shape[0]
        self.sample_num = A_df.shape[1]
        self.A_tensor_all = self._to_tensor(A_df)
        self.omics_dims.append(self.A_dim)

        # Load data for B
        B_df = load_file(param, 'B')
        # Keep only the selected features (rows) and samples (columns) of B
        B_df = B_df.loc[self._get_feature_list(param, 'B', B_df), self.sample_list]
        if param.ch_separate:
            # B is split into several dataframes; B_dim is whatever separate_B returns as its second value.
            # The number of parts is taken from the returned list instead of a hard-coded 23
            # (presumably one part per chromosome -- TODO confirm against preprocess.separate_B).
            B_df_list, self.B_dim = preprocess.separate_B(B_df)
            self.B_tensor_all = [self._to_tensor(B_df_part) for B_df_part in B_df_list]
        else:
            self.B_dim = B_df.shape[0]
            self.B_tensor_all = self._to_tensor(B_df)
        self.omics_dims.append(self.B_dim)

        # Load the targets required by the downstream task
        self.class_num = 0
        task = param.downstream_task
        if task == 'classification':
            self._load_labels(param)
        elif task == 'regression':
            self._load_values(param)
        elif task == 'survival':
            self._load_survival(param)
            if param.stratify_label:
                # Labels are loaded here without touching class_num
                self.labels_array = self._read_target_table(param, 'labels.tsv').iloc[:, -1].values
        elif task == 'multitask':
            self._load_labels(param)
            self._load_values(param)
            self._load_survival(param)
        elif task == 'alltask':
            # One label file per classification task: labels_1.tsv ... labels_{task_num-2}.tsv
            self.labels_array = []
            self.class_num = []
            for i in range(param.task_num - 2):
                label_column = self._read_target_table(param, 'labels_' + str(i + 1) + '.tsv').iloc[:, -1]
                self.labels_array.append(label_column.values)
                # Get the class number
                self.class_num.append(len(label_column.unique()))
            self._load_values(param)
            self._load_survival(param)

    def _get_feature_list(self, param, omics_name, omics_df):
        """
        Return the row labels to keep for one omics type.

        Reads 'feature_list_<omics_name>.tsv' from param.data_root when param.use_feature_lists is set,
        otherwise returns the full index of omics_df.
        """
        if param.use_feature_lists:
            feature_list_path = os.path.join(param.data_root, 'feature_list_' + omics_name + '.tsv')
            return np.loadtxt(feature_list_path, delimiter='\t', dtype='<U32')
        return omics_df.index

    def _to_tensor(self, omics_df):
        """
        Convert a (feature x sample) dataframe into a torch tensor, adding a leading channel
        dimension when self.param.add_channel is set.
        """
        omics_array = omics_df.values
        if self.param.add_channel:
            # Add one dimension for the channel
            omics_array = omics_array[np.newaxis, :, :]
        return torch.Tensor(omics_array)

    def _read_target_table(self, param, file_name):
        """
        Read a tab-separated target file from param.data_root and return its rows ordered as self.sample_list.
        """
        table_path = os.path.join(param.data_root, file_name)
        return pd.read_csv(table_path, sep='\t', header=0, index_col=0).loc[self.sample_list, :]

    def _load_labels(self, param):
        """
        Load the last column of 'labels.tsv' into self.labels_array and set self.class_num
        to its number of unique values.
        """
        label_column = self._read_target_table(param, 'labels.tsv').iloc[:, -1]
        self.labels_array = label_column.values
        # Get the class number
        self.class_num = len(label_column.unique())

    def _load_values(self, param):
        """
        Load the last column of 'values.tsv' as floats into self.values_array and record its max and min.
        """
        values_df = self._read_target_table(param, 'values.tsv')
        self.values_array = values_df.iloc[:, -1].astype(float).values
        self.values_max = self.values_array.max()
        self.values_min = self.values_array.min()

    def _load_survival(self, param):
        """
        Load 'survival.tsv': the second-to-last column becomes self.survival_T_array (as float) and the
        last column becomes self.survival_E_array. Also records the max and min of T and, when
        param.survival_loss is 'MTLR', builds self.y_true_tensor with get_survival_y_true.
        """
        survival_df = self._read_target_table(param, 'survival.tsv')
        self.survival_T_array = survival_df.iloc[:, -2].astype(float).values
        self.survival_E_array = survival_df.iloc[:, -1].values
        self.survival_T_max = self.survival_T_array.max()
        self.survival_T_min = self.survival_T_array.min()
        if param.survival_loss == 'MTLR':
            self.y_true_tensor = get_survival_y_true(param, self.survival_T_array, self.survival_E_array)

    def _select_sample(self, tensor_all, index):
        """
        Slice one sample out of a stacked omics tensor; the sample axis is the last one.
        """
        if self.param.add_channel:
            return tensor_all[:, :, index]
        return tensor_all[:, index]

    def _survival_items(self, index):
        """
        Return the survival-related entries ('survival_T', 'survival_E', 'y_true') for one sample.
        """
        # NOTE(review): y_true_tensor is only created when survival_loss == 'MTLR';
        # with any other setting this lookup raises AttributeError.
        return {
            'survival_T': self.survival_T_array[index],
            'survival_E': self.survival_E_array[index],
            'y_true': self.y_true_tensor[index, :],
        }

    def __getitem__(self, index):
        """
        Return a data point and its metadata information.

        Returns a dictionary whose keys depend on the downstream task:
            input_omics (list)              -- [A_tensor, B_tensor, C_tensor]; B_tensor is a list of tensors
                                               when ch_separate is set, and C_tensor is the placeholder 0
            label                           -- label of the sample (classification, multitask), or a list
                                               with one label per label file (alltask)
            value                           -- target value of the sample (regression, multitask, alltask)
            survival_T, survival_E, y_true  -- survival data of the sample (survival, multitask, alltask)
            index (int)                     -- the index of this data point
        """
        # Get the tensor of A
        A_tensor = self._select_sample(self.A_tensor_all, index)

        # Get the tensor of B
        if self.param.ch_separate:
            # A list of tensors, one per separated part of B
            B_tensor = [self._select_sample(B_tensor_part, index) for B_tensor_part in self.B_tensor_all]
        else:
            # A single tensor
            B_tensor = self._select_sample(self.B_tensor_all, index)

        # This dataset has no omics type C, so a placeholder is used
        C_tensor = 0

        sample = {'input_omics': [A_tensor, B_tensor, C_tensor]}
        task = self.param.downstream_task
        if task == 'classification':
            sample['label'] = self.labels_array[index]
        elif task == 'regression':
            sample['value'] = self.values_array[index]
        elif task == 'survival':
            sample.update(self._survival_items(index))
        elif task == 'multitask':
            sample['label'] = self.labels_array[index]
            sample['value'] = self.values_array[index]
            sample.update(self._survival_items(index))
        elif task == 'alltask':
            # labels_array holds one array per label file loaded in __init__
            sample['label'] = [task_labels[index] for task_labels in self.labels_array]
            sample['value'] = self.values_array[index]
            sample.update(self._survival_items(index))
        sample['index'] = index
        return sample

    def __len__(self):
        """
        Return the number of data points in the dataset.
        """
        return self.sample_num
240