Diff of /datasets/b_dataset.py [000000] .. [03464c]

Switch to unified view

a b/datasets/b_dataset.py
1
import os.path
2
from datasets import load_file
3
from datasets import get_survival_y_true
4
from datasets.basic_dataset import BasicDataset
5
from util import preprocess
6
import numpy as np
7
import pandas as pd
8
import torch
9
10
11
class BDataset(BasicDataset):
    """
    A dataset class for methylation dataset.
    DNA methylation data file should be prepared as '/path/to/data/B.tsv'.

    For each omics file, each column should be one sample and each row should be one molecular feature.
    """

    def __init__(self, param):
        """
        Initialize this dataset class.

        Parameters:
            param -- the experiment options. The fields read here are data_root, use_sample_list,
                     use_feature_lists, ch_separate, downstream_task and, depending on the task,
                     survival_loss, stratify_label and task_num.
        """
        BasicDataset.__init__(self, param)
        self.omics_dims = []
        self.omics_dims.append(None)            # First dimension is for gene expression (A)

        # Load data for B
        B_df = load_file(param, 'B')
        # Get the sample list
        if param.use_sample_list:
            sample_list_path = os.path.join(param.data_root, 'sample_list.tsv')  # get the path of sample list
            self.sample_list = np.loadtxt(sample_list_path, delimiter='\t', dtype='str')
        else:
            self.sample_list = B_df.columns
        # Get the feature list for B
        if param.use_feature_lists:
            feature_list_B_path = os.path.join(param.data_root, 'feature_list_B.tsv')  # get the path of feature list
            # NOTE(review): '<U32' keeps at most 32 characters per feature name; longer names would
            # presumably be truncated and then fail the lookup below -- confirm against the data.
            feature_list_B = np.loadtxt(feature_list_B_path, delimiter='\t', dtype='<U32')
        else:
            feature_list_B = B_df.index
        # Keep only the selected features (rows) and samples (columns), in list order
        B_df = B_df.loc[feature_list_B, self.sample_list]
        self.sample_num = B_df.shape[1]
        if param.ch_separate:
            B_df_list, self.B_dim = preprocess.separate_B(B_df)
            # Use however many parts separate_B produced instead of a hard-coded count.
            # Parts are still fetched by integer position, exactly as before.
            part_count = len(B_df_list)
            self.B_tensor_all = [self._to_tensor(B_df_list[i].values) for i in range(part_count)]
        else:
            self.B_dim = B_df.shape[0]
            self.B_tensor_all = self._to_tensor(B_df.values)
        self.omics_dims.append(self.B_dim)

        self.class_num = 0
        # Only built when param.survival_loss == 'MTLR'; __getitem__ checks for None
        self.y_true_tensor = None
        if param.downstream_task == 'classification':
            # Load labels and get the class number
            self.labels_array, self.class_num = self._load_labels(param, 'labels.tsv')
        elif param.downstream_task == 'regression':
            # Load target values
            self._load_values(param)
        elif param.downstream_task == 'survival':
            # Load survival data
            self._load_survival(param)
            if param.stratify_label:
                # Labels are loaded here as well; class_num stays 0 for this task
                self.labels_array, _ = self._load_labels(param, 'labels.tsv')
        elif param.downstream_task == 'multitask':
            # Load labels and get the class number
            self.labels_array, self.class_num = self._load_labels(param, 'labels.tsv')
            # Load target values
            self._load_values(param)
            # Load survival data
            self._load_survival(param)
        elif param.downstream_task == 'alltask':
            # Load one label file per classification task: labels_1.tsv, labels_2.tsv, ...
            self.labels_array = []
            self.class_num = []
            for i in range(param.task_num - 2):
                labels, class_num = self._load_labels(param, 'labels_' + str(i + 1) + '.tsv')
                self.labels_array.append(labels)
                self.class_num.append(class_num)
            # Load target values
            self._load_values(param)
            # Load survival data
            self._load_survival(param)

    def _to_tensor(self, B_array):
        """
        Convert an array to a torch.Tensor, adding a leading channel axis when add_channel is set.
        """
        if self.param.add_channel:
            # Add one dimension for the channel
            B_array = B_array[np.newaxis, :, :]
        return torch.Tensor(B_array)

    def _read_sample_table(self, param, file_name):
        """
        Read the tab-separated file '<data_root>/<file_name>' and keep the rows listed in self.sample_list.
        """
        table_path = os.path.join(param.data_root, file_name)
        return pd.read_csv(table_path, sep='\t', header=0, index_col=0).loc[self.sample_list, :]

    def _load_labels(self, param, file_name):
        """
        Return (labels, class_num) taken from the last column of the given label file.
        """
        label_column = self._read_sample_table(param, file_name).iloc[:, -1]
        return label_column.values, len(label_column.unique())

    def _load_values(self, param):
        """
        Load the target values from the last column of 'values.tsv' and record their max and min.
        """
        values_df = self._read_sample_table(param, 'values.tsv')
        self.values_array = values_df.iloc[:, -1].astype(float).values
        self.values_max = self.values_array.max()
        self.values_min = self.values_array.min()

    def _load_survival(self, param):
        """
        Load survival T (second-to-last column) and E (last column) from 'survival.tsv'.

        The y_true tensor is only built when param.survival_loss is 'MTLR'.
        """
        survival_df = self._read_sample_table(param, 'survival.tsv')
        self.survival_T_array = survival_df.iloc[:, -2].astype(float).values
        self.survival_E_array = survival_df.iloc[:, -1].values
        self.survival_T_max = self.survival_T_array.max()
        self.survival_T_min = self.survival_T_array.min()
        if param.survival_loss == 'MTLR':
            self.y_true_tensor = get_survival_y_true(param, self.survival_T_array, self.survival_E_array)

    def _survival_item(self, index):
        """
        Return the survival entries of one data point as a dictionary.

        'y_true' is left out when the y_true tensor was not built, which previously
        raised an AttributeError.
        """
        fields = {'survival_T': self.survival_T_array[index], 'survival_E': self.survival_E_array[index]}
        if self.y_true_tensor is not None:
            fields['y_true'] = self.y_true_tensor[index, :]
        return fields

    def __getitem__(self, index):
        """
        Return a data point and its metadata information.

        Returns a dictionary that always contains
            input_omics (list)              -- [A_tensor, B_tensor, C_tensor]; A and C are the placeholder 0
            index (int)                     -- the index of this data point
        and, depending on the downstream task,
            label                           -- label of the sample (a list of labels for 'alltask')
            value                           -- target value of the sample
            survival_T, survival_E          -- survival entries of the sample
            y_true                          -- row of the y_true tensor, only when it was built
        """
        # Get the tensor of B
        if self.param.ch_separate:
            # A list of tensors, one per part stored by __init__
            if self.param.add_channel:
                B_tensor = [B_tensor_part[:, :, index] for B_tensor_part in self.B_tensor_all]
            else:
                B_tensor = [B_tensor_part[:, index] for B_tensor_part in self.B_tensor_all]
        elif self.param.add_channel:
            # A single tensor
            B_tensor = self.B_tensor_all[:, :, index]
        else:
            B_tensor = self.B_tensor_all[:, index]

        # This dataset only provides B, so A and C are placeholders
        A_tensor = 0
        C_tensor = 0

        item = {'input_omics': [A_tensor, B_tensor, C_tensor]}
        task = self.param.downstream_task
        if task == 'classification':
            item['label'] = self.labels_array[index]
        elif task == 'regression':
            item['value'] = self.values_array[index]
        elif task == 'survival':
            item.update(self._survival_item(index))
        elif task == 'multitask':
            item['label'] = self.labels_array[index]
            item['value'] = self.values_array[index]
            item.update(self._survival_item(index))
        elif task == 'alltask':
            # One label per classification task
            item['label'] = [self.labels_array[i][index] for i in range(self.param.task_num - 2)]
            item['value'] = self.values_array[index]
            item.update(self._survival_item(index))
        item['index'] = index
        return item

    def __len__(self):
        """
        Return the number of data points in the dataset.
        """
        return self.sample_num