Diff of /datasets/a_dataset.py [000000] .. [03464c]

Switch to unified view

a b/datasets/a_dataset.py
1
import os.path
2
from datasets import load_file
3
from datasets import get_survival_y_true
4
from datasets.basic_dataset import BasicDataset
5
import numpy as np
6
import pandas as pd
7
import torch
8
9
10
class ADataset(BasicDataset):
    """
    A dataset class for gene expression dataset.
    File should be prepared as '/path/to/data/A.tsv'.
    For each omics file, each column should be a sample and each row should be a molecular feature.
    """

    def __init__(self, param):
        """
        Initialize this dataset class.

        Parameters:
            param -- the experiment options object. The fields read here are data_root, use_sample_list,
                     use_feature_lists, downstream_task and, depending on the task, survival_loss,
                     stratify_label and task_num. add_channel is read through self.param, which is
                     presumably stored by BasicDataset.__init__ (not visible in this file -- confirm).

        Attributes that are always set: omics_dims, sample_list, A_dim, sample_num, A_tensor_all, class_num.
        Task-specific attributes (labels_array, values_*, survival_*, y_true_tensor) are only set for
        the downstream tasks that need them.
        """
        BasicDataset.__init__(self, param)
        self.omics_dims = []

        # Load data for A
        A_df = load_file(param, 'A')

        # Get the sample list
        if param.use_sample_list:
            sample_list_path = os.path.join(param.data_root, 'sample_list.tsv')
            # ndmin=1 keeps a one-sample file as a 1-D array so it can still be used as a list-like indexer
            self.sample_list = np.loadtxt(sample_list_path, delimiter='\t', dtype='<U32', ndmin=1)
        else:
            self.sample_list = A_df.columns

        # Get the feature list for A
        if param.use_feature_lists:
            feature_list_A_path = os.path.join(param.data_root, 'feature_list_A.tsv')
            feature_list_A = np.loadtxt(feature_list_A_path, delimiter='\t', dtype='<U32', ndmin=1)
        else:
            feature_list_A = A_df.index

        # Keep only the selected features (rows) and samples (columns), in the listed order
        A_df = A_df.loc[feature_list_A, self.sample_list]
        self.A_dim = A_df.shape[0]
        self.sample_num = A_df.shape[1]
        A_array = A_df.values
        if self.param.add_channel:
            # Add one dimension for the channel
            A_array = A_array[np.newaxis, :, :]
        self.A_tensor_all = torch.Tensor(A_array)
        self.omics_dims.append(self.A_dim)

        self.class_num = 0
        task = param.downstream_task
        if task == 'classification':
            self._load_single_label_set(param)
        elif task == 'regression':
            self._load_values(param)
        elif task == 'survival':
            self._load_survival(param)
            if param.stratify_label:
                # Labels are loaded here as well, but class_num deliberately stays 0 for this task
                self.labels_array = self._read_last_column(param, 'labels.tsv').values
        elif task == 'multitask':
            self._load_single_label_set(param)
            self._load_values(param)
            self._load_survival(param)
        elif task == 'alltask':
            # One label file per classification task: labels_1.tsv, labels_2.tsv, ...
            self.labels_array = []
            self.class_num = []
            for task_index in range(param.task_num - 2):
                label_column = self._read_last_column(param, 'labels_' + str(task_index + 1) + '.tsv')
                self.labels_array.append(label_column.values)
                # Get the class number
                self.class_num.append(len(label_column.unique()))
            self._load_values(param)
            self._load_survival(param)

    def _read_sample_table(self, param, file_name):
        """
        Read the tab-separated file '<param.data_root>/<file_name>' (first row as header, first column
        as index) and return its rows selected and ordered by self.sample_list.
        """
        table_path = os.path.join(param.data_root, file_name)
        return pd.read_csv(table_path, sep='\t', header=0, index_col=0).loc[self.sample_list, :]

    def _read_last_column(self, param, file_name):
        """
        Return the last column of the given per-sample table as a pandas Series.
        """
        return self._read_sample_table(param, file_name).iloc[:, -1]

    def _load_single_label_set(self, param):
        """
        Load 'labels.tsv' into self.labels_array and set self.class_num to the number of distinct labels.
        """
        label_column = self._read_last_column(param, 'labels.tsv')
        self.labels_array = label_column.values
        # Get the class number
        self.class_num = len(label_column.unique())

    def _load_values(self, param):
        """
        Load the regression targets from 'values.tsv' (last column, cast to float) together with
        their maximum and minimum.
        """
        self.values_array = self._read_last_column(param, 'values.tsv').astype(float).values
        self.values_max = self.values_array.max()
        self.values_min = self.values_array.min()

    def _load_survival(self, param):
        """
        Load the survival data from 'survival.tsv': the second-to-last column (cast to float) becomes
        survival_T_array and the last column becomes survival_E_array.
        self.y_true_tensor is only built when param.survival_loss is 'MTLR'.
        """
        survival_df = self._read_sample_table(param, 'survival.tsv')
        self.survival_T_array = survival_df.iloc[:, -2].astype(float).values
        self.survival_E_array = survival_df.iloc[:, -1].values
        self.survival_T_max = self.survival_T_array.max()
        self.survival_T_min = self.survival_T_array.min()
        if param.survival_loss == 'MTLR':
            self.y_true_tensor = get_survival_y_true(param, self.survival_T_array, self.survival_E_array)

    def __getitem__(self, index):
        """
        Return a data point and its metadata information.

        Parameters:
            index (int)                     -- the index of the data point

        Returns a dictionary whose keys depend on param.downstream_task, always in this order:
            input_omics (list)              -- [A_tensor, B_tensor, C_tensor]; B and C are placeholders here
            label                           -- label of the sample, or a list with one label per
                                               classification task for 'alltask'
            value                           -- regression target of the sample
            survival_T, survival_E          -- the two survival columns of the sample
            y_true                          -- row of y_true_tensor; only present when that tensor was built
            index (int)                     -- the index of this data point
        """
        # Get the tensor of A (samples are stored along the last axis)
        if self.param.add_channel:
            A_tensor = self.A_tensor_all[:, :, index]
        else:
            A_tensor = self.A_tensor_all[:, index]

        # Placeholder for B: this dataset only provides omics type A
        # NOTE(review): 23 presumably matches the number of separate B channels the model expects -- confirm
        if self.param.ch_separate:
            B_tensor = list(np.zeros(23))
        else:
            B_tensor = 0

        # Placeholder for C
        C_tensor = 0

        task = self.param.downstream_task
        sample = {'input_omics': [A_tensor, B_tensor, C_tensor]}

        # Get label
        if task in ('classification', 'multitask'):
            sample['label'] = self.labels_array[index]
        elif task == 'alltask':
            sample['label'] = [self.labels_array[i][index] for i in range(self.param.task_num - 2)]

        # Get target value
        if task in ('regression', 'multitask', 'alltask'):
            sample['value'] = self.values_array[index]

        # Get survival T and E
        if task in ('survival', 'multitask', 'alltask'):
            sample['survival_T'] = self.survival_T_array[index]
            sample['survival_E'] = self.survival_E_array[index]
            # y_true_tensor only exists when the survival loss is 'MTLR'; leave the key out otherwise
            # instead of failing with AttributeError on every item access
            y_true_tensor = getattr(self, 'y_true_tensor', None)
            if y_true_tensor is not None:
                sample['y_true'] = y_true_tensor[index, :]

        sample['index'] = index
        return sample

    def __len__(self):
        """
        Return the number of data points in the dataset.
        """
        return self.sample_num