# util/data_loading_helpers_modified.py
1
import numpy as np
2
import re
3
4
# Floating-point type used for EEG arrays (numpy half precision).
eeg_float_resolution = np.float16


def _band_feature_names(measure, band):
    """Build the four feature keys for one measure prefix and band letter.

    For example ``('FFD', 'a')`` gives
    ``['FFD_a1', 'FFD_a1_diff', 'FFD_a2', 'FFD_a2_diff']``.
    """
    return ['{}_{}{}{}'.format(measure, band, sub_band, suffix)
            for sub_band in (1, 2)
            for suffix in ('', '_diff')]


# Band letters used in the keys: a = Alpha, b = Beta, g = Gamma, t = Theta.
Alpha_ffd_names = _band_feature_names('FFD', 'a')
Beta_ffd_names = _band_feature_names('FFD', 'b')
Gamma_ffd_names = _band_feature_names('FFD', 'g')
Theta_ffd_names = _band_feature_names('FFD', 't')

Alpha_gd_names = _band_feature_names('GD', 'a')
Beta_gd_names = _band_feature_names('GD', 'b')
Gamma_gd_names = _band_feature_names('GD', 'g')
Theta_gd_names = _band_feature_names('GD', 't')

Alpha_gpt_names = _band_feature_names('GPT', 'a')
Beta_gpt_names = _band_feature_names('GPT', 'b')
Gamma_gpt_names = _band_feature_names('GPT', 'g')
Theta_gpt_names = _band_feature_names('GPT', 't')

Alpha_sfd_names = _band_feature_names('SFD', 'a')
Beta_sfd_names = _band_feature_names('SFD', 'b')
Gamma_sfd_names = _band_feature_names('SFD', 'g')
Theta_sfd_names = _band_feature_names('SFD', 't')

Alpha_trt_names = _band_feature_names('TRT', 'a')
Beta_trt_names = _band_feature_names('TRT', 'b')
Gamma_trt_names = _band_feature_names('TRT', 'g')
Theta_trt_names = _band_feature_names('TRT', 't')

# IF YOU CHANGE THOSE YOU MUST ALSO CHANGE CONSTANTS
# Combined per-band key lists in FFD, GD, GPT, TRT order; the SFD keys are
# currently not part of them.
Alpha_features = [*Alpha_ffd_names, *Alpha_gd_names, *Alpha_gpt_names, *Alpha_trt_names]
Beta_features = [*Beta_ffd_names, *Beta_gd_names, *Beta_gpt_names, *Beta_trt_names]
Gamma_features = [*Gamma_ffd_names, *Gamma_gd_names, *Gamma_gpt_names, *Gamma_trt_names]
Theta_features = [*Theta_ffd_names, *Theta_gd_names, *Theta_gpt_names, *Theta_trt_names]
32
# print(Alpha_features)
33
34
# GD_EEG_features
35
36
37
def extract_all_fixations(data_container, word_data_object, float_resolution = np.float16):
    """
    Extracts all fixations from a word data object
    :param data_container:      (h5py)  Container of the whole data, h5py object
    :param word_data_object:    (h5py)  Key/reference under which the container holds the fixation objects
    :param float_resolution:    (type)  Resolution to which data are to be converted, used for data compression
    :return:
        fixations_data  (list)  Data arrays representing each fixation; empty when the
                                looked-up object has fewer than two dimensions
    """
    word_data = data_container[word_data_object]
    # Entries with fewer than two dimensions are treated as having no fixations.
    if len(word_data.shape) <= 1:
        return []
    # Each row holds, in its first column, the key/reference of one fixation's data.
    return [
        np.array(data_container[word_data[fixation_idx][0]]).astype(float_resolution)
        for fixation_idx in range(word_data.shape[0])
    ]
52
53
54
def is_real_word(word):
    """
    Check if the word is a real word, i.e. contains at least one ASCII letter or digit
    :param word:    (str)   word string
    :return:
        is_word (bool)  True if it is a real word
    """
    # Return an actual bool (as documented) instead of leaking the match object.
    return re.search(r'[a-zA-Z0-9]', word) is not None
63
64
65
def load_matlab_string(matlab_extracted_object):
    """
    Converts a string loaded from h5py into a python string
    :param matlab_extracted_object:     (h5py)  matlab string object; an iterable of rows whose
                                                first element is a character code
    :return:
        extracted_string    (str)   translated string
    """
    # Every row contributes one character: its first element is passed to chr().
    return ''.join(chr(c[0]) for c in matlab_extracted_object)
74
75
76
def _scalar_or_none(data_container, ref_row):
    """Return element [0, 0] of the dataset referenced by ``ref_row[0]``.

    Gives None when ``ref_row`` is None or the dataset is not two-dimensional.
    """
    if ref_row is None:
        return None
    values = data_container[ref_row[0]][()]
    return values[0, 0] if len(values.shape) == 2 else None


def _word_feature_arrays(data_container, feature_tables, position, squeeze=False):
    """Read row ``position`` of every feature table; non-2-D entries become ``[]``."""
    arrays = []
    for table in feature_tables:
        values = data_container[table[position][0]][()]
        if len(values.shape) != 2:
            arrays.append([])
        else:
            arrays.append(np.squeeze(values) if squeeze else values)
    return arrays


def extract_word_level_data(data_container, word_objects, eeg_float_resolution = np.float16):
    """
    Extracts word level data for a specific sentence
    :param data_container:          (h5py)  Container of the whole data, h5py object
    :param word_objects:            (h5py)  Container of all word data for a specific sentence
    :param eeg_float_resolution:    (type)  Resolution with which to save EEG, used for data compression
    :return:
        word_level_data             (dict)  Per-word data dicts, keyed by the running index of the
                                            accepted (real) words in the sentence
        word_tokens_all             (list)  Strings of all accepted words
        word_tokens_has_fixation    (list)  Strings of the accepted words whose "nFix" is not None
        word_tokens_with_mask       (list)  Like word_tokens_all, but with '[MASK]' in place of
                                            words whose "nFix" is None
    :raises KeyError:       if 'content' is missing, or 'rawEEG' is present but one of the other
                            required fields (all but 'SFD') is missing
    :raises AssertionError: if content, rawET and rawEEG differ in length
    """
    word_level_data = {}
    word_tokens_all = []
    word_tokens_has_fixation = []
    word_tokens_with_mask = []

    available_objects = list(word_objects)
    # Nothing to extract from an empty container or from one that does not
    # iterate over field names.
    if not available_objects or not isinstance(available_objects[0], str):
        return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask

    contentData = word_objects['content']

    if "rawEEG" not in available_objects:
        # If there are no word-level data it will be word embeddings alone
        # NOTE(review): the three token lists stay empty on this path, unlike the
        # path below -- confirm that callers rely on this before changing it.
        word_idx = 0
        for word_obj in contentData:
            word_string = load_matlab_string(data_container[word_obj[0]])
            if not is_real_word(word_string):
                print(word_string + " is not a real word.")
                continue
            word_level_data[word_idx] = {
                "RAW_EEG": [],
                "ICA_EEG": [],
                "RAW_ET": [],
                "FFD": None,
                "GD": None,
                "GPT": None,
                "TRT": None,
                "SFD": None,
                "nFix": None,
                "ALPHA_EEG": [],
                "BETA_EEG": [],
                "GAMMA_EEG": [],
                "THETA_EEG": [],
                "word_idx": word_idx,
                "content": word_string,
            }
            word_idx += 1
        return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask

    rawData = word_objects['rawEEG']
    etData = word_objects['rawET']

    ffdData = word_objects['FFD']
    gdData = word_objects['GD']
    gptData = word_objects['GPT']
    trtData = word_objects['TRT']

    try:
        sfdData = word_objects['SFD']
    except KeyError:
        print("no SFD!")
        # One placeholder per word keeps the zip below aligned; an empty list
        # here would make zip() yield nothing and drop the whole sentence.
        sfdData = [None] * len(contentData)
    nFixData = word_objects['nFixations']
    fixPositions = word_objects["fixPositions"]

    Alpha_features_data = [word_objects[feature] for feature in Alpha_features]
    Beta_features_data = [word_objects[feature] for feature in Beta_features]
    Gamma_features_data = [word_objects[feature] for feature in Gamma_features]
    Theta_features_data = [word_objects[feature] for feature in Theta_features]

    GD_EEG_features = [word_objects[feature] for feature in ['GD_t1','GD_t2','GD_a1','GD_a2','GD_b1','GD_b2','GD_g1','GD_g2']]
    FFD_EEG_features = [word_objects[feature] for feature in ['FFD_t1','FFD_t2','FFD_a1','FFD_a2','FFD_b1','FFD_b2','FFD_g1','FFD_g2']]
    TRT_EEG_features = [word_objects[feature] for feature in ['TRT_t1','TRT_t2','TRT_a1','TRT_a2','TRT_b1','TRT_b2','TRT_g1','TRT_g2']]

    assert len(contentData) == len(etData) == len(rawData), "different amounts of different data!!"

    zipped_data = zip(rawData, etData, contentData, ffdData, gdData, gptData, trtData, sfdData, nFixData, fixPositions)

    # word_idx numbers only the accepted words (it is the key of the result dict).
    # `position` is the row of the current token; the band-feature tables are
    # assumed to be row-aligned with 'content' like the zipped fields, so they
    # must be indexed with `position` -- using word_idx would read another
    # token's features once any token has been skipped.
    word_idx = 0
    for position, (raw_eegs_obj, ets_obj, word_obj, ffd, gd, gpt, trt, sfd, nFix, _fix_pos) in enumerate(zipped_data):
        word_string = load_matlab_string(data_container[word_obj[0]])
        if not is_real_word(word_string):
            print(word_string + " is not a real word.")
            continue

        data_dict = {}
        data_dict["RAW_EEG"] = extract_all_fixations(data_container, raw_eegs_obj[0], eeg_float_resolution)
        data_dict["RAW_ET"] = extract_all_fixations(data_container, ets_obj[0], np.float32)

        data_dict["FFD"] = _scalar_or_none(data_container, ffd)
        data_dict["GD"] = _scalar_or_none(data_container, gd)
        data_dict["GPT"] = _scalar_or_none(data_container, gpt)
        data_dict["TRT"] = _scalar_or_none(data_container, trt)
        data_dict["SFD"] = _scalar_or_none(data_container, sfd)
        data_dict["nFix"] = _scalar_or_none(data_container, nFix)

        # Per band, the word's feature arrays are stacked along axis 0. Entries
        # that are not 2-D contribute an empty list; np.concatenate raises if
        # such placeholders are mixed with real 2-D arrays.
        data_dict["ALPHA_EEG"] = np.concatenate(
            _word_feature_arrays(data_container, Alpha_features_data, position), 0)
        data_dict["BETA_EEG"] = np.concatenate(
            _word_feature_arrays(data_container, Beta_features_data, position), 0)
        data_dict["GAMMA_EEG"] = np.concatenate(
            _word_feature_arrays(data_container, Gamma_features_data, position), 0)
        data_dict["THETA_EEG"] = np.concatenate(
            _word_feature_arrays(data_container, Theta_features_data, position), 0)

        data_dict["word_idx"] = word_idx
        data_dict["content"] = word_string

        word_tokens_all.append(word_string)
        if data_dict["nFix"] is not None:
            # Per-measure EEG features are only stored for words with a fixation count.
            data_dict["GD_EEG"] = _word_feature_arrays(data_container, GD_EEG_features, position, squeeze=True)
            data_dict["FFD_EEG"] = _word_feature_arrays(data_container, FFD_EEG_features, position, squeeze=True)
            data_dict["TRT_EEG"] = _word_feature_arrays(data_container, TRT_EEG_features, position, squeeze=True)
            word_tokens_has_fixation.append(word_string)
            word_tokens_with_mask.append(word_string)
        else:
            word_tokens_with_mask.append('[MASK]')

        word_level_data[word_idx] = data_dict
        word_idx += 1

    return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask