[66af30]: / util / data_loading_helpers_modified.py

Download this file

231 lines (193 with data), 12.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import numpy as np
import re
# Float type used for EEG data (half precision).
eeg_float_resolution = np.float16


def _band_feature_names(measure, band):
    """
    Builds the four feature names of one eye-tracking measure in one frequency band

    :param measure: (str) Measure prefix, e.g. 'FFD', 'GD', 'GPT', 'SFD' or 'TRT'
    :param band:    (str) Band letter: 'a' (alpha), 'b' (beta), 'g' (gamma) or 't' (theta)
    :return:
        names   (list)  e.g. ['FFD_a1', 'FFD_a1_diff', 'FFD_a2', 'FFD_a2_diff']
    """
    return [measure + '_' + band + suffix for suffix in ('1', '1_diff', '2', '2_diff')]


# Per-measure feature names, one list per frequency band.
Alpha_ffd_names = _band_feature_names('FFD', 'a')
Beta_ffd_names = _band_feature_names('FFD', 'b')
Gamma_ffd_names = _band_feature_names('FFD', 'g')
Theta_ffd_names = _band_feature_names('FFD', 't')

Alpha_gd_names = _band_feature_names('GD', 'a')
Beta_gd_names = _band_feature_names('GD', 'b')
Gamma_gd_names = _band_feature_names('GD', 'g')
Theta_gd_names = _band_feature_names('GD', 't')

Alpha_gpt_names = _band_feature_names('GPT', 'a')
Beta_gpt_names = _band_feature_names('GPT', 'b')
Gamma_gpt_names = _band_feature_names('GPT', 'g')
Theta_gpt_names = _band_feature_names('GPT', 't')

Alpha_sfd_names = _band_feature_names('SFD', 'a')
Beta_sfd_names = _band_feature_names('SFD', 'b')
Gamma_sfd_names = _band_feature_names('SFD', 'g')
Theta_sfd_names = _band_feature_names('SFD', 't')

Alpha_trt_names = _band_feature_names('TRT', 'a')
Beta_trt_names = _band_feature_names('TRT', 'b')
Gamma_trt_names = _band_feature_names('TRT', 'g')
Theta_trt_names = _band_feature_names('TRT', 't')

# IF YOU CHANGE THOSE YOU MUST ALSO CHANGE CONSTANTS
# The SFD name lists above are defined but left out of the combined lists.
Alpha_features = Alpha_ffd_names + Alpha_gd_names + Alpha_gpt_names + Alpha_trt_names
Beta_features = Beta_ffd_names + Beta_gd_names + Beta_gpt_names + Beta_trt_names
Gamma_features = Gamma_ffd_names + Gamma_gd_names + Gamma_gpt_names + Gamma_trt_names
Theta_features = Theta_ffd_names + Theta_gd_names + Theta_gpt_names + Theta_trt_names
# print(Alpha_features)
# GD_EEG_features
def extract_all_fixations(data_container, word_data_object, float_resolution = np.float16):
    """
    Collects the data array of every fixation referenced by a word data object

    :param data_container:      (h5py)  Container of the whole data, h5py object
    :param word_data_object:    (h5py)  Key into data_container that yields the fixation references of a word
    :param float_resolution:    (type)  Type to which each fixation array is cast, used for data compression

    :return:
        fixations_data  (list)  One converted array per fixation; empty when the
                                referenced entry has at most one dimension
    """
    fixation_refs = data_container[word_data_object]
    # An entry with a single dimension carries no fixation references.
    if len(fixation_refs.shape) <= 1:
        return []
    return [
        np.array(data_container[fixation_refs[row][0]]).astype(float_resolution)
        for row in range(fixation_refs.shape[0])
    ]
def is_real_word(word):
    """
    Tells whether a token contains at least one ASCII letter or digit

    :param word:    (str)   word string

    :return:
        is_word (re.Match or None)  Match of the first letter/digit found (truthy),
                                    or None when the token has none (falsy)
    """
    alphanumeric_pattern = '[a-zA-Z0-9]'
    return re.search(alphanumeric_pattern, word)
def load_matlab_string(matlab_extracted_object):
    """
    Turns a MATLAB string loaded through h5py into a python string

    :param matlab_extracted_object:     (h5py)  matlab string object; an iterable of rows whose
                                                first element is a character code

    :return:
        extracted_string    (str)   translated string
    """
    characters = []
    for code_row in matlab_extracted_object:
        # Only the first element of each row is used as the character code.
        characters.append(chr(code_row[0]))
    return u''.join(characters)
def _read_scalar_feature(data_container, reference_row):
    """Return the scalar a word-level reference points to, or None when there is no value."""
    if reference_row is None:
        return None
    values = data_container[reference_row[0]][()]
    # Only 2-D entries carry a value; any other shape is treated as "missing".
    return values[0, 0] if len(values.shape) == 2 else None


def _read_band_arrays(data_container, feature_datasets, position, squeeze=False):
    """Read, for the word at ``position``, one array per feature dataset ([] where missing)."""
    arrays = []
    for feature_dataset in feature_datasets:
        values = data_container[feature_dataset[position][0]][()]
        if len(values.shape) == 2:
            arrays.append(np.squeeze(values) if squeeze else values)
        else:
            arrays.append([])
    return arrays


def extract_word_level_data(data_container, word_objects, eeg_float_resolution = np.float16):
    """
    Extracts word level data for a specific sentence

    Tokens for which is_real_word() is falsy are skipped (a message is printed) and
    do not consume an index in the returned dictionary.

    :param data_container:          (h5py)  Container of the whole data, h5py object
    :param word_objects:            (h5py)  Container of all word data for a specific sentence
    :param eeg_float_resolution:    (type)  Resolution with which to save EEG, used for data compression

    :return:
        word_level_data             (dict)  Per-word data dictionaries, keyed by the running index of
                                            the real words in the sentence (0, 1, 2, ...)
        word_tokens_all             (list)  Strings of all real words
        word_tokens_has_fixation    (list)  Strings of the real words whose "nFix" value is not None
        word_tokens_with_mask       (list)  Like word_tokens_all, with '[MASK]' in place of every word
                                            whose "nFix" value is None

        The three token lists are only filled when "rawEEG" is available; otherwise they are empty.
        Everything is empty when word_objects is empty or its entries are not named (not strings).
    """
    word_level_data = {}
    word_tokens_all = []
    word_tokens_has_fixation = []
    word_tokens_with_mask = []

    available_objects = list(word_objects)
    # Guard against an empty container before looking at its first entry.
    if not available_objects or not isinstance(available_objects[0], str):
        return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask

    contentData = word_objects['content']

    if "rawEEG" not in available_objects:
        # If there are no word-level data it will be word embeddings alone
        word_idx = 0
        for word_obj in contentData:
            word_string = load_matlab_string(data_container[word_obj[0]])
            if not is_real_word(word_string):
                print(word_string + " is not a real word.")
                continue
            data_dict = {}
            data_dict["RAW_EEG"] = []
            data_dict["ICA_EEG"] = []
            data_dict["RAW_ET"] = []
            data_dict["FFD"] = None
            data_dict["GD"] = None
            data_dict["GPT"] = None
            data_dict["TRT"] = None
            data_dict["SFD"] = None
            data_dict["nFix"] = None
            data_dict["ALPHA_EEG"] = []
            data_dict["BETA_EEG"] = []
            data_dict["GAMMA_EEG"] = []
            data_dict["THETA_EEG"] = []
            data_dict["word_idx"] = word_idx
            data_dict["content"] = word_string
            word_level_data[word_idx] = data_dict
            word_idx += 1
        return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask

    rawData = word_objects['rawEEG']
    etData = word_objects['rawET']
    ffdData = word_objects['FFD']
    gdData = word_objects['GD']
    gptData = word_objects['GPT']
    trtData = word_objects['TRT']
    try:
        sfdData = word_objects['SFD']
    except KeyError:
        print("no SFD!")
        # zip() stops at its shortest input, so an empty placeholder would silently
        # drop every word of the sentence; use one None per word instead.
        sfdData = [None] * len(contentData)
    nFixData = word_objects['nFixations']
    fixPositions = word_objects["fixPositions"]

    Alpha_features_data = [word_objects[feature] for feature in Alpha_features]
    Beta_features_data = [word_objects[feature] for feature in Beta_features]
    Gamma_features_data = [word_objects[feature] for feature in Gamma_features]
    Theta_features_data = [word_objects[feature] for feature in Theta_features]

    GD_EEG_features = [word_objects[feature] for feature in ['GD_t1','GD_t2','GD_a1','GD_a2','GD_b1','GD_b2','GD_g1','GD_g2']]
    FFD_EEG_features = [word_objects[feature] for feature in ['FFD_t1','FFD_t2','FFD_a1','FFD_a2','FFD_b1','FFD_b2','FFD_g1','FFD_g2']]
    TRT_EEG_features = [word_objects[feature] for feature in ['TRT_t1','TRT_t2','TRT_a1','TRT_a2','TRT_b1','TRT_b2','TRT_g1','TRT_g2']]

    assert len(contentData) == len(etData) == len(rawData), "different amounts of different data!!"

    zipped_data = zip(rawData, etData, contentData, ffdData, gdData, gptData, trtData, sfdData, nFixData, fixPositions)

    # word_idx counts only the real words and is the key of the output dictionary;
    # position is the index of the token inside the sentence and must be used to
    # address the per-word feature datasets, so that skipped tokens do not shift
    # the frequency-band features onto the wrong word.
    word_idx = 0
    for position, word_fields in enumerate(zipped_data):
        raw_eegs_obj, ets_obj, word_obj, ffd, gd, gpt, trt, sfd, nFix, _fix_pos = word_fields
        word_string = load_matlab_string(data_container[word_obj[0]])
        if not is_real_word(word_string):
            print(word_string + " is not a real word.")
            continue

        data_dict = {}
        data_dict["RAW_EEG"] = extract_all_fixations(data_container, raw_eegs_obj[0], eeg_float_resolution)
        data_dict["RAW_ET"] = extract_all_fixations(data_container, ets_obj[0], np.float32)
        data_dict["FFD"] = _read_scalar_feature(data_container, ffd)
        data_dict["GD"] = _read_scalar_feature(data_container, gd)
        data_dict["GPT"] = _read_scalar_feature(data_container, gpt)
        data_dict["TRT"] = _read_scalar_feature(data_container, trt)
        data_dict["SFD"] = _read_scalar_feature(data_container, sfd)
        data_dict["nFix"] = _read_scalar_feature(data_container, nFix)
        data_dict["ALPHA_EEG"] = np.concatenate(_read_band_arrays(data_container, Alpha_features_data, position), 0)
        data_dict["BETA_EEG"] = np.concatenate(_read_band_arrays(data_container, Beta_features_data, position), 0)
        data_dict["GAMMA_EEG"] = np.concatenate(_read_band_arrays(data_container, Gamma_features_data, position), 0)
        data_dict["THETA_EEG"] = np.concatenate(_read_band_arrays(data_container, Theta_features_data, position), 0)
        data_dict["word_idx"] = word_idx
        data_dict["content"] = word_string

        word_tokens_all.append(word_string)
        if data_dict["nFix"] is not None:
            data_dict["GD_EEG"] = _read_band_arrays(data_container, GD_EEG_features, position, squeeze=True)
            data_dict["FFD_EEG"] = _read_band_arrays(data_container, FFD_EEG_features, position, squeeze=True)
            data_dict["TRT_EEG"] = _read_band_arrays(data_container, TRT_EEG_features, position, squeeze=True)
            word_tokens_has_fixation.append(word_string)
            word_tokens_with_mask.append(word_string)
        else:
            # Words without an nFix value are masked out in the masked token sequence.
            word_tokens_with_mask.append('[MASK]')

        word_level_data[word_idx] = data_dict
        word_idx += 1

    return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask