|
a |
|
b/util/data_loading_helpers_modified.py |
|
|
1 |
import numpy as np |
|
|
2 |
import re |
|
|
3 |
|
|
|
4 |
# Default dtype used to compress raw EEG fixation arrays on load.
eeg_float_resolution = np.float16


def _band_feature_names(measure, band):
    """Return the four feature-column names for one eye-tracking measure
    ('FFD', 'GD', 'GPT', 'SFD', 'TRT') and one frequency-band letter
    ('a' = alpha, 'b' = beta, 'g' = gamma, 't' = theta)."""
    stem = measure + '_' + band
    return [stem + '1', stem + '1_diff', stem + '2', stem + '2_diff']


Alpha_ffd_names = _band_feature_names('FFD', 'a')
Beta_ffd_names = _band_feature_names('FFD', 'b')
Gamma_ffd_names = _band_feature_names('FFD', 'g')
Theta_ffd_names = _band_feature_names('FFD', 't')
Alpha_gd_names = _band_feature_names('GD', 'a')
Beta_gd_names = _band_feature_names('GD', 'b')
Gamma_gd_names = _band_feature_names('GD', 'g')
Theta_gd_names = _band_feature_names('GD', 't')
Alpha_gpt_names = _band_feature_names('GPT', 'a')
Beta_gpt_names = _band_feature_names('GPT', 'b')
Gamma_gpt_names = _band_feature_names('GPT', 'g')
Theta_gpt_names = _band_feature_names('GPT', 't')
Alpha_sfd_names = _band_feature_names('SFD', 'a')
Beta_sfd_names = _band_feature_names('SFD', 'b')
Gamma_sfd_names = _band_feature_names('SFD', 'g')
Theta_sfd_names = _band_feature_names('SFD', 't')
Alpha_trt_names = _band_feature_names('TRT', 'a')
Beta_trt_names = _band_feature_names('TRT', 'b')
Gamma_trt_names = _band_feature_names('TRT', 'g')
Theta_trt_names = _band_feature_names('TRT', 't')

# IF YOU CHANGE THOSE YOU MUST ALSO CHANGE CONSTANTS
# SFD name lists are deliberately excluded from the aggregates below.
Alpha_features = Alpha_ffd_names + Alpha_gd_names + Alpha_gpt_names + Alpha_trt_names  # + Alpha_sfd_names
Beta_features = Beta_ffd_names + Beta_gd_names + Beta_gpt_names + Beta_trt_names  # + Beta_sfd_names
Gamma_features = Gamma_ffd_names + Gamma_gd_names + Gamma_gpt_names + Gamma_trt_names  # + Gamma_sfd_names
Theta_features = Theta_ffd_names + Theta_gd_names + Theta_gpt_names + Theta_trt_names  # + Theta_sfd_names
|
|
37 |
def extract_all_fixations(data_container, word_data_object, float_resolution = np.float16):
    """
    Collect every fixation referenced by a word data object.

    :param data_container: (h5py) Container of the whole data, h5py object;
        each row of the word table holds a reference resolved against it
    :param word_data_object: (h5py) Container of fixation objects, h5py object
    :param float_resolution: (type) dtype the fixation arrays are cast to,
        used for data compression
    :return:
        fixations_data (list) Data arrays representing each fixation; empty
        when the word table is not 2-D (no fixation rows present)
    """
    word_data = data_container[word_data_object]
    if len(word_data.shape) <= 1:
        # Flat/scalar entry: there are no per-fixation rows to extract.
        return []
    # Resolve each row's reference and cast to the requested resolution.
    return [
        np.array(data_container[ref_row[0]]).astype(float_resolution)
        for ref_row in word_data
    ]
|
|
52 |
|
|
|
53 |
|
|
|
54 |
def is_real_word(word):
    """
    Check if the word is a real word, i.e. contains at least one
    alphanumeric character (pure punctuation tokens are rejected).

    :param word: (str) word string
    :return:
        is_word (bool) True if it is a real word
    """
    # Fix: the docstring promises a bool, but re.search returns a
    # re.Match object (or None). Convert explicitly; truthiness seen by
    # callers is unchanged.
    return re.search('[a-zA-Z0-9]', word) is not None
|
|
63 |
|
|
|
64 |
|
|
|
65 |
def load_matlab_string(matlab_extracted_object):
    """
    Converts a string loaded from h5py into a python string.

    Each row of the object holds one character; row[0] is that
    character's code point.

    :param matlab_extracted_object: (h5py) matlab string object
    :return:
        extracted_string (str) translated string
    """
    characters = [chr(code_row[0]) for code_row in matlab_extracted_object]
    return ''.join(characters)
|
|
74 |
|
|
|
75 |
|
|
|
76 |
def extract_word_level_data(data_container, word_objects, eeg_float_resolution = np.float16):
    """
    Extracts word level data for a specific sentence.

    :param data_container: (h5py) Container of the whole data, h5py object
    :param word_objects: (h5py) Container of all word data for a specific sentence
    :param eeg_float_resolution: (type) Resolution with which to save EEG, used for data
        compression (shadows the module-level constant of the same name)
    :return: 4-tuple of
        word_level_data (dict) All word level data indexed by word position in the sentence
        word_tokens_all (list) Every real-word string of the sentence, in order
        word_tokens_has_fixation (list) Only the words that have at least one fixation
        word_tokens_with_mask (list) Like word_tokens_all, but words without fixations are
            replaced by the literal token '[MASK]'
    """
    available_objects = list(word_objects)
    #print(available_objects)
    #print(len(available_objects))
    # print('available_objects:', available_objects)

    # Word-level entries exist when the container keys are strings ('content', 'rawEEG', ...).
    if isinstance(available_objects[0], str):

        contentData = word_objects['content']
        #fixations_order_per_word = []
        if "rawEEG" in available_objects:
            # Full word-level recordings available: raw EEG, raw eye tracking and
            # the aggregate eye-tracking measures (FFD/GD/GPT/TRT/SFD).
            rawData = word_objects['rawEEG']
            etData = word_objects['rawET']

            ffdData = word_objects['FFD']
            gdData = word_objects['GD']
            gptData = word_objects['GPT']
            trtData = word_objects['TRT']

            try:
                sfdData = word_objects['SFD']
            except KeyError:
                print("no SFD!")
                sfdData = []
            nFixData = word_objects['nFixations']
            fixPositions = word_objects["fixPositions"]

            # Per-band frequency-domain feature columns (see module-level *_features lists).
            Alpha_features_data = [word_objects[feature] for feature in Alpha_features]
            Beta_features_data = [word_objects[feature] for feature in Beta_features]
            Gamma_features_data = [word_objects[feature] for feature in Gamma_features]
            Theta_features_data = [word_objects[feature] for feature in Theta_features]
            ####
            # Per-measure EEG features, ordered t1, t2, a1, a2, b1, b2, g1, g2.
            GD_EEG_features = [word_objects[feature] for feature in ['GD_t1','GD_t2','GD_a1','GD_a2','GD_b1','GD_b2','GD_g1','GD_g2']]
            FFD_EEG_features = [word_objects[feature] for feature in ['FFD_t1','FFD_t2','FFD_a1','FFD_a2','FFD_b1','FFD_b2','FFD_g1','FFD_g2']]
            TRT_EEG_features = [word_objects[feature] for feature in ['TRT_t1','TRT_t2','TRT_a1','TRT_a2','TRT_b1','TRT_b2','TRT_g1','TRT_g2']]
            ####
            assert len(contentData) == len(etData) == len(rawData), "different amounts of different data!!"

            # NOTE(review): zip() stops at the shortest input, so when 'SFD' was
            # missing above (sfdData == []) this zip is empty and NO words get
            # processed for the sentence -- confirm this is intended.
            zipped_data = zip(rawData, etData, contentData, ffdData, gdData, gptData, trtData, sfdData, nFixData, fixPositions)

            word_level_data = {}
            word_idx = 0

            word_tokens_has_fixation = []
            word_tokens_with_mask = []
            word_tokens_all = []
            for raw_eegs_obj, ets_obj, word_obj, ffd, gd, gpt, trt, sfd, nFix, fixPos in zipped_data:
                word_string = load_matlab_string(data_container[word_obj[0]])
                if is_real_word(word_string):
                    data_dict = {}
                    # Raw EEG compressed to eeg_float_resolution; eye tracking kept at float32.
                    data_dict["RAW_EEG"] = extract_all_fixations(data_container, raw_eegs_obj[0], eeg_float_resolution)
                    data_dict["RAW_ET"] = extract_all_fixations(data_container, ets_obj[0], np.float32)

                    # Scalar measures are stored as 2-D arrays; any other shape means "missing".
                    data_dict["FFD"] = data_container[ffd[0]][()][0, 0] if len(data_container[ffd[0]][()].shape) == 2 else None
                    data_dict["GD"] = data_container[gd[0]][()][0, 0] if len(data_container[gd[0]][()].shape) == 2 else None
                    data_dict["GPT"] = data_container[gpt[0]][()][0, 0] if len(data_container[gpt[0]][()].shape) == 2 else None
                    data_dict["TRT"] = data_container[trt[0]][()][0, 0] if len(data_container[trt[0]][()].shape) == 2 else None
                    data_dict["SFD"] = data_container[sfd[0]][()][0, 0] if len(data_container[sfd[0]][()].shape) == 2 else None
                    data_dict["nFix"] = data_container[nFix[0]][()][0, 0] if len(data_container[nFix[0]][()].shape) == 2 else None

                    #fixations_order_per_word.append(np.array(data_container[fixPos[0]]))

                    #print([data_container[obj[word_idx][0]][()] for obj in Alpha_features_data])

                    # Concatenate this word's per-band feature arrays along axis 0;
                    # missing (non-2-D) entries contribute an empty list.
                    data_dict["ALPHA_EEG"] = np.concatenate([data_container[obj[word_idx][0]][()]
                                                             if len(data_container[obj[word_idx][0]][()].shape) == 2 else []
                                                             for obj in Alpha_features_data], 0)

                    data_dict["BETA_EEG"] = np.concatenate([data_container[obj[word_idx][0]][()]
                                                            if len(data_container[obj[word_idx][0]][()].shape) == 2 else []
                                                            for obj in Beta_features_data], 0)

                    data_dict["GAMMA_EEG"] = np.concatenate([data_container[obj[word_idx][0]][()]
                                                             if len(data_container[obj[word_idx][0]][()].shape) == 2 else []
                                                             for obj in Gamma_features_data], 0)

                    data_dict["THETA_EEG"] = np.concatenate([data_container[obj[word_idx][0]][()]
                                                             if len(data_container[obj[word_idx][0]][()].shape) == 2 else []
                                                             for obj in Theta_features_data], 0)

                    data_dict["word_idx"] = word_idx
                    data_dict["content"] = word_string
                    ####################################
                    word_tokens_all.append(word_string)
                    # A present nFix value means the word was actually fixated.
                    if data_dict["nFix"] is not None:
                        ####################################
                        data_dict["GD_EEG"] = [np.squeeze(data_container[obj[word_idx][0]][()]) if len(data_container[obj[word_idx][0]][()].shape) == 2 else [] for obj in GD_EEG_features]
                        data_dict["FFD_EEG"] = [np.squeeze(data_container[obj[word_idx][0]][()]) if len(data_container[obj[word_idx][0]][()].shape) == 2 else [] for obj in FFD_EEG_features]
                        data_dict["TRT_EEG"] = [np.squeeze(data_container[obj[word_idx][0]][()]) if len(data_container[obj[word_idx][0]][()].shape) == 2 else [] for obj in TRT_EEG_features]
                        ####################################
                        word_tokens_has_fixation.append(word_string)
                        word_tokens_with_mask.append(word_string)
                    else:
                        word_tokens_with_mask.append('[MASK]')

                    word_level_data[word_idx] = data_dict
                    word_idx += 1
                else:
                    # Tokens without any alphanumeric character (pure punctuation) are skipped.
                    print(word_string + " is not a real word.")
        else:
            # If there are no word-level data it will be word embeddings alone:
            # placeholder entries only; the fixation/mask token lists stay empty.
            word_level_data = {}
            word_idx = 0
            word_tokens_has_fixation = []
            word_tokens_with_mask = []
            word_tokens_all = []

            for word_obj in contentData:
                word_string = load_matlab_string(data_container[word_obj[0]])
                if is_real_word(word_string):
                    data_dict = {}
                    data_dict["RAW_EEG"] = []
                    data_dict["ICA_EEG"] = []
                    data_dict["RAW_ET"] = []
                    data_dict["FFD"] = None
                    data_dict["GD"] = None
                    data_dict["GPT"] = None
                    data_dict["TRT"] = None
                    data_dict["SFD"] = None
                    data_dict["nFix"] = None
                    data_dict["ALPHA_EEG"] = []
                    data_dict["BETA_EEG"] = []
                    data_dict["GAMMA_EEG"] = []
                    data_dict["THETA_EEG"] = []

                    data_dict["word_idx"] = word_idx
                    data_dict["content"] = word_string
                    word_level_data[word_idx] = data_dict
                    word_idx += 1
                else:
                    print(word_string + " is not a real word.")

            # NOTE(review): 'sentence' is only used by the commented-out debug print below.
            sentence = " ".join([load_matlab_string(data_container[word_obj[0]]) for word_obj in word_objects['content']])
            #print("Only available objects for the sentence '{}' are {}.".format(sentence, available_objects))
            #word_level_data["word_reading_order"] = extract_word_order_from_fixations(fixations_order_per_word)
    else:
        # No string keys at all: nothing usable, return empty containers.
        word_tokens_has_fixation = []
        word_tokens_with_mask = []
        word_tokens_all = []
        word_level_data = {}
    return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask