|
a |
|
b/util/data_loading_helpers_modified.py |
|
|
1 |
import numpy as np |
|
|
2 |
import re |
|
|
3 |
|
|
|
4 |
# Default dtype used to compress raw EEG fixation arrays on load.
eeg_float_resolution = np.float16


def _band_feature_names(measure, band):
    """Return the four feature-column names for one eye-tracking measure
    ('FFD', 'GD', 'GPT', 'SFD', 'TRT') and one frequency-band letter
    ('a' = alpha, 'b' = beta, 'g' = gamma, 't' = theta)."""
    stem = measure + '_' + band
    return [stem + '1', stem + '1_diff', stem + '2', stem + '2_diff']


Alpha_ffd_names = _band_feature_names('FFD', 'a')
Beta_ffd_names = _band_feature_names('FFD', 'b')
Gamma_ffd_names = _band_feature_names('FFD', 'g')
Theta_ffd_names = _band_feature_names('FFD', 't')
Alpha_gd_names = _band_feature_names('GD', 'a')
Beta_gd_names = _band_feature_names('GD', 'b')
Gamma_gd_names = _band_feature_names('GD', 'g')
Theta_gd_names = _band_feature_names('GD', 't')
Alpha_gpt_names = _band_feature_names('GPT', 'a')
Beta_gpt_names = _band_feature_names('GPT', 'b')
Gamma_gpt_names = _band_feature_names('GPT', 'g')
Theta_gpt_names = _band_feature_names('GPT', 't')
Alpha_sfd_names = _band_feature_names('SFD', 'a')
Beta_sfd_names = _band_feature_names('SFD', 'b')
Gamma_sfd_names = _band_feature_names('SFD', 'g')
Theta_sfd_names = _band_feature_names('SFD', 't')
Alpha_trt_names = _band_feature_names('TRT', 'a')
Beta_trt_names = _band_feature_names('TRT', 'b')
Gamma_trt_names = _band_feature_names('TRT', 'g')
Theta_trt_names = _band_feature_names('TRT', 't')

# IF YOU CHANGE THOSE YOU MUST ALSO CHANGE CONSTANTS
# SFD name lists are deliberately excluded from the aggregates below.
Alpha_features = Alpha_ffd_names + Alpha_gd_names + Alpha_gpt_names + Alpha_trt_names  # + Alpha_sfd_names
Beta_features = Beta_ffd_names + Beta_gd_names + Beta_gpt_names + Beta_trt_names  # + Beta_sfd_names
Gamma_features = Gamma_ffd_names + Gamma_gd_names + Gamma_gpt_names + Gamma_trt_names  # + Gamma_sfd_names
Theta_features = Theta_ffd_names + Theta_gd_names + Theta_gpt_names + Theta_trt_names  # + Theta_sfd_names
|
|
37 |
def extract_all_fixations(data_container, word_data_object, float_resolution = np.float16):
    """
    Collect every fixation referenced by a word data object.

    :param data_container: (h5py) Container of the whole data, h5py object;
        each row of the word table holds a reference resolved against it
    :param word_data_object: (h5py) Container of fixation objects, h5py object
    :param float_resolution: (type) dtype the fixation arrays are cast to,
        used for data compression
    :return:
        fixations_data (list) Data arrays representing each fixation; empty
        when the word table is not 2-D (no fixation rows present)
    """
    word_data = data_container[word_data_object]
    if len(word_data.shape) <= 1:
        # Flat/scalar entry: there are no per-fixation rows to extract.
        return []
    # Resolve each row's reference and cast to the requested resolution.
    return [
        np.array(data_container[ref_row[0]]).astype(float_resolution)
        for ref_row in word_data
    ]
|
|
52 |
|
|
|
53 |
|
|
|
54 |
def is_real_word(word):
    """
    Check if the word is a real word, i.e. contains at least one
    alphanumeric character (pure punctuation tokens are rejected).

    :param word: (str) word string
    :return:
        is_word (bool) True if it is a real word
    """
    # Fix: the docstring promises a bool, but re.search returns a
    # re.Match object (or None). Convert explicitly; truthiness seen by
    # callers is unchanged.
    return re.search('[a-zA-Z0-9]', word) is not None
|
|
63 |
|
|
|
64 |
|
|
|
65 |
def load_matlab_string(matlab_extracted_object):
    """
    Converts a string loaded from h5py into a python string.

    Each row of the object holds one character; row[0] is that
    character's code point.

    :param matlab_extracted_object: (h5py) matlab string object
    :return:
        extracted_string (str) translated string
    """
    characters = [chr(code_row[0]) for code_row in matlab_extracted_object]
    return ''.join(characters)
|
|
74 |
|
|
|
75 |
|
|
|
76 |
def extract_word_level_data(data_container, word_objects, eeg_float_resolution = np.float16):
    """
    Extracts word level data for a specific sentence.

    :param data_container: (h5py) Container of the whole data, h5py object
    :param word_objects: (h5py) Container of all word data for a specific sentence
    :param eeg_float_resolution: (type) Resolution with which to save EEG, used for data
        compression (shadows the module-level constant of the same name)
    :return: 4-tuple of
        word_level_data (dict) All word level data indexed by word position in the sentence
        word_tokens_all (list) Every real-word string of the sentence, in order
        word_tokens_has_fixation (list) Only the words that have at least one fixation
        word_tokens_with_mask (list) Like word_tokens_all, but words without fixations are
            replaced by the literal token '[MASK]'
    """
    available_objects = list(word_objects)
    #print(available_objects)
    #print(len(available_objects))
    # print('available_objects:', available_objects)

    # Word-level entries exist when the container keys are strings ('content', 'rawEEG', ...).
    if isinstance(available_objects[0], str):

        contentData = word_objects['content']
        #fixations_order_per_word = []
        if "rawEEG" in available_objects:
            # Full word-level recordings available: raw EEG, raw eye tracking and
            # the aggregate eye-tracking measures (FFD/GD/GPT/TRT/SFD).
            rawData = word_objects['rawEEG']
            etData = word_objects['rawET']

            ffdData = word_objects['FFD']
            gdData = word_objects['GD']
            gptData = word_objects['GPT']
            trtData = word_objects['TRT']

            try:
                sfdData = word_objects['SFD']
            except KeyError:
                print("no SFD!")
                sfdData = []
            nFixData = word_objects['nFixations']
            fixPositions = word_objects["fixPositions"]

            # Per-band frequency-domain feature columns (see module-level *_features lists).
            Alpha_features_data = [word_objects[feature] for feature in Alpha_features]
            Beta_features_data = [word_objects[feature] for feature in Beta_features]
            Gamma_features_data = [word_objects[feature] for feature in Gamma_features]
            Theta_features_data = [word_objects[feature] for feature in Theta_features]
            ####
            # Per-measure EEG features, ordered t1, t2, a1, a2, b1, b2, g1, g2.
            GD_EEG_features = [word_objects[feature] for feature in ['GD_t1','GD_t2','GD_a1','GD_a2','GD_b1','GD_b2','GD_g1','GD_g2']]
            FFD_EEG_features = [word_objects[feature] for feature in ['FFD_t1','FFD_t2','FFD_a1','FFD_a2','FFD_b1','FFD_b2','FFD_g1','FFD_g2']]
            TRT_EEG_features = [word_objects[feature] for feature in ['TRT_t1','TRT_t2','TRT_a1','TRT_a2','TRT_b1','TRT_b2','TRT_g1','TRT_g2']]
            ####
            assert len(contentData) == len(etData) == len(rawData), "different amounts of different data!!"

            # NOTE(review): zip() stops at the shortest input, so when 'SFD' was
            # missing above (sfdData == []) this zip is empty and NO words get
            # processed for the sentence -- confirm this is intended.
            zipped_data = zip(rawData, etData, contentData, ffdData, gdData, gptData, trtData, sfdData, nFixData, fixPositions)

            word_level_data = {}
            word_idx = 0

            word_tokens_has_fixation = []
            word_tokens_with_mask = []
            word_tokens_all = []
            for raw_eegs_obj, ets_obj, word_obj, ffd, gd, gpt, trt, sfd, nFix, fixPos in zipped_data:
                word_string = load_matlab_string(data_container[word_obj[0]])
                if is_real_word(word_string):
                    data_dict = {}
                    # Raw EEG compressed to eeg_float_resolution; eye tracking kept at float32.
                    data_dict["RAW_EEG"] = extract_all_fixations(data_container, raw_eegs_obj[0], eeg_float_resolution)
                    data_dict["RAW_ET"] = extract_all_fixations(data_container, ets_obj[0], np.float32)

                    # Scalar measures are stored as 2-D arrays; any other shape means "missing".
                    data_dict["FFD"] = data_container[ffd[0]][()][0, 0] if len(data_container[ffd[0]][()].shape) == 2 else None
                    data_dict["GD"] = data_container[gd[0]][()][0, 0] if len(data_container[gd[0]][()].shape) == 2 else None
                    data_dict["GPT"] = data_container[gpt[0]][()][0, 0] if len(data_container[gpt[0]][()].shape) == 2 else None
                    data_dict["TRT"] = data_container[trt[0]][()][0, 0] if len(data_container[trt[0]][()].shape) == 2 else None
                    data_dict["SFD"] = data_container[sfd[0]][()][0, 0] if len(data_container[sfd[0]][()].shape) == 2 else None
                    data_dict["nFix"] = data_container[nFix[0]][()][0, 0] if len(data_container[nFix[0]][()].shape) == 2 else None

                    #fixations_order_per_word.append(np.array(data_container[fixPos[0]]))

                    #print([data_container[obj[word_idx][0]][()] for obj in Alpha_features_data])

                    # Concatenate this word's per-band feature arrays along axis 0;
                    # missing (non-2-D) entries contribute an empty list.
                    data_dict["ALPHA_EEG"] = np.concatenate([data_container[obj[word_idx][0]][()]
                                                             if len(data_container[obj[word_idx][0]][()].shape) == 2 else []
                                                             for obj in Alpha_features_data], 0)

                    data_dict["BETA_EEG"] = np.concatenate([data_container[obj[word_idx][0]][()]
                                                            if len(data_container[obj[word_idx][0]][()].shape) == 2 else []
                                                            for obj in Beta_features_data], 0)

                    data_dict["GAMMA_EEG"] = np.concatenate([data_container[obj[word_idx][0]][()]
                                                             if len(data_container[obj[word_idx][0]][()].shape) == 2 else []
                                                             for obj in Gamma_features_data], 0)

                    data_dict["THETA_EEG"] = np.concatenate([data_container[obj[word_idx][0]][()]
                                                             if len(data_container[obj[word_idx][0]][()].shape) == 2 else []
                                                             for obj in Theta_features_data], 0)

                    data_dict["word_idx"] = word_idx
                    data_dict["content"] = word_string
                    ####################################
                    word_tokens_all.append(word_string)
                    # A present nFix value means the word was actually fixated.
                    if data_dict["nFix"] is not None:
                        ####################################
                        data_dict["GD_EEG"] = [np.squeeze(data_container[obj[word_idx][0]][()]) if len(data_container[obj[word_idx][0]][()].shape) == 2 else [] for obj in GD_EEG_features]
                        data_dict["FFD_EEG"] = [np.squeeze(data_container[obj[word_idx][0]][()]) if len(data_container[obj[word_idx][0]][()].shape) == 2 else [] for obj in FFD_EEG_features]
                        data_dict["TRT_EEG"] = [np.squeeze(data_container[obj[word_idx][0]][()]) if len(data_container[obj[word_idx][0]][()].shape) == 2 else [] for obj in TRT_EEG_features]
                        ####################################
                        word_tokens_has_fixation.append(word_string)
                        word_tokens_with_mask.append(word_string)
                    else:
                        word_tokens_with_mask.append('[MASK]')

                    word_level_data[word_idx] = data_dict
                    word_idx += 1
                else:
                    # Tokens without any alphanumeric character (pure punctuation) are skipped.
                    print(word_string + " is not a real word.")
        else:
            # If there are no word-level data it will be word embeddings alone:
            # placeholder entries only; the fixation/mask token lists stay empty.
            word_level_data = {}
            word_idx = 0
            word_tokens_has_fixation = []
            word_tokens_with_mask = []
            word_tokens_all = []

            for word_obj in contentData:
                word_string = load_matlab_string(data_container[word_obj[0]])
                if is_real_word(word_string):
                    data_dict = {}
                    data_dict["RAW_EEG"] = []
                    data_dict["ICA_EEG"] = []
                    data_dict["RAW_ET"] = []
                    data_dict["FFD"] = None
                    data_dict["GD"] = None
                    data_dict["GPT"] = None
                    data_dict["TRT"] = None
                    data_dict["SFD"] = None
                    data_dict["nFix"] = None
                    data_dict["ALPHA_EEG"] = []
                    data_dict["BETA_EEG"] = []
                    data_dict["GAMMA_EEG"] = []
                    data_dict["THETA_EEG"] = []

                    data_dict["word_idx"] = word_idx
                    data_dict["content"] = word_string
                    word_level_data[word_idx] = data_dict
                    word_idx += 1
                else:
                    print(word_string + " is not a real word.")

            # NOTE(review): 'sentence' is only used by the commented-out debug print below.
            sentence = " ".join([load_matlab_string(data_container[word_obj[0]]) for word_obj in word_objects['content']])
            #print("Only available objects for the sentence '{}' are {}.".format(sentence, available_objects))
            #word_level_data["word_reading_order"] = extract_word_order_from_fixations(fixations_order_per_word)
    else:
        # No string keys at all: nothing usable, return empty containers.
        word_tokens_has_fixation = []
        word_tokens_with_mask = []
        word_tokens_all = []
        word_level_data = {}
    return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask