EEG-To-Text / Git / [66af30] /util/data_loading_helpers

Models:
ReneeD/
EEG-To-Text
Downloads: 1
[66af30]: / util / data_loading_helpers_modified.py
History
Download this file
231 lines (193 with data), 12.4 kB

import numpy as np
import re

# Default float type used when storing EEG data.
eeg_float_resolution = np.float16


def _band_feature_names(measure, band):
    """
    Builds the four feature names of one measure for one frequency band letter
    :param measure: (str)   Measure prefix, e.g. 'FFD'
    :param band:    (str)   Band letter, e.g. 'a'
    :return:
        names   (list)  ['<measure>_<band>1', '<measure>_<band>1_diff', '<measure>_<band>2', '<measure>_<band>2_diff']
    """
    return ['{}_{}{}{}'.format(measure, band, sub_band, suffix)
            for sub_band in (1, 2)
            for suffix in ('', '_diff')]


# Feature names per measure (FFD, GD, GPT, SFD, TRT) and band letter (a, b, g, t).
Alpha_ffd_names = _band_feature_names('FFD', 'a')
Beta_ffd_names = _band_feature_names('FFD', 'b')
Gamma_ffd_names = _band_feature_names('FFD', 'g')
Theta_ffd_names = _band_feature_names('FFD', 't')
Alpha_gd_names = _band_feature_names('GD', 'a')
Beta_gd_names = _band_feature_names('GD', 'b')
Gamma_gd_names = _band_feature_names('GD', 'g')
Theta_gd_names = _band_feature_names('GD', 't')
Alpha_gpt_names = _band_feature_names('GPT', 'a')
Beta_gpt_names = _band_feature_names('GPT', 'b')
Gamma_gpt_names = _band_feature_names('GPT', 'g')
Theta_gpt_names = _band_feature_names('GPT', 't')
Alpha_sfd_names = _band_feature_names('SFD', 'a')
Beta_sfd_names = _band_feature_names('SFD', 'b')
Gamma_sfd_names = _band_feature_names('SFD', 'g')
Theta_sfd_names = _band_feature_names('SFD', 't')
Alpha_trt_names = _band_feature_names('TRT', 'a')
Beta_trt_names = _band_feature_names('TRT', 'b')
Gamma_trt_names = _band_feature_names('TRT', 'g')
Theta_trt_names = _band_feature_names('TRT', 't')

# IF YOU CHANGE THOSE YOU MUST ALSO CHANGE CONSTANTS
# The SFD name lists are currently not part of the combined feature lists.
Alpha_features = Alpha_ffd_names + Alpha_gd_names + Alpha_gpt_names + Alpha_trt_names
Beta_features = Beta_ffd_names + Beta_gd_names + Beta_gpt_names + Beta_trt_names
Gamma_features = Gamma_ffd_names + Gamma_gd_names + Gamma_gpt_names + Gamma_trt_names
Theta_features = Theta_ffd_names + Theta_gd_names + Theta_gpt_names + Theta_trt_names
# print(Alpha_features)

# GD_EEG_features


def extract_all_fixations(data_container, word_data_object, float_resolution = np.float16):
    """
    Collects the data array of every fixation referenced by a word data object
    :param data_container:      (h5py)  Container of the whole data, h5py object
    :param word_data_object:    (h5py)  Key/reference of the fixation container inside data_container
    :param float_resolution:    (type)  Type the fixation arrays are cast to, used for data compression
    :return:
        fixations_data  (list)  One array per fixation; empty if the container is not at least 2-dimensional
    """
    fixation_refs = data_container[word_data_object]

    # A container with fewer than two dimensions is treated as holding no fixations.
    if len(fixation_refs.shape) <= 1:
        return []

    n_fixations = fixation_refs.shape[0]
    return [
        np.array(data_container[fixation_refs[row][0]]).astype(float_resolution)
        for row in range(n_fixations)
    ]


def is_real_word(word):
    """
    Check if the word is a real word, i.e. contains at least one ASCII letter or digit
    :param word:    (str)   word string
    :return:
        is_word (bool)  True if it is a real word, False otherwise (including the empty string)
    """
    # re.search returns a Match object or None; convert so the documented bool is what callers get.
    is_word = bool(re.search(r'[a-zA-Z0-9]', word))
    return is_word


def load_matlab_string(matlab_extracted_object):
    """
    Builds a python string from a matlab string object loaded via h5py
    :param matlab_extracted_object:     (h5py)  matlab string object; each entry holds a character code at index 0
    :return:
        extracted_string    (str)   translated string
    """
    characters = []
    for code_row in matlab_extracted_object:
        # Only the first element of each entry is used as the character code.
        characters.append(chr(code_row[0]))
    extracted_string = u''.join(characters)
    return extracted_string


def _read_2d_dataset(data_container, ref):
    """
    Reads the dataset behind a reference once and keeps it only if it is 2-dimensional
    :param data_container:  (h5py)  Container of the whole data, h5py object
    :param ref:             (h5py)  Reference/key of the dataset inside data_container
    :return:
        values  (array or None)  Full content of the dataset, or None if it is not 2-dimensional
    """
    values = data_container[ref][()]
    return values if len(values.shape) == 2 else None


def _read_scalar_measure(data_container, ref):
    """
    Reads element [0, 0] of a 2-dimensional dataset
    :param data_container:  (h5py)  Container of the whole data, h5py object
    :param ref:             (h5py)  Reference/key of the dataset inside data_container
    :return:
        value   (scalar or None)  Element [0, 0], or None if the dataset is not 2-dimensional
    """
    values = _read_2d_dataset(data_container, ref)
    return None if values is None else values[0, 0]


def _concat_band_features(data_container, feature_datasets, position):
    """
    Concatenates (along axis 0) the features of one word over a list of feature datasets
    :param data_container:      (h5py)  Container of the whole data, h5py object
    :param feature_datasets:    (list)  Per-feature datasets holding one reference per word
    :param position:            (int)   Position of the word inside the sentence
    :return:
        features    (array)  Concatenated features; features that are not 2-dimensional contribute []
    """
    per_feature = [_read_2d_dataset(data_container, dataset[position][0]) for dataset in feature_datasets]
    return np.concatenate([[] if values is None else values for values in per_feature], 0)


def _squeezed_features(data_container, feature_datasets, position):
    """
    Reads the features of one word over a list of feature datasets, each squeezed with np.squeeze
    :param data_container:      (h5py)  Container of the whole data, h5py object
    :param feature_datasets:    (list)  Per-feature datasets holding one reference per word
    :param position:            (int)   Position of the word inside the sentence
    :return:
        features    (list)  One squeezed array per feature; [] for features that are not 2-dimensional
    """
    per_feature = [_read_2d_dataset(data_container, dataset[position][0]) for dataset in feature_datasets]
    return [[] if values is None else np.squeeze(values) for values in per_feature]


def extract_word_level_data(data_container, word_objects, eeg_float_resolution = np.float16):
    """
    Extracts word level data for a specific sentence
    :param data_container:          (h5py)  Container of the whole data, h5py object
    :param word_objects:            (h5py)  Container of all word data for a specific sentence
    :param eeg_float_resolution:    (type)  Resolution with which to save EEG, used for data compression
    :return:
        word_level_data             (dict)  Data dict of every real word, keyed by the running index of real words
        word_tokens_all             (list)  Strings of all real words
        word_tokens_has_fixation    (list)  Strings of the real words whose "nFix" value is not None
        word_tokens_with_mask       (list)  Like word_tokens_all, with '[MASK]' in place of words whose "nFix" is None
    The three token lists are only filled when word_objects has a "rawEEG" field; otherwise they stay empty.
    All four results are empty when word_objects is empty or its entries are not strings.
    """
    word_level_data = {}
    word_tokens_all = []
    word_tokens_has_fixation = []
    word_tokens_with_mask = []

    available_objects = list(word_objects)

    # Nothing to extract if there are no named fields (also guards the [0] access on an empty container).
    if not available_objects or not isinstance(available_objects[0], str):
        return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask

    contentData = word_objects['content']
    word_idx = 0

    if "rawEEG" not in available_objects:
        # If there are no word-level data it will be word embeddings alone.
        # NOTE(review): the token lists are intentionally left empty here, as before;
        # callers may rely on that to detect sentences without word-level features - confirm before changing.
        for word_obj in contentData:
            word_string = load_matlab_string(data_container[word_obj[0]])
            if not is_real_word(word_string):
                print(word_string + " is not a real word.")
                continue

            word_level_data[word_idx] = {
                "RAW_EEG": [],
                "ICA_EEG": [],
                "RAW_ET": [],
                "FFD": None,
                "GD": None,
                "GPT": None,
                "TRT": None,
                "SFD": None,
                "nFix": None,
                "ALPHA_EEG": [],
                "BETA_EEG": [],
                "GAMMA_EEG": [],
                "THETA_EEG": [],
                "word_idx": word_idx,
                "content": word_string,
            }
            word_idx += 1
        return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask

    rawData = word_objects['rawEEG']
    etData = word_objects['rawET']

    ffdData = word_objects['FFD']
    gdData = word_objects['GD']
    gptData = word_objects['GPT']
    trtData = word_objects['TRT']

    try:
        sfdData = word_objects['SFD']
    except KeyError:
        print("no SFD!")
        sfdData = None
    nFixData = word_objects['nFixations']

    Alpha_features_data = [word_objects[feature] for feature in Alpha_features]
    Beta_features_data = [word_objects[feature] for feature in Beta_features]
    Gamma_features_data = [word_objects[feature] for feature in Gamma_features]
    Theta_features_data = [word_objects[feature] for feature in Theta_features]

    sub_bands = ('t1', 't2', 'a1', 'a2', 'b1', 'b2', 'g1', 'g2')
    GD_EEG_features = [word_objects['GD_' + sub_band] for sub_band in sub_bands]
    FFD_EEG_features = [word_objects['FFD_' + sub_band] for sub_band in sub_bands]
    TRT_EEG_features = [word_objects['TRT_' + sub_band] for sub_band in sub_bands]

    assert len(contentData) == len(etData) == len(rawData), "different amounts of different data!!"

    # zip() stops at its shortest input, so a missing SFD must be represented by one placeholder
    # per word; an empty list here would make the loop below skip every word of the sentence.
    sfd_refs = sfdData if sfdData is not None else [None] * len(contentData)
    zipped_data = zip(rawData, etData, contentData, ffdData, gdData, gptData, trtData, sfd_refs, nFixData)

    for position, (raw_eegs_obj, ets_obj, word_obj, ffd, gd, gpt, trt, sfd, nFix) in enumerate(zipped_data):
        word_string = load_matlab_string(data_container[word_obj[0]])
        if not is_real_word(word_string):
            print(word_string + " is not a real word.")
            continue

        data_dict = {}
        data_dict["RAW_EEG"] = extract_all_fixations(data_container, raw_eegs_obj[0], eeg_float_resolution)
        data_dict["RAW_ET"] = extract_all_fixations(data_container, ets_obj[0], np.float32)

        data_dict["FFD"] = _read_scalar_measure(data_container, ffd[0])
        data_dict["GD"] = _read_scalar_measure(data_container, gd[0])
        data_dict["GPT"] = _read_scalar_measure(data_container, gpt[0])
        data_dict["TRT"] = _read_scalar_measure(data_container, trt[0])
        data_dict["SFD"] = _read_scalar_measure(data_container, sfd[0]) if sfd is not None else None
        data_dict["nFix"] = _read_scalar_measure(data_container, nFix[0])

        # The per-word feature datasets are indexed by the position inside the sentence, not by
        # word_idx: word_idx does not advance for skipped tokens, so using it would pair every
        # later word with the features of an earlier token.
        data_dict["ALPHA_EEG"] = _concat_band_features(data_container, Alpha_features_data, position)
        data_dict["BETA_EEG"] = _concat_band_features(data_container, Beta_features_data, position)
        data_dict["GAMMA_EEG"] = _concat_band_features(data_container, Gamma_features_data, position)
        data_dict["THETA_EEG"] = _concat_band_features(data_container, Theta_features_data, position)

        data_dict["word_idx"] = word_idx
        data_dict["content"] = word_string

        word_tokens_all.append(word_string)
        if data_dict["nFix"] is not None:
            data_dict["GD_EEG"] = _squeezed_features(data_container, GD_EEG_features, position)
            data_dict["FFD_EEG"] = _squeezed_features(data_container, FFD_EEG_features, position)
            data_dict["TRT_EEG"] = _squeezed_features(data_container, TRT_EEG_features, position)
            word_tokens_has_fixation.append(word_string)
            word_tokens_with_mask.append(word_string)
        else:
            word_tokens_with_mask.append('[MASK]')

        word_level_data[word_idx] = data_dict
        word_idx += 1

    return word_level_data, word_tokens_all, word_tokens_has_fixation, word_tokens_with_mask