src/DataExtraction/DataExtractor.py

#!/usr/bin/env python3
#Created by Adithya Shastry
#Email: ams2590@columbia.edu
#Import stuff
import json #we need it to save our data
import pandas as pd #to read in hdf5 files
from statistics import mode
import statistics
import os
class hdfSlidingData:
    '''
    This class processes the putEMG dataset and creates a sequential one
    using a sliding window. After the main processing, a json file is
    written for each position of the window. Windows that contain multiple
    labels are given a single label: the most common label in the window.
    '''
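    # For illustration (not in the original file): the "most common label"
    # rule uses statistics.mode on the label column of the window, e.g.
    #   mode([3, 3, 3, 2, 2]) -> 3
    # so a window that straddles a gesture boundary gets the majority label.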
    def __init__(self,window = 200):
        '''
        Inputs:
            - window: the size of the window that will create the temporal
              region
        '''
        self.window = window
        self.debug = True
        self.errors = 0
    def toJson(self,dataDict):
        '''
        This method converts a data/label dictionary to a json string
        Inputs:
            - dataDict: dictionary to convert to a json
        '''
        return json.dumps(dataDict)
    def toDict(self,data,label):
        '''
        Converts data and a label to a dictionary object that can then be
        used to create a json file
        Inputs:
            - data: list of lists to save
            - label: the label you want the lists to have
        '''
        dataDict = dict()
        #Add our label to the dict
        dataDict['label'] = label
        #create a sub dict for data points
        dataDict['data'] = dict()
        #now we can iterate through and create our dictionary
        for i,sig in enumerate(data):
            #cycle through the emg signal channels
            #we use i+1 to stay consistent with the original dataset
            dataDict['data']["emg{}".format(i+1)] = sig
        #return our dictionary
        return dataDict
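    # A minimal sketch (not in the original file) of what toDict/toJson
    # produce for a hypothetical 3-sample window with two channels:
    #   toDict([[1, 2, 3], [4, 5, 6]], 0)
    #     -> {'label': 0, 'data': {'emg1': [1, 2, 3], 'emg2': [4, 5, 6]}}
    #   toJson(...) then serializes that dictionary to a JSON string.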
    def slidingWindow(self,df,fileOut):
        '''
        Runs the sliding window algorithm on a dataframe passed into it
        Input:
            - df: a dataframe loaded in from the putEMG dataset
            - fileOut: the output file prefix to save the json files to
        Output:
            - saves one json file per window position to the path given
              in fileOut
        '''
        #Slice the df
        df = df.iloc[256000:512000,1:25]
        #first extract the columns of the dataframe
        cols = list(df.columns)
        #get the number of rows in the dataset
        rows = len(df.index)
        #now create our window using self.window
        for i in range(0,rows-self.window):
            #We subtract the window from the range because otherwise the
            #window would run past the end of the dataset
            endWindow = i + self.window + 1 #the end of the window (exclusive)
            if self.debug:
                print("Reading rows {}-{}".format(i,endWindow))
            #iterate through the signal columns and extract the data points
            currentData = []
            for j,c in enumerate(cols[:-1]):
                #Extract the data points for this channel
                currentData.append(df.iloc[i:endWindow,j].values.tolist())
            #convert our data to a dictionary
            try:
                #copy the label column for this window
                l = df.iloc[i:endWindow,-1].copy().values.tolist()
                dataDict = self.toDict(currentData,mode(l))
            except statistics.StatisticsError:
                print("Could not compute a unique label for this window")
                #we want to just skip this window
                self.errors = self.errors + 1
                #check if errors exceeds 10
                if self.errors >= 10:
                    #then we just want to return and continue to the next file
                    return None
                continue
            with open("{}-{}.json".format(fileOut,i),'w') as f:
                f.write(self.toJson(dataDict))
        return None
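    # For illustration (hypothetical numbers, not in the original file): with
    # window=200 and fileOut="outdir/emg_gestures-01", the loop writes
    # outdir/emg_gestures-01-0.json covering rows 0-200, ...-1.json covering
    # rows 1-201, and so on -- one json file per window position.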
    def convertHDF5(self,filepath,outputDir):
        '''
        Runs all of the processing steps for a single hdf5 file
        Inputs:
            - filepath: the filepath to a hdf5 file you want to process
            - outputDir: The output directory for the json files
        Output:
            - json files written to outputDir, one per window position
        '''
        #first load the filepath as a dataframe
        df = pd.read_hdf(filepath)
        #Now run the sliding window algorithm on the data
        self.slidingWindow(df,os.path.join(outputDir,filepath.split('.')[0]))
        return None
    def getErrors(self):
        '''
        getter method that returns the number of errors
        '''
        return self.errors
    def resetError(self):
        '''
        resets the error counter
        '''
        self.errors = 0
if __name__ == '__main__':
    window = 1000 #Set a default window size
    slider = hdfSlidingData(window)
    #define file paths
    base = "/Users/adish/Documents/School/Fall 2021 Courses/Deep Learning - Signals/Final Project/EMGGestureClassification/Data/sequential"
    os.chdir(base)
    for filename in os.listdir():
        #iterate through all of the hdf5 files and save their windows as json
        print(filename)
        if not filename.endswith('.hdf5'):
            #skip anything that is not an hdf5 file (e.g. the output directory)
            continue
        out = "jsonOut4"
        outputDir = os.path.join(base,out)
        if not os.path.exists(outputDir):
            #Then make it
            os.mkdir(outputDir)
        slider.convertHDF5(filename,outputDir)
        #check the number of errors from this file
        err = slider.getErrors()
        slider.resetError() #reset the number of errors seen
        if err >= 10:
            print("Skipped to next file after {} errors".format(err))