src/DataExtraction/DataExtractor.py

#!/usr/bin/env python3
#Created by Adithya Shastry
#Email: ams2590@columbia.edu
#Import stuff
import json #we need it to save our data
import pandas as pd #to read in hdf5 files
from statistics import mode
import statistics
import os
class hdfSlidingData:
    '''
    This class processes the putEMG dataset and creates a sequential one
    using a sliding window. After the main processing, a json file is
    written for each position of the window. Windows that contain multiple
    labels are given a single label: the most common label in the window.
    '''
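    # For illustration (not in the original file): the "most common label"
    # rule uses statistics.mode on the label column of the window, e.g.
    #   mode([3, 3, 3, 2, 2]) -> 3
    # so a window that straddles a gesture boundary gets the majority label.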
    def __init__(self,window = 200):
        '''
        Inputs:
            - window: the size of the window that will create the temporal
              region
        '''
        self.window = window
        self.debug = True
        self.errors = 0
    def toJson(self,dataDict):
        '''
        This method converts a data/label dictionary to a json string
        Inputs:
            - dataDict: dictionary to convert to a json
        '''
        return json.dumps(dataDict)
    def toDict(self,data,label):
        '''
        Converts data and a label to a dictionary object that can then be
        used to create a json file
        Inputs:
            - data: list of lists to save
            - label: the label you want the lists to have
        '''
        dataDict = dict()
        #Add our label to the dict
        dataDict['label'] = label
        #create a sub dict for data points
        dataDict['data'] = dict()
        #now we can iterate through and create our dictionary
        for i,sig in enumerate(data):
            #cycle through the emg signal channels
            #we use i+1 to stay consistent with the original dataset
            dataDict['data']["emg{}".format(i+1)] = sig
        #return our dictionary
        return dataDict
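    # A minimal sketch (not in the original file) of what toDict/toJson
    # produce for a hypothetical 3-sample window with two channels:
    #   toDict([[1, 2, 3], [4, 5, 6]], 0)
    #     -> {'label': 0, 'data': {'emg1': [1, 2, 3], 'emg2': [4, 5, 6]}}
    #   toJson(...) then serializes that dictionary to a JSON string.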
    def slidingWindow(self,df,fileOut):
        '''
        Runs the sliding window algorithm on a dataframe passed into it
        Input:
            - df: a dataframe loaded in from the putEMG dataset
            - fileOut: the output file prefix to save the json files to
        Output:
            - saves one json file per window position to the path given
              in fileOut
        '''
        #Slice the df
        df = df.iloc[256000:512000,1:25]
        #first extract the columns of the dataframe
        cols = list(df.columns)
        #get the number of rows in the dataset
        rows = len(df.index)
        #now create our window using self.window
        for i in range(0,rows-self.window):
            #We subtract the window from the range because otherwise the
            #window would run past the end of the dataset
            endWindow = i + self.window + 1 #the end of the window (exclusive)
            if self.debug:
                print("Reading rows {}-{}".format(i,endWindow))
            #iterate through the signal columns and extract the data points
            currentData = []
            for j,c in enumerate(cols[:-1]):
                #Extract the data points for this channel
                currentData.append(df.iloc[i:endWindow,j].values.tolist())
            #convert our data to a dictionary
            try:
                #copy the label column for this window
                l = df.iloc[i:endWindow,-1].copy().values.tolist()
                dataDict = self.toDict(currentData,mode(l))
            except statistics.StatisticsError:
                print("Could not compute a unique label for this window")
                #we want to just skip this window
                self.errors = self.errors + 1
                #check if errors exceeds 10
                if self.errors >= 10:
                    #then we just want to return and continue to the next file
                    return None
                continue
            with open("{}-{}.json".format(fileOut,i),'w') as f:
                f.write(self.toJson(dataDict))
        return None
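    # For illustration (hypothetical numbers, not in the original file): with
    # window=200 and fileOut="outdir/emg_gestures-01", the loop writes
    # outdir/emg_gestures-01-0.json covering rows 0-200, ...-1.json covering
    # rows 1-201, and so on -- one json file per window position.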
    def convertHDF5(self,filepath,outputDir):
        '''
        Runs all of the processing steps for a single hdf5 file
        Inputs:
            - filepath: the filepath to a hdf5 file you want to process
            - outputDir: The output directory for the json files
        Output:
            - json files written to outputDir, one per window position
        '''
        #first load the filepath as a dataframe
        df = pd.read_hdf(filepath)
        #Now run the sliding window algorithm on the data
        self.slidingWindow(df,os.path.join(outputDir,filepath.split('.')[0]))
        return None
    def getErrors(self):
        '''
        getter method that returns the number of errors
        '''
        return self.errors
    def resetError(self):
        '''
        resets the error counter
        '''
        self.errors = 0
if __name__ == '__main__':
    window = 1000 #Set a default window size
    slider = hdfSlidingData(window)
    #define file paths
    base = "/Users/adish/Documents/School/Fall 2021 Courses/Deep Learning - Signals/Final Project/EMGGestureClassification/Data/sequential"
    os.chdir(base)
    for filename in os.listdir():
        #iterate through all of the hdf5 files and save their windows as json
        print(filename)
        if not filename.endswith('.hdf5'):
            #skip anything that is not an hdf5 file (e.g. the output directory)
            continue
        out = "jsonOut4"
        outputDir = os.path.join(base,out)
        if not os.path.exists(outputDir):
            #Then make it
            os.mkdir(outputDir)
        slider.convertHDF5(filename,outputDir)
        #check the number of errors from this file
        err = slider.getErrors()
        slider.resetError() #reset the number of errors seen
        if err >= 10:
            print("Skipped to next file after {} errors".format(err))