|
a |
|
b/dataset/read_raw_dataset.py |
|
|
1 |
import numpy as np |
|
|
2 |
from os import listdir |
|
|
3 |
from os.path import isfile, join |
|
|
4 |
|
|
|
5 |
def read_raw_dataset(base_dir='.'):
    """
    Read the raw time-series dataset from disk.

    Each set is stored in its own directory (Z, O, N, F, S) with one plain-text
    file per training example; the sets are exposed under the conventional
    names A-E.

    Parameters
    ----------
    base_dir : str, optional
        Directory containing the Z, O, N, F, S sub-directories.  Defaults to
        the current working directory, which matches the original behavior.

    Returns
    -------
    dict
        Keys 'A', 'B', 'C', 'D', 'E'; each value is a 2D numpy array of shape
        (m, n) where
            m = number of training examples (files) in the set
            n = number of data points in each time series
    """
    # Directory names on disk are Z, O, N, F, S, while the set names used
    # downstream are A, B, C, D, E.
    mapping_set_to_dir = {
        'A': 'Z',
        'B': 'O',
        'C': 'N',
        'D': 'F',
        'E': 'S',
    }

    raw_dataset = {}
    for set_name, dir_name in mapping_set_to_dir.items():
        set_dir = join(base_dir, dir_name)

        # One file corresponds to one training example.  Sort the listing so
        # the row order is deterministic (os.listdir order is
        # filesystem-dependent).
        example_files = sorted(
            f for f in listdir(set_dir) if isfile(join(set_dir, f))
        )

        # Load every example first, then stack once.  Calling np.append in a
        # loop copies the whole accumulated array each iteration (O(m^2)).
        examples = [np.loadtxt(join(set_dir, f)) for f in example_files]
        raw_dataset[set_name] = np.array(examples)

    return raw_dataset