[8c4e02]: / scripts / generate_oscaar_csv.py

Download this file

96 lines (84 with data), 3.6 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import sys
import os
from pathlib import Path
def get_regex_bel_bkb(list_item, order):
    """
    Returns the file-name pattern for Basic English Sentences and BKB Sentences - English datasets

    Despite the function name, the result is a glob pattern (main() passes it
    to Path.glob), not a regular expression: a leading '*' followed by the
    list number and the order number, each zero-padded to at least two digits.

    Args:
        list_item: list number, as an int or a numeric string.
        order: sentence order within the list, as an int or a numeric string.

    Returns:
        The pattern string, e.g. '*0102' for list 1, order 2.

    Raises:
        ValueError: if either argument cannot be parsed as an integer.
    """
    # Format the parsed integers instead of concatenating the raw strings, so
    # surrounding whitespace or an already-present leading zero (' 5', '05')
    # cannot leak into the pattern.
    return "*{:02d}{:02d}".format(int(list_item), int(order))
def get_regex_ieee(order):
    """
    Returns the file-name pattern for IEEE English 2009 dataset

    Despite the function name, the result is a glob pattern (main() passes it
    to Path.glob), not a regular expression: a leading '*' followed by the
    order number zero-padded to at least three digits.

    Args:
        order: sentence order, as an int or a numeric string.

    Returns:
        The pattern string, e.g. '*005' for order 5 and '*720' for order 720.

    Raises:
        ValueError: if order cannot be parsed as an integer.
    """
    # Format the parsed integer instead of concatenating the raw string, so
    # surrounding whitespace or an existing leading zero cannot leak into the
    # pattern.
    return "*{:03d}".format(int(order))
def get_regex_clear_speech(num, order):
    """
    Returns the file-name pattern for Clear Speech 2002 dataset

    Despite the function name, the result is a glob pattern (main() passes it
    to Path.glob), not a regular expression. It is built from a leading '*',
    the list code, the character class '[cp]' (matches a single 'c' or 'p')
    and the order number zero-padded to at least two digits.

    Args:
        num: list number; the value 10 is written as 'X' in the pattern,
            every other value is used as given.
        order: sentence order, as an int or a numeric string.

    Returns:
        The pattern string, e.g. '*1[cp]05' or '*X[cp]12'.

    Raises:
        ValueError: if order cannot be parsed as an integer.
    """
    # Normalise first so that 10 (int) and ' 10' are recognised like '10'.
    list_code = str(num).strip()
    if list_code == '10':
        list_code = 'X'
    # Format the parsed order instead of concatenating the raw string, so
    # surrounding whitespace or an existing leading zero cannot leak into the
    # pattern.
    return "*{}[cp]{:02d}".format(list_code, int(order))
def main(oscaar_root, out_file):
    """
    Writes a tab-separated index of the OSCAAR audio files and their transcripts.

    Every sub-directory <dataset> of oscaar_root is searched for transcript
    files matching '*materials.txt'. The first line of each transcript file is
    skipped (presumably a header row - confirm against the data). Every other
    non-blank line is split on ',' into a list field (first column), an order
    field (last column) and the transcript text (all columns in between,
    re-joined with single spaces, so commas inside the text are not kept).

    Audio files are looked up with a glob pattern in
    <oscaar_root>/<dataset>/<dataset>. The pattern depends on the dataset:
    'BEL' (only for transcripts whose name starts with 'BEL_'), 'BKB', 'IEEE'
    and 'Clear Speech' use the get_regex_* helpers; every other case matches
    '<prefix>_[A-Z][A-Z]<n>.wav', where n is the 1-based count of lines
    processed so far in that transcript file.

    For each matching audio file one line is written:
    oscaar_<audio file name up to its first '.'> TAB <audio path> TAB <transcript>

    Args:
        oscaar_root: path of the directory that holds the dataset directories.
        out_file: path of the output file; it is overwritten if it exists.
    """
    root = Path(oscaar_root)
    # The context manager guarantees the output is flushed and closed, even
    # when processing fails part-way through.
    with open(out_file, 'w') as out:
        # Sorted so the output order does not depend on file-system enumeration.
        for dataset in sorted(os.listdir(oscaar_root)):
            dataset_dir = root / dataset
            if not dataset_dir.is_dir():
                continue
            audio_dir = dataset_dir / dataset
            for transcript_path in sorted(dataset_dir.glob('*materials.txt')):
                # Inspect only the file name: splitting the whole path on '.'
                # breaks as soon as any directory name contains a dot.
                transcript_name = transcript_path.name.split(".")[0]
                transcript_sub_name = ""
                if '_' in transcript_name:
                    transcript_sub_name = transcript_name.split("_")[0]
                with open(str(transcript_path), 'r') as t:
                    # Skip the first line; the default avoids StopIteration on
                    # an empty file.
                    next(t, None)
                    counter = 0
                    for line in t:
                        if not line.strip():
                            # Blank lines carry no record.
                            continue
                        fields = line.split(",")
                        list_field = fields[0]
                        # Strip all surrounding whitespace, not just the newline,
                        # so 'a, b, 5' style rows still give a clean number.
                        order = fields[-1].strip()
                        transcript = " ".join(fields[1:-1])
                        # The list item (second word of the first column) is
                        # parsed only in the branches that need it.
                        if (dataset == 'BEL' and transcript_sub_name == 'BEL') or dataset == 'BKB':
                            suffix = get_regex_bel_bkb(list_field.split()[1], order)
                        elif dataset == 'IEEE':
                            suffix = get_regex_ieee(order)
                        elif dataset == 'Clear Speech':
                            suffix = get_regex_clear_speech(list_field.split()[1], order)
                        else:
                            counter = counter + 1
                            suffix = "_" + '[A-Z][A-Z]' + str(counter)
                        pattern = transcript_sub_name + suffix + '.wav'
                        for audio_file in sorted(audio_dir.glob(pattern)):
                            audio_id = audio_file.name.split(".")[0]
                            out.write('oscaar_' + audio_id + '\t' + str(audio_file) + '\t' + transcript + '\n')
if __name__ == '__main__':
    # Needs two command line arguments:
    # 1. root path of Oscaar
    # 2. output csv path
    if len(sys.argv) < 3:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: ' + sys.argv[0] + ' <oscaar_root> <output_csv>')
    main(sys.argv[1], sys.argv[2])