lexical-stress-detection / Git / Diff of /scripts/generate_oscaar

Models:
Robert-Orr/
lexical-stress-detection
Downloads: 1
Diff of /scripts/generate_oscaar_csv.py [000000] .. [8c4e02]
Switch to side-by-side view

--- a
+++ b/scripts/generate_oscaar_csv.py
@@ -0,0 +1,95 @@
+import sys
+import os
+from pathlib import Path
+
+
+def get_regex_bel_bkb(list_item, order):
+    """
+    Returns the regular expression for Basic English Sentences and BKB Sentences - English datasets
+    """
+    regex_ = "*"
+    if int(list_item) > 9:
+        regex_ = regex_ + list_item
+    else:
+        regex_ = regex_ + '0' + list_item
+    if int(order) > 9:
+        regex_ = regex_ + order
+    else:
+        regex_ = regex_ + '0' + order
+    return regex_
+
+
+def get_regex_ieee(order):
+    """
+    Returns the regular expression for IEEE English 2009 dataset
+    """
+    regex_ = "*"
+    if int(order) < 10:
+        regex_ = regex_ + '00' + order
+    elif int(order) < 100:
+        regex_ = regex_ + '0' + order
+    else:
+        regex_ = regex_ + order
+    return regex_
+
+
+def get_regex_clear_speech(num, order):
+    """
+    Returns the regular expression for Clear Speech 2002 dataset
+    """
+    regex_ = "*"
+    if num == '10':
+        regex_ = regex_ + 'X'
+    else:
+        regex_ = regex_ + num
+
+    regex_ = regex_ + "[cp]"
+    if int(order) < 10:
+        regex_ = regex_ + '0' + order
+    else:
+        regex_ = regex_ + order
+    return regex_
+
+
+def main(oscaar_root, out_file):
+    out_file = open(out_file, 'w')
+    for dir in os.listdir(oscaar_root):
+        transcript_iter = Path(oscaar_root + '/' + dir).glob('*materials.txt')
+        for f in transcript_iter:
+            transcript_file = str(f)
+            transcript_name = transcript_file.split(".")[0].split("/")[-1]
+            transcript_sub_name = ""
+            if '_' in transcript_name:
+                transcript_sub_name = transcript_name.split("_")[0]
+            with open(transcript_file, 'r') as t:
+                next(t)
+                counter = 0
+                for line in t:
+                    line_splits = line.split(",")
+                    list_ = line_splits[0]
+                    order = line_splits[-1].strip("\n")
+                    transcript = " ".join(line_splits[1:-1])
+                    list_item = list_.split(" ")[1]
+                    if ((dir == 'BEL') and (transcript_sub_name == 'BEL')) or (dir == 'BKB'):
+                        audio_file_iter = Path(oscaar_root + "/" + dir + '/' + dir).glob(transcript_sub_name + \
+                                                                                         get_regex_bel_bkb(list_item, order) + '.wav')
+                    elif dir == 'IEEE':
+                        audio_file_iter = Path(oscaar_root + "/" + dir + '/' + dir).glob(transcript_sub_name + \
+                                                                                         get_regex_ieee(order) + '.wav')
+                    elif dir == 'Clear Speech':
+                        audio_file_iter = Path(oscaar_root + "/" + dir + '/' + dir).glob(transcript_sub_name + \
+                                                                            get_regex_clear_speech(list_item, order) + '.wav')
+                    else:
+                        counter = counter + 1
+                        audio_file_iter = Path(oscaar_root + "/" + dir + '/' + dir).glob(transcript_sub_name + "_" + \
+                                                                                         '[A-Z][A-Z]' + str(counter) + '.wav')
+                    for audio_file in audio_file_iter:
+                        id_ = str(audio_file).split(".")[0].split("/")[-1]
+                        out_file.write('oscaar_' + id_ + '\t' + str(audio_file) + '\t' + transcript + '\n')
+
+
+if __name__ == '__main__':
+    # needs two command line argument.
+    # 1. root path of Oscaar
+    # 2. output csv path
+    main(sys.argv[1], sys.argv[2])