"""Generate a tab-separated listing of OSCAAR audio files and transcripts.

For every dataset directory under the OSCAAR root, each ``*materials.txt``
transcript file is read and, for every transcript line, the matching ``.wav``
files in ``<root>/<dataset>/<dataset>/`` are looked up with a glob pattern.
One row ``oscaar_<id><TAB><wav path><TAB><transcript>`` is written per match.
"""
import sys
import os
from pathlib import Path


def _zero_prefix(number, width):
    """
    Prefix the numeric string ``number`` with zeros up to ``width`` digits.

    The amount of padding is decided from the integer value of ``number``
    (one zero for every power of ten below ``10 ** (width - 1)`` that the
    value does not reach), and the zeros are prepended to the string as given.
    Raises ValueError if ``number`` is not an integer literal.
    """
    value = int(number)
    missing = sum(1 for power in range(1, width) if value < 10 ** power)
    return '0' * missing + number


def get_regex_bel_bkb(list_item, order):
    """
    Returns the glob pattern for Basic English Sentences and BKB Sentences - English datasets

    The pattern is ``*`` followed by the two-digit list number and the
    two-digit order number, e.g. ``("1", "2") -> "*0102"``.
    Both arguments are numeric strings.
    """
    return "*" + _zero_prefix(list_item, 2) + _zero_prefix(order, 2)


def get_regex_ieee(order):
    """
    Returns the glob pattern for IEEE English 2009 dataset

    The pattern is ``*`` followed by the three-digit order number,
    e.g. ``"7" -> "*007"``. ``order`` is a numeric string.
    """
    return "*" + _zero_prefix(order, 3)


def get_regex_clear_speech(num, order):
    """
    Returns the glob pattern for Clear Speech 2002 dataset

    The pattern is ``*``, the list number (``'10'`` is written as ``X``),
    the character class ``[cp]`` and the two-digit order number,
    e.g. ``("10", "3") -> "*X[cp]03"``.
    """
    list_code = 'X' if num == '10' else num
    return "*" + list_code + "[cp]" + _zero_prefix(order, 2)


def _list_number(list_field):
    """
    Return the second space-separated token of the first transcript column.

    Raises IndexError if the column contains no space.
    """
    return list_field.split(" ")[1]


def _write_transcript_rows(manifest, dataset, audio_dir, transcript_path):
    """
    Write one output row per audio file matching each line of a transcript file.

    manifest        -- open, writable text file receiving the rows
    dataset         -- name of the dataset directory (selects the file naming scheme)
    audio_dir       -- Path of the directory holding the dataset's .wav files
    transcript_path -- Path of the ``*materials.txt`` file to read
    """
    # Take the name from the final path component only, so that dots or
    # separators in parent directories cannot corrupt it.
    transcript_name = transcript_path.name.split(".")[0]
    prefix = transcript_name.split("_")[0] if '_' in transcript_name else ""

    with open(transcript_path, 'r') as rows:
        # Skip the first line (presumably a header row); tolerate an empty file.
        next(rows, None)
        counter = 0
        for line in rows:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no transcript.
                continue
            fields = line.split(",")
            order = fields[-1].strip()
            # Commas inside the transcript text are replaced by spaces.
            transcript = " ".join(fields[1:-1])

            # The list number is only extracted for the naming schemes that use it.
            if (dataset == 'BEL' and prefix == 'BEL') or dataset == 'BKB':
                pattern = prefix + get_regex_bel_bkb(_list_number(fields[0]), order)
            elif dataset == 'IEEE':
                pattern = prefix + get_regex_ieee(order)
            elif dataset == 'Clear Speech':
                pattern = prefix + get_regex_clear_speech(_list_number(fields[0]), order)
            else:
                # Other datasets are matched by the running line number.
                counter += 1
                pattern = prefix + "_" + '[A-Z][A-Z]' + str(counter)

            # Sorted so that the output is reproducible across runs.
            for audio_file in sorted(audio_dir.glob(pattern + '.wav')):
                audio_id = audio_file.name.split(".")[0]
                manifest.write('oscaar_' + audio_id + '\t' + str(audio_file) + '\t' + transcript + '\n')


def main(oscaar_root, out_file):
    """
    Scan every dataset directory under ``oscaar_root`` and write the listing.

    oscaar_root -- root path of Oscaar; each sub-directory ``<name>`` is
                   expected to hold ``*materials.txt`` transcript files and
                   an inner ``<name>`` directory with the .wav files
    out_file    -- path of the tab-separated output file (overwritten)
    """
    root = Path(oscaar_root)
    # The context manager guarantees the output is flushed and closed.
    with open(out_file, 'w') as manifest:
        for dataset in sorted(os.listdir(oscaar_root)):
            dataset_dir = root / dataset
            if not dataset_dir.is_dir():
                continue
            audio_dir = dataset_dir / dataset
            for transcript_path in sorted(dataset_dir.glob('*materials.txt')):
                _write_transcript_rows(manifest, dataset, audio_dir, transcript_path)


if __name__ == '__main__':
    # Needs two command line arguments:
    # 1. root path of Oscaar
    # 2. output csv path (rows are tab-separated)
    if len(sys.argv) < 3:
        sys.exit("usage: generate_oscaar_csv.py <oscaar_root> <output_csv>")
    main(sys.argv[1], sys.argv[2])