lexical-stress-detection / Git / [8c4e02] /scripts/generate_oscaar

Models:
Robert-Orr/
lexical-stress-detection
Downloads: 1
[8c4e02]: / scripts / generate_oscaar_csv.py
History
Download this file
96 lines (84 with data), 3.6 kB

import sys
import os
from pathlib import Path


def get_regex_bel_bkb(list_item, order):
    """
    Build the glob pattern fragment used to find audio files of the
    Basic English Sentences and BKB Sentences - English datasets.

    Both arguments are numeric strings. The result is "*" followed by the
    list item and then the order, each prefixed with a single '0' when its
    integer value is 9 or less (e.g. ("5", "12") -> "*0512").
    """
    pattern = "*"
    for number in (list_item, order):
        # Values above 9 are appended unchanged; everything else gets one '0'.
        padding = '' if int(number) > 9 else '0'
        pattern += padding + number
    return pattern


def get_regex_ieee(order):
    """
    Build the glob pattern fragment used to find audio files of the
    IEEE English 2009 dataset.

    `order` is a numeric string. The result is "*" followed by `order`,
    prefixed with '00' when its value is below 10 and with '0' when it is
    below 100 (e.g. "7" -> "*007", "42" -> "*042", "720" -> "*720").
    """
    value = int(order)
    if value >= 100:
        zeros = ''
    elif value >= 10:
        zeros = '0'
    else:
        zeros = '00'
    return "*" + zeros + order


def get_regex_clear_speech(num, order):
    """
    Build the glob pattern fragment used to find audio files of the
    Clear Speech 2002 dataset.

    The result is "*", then `num` (the string '10' is written as 'X'),
    then the character class "[cp]", then `order` prefixed with a single
    '0' when its integer value is below 10
    (e.g. ("10", "5") -> "*X[cp]05").
    """
    # '10' is represented by the letter X; any other value is used verbatim.
    marker = 'X' if num == '10' else num
    prefix = "*" + marker + "[cp]"
    padding = '0' if int(order) < 10 else ''
    return prefix + padding + order


def main(oscaar_root, out_file):
    """
    Write a tab-separated index of the Oscaar audio files and transcripts.

    Every entry of `oscaar_root` is treated as a dataset directory. Each
    '*materials.txt' file found directly inside it is read as a transcript
    listing, and the matching '.wav' files are looked up in the nested
    '<oscaar_root>/<dataset>/<dataset>' directory.

    One line is written to `out_file` per matching audio file:
        oscaar_<audio file name without extension>\t<audio path>\t<transcript>

    :param oscaar_root: root path of the Oscaar corpus
    :param out_file: path of the output file (overwritten if it exists)
    """
    root = Path(oscaar_root)
    # The context manager guarantees the output is flushed and closed, even
    # when a malformed transcript raises half way through.
    with open(out_file, 'w') as out:
        for dataset in os.listdir(oscaar_root):
            dataset_dir = root / dataset
            audio_dir = dataset_dir / dataset
            for transcript_path in dataset_dir.glob('*materials.txt'):
                _write_transcript_rows(transcript_path, dataset, audio_dir, out)


def _write_transcript_rows(transcript_path, dataset, audio_dir, out):
    """Write the index lines for one transcript file to the open file `out`."""
    # Work on the file name only, so that dots or separators in parent
    # directories (e.g. '../oscaar' or 'data.v1') cannot corrupt the name.
    transcript_name = transcript_path.name.split(".")[0]
    sub_name = transcript_name.split("_")[0] if '_' in transcript_name else ""

    with open(transcript_path, 'r') as transcript_file:
        # Skip the header line; the default keeps an empty file from raising.
        next(transcript_file, None)
        counter = 0
        for line in transcript_file:
            if not line.strip():
                # Blank (typically trailing) lines carry no data.
                continue
            fields = line.split(",")
            order = fields[-1].strip("\n")
            # Everything between the first and last field is the sentence;
            # commas inside it were split above and are re-joined with spaces.
            transcript = " ".join(fields[1:-1])
            # The first field is expected to look like "<word> <number>".
            list_item = fields[0].split(" ")[1]

            if (dataset == 'BEL' and sub_name == 'BEL') or dataset == 'BKB':
                pattern = sub_name + get_regex_bel_bkb(list_item, order)
            elif dataset == 'IEEE':
                pattern = sub_name + get_regex_ieee(order)
            elif dataset == 'Clear Speech':
                pattern = sub_name + get_regex_clear_speech(list_item, order)
            else:
                # Remaining datasets are matched by the running (1-based)
                # position of the row inside its transcript file.
                counter += 1
                pattern = sub_name + "_" + '[A-Z][A-Z]' + str(counter)

            for audio_file in audio_dir.glob(pattern + '.wav'):
                id_ = audio_file.name.split(".")[0]
                out.write('oscaar_' + id_ + '\t' + str(audio_file) + '\t' + transcript + '\n')


if __name__ == '__main__':
    # Needs two command line arguments:
    # 1. root path of Oscaar
    # 2. output csv path
    # Any further arguments are ignored.
    if len(sys.argv) < 3:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: ' + sys.argv[0] + ' <oscaar_root> <output_csv>')
    main(sys.argv[1], sys.argv[2])