lexical-stress-detection / Git / [8c4e02] /cnnmodel/feature_extraction/non_mfcc

Models:
Robert-Orr/
lexical-stress-detection
Downloads: 1
[8c4e02]: / cnnmodel / feature_extraction / non_mfcc_extraction.py
History
Download this file
187 lines (152 with data), 5.5 kB

import numpy
import math

# Small constant added to denominators in pitch_from_zcr so they never reach exactly zero.
EPS = 1e-8  # 0.00000001
# Analysis window length and hop between windows. get_non_mfcc multiplies both by
# the sample rate to obtain sizes in samples, i.e. these are durations in seconds
# (25 ms window, 10 ms step) when the sample rate is given in Hz.
win_length = 0.025
win_step = 0.01


def audio2frame(signal, frame_length, frame_step, winfunc=lambda x: numpy.ones((x,))):
    """
    Cut a signal into overlapping frames, zero-padding the tail so the last frame is full.
    :param signal: the audio signal to frame.
    :param frame_length: frame size in samples (rounded to the nearest integer).
    :param frame_step: hop in samples between the starts of consecutive frames (rounded to the nearest integer).
    :param winfunc: callable taking the frame length and returning the analysis window. Defaults to all ones, i.e. no windowing.
    :returns: an array of windowed frames. Size is NUMFRAMES by frame_length.
    """
    n_samples = len(signal)
    frame_length = int(round(frame_length))
    frame_step = int(round(frame_step))

    # A signal that fits in one frame gives exactly one (padded) frame;
    # otherwise add as many hops as are needed to reach the end of the signal.
    frames_num = 1
    if n_samples > frame_length:
        frames_num += int(math.ceil((1.0 * n_samples - frame_length) / frame_step))

    # Pad with zeros up to the end of the last frame.
    padded_length = int((frames_num - 1) * frame_step + frame_length)
    padding = numpy.zeros((padded_length - n_samples,))
    padded = numpy.concatenate((signal, padding))

    # Row i of the index matrix is start_i + [0, 1, ..., frame_length - 1].
    starts = numpy.arange(0, frames_num * frame_step, frame_step)
    within = numpy.arange(0, frame_length)
    indices = (starts[:, numpy.newaxis] + within[numpy.newaxis, :]).astype(numpy.int32)

    window = numpy.tile(winfunc(frame_length), (frames_num, 1))
    return padded[indices] * window


def get_p2pamplitude(signal):
    """
    f1 : Compute the peak-to-peak amplitude of the signal
    :param signal: array-like of samples; must not be empty.
    :returns: maximum sample value minus minimum sample value, as a float.
    :raises ValueError: if the signal is empty (raised by numpy.max / numpy.min).
    """
    # Subtract in double precision: with fixed-width integer samples (e.g.
    # int16 PCM) the difference can exceed the dtype's range and wrap around.
    return float(numpy.max(signal)) - float(numpy.min(signal))


def get_mean_energy_over_syllable_nucleus(energy):
    """
    f2 : Mean energy over syllable nucleus
    :param energy: sequence of per-frame energy values.
    :returns: arithmetic mean of all values in energy.
    """
    energy_values = numpy.asarray(energy)
    return energy_values.mean()


def get_max_energy_over_syllable_nucleus(energy):
    """
    f3 : Max energy over syllable nucleus
    :param energy: sequence of per-frame energy values.
    :returns: largest value in energy.
    """
    energy_values = numpy.asarray(energy)
    return energy_values.max()


def get_duration(signal, samplerate):
    """
    f4 & f5 : Duration of a sound wave. Send input (syllable/vowel) accordingly
    :param signal: sequence of samples; only its length is used.
    :param samplerate: number of samples per unit of time.
    :returns: number of samples divided by samplerate.
    """
    return len(signal) / samplerate


def get_max_pitch_over_syllable_nucleus(pitch_for_frames):
    """
    f6 : Maximum pitch over syllable nucleus
    :param pitch_for_frames: sequence of per-frame values.
    :returns: largest value found anywhere in pitch_for_frames (no axis is selected, so nested input is reduced over all of its elements).
    """
    pitch_values = numpy.asarray(pitch_for_frames)
    return pitch_values.max()


def get_mean_pitch_over_syllable_nucleus(pitch_for_frames):
    """
    f7 : Mean pitch over syllable nucleus
    :param pitch_for_frames: sequence of per-frame values.
    :returns: arithmetic mean of every value in pitch_for_frames (no axis is selected, so nested input is averaged over all of its elements).
    """
    pitch_values = numpy.asarray(pitch_for_frames)
    return pitch_values.mean()


def pitch_from_zcr(frame, fs):
    """
    Estimate the harmonic ratio and the fundamental frequency (F0) of one frame.

    Despite the name, the estimate comes from the peak of a normalised
    autocorrelation of the frame. Zero crossings are used only to choose the
    first lag that is considered and to reject frames whose normalised
    autocorrelation changes sign too often.
    :param frame: samples of a single frame; expected to be a 1-D numpy array
        (numpy.correlate and ``frame ** 2`` are applied to it).
    :param fs: sampling rate (presumably in Hz); the returned pitch is fs divided by the selected lag.
    :returns: tuple (HR, pitch). HR is the maximum of the normalised autocorrelation,
        0.0 when the frame is rejected by the sign-change test, or 1.0 when no lags
        are available. pitch is the F0 estimate, or 0.0 when the frame is rejected,
        the estimate exceeds 5000, or HR is below 0.1.
    """
    # Upper bound on the lag range: the number of samples in 0.016 time units
    # of fs (16 ms if fs is in Hz), minus one.
    M = numpy.round(0.016 * fs) - 1
    # Full autocorrelation of the frame; index len(frame) - 1 is the zero-lag term.
    R = numpy.correlate(frame, frame, mode='full')
    g = R[len(frame) - 1]
    # Keep only the positive lags, dropping the very last one.
    # NOTE(review): after this slice R[k] holds lag k + 1, yet the index is later
    # used directly as the lag when computing f0 - confirm this is intended.
    R = R[len(frame):-1]
    # estimate m0 (as the first zero crossing of R)
    [a, ] = numpy.nonzero(numpy.diff(numpy.sign(R)))
    if len(a) == 0:
        # No sign change at all: fall back to the last index of R.
        m0 = len(R) - 1
    else:
        m0 = a[0]

    # Clamp the lag bound when the frame is too short to provide M lags.
    if M > len(R):
        M = len(R) - 1

    M = int(M)
    m0 = int(m0)
    # Normalised autocorrelation for indices m0 .. M - 1; all other entries stay zero.
    Gamma = numpy.zeros(M)
    # Running sum of squared samples, used (reversed) in the normalisation below.
    CSum = numpy.cumsum(frame ** 2)
    # EPS keeps the denominator away from zero.
    Gamma[m0:M] = R[m0:M] / (numpy.sqrt((g * CSum[M:m0:-1])) + EPS)
    # zcr returns (count, rate); only the rate is used here.
    ZCR = zcr(Gamma)
    if ZCR[1] > 0.15:
        # Too many sign changes in Gamma: report no harmonicity and no pitch.
        HR = 0.0
        f0 = 0.0
    else:
        if len(Gamma) == 0:
            # M is 0 here, so there are no lags to search.
            HR = 1.0
            blag = 0.0
            # Gamma is not read again after this assignment.
            Gamma = numpy.zeros((M), dtype=numpy.float64)
        else:
            # Peak value of Gamma and the index at which it occurs.
            HR = numpy.max(Gamma)
            blag = numpy.argmax(Gamma)
        # Get fundamental frequency:
        # blag == 0 gives a huge value here, which the 5000 limit below discards.
        f0 = fs / (blag + EPS)
        if f0 > 5000:
            f0 = 0.0
        # A weak peak is treated as having no pitch.
        if HR < 0.1:
            f0 = 0.0
    pitch = f0
    return HR, pitch


def zcr(frame):
    """
    Compute the number and rate of sign-changes of the signal during the duration of a particular frame
    :param frame: sequence of samples.
    :returns: tuple (count, rate). count is half the summed absolute difference of
        consecutive sample signs; rate is count divided by the number of
        consecutive sample pairs (len(frame) - 1), or 0.0 when the frame has
        fewer than two samples.
    """
    count = len(frame)
    # Each full sign flip (+1 <-> -1) contributes 2 to the summed differences.
    countZC = numpy.sum(numpy.abs(numpy.diff(numpy.sign(frame)))) / 2
    if count < 2:
        # No consecutive pair exists, so there is nothing to divide by;
        # report a zero rate instead of NaN from a division by zero.
        return countZC, numpy.float64(0.0)
    return countZC, (numpy.float64(countZC) / numpy.float64(count - 1.0))


def get_energy_for_frame(frame):
    """
    Compute energy value of frame
    :param frame: sequence of samples for one frame.
    :returns: sum of squared samples divided by the number of samples.
    """
    # Square in float64: with fixed-width integer samples (e.g. int16 PCM)
    # the squares would overflow the input dtype and wrap around.
    samples = numpy.asarray(frame, dtype=numpy.float64)
    return numpy.sum(samples ** 2) / numpy.float64(len(samples))


def get_energy_for_frames(frames):
    """
    Compute energy value for all frames
    :param frames: sequence of frames, each passed to get_energy_for_frame.
    :returns: list with one energy value per frame, in frame order.
    """
    return [get_energy_for_frame(frames[idx]) for idx in range(len(frames))]


def get_pitch_values(frames, fs):
    """
    Compute pitch values for all frames
    :param frames: sequence of frames, each passed to pitch_from_zcr.
    :param fs: sampling rate forwarded to pitch_from_zcr.
    :returns: list with one pitch_from_zcr result per frame, in frame order; each entry is the (HR, pitch) tuple that function returns.
    """
    return [pitch_from_zcr(frames[idx], fs) for idx in range(len(frames))]


def get_non_mfcc(signal, samplerate):
    """
    Compute the non-MFCC features of the signal. The returned array holds, by index:
    0 : peak-to-peak amplitude of the signal
    1 : mean frame energy
    2 : max frame energy
    3 : duration of the signal (number of samples / samplerate)
    4 : maximum pitch over the frames
    5 : mean pitch over the frames
    :param signal: the audio samples of the segment to describe.
    :param samplerate: sampling rate of the signal.
    :returns: numpy array of 6 float features in the order listed above.
    """
    # win_length / win_step are durations; scale by the sample rate to get sample counts.
    frames = audio2frame(signal, win_length * samplerate, win_step * samplerate)
    energy = get_energy_for_frames(frames)
    # get_pitch_values yields one (harmonic ratio, pitch) pair per frame. Keep
    # only the pitch so the max/mean below are not mixed with harmonic ratios.
    pitch_vals = [pitch for _, pitch in get_pitch_values(frames, samplerate)]

    non_mfcc_features = numpy.zeros(6)
    non_mfcc_features[0] = get_p2pamplitude(signal)
    non_mfcc_features[1] = get_mean_energy_over_syllable_nucleus(energy)
    non_mfcc_features[2] = get_max_energy_over_syllable_nucleus(energy)
    non_mfcc_features[3] = get_duration(signal, samplerate)
    non_mfcc_features[4] = get_max_pitch_over_syllable_nucleus(pitch_vals)
    non_mfcc_features[5] = get_mean_pitch_over_syllable_nucleus(pitch_vals)
    return non_mfcc_features