In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
import random
import copy
import os
import graphviz
import scipy.io.wavfile as wav
from src.voice_activity_detection.extract_features import extract_features

In [2]:
voice_noise_data = np.load("numpy_files/musan_mean_speech_noise.npy").item()
voice_noise_df = pd.DataFrame.from_dict(voice_noise_data)
voice_noise_df.replace([np.inf,-np.inf,np.nan],0)
voice_noise_df.dropna()
voice_noise_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1356 entries, 0 to 1355
Data columns (total 9 columns):
RMS                  1356 non-null int64
ZCR                  1356 non-null float64
audio                1356 non-null object
bandwidth            1356 non-null float64
nwpd                 1356 non-null float64
rse                  1353 non-null float64
spectral_centroid    1356 non-null float64
spectral_flux        1356 non-null float64
spectral_rolloff     1356 non-null float64
dtypes: float64(7), int64(1), object(1)
memory usage: 95.4+ KB


In [3]:
voice_noise_df["audio"] = voice_noise_df["audio"].astype('category')
voice_noise_df["audio"] = voice_noise_df["audio"].cat.codes
voice_noise_df.head(10)

Unnamed: 0,RMS,ZCR,audio,bandwidth,nwpd,rse,spectral_centroid,spectral_flux,spectral_rolloff
0,120,0.317919,1,285263.7,-1.086645,-0.190269,535.891491,0.039433,4020.624371
1,2,0.44235,1,674314.3,10.7457,-0.875232,840.023296,0.061607,3833.170732
2,13,0.464951,1,857224.4,1.387262,-0.322468,572.973659,0.018836,4938.53211
3,130,0.15085,1,3938587.0,0.49867,-0.290313,3448.58572,0.010311,6988.571429
4,42,0.25021,1,665731.4,0.925133,-0.263716,745.102831,0.014008,4590.849315
5,93,0.481068,1,715268.8,-0.090272,-0.184233,738.235253,0.022868,4529.035025
6,12,0.496398,1,1041786.0,-0.61115,-0.405754,1783.177886,0.020107,5412.625995
7,1967,0.40109,0,228485.2,-0.196838,-0.301843,527.226453,0.034232,3316.301112
8,151,0.499611,1,359781.6,0.241001,-0.153348,524.060903,0.011216,4010.77381
9,5,0.500247,1,5324209.0,-0.130207,-0.202091,2541.108767,0.006781,7003.905325


In [4]:
TRAIN_TEST_PERCENTAGE = 0.80
data = voice_noise_df.loc[:,voice_noise_df.columns != "audio"].as_matrix()
labels = voice_noise_df["audio"].as_matrix()

full_data = np.hstack((data,np.expand_dims(labels,axis = 1)))
random.shuffle(full_data)
data = full_data[:,0:-1]
labels = full_data[:,-1]

nan_indices = np.where(np.isnan(data))
print(nan_indices)
all_indices_ = np.ones(len(data),dtype = "bool")
print(all_indices_.shape)
all_indices_[nan_indices[0]] = False
data_ = copy.deepcopy(data[all_indices_,:])
labels_ = copy.deepcopy(labels[all_indices_])
print(data_.shape)
print(np.where(np.isnan(data_)))

index_ = int(len(data_)*TRAIN_TEST_PERCENTAGE)
train_data = data_[0:index_,:]
train_labels = labels_[0:index_]
test_data = data_[index_:,:]
test_labels = labels_[index_:]

print(train_data.shape)

print(np.where(np.isnan(train_data)))

print(np.where(np.isnan(test_data)))

(array([132, 229]), array([4, 4]))
(1356,)
(1354, 8)
(array([], dtype=int64), array([], dtype=int64))
(1083, 8)
(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))


In [5]:
classifier = tree.DecisionTreeClassifier(max_depth = 3)
classifier = classifier.fit(train_data,train_labels)

In [6]:
prediction = classifier.predict(test_data)
print(np.mean(np.equal(prediction,test_labels).astype(np.float32)))

0.97416973


## For testing new audio

In [8]:
TEST_AUDIO_FOLDER = "/Users/siva/Documents/speaker_recognition/VOD/testwav/"

In [9]:
def create_dataset(ROOT_FOLDER,WINDOW_LENGTH = 5,FRAME_LENGTH = 25):
    os.chdir(ROOT_FOLDER)
    all_audio = os.listdir()
    dataset_dict = {"ZCR":[],"RMS":[],"spectral_flux":[],\
                   "spectral_centroid":[],"spectral_rolloff":[],\
                   "bandwidth":[],"nwpd":[],"rse":[]}
    for audio in all_audio:
        print("****************************")
        print("reading:",audio)
        sampling_rate,sig = wav.read(ROOT_FOLDER+audio)
        print("sampling rate:",sampling_rate,"signal length",len(sig))
        index = 0
        while index+(sampling_rate*WINDOW_LENGTH) < len(sig):
            sample = sig[index:(index+(sampling_rate*WINDOW_LENGTH))]
            ef = extract_features(sample,FRAME_LENGTH,sampling_rate)
            ZCR,RMS,sf,sr,sc,bd,nwpd,rse = ef.return_()
            dataset_dict["ZCR"].append(ZCR)
            dataset_dict["RMS"].append(RMS)
            dataset_dict["spectral_flux"].append(np.mean(sf))
            dataset_dict["spectral_centroid"].append(np.mean(sc))
            dataset_dict["spectral_rolloff"].append(np.mean(sr))
            dataset_dict["bandwidth"].append(np.mean(bd))
            dataset_dict["nwpd"].append(np.mean(nwpd))
            dataset_dict["rse"].append(np.mean(rse))
            index += sampling_rate*WINDOW_LENGTH
    values = dataset_dict.values()
    print([len(e) for e in values])
    print("finished")
    return dataset_dict

In [10]:
features_test_dict = create_dataset(TEST_AUDIO_FOLDER)

****************************
reading: audiotest08-06-2018-17-01-26.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-01-43.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-01-56.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-02-11.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-16-49-40.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-01-12.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-13-12-39.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-02-44.wav
sampling rate: 16000 signal length 84480
[8, 8, 8, 8, 8, 8, 8, 8]
finished


In [11]:
test_df = pd.DataFrame.from_dict(features_test_dict)
test_df.replace([np.inf,-np.inf,np.nan],0)
test_df.dropna()
test_df_data = test_df.as_matrix()
test_df.head(10)

Unnamed: 0,RMS,ZCR,bandwidth,nwpd,rse,spectral_centroid,spectral_flux,spectral_rolloff
0,235,0.148002,1800357.0,0.323637,-0.25898,1043.434918,0.021088,6197.853916
1,347,0.239553,3614656.0,1.323582,-0.249343,1700.323351,0.008497,6469.879518
2,271,0.130902,1469745.0,1.151613,-0.253432,988.019173,0.014795,5685.115462
3,140,0.125277,1089120.0,-0.566549,-0.337949,926.835474,0.019217,5463.227912
4,270,0.093726,1016863.0,3.289868,-0.30584,786.064634,0.027444,5227.158635
5,242,0.179977,2751192.0,0.520077,-0.235335,1297.176396,0.016272,6467.871486
6,31,0.085739,1137872.0,0.515145,-0.236153,641.419533,0.026151,4864.646084
7,187,0.107876,668675.2,0.009063,-0.304043,802.714148,0.020602,5183.985944


## 1 - Voice and 0 - Noise

In [13]:
test_predictions = classifier.predict(test_df_data)
print(test_predictions)

[1. 1. 1. 1. 1. 1. 1. 1.]
