How to do it...

The speech commands dataset contains ~65,000 WAV files, organized into sub-folders named after the labels; each file is a short recording of one of 30 words, spoken by different speakers. In the Getting ready section of this recipe, we learned how to read a WAV file and obtain its frequency-amplitude representation by applying STFT. In this section, we'll extend the same idea to write a data generator and then train a neural network to recognize the spoken word.
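The code in this section reuses the STFT settings and libraries from the Getting ready section: the tuneR, seewave, stringr, splitstackshape, and keras packages, plus the variables window_length, window_type, overlap, fft_size, and num_fft_windows. If you are running this section on its own, the following sketch shows one plausible set of definitions; the specific values are illustrative assumptions and should match whatever you chose in the Getting ready section:

library(keras)
library(tuneR)            # readWave()
library(seewave)          # spectro()
library(stringr)          # str_split_fixed()
library(splitstackshape)  # stratified()

# Illustrative STFT settings (assumed values; use the ones from the Getting ready section)
window_length = 512               # samples per STFT window
window_type   = "hanning"         # window function passed to spectro()
overlap       = 50                # window overlap, in percent
fft_size      = window_length / 2 # frequency bins per STFT window

# One way to estimate how many STFT windows fit into a 1-second (16,000-sample) clip;
# if in doubt, check dim(spectro(...)$amp) on a single file instead.
num_fft_windows = length(seq(1, 16000 + 1 - window_length,
                             by = window_length - (overlap * window_length / 100)))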

Let's begin by preparing a dataset for the generator:

  1. First, we list all the files inside the data_speech_commands_v0.01 folder and create a DataFrame:
files = list.files("data/data_speech_commands_v0.01",all.files = T,full.names = F,recursive = T)
paste("Number audio files in dataset: ",length(files)

file_df = as.data.frame(files)
head(file_df)

The following screenshot shows a few records from the data:

Here, we can see that each filename is prefixed with its label, that is, the name of the sub-folder it belongs to. Now, let's create a DataFrame that contains all the filenames and their respective class labels. We will work with three classes: bird, no, and off:

# extract the class label from the sub-folder prefix of each file path
file_df$class = str_split_fixed(file_df$files, pattern = "/", n = 2)[, 1]
# shuffle the rows
file_df <- file_df[sample(nrow(file_df)), ]
rownames(file_df) <- NULL
# keep only the three classes of interest
file_df = file_df[file_df$class %in% c("bird", "no", "off"), ]
file_df$files <- as.character(file_df$files)
# encode the class labels as integers starting from 0
file_df$class <- as.numeric(as.factor(file_df$class)) - 1
rownames(file_df) <- NULL
head(file_df)

The following screenshot shows a sample of the DataFrame we created in the preceding code block:

Let's create a variable that represents the number of unique labels:

num_speech_labels = length(unique(file_df$class))
  2. Now, we split our data into train, test, and validation sets. We use the stratified() function from the splitstackshape library to do this:
# split data into train, test and validation
set.seed(200)
train_index = stratified(file_df, group = "class", size = 0.80, keep.rownames = T)$rn
test_index = setdiff(row.names(file_df), train_index)
val_index = stratified(file_df[train_index, ], group = "class", size = 0.20, keep.rownames = T)$rn

train_data = file_df[setdiff(train_index,val_index),]
test_data = file_df[test_index,]
val_data = file_df[val_index,]

Now, let's shuffle the train and test data:

# shuffle train and test data
test_data = test_data[sample(nrow(test_data)),]
train_data = train_data[sample(nrow(train_data)),]
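Before moving on, it can be worth confirming that the stratified split preserved the class proportions. The following quick check is an optional addition (not part of the original recipe) and only uses the splits we just created:

# Optional sanity check: class proportions should be roughly equal across splits
prop.table(table(train_data$class))
prop.table(table(val_data$class))
prop.table(table(test_data$class))

# number of examples in each split
c(train = nrow(train_data), validation = nrow(val_data), test = nrow(test_data))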
  3. Next, let's build a sequential Keras model to classify the audio and compile it:
model <- keras_model_sequential()
model %>%
  layer_conv_2d(input_shape = c(fft_size, num_fft_windows, 1),
                filters = 32, kernel_size = c(3, 3), activation = 'relu') %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = 'relu') %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_dropout(rate = 0.25) %>%
  layer_flatten() %>%
  layer_dense(units = 128, activation = 'tanh') %>%
  layer_dense(units = num_speech_labels, activation = 'softmax')

After building the model, we compile it and view its summary:

# compile model
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = "rmsprop",
  metrics = c('accuracy')
)

summary(model)

The following screenshot shows the summary of the model:

  4. Next, we need to build a data generator:
data_generator <- function(data, windowlen, overlap, numfftwindows, fftsize,
                           windowtype, num_classes, batchsize) {

  function() {
    # sample a batch of rows from the data
    indexes <- sample(1:nrow(data), batchsize, replace = TRUE)
    x <- array(0, dim = c(length(indexes), fftsize, numfftwindows, 1))
    y <- array(0, dim = c(length(indexes)))

    for (j in 1:length(indexes)) {
      wav_file_name = data[indexes[j], "files"] %>% as.character()
      wav_file = readWave(paste0("data/data_speech_commands_v0.01/", wav_file_name))
      # wave attributes
      wave_data = wav_file@left
      num_samples = length(wav_file@left)
      sampling_rate = wav_file@samp.rate
      # accommodating varying input lengths: pad or truncate to 16,000 samples
      if (num_samples < 16000) {
        zero_pad = rep(0, 16000 - length(wave_data))
        wave_data = c(wave_data, zero_pad)
      } else if (num_samples > 16000) {
        wave_data = wave_data[1:16000]
      }
      # spectrogram representation
      spectrogram_data = spectro(wave_data, f = sampling_rate, wl = windowlen,
                                 ovlp = overlap, wn = windowtype, complex = T,
                                 plot = F, dB = NULL, norm = F)
      spectrogram_data = spectrogram_data$amp
      spectrogram_data = Mod(spectrogram_data)

      # imputing NaN, Inf, and NA values
      if (sum(is.nan(spectrogram_data)) > 0) {
        spectrogram_data[which(is.nan(spectrogram_data))] = log(0.01)
      }
      if (sum(is.infinite(spectrogram_data)) > 0) {
        spectrogram_data[which(is.infinite(spectrogram_data))] = log(0.01)
      }
      if (sum(is.na(spectrogram_data)) > 0) {
        spectrogram_data[which(is.na(spectrogram_data))] = log(0.01)
      }

      spectrogram_data = array_reshape(spectrogram_data, dim = c(fftsize, numfftwindows, 1))

      x[j,,,] = spectrogram_data
      y[j] = data[indexes[j], c("class")] %>% as.matrix()
    }
    list(x, to_categorical(y, num_classes = num_classes))
  }
}

Let's set the batch size and the number of epochs and then create the train and validation generators:

batch_size = 20
epochs = 2

# train and validation generators
train_generator = data_generator(data = train_data, windowlen = window_length, overlap = overlap,
                                 numfftwindows = num_fft_windows, fftsize = fft_size,
                                 windowtype = window_type, num_classes = num_speech_labels,
                                 batchsize = batch_size)

val_generator = data_generator(data = val_data, windowlen = window_length, overlap = overlap,
                               numfftwindows = num_fft_windows, fftsize = fft_size,
                               windowtype = window_type, num_classes = num_speech_labels,
                               batchsize = batch_size)
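Because data_generator() returns a closure, each call to the generator produces one batch. If you want to confirm that the spectrogram dimensions match the model's input shape before training, a quick check such as the following (an optional addition, not part of the original recipe) can help:

# Optional check: draw a single batch and inspect its dimensions.
# Expecting x of shape (batch_size, fft_size, num_fft_windows, 1)
# and y of shape (batch_size, num_speech_labels).
sample_batch <- train_generator()
dim(sample_batch[[1]])
dim(sample_batch[[2]])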
  5. Now, we define our model callbacks:
# model callbacks
model_name = "speech_rec_"

checkpoint_dir <- "checkpoints_speech_recognition"
dir.create(checkpoint_dir)

filepath <- file.path(checkpoint_dir, paste0(model_name, "weights.{epoch:02d}-{val_loss:.2f}.hdf5"))

cp_callback <- list(
  callback_model_checkpoint(mode = "auto",
                            filepath = filepath,
                            save_best_only = TRUE,
                            verbose = 1),
  callback_early_stopping(min_delta = 0.05, patience = 10)
)
  6. Finally, we train our model and test it on a sample:
# train model
model %>% fit_generator(generator = train_generator,
                        epochs = epochs,
                        steps_per_epoch = nrow(train_data) / batch_size,
                        validation_data = val_generator,
                        validation_steps = nrow(val_data) / batch_size,
                        callbacks = cp_callback)
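Note that the test split we created earlier is not used by the training loop itself. Before testing on a single sample, you can optionally score the model on the held-out test data by wrapping test_data in the same generator; evaluate_generator() is one way to do this (an optional addition, not shown in the original recipe):

# Optional: evaluate on the held-out test split using the same generator
test_generator = data_generator(data = test_data, windowlen = window_length, overlap = overlap,
                                numfftwindows = num_fft_windows, fftsize = fft_size,
                                windowtype = window_type, num_classes = num_speech_labels,
                                batchsize = batch_size)

model %>% evaluate_generator(test_generator,
                             steps = nrow(test_data) / batch_size)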

# test sample
test = readWave("data/data_speech_commands_v0.01/no/0132a06d_nohash_2.wav")
# matrix corresponding to the amplitude values
test = spectro(test, wl = window_length, ovlp = overlap, wn = "hanning",
               complex = T, plot = F, dB = NULL, norm = F)
# take the modulus of the complex amplitudes, matching the training preprocessing
test = Mod(test$amp)
test = array_reshape(test, dim = c(fft_size, num_fft_windows, 1))
# predict the label of the test sample
model %>% predict_classes(array_reshape(test, dim = c(1, fft_size, num_fft_windows, 1)))

Great! The model recognizes the spoken word correctly.
