How to do it...

The speech commands dataset contains ~65,000 WAV files, organized into sub-folders named after the labels; each file is a short recording of one of 30 words, spoken by different speakers. In the Getting ready section of this recipe, we learned how to read a WAV file and obtain its frequency-amplitude representation by applying STFT. In this section, we'll extend the same idea to write a data generator and then train a neural network to recognize the spoken word.
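The code in this section reuses the STFT settings and libraries from the Getting ready section: the tuneR, seewave, stringr, splitstackshape, and keras packages, plus the variables window_length, window_type, overlap, fft_size, and num_fft_windows. If you are running this section on its own, the following sketch shows one plausible set of definitions; the specific values are illustrative assumptions and should match whatever you chose in the Getting ready section:

library(keras)
library(tuneR)            # readWave()
library(seewave)          # spectro()
library(stringr)          # str_split_fixed()
library(splitstackshape)  # stratified()

# Illustrative STFT settings (assumed values; use the ones from the Getting ready section)
window_length = 512               # samples per STFT window
window_type   = "hanning"         # window function passed to spectro()
overlap       = 50                # window overlap, in percent
fft_size      = window_length / 2 # frequency bins per STFT window

# One way to estimate how many STFT windows fit into a 1-second (16,000-sample) clip;
# if in doubt, check dim(spectro(...)$amp) on a single file instead.
num_fft_windows = length(seq(1, 16000 + 1 - window_length,
                             by = window_length - (overlap * window_length / 100)))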

Let's begin by preparing a dataset for the generator:

  1. First, we list all the files inside the data_speech_commands_v0.01 folder and create a DataFrame:
files = list.files("data/data_speech_commands_v0.01",all.files = T,full.names = F,recursive = T)
paste("Number audio files in dataset: ",length(files)

file_df = as.data.frame(files)
head(file_df)

The following screenshot shows a few records from the data:

Here, we can see that each filename is prefixed with its label, that is, the name of the sub-folder it belongs to. Now, let's create a DataFrame that contains all the filenames and their respective class labels. We will work with three classes: bird, no, and off:

# extract the class label from the sub-folder prefix of each file path
file_df$class = str_split_fixed(file_df$files, pattern = "/", n = 2)[, 1]
# shuffle the rows
file_df <- file_df[sample(nrow(file_df)), ]
rownames(file_df) <- NULL
# keep only the three classes of interest
file_df = file_df[file_df$class %in% c("bird", "no", "off"), ]
file_df$files <- as.character(file_df$files)
# encode the class labels as integers starting from 0
file_df$class <- as.numeric(as.factor(file_df$class)) - 1
rownames(file_df) <- NULL
head(file_df)

The following screenshot shows a sample of the DataFrame we created in the preceding code block:

Let's create a variable that represents the number of unique labels:

num_speech_labels = length(unique(file_df$class))
  2. Now, we split our data into train, test, and validation sets. We use the stratified() function from the splitstackshape library to do this:
# split data into train, test and validation
set.seed(200)
train_index = stratified(file_df, group = "class", size = 0.80, keep.rownames = T)$rn
test_index = setdiff(row.names(file_df), train_index)
val_index = stratified(file_df[train_index, ], group = "class", size = 0.20, keep.rownames = T)$rn

train_data = file_df[setdiff(train_index,val_index),]
test_data = file_df[test_index,]
val_data = file_df[val_index,]

Now, let's shuffle the train and test data:

# shuffle train and test data
test_data = test_data[sample(nrow(test_data)),]
train_data = train_data[sample(nrow(train_data)),]
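Before moving on, it can be worth confirming that the stratified split preserved the class proportions. The following quick check is an optional addition (not part of the original recipe) and only uses the splits we just created:

# Optional sanity check: class proportions should be roughly equal across splits
prop.table(table(train_data$class))
prop.table(table(val_data$class))
prop.table(table(test_data$class))

# number of examples in each split
c(train = nrow(train_data), validation = nrow(val_data), test = nrow(test_data))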
  3. Next, let's build a sequential Keras model to classify the audio and compile it:
model <- keras_model_sequential()
model %>%
  layer_conv_2d(input_shape = c(fft_size, num_fft_windows, 1),
                filters = 32, kernel_size = c(3, 3), activation = 'relu') %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_conv_2d(filters = 64, kernel_size = c(3, 3), activation = 'relu') %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_dropout(rate = 0.25) %>%
  layer_flatten() %>%
  layer_dense(units = 128, activation = 'tanh') %>%
  layer_dense(units = num_speech_labels, activation = 'softmax')

After building the model, we compile it and view its summary:

# compile model
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = "rmsprop",
  metrics = c('accuracy')
)

summary(model)

The following screenshot shows the summary of the model:

  4. Next, we need to build a data generator:
data_generator <- function(data, windowlen, overlap, numfftwindows, fftsize,
                           windowtype, num_classes, batchsize) {

  function() {
    # sample a batch of rows from the data
    indexes <- sample(1:nrow(data), batchsize, replace = TRUE)
    x <- array(0, dim = c(length(indexes), fftsize, numfftwindows, 1))
    y <- array(0, dim = c(length(indexes)))

    for (j in 1:length(indexes)) {
      wav_file_name = data[indexes[j], "files"] %>% as.character()
      wav_file = readWave(paste0("data/data_speech_commands_v0.01/", wav_file_name))
      # wave attributes
      wave_data = wav_file@left
      num_samples = length(wav_file@left)
      sampling_rate = wav_file@samp.rate
      # accommodating varying input lengths: pad or truncate to 16,000 samples
      if (num_samples < 16000) {
        zero_pad = rep(0, 16000 - length(wave_data))
        wave_data = c(wave_data, zero_pad)
      } else if (num_samples > 16000) {
        wave_data = wave_data[1:16000]
      }
      # spectrogram representation
      spectrogram_data = spectro(wave_data, f = sampling_rate, wl = windowlen,
                                 ovlp = overlap, wn = windowtype, complex = T,
                                 plot = F, dB = NULL, norm = F)
      spectrogram_data = spectrogram_data$amp
      spectrogram_data = Mod(spectrogram_data)

      # imputing NaN, Inf, and NA values
      if (sum(is.nan(spectrogram_data)) > 0) {
        spectrogram_data[which(is.nan(spectrogram_data))] = log(0.01)
      }
      if (sum(is.infinite(spectrogram_data)) > 0) {
        spectrogram_data[which(is.infinite(spectrogram_data))] = log(0.01)
      }
      if (sum(is.na(spectrogram_data)) > 0) {
        spectrogram_data[which(is.na(spectrogram_data))] = log(0.01)
      }

      spectrogram_data = array_reshape(spectrogram_data, dim = c(fftsize, numfftwindows, 1))

      x[j,,,] = spectrogram_data
      y[j] = data[indexes[j], c("class")] %>% as.matrix()
    }
    list(x, to_categorical(y, num_classes = num_classes))
  }
}

Let's set the batch size and the number of epochs and then create the train and validation generators:

batch_size = 20
epochs = 2

# train and validation generators
train_generator = data_generator(data = train_data, windowlen = window_length, overlap = overlap,
                                 numfftwindows = num_fft_windows, fftsize = fft_size,
                                 windowtype = window_type, num_classes = num_speech_labels,
                                 batchsize = batch_size)

val_generator = data_generator(data = val_data, windowlen = window_length, overlap = overlap,
                               numfftwindows = num_fft_windows, fftsize = fft_size,
                               windowtype = window_type, num_classes = num_speech_labels,
                               batchsize = batch_size)
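Because data_generator() returns a closure, each call to the generator produces one batch. If you want to confirm that the spectrogram dimensions match the model's input shape before training, a quick check such as the following (an optional addition, not part of the original recipe) can help:

# Optional check: draw a single batch and inspect its dimensions.
# Expecting x of shape (batch_size, fft_size, num_fft_windows, 1)
# and y of shape (batch_size, num_speech_labels).
sample_batch <- train_generator()
dim(sample_batch[[1]])
dim(sample_batch[[2]])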
  5. Now, we define our model callbacks:
# model callbacks
model_name = "speech_rec_"

checkpoint_dir <- "checkpoints_speech_recognition"
dir.create(checkpoint_dir)

filepath <- file.path(checkpoint_dir, paste0(model_name, "weights.{epoch:02d}-{val_loss:.2f}.hdf5"))

cp_callback <- list(
  callback_model_checkpoint(mode = "auto",
                            filepath = filepath,
                            save_best_only = TRUE,
                            verbose = 1),
  callback_early_stopping(min_delta = 0.05, patience = 10)
)
  6. Finally, we train our model and test it on a sample:
# train model
model %>% fit_generator(generator = train_generator,
                        epochs = epochs,
                        steps_per_epoch = nrow(train_data) / batch_size,
                        validation_data = val_generator,
                        validation_steps = nrow(val_data) / batch_size,
                        callbacks = cp_callback)
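Note that the test split we created earlier is not used by the training loop itself. Before testing on a single sample, you can optionally score the model on the held-out test data by wrapping test_data in the same generator; evaluate_generator() is one way to do this (an optional addition, not shown in the original recipe):

# Optional: evaluate on the held-out test split using the same generator
test_generator = data_generator(data = test_data, windowlen = window_length, overlap = overlap,
                                numfftwindows = num_fft_windows, fftsize = fft_size,
                                windowtype = window_type, num_classes = num_speech_labels,
                                batchsize = batch_size)

model %>% evaluate_generator(test_generator,
                             steps = nrow(test_data) / batch_size)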

# test sample
test = readWave("data/data_speech_commands_v0.01/no/0132a06d_nohash_2.wav")
# matrix corresponding to the amplitude values
test = spectro(test, wl = window_length, ovlp = overlap, wn = "hanning",
               complex = T, plot = F, dB = NULL, norm = F)
# take the modulus of the complex amplitudes, matching the training preprocessing
test = Mod(test$amp)
test = array_reshape(test, dim = c(fft_size, num_fft_windows, 1))
# predict the label of the test sample
model %>% predict_classes(array_reshape(test, dim = c(1, fft_size, num_fft_windows, 1)))

Great! The model recognizes the spoken word correctly.
