Implementing a beam search-based caption generator

We will now implement a basic beam search-based algorithm to generate caption sequences:

import numpy as np
from keras.preprocessing import image, sequence
 
def get_raw_caption_sequences(model, word_to_index, image_features,
                              max_caption_size, beam_size=1):
    """Generate candidate caption token sequences with beam search.

    Starting from the ``<START>`` token, every surviving partial caption is
    extended with its ``beam_size`` highest-scoring next tokens. After each
    step only the ``beam_size`` best-scoring candidates are kept. Expansion
    stops once the sequences hold ``max_caption_size`` tokens; it does not
    stop early when an end token is produced.

    Args:
        model: Object whose ``predict([image_batch, caption_batch])`` returns
            a batch whose first item can be indexed by vocabulary index
            (presumably next-word probabilities -- confirm against the
            trained model).
        word_to_index: Mapping that contains the key ``'<START>'``.
        image_features: Features of a single image; wrapped into a batch of
            one before being passed to ``model.predict``.
        max_caption_size: Length the partial captions are padded to and the
            number of tokens at which expansion stops.
        beam_size: Number of candidates kept after every step (>= 1).

    Returns:
        A list of at most ``beam_size`` ``[token_indices, score]`` pairs,
        sorted by ascending score (best candidate last). If
        ``max_caption_size`` is 1 or less, the single start sequence with
        score ``0.0`` is returned unchanged.

    Raises:
        ValueError: If ``beam_size`` is smaller than 1.
        KeyError: If ``word_to_index`` has no ``'<START>'`` entry.
    """
    # A zero beam would turn the ``[-beam_size:]`` slices below into
    # "keep everything", so reject it up front.
    if beam_size < 1:
        raise ValueError('beam_size must be a positive integer')

    start = [word_to_index['<START>']]
    caption_seqs = [[start, 0.0]]

    # The image batch never changes between steps, so build it once.
    image_batch = np.asarray([image_features])

    # All candidates grow in lock-step, so checking the first one suffices.
    while len(caption_seqs[0][0]) < max_caption_size:
        temp_caption_seqs = []
        for partial_caption, partial_score in caption_seqs:
            partial_caption_seq = sequence.pad_sequences(
                [partial_caption],
                maxlen=max_caption_size,
                padding='post')
            next_words_pred = model.predict(
                [image_batch, np.asarray(partial_caption_seq)])[0]
            # argsort is ascending, so the tail holds the best candidates.
            next_words = np.argsort(next_words_pred)[-beam_size:]

            for word in next_words:
                # NOTE(review): scores are accumulated by adding the raw
                # model outputs; if these are probabilities, summing
                # log-probabilities would be the conventional beam score.
                temp_caption_seqs.append(
                    [partial_caption + [word],
                     partial_score + next_words_pred[word]])

        # Keep only the beam_size highest-scoring candidates.
        temp_caption_seqs.sort(key=lambda item: item[1])
        caption_seqs = temp_caption_seqs[-beam_size:]

    return caption_seqs

This function generates captions from the input image features using beam search. However, its output is a raw sequence of token indices, each predicted from the tokens that precede it. Hence, we will build a wrapper function on top of it that leverages the preceding function to generate a clean text sentence as the caption for an input image:

def generate_image_caption(model, word_to_index_map, index_to_word_map,
                           image_features, max_caption_size,
                           beam_size=1):
    """Generate clean text captions for an image using beam search.

    Runs ``get_raw_caption_sequences`` and converts each resulting sequence
    of token indices into a sentence, dropping the ``<START>`` marker and
    everything from the first ``<END>`` marker onwards.

    Args:
        model: Captioning model forwarded to ``get_raw_caption_sequences``.
        word_to_index_map: Mapping from word to vocabulary index.
        index_to_word_map: Mapping from vocabulary index to word.
        image_features: Features of the image to caption.
        max_caption_size: Maximum number of tokens in a raw caption.
        beam_size: Number of candidate captions to generate.

    Returns:
        A list of caption strings ordered from the highest-scoring candidate
        to the lowest-scoring one.

    Raises:
        KeyError: If a generated index is missing from ``index_to_word_map``.
        ValueError: If a decoded caption does not contain ``'<START>'``.
    """
    raw_caption_seqs = get_raw_caption_sequences(
        model=model,
        word_to_index=word_to_index_map,
        image_features=image_features,
        max_caption_size=max_caption_size,
        beam_size=beam_size)
    # Best-scoring candidate first.
    raw_caption_seqs.sort(key=lambda item: -item[1])

    captions = [[index_to_word_map[idx] for idx in item[0]]
                for item in raw_caption_seqs]

    final_captions = []
    for caption in captions:
        start_index = caption.index('<START>') + 1
        if '<END>' in caption:
            end_index = caption.index('<END>')
        else:
            # Slice ends are exclusive: without an <END> marker, keep every
            # word up to the size limit instead of dropping the last one.
            end_index = min(len(caption), max_caption_size)
        final_captions.append(' '.join(caption[start_index:end_index]))

    return final_captions

We also need our caption preprocessing function from earlier, which we used to preprocess the initial captions when training the model:

def preprocess_captions(caption_list):
    """Normalize raw caption strings.

    Each caption is stripped, lower-cased, cleaned of punctuation and has its
    whitespace collapsed to single spaces:

    * ``. , ' " ( )`` are removed,
    * ``&`` becomes ``and``,
    * ``-`` becomes a space.

    Args:
        caption_list: Iterable of caption strings.

    Returns:
        A new list with one cleaned caption per input caption, in the same
        order.
    """
    # A single translation pass is equivalent to the chain of replace()
    # calls because no replacement text contains a character that is itself
    # replaced.
    cleanup_table = str.maketrans({
        '.': None,
        ',': None,
        "'": None,
        '"': None,
        '(': None,
        ')': None,
        '&': 'and',
        '-': ' ',
    })
    return [
        # split()/join collapses runs of whitespace left by the cleanup.
        ' '.join(caption.strip().lower().translate(cleanup_table).split())
        for caption in caption_list
    ]

Table of Contents for Implementing a beam search-based caption generator

Create new playlist

Sign In

Sign Up

Table of Contents for
Implementing a beam search-based caption generator