So guys, in today's blog we will implement an Image Captioning project, which is a fairly advanced one. We will use a combination of CNNs and LSTMs for this use case.
So without any further ado, let's do it…
Step 1 – Importing required libraries for Image Captioning.
import os
import pickle
import string
import tensorflow
import numpy as np
import matplotlib.pyplot as plt
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, plot_model
from keras.preprocessing.sequence import pad_sequences
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.preprocessing.image import img_to_array, load_img
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
%matplotlib inline
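A quick note before we move on: the import paths above match the older standalone Keras this project was built with. If you are on a newer TensorFlow 2.x install where keras.layers.merge no longer exists, a rough tf.keras equivalent (a sketch, not tested against this exact code) would be:
# tf.keras equivalents of the imports above (assuming TensorFlow 2.x)
from tensorflow.keras.layers import add, Input, Dense, LSTM, Embedding, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img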
Step 2 – Extract features from the images using VGG-16.
def extract_features(directory):
    model = VGG16()
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    print(model.summary())
    features = {}
    i = 0
    for name in os.listdir(directory):
        print(i)
        img = load_img(directory + '/' + name, target_size=(224, 224))
        img = img_to_array(img)
        img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
        img = preprocess_input(img)
        feature = model.predict(img, verbose=0)
        img_id = name.split('.')[0]
        features[img_id] = feature
        i += 1
    return features
directory ='drive/My Drive/image_captioning/Flicker8k/Flicker8k_Dataset'
features = extract_features(directory)
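Step 4 later loads these features from a pickle file, so save the dictionary once extraction finishes. A minimal sketch, assuming you want to write to the same path that Step 4 reads from:
# save the extracted features so VGG-16 doesn't have to run again
# (path assumed to match the pickle loaded in Step 4)
pickle.dump(features, open('drive/My Drive/image_captioning/Flicker_dataset_image_features.pkl', 'wb'))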
Step 3 – Load, Clean and Save image descriptions.
def load_description(filename):
    mappings = {}
    file = open(filename, 'r')
    content = file.readlines()
    file.close()
    for lines in content:
        tokens = lines.split()
        if len(lines) < 2:
            continue
        image_id, image_desc = tokens[0].split('.')[0], tokens[1:]
        image_desc = ' '.join(image_desc)
        if image_id not in mappings:
            mappings[image_id] = []
        mappings[image_id].append(image_desc)
    return mappings

def clean_description(descriptions):
    table = str.maketrans('', '', string.punctuation)
    for k, image_descriptions in descriptions.items():
        for i in range(len(image_descriptions)):
            desc = image_descriptions[i]
            desc = desc.split()
            desc = [x.lower() for x in desc]
            desc = [w.translate(table) for w in desc]
            desc = [x for x in desc if len(x) > 1]
            desc = [x for x in desc if x.isalpha()]
            image_descriptions[i] = ' '.join(desc)

def create_corpus(descriptions):
    corpus = set()
    for k in descriptions.keys():
        [corpus.update(x.split()) for x in descriptions[k]]
    return corpus

def save_descriptions(desc, filename):
    lines = []
    for k, v in desc.items():
        for description in v:
            lines.append(k + ' ' + description)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
# load all descriptions
filename = 'drive/My Drive/image_captioning/Flicker8k/Flickr8k.token.txt'
descriptions = load_description(filename)
print('Descriptions loaded: ',len(descriptions))
# clean the loaded descriptions
clean_description(descriptions)
# check the vocabulary length
vocabulary = create_corpus(descriptions)
print('Vocabulary length: ',len(vocabulary))
save_descriptions(descriptions,'drive/My Drive/image_captioning/descriptions.txt')
print('SAVED !!!')
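As an optional sanity check, you can peek at one cleaned entry before moving on; whichever image id happens to come first is fine:
# peek at one cleaned image id and its first two captions
sample_id = next(iter(descriptions))
print(sample_id, descriptions[sample_id][:2])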

Step 4 – Load train and test image features and descriptions.
def load_set_of_image_ids(filename):
    file = open(filename, 'r')
    lines = file.readlines()
    file.close()
    image_ids = set()
    for line in lines:
        if len(line) < 1:
            continue
        image_ids.add(line.split('.')[0])
    return image_ids

def load_clean_descriptions(all_desc, train_desc_names):
    file = open(all_desc, 'r')
    lines = file.readlines()
    file.close()
    descriptions = {}
    for line in lines:
        tokens = line.split()
        image_id, image_desc = tokens[0].split('.')[0], tokens[1:]
        if image_id in train_desc_names:
            if image_id not in descriptions:
                descriptions[image_id] = []
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions

def load_image_features(filename, dataset):
    all_features = pickle.load(open(filename, 'rb'))
    features = {k: all_features[k] for k in dataset}
    return features
# load train image ids
train = 'drive/My Drive/image_captioning/Flicker8k/Flickr_8k.trainImages.txt'
train_image_ids = load_set_of_image_ids(train)
print('Training images found: ',len(train_image_ids))
# load training descriptions
train_descriptions = load_clean_descriptions('drive/My Drive/image_captioning/descriptions.txt',train_image_ids)
print('training descriptions loaded: ',len(train_descriptions))
# load training image features
train_features = load_image_features('drive/My Drive/image_captioning/Flicker_dataset_image_features.pkl',train_image_ids)
print('training features loaded: ',len(train_features))
train_descriptions  # have a quick look at the processed captions
Step 5 – Getting descriptions in shape.
def to_list(descriptions):
    all_desc_list = []
    for k, v in descriptions.items():
        for desc in v:
            all_desc_list.append(desc)
    return all_desc_list

def tokenization(descriptions):
    # list of all the descriptions
    all_desc_list = to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_desc_list)
    return tokenizer

# create tokenizer
tokenizer = tokenization(train_descriptions)
# word_index is the dictionary/mapping of word --> integer
vocab_size = len(tokenizer.word_index) + 1
print('Vocab size: ', vocab_size)

def max_length(descriptions):
    all_desc_list = to_list(descriptions)
    return max(len(x.split()) for x in all_desc_list)

def create_sequences(tokenizer, desc_list, max_len, photo):
    X1, X2, y = [], [], []
    # X1 will contain the photo features
    # X2 will contain the current (partial) sequence
    # y will contain the one hot encoded next word
    for desc in desc_list:
        # tokenize the description
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            # out_seq is basically the next word in the sentence
            in_seq, out_seq = seq[:i], seq[i]
            # pad the input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_len)[0]
            # one hot encode the output word
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)
# maximum length of a description, i.e. the longest caption in the training set
max_len = max_length(train_descriptions)
print(max_len)
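The live-captioning step at the end needs exactly this tokenizer again, so it can be handy to persist it now. A small optional sketch; the file name below is just an assumption, adjust it as you like:
# optionally save the fitted tokenizer for reuse at inference time
# (hypothetical path, not part of the original project files)
pickle.dump(tokenizer, open('drive/My Drive/image_captioning/tokenizer.pkl', 'wb'))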

Step 6 – Functions to generate data and create a model.
def data_generator(descriptions, photos, tokenizer, max_len):
    while 1:
        for k, desc_list in descriptions.items():
            photo = photos[k][0]
            in_img, in_seq, out_seq = create_sequences(tokenizer, desc_list, max_len, photo)
            yield [[in_img, in_seq], out_seq]

def define_model(vocab_size, max_length):
    # image feature extractor model
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # input sequence model
    inputs2 = Input(shape=(max_length,))
    # Embedding(input_dim, output_dim, ...)
    # input_dim is always the vocabulary size
    # output_dim is the size of the vector space in which the words will be embedded
    # mask_zero=True tells the layer to ignore the padded zeros in the input sequences
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # decoder model OR output word model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # tie it together: [image, seq] --> [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summarize model
    print(model.summary())
    return model
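Since plot_model is already imported in Step 1, you can also render this two-branch architecture to an image once you have a model instance (i.e. right after the define_model call in the next step). This needs pydot and graphviz installed; the file name is arbitrary:
# optional: visualize the merged CNN+LSTM architecture (requires pydot + graphviz)
# call this after building the model, e.g. right after define_model() in Step 7
plot_model(model, to_file='model.png', show_shapes=True)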
Step 7 – Creating and training the Image Captioning model.
model = define_model(vocab_size,max_len)
epochs = 20
steps = len(train_descriptions)
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_len)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('drive/My Drive/image_captioning/model_' + str(i) + '.h5')
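One compatibility note: fit_generator works on the older Keras versions this tutorial was written against, but it is deprecated on recent TensorFlow releases, where fit accepts a generator directly. An equivalent call (a sketch, not tested here) would be:
# on newer TF/Keras, fit() accepts the generator directly
model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)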

Step 8 – Prediction and Evaluation functions for the Image Captioning model.
def int2word(tokenizer, integer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_desc(model, tokenizer, photo, max_len):
    in_seq = 'startseq'
    for i in range(max_len):
        seq = tokenizer.texts_to_sequences([in_seq])[0]
        seq = pad_sequences([seq], maxlen=max_len)
        y_hat = model.predict([photo, seq], verbose=0)
        y_hat = np.argmax(y_hat)
        word = int2word(tokenizer, y_hat)
        if word is None:
            break
        in_seq = in_seq + ' ' + word
        if word == 'endseq':
            break
    return in_seq

def evaluate_model(model, descriptions, photos, tokenizer, max_len):
    actual, predicted = [], []
    for key, desc in descriptions.items():
        y_hat = predict_desc(model, tokenizer, photos[key], max_len)
        references = [d.split() for d in desc]
        actual.append(references)
        predicted.append(y_hat.split())
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.33, 0.33, 0.33, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
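sentence_bleu is imported in Step 1 but never used above; if you want a per-caption score while debugging, a tiny illustrative sketch (reusing the references and y_hat names from inside evaluate_model) looks like this:
# per-caption BLEU-1 for a single prediction (illustrative only)
# references = [d.split() for d in desc], y_hat = predicted caption string
print('sentence BLEU-1: %f' % sentence_bleu(references, y_hat.split(), weights=(1.0, 0, 0, 0)))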
Step 9 – Evaluating Image Captioning model.
#################### load training data (6k) ##########################
train = 'drive/My Drive/image_captioning/Flicker8k/Flickr_8k.trainImages.txt'
train_image_ids = load_set_of_image_ids(train)
print('Training images found: ',len(train_image_ids))
# load training descriptions
train_descriptions = load_clean_descriptions('drive/My Drive/image_captioning/descriptions.txt',train_image_ids)
print('training descriptions loaded: ',len(train_descriptions))
tokenizer = tokenization(train_descriptions)
max_len = max_length(train_descriptions)
#################### load test data ##########################
test = 'drive/My Drive/image_captioning/Flicker8k/Flickr_8k.testImages.txt'
test_image_ids = load_set_of_image_ids(test)
print('Test images found: ',len(test_image_ids))
# load test descriptions
test_descriptions = load_clean_descriptions('drive/My Drive/image_captioning/descriptions.txt',test_image_ids)
print('test descriptions loaded: ',len(test_descriptions))
# load test image features
test_features = load_image_features('drive/My Drive/image_captioning/Flicker_dataset_image_features.pkl',test_image_ids)
print('test features loaded: ',len(test_features))
#################################################################
filename = 'drive/My Drive/image_captioning/model_18.h5'
model = load_model(filename)
evaluate_model(model,test_descriptions,test_features,tokenizer,max_len)

Step 10 – Live Image Captioning.
img_to_test = 'drive/My Drive/image_captioning/983801190.jpg'
img = plt.imread(img_to_test)
plt.imshow(img)
def extract_features(filename):
    # load the model
    model = VGG16()
    # re-structure the model
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    # load the photo
    image = load_img(filename, target_size=(224, 224))
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    # reshape data for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # prepare the image for the VGG model
    image = preprocess_input(image)
    # get features
    feature = model.predict(image, verbose=0)
    return feature
# pre-define the max sequence length (from training)
max_length = 34
# load the model
model = load_model('drive/My Drive/image_captioning/model_18.h5')
# load and prepare the photograph
photo = extract_features(img_to_test)
# generate description
description = predict_desc(model, tokenizer, photo, max_length)
description = ' '.join(description.split()[1:-1])
print()
print(description)
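If you would rather see the caption on the image itself, here is a small optional addition using matplotlib:
# show the test image with the generated caption as its title
plt.imshow(img)
plt.title(description)
plt.axis('off')
plt.show()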
Do let me know if you have any queries regarding Image Captioning by contacting me via email or LinkedIn.
So this is all for this blog, folks. Thanks for reading it, and I hope you are taking something away with you after reading this. Till the next time…
Read my previous post: GENERATING CIFAR-10 FAKE IMAGES USING DEEP CONVOLUTIONAL GENERATIVE ADVERSARIAL NETWORKS (DCGAN)
Check out my other machine learning projects, deep learning projects, computer vision projects, NLP projects, Flask projects at machinelearningprojects.net.




