How to do it...

This recipe covers the steps for setting up collaborative filtering using an RBM.

  1. Read the movies.dat dataset into R:
# Load the movie catalogue. Each line of movies.dat is expected to hold three
# "::"-separated fields: MovieID, Title, Genres.
txt <- readLines("movies.dat", encoding = "latin1")
# Split on the literal "::" delimiter; fixed = TRUE avoids regex interpretation.
txt_split <- strsplit(txt, "::", fixed = TRUE)
# Fail early on malformed lines instead of silently recycling fields in rbind().
stopifnot(all(lengths(txt_split) == 3L))
# Bind all rows in a single step (character matrix -> data frame) rather than
# building one data frame per line and rbind-ing thousands of them.
movies_df <- as.data.frame(do.call(rbind, txt_split), stringsAsFactors = FALSE)
names(movies_df) <- c("MovieID", "Title", "Genres")
# MovieID is read as text; convert it so it can be used as a numeric join key.
movies_df$MovieID <- as.numeric(movies_df$MovieID)
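  1. Add a new column (id_order) to the movies dataset. The existing ID column (MovieID) cannot be used directly as a column index for the ratings matrix because its values range from 1 to 3,952 with gaps, whereas id_order runs contiguously from 1 to the number of movies: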
# Contiguous position index (1..nrow) for each movie. seq_len() also behaves
# correctly when movies_df has zero rows, where 1:nrow() would yield c(1, 0).
movies_df$id_order <- seq_len(nrow(movies_df))
  1. Read the ratings.dat dataset in R:
# read.table() only accepts a single-character separator. Splitting on ":"
# (presumably because the file, like movies.dat, is "::"-delimited) leaves an
# empty column between every pair of real fields, so the real fields end up in
# columns 1, 3, 5 and 7.
ratings_df <- read.table("ratings.dat", sep = ":", header = FALSE,
                         stringsAsFactors = FALSE)
# Keep only the real fields and give them meaningful names.
ratings_df <- ratings_df[, c(1, 3, 5, 7)]
colnames(ratings_df) <- c("UserID", "MovieID", "Rating", "Timestamp")
  1. Merge the movies and ratings datasets with all=FALSE:
# Inner join on MovieID: with all = FALSE only rows whose MovieID is present in
# both movies_df and ratings_df are kept.
merged_df <- merge(
  x = movies_df,
  y = ratings_df,
  by = "MovieID",
  all = FALSE
)
  1. Remove the non-required columns:
# Drop the columns that are not needed to build the ratings matrix; assigning
# NULL to a data-frame column removes it.
merged_df[c("Timestamp", "Title", "Genres")] <- list(NULL)
  1. Rescale the ratings to fractions between 0 and 1 by dividing by 5:
# Rating divided by 5, stored as a fraction (presumably 5 is the maximum
# possible rating, which would bound the result by 1).
merged_df[["rating_per"]] <- merged_df[["Rating"]] / 5
  1. Generate a matrix of ratings across all the movies for a sample of 1,000 users:
# Build a dense user x movie matrix of rescaled ratings for users 1..num_of_users.
# Cells for movies a user has not rated stay at 0.
num_of_users <- 1000
# Columns are addressed through id_order, which runs 1..nrow(movies_df), so the
# matrix is sized by row count. This equals the number of unique MovieIDs when
# the IDs are unique and avoids an out-of-bounds column index when they are not.
num_of_movies <- nrow(movies_df)
trX <- matrix(0, nrow = num_of_users, ncol = num_of_movies)
# Select the sampled users' ratings once instead of rescanning merged_df for
# every user, then fill all cells with a single matrix-indexed assignment:
# each (UserID, id_order) pair addresses one (row, column) cell of trX.
sample_df <- merged_df[merged_df$UserID %in% seq_len(num_of_users), ]
trX[cbind(sample_df$UserID, sample_df$id_order)] <- sample_df$rating_per
  1. Look at the distribution of the trX training dataset. All values lie in the range 0 to 1, so each entry can be treated as the activation probability of a Bernoulli (binary) visible unit:
# Five-number summary (plus mean) of the rating vectors of the first three
# users, one call per user.
summary(trX[1, ])
summary(trX[2, ])
summary(trX[3, ])
  1. Define the input model parameters:
# Number of hidden units in the RBM.
num_hidden <- 20
# Number of visible units: one per movie, i.e. one per column of trX.
num_input <- nrow(movies_df)
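  1. Start a new TensorFlow session: initialize the variables, run one update over the full training set to obtain the starting weights and biases, and compute the initial reconstruction error (this assumes the session `sess` and the RBM graph have already been defined):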
# Initialize all TensorFlow variables in the session `sess`.
# NOTE(review): `sess`, the feed keys v0 / W / vb / hb, the update_* ops and
# err_sum are defined outside this excerpt -- presumably the RBM graph's
# placeholders and operations; confirm against the graph definition.
sess$run(tf$global_variables_initializer()) 
# Run one update step on the full training matrix. At this point prv_w, prv_vb
# and prv_hb are objects exposing $eval(), which is called to obtain the values
# that are fed in.
output <- sess$run(list(update_w, update_vb, update_hb), feed_dict = dict(v0=trX,
W = prv_w$eval(),
vb = prv_vb$eval(),
hb = prv_hb$eval()))
# From here on prv_w / prv_vb / prv_hb hold the values returned by sess$run()
# (weights, visible bias, hidden bias, in the order requested above) and are
# fed directly without $eval().
prv_w <- output[[1]]
prv_vb <- output[[2]]
prv_hb <- output[[3]]
# Evaluate err_sum on the full training matrix with the updated parameters.
sess$run(err_sum, feed_dict=dict(v0=trX, W= prv_w, vb= prv_vb, hb= prv_hb))
  1. Train the RBM for 500 epochs with a batch size of 100:
# Mini-batch training loop: each epoch sweeps trX in consecutive batches, feeds
# the current parameters to the update ops and carries the returned values
# forward as the parameters for the next batch.
epochs <- 500
batch_size <- 100
errors <- list()   # err_sum values recorded at the checkpoints below
weights <- list()  # one weight-matrix snapshot per checkpoint

# seq() below needs at least one full batch; fail with a clear message otherwise.
stopifnot(nrow(trX) >= batch_size)

for (ep in seq_len(epochs)) {
  # i is the zero-based row offset of the current batch; a trailing partial
  # batch (fewer than batch_size rows) is skipped.
  for (i in seq(0, (dim(trX)[1] - batch_size), batch_size)) {
    batchX <- trX[(i + 1):(i + batch_size), ]
    output <- sess$run(
      list(update_w, update_vb, update_hb),
      feed_dict = dict(v0 = batchX,
                       W = prv_w,
                       vb = prv_vb,
                       hb = prv_hb)
    )
    prv_w <- output[[1]]
    prv_vb <- output[[2]]
    prv_hb <- output[[3]]
    # Checkpoint whenever the row offset is a multiple of 1000 (with 1,000
    # users this is only the first batch of each epoch).
    if (i %% 1000 == 0) {
      errors <- c(errors, sess$run(err_sum, feed_dict = dict(v0 = batchX,
                                                             W = prv_w,
                                                             vb = prv_vb,
                                                             hb = prv_hb)))
      # Wrap the matrix in list() so it is appended as ONE element; c() on a
      # list and a bare matrix would append every matrix entry separately.
      weights <- c(weights, list(output[[1]]))
      cat(i, " : ")
    }
  }
  cat("epoch :", ep, " : reconstruction error : ", errors[[length(errors)]], " ")
}
  1. Plot the mean squared reconstruction errors:
# Flatten the list of recorded errors into a plain vector and plot it against
# its index.
error_vec <- unlist(errors)
plot(
  x = error_vec,
  xlab = "# of batches",
  ylab = "mean squared reconstruction error",
  main = "RBM-Reconstruction MSE plot"
)
..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset