How to do it...

Start R (using RStudio or Docker) and load the required packages.
Download the dataset (binary version) from http://www.cs.toronto.edu/~kriz/cifar.html manually or use the following function to download the data in the R environment. The function takes the working directory or the downloaded dataset's location path as an input parameter (data_dir):

# Download and unpack the CIFAR-10 binary archive into `data_dir`.
#
# Args:
#   data_dir: Directory that should hold the dataset; created if missing.
#
# Returns:
#   `data_dir`, invisibly. The working directory is never changed.
download.cifar.data <- function(data_dir) {
  archive_name <- "cifar-10-binary.tar.gz"
  archive_path <- file.path(data_dir, archive_name)

  dir.create(data_dir, showWarnings = FALSE, recursive = TRUE)

  # The archive is deleted after extraction, so its absence cannot be used to
  # decide whether a download is needed; look for extracted batch files instead.
  has_batches <- length(
    list.files(data_dir, pattern = "\\.bin$", recursive = TRUE)
  ) > 0

  if (!has_batches) {
    if (!file.exists(archive_path)) {
      # mode = "wb" keeps the binary archive intact on Windows; the default
      # download method is used so an external wget install is not required.
      status <- download.file(
        url = "http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz",
        destfile = archive_path,
        mode = "wb"
      )
      if (status != 0) {
        unlink(archive_path)
        stop("Download of ", archive_name, " failed (status ", status, ")",
             call. = FALSE)
      }
    }
    untar(archive_path, exdir = data_dir) # Unpack next to the archive
    file.remove(archive_path)             # Remove the archive
  }

  # The rest of this recipe reads the batch files and batches.meta.txt
  # directly from the data directory. If the archive unpacked them into a
  # sub-directory, copy them up one level (existing files are left alone).
  rel_paths <- list.files(data_dir, pattern = "\\.(bin|txt)$", recursive = TRUE)
  nested <- rel_paths[dirname(rel_paths) != "."]
  target <- file.path(data_dir, basename(nested))
  missing_target <- !file.exists(target)
  file.copy(file.path(data_dir, nested[missing_target]), target[missing_target])

  invisible(data_dir)
}
# Fetch the CIFAR-10 binaries into the Cifar_10/ directory
download.cifar.data("Cifar_10/")

Once the dataset is downloaded and untarred (or unzipped), read it in the R environment as train and test datasets. The function takes the filenames of the train and test batch datasets (filenames) and the number of images to retrieve per batch file (num.images) as input parameters:

# Read CIFAR-10 binary batch files into per-image RGB data frames and labels.
#
# Every record is read as 1 label byte followed by 1024 red, 1024 green and
# 1024 blue bytes.
#
# Args:
#   filenames:  Character vector of batch file names located in `data_dir`.
#   num.images: Number of records to read from each file. Defaults to 10000,
#               i.e. 50000 images for the five training batches.
#   data_dir:   Directory containing the batch files.
#
# Returns:
#   A list with two elements:
#     images.rgb - list of data frames (columns r, g, b; values 0-255),
#                  one data frame per image;
#     images.lab - list of labels shifted to be 1-based (stored label + 1).
read.cifar.data <- function(filenames, num.images = 10000,
                            data_dir = "Cifar_10/") {
  channel_size <- 1024L
  total <- length(filenames) * num.images

  # Preallocate instead of growing the lists one element at a time.
  images.rgb <- vector("list", total)
  images.lab <- vector("list", total)

  # Read one unsigned-byte colour channel; fail loudly on a truncated file.
  read_channel <- function(con, path) {
    bytes <- readBin(con, raw(), size = 1, n = channel_size, endian = "big")
    if (length(bytes) != channel_size) {
      stop("Unexpected end of file in ", path, call. = FALSE)
    }
    as.integer(bytes)
  }

  # Read `num.images` records from one file; the connection is always closed,
  # even when reading fails part-way through.
  read_batch <- function(path) {
    con <- file(path, "rb")
    on.exit(close(con), add = TRUE)
    lapply(seq_len(num.images), function(i) {
      label <- readBin(con, integer(), size = 1, n = 1, endian = "big")
      if (length(label) != 1L) {
        stop("Unexpected end of file in ", path, call. = FALSE)
      }
      red <- read_channel(con, path)
      green <- read_channel(con, path)
      blue <- read_channel(con, path)
      list(rgb = data.frame(r = red, g = green, b = blue), lab = label + 1)
    })
  }

  for (f in seq_along(filenames)) {
    records <- read_batch(file.path(data_dir, filenames[f]))
    offset <- num.images * (f - 1)
    for (i in seq_along(records)) {
      images.rgb[[offset + i]] <- records[[i]]$rgb
      images.lab[[offset + i]] <- records[[i]]$lab
    }
    cat("completed :", filenames[f], "\n")
  }

  list("images.rgb" = images.rgb, "images.lab" = images.lab)
}
# Train dataset: five batch files, 10000 images each (50000 in total).
# num.images is passed explicitly so every record of each file is read.
cifar_train <- read.cifar.data(
  filenames = c("data_batch_1.bin", "data_batch_2.bin", "data_batch_3.bin",
                "data_batch_4.bin", "data_batch_5.bin"),
  num.images = 10000
)
images.rgb.train <- cifar_train$images.rgb
images.lab.train <- cifar_train$images.lab
rm(cifar_train)
# Test dataset: a single batch file of 10000 images
cifar_test <- read.cifar.data(filenames = c("test_batch.bin"),
                              num.images = 10000)
images.rgb.test <- cifar_test$images.rgb
images.lab.test <- cifar_test$images.lab
rm(cifar_test)

The outcome of the earlier function is a list of red, green, and blue pixel dataframes for each image along with their labels. Then, flatten the data into a list of two dataframes (one for input and the other for output) using the following function, which takes two parameters--a list of input variables (x_listdata) and a list of output variables (y_listdata):

# Flatten per-image inputs and labels into two row-bound matrices.
#
# Args:
#   x_listdata: List of per-image inputs; each element is unlisted into one row.
#   y_listdata: List of 1-based class labels; each becomes a one-hot row of
#               length 10.
#
# Returns:
#   list(images = <one row per image>, labels = <one one-hot row per image>)
flat_data <- function(x_listdata, y_listdata) {
  n_classes <- 10

  # Inputs: collapse every image to a single vector, then stack the vectors
  pixel_rows <- lapply(x_listdata, unlist)
  image_rows <- do.call(rbind, pixel_rows)

  # Outputs: a zero vector with a 1 at the label position, stacked the same way
  one_hot <- function(label) replace(numeric(n_classes), label, 1)
  label_rows <- do.call(rbind, lapply(y_listdata, one_hot))

  list("images" = image_rows, "labels" = label_rows)
}
# Build the flattened train and test sets (inputs first, labels second)
train_data <- flat_data(images.rgb.train, images.lab.train)
test_data <- flat_data(images.rgb.test, images.lab.test)

Once the train and test datasets are ready, perform sanity checks by plotting the images along with their labels. The function requires two mandatory parameters (index: the position of the image in the list, and images.rgb: the list of per-image RGB dataframes returned by read.cifar.data) and one optional parameter (images.lab: the corresponding list of image labels):

# Class-name lookup table; drawImage() indexes its first column by label.
labels <- read.table("Cifar_10/batches.meta.txt")

# Sanity check on the imported photos and labels: plot one image and return
# its label and image object.
#
# Args:
#   index:      Position of the image inside `images.rgb` (and `images.lab`).
#   images.rgb: List of data frames with columns r, g, b (1024 values each).
#   images.lab: Optional list of 1-based labels; when NULL no label is looked
#               up and the returned label is NULL.
#
# Returns:
#   list("Image label" = <label or NULL>, "Image description" = <cimg image>)
drawImage <- function(index, images.rgb, images.lab = NULL) {
  if (!requireNamespace("imager", quietly = TRUE)) {
    stop("Package 'imager' is required by drawImage()", call. = FALSE)
  }

  # Convert each colour layer into a 32-column matrix, then into an image
  img <- images.rgb[[index]]
  to_channel <- function(values) {
    imager::as.cimg(matrix(values, ncol = 32, byrow = FALSE))
  }
  # Bind the three channels into one image along the colour axis
  img.col.mat <- imager::imappend(
    list(to_channel(img$r), to_channel(img$g), to_channel(img$b)),
    "c"
  )

  # Extract the label; lab stays NULL when no labels were supplied
  lab <- NULL
  if (!is.null(images.lab)) {
    lab <- labels[[1]][images.lab[[index]]]
  }

  # Plot and output label. paste0() has no `sep` argument, so none is passed.
  title <- if (is.null(lab)) "32x32 size" else paste0(lab, ":32x32 size")
  plot(img.col.mat, main = title, xaxt = "n")
  axis(side = 1, xaxp = c(10, 50, 4), las = 1)

  list("Image label" = lab, "Image description" = img.col.mat)
}
# Draw a random image along with its label and description from train dataset.
# The index is sampled from the actual number of loaded images so it can never
# point past the end of the list.
drawImage(sample.int(length(images.rgb.train), size = 1),
          images.rgb.train, images.lab.train)

Now transform the input data using the min-max standardization technique. The preProcess function from the caret package can be used for normalization. The "range" option of the method parameter performs min-max normalization as follows:

# Min-max normalize the image data.
# library() is used (R is case-sensitive, so `Require` does not exist) and it
# stops immediately when caret is not installed.
library(caret)
# Fit the scaling on the training images only ...
normalizeObj <- preProcess(train_data$images, method = "range")
# ... and apply the same fitted transformation to both train and test images
train_data$images <- predict(normalizeObj, train_data$images)
test_data$images <- predict(normalizeObj, test_data$images)

Table of Contents for How to do it...

Create new playlist

Sign In

Sign Up

Table of Contents for
How to do it...