How to do it...

This section provides the steps for setting up Q-learning:

  1. Define 16 states:
# The 16 grid states, labelled "s1" through "s16".
states <- paste0("s", 1:16)
  1. Define four actions:
# The four moves available to the agent. The order matters: it fixes the
# column order of the Q-matrix built from this vector.
actions <- c(
  "up",
  "left",
  "down",
  "right"
)
  1. Define the transitionStateAction function, which simulates the transition from one state s to another state s' using an action a. The function takes the current state s and the selected action a, and returns the next state s' and the corresponding reward r'. In the case of a constrained action (a move that is not allowed from s), the next state returned is the current state s, together with the reward for remaining in that state:
# Simulate one move in the 16-state grid.
#
# Arguments:
#   state  - label of the current state ("s1" .. "s16").
#   action - the move to attempt: "up", "left", "down" or "right".
#
# Returns a list with two elements:
#   state  - the state reached; equal to the input state when the action is
#            constrained (no transition is defined for that state/action).
#   reward - 100 when the state reached is "s15", otherwise -1.
transitionStateAction <- function(state, action) {
  # Allowed moves, keyed by "<state>|<action>". Any pair not listed here is a
  # constrained action and leaves the agent where it is.
  transitions <- c(
    "s1|down" = "s2", "s1|right" = "s5",
    "s2|up" = "s1", "s2|right" = "s6",
    "s3|right" = "s7", "s3|down" = "s4",
    "s4|up" = "s3",
    "s5|right" = "s9", "s5|down" = "s6", "s5|left" = "s1",
    "s6|up" = "s5", "s6|down" = "s7", "s6|left" = "s2",
    "s7|up" = "s6", "s7|right" = "s11", "s7|down" = "s8", "s7|left" = "s3",
    "s8|up" = "s7", "s8|right" = "s12",
    "s9|right" = "s13", "s9|down" = "s10", "s9|left" = "s5",
    "s10|up" = "s9", "s10|right" = "s14", "s10|down" = "s11",
    "s11|up" = "s10", "s11|right" = "s15", "s11|left" = "s7",
    "s12|right" = "s16", "s12|left" = "s8",
    "s13|down" = "s14", "s13|left" = "s9",
    "s14|up" = "s13", "s14|down" = "s15", "s14|left" = "s10",
    "s15|up" = "s14", "s15|down" = "s16", "s15|left" = "s11",
    "s16|up" = "s15", "s16|left" = "s12"
  )

  key <- paste(state, action, sep = "|")
  if (key %in% names(transitions)) {
    next_state <- transitions[[key]]
  } else {
    # The default is the existing state in case of a constrained action
    next_state <- state
  }

  # Calculate reward: only arriving at (or remaining in) s15 pays off
  if (next_state == "s15") {
    reward <- 100
  } else {
    reward <- -1
  }

  list(state = next_state, reward = reward)
}
  1. Define a function to perform Q-learning using n iterations:
# Run n iterations (episodes) of Q-learning and return the learned Q-matrix.
#
# Arguments:
#   n             - number of iterations to run; 0 returns the all-zero matrix.
#   initState     - state every iteration starts from.
#   termState     - state at which an iteration ends.
#   epsilon       - passed through to updateIteration() (action selection).
#   learning_rate - passed through to updateIteration() (Q update step size).
#
# Reads the global vectors `states` and `actions` to size and label the
# matrix, and calls updateIteration() once per iteration.
#
# Returns a length(states) x length(actions) numeric matrix with
# dimnames list(states, actions).
Qlearning <- function(n, initState, termState,
                      epsilon, learning_rate) {
  # Initialize a Q-matrix of size #states x #actions with zeroes
  Q_mat <- matrix(0, nrow = length(states), ncol = length(actions),
                  dimnames = list(states, actions))

  # Run n iterations of Q-learning. seq_len() keeps the loop from running
  # when n is 0 (1:0 would iterate twice).
  for (i in seq_len(n)) {
    Q_mat <- updateIteration(initState, termState, epsilon,
                             learning_rate, Q_mat)
  }

  # Hand the learned table back to the caller
  Q_mat
}
   # Run a single Q-learning iteration (episode): walk from initState until
   # termState is reached, updating the Q-matrix after every step.
   #
   # Arguments:
   #   initState     - state the walk starts from.
   #   termState     - state that ends the walk.
   #   epsilon       - exploration probability: a random action is drawn with
   #                   probability epsilon, otherwise the best-known action
   #                   for the current state is taken.
   #   learning_rate - step size of the Q-value update.
   #   Q_mat         - current Q-matrix; rows are indexed by state name and
   #                   columns by action name.
   #
   # Reads the global vector `actions` and calls transitionStateAction().
   #
   # Returns the updated Q-matrix.
   updateIteration <- function(initState, termState, epsilon,
                               learning_rate, Q_mat) {
     state <- initState # set cursor to initial state
     while (state != termState) {
       # Select the next action randomly (explore) or greedily (exploit).
       # Exploration happens with probability epsilon; testing
       # `runif(1) >= epsilon` here would explore with probability
       # 1 - epsilon instead.
       if (runif(1) < epsilon) {
         action <- sample(actions, 1) # Select randomly
       } else {
         # which.max() yields a column index; map it back to the action
         # name, because transitionStateAction() compares against names.
         action <- colnames(Q_mat)[which.max(Q_mat[state, ])]
       }

       # Extract the next state and its reward
       response <- transitionStateAction(state, action)

       # Update the corresponding value in Q-matrix (learning)
       Q_mat[state, action] <- Q_mat[state, action] + learning_rate *
         (response$reward + max(Q_mat[response$state, ]) -
            Q_mat[state, action])

       state <- response$state # update with next state
     }

     Q_mat
   }
  1. Set learning parameters such as epsilon and learning_rate:
# Learning parameters handed to Qlearning():
#   epsilon       - governs the random-versus-greedy action choice
#   learning_rate - step size applied to each Q-value update
learning_rate <- 0.9
epsilon <- 0.1
  1. Get the Q-table after 500 iterations:
# Learn the Q-table: 500 iterations, each starting at s1 and ending at s15.
Q_mat <- Qlearning(
  n = 500,
  initState = "s1",
  termState = "s15",
  epsilon = epsilon,
  learning_rate = learning_rate
)
  1. Get the best (optimum) policy P*, as shown in the following figure. The arrows marked in green show the direction of traversal from S1 to S15:
Optimum policy using model-free iteration with an optimum path from S1 to S15
