How to do it...

The section provide steps for how to set up Q-learning:

Define 16 states:

states <- c("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16")

Define four actions:

actions<- c("up", "left", "down", "right")

Define the transitionStateAction function, which can simulate the transitions from one state s to another state s' using an action a. The function takes in the current state s and selected action a, and it returns the next state s' and corresponding reward r'. In case of constrained action, the next state returned is the current state s and the existing reward r:

transitionStateAction<- function(state, action) { 
  # The default state is the existing state in case of constrained action 
next_state<- state 
if (state == "s1"&& action == "down") next_state<- "s2" 
if (state == "s1"&& action == "right") next_state<- "s5" 
if (state == "s2"&& action == "up") next_state<- "s1" 
if (state == "s2"&& action == "right") next_state<- "s6" 
if (state == "s3"&& action == "right") next_state<- "s7" 
if (state == "s3"&& action == "down") next_state<- "s4" 
if (state == "s4"&& action == "up") next_state<- "s3" 
if (state == "s5"&& action == "right") next_state<- "s9" 
if (state == "s5"&& action == "down") next_state<- "s6" 
if (state == "s5"&& action == "left") next_state<- "s1" 
if (state == "s6"&& action == "up") next_state<- "s5" 
if (state == "s6"&& action == "down") next_state<- "s7" 
if (state == "s6"&& action == "left") next_state<- "s2" 
if (state == "s7"&& action == "up") next_state<- "s6" 
if (state == "s7"&& action == "right") next_state<- "s11" 
if (state == "s7"&& action == "down") next_state<- "s8" 
if (state == "s7"&& action == "left") next_state<- "s3" 
if (state == "s8"&& action == "up") next_state<- "s7" 
if (state == "s8"&& action == "right") next_state<- "s12" 
if (state == "s9"&& action == "right") next_state<- "s13" 
if (state == "s9"&& action == "down") next_state<- "s10" 
if (state == "s9"&& action == "left") next_state<- "s5" 
if (state == "s10"&& action == "up") next_state<- "s9" 
if (state == "s10"&& action == "right") next_state<- "s14" 
if (state == "s10"&& action == "down") next_state<- "s11" 
if (state == "s11"&& action == "up") next_state<- "s10" 
if (state == "s11"&& action == "right") next_state<- "s15" 
if (state == "s11"&& action == "left") next_state<- "s7" 
if (state == "s12"&& action == "right") next_state<- "s16" 
if (state == "s12"&& action == "left") next_state<- "s8" 
if (state == "s13"&& action == "down") next_state<- "s14" 
if (state == "s13"&& action == "left") next_state<- "s9" 
if (state == "s14"&& action == "up") next_state<- "s13" 
if (state == "s14"&& action == "down") next_state<- "s15" 
if (state == "s14"&& action == "left") next_state<- "s10" 
if (state == "s15"&& action == "up") next_state<- "s14" 
if (state == "s15"&& action == "down") next_state<- "s16" 
if (state == "s15"&& action == "left") next_state<- "s11" 
if (state == "s16"&& action == "up") next_state<- "s15" 
if (state == "s16"&& action == "left") next_state<- "s12" 
  # Calculate reward 
if (next_state == "s15") { 
reward<- 100 
  } else { 
reward<- -1 
  } 
 
return(list(state=next_state, reward=reward)) 
}

Define a function to perform Q-learning using n iterations:

Qlearning<- function(n, initState, termState, 
epsilon, learning_rate) { 
  # Initialize a Q-matrix of size #states x #actions with zeroes 
Q_mat<- matrix(0, nrow=length(states), ncol=length(actions), 
dimnames=list(states, actions)) 
  # Run n iterations of Q-learning 
for (i in 1:n) { 
Q_mat<- updateIteration(initState, termState, epsilon, learning_rate, Q_mat) 
  } 
return(Q_mat) 
} 
   updateIteration<- function(initState, termState, epsilon, learning_rate, Q_mat) { 
state<- initState # set cursor to initial state 
while (state != termState) { 
    # Select the next action greedily or randomnly 
if (runif(1) >= epsilon) { 
action<- sample(actions, 1) # Select randomnly 
    } else { 
action<- which.max(Q_mat[state, ]) # Select best action 
    } 
    # Extract the next state and its reward 
response<- transitionStateAction(state, action) 
    # Update the corresponding value in Q-matrix (learning) 
Q_mat[state, action] <- Q_mat[state, action] + learning_rate * 
      (response$reward + max(Q_mat[response$state, ]) - Q_mat[state, action]) 
state<- response$state # update with next state 
  } 
return(Q_mat) 
}

Set learning parameters such as epsilon and learning_rate:

epsilon<- 0.1 
learning_rate<- 0.9

Get the Q-table after 500k iterations:

Q_mat<- Qlearning(500, "s1", "s15", epsilon, learning_rate) 
Q_mat

Get the best (optimum) policy P*, as shown in the following figure. The arrows marked in green shows the direction of traversing S1 to S15:

actions[max.col(Q_mat)]

Optimum policy using model-free iteration with an optimum path from S1 to S15

Table of Contents for How to do it...

Create new playlist

Sign In

Sign Up

Table of Contents for
How to do it...