2016-10-12 22 views

OpenAI GymのFrozen Lake環境を解くために、Sarsaアルゴリズムを実装しようとしています。この分野に取り組み始めたのは最近ですが、アルゴリズム自体は理解しているつもりです。Sarsaアルゴリズムで、なぜQ値がゼロになってしまうのでしょうか?



import gym 
import random 
import numpy as np 

# Environment whose discrete states/actions index the rows/columns of Q.
env = gym.make('FrozenLake-v0')

# Q-table: one row per state, one column per action, zero everywhere at first.
Q = np.zeros([env.observation_space.n, env.action_space.n])

# States listed here keep an all-zero row; every other state gets random
# initial action values drawn from [0, 1).
# NOTE(review): presumably the holes and the goal of the default 4x4 map
# -- confirm against the environment layout.
zero_states = (5, 7, 11, 12, 15)
for state in range(env.observation_space.n):
    if state in zero_states:
        continue
    for action in range(env.action_space.n):
        Q[state, action] = np.random.rand()

# Epsilon-greedy policy: in a given state the agent takes the action it
# currently believes is best with probability 1 - epsilon, and otherwise picks
# an action uniformly at random. Epsilon may change during training.

bestreward = 0          # highest episode return seen so far
epsilon = 0.1           # exploration probability
discount = 0.99         # discount factor applied to the next state's value
learning_rate = 0.1     # step size of the Q update
num_episodes = 50000    # number of training episodes
a = [0] * 10            # ten placeholder slots for the last episodes' returns

def _epsilon_greedy(q_table, state, eps, n_actions):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability ``eps`` a uniformly random action index in
    ``[0, n_actions)`` is returned; otherwise the index of the largest value
    in ``q_table[state]`` is returned.
    """
    if np.random.rand() <= eps:  # explore
        return random.randint(0, n_actions - 1)
    return np.argmax(q_table[state, :])  # exploit


for i_episode in range(num_episodes):

    # Observe the initial state s and select the first action a from Q.
    currentState = env.reset()
    currentAction = _epsilon_greedy(Q, currentState, epsilon, env.action_space.n)

    totalreward = 0
    while True:

        # Carry out action a; observe reward r and next state s'.
        nextState, reward, done, info = env.step(currentAction)
        totalreward += reward

        # SARSA (on-policy) target. The update must also run for the final
        # transition of the episode: leaving the loop before updating would
        # throw away the reward returned by that last step, so it could never
        # propagate into Q.
        if done:
            # No successor action is taken after the episode ends, so the
            # target is the reward alone.
            # NOTE(review): if the environment can also end an episode on a
            # step limit, that case is treated as terminal here too.
            target = reward
        else:
            # Select a' with the same policy and bootstrap from Q[s', a'].
            nextAction = _epsilon_greedy(Q, nextState, epsilon, env.action_space.n)
            target = reward + discount * Q[nextState, nextAction]

        Q[currentState, currentAction] += learning_rate * (target - Q[currentState, currentAction])

        if done:
            break

        currentState = nextState
        currentAction = nextAction

    # Per-episode bookkeeping (runs once per episode, not once per step).
    print("Episode: %d reward %d best %d epsilon %f" % (i_episode, totalreward, bestreward, epsilon))
    if totalreward > bestreward:
        bestreward = totalreward
    # Decay exploration only during the second half of training.
    if i_episode > num_episodes / 2:
        epsilon = epsilon * 0.9999
    # Record the returns of the last ten episodes, newest first, keeping the
    # list at its original length.
    if i_episode >= num_episodes - 10:
        a.insert(0, totalreward)
        a.pop()

# Returns of the last ten episodes.
print(a)

# Dump the learned Q-table, one state per "-----" separated group.
for i in range(env.observation_space.n):
    print("-----")
    for j in range(env.action_space.n):
        print(Q[i, j])




