0

私はOpenAI Gymを使ってSpace Invadersをプレイしようとしています。以下のコードは、複数のランダムプレイの試合に基づいてトレーニングデータを作成し、スコア要件を上回るスコアを得た試合から学習します。データの生成ではエラーは出ておらず、y(target)の値が試合ごとに異なっていることも確認しました。モデルの訓練でもエラーは発生しません。しかし、testModel関数では、ゲームは常に同じアクションを再生し続けます。実際、NNにランダムな整数で埋めた正しいサイズの配列を入力しても、同じ値を予測し続けます。つまり、TFLearnのDNNが入力に関係なく同じ予測を返してしまいます。

私はどのように続行するかの提案/アイデアを大変に感謝します。ありがとう。

import gym 
import random 
import numpy as np 
import tflearn 
from tqdm import tqdm 
from tflearn.layers.core import input_data, dropout, fully_connected 
from tflearn.layers.estimator import regression 
from statistics import mean, median 
from collections import Counter 

LR = 1e-3  # learning rate for the Adam optimizer
env = gym.make('SpaceInvaders-v0')  # Atari environment with 6 discrete actions
env.reset()
goal_steps = 25000  # maximum steps allowed per episode
score_requirement = 300  # minimum episode score to keep for training data
initial_games = 2000  # number of random episodes sampled for the initial population

def some_random_games_first():
    """Play one episode with random actions to sanity-check the environment.

    Prints the step at which the episode ended (if it ended before
    `goal_steps`), the final raw observation, and its flattened form so the
    flattened observation length can be inspected.

    Fix: the original accumulated seen actions into an unused `actions`
    list; that dead code is removed.
    """
    env.reset()
    score = 0
    for t in range(goal_steps):
        action = env.action_space.sample()  # uniformly random action
        observation, reward, done, info = env.step(action)
        score += reward
        if done:
            print("step:", t)
            break
    print("observation:", observation)
    # Flatten to 1-D: this is the shape the network input expects elsewhere.
    observation = observation.flatten()
    print(observation)

def initial_population():
    """Generate training data from random play.

    Runs `initial_games` episodes with uniformly random actions and keeps
    [flattened_observation, one_hot_action] pairs from every episode whose
    total score reached `score_requirement`.

    Returns:
        list of [np.ndarray, list[int]] pairs; the full set is also saved
        to 'saved.npy' as a side effect.

    Fixes: the 12-line if/elif one-hot chain is replaced by an index
    assignment, and statistics over `accepted_scores` are guarded so an
    unlucky run with zero accepted games no longer raises StatisticsError.
    """
    training_data = []
    scores = []
    accepted_scores = []
    for _ in tqdm(range(initial_games)):
        score = 0
        game_memory = []
        prev_observation = []
        for _ in range(goal_steps):
            # SpaceInvaders-v0 exposes 6 discrete actions.
            action = random.randrange(0, 6)
            observation, reward, done, info = env.step(action)

            # Pair the *previous* frame with the action taken from it.
            if len(prev_observation) > 0:
                game_memory.append([prev_observation.flatten(), action])

            prev_observation = observation
            score += reward
            if done:
                break

        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                # One-hot encode the action index.
                output = [0] * 6
                output[data[1]] = 1
                training_data.append([data[0], output])

        env.reset()
        scores.append(score)

    np.save('saved.npy', np.array(training_data))

    # mean()/median() raise on empty input; report instead of crashing.
    if accepted_scores:
        print('Average accepted score:', mean(accepted_scores))
        print('Median accepted score:', median(accepted_scores))
        print(Counter(accepted_scores))
    else:
        print('No games reached score_requirement =', score_requirement)

    return training_data

def neural_network_model(input_size):
    """Build a two-hidden-layer softmax classifier over flattened frames.

    Args:
        input_size: length of the flattened observation vector.

    Returns:
        A compiled tflearn.DNN logging to the 'log' tensorboard directory.
    """
    net = input_data(shape=[None, input_size, 1], name='input')

    # Two identical ReLU hidden layers, each followed by dropout
    # (keep probability 0.8).
    for _ in range(2):
        net = fully_connected(net, 128, activation='relu')
        net = dropout(net, 0.8)

    # 6-way softmax head: one unit per Space Invaders action.
    net = fully_connected(net, 6, activation='softmax')
    net = regression(net, optimizer='adam', learning_rate=LR,
                     loss='categorical_crossentropy', name='targets')

    return tflearn.DNN(net, tensorboard_dir='log')

def train_model(training_data, model=False):
    """Fit a network (built fresh when `model` is falsy) on recorded pairs.

    `training_data` is a sequence of [flattened_observation, one_hot_action]
    pairs as produced by initial_population().
    """
    input_len = len(training_data[0][0])
    X = np.array([pair[0] for pair in training_data]).reshape(-1, input_len, 1)
    y = [pair[1] for pair in training_data]

    if not model:
        model = neural_network_model(input_size=input_len)

    model.fit({'input': X}, {'targets': y}, n_epoch=1, snapshot_step=500,
              show_metric=True, run_id='openaistuff')

    return model

def generateData():
    """Build a fresh training set and show its first sample for inspection."""
    data = initial_population()
    print("training data:", data[0])

def countActions():
    """Print and return the distribution of actions stored in 'saved.npy'.

    Returns:
        collections.Counter mapping action index -> occurrence count.

    Fix: the saved array holds Python objects, so `allow_pickle=True` is
    required on NumPy >= 1.16.4 (older versions defaulted to True).
    """
    training_data = np.load('saved.npy', allow_pickle=True)
    actions = np.array([np.argmax(pair[1]) for pair in training_data])
    counts = Counter(actions)
    print(counts)
    return counts

def testModel(model, num_games=1):
    """Play `num_games` episodes choosing actions with the trained model.

    The first step of each game is random (there is no previous observation
    yet); afterwards the action is the argmax of the model's prediction on
    the flattened previous frame. Prints per-game and average scores.

    Generalization: the original hard-coded `range(1)`; `num_games`
    defaults to 1 so existing callers are unaffected.
    """
    scores = []
    choices = []

    for each_game in range(num_games):
        score = 0
        game_memory = []
        prev_obs = []
        env.reset()
        for _ in range(goal_steps):
            if len(prev_obs) == 0:
                # No previous frame on the very first step: act randomly.
                action = random.randrange(0, 6)
            else:
                flat = prev_obs.flatten()
                action = np.argmax(model.predict(flat.reshape(-1, len(flat), 1)))

            choices.append(action)

            new_observation, reward, done, info = env.step(action)
            prev_obs = new_observation
            game_memory.append([new_observation.flatten(), action])
            score += reward
            if done:
                break

        print('Score of game {} was {}'.format(each_game, score))
        scores.append(score)

    print('Average Score', sum(scores) / len(scores))

# Script entry: load the saved data (only for the input size), rebuild the
# network, restore trained weights, and run one evaluation game.
# allow_pickle=True is required for object arrays on NumPy >= 1.16.4.
training_data = np.load('saved.npy', allow_pickle=True)
model = neural_network_model(input_size=len(training_data[0][0]))
model.load('fresh.model')

# Typical workflow when starting from scratch:
# training_data = initial_population()
# model = train_model(training_data)
testModel(model)
# model.save('fresh.model')

答えて

0

どのくらいの時間トレーニングしていますか? OpenAI Gymでの私の経験では、ニューラルネットワークが何かを学ぶには通常かなり長い時間がかかります。学習が不十分な段階では、エージェントが単一の行動しか取らない、ということがよく起こります。

関連する問題