
TensorFlow CNN loss spikes to NaN

I am trying to train a regression model that predicts four scalar float outputs. Right now the network diverges very quickly and the loss blows up to NaN, and I cannot figure out what is going on.

Below is a self-contained example, tested with TensorFlow 1.1.0 on Windows 10 with an NVidia GPU.

from __future__ import absolute_import 
from __future__ import division 
from __future__ import print_function 

import numpy 
import tensorflow as tf 

IMAGE_HEIGHT = 320 
IMAGE_WIDTH = 160 
NUM_CHANNELS = 3 

PIXEL_DEPTH = 255 
SEED = 66479 # Set to None for random seed. 
BATCH_SIZE=5 
NUM_OUTPUTS = 4 # the four outputs 

def data_type(): 
    return tf.float32 

# The variables below hold all the trainable weights. They are passed an 
# initial value which will be assigned when we call: 
# {tf.global_variables_initializer().run()} 
conv1_weights = tf.Variable(
    tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. 
         stddev=0.1, 
         seed=SEED, dtype=data_type())) 
conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) 
conv2_weights = tf.Variable(tf.truncated_normal(
    [5, 5, 32, 64], stddev=0.1, 
    seed=SEED, dtype=data_type())) 
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) 
fc1_weights = tf.Variable( # fully connected, depth 512. 
    tf.truncated_normal([IMAGE_HEIGHT // 4 * IMAGE_WIDTH // 4 * 64, 512], 
         stddev=0.1, 
         seed=SEED, 
         dtype=data_type())) 
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) 
fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_OUTPUTS], 
              stddev=0.1, 
              seed=SEED, 
              dtype=data_type())) 
fc2_biases = tf.Variable(tf.constant(
    0.1, shape=[NUM_OUTPUTS], dtype=data_type())) 


# We will replicate the model structure for the training subgraph, as well
# as the evaluation subgraphs, while sharing the trainable parameters.
def model(data, train=False): 
    """The Model definition.""" 
    # 2D convolution, with 'SAME' padding (i.e. the output feature map has 
    # the same size as the input). Note that {strides} is a 4D array whose 
    # shape matches the data layout: [image index, y, x, depth]. 
    conv = tf.nn.conv2d(data, 
         conv1_weights, 
         strides=[1, 1, 1, 1], 
         padding='SAME') 
    # Bias and rectified linear non-linearity. 
    relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) 
    # Max pooling. The kernel size spec {ksize} also follows the layout of 
    # the data. Here we have a pooling window of 2, and a stride of 2. 
    pool = tf.nn.max_pool(relu, 
          ksize=[1, 2, 2, 1], 
          strides=[1, 2, 2, 1], 
          padding='SAME') 
    conv = tf.nn.conv2d(pool, 
         conv2_weights, 
         strides=[1, 1, 1, 1], 
         padding='SAME') 
    relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) 
    pool = tf.nn.max_pool(relu, 
          ksize=[1, 2, 2, 1], 
          strides=[1, 2, 2, 1], 
          padding='SAME') 
    # Reshape the feature map cuboid into a 2D matrix to feed it to the 
    # fully connected layers. 
    pool_shape = pool.get_shape().as_list() 
    reshape = tf.reshape(
     pool, 
     [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) 
    # Fully connected layer. Note that the '+' operation automatically 
    # broadcasts the biases. 
    hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) 
    # Add a 50% dropout during training only. Dropout also scales 
    # activations such that no rescaling is needed at evaluation time. 
    if train:
        hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
    return tf.matmul(hidden, fc2_weights) + fc2_biases 

def main(): 

    train_data_batch = tf.placeholder(tf.float32, shape=(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS)) 
    train_label_batch = tf.placeholder(tf.float32, shape=(BATCH_SIZE, NUM_OUTPUTS)) 


    with tf.name_scope('pred'):
        train_pred = model(train_data_batch, train=True)

    with tf.name_scope('loss'):
        loss = tf.reduce_sum(tf.square(train_pred - train_label_batch))
        tf.summary.scalar('loss', loss)


    # L2 regularization for the fully connected parameters. 
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + 
        tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) 
    # Add the regularization term to the loss. 
    loss += 5e-4 * regularizers 

    optimizer = tf.train.GradientDescentOptimizer(0.01) 
    train_op = optimizer.minimize(loss) 

    with tf.Session() as sess:
        # The op for initializing the variables.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        sess.run(init_op)

        while True:
            predictions, l, _ = sess.run([train_pred, loss, train_op], feed_dict={
                train_data_batch: numpy.zeros([BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS]) + 0.2,
                train_label_batch: numpy.zeros([BATCH_SIZE, 4])})

            print(l)

if __name__ == "__main__": 
    main() 

Output:

9031.0 
5.6838e+22 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 
nan 

[This thread](https://stackoverflow.com/q/33962226/1714410) may also be relevant. It is tagged [tag:caffe], but it applies just as well to other deep-learning tools such as [tag:tensorflow]. – Shai
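
One quick way to act on that advice (a minimal, self-contained sketch for TF 1.x, not part of the original thread) is to attach TensorFlow's numeric checks to the graph, so the session fails loudly at the tensor where NaN/Inf first appears instead of silently printing nan losses:

import tensorflow as tf

# tf.add_check_numerics_ops() adds a CheckNumerics op for every floating-point
# tensor in the default graph and returns a single op that runs all of them.
x = tf.placeholder(tf.float32, shape=[None])
y = tf.log(x)  # produces -inf / nan for non-positive inputs
check_op = tf.add_check_numerics_ops()

with tf.Session() as sess:
    print(sess.run(y, feed_dict={x: [1.0, 2.0]}))        # finite values, passes
    sess.run([y, check_op], feed_dict={x: [0.0, -1.0]})  # raises InvalidArgumentError

Running check_op alongside train_op in the question's training loop would work the same way and point at the layer where the divergence starts.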

Answer


It looks like my model was diverging. I fixed this by switching to AdamOptimizer, which adaptively sets the parameters of momentum-based optimization.
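
In code, the fix is just a change to the optimizer line from the question. A minimal, self-contained sketch (the 1e-4 learning rate and the toy loss below are illustrative assumptions, not values from this answer):

import tensorflow as tf

w = tf.Variable(5.0)
loss = tf.square(w)  # stand-in for the question's regression loss

# Old setup, which diverged:
# optimizer = tf.train.GradientDescentOptimizer(0.01)

# Adam keeps running estimates of the first and second moments of each
# parameter's gradient and scales the step per parameter, which copes much
# better with a large initial loss than plain SGD does.
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
        _, l = sess.run([train_op, loss])
        print(l)

Lowering the GradientDescentOptimizer learning rate, or averaging the loss with tf.reduce_mean instead of summing it over the batch, are other common ways to tame this kind of divergence.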
