Adding more convolution layers to an encoder/decoder model decreases accuracy

I am working on a SegNet implementation in TensorFlow, which I use to segment aerial imagery into the two classes "building" and "not building". I have a working network that reaches about 82% accuracy.

However, I wanted to extend the network with additional convolutional layers per stage, as in the original SegNet, and I cannot get that version to work.

This is how the small model that works for me is implemented:

def inference_basic(images, phase_train, batch_size, keep_prob):

    conv1 = conv_layer_with_bn(images, [7, 7, images.get_shape().as_list()[3], 64], phase_train, name="conv1")
    pool1, pool1_indices = tf.nn.max_pool_with_argmax(conv1, ksize=[1, 2, 2, 1], 
         strides=[1, 2, 2, 1], padding='SAME', name='pool1') 

    conv2 = conv_layer_with_bn(pool1, [7, 7, 64, 64], phase_train, name="conv2") 
    pool2, pool2_indices = tf.nn.max_pool_with_argmax(conv2, ksize=[1, 2, 2, 1], 
         strides=[1, 2, 2, 1], padding='SAME', name='pool2') 

    conv3 = conv_layer_with_bn(pool2, [7, 7, 64, 64], phase_train, name="conv3") 
    pool3, pool3_indices = tf.nn.max_pool_with_argmax(conv3, ksize=[1, 2, 2, 1], 
         strides=[1, 2, 2, 1], padding='SAME', name='pool3') 

    conv4 = conv_layer_with_bn(pool3, [7, 7, 64, 64], phase_train, name="conv4") 
    pool4, pool4_indices = tf.nn.max_pool_with_argmax(conv4, ksize=[1, 2, 2, 1], 
         strides=[1, 2, 2, 1], padding='SAME', name='pool4') 

    """ End of encoder """ 

    """ start decoder """ 

    upsample4 = deconv_layer(pool4, [2, 2, 64, 64], [batch_size, FLAGS.image_h//8, FLAGS.image_w//8, 64], 2, "up4") 
    conv_decode4 = conv_layer_with_bn(upsample4, [7, 7, 64, 64], phase_train, False, name="conv_decode4") 

    upsample3= deconv_layer(conv_decode4, [2, 2, 64, 64], [batch_size, FLAGS.image_h//4, FLAGS.image_w//4, 64], 2, "up3") 
    conv_decode3 = conv_layer_with_bn(upsample3, [7, 7, 64, 64], phase_train, False, name="conv_decode3") 

    upsample2= deconv_layer(conv_decode3, [2, 2, 64, 64], [batch_size, FLAGS.image_h//2, FLAGS.image_w//2, 64], 2, "up2") 
    conv_decode2 = conv_layer_with_bn(upsample2, [7, 7, 64, 64], phase_train, False, name="conv_decode2") 

    upsample1= deconv_layer(conv_decode2, [2, 2, 64, 64], [batch_size, FLAGS.image_h, FLAGS.image_w, 64], 2, "up1") 
    conv_decode1 = conv_layer_with_bn(upsample1, [7, 7, 64, 64], phase_train, False, name="conv_decode1") 
    """ end of decoder """ 

    """ Start Classify """ 
    with tf.variable_scope('conv_classifier') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[1, 1, 64, FLAGS.num_class],
                                             initializer=msra_initializer(1, 64),
                                             wd=0.0005)
        conv = tf.nn.conv2d(conv_decode1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [FLAGS.num_class], tf.constant_initializer(0.0))
        conv_classifier = tf.nn.bias_add(conv, biases, name=scope.name)
    return conv_classifier
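
deconv_layer is not shown here; it is essentially a wrapper around tf.nn.conv2d_transpose. A minimal sketch of such a helper (the kernel initialization below is an assumption for illustration; the original SegNet code often initializes the kernel with fixed bilinear upsampling weights instead):

def deconv_layer(inputT, f_shape, output_shape, stride=2, name=None):
    # Sketch only: learned 2x upsampling via transposed convolution.
    # f_shape is [k, k, out_channels, in_channels], as tf.nn.conv2d_transpose expects.
    with tf.variable_scope(name):
        kernel = tf.get_variable('up_filter', shape=f_shape,
                                 initializer=tf.truncated_normal_initializer(stddev=0.1))
        return tf.nn.conv2d_transpose(inputT, kernel, output_shape,
                                      strides=[1, stride, stride, 1], padding='SAME')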

And this is the extended model, which gets really bad results:

def inference(images, phase_train, batch_size): 
    conv1_1 = conv_layer_with_bn(images, [7, 7, images.get_shape().as_list()[3], 64], phase_train, name="conv1_1") 
    conv1_2 = conv_layer_with_bn(conv1_1, [7, 7, 64, 64], phase_train, name="conv1_2") 
    pool1, pool1_indices = tf.nn.max_pool_with_argmax(conv1_2, ksize=[1, 2, 2, 1],strides=[1, 2, 2, 1], padding='SAME', name='pool1') 

    conv2_1 = conv_layer_with_bn(pool1, [7, 7, 64, 64], phase_train, name="conv2_1") 
    conv2_2 = conv_layer_with_bn(conv2_1, [7, 7, 64, 64], phase_train, name="conv2_2") 
    pool2, pool2_indices = tf.nn.max_pool_with_argmax(conv2_2, ksize=[1, 2, 2, 1], 
               strides=[1, 2, 2, 1], padding='SAME', name='pool2') 

    conv3_1 = conv_layer_with_bn(pool2, [7, 7, 64, 64], phase_train, name="conv3_1") 
    conv3_2 = conv_layer_with_bn(conv3_1, [7, 7, 64, 64], phase_train, name="conv3_2") 
    conv3_3 = conv_layer_with_bn(conv3_2, [7, 7, 64, 64], phase_train, name="conv3_3") 
    pool3, pool3_indices = tf.nn.max_pool_with_argmax(conv3_3, ksize=[1, 2, 2, 1], 
               strides=[1, 2, 2, 1], padding='SAME', name='pool3') 

    conv4_1 = conv_layer_with_bn(pool3, [7, 7, 64, 64], phase_train, name="conv4_1") 
    conv4_2 = conv_layer_with_bn(conv4_1, [7, 7, 64, 64], phase_train, name="conv4_2") 
    conv4_3 = conv_layer_with_bn(conv4_2, [7, 7, 64, 64], phase_train, name="conv4_3") 
    pool4, pool4_indices = tf.nn.max_pool_with_argmax(conv4_3, ksize=[1, 2, 2, 1], 
               strides=[1, 2, 2, 1], padding='SAME', name='pool4') 

    conv5_1 = conv_layer_with_bn(pool4, [7, 7, 64, 64], phase_train, name="conv5_1") 
    conv5_2 = conv_layer_with_bn(conv5_1, [7, 7, 64, 64], phase_train, name="conv5_2") 
    conv5_3 = conv_layer_with_bn(conv5_2, [7, 7, 64, 64], phase_train, name="conv5_3") 
    pool5, pool5_indices = tf.nn.max_pool_with_argmax(conv5_3, ksize=[1, 2, 2, 1], 
               strides=[1, 2, 2, 1], padding='SAME', name='pool5') 
    """ End of encoder """ 

    """ Start decoder """ 
    upsample5 = deconv_layer(pool5, [2, 2, 64, 64], [batch_size, FLAGS.image_h//16, FLAGS.image_w//16, 64], 2, "up5") 
    conv_decode5_1 = conv_layer_with_bn(upsample5, [7, 7, 64, 64], phase_train, True, name="conv_decode5_1") 
    conv_decode5_2 = conv_layer_with_bn(conv_decode5_1, [7, 7, 64, 64], phase_train, True, name="conv_decode5_2") 
    conv_decode5_3 = conv_layer_with_bn(conv_decode5_2, [7, 7, 64, 64], phase_train, True, name="conv_decode5_3") 

    upsample4 = deconv_layer(conv_decode5_3, [2, 2, 64, 64], [batch_size, FLAGS.image_h//8, FLAGS.image_w//8, 64], 2, "up4") 
    conv_decode4_1 = conv_layer_with_bn(upsample4, [7, 7, 64, 64], phase_train, True, name="conv_decode4_1") 
    conv_decode4_2 = conv_layer_with_bn(conv_decode4_1, [7, 7, 64, 64], phase_train, True, name="conv_decode4_2") 
    conv_decode4_3 = conv_layer_with_bn(conv_decode4_2, [7, 7, 64, 64], phase_train, True, name="conv_decode4_3") 

    upsample3 = deconv_layer(conv_decode4_3, [2, 2, 64, 64], [batch_size, FLAGS.image_h//4, FLAGS.image_w//4, 64], 2, "up3") 
    conv_decode3_1 = conv_layer_with_bn(upsample3, [7, 7, 64, 64], phase_train, True, name="conv_decode3_1") 
    conv_decode3_2 = conv_layer_with_bn(conv_decode3_1, [7, 7, 64, 64], phase_train, True, name="conv_decode3_2") 
    conv_decode3_3 = conv_layer_with_bn(conv_decode3_2, [7, 7, 64, 64], phase_train, True, name="conv_decode3_3") 

    upsample2= deconv_layer(conv_decode3_3, [2, 2, 64, 64], [batch_size, FLAGS.image_h//2, FLAGS.image_w//2, 64], 2, "up2") 
    conv_decode2_1 = conv_layer_with_bn(upsample2, [7, 7, 64, 64], phase_train, True, name="conv_decode2_1") 
    conv_decode2_2 = conv_layer_with_bn(conv_decode2_1, [7, 7, 64, 64], phase_train, True, name="conv_decode2_2") 

    upsample1 = deconv_layer(conv_decode2_2, [2, 2, 64, 64], [batch_size, FLAGS.image_h, FLAGS.image_w, 64], 2, "up1") 
    conv_decode1_1 = conv_layer_with_bn(upsample1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_1") 
    conv_decode1_2 = conv_layer_with_bn(conv_decode1_1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_2") 
    """ End of decoder """ 

    """ Start Classify """ 
    # output predicted class number 
    with tf.variable_scope('conv_classifier') as scope:  # all variables prefixed with "conv_classifier/"
        kernel = _variable_with_weight_decay('weights',
                                             shape=[1, 1, 64, FLAGS.num_class],
                                             initializer=msra_initializer(1, 64),
                                             wd=0.0005)
        conv = tf.nn.conv2d(conv_decode1_2, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [FLAGS.num_class], tf.constant_initializer(0.0))
        conv_classifier = tf.nn.bias_add(conv, biases, name=scope.name)
    # logit = conv_classifier = prediction
    return conv_classifier

The convolution layer:

def conv_layer_with_bn(inputT, shape, train_phase, activation=True, name=None):

    in_channel = shape[2]
    out_channel = shape[3]
    k_size = shape[0]

    with tf.variable_scope(name) as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=shape,
                                             initializer=msra_initializer(k_size, in_channel),
                                             wd=None)
        conv = tf.nn.conv2d(inputT, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [out_channel], tf.constant_initializer(0.0))
        bias = tf.nn.bias_add(conv, biases)

        if activation is True:
            conv_out = tf.nn.relu(batch_norm_layer(bias, train_phase, scope.name))
        else:
            conv_out = batch_norm_layer(bias, train_phase, scope.name)

    return conv_out

def batch_norm_layer(inputT, is_training, scope):
    """Used in conv_layer_with_bn()"""
    return tf.cond(is_training,
                   lambda: tf.contrib.layers.batch_norm(inputT, is_training=True,
                           center=False, updates_collections=None, scope=scope+"_bn"),
                   lambda: tf.contrib.layers.batch_norm(inputT, is_training=False,
                           updates_collections=None, center=False, scope=scope+"_bn", reuse=True))
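
Because batch_norm_layer switches branches with tf.cond, phase_train has to be a boolean tensor. A minimal usage sketch (the placeholder name and feed_dict values are assumptions, not my exact training loop):

phase_train = tf.placeholder(tf.bool, name='phase_train')
logits = inference(images, phase_train, batch_size)
# sess.run(train_op, feed_dict={..., phase_train: True})   # training: update BN statistics
# sess.run(logits, feed_dict={..., phase_train: False})    # inference: reuse learned statistics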

The extended model ends up at around 10% accuracy because every pixel in the image gets classified as the "not building" class. Can anyone help me understand why this is happening? I have looked at the caffe implementation of SegNet, and I cannot see any difference between the two implementations.
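
(One way to confirm that the predictions collapse onto a single class is to histogram the arg-max labels; a minimal sketch, with variable names assumed:)

pred = tf.argmax(conv_classifier, axis=3)  # per-pixel class labels, shape [batch, h, w]
class_counts = tf.bincount(tf.cast(tf.reshape(pred, [-1]), tf.int32),
                           minlength=FLAGS.num_class)
# evaluating class_counts gives e.g. [0, N] when every pixel lands in "not building"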

Answer


After some testing, I think I know why the deeper model performed badly. It appears to be a problem with the weight initialization, which I guess matters more in deeper models. I updated the model to use the weight initializer proposed in the Delving Deep into Rectifiers paper, together with stochastic gradient descent and a learning rate of 0.1, and that seems to have solved the problem!
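
For reference, an initializer in the spirit of that paper (He/MSRA initialization) can be written as below; this is a sketch of what msra_initializer might look like, assuming a truncated-normal variant:

def msra_initializer(k_size, in_channels):
    # He/MSRA initialization ('Delving Deep into Rectifiers'):
    # std = sqrt(2 / fan_in) with fan_in = k*k*in_channels, suited to ReLU activations.
    stddev = (2.0 / (k_size * k_size * in_channels)) ** 0.5
    return tf.truncated_normal_initializer(stddev=stddev)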

Am I right in thinking that weight initialization becomes more important as models get deeper?
