2017-07-15

I would like to understand the difference between batch normalization and self-normalizing neural networks. In other words, how does SELU (Scaled Exponential Linear Unit) replace batch normalization?
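For reference, here is a minimal sketch of the SELU activation itself. The constants and the tf.where/tf.nn.elu form are taken from the bioinf-jku/SNNs reference implementation that is also quoted in the answer below; this snippet is only for illustration and is not part of my network code:

import tensorflow as tf 

def selu(x): 
    # Fixed constants from Klambauer et al. (2017). With these values, SELU 
    # drives activations toward zero mean and unit variance across layers. 
    alpha = 1.6732632423543772848170429916717 
    scale = 1.0507009873554804934193349852946 
    # scale * x for x >= 0, scale * alpha * (exp(x) - 1) for x < 0 
    return scale * tf.where(x >= 0.0, x, alpha * tf.nn.elu(x)) 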

Furthermore, when I inspected the values of the SELU activations, they were within the range [-1, 1]. That is not the case with batch normalization, though: the values after the BN layer (before the relu activation) take on values that are not within [-1, 1], as shown below.

Here is how I print the values after the batch norm layer (I use similar code after the SELU activation):

batch_norm_layer = tf.Print(batch_norm_layer, 
          data=[tf.reduce_max(batch_norm_layer), tf.reduce_min(batch_norm_layer)], 
          message = name_scope + ' min and max') 

The batch norm layer itself is defined as follows:

def batch_norm(x, n_out, phase_train, in_conv_layer = True): 

    with tf.variable_scope('bn'): 
     beta = tf.Variable(tf.constant(0.0, shape=n_out), 
            name='beta', trainable=True) 
     gamma = tf.Variable(tf.constant(1.0, shape=n_out), 
             name='gamma', trainable=True) 
     if in_conv_layer: 
      batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments') 
     else: 
      batch_mean, batch_var = tf.nn.moments(x, [0, 1], name='moments') 

     ema = tf.train.ExponentialMovingAverage(decay=0.9999) 

     def mean_var_with_update(): 
      ema_apply_op = ema.apply([batch_mean, batch_var]) 
      with tf.control_dependencies([ema_apply_op]): 
       return tf.identity(batch_mean), tf.identity(batch_var) 

     mean, var = tf.cond(phase_train, 
          mean_var_with_update, 
          lambda: (ema.average(batch_mean), ema.average(batch_var))) 
     normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3) 
    return normed 

So, because the batch norm outputs high values, the loss increases dramatically, and as a result I get NaNs.

In addition, I tried lowering the learning rate with batch norm, but that did not help much. So how can this problem be solved?

Here is the full code; any help is much appreciated:

import tensorflow as tf 
import numpy as np 
import os 
import cv2 

batch_size = 32 
num_epoch = 102 
latent_dim = 100 

def weight_variable(kernal_shape): 
    weights = tf.get_variable(name='weights', shape=kernal_shape, dtype=tf.float32, trainable=True, 
         initializer=tf.truncated_normal_initializer(stddev=0.02)) 
    return weights 

def bias_variable(shape): 
    initial = tf.constant(0.0, shape=shape) 
    return tf.Variable(initial) 

def batch_norm(x, n_out, phase_train, convolutional = True): 
    with tf.variable_scope('bn'): 
     exp_moving_avg = tf.train.ExponentialMovingAverage(decay=0.9999) 

     beta = tf.Variable(tf.constant(0.0, shape=n_out), 
            name='beta', trainable=True) 
     gamma = tf.Variable(tf.constant(1.0, shape=n_out), 
             name='gamma', trainable=True) 
     if convolutional: 
      batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments') 

     else: 
      batch_mean, batch_var = tf.nn.moments(x, [0], name='moments') 

     update_moving_averages = exp_moving_avg.apply([batch_mean, batch_var]) 

     m = tf.cond(phase_train, lambda: exp_moving_avg.average(batch_mean), lambda: batch_mean) 
     v = tf.cond(phase_train, lambda: exp_moving_avg.average(batch_var), lambda: batch_var) 

     normed = tf.nn.batch_normalization(x, m, v, beta, gamma, 1e-3) 
     normed = tf.Print(normed, data=[tf.shape(normed)], message='size of normed?') 
    return normed, update_moving_averages # Note that we should run the update_moving_averages with sess.run... 

def conv_layer(x, w_shape, b_shape, padding='SAME'): 
    W = weight_variable(w_shape) 
    tf.summary.histogram("weights", W) 

    b = bias_variable(b_shape) 
    tf.summary.histogram("biases", b) 

    # Note that I used a stride of 2 on purpose so that I do not need a max pool layer. 
    conv = tf.nn.conv2d(x, W, strides=[1, 2, 2, 1], padding=padding) + b 
    conv_batch_norm, update_moving_averages = batch_norm(conv, b_shape, phase_train=tf.cast(True, tf.bool)) 
    name_scope = tf.get_variable_scope().name 

    conv_batch_norm = tf.Print(conv_batch_norm, 
           data=[tf.reduce_max(conv_batch_norm), tf.reduce_min(conv_batch_norm)], 
           message = name_scope + ' min and max') 

    activations = tf.nn.relu(conv_batch_norm) 
    tf.summary.histogram("activations", activations) 

    return activations, update_moving_averages 

def deconv_layer(x, w_shape, b_shape, padding="SAME", activation='selu'): 
    W = weight_variable(w_shape) 
    tf.summary.histogram("weights", W) 

    b = bias_variable(b_shape) 
    tf.summary.histogram('biases', b) 

    x_shape = tf.shape(x) 

    out_shape = tf.stack([x_shape[0], x_shape[1] * 2, x_shape[2] * 2, w_shape[2]]) 
    if activation == 'selu': 
     conv_trans = tf.nn.conv2d_transpose(x, W, out_shape, [1, 2, 2, 1], padding=padding) + b 
     conv_trans_batch_norm, update_moving_averages = \ 
      batch_norm(conv_trans, b_shape, phase_train=tf.cast(True, tf.bool)) 
     transposed_activations = tf.nn.relu(conv_trans_batch_norm) 

    else: 
     conv_trans = tf.nn.conv2d_transpose(x, W, out_shape, [1, 2, 2, 1], padding=padding) + b 
     conv_trans_batch_norm, update_moving_averages = \ 
      batch_norm(conv_trans, b_shape, phase_train=tf.cast(True, tf.bool)) 
     transposed_activations = tf.nn.sigmoid(conv_trans_batch_norm) 

    tf.summary.histogram("transpose_activation", transposed_activations) 
    return transposed_activations, update_moving_averages 

tfrecords_filename_seq = ["C:/Users/user/PycharmProjects/AffectiveComputing/P16_db.tfrecords"] 
filename_queue = tf.train.string_input_producer(tfrecords_filename_seq, num_epochs=num_epoch, shuffle=False, name='queue') 
reader = tf.TFRecordReader() 

_, serialized_example = reader.read(filename_queue) 
features = tf.parse_single_example(
    serialized_example, 
    # Defaults are not specified since both keys are required. 
    features={ 
     'height': tf.FixedLenFeature([], tf.int64), 
     'width': tf.FixedLenFeature([], tf.int64), 
     'image_raw': tf.FixedLenFeature([], tf.string), 
     'annotation_raw': tf.FixedLenFeature([], tf.string) 
    }) 

# This is how we create one example, that is, extract one example from the database. 
image = tf.decode_raw(features['image_raw'], tf.uint8) 
# The height and the width are used to restore the original image shape. 
height = tf.cast(features['height'], tf.int32) 
width = tf.cast(features['width'], tf.int32) 

# The image is reshaped because, when stored in a binary format, it is flattened. Therefore, we need the 
# height and the width to restore the original image. 
image = tf.reshape(image, [height, width, 3]) 

annotation = tf.cast(features['annotation_raw'], tf.string) 

min_after_dequeue = 100 
num_threads = 1 
capacity = min_after_dequeue + num_threads * batch_size 
label_batch, images_batch = tf.train.batch([annotation, image], 
              shapes=[[], [112, 112, 3]], 
              batch_size=batch_size, 
              capacity=capacity, 
              num_threads=num_threads) 

label_batch_splitted = tf.string_split(label_batch, delimiter=',') 
label_batch_values = tf.reshape(label_batch_splitted.values, [batch_size, -1]) 
label_batch_numbers = tf.string_to_number(label_batch_values, out_type=tf.float32) 
confidences = tf.slice(label_batch_numbers, begin=[0, 2], size=[-1, 1]) 

images_batch = tf.cast([images_batch], tf.float32)[0] # Note that casting the image this way increases its rank. 

with tf.name_scope('image_normal'): 
    images_batch = tf.map_fn(lambda img: tf.image.per_image_standardization(img), images_batch) 
    #images_batch = tf.Print(images_batch, data=[tf.reduce_max(images_batch), tf.reduce_min(images_batch)], 
    #      message='min and max in images_batch') 
with tf.variable_scope('conv1'): 
    conv1, uma_conv1 = conv_layer(images_batch, [4, 4, 3, 32], [32])  # image size: [56, 56] 
with tf.variable_scope('conv2'): 
    conv2, uma_conv2 = conv_layer(conv1, [4, 4, 32, 64], [64])  # image size: [28, 28] 
with tf.variable_scope('conv3'): 
    conv3, uma_conv3 = conv_layer(conv2, [4, 4, 64, 128], [128]) # image size: [14, 14] 
with tf.variable_scope('conv4'): 
    conv4, uma_conv4 = conv_layer(conv3, [4, 4, 128, 256], [256]) # image size: [7, 7] 
    conv4_reshaped = tf.reshape(conv4, [-1, 7 * 7 * 256], name='conv4_reshaped') 

w_c_mu = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_mu') 
b_c_mu = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_mu') 
w_c_sig = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_sig') 
b_c_sig = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_sig') 
epsilon = tf.random_normal([1, latent_dim]) 

tf.summary.histogram('weights_c_mu', w_c_mu) 
tf.summary.histogram('biases_c_mu', b_c_mu) 
tf.summary.histogram('weights_c_sig', w_c_sig) 
tf.summary.histogram('biases_c_sig', b_c_sig) 

with tf.variable_scope('mu'): 
    mu = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_mu), b_c_mu) 
    tf.summary.histogram('mu', mu) 

with tf.variable_scope('stddev'): 
    stddev = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_sig), b_c_sig) 
    tf.summary.histogram('stddev', stddev) 

with tf.variable_scope('z'): 
    latent_var = mu + tf.multiply(tf.sqrt(tf.exp(stddev)), epsilon) 
    tf.summary.histogram('features_sig', stddev) 

w_dc = tf.Variable(tf.truncated_normal([latent_dim, 7 * 7 * 256], stddev=0.1), name='weights_dc') 
b_dc = tf.Variable(tf.constant(0.0, shape=[7 * 7 * 256]), name='biases_dc') 
tf.summary.histogram('weights_dc', w_dc) 
tf.summary.histogram('biases_dc', b_dc) 

with tf.variable_scope('deconv4'): 
    deconv4 = tf.nn.bias_add(tf.matmul(latent_var, w_dc), b_dc) 
    deconv4_batch_norm, uma_deconv4 = \ 
     batch_norm(deconv4, [7 * 7 * 256], phase_train=tf.cast(True, tf.bool), convolutional=False) 

    deconv4 = tf.nn.relu(deconv4_batch_norm) 
    deconv4_reshaped = tf.reshape(deconv4, [-1, 7, 7, 256], name='deconv4_reshaped') 

with tf.variable_scope('deconv3'): 
    deconv3, uma_deconv3 = deconv_layer(deconv4_reshaped, [3, 3, 128, 256], [128], activation='selu') 
with tf.variable_scope('deconv2'): 
    deconv2, uma_deconv2 = deconv_layer(deconv3, [3, 3, 64, 128], [64], activation='selu') 
with tf.variable_scope('deconv1'): 
    deconv1, uma_deconv1 = deconv_layer(deconv2, [3, 3, 32, 64], [32], activation='selu') 
with tf.variable_scope('deconv_image'): 
    deconv_image_batch, uma_deconv = deconv_layer(deconv1, [3, 3, 3, 32], [3], activation='sigmoid') 

# loss function. 
with tf.name_scope('loss_likelihood'): 
    # temp1 shape: [32, 112, 112, 3] 

    temp1 = images_batch * tf.log(deconv_image_batch + 1e-9) + (1 - images_batch) * tf.log(1 - deconv_image_batch + 1e-9) 

    #temp1 = temp1 * confidences. This will give an error. Therefore, we should expand the dimension of confidence tensor 
    confidences_ = tf.expand_dims(tf.expand_dims(confidences, axis=1), axis=1) # shape: [32, 1, 1, 1]. 
    temp1 = temp1 * confidences_ 
    log_likelihood = -tf.reduce_sum(temp1, reduction_indices=[1, 2, 3]) 
    log_likelihood_total = tf.reduce_sum(log_likelihood) 
    #l2_loss = tf.reduce_mean(tf.abs(tf.subtract(images_batch, deconv_image_batch))) 

with tf.name_scope('loss_KL'): 
    # temp2 shape: [32, 200] 
    temp2 = 1 + tf.log(tf.square(stddev + 1e-9)) - tf.square(mu) - tf.square(stddev) 
    temp3 = temp2 * confidences  # confidences shape is [32, 1] 
    KL_term = - 0.5 * tf.reduce_sum(temp3, reduction_indices=1) 
    KL_term_total = tf.reduce_sum(KL_term) 

with tf.name_scope('total_loss'): 
    variational_lower_bound = tf.reduce_mean(log_likelihood + KL_term) 
    tf.summary.scalar('loss', variational_lower_bound) 
with tf.name_scope('optimizer'): 
    optimizer = tf.train.AdamOptimizer(0.00001).minimize(variational_lower_bound) 

init_op = tf.group(tf.local_variables_initializer(), 
        tf.global_variables_initializer()) 

saver = tf.train.Saver() 

model_path = 'C:/Users/user/PycharmProjects/VariationalAutoEncoder/' \ 
      'VariationalAutoEncoderFaces/tensorboard_logs/Graph_model/ckpt' 

# Here is the session... 
with tf.Session() as sess: 

    train_writer = tf.summary.FileWriter('C:/Users/user/PycharmProjects/VariationalAutoEncoder/' 
             'VariationalAutoEncoderFaces/tensorboard_logs/Event_files', sess.graph) 

    merged = tf.summary.merge_all() 

    # Note that init_op should be run before starting the Coordinator and the threads; otherwise this will throw an error. 
    sess.run(init_op) 

    coord = tf.train.Coordinator() 
    threads = tf.train.start_queue_runners(coord=coord) 
    step = 0 

    to_run_list = [uma_conv1, uma_conv2, uma_conv3, uma_conv4, uma_deconv1, uma_deconv2, uma_deconv3, 
        uma_deconv4, uma_deconv, optimizer, variational_lower_bound, merged, 
        deconv_image_batch, image] 

    # Note that the last name "Graph_model" is the name of the saved checkpoints file => the ckpt is saved 
    # under tensorboard_logs. 
    ckpt = tf.train.get_checkpoint_state(
     os.path.dirname(model_path)) 
    if ckpt and ckpt.model_checkpoint_path: 
     saver.restore(sess, ckpt.model_checkpoint_path) 
     print('checkpoints are saved!!!') 
    else: 
     print('No stored checkpoints') 
    epoch = 0 
    while not coord.should_stop(): 

     _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, loss, summary, reconstructed_image, original_image = \ 
      sess.run(to_run_list) 

     print('total loss:', loss) 

     original_image = cv2.cvtColor(np.array(original_image), cv2.COLOR_RGB2BGR) 
     reconstructed_image = cv2.cvtColor(np.array(reconstructed_image[0]), cv2.COLOR_RGB2BGR) 

     cv2.imshow('original_image', original_image) 
     cv2.imshow('reconstructed_image', reconstructed_image) 
     cv2.waitKey(1) 
     if step % 234 == 0: 
      epoch += 1 
      print('epoch:', epoch) 
      if epoch == num_epoch - 2: 
       coord.request_stop() 

     if step % 100 == 0: 
      train_writer.add_summary(summary, step) 
      #print('total loss:', loss) 
      #print('log_likelihood_', log_likelihood_) 
      #print('KL_term', KL_term_) 
     step += 1 

    save_path = saver.save(sess, model_path) 
    coord.request_stop() 
    coord.join(threads) 
    train_writer.close() 


+0

This is a pretty broad question. Can I help you narrow down the scope of the answer you are looking for? SELU can be used in place of any other nonlinear activation function and requires no other changes. Batch norm is more complicated than that. –
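A minimal sketch of what that substitution could look like for the conv_layer in the question (a hypothetical variant, not code from the question: it reuses weight_variable and bias_variable from above and a selu definition like the one in the answer below, and simply drops the batch-norm step):

def conv_layer_selu(x, w_shape, b_shape, padding='SAME'): 
    # Hypothetical SELU variant of conv_layer: no batch-norm layer and no 
    # moving averages to update; the convolution output goes straight into SELU. 
    W = weight_variable(w_shape) 
    b = bias_variable(b_shape) 
    conv = tf.nn.conv2d(x, W, strides=[1, 2, 2, 1], padding=padding) + b 
    return selu(conv) 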

+0

What I would like to know is: does SELU solve the internal covariate shift problem? If so, can I replace the batch_norm layer + relu with the SELU activation function? I would like to get an intuition of how the output activations differ between the two configurations. Thanks @Ryan Stout –

+0

SELU can maintain the mean and variance of the activations in a layer (within a specified domain). However, it does not guarantee that the activations of a layer are still normally distributed. You need to normalize the features before applying SELU. –
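A minimal sketch of that kind of normalization for a batch of flat features (a hypothetical helper, not from the question; for the image input, the pipeline in the question already uses tf.image.per_image_standardization, which serves the same purpose):

def standardize_features(x, eps=1e-8): 
    # Standardize each feature to zero mean and unit variance over the batch 
    # before it is fed into the first SELU layer. 
    mean, var = tf.nn.moments(x, axes=[0]) 
    return (x - mean) / tf.sqrt(var + eps) 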

Answers

0

Here is sample code showing how the mean and variance evolve across three SELU layers. The numbers of nodes in the layers (including the input layer) are [15, 30, 30, 8]:

import numbers 
import os 

import numpy as np 
import tensorflow as tf 
# Imports required by the selu/dropout_selu code below 
# (the same ones used in the SNNs reference implementation). 
from tensorflow.python.framework import ops 
from tensorflow.python.framework import tensor_shape 
from tensorflow.python.framework import tensor_util 
from tensorflow.python.layers import utils 
from tensorflow.python.ops import array_ops 
from tensorflow.python.ops import math_ops 
from tensorflow.python.ops import random_ops 

#-----------------------------------------------# 
# https://github.com/bioinf-jku/SNNs/blob/master/selu.py 
# The SELU activation function 
def selu(x): 
    with ops.name_scope('elu') as scope: 
     alpha = 1.6732632423543772848170429916717 
     scale = 1.0507009873554804934193349852946 
     return scale*tf.where(x>=0.0, x, alpha*tf.nn.elu(x)) 

#-----------------------------------------------# 
# https://github.com/bioinf-jku/SNNs/blob/master/selu.py 
# alpha-dropout 
def dropout_selu(x, rate, alpha= -1.7580993408473766, fixedPointMean=0.0, fixedPointVar=1.0, 
       noise_shape=None, seed=None, name=None, training=False): 
    """Dropout to a value with rescaling.""" 

    def dropout_selu_impl(x, rate, alpha, noise_shape, seed, name): 
     keep_prob = 1.0 - rate 
     x = ops.convert_to_tensor(x, name="x") 
     if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1: 
      raise ValueError("keep_prob must be a scalar tensor or a float in the " 
              "range (0, 1], got %g" % keep_prob) 
     keep_prob = ops.convert_to_tensor(keep_prob, dtype=x.dtype, name="keep_prob") 
     keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar()) 

     alpha = ops.convert_to_tensor(alpha, dtype=x.dtype, name="alpha") 
     alpha.get_shape().assert_is_compatible_with(tensor_shape.scalar()) 

     if tensor_util.constant_value(keep_prob) == 1: 
      return x 

     noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x) 
     random_tensor = keep_prob 
     random_tensor += random_ops.random_uniform(noise_shape, seed=seed, dtype=x.dtype) 
     binary_tensor = math_ops.floor(random_tensor) 
     ret = x * binary_tensor + alpha * (1-binary_tensor) 

     a = math_ops.sqrt(fixedPointVar/(keep_prob *((1-keep_prob) * math_ops.pow(alpha-fixedPointMean,2) + fixedPointVar))) 

     b = fixedPointMean - a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha) 
     ret = a * ret + b 
     ret.set_shape(x.get_shape()) 
     return ret 

    with ops.name_scope(name, "dropout", [x]) as name: 
     return utils.smart_cond(training, 
      lambda: dropout_selu_impl(x, rate, alpha, noise_shape, seed, name), 
      lambda: array_ops.identity(x)) 

#-----------------------------------------------# 
# build a 3-layer dense network with SELU activation and alpha-dropout 
sess = tf.InteractiveSession() 

w1 = tf.constant(np.random.normal(loc=0.0, scale=np.sqrt(1.0/15.0), size = [15, 30])) 
b1 = tf.constant(np.random.normal(loc=0.0, scale=0.5, size = [30])) 

x1 = tf.constant(np.random.normal(loc=0.0, scale=1.0, size = [200, 15])) 
y1 = tf.add(tf.matmul(x1, w1), b1) 
y1_selu = selu(y1) 
y1_selu_dropout = dropout_selu(y1_selu, 0.05, training=True) 

w2 = tf.constant(np.random.normal(loc=0.0, scale=np.sqrt(1.0/30.0), size = [30, 30])) 
b2 = tf.constant(np.random.normal(loc=0.0, scale=0.5, size = [30])) 

x2 = y1_selu_dropout 
y2 = tf.add(tf.matmul(x2, w2), b2) 
y2_selu = selu(y2) 
y2_selu_dropout = dropout_selu(y2_selu, 0.05, training=True) 


w3 = tf.constant(np.random.normal(loc=0.0, scale=np.sqrt(1.0/30.0), size = [30, 8])) 
b3 = tf.constant(np.random.normal(loc=0.0, scale=0.5, size = [8])) 

x3 = y2_selu_dropout 
y3 = tf.add(tf.matmul(x3, w3), b3) 
y3_selu = selu(y3) 
y3_selu_dropout = dropout_selu(y3_selu, 0.05, training=True) 


#-------------------------# 
# evaluate the network 
x1_v, y1_selu_dropout_v, \ 
x2_v, y2_selu_dropout_v, \ 
x3_v, y3_selu_dropout_v, \ 
= sess.run([x1, y1_selu_dropout, x2, y2_selu_dropout, x3, y3_selu_dropout]) 

#-------------------------# 
# print each layer's mean and standard deviation (1st line: input; 2nd line: output) 
print("Layer 1") 
print(np.mean(x1_v), np.std(x1_v)) 
print(np.mean(y1_selu_dropout_v), np.std(y1_selu_dropout_v)) 
print("Layer 2") 
print(np.mean(x2_v), np.std(x2_v)) 
print(np.mean(y2_selu_dropout_v), np.std(y2_selu_dropout_v)) 
print("Layer 3") 
print(np.mean(x3_v), np.std(x3_v)) 
print(np.mean(y3_selu_dropout_v), np.std(y3_selu_dropout_v)) 

Even after three layers, the mean and standard deviation remain close to 0 and 1, respectively (for each layer, the first line is its input and the second its output):

Layer 1 
-0.0101213033749 1.01375071842 
0.0106228883975 1.09375593322 
Layer 2 
0.0106228883975 1.09375593322 
-0.027910206754 1.12216643393 
Layer 3 
-0.027910206754 1.12216643393 
-0.131790078631 1.09698413493 