2017-01-25 6 views
0

ここで実装されているRecurrent Spatial Transformer Network(https://github.com/skaae/recurrent-spatial-transformer-code)を再現しようとしていますが、損失がまったく減少しません。 reluアクティベーション - TensorFlow Vanishing Gradients

1:

ネットワークの構成は以下の通りです。

2 - 重み付けのための重みの初期化、バイアスのゼロ初期化。

3 - コスト関数は softmax_cross_entropy_with_logits です。

4 - オプティマイザは RMSProp です(epsilon は 1e-6 と 1e-10 を試しました)。

5 - 値による勾配クリッピング。

次に何を試せばよいでしょうか?

以下が詳細なコードです。

import tensorflow as tf 
from spatial_transformer import transformer 
from tensorflow.python.ops import rnn,rnn_cell 
import numpy as np 
from tf_utils import weight_variable, bias_variable, dense_to_one_hot 

# %% load data
# Load the cluttered MNIST sequence archive. The label arrays are regrouped
# below as 3 digits per image, matching the "sequence3" in the file name.
mnist_cluttered = np.load('data/mnist_sequence3_sample_8distortions_9x9.npz')

X_train = mnist_cluttered['X_train']
y_train = mnist_cluttered['y_train']
X_valid = mnist_cluttered['X_valid']
y_valid = mnist_cluttered['y_valid']
X_test = mnist_cluttered['X_test']
y_test = mnist_cluttered['y_test']

# Flatten every label array into a single column: one digit per row.
y_train = np.reshape(y_train, [y_train.size, 1])
y_valid = np.reshape(y_valid, [y_valid.size, 1])
y_test = np.reshape(y_test, [y_test.size, 1])

# %% turn from dense to one hot representation
Y_train = dense_to_one_hot(y_train, n_classes=10)
Y_valid = dense_to_one_hot(y_valid, n_classes=10)
Y_test = dense_to_one_hot(y_test, n_classes=10)

# Regroup the one-hot rows as [n_images, 3 digits, 10 classes].
# Floor division keeps the leading dimension an int: under Python 3 a plain
# `/` yields a float, which np.reshape rejects as a shape entry.
Y_train = np.reshape(Y_train, [y_train.size // 3, 3, 10])
Y_valid = np.reshape(Y_valid, [y_valid.size // 3, 3, 10])
Y_test = np.reshape(Y_test, [y_test.size // 3, 3, 10])

# %% Placeholders for 100x100 resolution
# Flattened input images: one row of 100*100 pixels per image.
x = tf.placeholder(tf.float32, [None, 10000])
# One-hot targets: 3 digits per image, 10 classes per digit.
y = tf.placeholder(tf.float32, [None, 3, 10])

# Images as [batch, height, width, channels] for the conv/pool layers.
x_tensor = tf.reshape(x, [-1, 100, 100, 1])

# The network produces its predictions step-major: the input images are tiled
# 3 times along the batch axis and the per-step RNN outputs are stacked as
# [3, batch, 256], so logit row t*batch + b is digit t of image b.
# Reshaping y directly would give image-major rows (b*3 + t) and pair most
# logits with the wrong digit, so move the step axis to the front first.
y_tensor = tf.reshape(tf.transpose(y, [1, 0, 2]), [-1, 10])

# %% Localization network
# Downsamples the input image with three pool/conv stages, feeds the flattened
# features to a GRU for 3 steps, and maps every GRU output to the 6 parameters
# of an affine transformation.

# Dropout keep probability. In the visible code it is only referenced by the
# commented-out dropout layers of the classification network.
keep_prob = tf.placeholder(tf.float32)

# 2x2 max-pool with stride 2: 100x100 -> 50x50.
l_pool0_loc = tf.nn.max_pool(x_tensor,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID')

W_conv0_loc = weight_variable([3,3,1,20],'W_conv0_loc')

b_conv0_loc = bias_variable([20],'b_conv0_loc')

# 3x3 VALID convolution + ReLU: 50x50x1 -> 48x48x20.
l_conv0_loc = tf.nn.relu(tf.nn.conv2d(l_pool0_loc,W_conv0_loc,strides=[1,1,1,1],padding='VALID')+b_conv0_loc)

# 48x48 -> 24x24.
l_pool1_loc = tf.nn.max_pool(l_conv0_loc,ksize=[1,2,2,1],strides =[1,2,2,1],padding='VALID')

W_conv1_loc = weight_variable([3,3,20,20],'W_conv1_loc')

b_conv1_loc = bias_variable([20],'b_conv1_loc')

# 24x24 -> 22x22.
l_conv1_loc = tf.nn.relu(tf.nn.conv2d(l_pool1_loc,W_conv1_loc,strides=[1,1,1,1],padding='VALID')+b_conv1_loc)

# 22x22 -> 11x11.
l_pool2_loc = tf.nn.max_pool(l_conv1_loc,ksize=[1,2,2,1],strides =[1,2,2,1],padding='VALID')

W_conv2_loc = weight_variable([3,3,20,20],'W_conv2_loc')

b_conv2_loc = bias_variable([20],'b_conv2_loc')

# 11x11 -> 9x9, which is the 9*9*20 feature size flattened below.
l_conv2_loc = tf.nn.relu(tf.nn.conv2d(l_pool2_loc,W_conv2_loc,strides=[1,1,1,1],padding='VALID')+b_conv2_loc)

# Flatten to one feature vector per image.
l_conv2_loc = tf.reshape(l_conv2_loc,[-1 ,9*9*20 ])

# Replicate input for Gated Recurrent Unit:
# tile the feature vector 3 times along the feature axis ...
l_conv2_loc = tf.tile(l_conv2_loc,[1,3])

# ... and split it back into a list of 3 identical per-step inputs
# (legacy tf.split argument order: split_dim, num_split, value).
l_conv2_loc = tf.split(1,3,l_conv2_loc)

# Gated Recurrent Unit

gru_cell = rnn_cell.GRUCell(num_units=256)

# Run the GRU over the 3 list entries; `output` is a list with one tensor per step.
output, state = rnn.rnn(gru_cell,inputs=l_conv2_loc,dtype=tf.float32)

# Passing the list to tf.reshape stacks the per-step outputs before flattening,
# so the resulting rows are ordered step by step (all images for step 0 first).
output = tf.reshape(output,[-1,256])

# Zero weights: at initialization the predicted transformation depends only on the bias.
initial = tf.zeros([256,6])


W_fc1_loc = tf.Variable(initial_value=initial,name='W_fc1_loc')

# Use identity transformation as starting point
initial = np.array([[1., 0, 0], [0, 1., 0]])
initial = initial.astype('float32')
initial = initial.flatten()
b_fc1_loc = tf.Variable(initial_value=initial,name='b_fc1_loc')


# 6 affine transformation parameters per (step, image) row.
l_fc1_loc = tf.add(tf.matmul(output,W_fc1_loc), b_fc1_loc)


# %% We'll create a spatial transformer module to identify discriminative patches

# Factor by which the transformer output is smaller than the 100x100 input.
downsample = 3

# Floor division keeps the output size integral (33x33). Under Python 3 a
# plain `/` would give 33.33..., which cannot serve as an image dimension and
# would not match the 4*4*32 flatten size used by the classification network.
out_size = (100 // downsample, 100 // downsample)

# Repeat the image batch 3 times along axis 0 so that every row of l_fc1_loc
# (one per step and image) is paired with a copy of its input image.
l_transform = transformer(tf.tile(x_tensor, [3, 1, 1, 1]), l_fc1_loc, out_size)

# %% Classification Network
# Three 3x3 VALID conv + ReLU layers (with 2x2 max-pooling after the first
# two) applied to the transformer output, followed by two fully-connected
# layers that produce 10 class logits per row.


W_conv0_out = weight_variable([3,3,1,32],'W_conv0_out')     

b_conv0_out = bias_variable([32],'b_conv0_out')

# Assuming a 33x33 transformer output (100 / downsample): 33x33x1 -> 31x31x32.
l_conv0_out = tf.nn.relu(tf.nn.conv2d(l_transform,W_conv0_out,strides=[1,1,1,1],padding='VALID')+b_conv0_out)

# 31x31 -> 15x15.
l_pool1_out = tf.nn.max_pool(l_conv0_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')

# Dropout after the first pooling stage is currently disabled.
#l_drp1_out = tf.nn.dropout(l_pool1_out,keep_prob)

W_conv1_out = weight_variable([3,3,32,32],'W_conv1_out')

b_conv1_out = bias_variable([32],'b_conv1_out')

# 15x15 -> 13x13.
l_conv1_out = tf.nn.relu(tf.nn.conv2d(l_pool1_out,W_conv1_out,strides=[1,1,1,1],padding='VALID')+b_conv1_out)

# 13x13 -> 6x6.
l_pool2_out = tf.nn.max_pool(l_conv1_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')

# Dropout after the second pooling stage is currently disabled.
#l_drp2_out = tf.nn.dropout(l_pool2_out,keep_prob)

W_conv2_out = weight_variable([3,3,32,32],'W_conv2_out')  

b_conv2_out = bias_variable([32],'b_conv2_out')

# 6x6 -> 4x4, which is the 4*4*32 feature size flattened below.
l_conv2_out = tf.nn.relu(tf.nn.conv2d(l_pool2_out,W_conv2_out,strides=[1,1,1,1],padding='VALID')+b_conv2_out)



# %% We'll now reshape so we can connect to a fully-connected layer:
l_conv2_out_flat = tf.reshape(l_conv2_out, [-1, 4*4*32])

# %% Create a fully-connected layer:
n_fc = 400

# Xavier-initialized weights; the weight_variable alternative is kept below for reference.
W_fc1 = tf.get_variable('W_fc1',shape=[4*4*32,n_fc],initializer=tf.contrib.layers.xavier_initializer())

#W_fc1 = weight_variable([4*4*32,n_fc],'W_fc1')

b_fc1=bias_variable([n_fc],'b_fc1')


h_fc1 = tf.nn.relu(tf.add(tf.matmul(l_conv2_out_flat, W_fc1) , b_fc1))

# %% And finally our softmax layer:
# Only the logits are computed here; the softmax itself is applied inside the loss.

W_fc2 = tf.get_variable('W_fc2',shape=[n_fc, 10],initializer=tf.contrib.layers.xavier_initializer())

#W_fc2 = weight_variable([n_fc,10],'W_fc2')

b_fc2=bias_variable([10],'b_fc2')

# Unnormalized class scores: one row per (step, image), 10 columns.
y_logits = tf.add(tf.matmul(h_fc1, W_fc2) , b_fc2)



# %% Monitor accuracy
# Fraction of rows whose highest-scoring class equals the one-hot target class.
correct_prediction = tf.equal(tf.argmax(y_logits, 1), tf.argmax(y_tensor, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))


# %% Define loss/eval/training functions
# Name the arguments explicitly: logits and labels are easy to swap when
# passed positionally, and TensorFlow >= 1.0 rejects positional arguments here.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=y_logits, labels=y_tensor))

opt = tf.train.RMSPropOptimizer(0.0005, epsilon=1e-6)

# Compute the gradients explicitly so they can be clipped before being applied.
gvs = opt.compute_gradients(cross_entropy)

# Clip every gradient element to [-1, 1]. compute_gradients returns None for
# variables the loss does not depend on; those pairs are skipped because
# clip_by_value cannot handle None.
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
              for grad, var in gvs if grad is not None]

# Training op: one RMSProp update using the clipped gradients.
optimizer = opt.apply_gradients(capped_gvs)




# %% We'll now train in minibatches and report accuracy, loss:

num_batches = 600
n_epochs = 300
batch_size = 100

# Create the saver up front; the checkpoint itself must be written while the
# session is still open.
saver = tf.train.Saver()

with tf.Session() as sess:

    sess.run(tf.initialize_all_variables())

    # As in the original loop, the last of the num_batches slices is not visited.
    iters_per_epoch = num_batches - 1

    for epoch_i in range(n_epochs):
        # Visit the training samples in a fresh random order every epoch.
        shuffle = np.random.permutation(X_train.shape[0])
        avg_cost = 0.
        for iter_i in range(iters_per_epoch):
            idx = shuffle[iter_i * batch_size:(iter_i + 1) * batch_size]
            batch_xs = X_train[idx]
            batch_ys = Y_train[idx]

            _, c = sess.run([optimizer, cross_entropy],
                            feed_dict={x: batch_xs, y: batch_ys})

            # Normalize by the number of iterations actually run so that
            # avg_cost is a true mean of the minibatch costs.
            avg_cost += c / iters_per_epoch
            print('iter: ' + str(iter_i) +' >> ' +' MiniBatch Cost: ' +str(c))

            # Debugging hint for vanishing gradients: run the gradient tensors
            # in gvs on this batch and print np.linalg.norm of each result.

        print('epoch: ' + str(epoch_i) + ' >> ' + ' Average Cost: ' + str(avg_cost))

    # Save inside the `with` block: once it exits, the session is closed and
    # saver.save would fail. NOTE(review): the "save" directory is assumed to
    # exist already - confirm.
    saver.save(sess, "save/my-model")

` 

答えて

0

ドロップアウトを使うことができ、これは非常に有効だと思います。LSTMやRNNを使用している場合は、ドロップアウトをとても簡単に実装できます。

def create_rnn_cell():
    """Build one LSTM cell, with output dropout unless running in test mode.

    Reads ``self.args.hiddenSize``, ``self.args.test`` and ``self.args.dropout``.
    ``self`` is not a parameter, so it must come from the enclosing scope
    (presumably this function is nested inside a method - confirm).

    Returns:
        The bare ``BasicLSTMCell`` when ``self.args.test`` is truthy, otherwise
        the same cell wrapped in a ``DropoutWrapper`` that keeps all inputs and
        keeps outputs with probability ``self.args.dropout``.
    """
    # Or GRUCell, LSTMCell(args.hiddenSize)
    cell = tf.contrib.rnn.BasicLSTMCell(self.args.hiddenSize)

    if self.args.test:
        # No dropout in test mode.
        return cell

    # TODO: Should use a placeholder instead
    return tf.contrib.rnn.DropoutWrapper(
        cell,
        input_keep_prob=1.0,
        output_keep_prob=self.args.dropout,
    )

バッチ正規化も有効です。ただし、RNNモジュール向けに実装されたBNの例は特に見当たりませんでした。とはいえ、バッチ正規化について学ぶには次が良い例です:

Batch Normalization in tensorflow

また、RNNにバッチ正規化を適用する方法については、次の記事を読んでみてください:

http://olavnymoen.com/2016/07/07/rnn-batch-normalization

関連する問題