TensorFlow Vanishing Gradients

I am trying to replicate the Recurrent Spatial Transformer Network implemented here (https://github.com/skaae/recurrent-spatial-transformer-code), but the loss does not decrease at all.
The network configuration is as follows:
1 - relu activations.
2 - Weight initialization for the weights, zero initialization for the biases.
3 - The cost function is softmax_cross_entropy_with_logits.
4 - The optimizer is RMSProp (I tried epsilon values of 1e-6 and 1e-10).
5 - Gradient clipping by value (see the short sketch right after this list).
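For concreteness, items 4 and 5 refer to the usual compute_gradients / clip_by_value / apply_gradients pattern. A minimal sketch, assuming cross_entropy is the scalar loss defined later in the code (the full listing below uses exactly this pattern):

opt = tf.train.RMSPropOptimizer(learning_rate=0.0005, epsilon=1e-6)  # item 4: RMSProp; epsilon of 1e-6 and 1e-10 were tried
gvs = opt.compute_gradients(cross_entropy)                           # list of (gradient, variable) pairs
capped_gvs = [(tf.clip_by_value(g, -1., 1.), v) for g, v in gvs]     # item 5: clip each gradient element-wise to [-1, 1]
optimizer = opt.apply_gradients(capped_gvs)                          # training op that applies the clipped gradients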
What should I do next? The detailed code is below.
import tensorflow as tf
from spatial_transformer import transformer
from tensorflow.python.ops import rnn,rnn_cell
import numpy as np
from tf_utils import weight_variable, bias_variable, dense_to_one_hot
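# Note: weight_variable and bias_variable come from tf_utils and are not shown in
# this post. A plausible sketch of them (an assumption, not the actual tf_utils
# code), consistent with item 2 above:
#   def weight_variable(shape, name):
#       return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)
#   def bias_variable(shape, name):
#       return tf.Variable(tf.zeros(shape), name=name)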
# %% load data
mnist_cluttered = np.load('data/mnist_sequence3_sample_8distortions_9x9.npz')
X_train = mnist_cluttered['X_train']
y_train = mnist_cluttered['y_train']
X_valid = mnist_cluttered['X_valid']
y_valid = mnist_cluttered['y_valid']
X_test = mnist_cluttered['X_test']
y_test = mnist_cluttered['y_test']
y_train = np.reshape(y_train,[y_train.size,1])
y_valid = np.reshape(y_valid,[y_valid.size,1])
y_test = np.reshape(y_test,[y_test.size,1])
# % turn from dense to one hot representation
Y_train = dense_to_one_hot(y_train, n_classes=10)
Y_valid = dense_to_one_hot(y_valid, n_classes=10)
Y_test = dense_to_one_hot(y_test, n_classes=10)
Y_train = np.reshape(Y_train,[y_train.size/3,3,10])
Y_valid = np.reshape(Y_valid,[y_valid.size/3,3,10])
Y_test = np.reshape(Y_test,[y_test.size/3,3,10])
# %% Placeholders for 100x100 resolution
x = tf.placeholder(tf.float32, [None, 10000])
y = tf.placeholder(tf.float32, [None,3, 10])
x_tensor = tf.reshape(x, [-1, 100, 100, 1])
y_tensor = tf.reshape(y,[-1 ,10])
#%% localizaton network
keep_prob = tf.placeholder(tf.float32)
l_pool0_loc = tf.nn.max_pool(x_tensor,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID')
W_conv0_loc = weight_variable([3,3,1,20],'W_conv0_loc')
b_conv0_loc = bias_variable([20],'b_conv0_loc')
l_conv0_loc = tf.nn.relu(tf.nn.conv2d(l_pool0_loc,W_conv0_loc,strides=[1,1,1,1],padding='VALID')+b_conv0_loc)
l_pool1_loc = tf.nn.max_pool(l_conv0_loc,ksize=[1,2,2,1],strides =[1,2,2,1],padding='VALID')
W_conv1_loc = weight_variable([3,3,20,20],'W_conv1_loc')
b_conv1_loc = bias_variable([20],'b_conv1_loc')
l_conv1_loc = tf.nn.relu(tf.nn.conv2d(l_pool1_loc,W_conv1_loc,strides=[1,1,1,1],padding='VALID')+b_conv1_loc)
l_pool2_loc = tf.nn.max_pool(l_conv1_loc,ksize=[1,2,2,1],strides =[1,2,2,1],padding='VALID')
W_conv2_loc = weight_variable([3,3,20,20],'W_conv2_loc')
b_conv2_loc = bias_variable([20],'b_conv2_loc')
l_conv2_loc = tf.nn.relu(tf.nn.conv2d(l_pool2_loc,W_conv2_loc,strides=[1,1,1,1],padding='VALID')+b_conv2_loc)
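# After pool(2x2) -> conv(3x3) -> pool -> conv -> pool -> conv, all VALID, the
# 100x100 input has shrunk to a 9x9x20 feature map; flatten it for the GRU.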
l_conv2_loc = tf.reshape(l_conv2_loc,[-1 ,9*9*20 ])
# Replicate input for Gated Recurrent Unit
l_conv2_loc = tf.tile(l_conv2_loc,[1,3])
l_conv2_loc = tf.split(1,3,l_conv2_loc)
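# l_conv2_loc is now a list of 3 identical [batch, 9*9*20] tensors, i.e. the same
# localization features are fed to the GRU at each of the 3 time steps (one per digit).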
# Gated Recurrent Unit
gru_cell = rnn_cell.GRUCell(num_units=256)
output, state = rnn.rnn(gru_cell,inputs=l_conv2_loc,dtype=tf.float32)
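# output is a list of 3 per-step [batch, 256] tensors; pack and flatten them
# (time-major) into a single [3*batch, 256] tensor for the affine-parameter layer.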
output = tf.reshape(output,[-1,256])
initial = tf.zeros([256,6])
W_fc1_loc = tf.Variable(initial_value=initial,name='W_fc1_loc')
# Use identity transformation as starting point
initial = np.array([[1., 0, 0], [0, 1., 0]])
initial = initial.astype('float32')
initial = initial.flatten()
b_fc1_loc = tf.Variable(initial_value=initial,name='b_fc1_loc')
l_fc1_loc = tf.add(tf.matmul(output,W_fc1_loc), b_fc1_loc)
# %% We'll create a spatial transformer module to identify discriminative patches
downsample = 3
out_size = (100/downsample, 100/downsample)
l_transform = transformer(tf.tile(x_tensor,[3,1,1,1]), l_fc1_loc, out_size)
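# l_transform stacks one (100/3)x(100/3) transformed patch per (time step, image)
# pair along the batch dimension, matching the [3*batch, 6] affine parameters.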
# %% Classification Network
W_conv0_out = weight_variable([3,3,1,32],'W_conv0_out')
b_conv0_out = bias_variable([32],'b_conv0_out')
l_conv0_out = tf.nn.relu(tf.nn.conv2d(l_transform,W_conv0_out,strides=[1,1,1,1],padding='VALID')+b_conv0_out)
l_pool1_out = tf.nn.max_pool(l_conv0_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')
#l_drp1_out = tf.nn.dropout(l_pool1_out,keep_prob)
W_conv1_out = weight_variable([3,3,32,32],'W_conv1_out')
b_conv1_out = bias_variable([32],'b_conv1_out')
l_conv1_out = tf.nn.relu(tf.nn.conv2d(l_pool1_out,W_conv1_out,strides=[1,1,1,1],padding='VALID')+b_conv1_out)
l_pool2_out = tf.nn.max_pool(l_conv1_out,ksize=[1,2,2,1], strides=[1,2,2,1],padding='VALID')
#l_drp2_out = tf.nn.dropout(l_pool2_out,keep_prob)
W_conv2_out = weight_variable([3,3,32,32],'W_conv2_out')
b_conv2_out = bias_variable([32],'b_conv2_out')
l_conv2_out = tf.nn.relu(tf.nn.conv2d(l_pool2_out,W_conv2_out,strides=[1,1,1,1],padding='VALID')+b_conv2_out)
# %% We'll now reshape so we can connect to a fully-connected layer:
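# The 33x33 input patch shrinks to 4x4x32 after three VALID 3x3 convolutions
# and two 2x2 poolings, hence the 4*4*32 below.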
l_conv2_out_flat = tf.reshape(l_conv2_out, [-1, 4*4*32])
# %% Create a fully-connected layer:
n_fc = 400
W_fc1 = tf.get_variable('W_fc1',shape=[4*4*32,n_fc],initializer=tf.contrib.layers.xavier_initializer())
#W_fc1 = weight_variable([4*4*32,n_fc],'W_fc1')
b_fc1=bias_variable([n_fc],'b_fc1')
h_fc1 = tf.nn.relu(tf.add(tf.matmul(l_conv2_out_flat, W_fc1) , b_fc1))
# %% And finally our softmax layer:
W_fc2 = tf.get_variable('W_fc2',shape=[n_fc, 10],initializer=tf.contrib.layers.xavier_initializer())
#W_fc2 = weight_variable([n_fc,10],'W_fc2')
b_fc2=bias_variable([10],'b_fc2')
y_logits = tf.add(tf.matmul(h_fc1, W_fc2) , b_fc2)
# %% Monitor accuracy
correct_prediction = tf.equal(tf.argmax(y_logits, 1), tf.argmax(y_tensor, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
# %% Define loss/eval/training functions
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(y_logits, y_tensor))
opt = tf.train.RMSPropOptimizer(0.0005,epsilon=1e-6)
#opt = tf.train.AdagradOptimizer(0.01)
#optimizer = opt.minimize(cross_entropy)
gvs = opt.compute_gradients(cross_entropy)
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
optimizer = opt.apply_gradients(capped_gvs)
# %% We'll now train in minibatches and report accuracy, loss:
num_batches = 600
n_epochs = 300
batch_size = 100
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    for epoch_i in range(n_epochs):
        #print ('epoch: ' + str(epoch_i))
        shuffle = np.random.permutation(X_train.shape[0])
        avg_cost = 0.
        for iter_i in range(num_batches - 1):
            idx = shuffle[iter_i*batch_size:(iter_i+1)*batch_size]
            batch_xs = X_train[idx]
            batch_ys = Y_train[idx]
            _, c = sess.run([optimizer, cross_entropy], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c/num_batches
            print('iter: ' + str(iter_i) + ' >> ' + ' MiniBatch Cost: ' + str(c))
            # gr_print = sess.run([grads for grads, _ in gvs], feed_dict={x: batch_xs, y: batch_ys})
            # print('iter: ' + str(iter_i))
            # for t in gr_print:
            #     print(np.linalg.norm(t))
    saver = tf.train.Saver()
    saver.save(sess, "save/my-model")
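As a diagnostic aside (not part of the original script): the commented-out block inside the minibatch loop can be turned into a small helper that prints the L2 norm of each pre-clipping gradient, using the gvs, x and y defined above. This is only a sketch of one way to check the vanishing-gradient suspicion, not a fix; consistently tiny norms for the localization-network variables would support it.

def report_gradient_norms(sess, gvs, feed_dict):
    # Evaluate the (pre-clipping) gradients from opt.compute_gradients(cross_entropy)
    # and print one L2 norm per variable.
    grad_values = sess.run([g for g, _ in gvs], feed_dict=feed_dict)
    for (g, var), value in zip(gvs, grad_values):
        print('%-20s gradient L2 norm: %.3e' % (var.op.name, np.linalg.norm(value)))

# Example call inside the minibatch loop:
# report_gradient_norms(sess, gvs, {x: batch_xs, y: batch_ys})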