TensorFlow with XLA does not use multiple GPUs simultaneously

I am trying to use XLA on a multi-GPU machine, but when I turn on the XLA JIT, TensorFlow does not use the GPUs at the same time.

With XLA on, gpu0 and gpu1 become active only alternately.

(screenshots of GPU utilization with XLA on)

With XLA off, gpu0 and gpu1 are both active at the same time.

(screenshot of GPU utilization with XLA off)

What is happening in my environment?
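For reference, the XLA JIT is toggled with the session config below (the same line that is commented out in the full code that follows):

config_proto = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
# enabling this line reproduces the alternating-GPU behaviour
config_proto.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
sess = tf.Session(config=config_proto)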

My code:

import tensorflow as tf 
from pathlib import Path 
import time 

INPUT_SIZE = 64 
INPUT_CHANNELS = 1 
MINIBATCH_SIZE = 32 
NUM_ITERATIONS = 200000 
NUM_GPU = 2 

def read_op(filename_queue, reader): 
    _, raw = reader.read(filename_queue) 

    read_image = tf.image.decode_jpeg(
     raw, channels=INPUT_CHANNELS) 
    read_image = tf.to_float(read_image)/255. 
    read_image = tf.image.resize_images(read_image, [INPUT_SIZE, INPUT_SIZE]) 
    return read_image 

def inference(image, log_suffix): 
    # autoencoder model for multi-GPU testing 
    # this model has no particular meaning 
    def w_init(initial_weight=1e-3): 
     return tf.truncated_normal_initializer(stddev=initial_weight) 

    def make_conv(x, out_ch, stride=[1,1,1,1]): 
     shape = x.get_shape().as_list() 

     with tf.device('/cpu:0'): 
      conv_w = tf.get_variable(initializer=w_init(), name='weight', 
       shape=[7, 7, shape[3], out_ch]) 

     conv = tf.nn.conv2d(x, conv_w, stride, padding='SAME') 
     mean, var = tf.nn.moments(conv, [0]) 
     conv = tf.nn.batch_normalization(conv, mean, var, None, None, 1e-9) 

     return tf.nn.relu(conv) 

    def make_deconv(x, out_shape, bn=True): 
     shape = x.get_shape().as_list() 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(), name='weight', 
       shape=[7, 7, out_shape[3], shape[3]]) 

     deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1,2,2,1]) 
     mean, var = tf.nn.moments(deconv, [0]) 

     if bn: deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9) 

     return tf.nn.relu(deconv) 

    def make_deconv_same(x, out_shape, activate=tf.nn.relu, bn=True, scale=1e-3): 
     shape = x.get_shape().as_list() 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(), name='weight', 
       shape=[7, 7, out_shape[3], shape[3]]) 

     deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1,1,1,1]) 
     mean, var = tf.nn.moments(deconv, [0]) 

     if bn: deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9) 

     return activate(deconv) 

    with tf.variable_scope('conv1'): 
     conv1 = make_conv(image, 128) 
    with tf.variable_scope('conv2'): 
     conv2 = make_conv(conv1, 128) 
    with tf.variable_scope('conv3'): 
     conv3 = make_conv(conv2, 160, stride=[1,2,2,1]) 
    with tf.variable_scope('conv4'): 
     conv4 = make_conv(conv3, 160) 
    with tf.variable_scope('conv5'): 
     conv5 = make_conv(conv4, 192, stride=[1,2,2,1]) 
    with tf.variable_scope('conv6'): 
     conv6 = make_conv(conv5, 192) 
    with tf.variable_scope('conv7'): 
     conv7 = make_conv(conv6, 256, stride=[1,2,2,1]) 
    with tf.variable_scope('conv8'): 
     conv8 = make_conv(conv7, 256) 
    with tf.variable_scope('linear1'): 
     feature_length = 300 
     shape = conv8.get_shape().as_list() 
     vec_length = shape[1] * shape[2] * shape[3] 
     in_vec = tf.reshape(conv8,[-1, vec_length]) 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(1e-2), name='weight', 
       shape=[vec_length, feature_length]) 
      b = tf.get_variable(initializer=w_init(1e-2), name='bias', 
       shape=[feature_lengh]) 

     linear1 = tf.matmul(in_vec, w) + b 
     mean, var = tf.nn.moments(linear1, [0]) 
     linear1 = tf.nn.batch_normalization(linear1, mean, var, None, None, 1e-9) 
     linear1 = tf.nn.sigmoid(linear1) 
    with tf.variable_scope('linear2'): 
     in_shape = linear1.get_shape().as_list() 
     in_length = in_shape[1] 
     out_shape = conv8.get_shape().as_list() 
     out_length = out_shape[1] * out_shape[2] * out_shape[3] 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(1e-2), name='weight', 
       shape=[in_length, out_length]) 
      b = tf.get_variable(initializer=w_init(1e-2), name='bias', 
       shape=[out_length]) 

     linear2 = tf.matmul(linear1, w) + b 
     mean, var = tf.nn.moments(linear2, [0]) 
     linear2 = tf.nn.batch_normalization(linear2, mean, var, None, None, 1e-9) 
     linear2 = tf.nn.sigmoid(linear2) 
     linear2 = tf.reshape(linear2, out_shape) 
    with tf.variable_scope('deconv1'): 
     deconv1 = make_deconv_same(linear2, conv7.get_shape()) 
    with tf.variable_scope('deconv2'): 
     deconv2 = make_deconv  (deconv1, conv6.get_shape()) 
    with tf.variable_scope('deconv3'): 
     deconv3 = make_deconv_same(deconv2, conv5.get_shape()) 
    with tf.variable_scope('deconv4'): 
     deconv4 = make_deconv  (deconv3, conv4.get_shape()) 
    with tf.variable_scope('deconv5'): 
     deconv5 = make_deconv_same(deconv4, conv3.get_shape()) 
    with tf.variable_scope('deconv6'): 
     deconv6 = make_deconv  (deconv5, conv2.get_shape()) 
    with tf.variable_scope('deconv7'): 
     deconv7 = make_deconv_same(deconv6, conv1.get_shape()) 
    with tf.variable_scope('deconv8'): 
     deconv8 = make_deconv_same(deconv7, image.get_shape(), bn=False, scale=1e-1) 

    with tf.device('/cpu:0'): 
     image_log = tf.summary.image('output'+log_suffix, deconv8, collections=['image_log']) 
     image_log = tf.summary.image('input'+log_suffix, image, collections=['image_log']) 

    return deconv8 

def loss(label, out, global_step, log_suffix): 
    with tf.name_scope('loss'): 
     l = tf.squared_difference(label, out) 

      # scaled up for TensorBoard's logarithmic graph mode 
     lv = tf.reduce_mean(l) * 1e+7 

     with tf.device('/cpu:0'): 
      loss_log = tf.summary.scalar('loss'+log_suffix,lv) 

    return l 

def average_gradients(tower_grads): 
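    # Averages the per-variable gradients across the GPU towers.
    # tower_grads is a list (one entry per tower) of lists of (gradient, variable) pairs.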
    with tf.name_scope('average_gradients'): 
     average_grads = [] 

     for grad_and_vars in zip(*tower_grads): 
      grads = [] 

      for g, u in grad_and_vars: 
       expanded_g = tf.expand_dims(g,0) 
       grads.append(expanded_g) 

      grad = tf.concat(grads, axis=0) 
      grad = tf.reduce_mean(grad,0) 

      v = grad_and_vars[0][1] 
      grad_and_var = (grad, v) 
      average_grads.append(grad_and_var) 

     for grad,var in average_grads: 
      with tf.device('/cpu:0'): 
       tf.summary.histogram('grads/'+var.name, grad, collections=['grads']) 

    return average_grads 

def main(): 
    global NUM_GPU, MINIBATCH_SIZE 

    # many jpeg images 
    sample_dir = Path('./training_samples') 
    file_list = [p for p in sample_dir.iterdir() if p.suffix == '.jpg'] 
    file_list = list(map(str, file_list)) 

    with tf.Graph().as_default(), tf.device('/cpu:0'): 
     config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False) 
     # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
     # if XLA is on, the problem occurs 
     # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
     #config_proto.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 
     sess = tf.Session(config=config_proto) 

     global_step = tf.get_variable(
      'global_step', [], initializer=tf.constant_initializer(0), trainable=False) 

     with tf.variable_scope('optimizer'): 
      opt = tf.train.AdamOptimizer(1e-6) 

     with tf.variable_scope('input'): 
      filename_queue = tf.train.string_input_producer(file_list) 
      reader = tf.WholeFileReader() 
      images_list = [ 
       tf.train.shuffle_batch(
        [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8), 
       tf.train.shuffle_batch(
        [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8)] 

     tower_grads = [] 
     reuse = False 
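     # build one copy (tower) of the model per GPU; variables are created on the CPU
     # inside inference() and shared between towers via reuse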
     for i in range(NUM_GPU): 
      with tf.device('/gpu:{}'.format(i)): 
       with tf.variable_scope('model', reuse=reuse, caching_device='/gpu:{}'.format(i)): 
        infer = inference(images_list[i], '/tower_{}'.format(i)) 
        reuse = True 
        tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i)) 

       grads = opt.compute_gradients(tower_loss) 
       tower_grads.append(grads) 

     grads = average_gradients(tower_grads) 
     train_op = opt.apply_gradients(grads, global_step=global_step) 

     image_log_op = tf.summary.merge(tf.get_collection('image_log')) 
     loss_log_op = tf.summary.merge_all() 
     grads_log_op = tf.summary.merge(tf.get_collection('grads')) 

     writer = tf.summary.FileWriter('logs') 
     sess.run(tf.global_variables_initializer()) 
     writer.add_graph(tf.get_default_graph()) 
     coordinator = tf.train.Coordinator() 

     threads = tf.train.start_queue_runners(sess=sess, coord=coordinator) 

     for i in range(NUM_ITERATIONS): 
      print('iteration: ',i) 

      start = time.time() 

      if i % 2 == 0: 
       _, loss_log, image_log = sess.run([train_op, loss_log_op, image_log_op]) 
       writer.add_summary(loss_log, i) 
       writer.add_summary(image_log, i) 
       writer.flush() 
      else: 
       _ = sess.run([train_op]) 

      end = time.time() 

      print('time = {}'.format(end - start)) 

     writer.close() 

if __name__ == '__main__': 
    main() 

Environment information

Operating system: Ubuntu 16.04
GPU: GTX 1080 x2
Configure options (GCC): -march=native -O3
Configure options (CUDA compute capability): 6.1

Installed versions of CUDA and cuDNN:

/usr/local/cuda/lib64/libcudadevrt.a 
/usr/local/cuda/lib64/libcudart.so -> libcudart.so.8.0 
/usr/local/cuda/lib64/libcudart.so.8.0 -> libcudart.so.8.0.44 
/usr/local/cuda/lib64/libcudart.so.8.0.44 
/usr/local/cuda/lib64/libcudart_static.a 
/usr/local/cuda/lib64/libcudnn.so -> libcudnn.so.5.1.5 
/usr/local/cuda/lib64/libcudnn.so.5 -> libcudnn.so.5.1.5 
/usr/local/cuda/lib64/libcudnn.so.5.1.5 
/usr/local/cuda/lib64/libcudnn_static.a 

TensorFlow commit hash: c56c873fbaf976d26d487ad57c8efbc87f05331c

Current bazel version:

....... 
Build label: 0.4.4 
Build target: bazel-out/local-fastbuild/bin/src/main/java/com/google/devtools/build/lib/bazel/BazelServer_deploy.jar 
Build time: Wed Feb 1 18:54:21 2017 (1485975261) 
Build timestamp: 1485975261 
Build timestamp as int: 1485975261 

Answer


At the moment, XLA is single-GPU only.
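If you want to keep XLA while still using both GPUs, one thing to try instead of the global JIT level is scoping the JIT to each tower. A minimal sketch, assuming your build includes tf.contrib.compiler.jit (this module may not be present at the commit you are using):

from tensorflow.contrib.compiler import jit

tower_grads = []
reuse = False
for i in range(NUM_GPU):
    with tf.device('/gpu:{}'.format(i)):
        # compile each tower separately instead of setting global_jit_level
        with jit.experimental_jit_scope():
            with tf.variable_scope('model', reuse=reuse):
                infer = inference(images_list[i], '/tower_{}'.format(i))
                reuse = True
                tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i))
        grads = opt.compute_gradients(tower_loss)
        tower_grads.append(grads)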

OK, I see. Thank you. – Yusuke
