[问题贴] session placeholder的问题.

tensorflow

#1
# NOTE(review): per the traceback in this post, these statements are the body of
# model_train(X1_train, X2_train, y_train); the `def` line was lost in the
# paste, which is why the first statement sits at column 0. `data_len` is not
# defined anywhere in this excerpt.
# Build the model graph with is_training=True.
g = Graph(is_training=True)

    # with g.graph.as_default():
    # The Supervisor is bound to the model's graph and writes into hp.logdir.
    # save_model_secs=30 presumably autosaves a checkpoint every 30 s -- confirm
    # against the tf.train.Supervisor docs.
    # summary_op is left at its default here; the traceback in this post shows
    # the failing session.run being issued from supervisor.py `run_loop`, i.e.
    # from the Supervisor itself and without the feed_dict used below.
    sv = tf.train.Supervisor(graph=g.graph,logdir=hp.logdir, save_model_secs=30)

    with sv.managed_session() as sess:
        for epoch in range(hp.num_epoch):
            if sv.should_stop():
                break
            # Declared outside the inner loop so the last mini-batch is still
            # available for the global_step fetch after the loop. Both stay None
            # if data_len < hp.batch_size (the inner loop then never runs).
            X1_train_batch = None
            y_train_batch = None
            # Only full batches are used; a trailing partial batch is dropped.
            for i in range(data_len // hp.batch_size):
                X1_train_batch = X1_train[i * hp.batch_size:(i + 1) * hp.batch_size]
                y_train_batch = y_train[i * hp.batch_size:(i + 1) * hp.batch_size]

                # One optimisation step. Only g.x1 and g.y are fed.
                # NOTE(review): X2_train is passed to model_train by the caller
                # but never fed here -- confirm whether the graph needs it.
                sess.run([g.train_op], feed_dict={g.x1: X1_train_batch, g.y:y_train_batch})

            # Read the current global step, re-feeding the last mini-batch.
            gs = sess.run(g.global_step, feed_dict={g.x1: X1_train_batch, g.y:y_train_batch})

            # Explicit per-epoch checkpoint. The path is built by plain string
            # concatenation, so hp.logdir must end with a path separator for the
            # file to land inside that directory.
            sv.saver.save(sess, hp.logdir + 'model_epoch_%02d_gs_%d' % (epoch + 1, gs))
            print("finished a epoch")


# Script entry point: builds random stand-in data, splits it into train/test
# parts, and starts training.
if __name__ == "__main__":
    # with tf.Session() as sess:
    #     tf.summary.FileWriter('./logdir', Graph(is_training=True).graph)

    # Disabled: loading the real datasets from pickle files. Random arrays of
    # the same layout are generated below instead.
    # X1_file = file(hp.X1_train_path, 'rb')
    # X1 = pickle.load(X1_file)
    # X2_file = file(hp.X2_train_path, 'rb')
    # X2 = pickle.load(X2_file)
    #
    # y1_file = file(hp.y1_train_path, 'rb')
    # y1 = pickle.load(y1_file)
    #
    # y2_file = file(hp.y2_train_path, 'rb')
    # y2 = pickle.load(y2_file)
    # Each of X1 / X2 is a pair of arrays:
    #   [0]: (batch_size, in_timesteps, local_features)
    #   [1]: (batch_size, station_num, in_timesteps, local_features + 1)
    X1 = [np.random.rand(1*hp.batch_size, hp.in_timesteps, hp.local_features),
          np.random.rand(1*hp.batch_size, hp.station_num, hp.in_timesteps, hp.local_features+1)]
    X2 = [np.random.rand(1*hp.batch_size, hp.in_timesteps, hp.local_features),
          np.random.rand(1*hp.batch_size, hp.station_num, hp.in_timesteps, hp.local_features+1)]
    # Targets: (batch_size, out_timesteps, 7); only index 0 of the last axis is
    # used further down.
    y1 = np.random.rand(1*hp.batch_size,hp.out_timesteps, 7)
    y2 = np.random.rand(1*hp.batch_size,hp.out_timesteps, 7)

    # Sample counts; not used again within this excerpt.
    X1_len = X1[0].shape[0]
    X2_len = X2[0].shape[0]
    X_len = X1_len + X2_len
    # Test split: the last hp.batch_size samples of the second dataset.
    y_test_all = y2[-hp.batch_size:]
    y_test = y_test_all[:,:,0]
    X1_test = X2[0][-hp.batch_size:]
    X2_test = X2[1][-hp.batch_size:]

    # Train split: all of the first dataset plus the second dataset minus its
    # last hp.batch_size samples. With the random data above the second dataset
    # has exactly hp.batch_size samples, so that slice is empty and the training
    # set is just the first dataset.
    X1_train = np.concatenate((X1[0], X2[0][:-hp.batch_size]), axis=0)
    X2_train = np.concatenate((X1[1], X2[1][:-hp.batch_size]), axis=0)

    y_train_all = np.concatenate((y1, y2[:-hp.batch_size]), axis=0)
    # Keep only index 0 of the last axis -> shape (N, out_timesteps).
    y_train = y_train_all[:,:,0]
    print(X1_train.shape)
    print(y_train.shape)

    model_train(X1_train, X2_train, y_train)

代码中的Graph()是构图部分,这一部分是在tf.Graph().as_default()的上下文中构图的.

代码报错:
2018-07-26 15:28:30.212778: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA 2018-07-26 15:28:30.315499: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2018-07-26 15:28:30.316097: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: name: GeForce GTX 950M major: 5 minor: 0 memoryClockRate(GHz): 1.124 pciBusID: 0000:01:00.0 totalMemory: 3.95GiB freeMemory: 3.49GiB 2018-07-26 15:28:30.316134: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: GeForce GTX 950M, pci bus id: 0000:01:00.0, compute capability: 5.0) 2018-07-26 15:28:46.528422: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: You must feed a value for placeholder tensor ‘Placeholder_1’ with dtype float and shape [?,48] [[Node: Placeholder_1 = Placeholderdtype=DT_FLOAT, shape=[?,48], _device="/job:localhost/replica:0/task:0/device:GPU:0"]] 2018-07-26 15:28:46.528470: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: You must feed a value for placeholder tensor ‘Placeholder_1’ with dtype float and shape [?,48] [[Node: Placeholder_1 = Placeholderdtype=DT_FLOAT, shape=[?,48], _device="/job:localhost/replica:0/task:0/device:GPU:0"]] 2018-07-26 15:28:46.528492: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: You must feed a value for placeholder tensor ‘Placeholder_1’ with dtype float and shape [?,48] [[Node: Placeholder_1 = Placeholderdtype=DT_FLOAT, shape=[?,48], _device="/job:localhost/replica:0/task:0/device:GPU:0"]] finished a epoch Traceback (most recent call last): File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py”, line 1323, in _do_call return fn(*args) File 
“/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py”, line 1302, in _run_fn status, run_metadata) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py”, line 473, in exit c_api.TF_GetCode(self.status.status)) tensorflow.python.framework.errors_impl.InvalidArgumentError: You must feed a value for placeholder tensor ‘Placeholder_1’ with dtype float and shape [?,48] [[Node: Placeholder_1 = Placeholderdtype=DT_FLOAT, shape=[?,48], _device="/job:localhost/replica:0/task:0/device:GPU:0"]] [[Node: decoder/num_blocks_4/dec_self_attention/Shape/_3571 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name=“edge_7187_decoder/num_blocks_4/dec_self_attention/Shape”, tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File “/home/lily/Downloads/model_global/attention_aq_2/model.py”, line 295, in model_train(X1_train, X2_train, y_train) File “/home/lily/Downloads/model_global/attention_aq_2/model.py”, line 209, in model_train print(“finished a epoch”) File “/usr/local/lib/python3.6/contextlib.py”, line 88, in exit next(self.gen) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/training/supervisor.py”, line 964, in managed_session self.stop(close_summary_writer=close_summary_writer) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/training/supervisor.py”, line 792, in stop stop_grace_period_secs=self._stop_grace_secs) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py”, line 389, in join six.reraise(*self._exc_info_to_raise) File “/home/lily/tf3.6/lib/python3.6/site-packages/six.py”, line 693, in reraise raise value File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py”, line 296, in stop_on_exception yield File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py”, line 494, in run self.run_loop() File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/training/supervisor.py”, line 994, in run_loop self._sv.global_step]) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py”, line 889, in run run_metadata_ptr) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py”, line 1120, in _run feed_dict_tensor, options, run_metadata) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py”, line 1317, in _do_run options, run_metadata) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py”, line 1336, in _do_call raise type(e)(node_def, op, message) tensorflow.python.framework.errors_impl.InvalidArgumentError: You must feed a value for placeholder tensor 
‘Placeholder_1’ with dtype float and shape [?,48] [[Node: Placeholder_1 = Placeholderdtype=DT_FLOAT, shape=[?,48], _device="/job:localhost/replica:0/task:0/device:GPU:0"]] [[Node: decoder/num_blocks_4/dec_self_attention/Shape/_3571 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name=“edge_7187_decoder/num_blocks_4/dec_self_attention/Shape”, tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"]]

Caused by op ‘Placeholder_1’, defined at: File “/home/lily/Downloads/model_global/attention_aq_2/model.py”, line 295, in model_train(X1_train, X2_train, y_train) File “/home/lily/Downloads/model_global/attention_aq_2/model.py”, line 189, in model_train g = Graph(is_training=True) File “/home/lily/Downloads/model_global/attention_aq_2/model.py”, line 38, in init self.y = tf.placeholder(dtype=tf.float32, shape=(None, hp.out_timesteps)) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py”, line 1599, in placeholder return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py”, line 3091, in _placeholder “Placeholder”, dtype=dtype, shape=shape, name=name) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py”, line 787, in _apply_op_helper op_def=op_def) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/framework/ops.py”, line 2956, in create_op op_def=op_def) File “/home/lily/tf3.6/lib/python3.6/site-packages/tensorflow/python/framework/ops.py”, line 1470, in init self._traceback = self._graph._extract_stack() # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor ‘Placeholder_1’ with dtype float and shape [?,48] [[Node: Placeholder_1 = Placeholderdtype=DT_FLOAT, shape=[?,48], _device="/job:localhost/replica:0/task:0/device:GPU:0"]] [[Node: decoder/num_blocks_4/dec_self_attention/Shape/_3571 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name=“edge_7187_decoder/num_blocks_4/dec_self_attention/Shape”, tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"]]

这里上面我用了断点调试,每一次报错都是在sv.saver.save()这一步执行之后, 但是屏蔽这一句依旧报错.


#2

这个问题的解决方式比较诡异. 如果说构图只到最后的生成loss的部分,然后利用这个图作为default,然后load原来的模型,是可以做inference的. 但是模型在train的时候不能够用placeholder作为输入.

这个问题是修改transformer模型预测序列数据出现的问题.

最近搜索了一下:
在tf.train.Supervisor()和tf.train.MonitoredTrainingSession()中,调用这个会话进行训练时首先会运行summary operations, 但是由于summary会和placeholder连接,所以就会报错placeholder没有输入, 所以在这样的session下面,只能够将tensor载入到图当中,不能够采用feed_dict的方式.

feed_dict这种方式可以通过tf.Session()或者tf.train.MonitoredSession()实现.

不自动保存summary 的方法

  # ...create graph...
  my_train_op = ...
  # Merge every summary defined in the graph into a single op that is run
  # by hand below.
  my_summary_op = tf.summary.merge_all()

  # summary_op=None turns off the Supervisor's own summary service, so the
  # summary op is only ever run by the explicit calls in the loop.
  sv = tf.train.Supervisor(logdir="/my/training/directory",
                           summary_op=None)  # Do not run the summary service
  with sv.managed_session() as sess:
    for step in range(100000):
      if sv.should_stop():
        break
      if step % 100 == 0:
        # Every 100 steps: train and compute summaries in one run call, then
        # hand the result to the Supervisor to be written out.
        # If the graph reads from placeholders, pass feed_dict=... here.
        _, summ = sess.run([my_train_op, my_summary_op])
        sv.summary_computed(sess, summ)
      else:
        sess.run(my_train_op)

参考文献:
https://blog.csdn.net/qq_19918373/article/details/72859735