    if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
        raise ValueError("Must specify an explicit `ps_hosts`")
    if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
        raise ValueError("Must specify an explicit `worker_hosts`")
    # Build the cluster description from the comma-separated host lists.
    cluster_spec = tf.train.ClusterSpec({
        "ps": FLAGS.ps_hosts.split(","),
        "worker": FLAGS.worker_hosts.split(","),
    })
    # Create an in-process server for this task.
    server = tf.train.Server(
        cluster_spec, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
    return (cluster_spec, server)
def main(unused_argv):
    if FLAGS.data_dir is None or FLAGS.data_dir == "":
        raise ValueError("Must specify an explicit `data_dir`")
    if FLAGS.train_dir is None or FLAGS.train_dir == "":
        raise ValueError("Must specify an explicit `train_dir`")

    cluster_spec, server = device_and_target()
    if FLAGS.job_name == "ps":
        # Parameter servers only host variables; they block here forever.
        server.join()
    elif FLAGS.job_name == "worker":
        # Place variables on the ps tasks and computation on this worker.
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:{}".format(FLAGS.task_index),
                cluster=cluster_spec)):
            images, labels = inputs(FLAGS.batch_size)
            logits = inference(images, FLAGS.hidden1, FLAGS.hidden2)
            loss = lossFunction(logits, labels)
            train_op = training(loss, FLAGS.learning_rate)
        # The chief (task 0) handles initialization and checkpointing.
        with tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.train_dir) as sess:
            while not sess.should_stop():
                sess.run(train_op)


if __name__ == "__main__":
    tf.app.run()
4. Launching the Distributed Model
First, turn off the firewall (flush the iptables rules) so the cluster nodes can reach each other's ports:

    sudo iptables -F
Then start the service on each of the machines, as in the example below.
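A hypothetical layout with one parameter server and two workers (the host addresses, ports, the script name trainer.py, and the directory paths are illustrative placeholders, not from the original):

    # On the parameter-server machine:
    python trainer.py --ps_hosts=10.0.0.1:2222 \
        --worker_hosts=10.0.0.2:2222,10.0.0.3:2222 \
        --job_name=ps --task_index=0

    # On the first worker (task 0 is the chief and writes checkpoints):
    python trainer.py --ps_hosts=10.0.0.1:2222 \
        --worker_hosts=10.0.0.2:2222,10.0.0.3:2222 \
        --job_name=worker --task_index=0 \
        --data_dir=/tmp/mnist_data --train_dir=/tmp/train_logs

    # On the second worker:
    python trainer.py --ps_hosts=10.0.0.1:2222 \
        --worker_hosts=10.0.0.2:2222,10.0.0.3:2222 \
        --job_name=worker --task_index=1 \
        --data_dir=/tmp/mnist_data --train_dir=/tmp/train_logs

The ps process blocks in server.join(); training begins once the workers connect to the cluster.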