大数据下基于TensorFlow框架的深度学习示例教程

def device_and_target():
    """Validate the cluster flags and build the distributed-TF cluster objects.

    Reads `FLAGS.ps_hosts` / `FLAGS.worker_hosts` (comma-separated
    "host:port" lists) and `FLAGS.job_name` / `FLAGS.task_index`.

    Returns:
        A `(cluster_spec, server)` tuple: the `tf.train.ClusterSpec`
        describing the ps/worker jobs, and this process's `tf.train.Server`.

    Raises:
        ValueError: if `ps_hosts` or `worker_hosts` is unset/empty.
    """
    # NOTE(review): the opening of this function (`def device_and_target():`
    # and the leading `if FLAGS.ps_hosts is`) was truncated in the original
    # extraction and has been reconstructed from the surrounding code —
    # confirm against the original source.
    if FLAGS.ps_hosts is None or FLAGS.ps_hosts == "":
        raise ValueError("Must specify an explicit `ps_hosts`")
    if FLAGS.worker_hosts is None or FLAGS.worker_hosts == "":
        raise ValueError("Must specify an explicit `worker_hosts`")
    cluster_spec = tf.train.ClusterSpec({
        "ps": FLAGS.ps_hosts.split(","),
        "worker": FLAGS.worker_hosts.split(","),
    })
    server = tf.train.Server(
        cluster_spec, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
    return (cluster_spec, server)


def main(unused_argv):
    """Entry point: join as a parameter server or run the training loop.

    A "ps" process blocks forever serving variables; a "worker" process
    builds the MNIST graph with variables placed via
    `replica_device_setter` and trains inside a
    `MonitoredTrainingSession` (task 0 is chief and owns checkpointing).

    Raises:
        ValueError: if `data_dir` or `train_dir` is unset/empty.
    """
    if FLAGS.data_dir is None or FLAGS.data_dir == "":
        raise ValueError("Must specify an explicit `data_dir`")
    if FLAGS.train_dir is None or FLAGS.train_dir == "":
        raise ValueError("Must specify an explicit `train_dir`")

    cluster_spec, server = device_and_target()
    if FLAGS.job_name == "ps":
        # Parameter servers never return; they host variables for workers.
        server.join()
    elif FLAGS.job_name == "worker":
        # Pin ops to this worker while replica_device_setter spreads
        # variables across the ps job.
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:{}".format(FLAGS.task_index),
                cluster=cluster_spec)):
            # inputs/inference/lossFunction/training are defined elsewhere
            # in this tutorial's source file.
            images, labels = inputs(FLAGS.batch_size)
            logits = inference(images, FLAGS.hidden1, FLAGS.hidden2)
            loss = lossFunction(logits, labels)
            train_op = training(loss, FLAGS.learning_rate)
        # MonitoredTrainingSession handles init, checkpoint restore/save
        # (chief only) and stop signals; loop until the session says stop.
        with tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.train_dir) as sess:
            while not sess.should_stop():
                sess.run(train_op)


if __name__ == "__main__":
    tf.app.run()

四、分布式模型的启动

首先关闭防火墙

sudo iptables -F

然后在不同的机器上面启动服务

#在246.1机器上面运行参数服务器,命令:CLASSPATH=$($HADOOP_HDFS_HOME/bin/hadoop classpath --glob) python /home/bdusr01/tine/Distributed_Tensorflow_MNIST_Model_Used_NN_Read_TFRecords_On_HDFS_Support_Kerberos.py --ps_hosts=10.142.246.1:1120 --worker_hosts=10.142.78.41:1121,10.