我的TFCONFIG就是这样
def parse_argument():
if flaGS.job_name is None or flaGS.job_name == "":
raise ValueError("Must specify an explicit `job_name`")
if flaGS.task_index is None or flaGS.task_index == "":
raise ValueError("Must specify an explicit `task_index`")
print("job name = %s" % flaGS.job_name)
print("task index = %d" % flaGS.task_index)
os.environ["TF_ROLE"] = flaGS.job_name
os.environ["TF_INDEX"] = str(flaGS.task_index)
# Construct the cluster and start the server
ps_spec = flaGS.ps_hosts.split(",")
worker_spec = flaGS.worker_hosts.split(",")
cluster = {"worker": worker_spec,"ps": ps_spec}
os.environ["TF_CLUSTER_DEF"] = json.dumps(cluster)
def set_tfconfig_environ():
if "TF_CLUSTER_DEF" in os.environ:
cluster = json.loads(os.environ["TF_CLUSTER_DEF"])
task_index = int(os.environ["TF_INDEX"])
task_type = os.environ["TF_ROLE"]
tf_config = dict()
worker_num = len(cluster["worker"])
if task_type == "ps":
tf_config["task"] = {"index": task_index,"type": task_type}
flaGS.job_name = "ps"
flaGS.task_index = task_index
else:
if task_index == 0:
tf_config["task"] = {"index": 0,"type": "chief"}
else:
tf_config["task"] = {"index": task_index - 1,"type": task_type}
flaGS.job_name = "worker"
flaGS.task_index = task_index
if worker_num == 1:
cluster["chief"] = cluster["worker"]
del cluster["worker"]
else:
cluster["chief"] = [cluster["worker"][0]]
del cluster["worker"][0]
del cluster["worker"][worker_num-2]
tf_config["cluster"] = cluster
os.environ["TF_CONFIG"] = json.dumps(tf_config)
print("TF_CONFIG",json.loads(os.environ["TF_CONFIG"]))
执行分布训练时,无法在hdfs modeldir下获得eval文件夹和事件文件,因此张量板仅显示火车曲线,而没有eval曲线。
现在,我对Tensorflow Distribution API感到困惑。我无法获得正确的解决方案,有人可以和我遇到同样的问题吗? 我发现同一问题与我有相同的常见问题,但没有得到答案。
Distributed Tensorflow Estimator execution does not trigger evaluation or export
https://github.com/tensorflow/tensorflow/issues/30121
希望有人可以提供帮助,谢谢。