TensorFlow 2 metrics produce wrong results with 2 GPUs

I took this code from the TensorFlow documentation on distributed training with a custom loop (https://www.tensorflow.org/tutorials/distribute/custom_training) and adjusted it to work with tf.keras.metrics.AUC, running on 2 GPUs (a DGX machine).

# Import TensorFlow
import tensorflow as tf

# Helper libraries
import numpy as np


print(tf.__version__)


fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

# Adding a dimension to the array -> new shape == (28,28,1)
# We are doing this because the first layer in our model is a convolutional
# layer and it requires a 4D input (batch_size,height,width,channels).
# batch_size dimension will be added later on.
train_images = train_images[...,None]
test_images = test_images[...,None]

# One hot
train_labels = tf.keras.utils.to_categorical(train_labels, 10)
test_labels = tf.keras.utils.to_categorical(test_labels, 10)

# Getting the images in [0,1] range.
train_images = train_images / np.float32(255)
test_images = test_images / np.float32(255)

# If the list of devices is not specified in the
# `tf.distribute.MirroredStrategy` constructor, it will be auto-detected.
GPUS = [0, 1]
devices = ["/gpu:" + str(gpu_id) for gpu_id in GPUS]
strategy = tf.distribute.MirroredStrategy(devices=devices)

print ('Number of devices: {}'.format(strategy.num_replicas_in_sync))


BUFFER_SIZE = len(train_images)

BATCH_SIZE_PER_REPLICA = 64
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

EPOCHS = 10


train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels)).shuffle(BUFFER_SIZE).batch(GLOBAL_BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(GLOBAL_BATCH_SIZE)

train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)


def create_model():
  model = tf.keras.Sequential([
      tf.keras.layers.Conv2D(32, 3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Conv2D(64, 3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10, activation='softmax')
    ])

  return model


with strategy.scope():
  # Set reduction to `none` so we can do the reduction afterwards and divide by
  # global batch size.
  loss_object = tf.keras.losses.CategoricalCrossentropy(
      from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
  def compute_loss(labels, predictions):
    per_example_loss = loss_object(labels, predictions)
    return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)


with strategy.scope():
  test_loss = tf.keras.metrics.Mean(name='test_loss')

  train_accuracy = tf.keras.metrics.CategoricalAccuracy(
      name='train_accuracy')
  test_accuracy = tf.keras.metrics.CategoricalAccuracy(
      name='test_accuracy')
  train_auc = tf.keras.metrics.AUC(name='train_auc')
  test_auc = tf.keras.metrics.AUC(name='test_auc')


# model, optimizer, and checkpoint must be created under `strategy.scope`.
with strategy.scope():
  model = create_model()

  optimizer = tf.keras.optimizers.Adam()


def train_step(inputs):
  images, labels = inputs

  with tf.GradientTape() as tape:
    predictions = model(images, training=True)
    loss = compute_loss(labels, predictions)

  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  train_accuracy(labels, predictions)
  train_auc(labels, predictions)
  return loss

def test_step(inputs):
  images, labels = inputs

  predictions = model(images, training=False)
  t_loss = loss_object(labels, predictions)

  test_loss.update_state(t_loss)
  test_accuracy(labels, predictions)
  test_auc(labels, predictions)


# `run` replicates the provided computation and runs it
# with the distributed input.
@tf.function
def distributed_train_step(dataset_inputs):
  per_replica_losses = strategy.run(train_step, args=(dataset_inputs,))
  return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

@tf.function
def distributed_test_step(dataset_inputs):
  return strategy.run(test_step, args=(dataset_inputs,))


for epoch in range(EPOCHS):
  # TRAIN LOOP
  total_loss = 0.0
  num_batches = 0
  for x in train_dist_dataset:
    total_loss += distributed_train_step(x)
    num_batches += 1
  train_loss = total_loss / num_batches

  # TEST LOOP
  for x in test_dist_dataset:
    distributed_test_step(x)

  template = ("Epoch {},Loss: {},accuracy: {},AUC: {},"
              "Test Loss: {},Test accuracy: {},Test AUC: {}")
  print (template.format(epoch+1,train_loss,train_accuracy.result()*100,train_auc.result()*100,test_loss.result(),test_accuracy.result()*100,test_auc.result()*100))

  test_loss.reset_states()
  train_accuracy.reset_states()
  test_accuracy.reset_states()
  train_auc.reset_states()
  test_auc.reset_states()

The problem is that the AUC evaluation is definitely wrong, because it goes outside its valid range (it should be 0-100 here, since the result is multiplied by 100). These are the results I got from running the code above once:

Epoch 1, Loss: 1.8061423301696777, accuracy: 66.00833892822266, AUC: 321.8688659667969, Test Loss: 1.742477536201477, Test accuracy: 72.0999984741211, Test AUC: 331.33709716796875
Epoch 2, Loss: 1.7129968404769897, accuracy: 74.9816665649414, AUC: 337.37017822265625, Test Loss: 1.7084736824035645, Test accuracy: 75.52999877929688, Test AUC: 337.1878967285156
Epoch 3, Loss: 1.643971562385559, accuracy: 81.83333587646484, AUC: 355.96209716796875, Test Loss: 1.6072628498077393, Test accuracy: 85.3499984741211, Test AUC: 370.603759765625
Epoch 4, Loss: 1.5887378454208374, accuracy: 87.27833557128906, AUC: 373.6204528808594, Test Loss: 1.5906082391738892, Test accuracy: 87.13999938964844, Test AUC: 371.9998474121094
Epoch 5, Loss: 1.581775426864624, accuracy: 88.0, AUC: 373.9468994140625, Test Loss: 1.5964380502700806, Test accuracy: 86.68000030517578, Test AUC: 371.0227355957031
Epoch 6, Loss: 1.5764907598495483, accuracy: 88.49166870117188, AUC: 375.2404479980469, Test Loss: 1.5832056999206543, Test accuracy: 87.94000244140625, Test AUC: 373.41998291015625
Epoch 7, Loss: 1.5698528289794922, accuracy: 89.19166564941406, AUC: 376.473876953125, Test Loss: 1.5770654678344727, Test accuracy: 88.58000183105469, Test AUC: 375.5516662597656
Epoch 8, Loss: 1.564456820487976, accuracy: 89.71833801269531, AUC: 377.8564758300781, Test Loss: 1.5792100429534912, Test accuracy: 88.27000427246094, Test AUC: 373.1791687011719
Epoch 9, Loss: 1.5612279176712036, accuracy: 90.02000427246094, AUC: 377.9949645996094, Test Loss: 1.5729509592056274, Test accuracy: 88.9800033569336, Test AUC: 375.5257263183594
Epoch 10, Loss: 1.5562015771865845, accuracy: 90.54000091552734, AUC: 378.9789123535156, Test Loss: 1.56815767288208, Test accuracy: 89.3499984741211, Test AUC: 375.8636474609375
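
For reference, as far as I understand, tf.keras.metrics.AUC always reports a value in [0, 1] (at most 100 after multiplying by 100), even for one-hot multi-class inputs, which it flattens when multi_label=False. A quick single-process sanity check (my own snippet, not part of the tutorial):

# Sanity check (my own snippet): outside of tf.distribute, AUC stays in [0, 1]
# even when y_true / y_pred are one-hot multi-class arrays.
m = tf.keras.metrics.AUC()
m.update_state([[0., 1.], [1., 0.]], [[0.3, 0.7], [0.8, 0.2]])
print(m.result().numpy())  # 1.0 for this perfectly ranked toy batch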

The accuracy is fine, but it seems to be the only metric that behaves well. I tried other metrics as well, and they are evaluated incorrectly too. Something seems to go wrong when using multiple GPUs, because when I run this code with a single GPU it produces correct results.
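
One workaround I am considering is to have each replica return its predictions and labels from strategy.run, concatenate the local results, and update the metrics once in cross-replica context. This is a minimal sketch under that assumption (the names distributed_test_step_gathered and forward are my own, and I have not verified that it fixes the AUC values):

@tf.function
def distributed_test_step_gathered(dataset_inputs):
  def forward(inputs):
    images, labels = inputs
    return model(images, training=False), labels

  per_replica_preds, per_replica_labels = strategy.run(forward, args=(dataset_inputs,))
  # Concatenate each replica's slice so the metrics see the whole global batch once.
  preds = tf.concat(strategy.experimental_local_results(per_replica_preds), axis=0)
  labels = tf.concat(strategy.experimental_local_results(per_replica_labels), axis=0)
  test_accuracy(labels, preds)
  test_auc(labels, preds)

Even so, I would prefer to understand why updating the metrics inside strategy.run misbehaves in the first place.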
