使用镜像策略(MirroredStrategy)的 TensorFlow 多 GPU 训练性能不佳(GPU 反而比 CPU 慢),向大家寻求帮助。
我很困惑为什么在我尝试的任何情况下GPU都比CPU慢... 我想使用具有镜像策略的六个GPU来减少训练时间。 我遵循以下步骤: https://keras.io/guides/distributed_training/
我的机器配置如下: GPU:6 * GeForce RTX 2080 TI(10GB) CPU:Intel(R) Xeon(R) Silver 4110 CPU @ 2.10GHz
在Docker容器上运行:
(1)Tensorflow版本2.0.0
(2)CUDA 9.0
(3)cuDNN 7.6
(4)Nvidia驱动程序410.78-安装在服务器上
我想问一些问题。 (1)我想知道TensorFlow'MirroredStrategy'是否可以加快训练速度?
例如,点击链接: https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy
(2)如何通过 MirroredStrategy 将训练速度提高到两倍/三倍? 我按照这个链接实现: https://www.youtube.com/watch?v=bRMGoPqsn20 我的示例代码可在 Colab 或 GitHub Gist 上运行, 但您需要先下载训练数据并放到 Google 云端硬盘中。 PS:Colab 仅支持单个 GPU。
协作: https://colab.research.google.com/drive/1ldJvdk6wfu-fXBb2iBjKe0q1ZnExGG17?usp=sharing
github要点: https://gist.github.com/harrypotter02/0cc6ffe3bf7c520207dc7be96b1e8b66
测试1:
CPU ,批处理= 64,样本:575478
Epoch 2/500 575478/575478 [===] - **16s** 28us/step - loss: 0.0735 - val_loss:
测试2:
GPU ,batch = 64,样本:575478
Epoch 1/500
1498/1498 [===] - 60s 40ms/step - loss: 0.0907 - val_loss: 0.0619
Epoch 2/500
1498/1498 [===] - **32s** 21ms/step - loss: 0.0592 - val_loss: 0.0522
my_mini_batch = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync(GPU)
384 = 64*6
为什么要比CPU慢?
# import tensorflow_datasets as ds  # for debug
# Let TF grow GPU memory on demand instead of reserving all 10 GB per card.
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
# Expose all six GPUs to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5'
# Same six device strings as before, built programmatically.
gpu_table = ['/gpu:%d' % idx for idx in range(6)]
strategy = tf.distribute.MirroredStrategy(devices=gpu_table)
# Directories holding the feature/label .npy files.
TrainDataPath = './Train'
TestDataPath = './Test'
def Load_Data(InputPath):
    """Load every (feature, label) .npy pair found under *InputPath*.

    Label files match the pattern ``*_Label*``; the sibling feature file
    shares the same prefix (everything before 'Label') and does not have
    'Label' in its name.

    Parameters
    ----------
    InputPath : str
        Directory containing the .npy files.

    Returns
    -------
    tuple of np.ndarray
        ``(Data_Total, Label_Total)`` with features reshaped to (-1, 29)
        and labels reshaped to (-1, 1).
    """
    data_parts = []
    label_parts = []
    folder_content = glob.glob(os.path.join(InputPath, '*_Label*'))
    print('folder content=', folder_content)
    for label_file in folder_content:
        label_parts.append(np.load(label_file).ravel())
        # Sibling feature file: same prefix, no 'Label' in the name.
        # os.path.basename instead of split('/') so this also works on Windows.
        prefix = os.path.basename(label_file).split('Label')[0]
        feature_file = [f for f in glob.glob(os.path.join(InputPath, prefix + '*'))
                        if 'Label' not in f][0]
        data_parts.append(np.load(feature_file).ravel())
    # Concatenate once at the end: O(n) overall, instead of the original
    # np.append-inside-the-loop which reallocates the whole array each pass.
    if label_parts:
        Label_Total = np.concatenate(label_parts)
        Data_Total = np.concatenate(data_parts)
    else:
        Label_Total = np.array([])
        Data_Total = np.array([])
    Label_Total = Label_Total.reshape(-1, 1)
    Data_Total = Data_Total.reshape(-1, 29)
    return Data_Total, Label_Total
# Load raw arrays from disk: features (N, 29), labels (N, 1).
Train_Data,Train_Label = Load_Data(TrainDataPath)
Test_Data,Test_Label = Load_Data(TestDataPath)
def get_dataset():  # for debug
    """Build batched, repeating tf.data pipelines for train / val / test.

    Reads the module-level arrays (Training_Data/Training_Label,
    Val_Data/Val_Label, Test_Data/Test_Label) and publishes the per-epoch
    step counts through the train_steps / valid_steps / test_steps globals.

    Each (N, 29) feature array is split column-wise into the model's four
    inputs: Cell [0:5), X [5:15), Y [15:25), Z [25:29).
    NOTE(review): the Input layers declare per-sample shape (1, k); if
    training fails on tensor rank, each slice may need a reshape to
    (-1, 1, k) — confirm against the original colab notebook.
    """
    print('get_dataset()')
    print("Training_Data.shpae=", Training_Data.shape)
    global my_mini_batch
    BATCH_SIZE_PER_REPLICA = 64
    # Global batch = per-replica batch * replica count (6 GPUs -> 384).
    my_mini_batch = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
    print('my_mini_batc=', my_mini_batch)
    print('BATCH_SIZE_PER_REPLICA=', BATCH_SIZE_PER_REPLICA)
    print('strategy.num_replicas_in_sync=', strategy.num_replicas_in_sync)

    def split_columns(arr):
        # Column ranges match the four Input layers of the model.
        return (arr[:, 0:5], arr[:, 5:15], arr[:, 15:25], arr[:, 25:])

    Train_Dataset_1 = tf.data.Dataset.from_tensor_slices(split_columns(Training_Data)).batch(my_mini_batch).repeat()
    Train_Dataset_2 = tf.data.Dataset.from_tensor_slices(Training_Label).batch(my_mini_batch).repeat()

    # Steps per epoch = samples // global batch size.
    global train_steps
    global valid_steps
    global test_steps
    train_steps = int(Training_Data.shape[0] / my_mini_batch)
    print('train_steps=' + str(train_steps))
    valid_steps = int(Val_Data.shape[0] / my_mini_batch)
    print('valid_steps=' + str(valid_steps))
    test_steps = int(Test_Data.shape[0] / my_mini_batch)
    print('test_steps=' + str(test_steps))

    Valid_Dataset_1 = tf.data.Dataset.from_tensor_slices(split_columns(Val_Data)).batch(my_mini_batch).repeat()
    Valid_Dataset_2 = tf.data.Dataset.from_tensor_slices(Val_Label).batch(my_mini_batch).repeat()
    Test_Dataset_1 = tf.data.Dataset.from_tensor_slices(split_columns(Test_Data)).batch(my_mini_batch).repeat()
    Test_Dataset_2 = tf.data.Dataset.from_tensor_slices(Test_Label).batch(my_mini_batch).repeat()

    print('Replicas: ', strategy.num_replicas_in_sync)
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

    # Pair features with labels: each element is ((cell, x, y, z), label).
    Train_Dataset_final = tf.data.Dataset.zip((Train_Dataset_1, Train_Dataset_2))
    Valid_Dataset_final = tf.data.Dataset.zip((Valid_Dataset_1, Valid_Dataset_2))
    Test_Dataset_final = tf.data.Dataset.zip((Test_Dataset_1, Test_Dataset_2))
    return Train_Dataset_final, Valid_Dataset_final, Test_Dataset_final
with strategy.scope():
    # --- hyper-parameters -------------------------------------------------
    PalmTh = 0.5
    A_layer1_Filters = 2
    B_layer1_Filters = 2
    C_layer1_Filters = 2
    D_layer1_Filters = 2
    L = 0.005                 # initial Adam learning rate
    F = 12                    # width of the dense layer
    Epochs = 500
    EarlyStopPatience = 10
    ChangeLrPatience = 8
    ChangeLrFactor = 0.9
    Test_Para = []

    # --- four parallel input branches (29 features split 5/10/10/4) -------
    A_IN_Cell = Input(shape=(1, 5), name="Cell")
    PX = Input(shape=(1, 10), name="X")
    # NOTE(review): this shape was truncated in the paste; (1, 10) matches
    # the 15:25 column slice fed to this input — confirm against the colab.
    PY = Input(shape=(1, 10), name="Y")
    PZ = Input(shape=(1, 4), name="Z")

    L1 = Convolution2D(filters=A_layer1_Filters, kernel_size=3, strides=1,
                       padding='valid', data_format='channels_first',
                       use_bias=True, name='Conv1_Height_Cell',
                       activity_regularizer=regularizers.l2(0.00001))(A_IN_Cell)
    LH1 = Activation(custom_HardTanh)(L1)
    LH1_out = Flatten()(LH1)

    # NOTE(review): kernel_size/strides/padding for the next three convs
    # were lost in the paste (Conv2D requires kernel_size); restored to
    # mirror Conv1_Height_Cell — confirm against the colab.
    L2 = Convolution2D(filters=B_layer1_Filters, kernel_size=3, strides=1,
                       padding='valid', data_format='channels_first',
                       use_bias=True, name='Conv1_ProjectionX',
                       activity_regularizer=regularizers.l2(0.00001))(PX)
    LH2 = Activation(custom_HardTanh)(L2)
    LH2_out = Flatten()(LH2)

    A_Convolution1 = Convolution2D(filters=C_layer1_Filters, kernel_size=3, strides=1,
                                   padding='valid', data_format='channels_first',
                                   use_bias=True, name='Conv1_ProjectionY',
                                   activity_regularizer=regularizers.l2(0.00001))(PY)
    A_Hidden1 = Activation(custom_HardTanh)(A_Convolution1)
    A_Out = Flatten()(A_Hidden1)

    Centroid_Convolution1 = Convolution2D(filters=D_layer1_Filters, kernel_size=3, strides=1,
                                          padding='valid', data_format='channels_first',
                                          use_bias=True, name='Conv1_Centroid',
                                          activity_regularizer=regularizers.l2(0.00001))(PZ)
    Centroid_Hidden1 = Activation(custom_HardTanh)(Centroid_Convolution1)
    Centroid_Out = Flatten()(Centroid_Hidden1)

    # Merge the four branches, then a small dense head with sigmoid output.
    concatentaLayer = concatenate([LH1_out, LH2_out, A_Out, Centroid_Out])
    DenseLayer1 = Dense(F, activation=None,
                        activity_regularizer=regularizers.l2(0.00001))(concatentaLayer)
    DenseLayer1 = Activation(custom_HardTanh)(DenseLayer1)
    Output = Dense(1, activation='sigmoid')(DenseLayer1)

    model = Model(inputs=[A_IN_Cell, PX, PY, PZ], outputs=[Output])
    adam = optimizers.Adam(lr=L)
    model.compile(optimizer=adam, loss='binary_crossentropy')
    model.summary()

    # --- training callbacks ----------------------------------------------
    change_lr = ReduceLROnPlateau(monitor='val_loss', factor=ChangeLrFactor,
                                  patience=ChangeLrPatience, min_lr=0.00005)
    EarlyStop = EarlyStopping(monitor='loss', patience=EarlyStopPatience,
                              verbose=2, mode='min')
# Hold out 10% of the training set for validation; the resulting global
# names (Training_Data, Val_Data, ...) are read inside get_dataset().
Training_Data,Val_Data,Training_Label,Val_Label = train_test_split(Train_Data,Train_Label,test_size=0.1)
print('Train_Data follow=',Train_Data.shape)
print('Training_Data follow=',Training_Data.shape)
# get_dataset() also sets the train_steps / valid_steps globals used below.
train_dataset,val_dataset,test_dataset = get_dataset()
print('type(train_dataset)=',train_dataset)
#beside scope
# NOTE(review): the change_lr / EarlyStop callbacks defined above are never
# passed to fit() here — confirm whether that was intended.
history = model.fit(train_dataset,epochs=Epochs,steps_per_epoch=train_steps,validation_steps = valid_steps,validation_data=val_dataset)