我正在使用 Keras 在 ImageNet 2012 上训练我的模型。在单个 GPU 上使用 256 的批处理大小时,训练可以正常进行。
但当我使用 6 个 GPU 并将批处理大小设置为 1024 时,遇到了显存不足(OOM)的问题:
我的环境:
keras-2.2.4
tensorflow-gpu-1.14.0
python-3.6.8
CUDA-10.1
代码:
参考文献来自{@ {3}}
import argparse
import os
import sys
from multiprocessing import cpu_count

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.applications.resnet50 import decode_predictions, preprocess_input
from tensorflow.keras.callbacks import (
    LearningRateScheduler,
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
    TerminateOnNaN,
)
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import multi_gpu_model

from multi_gpu import ParallelModel
from yolo3.models.yolo3_nano import NanoNet
# Expose GPU indices 0 through 5 to this process via CUDA_VISIBLE_DEVICES.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(6)))
def get_model(model_type):
    """Build the un-parallelised template model selected by ``model_type``.

    Only ``'nanonet'`` is supported. The network is created inside a
    ``/cpu:0`` device scope with a 224x224x3 input and no pretrained
    weights; ``main`` then wraps the result with ``multi_gpu_model``.

    Args:
        model_type: Name of the architecture to build.

    Returns:
        The freshly constructed Keras model.

    Raises:
        ValueError: If ``model_type`` is not a supported architecture.
    """
    # Reject unknown names up front so nothing is built for a bad request.
    if model_type != 'nanonet':
        raise ValueError('Unsupported model type')

    with tf.device('/cpu:0'):
        model = NanoNet(input_shape=(224, 224, 3), weights=None)
    return model
def main(args):
    """Train NanoNet on a directory-structured image dataset using 6 GPUs.

    Builds the template model, wraps it with ``multi_gpu_model``, optionally
    restores weights, trains with ``fit_generator`` and finally saves the
    template model to ``<log_dir>/trained_final.h5``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``log_dir``, ``weights_path``, ``model_save_dir``,
            ``train_data_path``, ``val_data_path``, ``optim_type``,
            ``learning_rate``, ``batch_size``, ``total_epoch`` and
            ``init_epoch``.
    """
    log_dir = args.log_dir
    # Single source of truth for the global batch size: the generators and
    # the step counts below must agree, otherwise an "epoch" does not cover
    # the dataset once.
    batch_size = args.batch_size
    # Must match the input_shape the model is built with in get_model().
    input_size = (224, 224)

    # prepare model
    model = get_model("nanonet")

    # support multi-gpu training; every incoming batch is divided between
    # the replicas, so each GPU processes roughly batch_size / 6 samples.
    paralleled_model = multi_gpu_model(model, gpus=6)
    if args.weights_path:
        paralleled_model.load_weights(args.weights_path)

    # callbacks for training process
    checkpoint = ModelCheckpoint(
        os.path.join(
            args.model_save_dir,
            'ep{epoch:03d}-val_loss{val_loss:.3f}-val_acc{val_acc:.3f}-val_top_k_categorical_accuracy{val_top_k_categorical_accuracy:.3f}.h5'),
        monitor='val_acc',
        mode='max',
        verbose=1,
        save_weights_only=False,
        save_best_only=True,
        period=1)
    logging = TensorBoard(
        log_dir=args.model_save_dir,
        histogram_freq=0,
        write_graph=False,
        write_grads=False,
        write_images=False,
        update_freq='batch')
    terminate_on_nan = TerminateOnNaN()

    # Step-wise schedule: one entry per 30 epochs. The index is clamped so
    # that runs longer than 180 epochs keep the last rate instead of
    # raising IndexError.
    learn_rates = [0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]
    lr_scheduler = LearningRateScheduler(
        lambda epoch: learn_rates[min(epoch // 30, len(learn_rates) - 1)])

    # data generator
    # shear_range, channel_shift_range and rotation_range are currently
    # disabled.
    train_datagen = ImageDataGenerator(
        preprocessing_function=preprocess,
        zoom_range=0.25,
        width_shift_range=0.05,
        height_shift_range=0.05,
        horizontal_flip=True)
    test_datagen = ImageDataGenerator(preprocessing_function=preprocess)

    train_generator = train_datagen.flow_from_directory(
        args.train_data_path,
        target_size=input_size,
        batch_size=batch_size)
    # target_size is passed explicitly: the generator default differs from
    # the 224x224 input the model was built with.
    test_generator = test_datagen.flow_from_directory(
        args.val_data_path,
        target_size=input_size,
        batch_size=batch_size)

    # get optimizer
    optimizer = get_optimizer(args.optim_type, args.learning_rate)

    # start training
    paralleled_model.compile(
        optimizer=optimizer,
        metrics=['accuracy', 'top_k_categorical_accuracy'],
        loss='categorical_crossentropy')
    paralleled_model.summary()
    print('Train on {} samples,val on {} samples,with batch size {}.'.format(
        train_generator.samples, test_generator.samples, batch_size))

    # Floor both step counts at 1 so a dataset smaller than one batch still
    # runs a step instead of requesting zero.
    steps_per_epoch = max(1, train_generator.samples // batch_size)
    validation_steps = max(1, test_generator.samples // batch_size)
    # Parallelise image feeding but leave one CPU core idle; never drop
    # below one worker on a single-core host.
    feed_workers = max(1, cpu_count() - 1)

    paralleled_model.fit_generator(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=args.total_epoch,
        workers=feed_workers,
        initial_epoch=args.init_epoch,
        use_multiprocessing=True,
        validation_data=test_generator,
        validation_steps=validation_steps,
        callbacks=[logging, checkpoint, lr_scheduler, terminate_on_nan])

    # Finally store model: the template model is saved here, not the
    # multi-GPU wrapper.
    model.save(os.path.join(log_dir, 'trained_final.h5'))
错误:
2019-11-22 05:06:34.936131: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 8 Chunks of size 152785920 totalling 1.14GiB
2019-11-22 05:06:34.936139: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 156898816 totalling 149.63MiB
2019-11-22 05:06:34.936147: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 166076416 totalling 158.38MiB
2019-11-22 05:06:34.936153: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 15 Chunks of size 204718080 totalling 2.86GiB
2019-11-22 05:06:34.936160: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 5 Chunks of size 209534976 totalling 999.14MiB
2019-11-22 05:06:34.936168: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 239148800 totalling 228.07MiB
2019-11-22 05:06:34.936177: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 245235712 totalling 233.88MiB
2019-11-22 05:06:34.936186: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 259511808 totalling 247.49MiB
2019-11-22 05:06:34.936192: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 3 Chunks of size 305571840 totalling 874.25MiB
2019-11-22 05:06:34.936201: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 316582656 totalling 301.92MiB
2019-11-22 05:06:34.936207: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 19 Chunks of size 409436160 totalling 7.24GiB
2019-11-22 05:06:34.936216: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 2 Chunks of size 416780288 totalling 794.95MiB
2019-11-22 05:06:34.936222: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 6 Chunks of size 419069952 totalling 2.34GiB
2019-11-22 05:06:34.936230: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 426586880 totalling 406.82MiB
2019-11-22 05:06:34.936239: I tensorflow/core/common_runtime/bfc_allocator.cc:816] Sum Total of in-use chunks: 28.46GiB
2019-11-22 05:06:34.936245: I tensorflow/core/common_runtime/bfc_allocator.cc:818] total_region_allocated_bytes_: 30652445440 memory_limit_: 30652445491 available bytes: 51 curr_region_allocation_bytes_: 34359738368
2019-11-22 05:06:34.936267: I tensorflow/core/common_runtime/bfc_allocator.cc:824] Stats:
Limit: 30652445491
InUse: 30563432960
MaxInUse: 30652445440
NumAllocs: 5911
MaxAllocSize: 616562688
2019-11-22 05:06:34.936554: W tensorflow/core/common_runtime/bfc_allocator.cc:319] ****************************************************************************************************
2019-11-22 05:06:34.936604: W tensorflow/core/framework/op_kernel.cc:1622] OP_REQUIRES failed at constant_op.cc:172 : Resource exhausted: OOM when allocating tensor with shape[170,12,224] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
Epoch 1/150
Traceback (most recent call last):
File "yolo3/models/backbones/imagenet_training/train_imagenet.py",line 211,in <module>
main(args)
File "yolo3/models/backbones/imagenet_training/train_imagenet.py",line 168,in main
callbacks=[logging,terminate_on_nan])
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py",line 1272,in fit_generator
steps_name='steps_per_epoch')
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_generator.py",line 265,in model_iteration
batch_outs = batch_function(*batch_data)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py",line 997,in train_on_batch
outputs = self.train_function(ins) # pylint: disable=not-callable
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/backend.py",line 3343,in __call__
run_metadata=self.run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py",line 1459,in __call__
run_metadata_ptr)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: 2 root error(s) found.
(0) Resource exhausted: OOM when allocating tensor with shape[170,56,70] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node training/SGD/gradients/gradients/zeros_2558-0-0-TransposeNCHWToNHWC-LayoutOptimizer}}]]
Hint: If you want to see a list of allocated tensors when OOM happens,add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
(1) Resource exhausted: OOM when allocating tensor with shape[170,add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[replica_1/nano_net/pep_block_12_preproject_BN/cond/Merge_2/_21339]]
Hint: If you want to see a list of allocated tensors when OOM happens,add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
0 successful operations.
5 derived errors ignored.