Is it possible to split inference across multiple GPUs to speed it up? For example, I have the following code that runs detection on a list of input images.
import os

import tensorflow as tf
import torch


def InitTFSess(weights_path):
    # Load the frozen inference graph and create one shared session for all detection calls.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    with tf.io.gfile.GFile(weights_path, 'rb') as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
    config = tf.compat.v1.ConfigProto(
        gpu_options=tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.9),  # log_device_placement=True
    )
    sess = tf.compat.v1.Session(config=config)
    sess.graph.as_default()
    tf.graph_util.import_graph_def(graph_def, name='')
    return sess
def detect_image_tensorflow(opt_window_size, opt_conf_thres, window, gpu_select, sess):
    rows = opt_window_size
    cols = opt_window_size
    # Alternate between the two GPUs based on the call index.
    if gpu_select % 2 == 0:
        gpu_device = '/gpu:0'
    else:
        gpu_device = '/gpu:1'
    try:
        # Reorder color channels (BGR -> RGB) and run the detection graph.
        inp = window
        inp = inp[:, :, [2, 1, 0]]
        with tf.device(gpu_device):
            out = sess.run(
                [sess.graph.get_tensor_by_name('num_detections:0'),
                 sess.graph.get_tensor_by_name('detection_scores:0'),
                 sess.graph.get_tensor_by_name('detection_boxes:0'),
                 sess.graph.get_tensor_by_name('detection_classes:0')],
                feed_dict={'image_tensor:0': inp.reshape(1, inp.shape[0], inp.shape[1], 3)})
        num_detections = int(out[0][0])
        detection_output = []
        for i in range(num_detections):
            classId = int(out[3][0][i]) - 1
            score = float(out[1][0][i])
            bbox = [float(v) for v in out[2][0][i]]
            if score > opt_conf_thres:
                # Convert normalized [ymin, xmin, ymax, xmax] boxes to pixel coordinates.
                x1 = bbox[1] * cols
                y1 = bbox[0] * rows
                x2 = bbox[3] * cols
                y2 = bbox[2] * rows
                detection_output.append([x1, y1, x2, y2, score, classId])
        output_tensor = torch.FloatTensor(detection_output)
        return output_tensor
    except Exception as e:
        return None
def detect(np_images):
    # Expose both GPUs and allow memory growth before the session is created.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
    weights_path = 'trained_weights.pb'
    tf_session = InitTFSess(weights_path)
    win_size = 1000
    conf_thres = 0.5
    gpu_select = 0
    for image in np_images:
        detection = detect_image_tensorflow(win_size, conf_thres, image, gpu_select, tf_session)
        gpu_select += 1
        # do whatever with the detection
I have tried exposing both devices through the environment variables, allowing GPU memory growth, and even alternating GPUs with some crude modulo math. However, none of this spread the inference load across all devices, let alone sped up inference.
The one thing that did help was splitting the input images and threading each detection call, roughly as sketched below, but the program can use at most 4 threads. So now I am trying to combine all GPUs with threading to speed up inference. Is this the right way to optimize?
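For reference, this is a simplified sketch of the threaded variant I mean (the helper name detect_threaded and its defaults are just for illustration; it reuses detect_image_tensorflow and the session from InitTFSess above, and a single TensorFlow session is shared across the worker threads since Session.run is thread-safe):

from concurrent.futures import ThreadPoolExecutor

def detect_threaded(np_images, tf_session, win_size=1000, conf_thres=0.5, max_workers=4):
    # Submit one detection call per image; the image index doubles as gpu_select,
    # so consecutive calls alternate between /gpu:0 and /gpu:1.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [
            pool.submit(detect_image_tensorflow, win_size, conf_thres, image, idx, tf_session)
            for idx, image in enumerate(np_images)
        ]
        return [f.result() for f in futures]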