Is it possible to split inference across multiple GPUs to speed it up? For example, I have the following code that runs detection on a list of input images.
import os

import tensorflow as tf
import torch


def InitTFSess(weights_path):
    # Load the frozen inference graph and create one shared session for all detection calls.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    with tf.io.gfile.GFile(weights_path, 'rb') as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
    config = tf.compat.v1.ConfigProto(
        gpu_options=tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.9),  # log_device_placement=True
    )
    sess = tf.compat.v1.Session(config=config)
    sess.graph.as_default()
    tf.graph_util.import_graph_def(graph_def, name='')
    return sess
def detect_image_tensorflow(opt_window_size, opt_conf_thres, window, gpu_select, sess):
    rows = opt_window_size
    cols = opt_window_size
    # Alternate between the two GPUs based on the call index.
    if gpu_select % 2 == 0:
        gpu_device = '/gpu:0'
    else:
        gpu_device = '/gpu:1'
    try:
        # Reorder color channels (BGR -> RGB) and run the detection graph.
        inp = window
        inp = inp[:, :, [2, 1, 0]]
        with tf.device(gpu_device):
            out = sess.run(
                [sess.graph.get_tensor_by_name('num_detections:0'),
                 sess.graph.get_tensor_by_name('detection_scores:0'),
                 sess.graph.get_tensor_by_name('detection_boxes:0'),
                 sess.graph.get_tensor_by_name('detection_classes:0')],
                feed_dict={'image_tensor:0': inp.reshape(1, inp.shape[0], inp.shape[1], 3)})
        num_detections = int(out[0][0])
        detection_output = []
        for i in range(num_detections):
            classId = int(out[3][0][i]) - 1
            score = float(out[1][0][i])
            bbox = [float(v) for v in out[2][0][i]]
            if score > opt_conf_thres:
                # Convert normalized [ymin, xmin, ymax, xmax] boxes to pixel coordinates.
                x1 = bbox[1] * cols
                y1 = bbox[0] * rows
                x2 = bbox[3] * cols
                y2 = bbox[2] * rows
                detection_output.append([x1, y1, x2, y2, score, classId])
        output_tensor = torch.FloatTensor(detection_output)
        return output_tensor
    except Exception as e:
        return None
def detect(np_images):
    # Expose both GPUs and allow memory growth before the session is created.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
    weights_path = 'trained_weights.pb'
    tf_session = InitTFSess(weights_path)
    win_size = 1000
    conf_thres = 0.5
    gpu_select = 0
    for image in np_images:
        detection = detect_image_tensorflow(win_size, conf_thres, image, gpu_select, tf_session)
        gpu_select += 1
        # do whatever with the detection
I have tried exposing both devices through the environment variables, allowing GPU memory growth, and even alternating GPUs with some crude modulo math. However, none of this spread the inference load across all devices, let alone sped up inference.
The one thing that did help was splitting the input images and threading each detection call, roughly as sketched below, but the program can use at most 4 threads. So now I am trying to combine all GPUs with threading to speed up inference. Is this the right way to optimize?
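For reference, this is a simplified sketch of the threaded variant I mean (the helper name detect_threaded and its defaults are just for illustration; it reuses detect_image_tensorflow and the session from InitTFSess above, and a single TensorFlow session is shared across the worker threads since Session.run is thread-safe):

from concurrent.futures import ThreadPoolExecutor

def detect_threaded(np_images, tf_session, win_size=1000, conf_thres=0.5, max_workers=4):
    # Submit one detection call per image; the image index doubles as gpu_select,
    # so consecutive calls alternate between /gpu:0 and /gpu:1.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [
            pool.submit(detect_image_tensorflow, win_size, conf_thres, image, idx, tf_session)
            for idx, image in enumerate(np_images)
        ]
        return [f.result() for f in futures]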