如何在支持 FMA 的 GPU 上优化浮点除法？

现代基于 FMA 的除法算法通常严重依赖 Peter Markstein 的工作，他对此问题进行了广泛的研究。规范参考是：

Peter Markstein，“IA-64 和基本函数：速度和精度”，Prentice-Hall 2000。

下面的 C 代码也是基于 Markstein 的工作。它基本上由一条尽快处理常见情况的快速路径和一条处理所有讨厌的特殊情况的慢速路径组成。

所涉及的概念非常简单，但它们确实需要仔细的数学分析以确保得到正确的舍入结果。第一步是计算除数的准确倒数。这需要精确计算近似误差，FMA 是最好的工具。从（基于硬件的）近似中对倒数进行细化通常采用具有二次收敛的单个 Newton-Raphson 迭代或具有三次收敛的单个 Halley 迭代的形式，两者都完美地映射到 FMA。

将除数的倒数与被除数相乘得到商的近似值。这同样基于基于 FMA 的残差计算进行了改进。在这个阶段，通常使用一个按比例缩放的除数版本，以防止中间计算中的上溢和下溢，并允许在除法算法的快速路径中使用尽可能广泛的操作数。这也意味着最后需要一个最终的缩放步骤来调整商幅度。

通过代码的两条路径的一个有利的安排是首先无条件地执行快速路径计算，然后在最后检查是否满足使用快速路径的条件，如果不满足，则调用慢路径代码。这允许计算与快速路径计算并发的必要条件，并允许概述慢速路径计算，这有助于将该代码放置在很少使用的内存页面中，其中各种慢速路径或收集异常处理代码。

请将下面的代码视为算法草图。包含的基于随机测试向量的“烟雾”级测试远非严格的测试框架。调用慢速路径的条件既没有显示出尽可能紧，也没有显示出尽可能紧，也没有显示出最有效的安排。倒数的细化使用单个 Newton-Raphson 步骤，但这对于给定 GPU 的倒数近似可能不够，并且可能需要以额外的 FMA 为代价替换哈雷迭代。最后，我省略了整个慢速路径的构建，因为这将超出 Stackoverflow 答案的限制，无论是在设计工作还是文本描述量方面。

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

float uint32_as_float (uint32_t a)
{
    float r;
    memcpy (&r,&a,sizeof r);
    return r;
}

uint32_t float_as_uint32 (float a)
{
    uint32_t r;
    memcpy (&r,sizeof r);
    return r;
}

float rcp_approx (float a)
{
    return 1.0f / a; // fake it for now
}

float fp32_div_slowpath (float dividend,float divisor)
{
    return dividend / divisor; // fake it for now
}

float fp32_div (float dividend,float divisor)
{
    const uint32_t FP32_MANT_MASK = 0x007fffffu;
    const uint32_t FP32_ONE = 0x3f800000u;
    const uint32_t FP32_SIGN_MASK = 0x80000000u;
    const uint32_t FP32_SIGN_EXPO_MASK = 0xff800000u;
    const uint32_t FP32_HI_LIMIT = 0x7e800000u; // 0x1.0p+126
    const uint32_t FP32_LO_LIMIT = 0x00800001u; // 0x1.0p-126 + ulp

    // fast path
    float recip_approx = rcp_approx (divisor); 
    float recip_err = fmaf (recip_approx,-divisor,1.0f);
    float dividend_mant = uint32_as_float ((float_as_uint32 (dividend) & FP32_MANT_MASK) | FP32_ONE);
    float dividend_scale = uint32_as_float (float_as_uint32 (dividend) & FP32_SIGN_EXPO_MASK);
    float refined_recip = fmaf (recip_approx,recip_err,recip_approx);
    float quotient_mant = refined_recip * dividend_mant;
    float quotient_residual = fmaf (quotient_mant,dividend_mant);
    float refined_quotient_mant = fmaf (refined_recip,quotient_residual,quotient_mant);
    float refined_quotient_residual = fmaf (refined_quotient_mant,dividend_mant);
    float final_quotient_mant = fmaf (refined_recip,refined_quotient_residual,refined_quotient_mant);
    float final_quotient = final_quotient_mant * dividend_scale;

    // check if we need to apply the slow path and invoke it if that is the case
    uint32_t id = float_as_uint32 (divisor) & ~FP32_SIGN_MASK;
    uint32_t iq = float_as_uint32 (final_quotient) & ~FP32_SIGN_MASK;
    if ((id > FP32_HI_LIMIT) || ((iq - FP32_LO_LIMIT) > (FP32_HI_LIMIT - FP32_LO_LIMIT))) {
        final_quotient = fp32_div_slowpath (dividend,divisor);
    }

    return final_quotient;
}

// Fixes via: Greg Rose,KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC  ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)

int main (void)
{
    uint64_t count = 0;
    float divisor,dividend,res,ref;

    do {
        if (!(count & 0xffffffULL)) printf ("\rcount: %llu",count);

        dividend = uint32_as_float (KISS);
        divisor  = uint32_as_float (KISS);
        res = fp32_div (dividend,divisor);
        ref = dividend / divisor;
        if (float_as_uint32 (res) != float_as_uint32 (ref)) {
            printf ("error @ dividend=% 16.6a divisor=% 16.6a (% 15.8e,% 15.8e): res=% 16.6a ref=% 16.6a (% 15.8e,% 15.8e)\n",divisor,ref,ref);
            return EXIT_FAILURE;
        }

        count++;
    } while (count);
    return EXIT_SUCCESS;
}

重新熟悉这个问题后的一些附加说明。如果硬件的倒数逼近指令将次正规数视为零，则是有利的。这允许在调用慢路径代码时消除除数大小检查，因为任何大于 2¹²⁶ 的除数将导致倒数为零和商为无穷大，从而触发商检查。

至于商检查，任何代表正态的快速路径结果都可以接受，除了可能代表次正态结果的不正确四舍五入的最小正态。换句话说，绝对值在[0x1.000002p-126,0x1.fffffep+127]内的商不应该触发慢路径代码的重新计算。

只要使用足够准确的倒数，就不需要商细化步骤。涵盖 [1,2] 中的所有被除数和 [1,2] 中的所有除数以及有效数（尾数）模式的所有可能组合的详尽测试在现代硬件中是可行的，需要 2⁴⁶ 测试案件。即使在单个 CPU 内核上运行标量代码，它也能在不到四天的时间内完成，没有报告任何错误。

在实际使用中，在可能的情况下，我们希望强制编译器内联 fp32_div()，同时强制 fp_div_slowpath() 进入被调用的子例程。

下面我使用上面讨论的简化设计了一个基于 AVX 的单精度除法版本。它通过了我所有的测试。由于基于 AVX 的硬件倒数精度较低，因此需要进行哈雷迭代以将倒数细化为完全单精度，从而提供最大误差接近 0.5 ulp 的结果。

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "immintrin.h"

#define MODE_RANDOM           (1)
#define MODE_MANT_EXHAUSTIVE  (2)
#define TEST_MODE             (MODE_RANDOM)

float uint32_as_float (uint32_t a) { float r; memcpy (&r,sizeof r); return r; }
uint32_t float_as_uint32 (float a) { uint32_t r; memcpy (&r,sizeof r); return r; }
float fp32_div_slowpath (float dividend,float divisor) { return dividend / divisor; }

/* 12-bit approximation. Subnormal arguments and results are treated as zero */
float rcp_approx_avx (float divisor)
{
    float r;
    __m256 t = _mm256_set1_ps (divisor);
    t = _mm256_rcp_ps (t);
    memcpy (&r,&t,sizeof r);
    return r;
}

float fp32_div (float dividend,float divisor)
{
    const uint32_t FP32_MANT_MASK = 0x007fffffu;
    const uint32_t FP32_ONE = 0x3f800000u;
    const uint32_t FP32_SIGN_MASK = 0x80000000u;
    const uint32_t FP32_SIGN_EXPO_MASK = 0xff800000u;
    const uint32_t FP32_QUOTIENT_HI_LIMIT = 0x7f7fffffu; // 0x1.0p+128 - ulp
    const uint32_t FP32_QUOTIENT_LO_LIMIT = 0x00800001u; // 0x1.0p-126 + ulp

    // fast path
    float recip_approx = rcp_approx_avx (divisor);
    float recip_err = fmaf (recip_approx,1.0f);
    float recip_err2 = fmaf (recip_err,recip_err);
    float refined_recip = fmaf (recip_approx,recip_err2,recip_approx);
    float dividend_mant = uint32_as_float ((float_as_uint32 (dividend) & FP32_MANT_MASK) | FP32_ONE);
    float dividend_scale = uint32_as_float (float_as_uint32 (dividend) & FP32_SIGN_EXPO_MASK);
    float refined_quotient_mant = refined_recip * dividend_mant;
    float refined_quotient_residual = fmaf (refined_quotient_mant,refined_quotient_mant);
    float final_quotient = final_quotient_mant * dividend_scale;

    // check if we need to apply the slow path and invoke it if that is the case
    uint32_t iq = float_as_uint32 (final_quotient) & ~FP32_SIGN_MASK;
    if ((iq - FP32_QUOTIENT_LO_LIMIT) > (FP32_QUOTIENT_HI_LIMIT - FP32_QUOTIENT_LO_LIMIT)) {
        final_quotient = fp32_div_slowpath (dividend,divisor);
    }
    return final_quotient;
}

// Fixes via: Greg Rose,ref;

#if TEST_MODE == MODE_RANDOM
    do {
        if (!(count & 0xffffffULL)) printf ("\rcount: %llu",ref);
            return EXIT_FAILURE;
        }

        count++;
    } while (count);
#else
    for (dividend = 1.0f; dividend <= 2.0f; dividend = uint32_as_float (float_as_uint32 (dividend) + 1)) {
        printf ("\rdividend=%08x",float_as_uint32 (dividend));
        for (divisor = 1.0f; divisor <= 2.0f; divisor = uint32_as_float (float_as_uint32 (divisor) + 1)) {
            res = fp32_div (dividend,divisor);
            ref = dividend / divisor;
            if (float_as_uint32 (res) != float_as_uint32 (ref)) {
                printf ("error @ dividend=% 16.6a divisor=% 16.6a (% 15.8e,ref);
                return EXIT_FAILURE;
            }
        }
    }
#endif

    return EXIT_SUCCESS;
}

NVIDIA GPU 的相应 CUDA 代码（已测试）如下所示：

__noinline__ __device__ float fp32_div_slowpath (float dividend,float divisor) 
{ 
    return dividend / divisor; 
}

/* Subnormal arguments and results are treated as zero */
__forceinline__ __device__ float rcp_approx_gpu (float divisor)
{
    float r;
    asm ("rcp.approx.ftz.f32 %0,%1;\n\t" : "=f"(r) : "f"(divisor));
    return r;
}

__forceinline__ __device__ float fp32_div (float dividend,float divisor)
{
    const uint32_t FP32_MANT_MASK = 0x007fffffu;
    const uint32_t FP32_ONE = 0x3f800000u;
    const uint32_t FP32_SIGN_MASK = 0x80000000u;
    const uint32_t FP32_SIGN_EXPO_MASK = 0xff800000u;
    const uint32_t FP32_QUOTIENT_HI_LIMIT = 0x7f7fffffu; // 0x1.0p+128 - ulp
    const uint32_t FP32_QUOTIENT_LO_LIMIT = 0x00800001u; // 0x1.0p-126 + ulp

    // fast path
    float recip_approx = rcp_approx_gpu (divisor);
    float recip_err = fmaf (recip_approx,1.0f);
    float refined_recip = fmaf (recip_approx,recip_approx);
    float dividend_mant = __int_as_float ((__float_as_int (dividend) & FP32_MANT_MASK) | FP32_ONE);
    float dividend_scale = __int_as_float (__float_as_int (dividend) & FP32_SIGN_EXPO_MASK);
    float refined_quotient_mant = refined_recip * dividend_mant;
    float refined_quotient_residual = fmaf (refined_quotient_mant,refined_quotient_mant);
    float final_quotient = final_quotient_mant * dividend_scale;

    // check if we need to apply the slow path and invoke it if that is the case
    uint32_t iq = __float_as_int (final_quotient) & ~FP32_SIGN_MASK;
    if ((iq - FP32_QUOTIENT_LO_LIMIT) > (FP32_QUOTIENT_HI_LIMIT - FP32_QUOTIENT_LO_LIMIT)) {
        final_quotient = fp32_div_slowpath (dividend,divisor);
    }
    return final_quotient;
}

如何在支持 FMA 的 GPU 上优化浮点除法？

zhang9987 回答：如何在支持 FMA 的 GPU 上优化浮点除法？

大家都在问