如何在 GPU 上按常数找到除法的魔法乘数？

我正在考虑实现以下计算，其中 divisor 是非零且不是 2 的幂

unsigned multiplier(unsigned divisor)
{
    unsigned shift = 31 - clz(divisor);
    uint64_t t = 1ull << (32 + shift);

    return t / div;
}

这种方式对于缺少 64 位整数和浮点指令但可能具有 32 位融合乘加运算的处理器（例如 GPU，也将缺少除法）有效。

此计算对于查找优化除法中涉及的“魔术乘法器”很有用，当除数提前已知时，乘高指令然后按位移位。与编译器和 reference code in libdivide 中使用的代码不同，它会找到最大的此类乘数。

另外一个变化是，在我正在查看的应用程序中，我预计 divisor 几乎总是可以用 float 类型表示。因此，拥有一条处理这些除数的高效“快速路径”和一条处理其余部分的大小优化的“慢速路径”将是有意义的。

我提出的解决方案在“快速路径”上的 6 或 8 个 FMA 操作中执行专门针对此特定场景（股息是 2 的幂）的余数的长除法，然后使用 8 执行二进制搜索“慢路径”上的迭代。

以下程序对建议的解决方案进行了详尽的测试（在支持 FMA 的 CPU 上需要大约 1-2 分钟）。

#include <math.h>
#include <stdint.h>
#include <stdio.h>

struct quomod {
    unsigned long quo;
    unsigned long mod;
};

// Divide 1 << (32 + SHIFT) by DIV,return quotient and modulus
struct quomod
quomod_ref(unsigned div,unsigned shift)
{
    uint64_t t = 1ull << (32 + shift);

    return (struct quomod){t / div,t % div};
}

// Reinterpret given bits as float
static inline float int_as_float(uint32_t bits)
{
    return (union{ unsigned b; float f; }){bits}.f;
}

// F contains integral value in range [-2**32 .. 2**32]. Convert it to integer,// with wrap-around on overflow. If the GPU implements saturating conversion,// it also may be used
static inline uint32_t cvt_f32_u32_wrap(float f)
{
    return (uint32_t)(long long)f;
}

struct quomod
quomod_alt(unsigned div,unsigned shift)
{
    // t = float(1ull << (32 + shift))
    float t = int_as_float(0x4f800000 + (shift << 23));

    // mask with max(0,shift - 23) low bits zero
    uint32_t mask = (int)(~0u << shift) >> 23;

    // No roundoff in conversion
    float div_f = div & mask;

    // Caution: on the CPU this is correctly rounded,but on the GPU
    // native reciprocal may be off by a few ULP,in which case a
    // refinement step may be necessary:
    // recip = fmaf(fmaf(recip,-div_f,1),recip,recip)
    float recip = 1.f / div_f;

    // Higher part of the quotient,integer in range 2^31 .. 2^32
    float quo_hi = t * recip;

    // No roundoff
    float res = fmaf(quo_hi,t);

    float quo_lo_approx = res * recip;

    float res2 = fmaf(quo_lo_approx,res);

    // Lower part of the quotient,may be negative
    float quo_lo = floorf(fmaf(res2,quo_lo_approx));

    // Remaining part of the dividend
    float mod_f = fmaf(quo_lo,res);

    // Quotient as sum of parts
    unsigned quo = cvt_f32_u32_wrap(quo_hi) + (int)quo_lo;

    // Adjust quotient down if remainder is negative
    if (mod_f < 0) {
        quo--;
    }

    if (div & ~mask) {
        // The quotient was computed for a truncated divisor,so
        // it matches or exceeds the true result

        // High part of the dividend
        uint32_t ref_hi = 1u << shift;

        // Unless quotient is zero after wraparound,increment it so
        // it's higher than true quotient (its high bit must be 1)
        quo -= (int)quo >> 31;

        // Binary search for the true quotient; search invariant:
        // quo is higher than true quotient,quo-2*bit is lower
        for (unsigned bit = 256; bit; bit >>= 1) {
            unsigned try = quo - bit;
            // One multiply-high instruction
            uint32_t prod_hi = 1ull * try * div >> 32;
            if (prod_hi >= ref_hi)
                quo = try;
        }
        // quo is zero or exceeds the true quotient,so quo-1 must be it
        quo--;
    }

    // Use the "left-pointing short magic wand" operator
    // to recover the remainder
    return (struct quomod){quo,quo *- div};
}

int main()
{
    fprintf(stderr,"%66c\r[",']');
    unsigned step = 1;
    for (unsigned div = 3; div; div += step) {
        // Progress bar
        if (!(div & 0x03ffffff)) fprintf(stderr,"=");
        // Skip powers of two
        if (!(div & (div-1))) continue;
        unsigned shift = 31 - __builtin_clz(div);

        struct quomod ref = quomod_ref(div,shift);
        struct quomod alt = quomod_alt(div,shift);

        if (ref.quo != alt.quo || ref.mod != alt.mod) {
            printf("\nerror at %u\n",div);
            return 1;
        }
    }
    fprintf(stderr,"=\nAll ok\n");
    return 0;
}

如何在 GPU 上按常数找到除法的魔法乘数？

dymd005 回答：如何在 GPU 上按常数找到除法的魔法乘数？

大家都在问