我在一个紧凑循环(tight loop)中实现了双线性插值,并尝试用 SSE 对其进行优化,但没有获得任何加速。
代码如下。非 SIMD 版本使用一个简单的向量结构,可定义为 struct Vec3f { float x, y, z; };,并且已经为它实现了乘法和加法运算符:
- #ifdef USE_SIMD
- // SSE path (MSVC inline assembly): blend the four pixels surrounding (x, y).
- // Fetch the four neighbouring pixels from the cache.
- const Color c11 = pixelCache[y1 * size.x + x1];
- const Color c12 = pixelCache[y2 * size.x + x1];
- const Color c22 = pixelCache[y2 * size.x + x2];
- const Color c21 = pixelCache[y1 * size.x + x2];
- // One pixel per 16-byte-aligned vector (movaps needs aligned operands); lane order is { 1.0, B, G, R }.
- __declspec(align(16)) float mc11[4] = { 1.0,c11.GetB(),c11.GetG(),c11.GetR() };
- __declspec(align(16)) float mc12[4] = { 1.0,c12.GetB(),c12.GetG(),c12.GetR() };
- __declspec(align(16)) float mc22[4] = { 1.0,c22.GetB(),c22.GetG(),c22.GetR() };
- __declspec(align(16)) float mc21[4] = { 1.0,c21.GetB(),c21.GetG(),c21.GetR() };
- // scalars in vector form for SSE
- // NOTE(review): the weights are not normalised, so presumably x2 == x1 + 1 and y2 == y1 + 1 - confirm.
- const float s11 = (x2-x)*(y2-y);
- const float s12 = (x2-x)*(y-y1);
- const float s22 = (x-x1)*(y-y1);
- const float s21 = (x-x1)*(y2-y);
- // The weight must be present in all three colour lanes (indices 1..3). Trailing elements
- // without an initialiser are zero-filled, which would multiply the R lane by 0.
- __declspec(align(16)) float ms11[4] = {1.0,s11,s11,s11};
- __declspec(align(16)) float ms12[4] = {1.0,s12,s12,s12};
- __declspec(align(16)) float ms22[4] = {1.0,s22,s22,s22};
- __declspec(align(16)) float ms21[4] = {1.0,s21,s21,s21};
- __asm {
- // load the four pixels ...
- movaps xmm0,mc11
- movaps xmm1,mc12
- movaps xmm2,mc22
- movaps xmm3,mc21
- // ... and their weights
- movaps xmm4,ms11
- movaps xmm5,ms12
- movaps xmm6,ms22
- movaps xmm7,ms21
- // scale every pixel by its weight
- mulps xmm0,xmm4
- mulps xmm1,xmm5
- mulps xmm2,xmm6
- mulps xmm3,xmm7
- // accumulate the four weighted pixels in xmm0
- addps xmm0,xmm1
- addps xmm0,xmm2
- addps xmm0,xmm3
- // write the result back over mc11
- movaps mc11,xmm0
- }
- // NOTE(review): the blended pixel is left in mc11[1..3] (B, G, R); unlike the scalar branch
- // below, no `colour` variable is produced here - confirm how the caller picks it up.
- #else
- // Scalar path: convert the four neighbours to float vectors and sum them with the same weights.
- const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
- const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
- const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
- const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
- const Vec3f colour =
- c11*(x2-x)*(y2-y) +
- c21*(x-x1)*(y2-y) +
- c12*(x2-x)*(y-y1) +
- c22*(x-x1)*(y-y1);
- #endif
重新排列汇编代码以复用寄存器(最终只用到三个 xmm 寄存器)也没有带来任何变化。我还试过使用编译器内建函数(intrinsics):
- // perform bilinear interpolation
- // Fetch the four neighbouring pixels and convert them to float vectors.
- const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
- const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
- const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
- const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
- // scalars in vector form for SSE
- const float s11 = (x2-x)*(y2-y);
- const float s12 = (x2-x)*(y-y1);
- const float s22 = (x-x1)*(y-y1);
- const float s21 = (x-x1)*(y2-y);
- // _mm_set_ps takes its arguments from the highest lane down, so lanes 0..2 hold r, g, b and lane 3 holds 1.
- __m128 mc11 = _mm_set_ps(1.f,c11.b,c11.g,c11.r);
- __m128 mc12 = _mm_set_ps(1.f,c12.b,c12.g,c12.r);
- __m128 mc22 = _mm_set_ps(1.f,c22.b,c22.g,c22.r);
- __m128 mc21 = _mm_set_ps(1.f,c21.b,c21.g,c21.r);
- // _mm_set_ps needs exactly four arguments: the weight goes in the three colour lanes, 1 in the unused top lane.
- __m128 ms11 = _mm_set_ps(1.f,s11,s11,s11);
- __m128 ms12 = _mm_set_ps(1.f,s12,s12,s12);
- __m128 ms22 = _mm_set_ps(1.f,s22,s22,s22);
- __m128 ms21 = _mm_set_ps(1.f,s21,s21,s21);
- // Scale each pixel by its weight ...
- mc11 = _mm_mul_ps(mc11,ms11);
- mc12 = _mm_mul_ps(mc12,ms12);
- mc22 = _mm_mul_ps(mc22,ms22);
- mc21 = _mm_mul_ps(mc21,ms21);
- // ... and accumulate the four weighted pixels in mc11.
- mc11 = _mm_add_ps(mc11,mc12);
- mc11 = _mm_add_ps(mc11,mc22);
- mc11 = _mm_add_ps(mc11,mc21);
- Vec3f colour;
- // NOTE(review): _mm_storeu_ps writes four floats; colour.array must have room for four - confirm.
- _mm_storeu_ps(colour.array,mc11);
但同样无济于事。是我遗漏了什么,还是这里根本不可能再获得额外的速度提升?
解决方法
为什么要用浮点运算?
假设 a、b、c、d 是打包的 ARGB 像素,xerr 和 yerr 的取值范围为 0–256,下面是一个简单的例子:
- // =================================================================================================================
- // xs_Bilerp
- //
- // Integer-only bilinear blend of four packed 32-bit pixels.
- //   a, b       : blended with each other by xerr (xerr == 0 yields a)
- //   c, d       : blended with each other by xerr (xerr == 0 yields c)
- //   xerr, yerr : blend factors; the accompanying text gives their range as 0-256
- // The two intermediate results are then blended by yerr (yerr == 0 yields the a/b result).
- // The bytes selected by xs_rbmask and those selected by xs_agmask are handled as two pairs, so one
- // multiply scales two bytes at once. Negative differences rely on unsigned wrap-around
- // (assuming uint32 is an unsigned 32-bit type).
- // =================================================================================================================
- finline uint32 xs_Bilerp (uint32 a,uint32 b,uint32 c,uint32 d,uint32 xerr,uint32 yerr)
- {
- #define xs_rbmask 0x00ff00ff
- #define xs_agmask 0xff00ff00
-     // Nothing to blend when all four pixels are identical.
-     if (a == b && b == c && c == d) return a;
-     // Bytes under xs_rbmask: the product is shifted down by 8 to undo the 0-256 scale.
-     const uint32 rbA = a & xs_rbmask;
-     const uint32 rbC = c & xs_rbmask;
-     const uint32 rbStepAB = (b & xs_rbmask) - rbA;
-     const uint32 rbStepCD = (d & xs_rbmask) - rbC;
-     const uint32 rbRowAB = (rbA + ((rbStepAB * xerr) >> 8)) & xs_rbmask;
-     const uint32 rbRowCD = (rbC + ((rbStepCD * xerr) >> 8)) & xs_rbmask;
-     const uint32 rbStepY = rbRowCD - rbRowAB;
-     const uint32 rbOut = (rbRowAB + ((rbStepY * yerr) >> 8)) & xs_rbmask;
-     // Bytes under xs_agmask: differences are taken on values shifted down by 8,
-     // so the product lands back in position without a further shift.
-     const uint32 agA = a & xs_agmask;
-     const uint32 agC = c & xs_agmask;
-     const uint32 agStepAB = ((b & xs_agmask) >> 8) - (agA >> 8);
-     const uint32 agStepCD = ((d & xs_agmask) >> 8) - (agC >> 8);
-     const uint32 agRowAB = (agA + (agStepAB * xerr)) & xs_agmask;
-     const uint32 agRowCD = (agC + (agStepCD * xerr)) & xs_agmask;
-     const uint32 agStepY = (agRowCD >> 8) - (agRowAB >> 8);
-     const uint32 agOut = (agRowAB + (agStepY * yerr)) & xs_agmask;
-     // Recombine the two byte pairs into one packed pixel.
-     return agOut | rbOut;
- }