c – SSE双线性插值

前端之家收集整理的这篇文章主要介绍了“c – SSE双线性插值”,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
我正在紧密循环中实现双线性插值并尝试使用SSE对其进行优化,但我从中获得零加速.

这是代码。非SIMD版本使用一个简单的向量结构,可以定义为 struct Vec3f { float x,y,z; },并且已经为它实现了乘法和加法运算符:

  1. #ifdef USE_SIMD
  2. const Color c11 = pixelCache[y1 * size.x + x1];
  3. const Color c12 = pixelCache[y2 * size.x + x1];
  4. const Color c22 = pixelCache[y2 * size.x + x2];
  5. const Color c21 = pixelCache[y1 * size.x + x2];
  6.  
  7. __declspec(align(16)) float mc11[4] = { 1.0,c11.GetB(),c11.GetG(),c11.GetR() };
  8. __declspec(align(16)) float mc12[4] = { 1.0,c12.GetB(),c12.GetG(),c12.GetR() };
  9. __declspec(align(16)) float mc22[4] = { 1.0,c22.GetB(),c22.GetG(),c22.GetR() };
  10. __declspec(align(16)) float mc21[4] = { 1.0,c21.GetB(),c21.GetG(),c21.GetR() };
  11.  
  12. // scalars in vector form for SSE
  13. const float s11 = (x2-x)*(y2-y);
  14. const float s12 = (x2-x)*(y-y1);
  15. const float s22 = (x-x1)*(y-y1);
  16. const float s21 = (x-x1)*(y2-y);
  17.  
  18. __declspec(align(16)) float ms11[4] = {1.0,s11,s11};
  19. __declspec(align(16)) float ms12[4] = {1.0,s12,s12};
  20. __declspec(align(16)) float ms22[4] = {1.0,s22,s22};
  21. __declspec(align(16)) float ms21[4] = {1.0,s21,s21};
  22.  
  23. __asm {
  24. movaps xmm0,mc11
  25. movaps xmm1,mc12
  26. movaps xmm2,mc22
  27. movaps xmm3,mc21
  28.  
  29. movaps xmm4,ms11
  30. movaps xmm5,ms12
  31. movaps xmm6,ms22
  32. movaps xmm7,ms21
  33.  
  34. mulps xmm0,xmm4
  35. mulps xmm1,xmm5
  36. mulps xmm2,xmm6
  37. mulps xmm3,xmm7
  38.  
  39. addps xmm0,xmm1
  40. addps xmm0,xmm2
  41. addps xmm0,xmm3
  42.  
  43. movaps mc11,xmm0
  44. }
  45. #else
  46. const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
  47. const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
  48. const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
  49. const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
  50.  
  51. const Vec3f colour =
  52. c11*(x2-x)*(y2-y) +
  53. c21*(x-x1)*(y2-y) +
  54. c12*(x2-x)*(y-y1) +
  55. c22*(x-x1)*(y-y1);
  56. #endif

重新排列asm代码以重用寄存器(最后只用到三个xmm寄存器)并没有产生任何影响。我也试过使用内部函数(intrinsics):

  1. // perform bilinear interpolation
  2. const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
  3. const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
  4. const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
  5. const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
  6.  
  7. // scalars in vector form for SSE
  8. const float s11 = (x2-x)*(y2-y);
  9. const float s12 = (x2-x)*(y-y1);
  10. const float s22 = (x-x1)*(y-y1);
  11. const float s21 = (x-x1)*(y2-y);
  12.  
  13. __m128 mc11 = _mm_set_ps(1.f,c11.b,c11.g,c11.r);
  14. __m128 mc12 = _mm_set_ps(1.f,c12.b,c12.g,c12.r);
  15. __m128 mc22 = _mm_set_ps(1.f,c22.b,c22.g,c22.r);
  16. __m128 mc21 = _mm_set_ps(1.f,c21.b,c21.g,c21.r);
  17.  
  18. __m128 ms11 = _mm_set_ps(1.f,s11);
  19. __m128 ms12 = _mm_set_ps(1.f,s12);
  20. __m128 ms22 = _mm_set_ps(1.f,s22);
  21. __m128 ms21 = _mm_set_ps(1.f,s21);
  22.  
  23. mc11 = _mm_mul_ps(mc11,ms11);
  24. mc12 = _mm_mul_ps(mc12,ms12);
  25. mc22 = _mm_mul_ps(mc22,ms22);
  26. mc21 = _mm_mul_ps(mc21,ms21);
  27.  
  28. mc11 = _mm_add_ps(mc11,mc12);
  29. mc11 = _mm_add_ps(mc11,mc22);
  30. mc11 = _mm_add_ps(mc11,mc21);
  31.  
  32. Vec3f colour;
  33. _mm_storeu_ps(colour.array,mc11);

并无济于事.我错过了什么,或者在这里不可能获得任何额外的速度?

解决方法

为何浮点?
给定a,b,c,d四个打包的argb像素,以及范围在0-256之间的xerr和yerr,一个简单的例子是:
  1. // =================================================================================================================
  2. // xs_Bilerp
  3. // =================================================================================================================
  4. finline uint32 xs_Bilerp (uint32 a,uint32 b,uint32 c,uint32 d,uint32 xerr,uint32 yerr)
  5. {
  6. #define xs_rbmask 0x00ff00ff
  7. #define xs_agmask 0xff00ff00
  8.  
  9. if (a==b && c==d && a==d) return a;
  10.  
  11. const uint32 arb = a & xs_rbmask;
  12. const uint32 crb = c & xs_rbmask;
  13. const uint32 aag = a & xs_agmask;
  14. const uint32 cag = c & xs_agmask;
  15.  
  16. const uint32 rbdx1 = (b & xs_rbmask) - arb;
  17. const uint32 rbdx2 = (d & xs_rbmask) - crb;
  18. const uint32 agdx1 = ((b & xs_agmask)>>8) - (aag >> 8);
  19. const uint32 agdx2 = ((d & xs_agmask)>>8) - (cag >> 8);
  20.  
  21. const uint32 rb1 = (arb + ((rbdx1 * xerr) >> 8)) & xs_rbmask;
  22. const uint32 ag1 = (aag + ((agdx1 * xerr) )) & xs_agmask;
  23. const uint32 rbdy = ((crb + ((rbdx2 * xerr) >> 8)) & xs_rbmask) - rb1;
  24. const uint32 agdy = (((cag + ((agdx2 * xerr) )) & xs_agmask)>>8) - (ag1 >> 8);
  25.  
  26. const uint32 rb = (rb1 + ((rbdy * yerr) >> 8)) & xs_rbmask;
  27. const uint32 ag = (ag1 + ((agdy * yerr) )) & xs_agmask;
  28.  
  29. return ag | rb;
  30. }

猜你在找的C&C++相关文章