bearbecue, le 09/06/2011 à 14:13
// Fast version.
// Walks 'texcoords' as a flat float stream (2 floats consumed per sample: x, y) and writes
// 4 floats per sample into 'outputSamples'. One texel is fetched per texcoord pair.
const float * __restrict srcTexcoords = reinterpret_cast<const float*>(texcoords.Data());
float * __restrict dstSamples = reinterpret_cast<float*>(outputSamples.Data());
// One-past-the-end of the output stream: 4 floats per output sample.
float *dstStop = dstSamples + outputSamples.Count() * 4;
// Dimensions replicated as (w, h, w, h) so that one vector scales two (x, y) pairs at once.
SIMD::Float4 txDims = SIMD::Float4(m_Dimensions.x(), m_Dimensions.y(), m_Dimensions.x(), m_Dimensions.y());
// (w-1, h-1, w-1, h-1), forwarded to the cell-index helpers.
SIMD::Float4 txDimsM1 = SIMD::Float4(m_Dimensions.x() - 1, m_Dimensions.y() - 1, m_Dimensions.x() - 1, m_Dimensions.y() - 1);
if (tcMode == TexcoordMode_Wrap)
{
    // strided memviews: ~50/51 fps
    // single contiguous loop: ~53/54 fps
    // unrolled x4 loop: ~59/60 fps

    // Loop-invariant constant, shared by the unrolled loop and the tail loop.
    const SIMD::Float4 one(1.0f);

    // End of the last complete group of 4 samples (4 samples * 4 floats each).
    // Computed from the sample count instead of 'dstStop - 16': the subtraction would form a
    // pointer before the start of the output buffer whenever there are fewer than 4 samples.
    float *dstStopX4 = dstSamples + (outputSamples.Count() / 4) * (4*4);
    while (dstSamples < dstStopX4)
    {
        // Two 4-wide loads = four (x, y) pairs.
        SIMD::Float4 x0y0x1y1 = SIMD::Float4::LoadUnaligned(srcTexcoords + 0);
        SIMD::Float4 x2y2x3y3 = SIMD::Float4::LoadUnaligned(srcTexcoords + 4);
        srcTexcoords += 2*4;

        // Wrap: take the fractional part, then add 1 to every lane whose ORIGINAL coordinate
        // compared lower than zero (the mask is taken before Frac() overwrites the value).
        // NOTE(review): assumes MaskLower() yields an all-bits-set lane mask and that Frac()
        // returns a non-positive fraction for negative inputs -- confirm against SIMD::Float4.
        SIMD::Float4 maskLZ_x0y0x1y1 = x0y0x1y1.MaskLower(SIMD::Float4::Zero());
        SIMD::Float4 maskLZ_x2y2x3y3 = x2y2x3y3.MaskLower(SIMD::Float4::Zero());
        x0y0x1y1 = SIMD::Frac(x0y0x1y1);
        x2y2x3y3 = SIMD::Frac(x2y2x3y3);
        x0y0x1y1 += maskLZ_x0y0x1y1 & one;
        x2y2x3y3 += maskLZ_x2y2x3y3 & one;

        // Scale the wrapped coordinates by the texture dimensions.
        x0y0x1y1 *= txDims;
        x2y2x3y3 *= txDims;

        // Each helper call resolves two coordinate pairs into two cell indices.
        hh_u32 cell0, cell1, cell2, cell3;
        _Find2DCellIndexX2(x0y0x1y1, txDimsM1, m_Dimensions.x(), cell0, cell1);
        _Find2DCellIndexX2(x2y2x3y3, txDimsM1, m_Dimensions.x(), cell2, cell3);

        SIMD::Float4 texelRGBA0 = _LoadBGRA8_AndReturnRGBA32F(m_Pixels + cell0 * m_PixelSizeInBytes);
        SIMD::Float4 texelRGBA1 = _LoadBGRA8_AndReturnRGBA32F(m_Pixels + cell1 * m_PixelSizeInBytes);
        SIMD::Float4 texelRGBA2 = _LoadBGRA8_AndReturnRGBA32F(m_Pixels + cell2 * m_PixelSizeInBytes);
        SIMD::Float4 texelRGBA3 = _LoadBGRA8_AndReturnRGBA32F(m_Pixels + cell3 * m_PixelSizeInBytes);

        // Second argument is presumably a byte offset from 'dstSamples' (0x10 = 4 floats) -- confirm.
        texelRGBA0.StoreAligned16(dstSamples, 0x00);
        texelRGBA1.StoreAligned16(dstSamples, 0x10);
        texelRGBA2.StoreAligned16(dstSamples, 0x20);
        texelRGBA3.StoreAligned16(dstSamples, 0x30);
        dstSamples += 4*4;
    }

    // Tail: the remaining 0..3 samples, one at a time.
    while (dstSamples < dstStop)
    {
        // Only 2 floats belong to this sample. Stage them in a zero-padded local array so the
        // 4-wide load never reads beyond the sample's own data; loading straight from
        // 'srcTexcoords' would read 2 floats past the end of the stream on the last sample.
        // The two upper lanes previously held whatever followed in memory, so the code below
        // presumably only relies on the two lower lanes -- confirm against _Find2DCellIndex.
        const float xyPadded[4] = { srcTexcoords[0], srcTexcoords[1], 0.0f, 0.0f };
        SIMD::Float4 xy = SIMD::Float4::LoadUnaligned(xyPadded);
        srcTexcoords += 2;

        // Same wrap + scale sequence as the unrolled loop.
        SIMD::Float4 maskLZ = xy.MaskLower(SIMD::Float4::Zero());
        xy = SIMD::Frac(xy);
        xy += maskLZ & one;
        xy *= txDims;

        hh_u32 cell = _Find2DCellIndex(xy, txDimsM1, m_Dimensions.x());
        SIMD::Float4 texelRGBA = _LoadBGRA8_AndReturnRGBA32F(m_Pixels + cell * m_PixelSizeInBytes);
        texelRGBA.StoreAligned16(dstSamples);
        dstSamples += 4;
    }
}
else if (tcMode == TexcoordMode_WrapMirror)
{