#if (HH_SIMD == HH_SIMD_SSE) && (HH_SIMD_VERSION >= HH_SSE2) && defined(USE_EXPANDED_TEXTURE_FOR_BILERP)
// SSE2 fast path over an "expanded" texture layout: each cell stores the full
// 2x2 texel footprint needed for one bilinear fetch (4 * kBGRA8PixelSizeInBytes
// bytes, see texel_A/texel_B below), so one aligned 16-byte load fetches all
// four texels of a sample.
// process 2 sample (8 texels) per iteration
float *dstStopX2 = dstStop - 2*4; // stop early enough that a full 2-sample (2x4 floats) store fits
while (dstSamples <= dstStopX2)
{
// Load two (x, y) texcoord pairs at once: lanes = [xA yA xB yB].
SIMD::Float4 xy_AB = SIMD::Float4::LoadUnaligned(srcTexcoords + 0);
srcTexcoords += 4;
// Fractional part of the texel-space coordinate = bilinear weights (tx, ty) per sample.
// NOTE(review): txDims vs txDimsV — presumably both are the texture dimensions
// (possibly differing by a half-texel/centering bias); confirm against the setup code.
SIMD::Float4 cursor_AB = SIMD::Frac(xy_AB * txDims);
xy_AB *= txDimsV;
// Truncate to integer texel coordinates, then wrap via AND
// (implies power-of-two dimensions — TODO confirm wrapMask construction).
__m128i xy_i_AB = _mm_cvttps_epi32(xy_AB._xmm());
__m128i xy_iw_AB = _mm_and_si128(xy_i_AB, wrapMask._xmm_epi32());
// Pack each sample's 32-bit X and Y into two adjacent 16-bit lanes so a single
// 32-bit extract yields (Y << 16) | X.
__m128i packed_A = _mm_shufflelo_epi16(xy_iw_AB, HH_MM_SHUFFLE(x, z, y, w)); // [...][0000][0Yyy][0000][Xxxx] -> [...][0000][0000][0Yyy][Xxxx]
__m128i packed_B = _mm_shufflehi_epi16(xy_iw_AB, HH_MM_SHUFFLE(x, z, y, w)); // [0000][0Yyy][0000][Xxxx][...] -> [0000][0000][0Yyy][Xxxx][...]
packed_B = _mm_shuffle_epi32(packed_B, HH_MM_SHUFFLE(z, w, x, y)); // move sample B's packed pair into lane 0 for the extract below
// Shift the packed (Y<<16)|X down into a linear cell index.
// NOTE(review): correctness depends on how yStrideShift relates to the row
// stride of the expanded texture — established outside this loop; confirm there.
hh_u32 cell_A = _mm_cvtsi128_si32(packed_A) >> yStrideShift;
hh_u32 cell_B = _mm_cvtsi128_si32(packed_B) >> yStrideShift;
// Each cell in the expanded texture holds 4 texels (the 2x2 quad) back to back.
hh_u8 *texel_A = m_PixelsX4 + cell_A * (4 * kBGRA8PixelSizeInBytes);
hh_u8 *texel_B = m_PixelsX4 + cell_B * (4 * kBGRA8PixelSizeInBytes);
// Scale weights to fixed point (k16384 presumably == 2^14; the <<2 + mulhi
// sequence below effectively divides by 2^14, matching that scale).
cursor_AB *= k16384;
// One aligned load per sample fetches its full 2x2 texel quad.
__m128i t0t1t2t3_A = _mm_load_si128(reinterpret_cast<const __m128i*>(texel_A));
__m128i t0t1t2t3_B = _mm_load_si128(reinterpret_cast<const __m128i*>(texel_B));
// Weights to integers, then broadcast each sample's tx across all 16-bit lanes.
__m128i t32_AB = _mm_cvttps_epi32(cursor_AB._xmm());
__m128i t16_A = _mm_shufflelo_epi16(t32_AB, HH_MM_SHUFFLE(x,x,z,z)); // xxyy____
__m128i t16_B = _mm_shufflehi_epi16(t32_AB, HH_MM_SHUFFLE(x,x,z,z)); // ____xxyy
__m128i tx16_A = _mm_shuffle_epi32(t16_A, HH_MM_SHUFFLE(x,x,x,x));
__m128i tx16_B = _mm_shuffle_epi32(t16_B, HH_MM_SHUFFLE(z,z,z,z));
const __m128i _zero = _mm_setzero_si128();
// Gather tyA into the low half and tyB into the high half of one register.
// Uses an int/float union to reach _mm_shuffle_ps from integer data.
__m128fi ty16A;
__m128fi ty16B;
ty16A.i = t16_A;
ty16B.i = t16_B;
ty16A.f = _mm_shuffle_ps(ty16A.f, ty16B.f, HH_MM_SHUFFLE(y,y,w,w)); // FIXME: stay on the integer pipeline
__m128i ty16 = ty16A.i;
// Widen the 8-bit texel channels to 16-bit (zero-extended).
__m128i t0t1_A = _mm_unpacklo_epi8(t0t1t2t3_A, _zero); // a c
__m128i t0t1_B = _mm_unpacklo_epi8(t0t1t2t3_B, _zero); // a c
__m128i t2t3_A = _mm_unpackhi_epi8(t0t1t2t3_A, _zero); // b d
__m128i t2t3_B = _mm_unpackhi_epi8(t0t1t2t3_B, _zero); // b d
// Horizontal lerp: result = t0 + ((t2 - t0) * tx) >> 14.
// delta << 2 followed by mulhi (product >> 16) realizes the >> 14 without overflow,
// since |delta| <= 255 fits a signed 16-bit lane even after the shift.
__m128i delta_A = _mm_sub_epi16(t2t3_A, t0t1_A); // { b-a, d-c }
__m128i delta_B = _mm_sub_epi16(t2t3_B, t0t1_B); // { b-a, d-c }
delta_A = _mm_slli_epi16(delta_A, 2);
delta_B = _mm_slli_epi16(delta_B, 2);
__m128i t2t3xT_A = _mm_mulhi_epi16(delta_A, tx16_A);
__m128i t2t3xT_B = _mm_mulhi_epi16(delta_B, tx16_B);
__m128fi combined1_A;
__m128fi combined1_B;
combined1_A.i = _mm_add_epi16(t0t1_A, t2t3xT_A);
combined1_B.i = _mm_add_epi16(t0t1_B, t2t3xT_B);
/*
__m128i combinedA_A = _mm_unpacklo_epi16(combined1_A, _zero); // [aa00bb00cc00dd00]
__m128i combinedB_A = _mm_unpackhi_epi16(combined1_A, _zero); // [ee00ff00gg00hh00]
__m128i combinedA_B = _mm_unpacklo_epi16(combined1_B, _zero);
__m128i combinedB_B = _mm_unpackhi_epi16(combined1_B, _zero);
*/
// Transpose so the two samples' top rows share one register and their bottom
// rows share another; the vertical lerp then handles both samples at once.
__m128fi combinedTransposedA;
__m128fi combinedTransposedB;
combinedTransposedA.f = _mm_shuffle_ps(combined1_A.f, combined1_B.f, HH_MM_SHUFFLE(x,y,x,y)); // [aa][bb][cc][dd] [ii][jj][kk][ll]
combinedTransposedB.f = _mm_shuffle_ps(combined1_A.f, combined1_B.f, HH_MM_SHUFFLE(z,w,z,w)); // [ee][ff][gg][hh] [mm][nn][oo][pp]
// Vertical lerp with ty, same <<2 + mulhi fixed-point trick as above.
__m128i delta2 = _mm_sub_epi16(combinedTransposedB.i, combinedTransposedA.i); // { b-a } - { d-c }
delta2 = _mm_slli_epi16(delta2, 2);
__m128i combinedxT = _mm_mulhi_epi16(delta2, ty16);
__m128i finalCombined16 = _mm_add_epi16(combinedTransposedA.i, combinedxT);
// Widen the 8 filtered 16-bit channels to 32-bit for float conversion.
__m128i finalCombined32_A = _mm_unpacklo_epi16(finalCombined16, _zero);
__m128i finalCombined32_B = _mm_unpackhi_epi16(finalCombined16, _zero);
/*
__m128i delta2_A = _mm_sub_epi32(combinedB_A, combinedA_A); // { b-a } - { d-c }
__m128i delta2_B = _mm_sub_epi32(combinedB_B, combinedA_B); // { b-a } - { d-c }
combinedA_A = _mm_slli_epi32(combinedA_A, 14);
combinedA_B = _mm_slli_epi32(combinedA_B, 14);
__m128i combinedxT_A = _mm_mullo_epi32(delta2_A, ty16_A); // [rrrr][0000][gggg][0000][aaaa][0000][aaaa][0000]
__m128i combinedxT_B = _mm_mullo_epi32(delta2_B, ty16_B); // [rrrr][0000][gggg][0000][aaaa][0000][aaaa][0000]
*/
/*
__m128i delta2_A = _mm_sub_epi16(combinedB_A, combinedA_A); // { b-a } - { d-c }
__m128i delta2_B = _mm_sub_epi16(combinedB_B, combinedA_B); // { b-a } - { d-c }
delta2_A = _mm_slli_epi16(delta2_A, 2);
delta2_B = _mm_slli_epi16(delta2_B, 2);
__m128i combinedxT_A = _mm_mulhi_epi16(delta2_A, ty16_A); // [rrrr][0000][gggg][0000][aaaa][0000][aaaa][0000]
__m128i combinedxT_B = _mm_mulhi_epi16(delta2_B, ty16_B); // [rrrr][0000][gggg][0000][aaaa][0000][aaaa][0000]
__m128i finalCombined32_A = _mm_add_epi16(combinedA_A, combinedxT_A);
__m128i finalCombined32_B = _mm_add_epi16(combinedA_B, combinedxT_B);
*/
// Convert to float and normalize the 0..255 byte range to 0..1.
// (The /*_16384*/ remnants suggest an earlier variant kept the 14-bit scale
// through to this point — TODO confirm _Inv255 is the intended factor.)
SIMD::Float4 fpCombined_A = _mm_cvtepi32_ps(finalCombined32_A);
SIMD::Float4 fpCombined_B = _mm_cvtepi32_ps(finalCombined32_B);
SIMD::Float4 texelRGBA_A = fpCombined_A * _Inv255/*_16384*/;
SIMD::Float4 texelRGBA_B = fpCombined_B * _Inv255/*_16384*/;
// Store the two filtered RGBA samples (8 floats) and advance.
texelRGBA_A.StoreAligned16(dstSamples + 0);
texelRGBA_B.StoreAligned16(dstSamples + 4);
dstSamples += 8;
}
#endif