// Code snippet posted by "bearbecue" on 03/07/2011 at 21:41 (forum extraction;
// surrounding page artifacts removed).
#if	(HH_SIMD == HH_SIMD_SSE) && (HH_SIMD_VERSION >= HH_SSE2) && defined(USE_EXPANDED_TEXTURE_FOR_BILERP)

			// SSE2 bilinear-filtering inner loop over an "expanded" texture:
			// the texture has been pre-expanded so that the 4 texels needed to
			// bilerp one sample lie contiguously (m_PixelsX4, 4 * BGRA8 bytes per
			// cell), letting a single aligned 16-byte load fetch all 4 texels.
			// Weights are handled in 1.14 fixed point (scale by k16384 = 2^14);
			// each lerp is computed as a + ((b-a) * t) via
			// _mm_slli_epi16(delta, 2) + _mm_mulhi_epi16, since
			// ((delta << 2) * t) >> 16 == (delta * t) >> 14.
			// NOTE(review): the commented-out sections below appear to be earlier
			// alternative formulations kept for reference — left untouched.

			// process 2 sample (8 texels) per iteration
			float	*dstStopX2 = dstStop - 2*4;
			while (dstSamples <= dstStopX2)
			{
				// Load two (u,v) texcoord pairs at once: lanes [uA vA uB vB].
				SIMD::Float4	xy_AB = SIMD::Float4::LoadUnaligned(srcTexcoords + 0);
				srcTexcoords += 4;
				// Fractional part of the texel-space coordinates = bilerp weights.
				SIMD::Float4	cursor_AB = SIMD::Frac(xy_AB * txDims);

				// Scale to texel space (txDimsV presumably differs from txDims by a
				// packing/stride factor — declared outside this view, TODO confirm).
				xy_AB *= txDimsV;

				// Truncate to integer texel coords, then wrap (power-of-two dims
				// assumed by the AND-mask wrap — wrapMask declared elsewhere).
				__m128i	xy_i_AB = _mm_cvttps_epi32(xy_AB._xmm());
				__m128i	xy_iw_AB = _mm_and_si128(xy_i_AB, wrapMask._xmm_epi32());

				// Pack each sample's (X,Y) 16-bit halves into one 32-bit lane so a
				// single scalar extraction yields the combined cell index.
				__m128i	packed_A = _mm_shufflelo_epi16(xy_iw_AB, HH_MM_SHUFFLE(x, z, y, w));	// [...][0000][0Yyy][0000][Xxxx] -> [...][0000][0000][0Yyy][Xxxx]
				__m128i	packed_B = _mm_shufflehi_epi16(xy_iw_AB, HH_MM_SHUFFLE(x, z, y, w));	// [0000][0Yyy][0000][Xxxx][...] -> [0000][0000][0Yyy][Xxxx][...]
				packed_B = _mm_shuffle_epi32(packed_B, HH_MM_SHUFFLE(z, w, x, y));

				// Collapse packed (Y:X) into a linear cell index; yStrideShift
				// presumably folds Y into the row offset — declared outside view.
				hh_u32	cell_A = _mm_cvtsi128_si32(packed_A) >> yStrideShift;
				hh_u32	cell_B = _mm_cvtsi128_si32(packed_B) >> yStrideShift;

				// Each cell holds the 4 pre-gathered texels (4 * BGRA8).
				hh_u8	*texel_A = m_PixelsX4 + cell_A * (4 * kBGRA8PixelSizeInBytes);
				hh_u8	*texel_B = m_PixelsX4 + cell_B * (4 * kBGRA8PixelSizeInBytes);

				// Convert fractional weights to 1.14 fixed point (k16384 == 2^14).
				cursor_AB *= k16384;

				// One aligned load per sample fetches its 4 texels t0..t3.
				__m128i	t0t1t2t3_A = _mm_load_si128(reinterpret_cast<const __m128i*>(texel_A));
				__m128i	t0t1t2t3_B = _mm_load_si128(reinterpret_cast<const __m128i*>(texel_B));

				// Integer 1.14 weights: lanes [txA tyA txB tyB] (as 32-bit).
				__m128i	t32_AB = _mm_cvttps_epi32(cursor_AB._xmm());

				// Replicate each weight's low 16 bits across adjacent 16-bit lanes.
				__m128i	t16_A = _mm_shufflelo_epi16(t32_AB, HH_MM_SHUFFLE(x,x,z,z));	// xxyy____
				__m128i	t16_B = _mm_shufflehi_epi16(t32_AB, HH_MM_SHUFFLE(x,x,z,z));	// ____xxyy

				// Broadcast the horizontal weight (tx) across all 8 x 16-bit lanes.
				__m128i	tx16_A = _mm_shuffle_epi32(t16_A, HH_MM_SHUFFLE(x,x,x,x));
				__m128i	tx16_B = _mm_shuffle_epi32(t16_B, HH_MM_SHUFFLE(z,z,z,z));

				const __m128i	_zero = _mm_setzero_si128();

				// Build the vertical weight (ty) vector for both samples:
				// [tyA tyA tyB tyB] (32-bit lanes). The int<->float union round-trip
				// exists only to reach _mm_shuffle_ps (cross-operand shuffle), hence
				// the FIXME about staying on the integer pipeline.
				__m128fi	ty16A;
				__m128fi	ty16B;
				ty16A.i = t16_A;
				ty16B.i = t16_B;

				ty16A.f = _mm_shuffle_ps(ty16A.f, ty16B.f, HH_MM_SHUFFLE(y,y,w,w));	// FIXME: stay on the integer pipeline
				__m128i	ty16 = ty16A.i;

				// Widen BGRA8 texels to 16-bit lanes: low half = texels 0,1 ("a","c"),
				// high half = texels 2,3 ("b","d").
				__m128i	t0t1_A = _mm_unpacklo_epi8(t0t1t2t3_A, _zero);	// a c
				__m128i	t0t1_B = _mm_unpacklo_epi8(t0t1t2t3_B, _zero);	// a c
				__m128i	t2t3_A = _mm_unpackhi_epi8(t0t1t2t3_A, _zero);	// b d
				__m128i	t2t3_B = _mm_unpackhi_epi8(t0t1t2t3_B, _zero);	// b d

				// Horizontal lerp, step 1: per-channel deltas.
				__m128i	delta_A = _mm_sub_epi16(t2t3_A, t0t1_A);	// { b-a, d-c }
				__m128i	delta_B = _mm_sub_epi16(t2t3_B, t0t1_B);	// { b-a, d-c }

				// Pre-shift by 2 so mulhi's >>16 nets a >>14, matching the 1.14 weight.
				delta_A = _mm_slli_epi16(delta_A, 2);
				delta_B = _mm_slli_epi16(delta_B, 2);

				// (delta * tx) >> 14, then add the base texels: two horizontal lerps
				// per sample ({a+(b-a)tx, c+(d-c)tx}).
				__m128i t2t3xT_A = _mm_mulhi_epi16(delta_A, tx16_A);
				__m128i t2t3xT_B = _mm_mulhi_epi16(delta_B, tx16_B);
				__m128fi	combined1_A;
				__m128fi	combined1_B;
				combined1_A.i = _mm_add_epi16(t0t1_A, t2t3xT_A);
				combined1_B.i = _mm_add_epi16(t0t1_B, t2t3xT_B);

				/*
				__m128i	combinedA_A = _mm_unpacklo_epi16(combined1_A, _zero);	// [aa00bb00cc00dd00]
				__m128i	combinedB_A = _mm_unpackhi_epi16(combined1_A, _zero);	// [ee00ff00gg00hh00]

				__m128i	combinedA_B = _mm_unpacklo_epi16(combined1_B, _zero);
				__m128i	combinedB_B = _mm_unpackhi_epi16(combined1_B, _zero);
				*/

				// Transpose so sample A's and B's top rows share one register and
				// their bottom rows share another — lets the vertical lerp be done
				// for both samples with a single delta/mulhi/add sequence.
				__m128fi	combinedTransposedA;
				__m128fi	combinedTransposedB;
				combinedTransposedA.f = _mm_shuffle_ps(combined1_A.f, combined1_B.f, HH_MM_SHUFFLE(x,y,x,y));	// [aa][bb][cc][dd] [ii][jj][kk][ll]
				combinedTransposedB.f = _mm_shuffle_ps(combined1_A.f, combined1_B.f, HH_MM_SHUFFLE(z,w,z,w));	// [ee][ff][gg][hh] [mm][nn][oo][pp]

				// Vertical lerp: same slli(2)+mulhi trick, weighted by ty.
				__m128i	delta2 = _mm_sub_epi16(combinedTransposedB.i, combinedTransposedA.i);	// { b-a } - { d-c }
				delta2 = _mm_slli_epi16(delta2, 2);

				__m128i combinedxT = _mm_mulhi_epi16(delta2, ty16);

				// Final bilerped channels (16-bit), widened to 32-bit per sample.
				__m128i	finalCombined16 = _mm_add_epi16(combinedTransposedA.i, combinedxT);
				__m128i	finalCombined32_A = _mm_unpacklo_epi16(finalCombined16, _zero);
				__m128i	finalCombined32_B = _mm_unpackhi_epi16(finalCombined16, _zero);

/*
				__m128i	delta2_A = _mm_sub_epi32(combinedB_A, combinedA_A);	// { b-a } - { d-c }
				__m128i	delta2_B = _mm_sub_epi32(combinedB_B, combinedA_B);	// { b-a } - { d-c }
				combinedA_A = _mm_slli_epi32(combinedA_A, 14);
				combinedA_B = _mm_slli_epi32(combinedA_B, 14);
				__m128i combinedxT_A = _mm_mullo_epi32(delta2_A, ty16_A);			// [rrrr][0000][gggg][0000][aaaa][0000][aaaa][0000]
				__m128i combinedxT_B = _mm_mullo_epi32(delta2_B, ty16_B);			// [rrrr][0000][gggg][0000][aaaa][0000][aaaa][0000]
*/
				/*
				__m128i	delta2_A = _mm_sub_epi16(combinedB_A, combinedA_A);	// { b-a } - { d-c }
				__m128i	delta2_B = _mm_sub_epi16(combinedB_B, combinedA_B);	// { b-a } - { d-c }
				delta2_A = _mm_slli_epi16(delta2_A, 2);
				delta2_B = _mm_slli_epi16(delta2_B, 2);
				__m128i combinedxT_A = _mm_mulhi_epi16(delta2_A, ty16_A);			// [rrrr][0000][gggg][0000][aaaa][0000][aaaa][0000]
				__m128i combinedxT_B = _mm_mulhi_epi16(delta2_B, ty16_B);			// [rrrr][0000][gggg][0000][aaaa][0000][aaaa][0000]

				__m128i	finalCombined32_A = _mm_add_epi16(combinedA_A, combinedxT_A);
				__m128i	finalCombined32_B = _mm_add_epi16(combinedA_B, combinedxT_B);
				*/

				// Convert to float and normalize 8-bit channel range to [0,1].
				// NOTE(review): the /*_16384*/ remnants suggest the normalization
				// constant changed when the math moved to 8-bit bases — verify
				// _Inv255 (== 1/255) is the intended scale for this code path.
				SIMD::Float4	fpCombined_A = _mm_cvtepi32_ps(finalCombined32_A);
				SIMD::Float4	fpCombined_B = _mm_cvtepi32_ps(finalCombined32_B);
				SIMD::Float4	texelRGBA_A = fpCombined_A * _Inv255/*_16384*/;
				SIMD::Float4	texelRGBA_B = fpCombined_B * _Inv255/*_16384*/;

				// Store the two filtered RGBA samples (16-byte aligned destination).
				texelRGBA_A.StoreAligned16(dstSamples + 0);
				texelRGBA_B.StoreAligned16(dstSamples + 4);
				dstSamples += 8;
			}
#endif