// Posted by bearbecue on 08/05/2017 at 05:09
// Computes the per-component minimum and maximum of a list of UV coordinates.
//
// 'texcoords' must not be empty (asserted): its first element is read unconditionally
// to seed the running min/max values.
// Returns the result of 'CRect::FromMinMaxUnchecked' built from the (minU, minV) and
// (maxU, maxV) corners.
static CRect	_FeedMinMaxUV(const TMemoryView<const CFloat2> &texcoords)
{
	HH_ASSERT(!texcoords.Empty());
	// SIMD loop below 22 times faster than naive C++ loop with HHMin/HHMax on
	// the CFloat2 values that was generating branches + cmovs.

	const CFloat2	*srcUV = texcoords.Data();
	const CFloat2	*srcUVStop = texcoords.DataEnd();

	// Seed the running bounds with the first element.
	SIMD::Scalar	minU = srcUV->x();
	SIMD::Scalar	minV = srcUV->y();
	SIMD::Scalar	maxU = minU;
	SIMD::Scalar	maxV = minV;

	// The wide path uses 16-byte aligned loads, so it is only taken when the source data is 16-byte aligned.
	// Otherwise everything goes through the scalar loop further down.
	if (Mem::IsAligned<0x10>(texcoords.Data()))
	{
		// Two independent pairs of accumulators, each holding two packed UVs in 'uvuv' form.
		SIMD::Float4	minUVx4_01 = SIMD::Float4(minU, minV, minU, minV);
		SIMD::Float4	maxUVx4_01 = minUVx4_01;
		SIMD::Float4	minUVx4_23 = minUVx4_01;
		SIMD::Float4	maxUVx4_23 = maxUVx4_01;

		// Process whole batches of 4 UVs (two 16-byte loads) per iteration.
		// Compare the remaining element count instead of stepping the end pointer back by 4:
		// that would form a pointer before the start of the array when fewer than 4 UVs are given.
		while (srcUVStop - srcUV >= 4)
		{
			const SIMD::Float4	uv0uv1 = SIMD::Float4::LoadAligned16(srcUV, 0x00);
			const SIMD::Float4	uv2uv3 = SIMD::Float4::LoadAligned16(srcUV, 0x10);
			minUVx4_01 = SIMD::Min(minUVx4_01, uv0uv1);
			maxUVx4_01 = SIMD::Max(maxUVx4_01, uv0uv1);
			minUVx4_23 = SIMD::Min(minUVx4_23, uv2uv3);
			maxUVx4_23 = SIMD::Max(maxUVx4_23, uv2uv3);
			srcUV += 4;
		}

		// Merge the two accumulator pairs, then fold the high 'uv' half onto the low one
		// so that lanes x and y hold the final U and V bounds.
		const SIMD::Float4	minUVx4 = SIMD::Min(minUVx4_01, minUVx4_23);
		const SIMD::Float4	maxUVx4 = SIMD::Max(maxUVx4_01, maxUVx4_23);
		const SIMD::Float4	hMin = SIMD::Min(minUVx4, minUVx4.Swizzle<2,3,2,3>());	// movehl
		const SIMD::Float4	hMax = SIMD::Max(maxUVx4, maxUVx4.Swizzle<2,3,2,3>());	// movehl
		minU = hMin.x();
		minV = hMin.y();
		maxU = hMax.x();
		maxV = hMax.y();
	}

	// Scalar loop: handles the 0..3 leftover elements of the wide path,
	// or the whole list when the source data is not 16-byte aligned.
	while (srcUV < srcUVStop)
	{
		const SIMD::Scalar	u = SIMD::Scalar::Load(srcUV, 0);
		const SIMD::Scalar	v = SIMD::Scalar::Load(srcUV, 4);
		minU = SIMD::Min(minU, u);
		minV = SIMD::Min(minV, v);
		maxU = SIMD::Max(maxU, u);
		maxV = SIMD::Max(maxV, v);
		srcUV += 1;
	}

	return CRect::FromMinMaxUnchecked(	CFloat2(minU.AsFloat(), minV.AsFloat()),
										CFloat2(maxU.AsFloat(), maxV.AsFloat()));
}

//----------------------------------------------------------------------------

// Tests whether the 2D triangle (pa, pb, pc) overlaps an axis-aligned box.
//
// 'cellBoxCenterAndHalfWidth[0]' is loaded as the box center and 'cellBoxCenterAndHalfWidth[1]'
// as the box half-widths, both expected in 'xyxy' form. They are read with 'LoadAligned16',
// so the array is presumably required to be 16-byte aligned.
// Returns false when either the bounding-box early-out or the per-edge separating-axis test
// reports a separation, true otherwise.
static bool	_TriangleOverlapsBox(const CFloat2 &pa, const CFloat2 &pb, const CFloat2 &pc, const CFloat4 (&cellBoxCenterAndHalfWidth)[2])
{
	// The cell box center and half widths we're given should be broadcasted to a float4 in 'xyxy' form
	const SIMD::Float4	vbc = SIMD::Float4::LoadAligned16(cellBoxCenterAndHalfWidth, 0x00);
	const SIMD::Float4	vbw = SIMD::Float4::LoadAligned16(cellBoxCenterAndHalfWidth, 0x10);

	// Sign-bit mask on the z and w lanes only: XOR-ing with it negates those two lanes.
	const SIMD::Float4	kSwapZWSign = SIMD::Float4::FromConstInt<0,0,0x80000000,0x80000000>();
	// Triangle vertices relative to the box center, each broadcasted in 'xyxy' form.
	// NOTE(review): 'LoadUnaligned' looks like a full 4-float load from a 2-float CFloat2, which would
	// read 8 bytes past each vertex (only lanes 0 and 1 are kept by the swizzle) — confirm this is safe
	// when the vertex is the last element of the caller's buffer.
	const SIMD::Float4	vla = SIMD::Float4::LoadUnaligned(&pa).Swizzle<0,1,0,1>() - vbc;	// NOTE: could also do a 'SIMD::Float4::LoadUnaligned_LoHi(&pa, &pa) - vbc'
	const SIMD::Float4	vlb = SIMD::Float4::LoadUnaligned(&pb).Swizzle<0,1,0,1>() - vbc;
	const SIMD::Float4	vlc = SIMD::Float4::LoadUnaligned(&pc).Swizzle<0,1,0,1>() - vbc;

	// Early-out test: triangle bbox vs cell bbox overlap test: if boxes don't overlap, reject
	// With the z/w lanes negated, the min across the three vertices yields (minX, minY, -maxX, -maxY),
	// so a single compare against the half-widths (hx, hy, hx, hy) covers all four sides of the box.
	const SIMD::Float4	vlaSwap = vla ^ kSwapZWSign;
	const SIMD::Float4	vlbSwap = vlb ^ kSwapZWSign;
	const SIMD::Float4	vlcSwap = vlc ^ kSwapZWSign;
	const SIMD::Float4	vMinMax = SIMD::Min(vlaSwap, vlbSwap, vlcSwap);
	// Presumably a per-lane bitmask of the lanes where 'vMinMax > vbw' — any set bit means the boxes are disjoint.
	const hh_u32		isOut = vMinMax.MaskGreater_Imm4(vbw);
	if (isOut != 0)
		return false;

	// SAT test for the 3 triangle edges
	// (note: code below would work for quads with almost no changes, we're doing the SAT
	// test in SOA, using the xyz lanes of each vector for the 3 edges. a 4th edge would use w)

	// Transpose the three local vertices into SOA form. The w lane repeats vertex 'b',
	// which makes the 4th lane of the edge vectors below a duplicate of the b->c edge.
	const SIMD::Float4	axcxaycy = SIMD::InterleaveLo(vla, vlc);
	const SIMD::Float4	bxbxbyby = vlb.Swizzle<0,0,1,1>();
	const SIMD::Float4	vlabc_x = SIMD::InterleaveLo(axcxaycy, bxbxbyby);	// axbxcxbx
	const SIMD::Float4	vlabc_y = SIMD::InterleaveHi(axcxaycy, bxbxbyby);	// aybycyby
	// Rotated copies: 'bca' is the end vertex of each edge, 'cab' is the vertex opposite to each edge.
	const SIMD::Float4	vlbca_x = vlabc_x.Swizzle<1,2,0,2>();
	const SIMD::Float4	vlbca_y = vlabc_y.Swizzle<1,2,0,2>();
	const SIMD::Float4	vlcab_x = vlabc_x.Swizzle<2,0,1,0>();
	const SIMD::Float4	vlcab_y = vlabc_y.Swizzle<2,0,1,0>();

	// Edge vectors: a->b, b->c, c->a (and b->c again in the w lane).
	const SIMD::Float4	e012x = vlbca_x - vlabc_x;
	const SIMD::Float4	e012y = vlbca_y - vlabc_y;

	// Projections of the edge start vertex and of the opposite vertex onto each edge's perpendicular axis.
	// Presumably 'x.Nmsub(y, z)' computes 'z - x * y' — confirm against the SIMD wrapper.
	const SIMD::Float4	pr012_0 = vlabc_x.Nmsub(e012y, vlabc_y * e012x);
	const SIMD::Float4	pr012_1 = vlcab_x.Nmsub(e012y, vlcab_y * e012x);

	// Projected extent of the triangle along each axis.
	const SIMD::Float4	pr012Min = SIMD::Min(pr012_0, pr012_1);
	const SIMD::Float4	pr012Max = SIMD::Max(pr012_0, pr012_1);

	// Projected radius of the box along each axis, built from the box half-widths and the absolute edge components.
	// Presumably 'x.Madd(y, z)' computes 'x * y + z' — confirm against the SIMD wrapper.
	const SIMD::Float4	e012xAbs = SIMD::Abs(e012x);
	const SIMD::Float4	e012yAbs = SIMD::Abs(e012y);
	const SIMD::Float4	b012Radius = vbw.xxxx().Madd(e012yAbs, vbw.yyyy() * e012xAbs);
	// Largest of 'min' and '-max': exceeds the box radius when the triangle interval lies entirely on one side of the box interval.
	const SIMD::Float4	pr012Test = SIMD::Max(pr012Min, -pr012Max);

	// Overlap only if no lane reports a separating axis.
	return pr012Test.MaskGreater_Imm4(b012Radius) == 0;
}

//----------------------------------------------------------------------------

// Partitions 'triangleIDs' in place: every triangle whose UV footprint overlaps 'bounds'
// (as reported by '_TriangleOverlapsBox') is swapped towards the front of the view.
//
// For each triangle ID 't', its three vertex indices are read from 'indexStream' starting at
// 't + ((2 * t) & tIndexMask)', and those indices are used to look up the UVs in 'texcoords'.
// Returns the number of overlapping triangles, which now occupy the first slots of 'triangleIDs'.
template <typename _Type>
static hh_u32	_ClassifyTriangles(const CRect &bounds, const _Type *indexStream, const TMemoryView<hh_u32> &triangleIDs, const TMemoryView<const CFloat2> &texcoords, hh_u32 tIndexMask)
{
	// Box center and half-widths, each duplicated into 'xyxy' form and 16-byte aligned for the overlap test.
	const CFloat2					cellCenter = bounds.Center();
	const CFloat2					cellHalfExtent = 0.5f * (bounds.Max() - bounds.Min());
	HH_ALIGN(0x10) const CFloat4	cellBox[2] = { CFloat4(cellCenter, cellCenter), CFloat4(cellHalfExtent, cellHalfExtent) };

	hh_u32	keptCount = 0;
	for (hh_u32 slot = 0; slot < triangleIDs.Count(); ++slot)
	{
		const hh_u32	triangleId = triangleIDs[slot];
		const hh_u32	firstVertex = triangleId + ((triangleId + triangleId) & tIndexMask);
		const CFloat2	&uvA = texcoords[indexStream[firstVertex + 0]];
		const CFloat2	&uvB = texcoords[indexStream[firstVertex + 1]];
		const CFloat2	&uvC = texcoords[indexStream[firstVertex + 2]];

		if (!_TriangleOverlapsBox(uvA, uvB, uvC, cellBox))
			continue;

		// Overlapping triangle: move it into the next free front slot.
		HHSwap(triangleIDs[keptCount], triangleIDs[slot]);
		++keptCount;
	}
	return keptCount;
}