// Computes the axis-aligned bounding rectangle of a list of UV coordinates.
//
// 'texcoords' must not be empty (asserted): the running min/max are seeded
// from its first element.
// Returns the rectangle built by 'CRect::FromMinMaxUnchecked' from the
// per-component minimum and maximum of all the UVs.
static CRect	_FeedMinMaxUV(const TMemoryView<const CFloat2> &texcoords)
{
	HH_ASSERT(!texcoords.Empty());

	// SIMD loop below 22 times faster than naive C++ loop with HHMin/HHMax on
	// the CFloat2 values that was generating branches + cmovs.
	const CFloat2	*srcUV = texcoords.Data();
	const CFloat2	*srcUVStop = texcoords.DataEnd();

	SIMD::Scalar	minU = srcUV->x();
	SIMD::Scalar	minV = srcUV->y();
	SIMD::Scalar	maxU = minU;
	SIMD::Scalar	maxV = minV;

	// The wide path uses 16-byte aligned loads, so it is only taken when the
	// buffer starts on a 16-byte boundary. Otherwise everything goes through
	// the scalar loop at the bottom.
	if (Mem::IsAligned<0x10>(srcUV))
	{
		// End of the range that can be consumed 4 UVs at a time: the element
		// count rounded down to a multiple of 4. This is computed forward from
		// 'srcUV' rather than as 'srcUVStop - 4', which would form a pointer
		// located before the start of the buffer when there are fewer than
		// 4 elements (undefined pointer arithmetic).
		const CFloat2	*srcUVStopx4 = srcUV + ((srcUVStop - srcUV) & ~3);

		// Two independent accumulator pairs, each holding 'u,v,u,v' lanes.
		SIMD::Float4	minUVx4_01 = SIMD::Float4(minU, minV, minU, minV);
		SIMD::Float4	maxUVx4_01 = minUVx4_01;
		SIMD::Float4	minUVx4_23 = minUVx4_01;
		SIMD::Float4	maxUVx4_23 = maxUVx4_01;

		while (srcUV < srcUVStopx4)
		{
			// Each 16-byte load fetches two consecutive CFloat2.
			const SIMD::Float4	uv0uv1 = SIMD::Float4::LoadAligned16(srcUV, 0x00);
			const SIMD::Float4	uv2uv3 = SIMD::Float4::LoadAligned16(srcUV, 0x10);
			minUVx4_01 = SIMD::Min(minUVx4_01, uv0uv1);
			maxUVx4_01 = SIMD::Max(maxUVx4_01, uv0uv1);
			minUVx4_23 = SIMD::Min(minUVx4_23, uv2uv3);
			maxUVx4_23 = SIMD::Max(maxUVx4_23, uv2uv3);
			srcUV += 4;
		}

		// Merge the two accumulator pairs, then fold the high 'u,v' pair onto
		// the low one so that lanes x/y hold the final u/v results.
		const SIMD::Float4	minUVx4 = SIMD::Min(minUVx4_01, minUVx4_23);
		const SIMD::Float4	maxUVx4 = SIMD::Max(maxUVx4_01, maxUVx4_23);
		const SIMD::Float4	hMin = SIMD::Min(minUVx4, minUVx4.Swizzle<2,3,2,3>());	// movehl
		const SIMD::Float4	hMax = SIMD::Max(maxUVx4, maxUVx4.Swizzle<2,3,2,3>());	// movehl
		minV = hMin.y();
		minU = hMin.x();
		maxV = hMax.y();
		maxU = hMax.x();
	}

	// Scalar loop: handles the 0..3 leftover UVs of the wide path, or the
	// whole buffer when it was not 16-byte aligned.
	while (srcUV < srcUVStop)
	{
		const SIMD::Scalar	u = SIMD::Scalar::Load(srcUV, 0);
		const SIMD::Scalar	v = SIMD::Scalar::Load(srcUV, 4);
		minU = SIMD::Min(minU, u);
		minV = SIMD::Min(minV, v);
		maxU = SIMD::Max(maxU, u);
		maxV = SIMD::Max(maxV, v);
		srcUV += 1;
	}

	return CRect::FromMinMaxUnchecked(	CFloat2(minU.AsFloat(), minV.AsFloat()),
										CFloat2(maxU.AsFloat(), maxV.AsFloat()));
}
//----------------------------------------------------------------------------
// Tests whether the 2D triangle (pa, pb, pc) overlaps an axis-aligned box, using a
// separating-axis test: first the two box axes (bbox vs bbox early-out), then the
// normals of the 3 triangle edges.
//
// 'cellBoxCenterAndHalfWidth[0]' is the box center and 'cellBoxCenterAndHalfWidth[1]'
// the box half-widths. Both are read with 16-byte aligned loads, so the array must be
// 16-byte aligned.
// Returns false if a separating axis was found, true otherwise.
static bool _TriangleOverlapsBox(const CFloat2 &pa, const CFloat2 &pb, const CFloat2 &pc, const CFloat4 (&cellBoxCenterAndHalfWidth)[2])
{
// The cell box center and half widths we're given should be broadcasted to a float4 in 'xyxy' form
const SIMD::Float4 vbc = SIMD::Float4::LoadAligned16(cellBoxCenterAndHalfWidth, 0x00);
const SIMD::Float4 vbw = SIMD::Float4::LoadAligned16(cellBoxCenterAndHalfWidth, 0x10);
// Sign-bit mask on the z and w lanes only: XOR-ing with it negates z and w.
const SIMD::Float4 kSwapZWSign = SIMD::Float4::FromConstInt<0,0,0x80000000,0x80000000>();
// Triangle vertices expressed relative to the box center, each duplicated as 'xyxy'.
// NOTE(review): 'LoadUnaligned' presumably reads a full 4-float vector starting at a
// 2-float point, i.e. 8 bytes past the point itself (only the low 'xy' pair survives
// the swizzle) -- confirm callers guarantee those extra bytes are readable.
const SIMD::Float4 vla = SIMD::Float4::LoadUnaligned(&pa).Swizzle<0,1,0,1>() - vbc; // NOTE: could also do a 'SIMD::Float4::LoadUnaligned_LoHi(&pa, &pa) - vbc'
const SIMD::Float4 vlb = SIMD::Float4::LoadUnaligned(&pb).Swizzle<0,1,0,1>() - vbc;
const SIMD::Float4 vlc = SIMD::Float4::LoadUnaligned(&pc).Swizzle<0,1,0,1>() - vbc;
// Early-out test: triangle bbox vs cell bbox overlap test: if boxes don't overlap, reject
// Each vertex becomes (x, y, -x, -y), so a single 3-way Min yields
// (minX, minY, -maxX, -maxY) for the triangle.
const SIMD::Float4 vlaSwap = vla ^ kSwapZWSign;
const SIMD::Float4 vlbSwap = vlb ^ kSwapZWSign;
const SIMD::Float4 vlcSwap = vlc ^ kSwapZWSign;
const SIMD::Float4 vMinMax = SIMD::Min(vlaSwap, vlbSwap, vlcSwap);
// Compared against the 'xyxy' half-widths: the triangle is out when minX > hw.x,
// minY > hw.y, maxX < -hw.x or maxY < -hw.y.
// (presumably 'MaskGreater_Imm4' returns a mask with one bit per lane where the
// comparison holds -- confirm against the SIMD wrapper)
const hh_u32 isOut = vMinMax.MaskGreater_Imm4(vbw);
if (isOut != 0)
return false;
// SAT test for the 3 triangle edges
// (note: code below would work for quads with almost no changes, we're doing the SAT
// test in SOA, using the xyz lanes of each vector for the 3 edges. a 4th edge would use w)
// Transpose the three vertices into one vector of x coordinates and one of y coordinates.
const SIMD::Float4 axcxaycy = SIMD::InterleaveLo(vla, vlc);
const SIMD::Float4 bxbxbyby = vlb.Swizzle<0,0,1,1>();
const SIMD::Float4 vlabc_x = SIMD::InterleaveLo(axcxaycy, bxbxbyby); // axbxcxbx
const SIMD::Float4 vlabc_y = SIMD::InterleaveHi(axcxaycy, bxbxbyby); // aybycyby
// Rotated copies of the vertex list: 'bca' holds each edge's end vertex, 'cab' holds
// the vertex opposite to each edge. With these swizzles the w lane ends up being a
// duplicate of the y lane (edge b->c), so it cannot change the outcome of the test.
const SIMD::Float4 vlbca_x = vlabc_x.Swizzle<1,2,0,2>();
const SIMD::Float4 vlbca_y = vlabc_y.Swizzle<1,2,0,2>();
const SIMD::Float4 vlcab_x = vlabc_x.Swizzle<2,0,1,0>();
const SIMD::Float4 vlcab_y = vlabc_y.Swizzle<2,0,1,0>();
// Edge vectors: (b - a, c - b, a - c) in the xyz lanes.
const SIMD::Float4 e012x = vlbca_x - vlabc_x;
const SIMD::Float4 e012y = vlbca_y - vlabc_y;
// Projections onto each edge's normal of the edge's start vertex (pr012_0) and of the
// opposite vertex (pr012_1): 'p.y * e.x - p.x * e.y'. The edge's end vertex projects to
// the same value as its start vertex, so two projections per edge are enough.
// (assumes 'X.Nmsub(a, b)' evaluates 'b - X * a' -- TODO confirm)
const SIMD::Float4 pr012_0 = vlabc_x.Nmsub(e012y, vlabc_y * e012x);
const SIMD::Float4 pr012_1 = vlcab_x.Nmsub(e012y, vlcab_y * e012x);
const SIMD::Float4 pr012Min = SIMD::Min(pr012_0, pr012_1);
const SIMD::Float4 pr012Max = SIMD::Max(pr012_0, pr012_1);
// Projected radius of the box (centered on the origin here) onto the same axes:
// 'hw.x * |e.y| + hw.y * |e.x|'.
// (assumes 'X.Madd(a, b)' evaluates 'X * a + b' -- TODO confirm)
const SIMD::Float4 e012xAbs = SIMD::Abs(e012x);
const SIMD::Float4 e012yAbs = SIMD::Abs(e012y);
const SIMD::Float4 b012Radius = vbw.xxxx().Madd(e012yAbs, vbw.yyyy() * e012xAbs);
// The triangle's projected interval [min, max] misses [-radius, radius] when
// 'min > radius' or '-max > radius'. Any lane where that holds is a separating axis.
const SIMD::Float4 pr012Test = SIMD::Max(pr012Min, -pr012Max);
return pr012Test.MaskGreater_Imm4(b012Radius) == 0;
}
//----------------------------------------------------------------------------
// Partitions 'triangleIDs' in place so that every triangle overlapping 'bounds'
// (in UV space) ends up at the front of the view.
//
// bounds:      box tested against each triangle.
// indexStream: vertex indices into 'texcoords', 3 consecutive entries per triangle.
// triangleIDs: IDs to classify. Reordered by this function through swaps.
// texcoords:   UV coordinates addressed through 'indexStream'.
// tIndexMask:  selects how an ID maps to its first entry in 'indexStream':
//              'id + ((2 * id) & tIndexMask)', which is 'id' when the mask is 0
//              and '3 * id' when the mask has all bits set.
//
// Returns the number of overlapping triangles: their IDs occupy
// 'triangleIDs[0 .. returned count - 1]'.
template <typename _Type>
static hh_u32	_ClassifyTriangles(const CRect &bounds, const _Type *indexStream, const TMemoryView<hh_u32> &triangleIDs, const TMemoryView<const CFloat2> &texcoords, hh_u32 tIndexMask)
{
	// Box center and half-widths, each broadcasted to 'xyxy' and 16-byte aligned,
	// which is the layout the overlap test loads directly.
	const CFloat2	cellCenter = bounds.Center();
	const CFloat2	cellHalfWidth = 0.5f * (bounds.Max() - bounds.Min());
	HH_ALIGN(0x10) const CFloat4	cellCenterAndHalfWidth[2] =
	{
		CFloat4(cellCenter, cellCenter),
		CFloat4(cellHalfWidth, cellHalfWidth),
	};

	hh_u32	overlappingCount = 0;
	for (hh_u32 i = 0; i < triangleIDs.Count(); i++)
	{
		const hh_u32	triangleId = triangleIDs[i];
		const hh_u32	firstVertex = triangleId + ((triangleId + triangleId) & tIndexMask);

		const CFloat2	&uvA = texcoords[indexStream[firstVertex + 0]];
		const CFloat2	&uvB = texcoords[indexStream[firstVertex + 1]];
		const CFloat2	&uvC = texcoords[indexStream[firstVertex + 2]];

		if (!_TriangleOverlapsBox(uvA, uvB, uvC, cellCenterAndHalfWidth))
			continue;

		// Move the overlapping triangle into the next free slot at the front.
		HHSwap(triangleIDs[overlappingCount], triangleIDs[i]);
		overlappingCount++;
	}
	return overlappingCount;
}