static CRect _FeedMinMaxUV(const TMemoryView<const CFloat2> &texcoords) { HH_ASSERT(!texcoords.Empty()); // SIMD loop below 22 times faster than naive C++ loop with HHMin/HHMax on // the CFloat2 values that was generating branches + cmovs. const CFloat2 *srcUV = texcoords.Data(); const CFloat2 *srcUVStop = texcoords.DataEnd(); SIMD::Scalar minU = srcUV->x(); SIMD::Scalar minV = srcUV->y(); SIMD::Scalar maxU = minU; SIMD::Scalar maxV = minV; if (Mem::IsAligned<0x10>(texcoords.Data())) { SIMD::Float4 minUVx4_01 = SIMD::Float4(minU, minV, minU, minV); SIMD::Float4 maxUVx4_01 = minUVx4_01; SIMD::Float4 minUVx4_23 = minUVx4_01; SIMD::Float4 maxUVx4_23 = maxUVx4_01; srcUVStop -= 4; while (srcUV <= srcUVStop) { const SIMD::Float4 uv0uv1 = SIMD::Float4::LoadAligned16(srcUV, 0x00); const SIMD::Float4 uv2uv3 = SIMD::Float4::LoadAligned16(srcUV, 0x10); minUVx4_01 = SIMD::Min(minUVx4_01, uv0uv1); maxUVx4_01 = SIMD::Max(maxUVx4_01, uv0uv1); minUVx4_23 = SIMD::Min(minUVx4_23, uv2uv3); maxUVx4_23 = SIMD::Max(maxUVx4_23, uv2uv3); srcUV += 4; } srcUVStop += 4; const SIMD::Float4 minUVx4 = SIMD::Min(minUVx4_01, minUVx4_23); const SIMD::Float4 maxUVx4 = SIMD::Max(maxUVx4_01, maxUVx4_23); const SIMD::Float4 hMin = SIMD::Min(minUVx4, minUVx4.Swizzle<2,3,2,3>()); // movehl const SIMD::Float4 hMax = SIMD::Max(maxUVx4, maxUVx4.Swizzle<2,3,2,3>()); // movehl minV = hMin.y(); minU = hMin.x(); maxV = hMax.y(); maxU = hMax.x(); } while (srcUV < srcUVStop) { const SIMD::Scalar u = SIMD::Scalar::Load(srcUV, 0); const SIMD::Scalar v = SIMD::Scalar::Load(srcUV, 4); minU = SIMD::Min(minU, u); minV = SIMD::Min(minV, v); maxU = SIMD::Max(maxU, u); maxV = SIMD::Max(maxV, v); srcUV += 1; } return CRect::FromMinMaxUnchecked( CFloat2(minU.AsFloat(), minV.AsFloat()), CFloat2(maxU.AsFloat(), maxV.AsFloat())); } //---------------------------------------------------------------------------- static bool _TriangleOverlapsBox(const CFloat2 &pa, const CFloat2 &pb, const CFloat2 &pc, const CFloat4 (&cellBoxCenterAndHalfWidth)[2]) { // The cell box center and half widths we're given should be broadcasted to a float4 in 'xyxy' form const SIMD::Float4 vbc = SIMD::Float4::LoadAligned16(cellBoxCenterAndHalfWidth, 0x00); const SIMD::Float4 vbw = SIMD::Float4::LoadAligned16(cellBoxCenterAndHalfWidth, 0x10); const SIMD::Float4 kSwapZWSign = SIMD::Float4::FromConstInt<0,0,0x80000000,0x80000000>(); const SIMD::Float4 vla = SIMD::Float4::LoadUnaligned(&pa).Swizzle<0,1,0,1>() - vbc; // NOTE: could also do a 'SIMD::Float4::LoadUnaligned_LoHi(&pa, &pa) - vbc' const SIMD::Float4 vlb = SIMD::Float4::LoadUnaligned(&pb).Swizzle<0,1,0,1>() - vbc; const SIMD::Float4 vlc = SIMD::Float4::LoadUnaligned(&pc).Swizzle<0,1,0,1>() - vbc; // Early-out test: triangle bbox vs cell bbox overlap test: if boxes don't overlap, reject const SIMD::Float4 vlaSwap = vla ^ kSwapZWSign; const SIMD::Float4 vlbSwap = vlb ^ kSwapZWSign; const SIMD::Float4 vlcSwap = vlc ^ kSwapZWSign; const SIMD::Float4 vMinMax = SIMD::Min(vlaSwap, vlbSwap, vlcSwap); const hh_u32 isOut = vMinMax.MaskGreater_Imm4(vbw); if (isOut != 0) return false; // SAT test for the 3 triangle edges // (note: code below would work for quads with almost no changes, we're doing the SAT // test in SOA, using the xyz lanes of each vector for the 3 edges. a 4th edge would use w) const SIMD::Float4 axcxaycy = SIMD::InterleaveLo(vla, vlc); const SIMD::Float4 bxbxbyby = vlb.Swizzle<0,0,1,1>(); const SIMD::Float4 vlabc_x = SIMD::InterleaveLo(axcxaycy, bxbxbyby); // axbxcxbx const SIMD::Float4 vlabc_y = SIMD::InterleaveHi(axcxaycy, bxbxbyby); // aybycyby const SIMD::Float4 vlbca_x = vlabc_x.Swizzle<1,2,0,2>(); const SIMD::Float4 vlbca_y = vlabc_y.Swizzle<1,2,0,2>(); const SIMD::Float4 vlcab_x = vlabc_x.Swizzle<2,0,1,0>(); const SIMD::Float4 vlcab_y = vlabc_y.Swizzle<2,0,1,0>(); const SIMD::Float4 e012x = vlbca_x - vlabc_x; const SIMD::Float4 e012y = vlbca_y - vlabc_y; const SIMD::Float4 pr012_0 = vlabc_x.Nmsub(e012y, vlabc_y * e012x); const SIMD::Float4 pr012_1 = vlcab_x.Nmsub(e012y, vlcab_y * e012x); const SIMD::Float4 pr012Min = SIMD::Min(pr012_0, pr012_1); const SIMD::Float4 pr012Max = SIMD::Max(pr012_0, pr012_1); const SIMD::Float4 e012xAbs = SIMD::Abs(e012x); const SIMD::Float4 e012yAbs = SIMD::Abs(e012y); const SIMD::Float4 b012Radius = vbw.xxxx().Madd(e012yAbs, vbw.yyyy() * e012xAbs); const SIMD::Float4 pr012Test = SIMD::Max(pr012Min, -pr012Max); return pr012Test.MaskGreater_Imm4(b012Radius) == 0; } //---------------------------------------------------------------------------- template <typename _Type> static hh_u32 _ClassifyTriangles(const CRect &bounds, const _Type *indexStream, const TMemoryView<hh_u32> &triangleIDs, const TMemoryView<const CFloat2> &texcoords, hh_u32 tIndexMask) { const CFloat2 boxCenter = bounds.Center(); const CFloat2 boxHalfWidth = 0.5f * (bounds.Max() - bounds.Min()); HH_ALIGN(0x10) const CFloat4 boxCenterAndHalfWidth[2] = { CFloat4(boxCenter, boxCenter), CFloat4(boxHalfWidth, boxHalfWidth), }; hh_u32 curChildCount = 0; for (hh_u32 tidx = 0; tidx < triangleIDs.Count(); tidx++) { const hh_u32 tIndex = triangleIDs[tidx]; const hh_u32 vIndex = tIndex + ((tIndex + tIndex) & tIndexMask); const CFloat2 &a = texcoords[indexStream[vIndex + 0]]; const CFloat2 &b = texcoords[indexStream[vIndex + 1]]; const CFloat2 &c = texcoords[indexStream[vIndex + 2]]; if (_TriangleOverlapsBox(a, b, c, boxCenterAndHalfWidth)) { HHSwap(triangleIDs[curChildCount], triangleIDs[tidx]); curChildCount++; } } return curChildCount; }