// Four-component dot product of 'a' and 'b', broadcast to all four lanes of the result.
// 'zero' is the addend of the first multiply-add; the caller in this file passes an
// all-zero vector so that step is a plain lane-wise multiply.
HH_FORCEINLINE vec_float4 vec_dot4(vec_float4 a, vec_float4 b, vec_float4 zero)
{
	// Lane-wise products: lane i = a[i] * b[i] (+ zero).
	const vec_float4 products = vec_madd(a, b, zero);

	// Rotate both inputs left by one lane (4 bytes) and accumulate:
	// lane i = a[i] * b[i] + a[i + 1] * b[i + 1], indices taken modulo 4.
	const vec_float4 aRotated = vec_sld(a, a, 4);
	const vec_float4 bRotated = vec_sld(b, b, 4);
	const vec_float4 pairSums = vec_madd(aRotated, bRotated, products);

	// Rotating the pair sums by two lanes (8 bytes) and adding them back folds the
	// two halves together, leaving the complete sum in every lane.
	const vec_float4 pairSumsSwapped = vec_sld(pairSums, pairSums, 8);
	return vec_add(pairSumsSwapped, pairSums);
}
//----------------------------------------------------------------------------
void LowPrecision:

inStream(const float * __restrict srcStream, float * __restrict dstStream, hh_u32 count)
{
#if 0
while (count--)
{
*dstStream++ = sinf(*srcStream++);
}
#else
bool alignedSrc = Mem::IsAligned<0x10>(srcStream);
bool alignedDst = Mem::IsAligned<0x10>(dstStream);
bool allAligned = alignedSrc & alignedDst;
const vec_float4 zero = (vec_float4)(0.0f);
const vec_float4 sCoeffs = vec_ld(0, &_Constant_SinApprox);
const vec_float4 pCoeffs = vec_ld(0, _Constant_Pi);
const vec_float4 s1 = vec_splat(sCoeffs, 1);
const vec_float4 s2 = vec_splat(sCoeffs, 2);
const vec_float4 s3 = vec_splat(sCoeffs, 3);
const vec_float4 pi2 = vec_splat(pCoeffs, 2);
const vec_float4 ipi2 = vec_splat(pCoeffs, 3);
const float *srcStop = srcStream + count - 4;
if (HH_PREDICT_UNLIKELY(!allAligned))
{
while (srcStream < srcStop)
{
const vec_float4 v0 = vec_lvlx( 0, srcStream);
const vec_float4 v1 = vec_lvrx(16, srcStream);
const vec_float4 vs = vec_or(v0, v1);
const vec_float4 vn = vec_madd(vs, ipi2, zero);
const vec_int4 vni = vec_cts(vn, 0); // TODO: find the intrinsic mapping to the __vrfin instruction, and replace the two instructions below
const vec_float4 vnrounded = vec_vcfsx(vni, 0);
const vec_float4 v = vec_nmsub(pi2, vnrounded, vs);
const vec_float4 v2 = vec_madd(v, v, zero);
const vec_float4 v3 = vec_madd(v, v2, zero);
const vec_float4 v5 = vec_madd(v3, v2, zero);
const vec_float4 vc1 = vec_madd(v3, s1, v);
const vec_float4 v7 = vec_madd(v5, v2, zero);
const vec_float4 vc2 = vec_madd(v5, s2, vc1);
const vec_float4 vsin = vec_madd(v7, s3, vc2);
vec_stvlx(vsin, 0, dstStream);
vec_stvrx(vsin, 16, dstStream);
dstStream += 4;
srcStream += 4;
}
}
else
{
while (srcStream < srcStop)
{
const vec_float4 vs = vec_ld(0, srcStream);
const vec_float4 vn = vec_madd(vs, ipi2, zero);
const vec_int4 vni = vec_cts(vn, 0); // TODO: find the intrinsic mapping to the __vrfin instruction, and replace the two instructions below
const vec_float4 vnrounded = vec_vcfsx(vni, 0);
const vec_float4 v = vec_nmsub(pi2, vnrounded, vs);
const vec_float4 v2 = vec_madd(v, v, zero);
const vec_float4 v3 = vec_madd(v, v2, zero);
const vec_float4 v5 = vec_madd(v3, v2, zero);
const vec_float4 vc1 = vec_madd(v3, s1, v);
const vec_float4 v7 = vec_madd(v5, v2, zero);
const vec_float4 vc2 = vec_madd(v5, s2, vc1);
const vec_float4 vsin = vec_madd(v7, s3, vc2);
vec_st(vsin, 0, dstStream);
dstStream += 4;
srcStream += 4;
}
}
srcStop += 4;
float pif = vec_extract(pCoeffs, 0);
float pi2f = vec_extract(pCoeffs, 2);
float ipi2f = vec_extract(pCoeffs, 3);
while (srcStream < srcStop)
{
float vs = *srcStream++;
vs = vs + pif;
float fTemp = __fabsf(vs);
fTemp = fTemp - (pi2f * (float)__fcfid(__fctidz(fTemp * ipi2f)));
fTemp = fTemp - pif;
float v = __fsels(vs, fTemp, -fTemp);
float v2 = v * v;
vec_float4 p;
vec_insert(1.0f, p, 0);
vec_insert(v, p, 1);
vec_insert(v2, p, 2);
vec_insert(v2 * v, p, 3);
vec_float4 vvvv = vec_splat(p, 1);
p = vec_madd(p, p, zero);
p = vec_madd(p, vvvv, zero);
vec_float4 vsin = vec_dot4(p, sCoeffs, zero); // directly use __vmsum4fp on X360
vec_stvewx(vsin, 0, dstStream++);
}
#endif
}