快速 sin 和 cos 函式計算
阿新 • • 發佈:2019-01-25
// Computes an approximation of sin(x) and cos(x) in one pass using scalar
// SSE/SSE2 instructions, and writes the two results through the output pointers.
//
//   x : input angle. NOTE(review): presumably radians, given the scaling by
//       _ps_am_2_o_pi -- confirm against that constant's definition.
//   s : destination for the sine result (one float is written).
//   c : destination for the cosine result (one float is written).
//
// The constants _ps_am_*, _epi32_* and _ps_sincos_p0.._p3 are defined outside
// this snippet; the values mentioned in the comments below are inferred from
// their names and must be checked against those definitions.
//
// Outline: drop the sign of x, scale so that one unit is (presumably) a quarter
// turn, split into integer part q and fraction f, choose f or (1 - f) as the
// polynomial argument depending on the parity of q, derive the result signs
// from bit 1 of q (sine) and of q + 1 (cosine), then evaluate the same odd
// polynomial t * (p0 + t^2 * (p1 + t^2 * (p2 + t^2 * p3))) for both results.
// The sine chain lives in xmm0 and the cosine chain in xmm6; their
// instructions are interleaved.
//
// NOTE(review): the original comment here said "any x". cvttps2dq returns the
// integer-indefinite value 0x80000000 when the scaled input does not fit in a
// signed 32-bit integer, so very large |x| are unlikely to give meaningful
// results -- confirm the supported input range.
void _SSE2_SinCos(float x, float* s, float* c)
{
__asm
{
// --- strip the sign and scale the argument ---
movss xmm0, x // xmm0 = x (scalar load into the low lane)
movaps xmm7, xmm0 // xmm7 = copy of x, used to extract its sign
movss xmm1, _ps_am_inv_sign_mask // presumably 0x7FFFFFFF (everything but the sign bit)
movss xmm2, _ps_am_sign_mask // presumably 0x80000000 (sign bit only)
movss xmm3, _ps_am_2_o_pi // presumably 2/pi
andps xmm0, xmm1 // xmm0 = x & inv_sign_mask (|x| if the mask is as assumed)
andps xmm7, xmm2 // xmm7 = x & sign_mask (sign bit of x if the mask is as assumed)
mulss xmm0, xmm3 // xmm0 = y = |x| * _ps_am_2_o_pi
// --- split y into integer part q and fraction f; build parity mask and sign bits ---
pxor xmm3, xmm3 // xmm3 = 0, comparison operand for the parity test
movd xmm5, _epi32_1 // integer constant, presumably 1
movss xmm4, _ps_am_1 // presumably 1.0f
cvttps2dq xmm2, xmm0 // xmm2 = q = y truncated toward zero, as int32
pand xmm5, xmm2 // xmm5 = q & _epi32_1 (lowest bit of q if the constant is 1)
movd xmm1, _epi32_2 // integer constant, presumably 2
pcmpeqd xmm5, xmm3 // xmm5 = all-ones where (q & _epi32_1) == 0, else 0: "q is even" mask
movd xmm3, _epi32_1 // reload the integer constant for q + 1
cvtdq2ps xmm6, xmm2 // xmm6 = (float)q
paddd xmm3, xmm2 // xmm3 = q + _epi32_1
pand xmm2, xmm1 // xmm2 = q & _epi32_2
pand xmm3, xmm1 // xmm3 = (q + _epi32_1) & _epi32_2
subss xmm0, xmm6 // xmm0 = f = y - (float)q
pslld xmm2, (31 - 1) // shift left by 30: bit 1 lands in bit 31, the float sign-bit position
minss xmm0, xmm4 // f = min(f, _ps_am_1)
mov eax, s // eax = s, address that receives the xmm0 (sine) result
mov edx, c // edx = c, address that receives the xmm6 (cosine) result
subss xmm4, xmm0 // xmm4 = _ps_am_1 - f
pslld xmm3, (31 - 1) // same shift for the cosine chain's sign bit
// --- select the polynomial argument for each chain using the parity mask ---
movaps xmm6, xmm4 // xmm6 = _ps_am_1 - f
xorps xmm2, xmm7 // sine sign = shifted quadrant bit XOR the sign bits taken from x
movaps xmm7, xmm5 // xmm7 = copy of the "q is even" mask
andps xmm6, xmm7 // xmm6 = (1 - f) where q is even, else 0
andnps xmm7, xmm0 // xmm7 = f where q is odd, else 0
andps xmm0, xmm5 // xmm0 = f where q is even, else 0
andnps xmm5, xmm4 // xmm5 = (1 - f) where q is odd, else 0
movss xmm4, _ps_sincos_p3 // coefficient multiplied first in the Horner evaluation below
orps xmm6, xmm7 // cosine argument: q even ? (1 - f) : f
orps xmm0, xmm5 // sine argument:   q even ? f : (1 - f)
movss xmm5, _ps_sincos_p2 // next polynomial coefficient
// --- evaluate t * (p0 + t^2 * (p1 + t^2 * (p2 + t^2 * p3))) for both chains ---
movaps xmm1, xmm0 // xmm1 = sine argument t_s
movaps xmm7, xmm6 // xmm7 = cosine argument t_c
mulss xmm0, xmm0 // xmm0 = t_s^2
mulss xmm6, xmm6 // xmm6 = t_c^2
orps xmm1, xmm2 // OR the sine sign bit into t_s
orps xmm7, xmm3 // OR the cosine sign bit into t_c
movaps xmm2, xmm0 // xmm2 = t_s^2, kept for the remaining Horner steps
movaps xmm3, xmm6 // xmm3 = t_c^2, kept for the remaining Horner steps
mulss xmm0, xmm4 // p3 * t_s^2
mulss xmm6, xmm4 // p3 * t_c^2
movss xmm4, _ps_sincos_p1 // next polynomial coefficient
addss xmm0, xmm5 // + p2
addss xmm6, xmm5 // + p2
movss xmm5, _ps_sincos_p0 // last polynomial coefficient
mulss xmm0, xmm2 // * t_s^2
mulss xmm6, xmm3 // * t_c^2
addss xmm0, xmm4 // + p1
addss xmm6, xmm4 // + p1
mulss xmm0, xmm2 // * t_s^2
mulss xmm6, xmm3 // * t_c^2
addss xmm0, xmm5 // + p0
addss xmm6, xmm5 // + p0
mulss xmm0, xmm1 // multiply by the signed sine argument -> sine result
mulss xmm6, xmm7 // multiply by the signed cosine argument -> cosine result
// Scalar stores: each movss writes only the low 32-bit lane (one float) to its destination.
movss [eax], xmm0 // *s = sine result
movss [edx], xmm6 // *c = cosine result
}
}
比標準庫的 sin、cos 速度提高三倍。