| ||
/*
 * vmp_VecNormalize -- scale a 3D vector to unit length (reference C version).
 *
 *   pvD  destination vector; receives the normalized x, y, z
 *   pvA  source vector
 *
 * The length is computed before anything is written, and each component
 * is read before the matching destination component is stored.
 *
 * When the length is below 0.0000001f the source is considered too close
 * to zero to normalize and is copied to the destination unchanged.
 */
void vmp_VecNormalize(vmp3DVector * const pvD, const vmp3DVector * const pvA)
{
    /* Length = sqrt(x*x + y*y + z*z) */
    const float fLength = sqrtf(pvA->x*pvA->x + pvA->y*pvA->y + pvA->z*pvA->z);
    float fScale;

    /* Degenerate case: too close to zero, pass the input straight through. */
    if (fLength < 0.0000001f)
    {
        pvD->x = pvA->x;
        pvD->y = pvA->y;
        pvD->z = pvA->z;
        return;
    }

    /* One division to get the reciprocal, then three multiplies. */
    fScale = 1.0f / fLength;
    pvD->x = pvA->x * fScale;
    pvD->y = pvA->y * fScale;
    pvD->z = pvA->z * fScale;
}
The 3DNow! instruction set operates on 64-bit registers, so the three-float vector must be handled with two loads and two stores (one for the {x y} pair and one for z), but it is a simple matter of adding the two pairs of floats to each other.
; Vector normalize, 3DNow! version:  vD = vA * (1 / sqrt(Ax*Ax + Ay*Ay + Az*Az))
; If the squared length compares below FLOAT0001, vA is stored to vD unscaled.
;
; Register roles
;   eax = vA (source)          edx = vD (destination)
;   mm4 = {Ay Ax}, mm3 = {0 Az}   source components, scaled in place
;   mm0 = squared length in the low dword
;   mm1, mm2 = reciprocal square root work registers
;   ecx = raw bits of the squared length (for the threshold test)

        mov     edx,vD                          ; Vector destination
        mov     eax,vA                          ; Vector A

        ; Load the three components as a {y x} pair plus a lone z.
        movd    mm0,(vmp3DVector PTR [eax]).z   ; {0  Az}
        movq    mm1,[eax]                       ; {Ay Ax}
        movq    mm3,mm0                         ; {0  Az}  kept for final scale
        movq    mm4,mm1                         ; {Ay Ax}  kept for final scale

        ; r = Ax*Ax + Ay*Ay + Az*Az  (ends up in the low dword of mm0)
        pfmul   mm0,mm0                         ; {0 AzAz}
        pfmul   mm1,mm1                         ; {AyAy AxAx}
        pfacc   mm1,mm1                         ; {AyAy+AxAx AyAy+AxAx}
        pfadd   mm0,mm1                         ; {0+AyAy+AxAx AzAz+AyAy+AxAx}

        ; Too close to zero?  The raw dword of r is compared as a signed
        ; integer against FLOAT0001 (0.0001 per the original listing;
        ; presumably defined as that float's bit pattern -- confirm).
        movd    ecx,mm0
        cmp     ecx,FLOAT0001                   ; 0.0001
        jl      short zmag                      ; yes: just store vD = vA

        ; Not too close to zero.  pfrsqrt alone is a 15-bit estimate; the
        ; pfrsqit1/pfrcpit2 pair is the Newton-Raphson refinement to 24 bits.
        pfrsqrt mm1,mm0                         ; {e e}, e ~= 1/sqrt(r)
        movq    mm2,mm1                         ; keep the estimate for step 2
        pfmul   mm1,mm1                         ; {e*e e*e} ~= {1/r 1/r}
        pfrsqit1 mm1,mm0                        ; refinement, 1st step
        pfrcpit2 mm1,mm2                        ; refinement, 2nd step {# 1/m}
        punpckldq mm1,mm1                       ; copy low dword up: {1/m 1/m}

        ; Scale the saved components by 1/m.
        pfmul   mm4,mm1                         ; {Ny Nx} = {Ay/m Ax/m}
        pfmul   mm3,mm1                         ; {0  Nz} = {0/m  Az/m}

zmag:
        ; Store the result {x y z}; on the zmag shortcut mm4/mm3 still hold vA.
        movq    [edx+0],mm4                     ; {Ny Nx}
        movd    (vmp3DVector PTR [edx]).z,mm3   ; {0  Nz}
If the data is unaligned, change the MOVAPS instruction to MOVUPS.
; Vector normalize, SSE version:  vD.xyz = vA.xyz / sqrt(Ax*Ax + Ay*Ay + Az*Az)
; The fourth (high) float of the destination is read back and preserved.
; If the squared length compares below FLOAT0001, vA.xyz is stored unscaled.
;
; Expects eax = vA and edx = vD on entry (not loaded in this listing --
; confirm against the surrounding code).  movaps needs aligned operands;
; switch to movups for unaligned data.
;
; Register roles
;   xmm6 = {0 Az Ay Ax}   source, divided in place
;   xmm7 = {Dw 0 0 0}     preserved high float of the destination
;   xmm0, xmm1, xmm2      squares, rotated copies, and the sum

        ; Source: load and clear the unused high float.
        movaps  xmm0,[eax]                      ; {#  Az Ay Ax}
        andps   xmm0,lomsk96                    ; {0  Az Ay Ax}

        ; Destination: keep only its high float.
        movaps  xmm7,[edx]                      ; {Dw #  #  # }
        andps   xmm7,himsk32                    ; {Dw 0  0  0 }

        movaps  xmm6,xmm0                       ; {0 Az Ay Ax} kept for divide
        mulps   xmm0,xmm0                       ; {0 AzAz AyAy AxAx}

        ; Two rotated copies of the squares, so adding all three puts the
        ; full sum in each of the low three lanes.  ONEHIGH makes the high
        ; lane 1, which keeps the later divide at 0/1 there.
        movaps  xmm2,xmm0                       ; {0 AzAz AyAy AxAx}
        movaps  xmm1,xmm0                       ; {0 AzAz AyAy AxAx}
        shufps  xmm2,xmm2,11010010b             ; 3102 {0 AyAy AxAx AzAz}
        orps    xmm1,ONEHIGH                    ; {1 AzAz AyAy AxAx}
        shufps  xmm1,xmm1,11001001b             ; 3021 {1 AxAx AzAz AyAy}
        addps   xmm1,xmm0                       ; {1+0 AxAx+AzAz AzAz+AyAy AyAy+AxAx}
        addps   xmm1,xmm2                       ; {1+0 r r r}, r = AxAx+AyAy+AzAz

        ; Too close to zero?  r is spilled to uflow and its raw dword is
        ; compared as a signed integer against FLOAT0001 (0.0001f per the
        ; original listing; presumably that float's bit pattern -- confirm).
        movss   uflow,xmm1                      ; r = AyAy+AzAz+AxAx
        cmp     uflow,FLOAT0001                 ; 0.0001f
        jl      short zmag                      ; yes: store vD = vA

        ; Calculate the length and divide it out.
        sqrtps  xmm0,xmm1                       ; {1 m m m}, m = sqrt(r)
        divps   xmm6,xmm0                       ; {0 Nz Ny Nx}

zmag:
        ; Merge the preserved high float back in and store.
        orps    xmm7,xmm6                       ; {Dw Nz Ny Nx}
        movaps  [edx],xmm7                      ; Save
Question: How would you upgrade the estimated-precision version of the code to full 24-bit precision?