Listing 3
float ScalarProduct( float* a1, float* a2, int n ) { float ans[4] __attribute__ ((aligned(16))); register int i; if( n >= 8 ) { __asm__ __volatile__( "xorps %%xmm0, %%xmm0" : /* outputs */ : /* inputs */ : /* clobbered */ "xmm0" ); for( i = 0; i < ( n >> 3 ); ++i ) { __asm__ __volatile__( "movups (%0), %%xmm1\n\t" "movups 16(%0), %%xmm2\n\t" "movups (%1), %%xmm3\n\t" "movups 16(%1), %%xmm4\n\t" "add $32,%0\n\t" "add $32,%1\n\t" "mulps %%xmm3, %%xmm1\n\t" "mulps %%xmm4, %%xmm2\n\t" "addps %%xmm2, %%xmm1\n\t" "addps %%xmm1, %%xmm0" : /* outputs */ "+r" ( a1 ), "+r" ( a2 ) : /* inputs */ : /* clobbered */ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" ); } __asm__ __volatile__( "movaps %%xmm0, %0" : /* outputs */ "=m" ( ans ) : /* inputs */ : /* clobbered */ "xmm0", "memory" ); n -= i << 3; ans[0] += ans[1] + ans[2] + ans[3]; } else ans[0] = 0.0; for( i = 0; i < n; ++i ) ans[0] += a1[i] * a2[i]; return( ans[0] ); }