/* compile with:
 *
 *   $ gcc -O3 -msse -mfpmath=sse
 *
 * Small x86 experiment: switch the SSE unit into flush-to-zero /
 * denormals-are-zero mode and time a loop that multiplies a float
 * down until it compares equal to zero.
 */
#include <stdio.h>
#include <xmmintrin.h>

/* MXCSR bits touched by this program. */
enum {
    MXCSR_UNDERFLOW_FLAG = 0x0010, /* sticky underflow exception flag */
    MXCSR_PRECISION_FLAG = 0x0020, /* sticky precision exception flag */
    MXCSR_DAZ            = 0x0040, /* denormals-are-zero */
    MXCSR_ROUND_TO_ZERO  = 0x6000, /* rounding-control field, both bits set */
    MXCSR_FTZ            = 0x8000  /* flush-to-zero */
};

/* CPUID leaf 1 feature bits tested below. */
enum {
    CPUID1_EDX_SSE  = 0x02000000,
    CPUID1_EDX_SSE2 = 0x04000000,
    CPUID1_ECX_SSE3 = 0x00000001
};

/* high resolution intel processor clock-cycle count
 * (wallclock!)
 *
 * The two 32-bit halves are read separately: the "=A" constraint only
 * denotes the EDX:EAX pair on 32-bit x86, so it would drop the upper
 * half of the counter on x86-64.
 */
static inline unsigned long long rdtsc(void)
{
    unsigned int lo, hi;

    __asm__ volatile ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((unsigned long long)hi << 32) | lo;
}

/* various information on x86 processor type
 *
 * f is the CPUID leaf; ax/bx/cx/dx must be 32-bit lvalues that receive
 * EAX/EBX/ECX/EDX.  Wrapped in do/while(0) so the macro behaves as a
 * single statement at the call site.
 */
#define CPUID(f,ax,bx,cx,dx) \
    do { \
        __asm__ __volatile__ ("cpuid" \
            : "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) \
            : "a" (f)); \
    } while (0)

/* Print the four bytes of a CPUID register as characters, low byte first. */
static void print_reg_chars(unsigned int reg)
{
    printf("%c%c%c%c",
           (int)(reg & 0xff),
           (int)((reg >> 8) & 0xff),
           (int)((reg >> 16) & 0xff),
           (int)((reg >> 24) & 0xff));
}

/**
 * Switch the SSE denormal handling on or off through MXCSR.
 *
 * Assume we know we are on a pentiumIII or higher (i.e. CPUID exists).
 *
 * on != 0: prints CPUID leaf 0 (max leaf, then the vendor string from
 *          EBX, EDX, ECX), then, depending on the leaf 1 feature bits:
 *            SSE  -> set FTZ and round-towards-zero,
 *            SSE3 -> additionally set DAZ (only tested when SSE2 is
 *                    also reported).
 * on == 0: clears the underflow and precision flags, FTZ, DAZ, and the
 *          rounding-control field (back to round-to-nearest).
 *
 * Returns the detected SSE level (0..3); always 0 when on == 0.
 */
int set_DAZ_and_FTZ(int on)
{
    int sse_level = 0;
    unsigned int ax, bx, cx, dx;

    if (!on) {
        /* clear underflow and precision flags,
         * set DAZ and FTZ to OFF,
         * and restore round to nearest (RN) */
        _mm_setcsr(_mm_getcsr() & ~(unsigned int)(MXCSR_UNDERFLOW_FLAG |
                                                  MXCSR_PRECISION_FLAG |
                                                  MXCSR_FTZ |
                                                  MXCSR_DAZ |
                                                  MXCSR_ROUND_TO_ZERO));
        return sse_level;
    }

    CPUID(0x00, ax, bx, cx, dx);
    printf("%x ", ax);
    print_reg_chars(bx);
    print_reg_chars(dx);
    print_reg_chars(cx);
    printf("\n");

    CPUID(0x01, ax, bx, cx, dx);
    if (dx & CPUID1_EDX_SSE) {
        sse_level = 1;
        /* set FLUSH_TO_ZERO to ON and
         * set round towards zero (RZ) */
        _mm_setcsr(_mm_getcsr() | MXCSR_FTZ | MXCSR_ROUND_TO_ZERO);

        if (dx & CPUID1_EDX_SSE2) {
            sse_level = 2;

            if (cx & CPUID1_ECX_SSE3) {
                sse_level = 3;
                /* set DENORMALS_ARE_ZERO to ON */
                _mm_setcsr(_mm_getcsr() | MXCSR_DAZ);
            }
            /* we should check for AMD K8 without SSE3 here ...
             * if(AMD_K8_NO_SSE3) ... */
        }
    }

    return sse_level;
}

/**
 * Testing denormal handling with SSE
 *
 * Repeatedly multiplies x (starting at 1.0) by 0.999999 until x compares
 * equal to zero, then prints the iteration count and the average number
 * of TSC ticks per iteration.
 *
 * Returns x (zero once the loop has finished) when return_X is non-zero,
 * otherwise y, the value x held just before the final multiplication.
 */
float f(int return_X)
{
    float x = 1.0f;
    float y = 0.0f;   /* always overwritten: the loop runs at least once */
    int i = 0;
    unsigned long long clk;

    if (return_X)
        puts("returning x");
    else
        puts("returning y");

    clk = rdtsc();

    /* unless you have compiled with:
     *
     *   $ gcc -O3 -msse -mfpmath=sse
     *
     * .. the following will never ever complete.
     */
    while (x != 0.0f) {
        /* cast all constants to float, or we might end up
         * in the dreaded x87 double precision stack! */
        y = x;
        x *= (float) 0.999999;
        ++i;
    }

    clk = rdtsc() - clk;

    /* This is funny?  x already compares equal to zero here; the
     * assignment is kept from the original experiment. */
    if (x == 0.0f)
        x = 0.0f;

    printf("\n iterations: %d", i);
    printf("\n clocks/iterations: %llu\n\n", clk / i);

    return return_X ? x : y;
}

int main(void)
{
    float z;
    int sse;

    puts("");

    // printf("1st mxscr: %x\n",_mm_getcsr());
    sse = set_DAZ_and_FTZ(1);
    printf(".. sse%d detected\n", sse);
    // printf("2nd mxscr: %x\n",_mm_getcsr());

    z = f(1) - f(0);

    if (z == 0.0)
        puts("x - y is zero (wrong!)\n");
    else
        puts("x - y is not zero (correct!)\n");

    // printf("3rd mxscr: %x\n",_mm_getcsr());

    /* clean up before we leave */
    set_DAZ_and_FTZ(0);

    // printf("4th mxscr: %x\n",_mm_getcsr());
    return 0;
}