From glibc-2.7 w_fmod.c:

Code:

#ifdef __STDC__
double __fmod(double x, double y) /* wrapper fmod */
#else
double __fmod(x,y) /* wrapper fmod */
double x,y;
#endif
{
#ifdef _IEEE_LIBM
return __ieee754_fmod(x,y);
#else
double z;
z = __ieee754_fmod(x,y);
if(_LIB_VERSION == _IEEE_ ||__isnan(y)||__isnan(x)) return z;
if(y==0.0) {
return __kernel_standard(x,y,27); /* fmod(x,0) */
} else
return z;
#endif
}

And __ieee754_fmod() from e_fmod.c in the sysdeps/ieee754/dbl-64 folder:

Code:

#ifdef __STDC__
static const double one = 1.0, Zero[] = {0.0, -0.0,};
#else
static double one = 1.0, Zero[] = {0.0, -0.0,};
#endif
#ifdef __STDC__
double __ieee754_fmod(double x, double y)
#else
double __ieee754_fmod(x,y)
double x,y ;
#endif
{
int32_t n,hx,hy,hz,ix,iy,sx,i;
u_int32_t lx,ly,lz;
EXTRACT_WORDS(hx,lx,x);
EXTRACT_WORDS(hy,ly,y);
sx = hx&0x80000000; /* sign of x */
hx ^=sx; /* |x| */
hy &= 0x7fffffff; /* |y| */
/* purge off exception values */
if((hy|ly)==0||(hx>=0x7ff00000)|| /* y=0,or x not finite */
((hy|((ly|-ly)>>31))>0x7ff00000)) /* or y is NaN */
return (x*y)/(x*y);
if(hx<=hy) {
if((hx<hy)||(lx<ly)) return x; /* |x|<|y| return x */
if(lx==ly)
return Zero[(u_int32_t)sx>>31]; /* |x|=|y| return x*0*/
}
/* determine ix = ilogb(x) */
if(hx<0x00100000) { /* subnormal x */
if(hx==0) {
for (ix = -1043, i=lx; i>0; i<<=1) ix -=1;
} else {
for (ix = -1022,i=(hx<<11); i>0; i<<=1) ix -=1;
}
} else ix = (hx>>20)-1023;
/* determine iy = ilogb(y) */
if(hy<0x00100000) { /* subnormal y */
if(hy==0) {
for (iy = -1043, i=ly; i>0; i<<=1) iy -=1;
} else {
for (iy = -1022,i=(hy<<11); i>0; i<<=1) iy -=1;
}
} else iy = (hy>>20)-1023;
/* set up {hx,lx}, {hy,ly} and align y to x */
if(ix >= -1022)
hx = 0x00100000|(0x000fffff&hx);
else { /* subnormal x, shift x to normal */
n = -1022-ix;
if(n<=31) {
hx = (hx<<n)|(lx>>(32-n));
lx <<= n;
} else {
hx = lx<<(n-32);
lx = 0;
}
}
if(iy >= -1022)
hy = 0x00100000|(0x000fffff&hy);
else { /* subnormal y, shift y to normal */
n = -1022-iy;
if(n<=31) {
hy = (hy<<n)|(ly>>(32-n));
ly <<= n;
} else {
hy = ly<<(n-32);
ly = 0;
}
}
/* fix point fmod */
n = ix - iy;
while(n--) {
hz=hx-hy;lz=lx-ly; if(lx<ly) hz -= 1;
if(hz<0){hx = hx+hx+(lx>>31); lx = lx+lx;}
else {
if((hz|lz)==0) /* return sign(x)*0 */
return Zero[(u_int32_t)sx>>31];
hx = hz+hz+(lz>>31); lx = lz+lz;
}
}
hz=hx-hy;lz=lx-ly; if(lx<ly) hz -= 1;
if(hz>=0) {hx=hz;lx=lz;}
/* convert back to floating value and restore the sign */
if((hx|lx)==0) /* return sign(x)*0 */
return Zero[(u_int32_t)sx>>31];
while(hx<0x00100000) { /* normalize x */
hx = hx+hx+(lx>>31); lx = lx+lx;
iy -= 1;
}
if(iy>= -1022) { /* normalize output */
hx = ((hx-0x00100000)|((iy+1023)<<20));
INSERT_WORDS(x,hx|sx,lx);
} else { /* subnormal output */
n = -1022 - iy;
if(n<=20) {
lx = (lx>>n)|((u_int32_t)hx<<(32-n));
hx >>= n;
} else if (n<=31) {
lx = (hx<<(32-n))|(lx>>n); hx = sx;
} else {
lx = hx>>(n-32); hx = sx;
}
INSERT_WORDS(x,hx|sx,lx);
x *= one; /* create necessary signal */
}
return x; /* exact output */

When it comes to the glibc math I assume it to pretty hyperoptimized to no end, though I still get same numbers as you. Could be some rare exceptions being considered in all those if-cases that cause the slowdown, but I can't say much from just the comments alone.