orlp - 9 months ago 38

C Question

While writing an optimized

`ftol`

`GCC 4.6.1`

fast_trunc_one, C:

`int fast_trunc_one(int i) {`

int mantissa, exponent, sign, r;

mantissa = (i & 0x07fffff) | 0x800000;

exponent = 150 - ((i >> 23) & 0xff);

sign = i & 0x80000000;

if (exponent < 0) {

r = mantissa << -exponent; /* diff */

} else {

r = mantissa >> exponent; /* diff */

}

return (r ^ -sign) + sign; /* diff */

}

fast_trunc_two, C:

`int fast_trunc_two(int i) {`

int mantissa, exponent, sign, r;

mantissa = (i & 0x07fffff) | 0x800000;

exponent = 150 - ((i >> 23) & 0xff);

sign = i & 0x80000000;

if (exponent < 0) {

r = (mantissa << -exponent) ^ -sign; /* diff */

} else {

r = (mantissa >> exponent) ^ -sign; /* diff */

}

return r + sign; /* diff */

}

Seems the same right? Well GCC disagrees. After compiling with

`gcc -O3 -S -Wall -o test.s test.c`

fast_trunc_one, generated:

`_fast_trunc_one:`

LFB0:

.cfi_startproc

movl 4(%esp), %eax

movl $150, %ecx

movl %eax, %edx

andl $8388607, %edx

sarl $23, %eax

orl $8388608, %edx

andl $255, %eax

subl %eax, %ecx

movl %edx, %eax

sarl %cl, %eax

testl %ecx, %ecx

js L5

rep

ret

.p2align 4,,7

L5:

negl %ecx

movl %edx, %eax

sall %cl, %eax

ret

.cfi_endproc

fast_trunc_two, generated:

`_fast_trunc_two:`

LFB1:

.cfi_startproc

pushl %ebx

.cfi_def_cfa_offset 8

.cfi_offset 3, -8

movl 8(%esp), %eax

movl $150, %ecx

movl %eax, %ebx

movl %eax, %edx

sarl $23, %ebx

andl $8388607, %edx

andl $255, %ebx

orl $8388608, %edx

andl $-2147483648, %eax

subl %ebx, %ecx

js L9

sarl %cl, %edx

movl %eax, %ecx

negl %ecx

xorl %ecx, %edx

addl %edx, %eax

popl %ebx

.cfi_remember_state

.cfi_def_cfa_offset 4

.cfi_restore 3

ret

.p2align 4,,7

L9:

.cfi_restore_state

negl %ecx

sall %cl, %edx

movl %eax, %ecx

negl %ecx

xorl %ecx, %edx

addl %edx, %eax

popl %ebx

.cfi_restore 3

.cfi_def_cfa_offset 4

ret

.cfi_endproc

That's an

`fast_trunc_one`

`fast_trunc_two`

Answer Source

**Updated to sync with the OP's edit**

By tinkering with the code, I've managed to see how GCC optimizes the first case.

**Before we can understand why they are so different, first we must understand how GCC optimizes fast_trunc_one().**

Believe it or not, `fast_trunc_one()`

is being optimized to this:

```
int fast_trunc_one(int i) {
int mantissa, exponent;
mantissa = (i & 0x07fffff) | 0x800000;
exponent = 150 - ((i >> 23) & 0xff);
if (exponent < 0) {
return (mantissa << -exponent); /* diff */
} else {
return (mantissa >> exponent); /* diff */
}
}
```

This produces the exact same assembly as the original `fast_trunc_one()`

- register names and everything.

Notice that there are no `xor`

s in the assembly for `fast_trunc_one()`

. That's what gave it away for me.

**Step 1:** `sign = -sign`

First, let's take a look at the `sign`

variable. Since `sign = i & 0x80000000;`

, there are only two possible values that `sign`

can take:

`sign = 0`

`sign = 0x80000000`

Now recognize that in both cases, `sign == -sign`

. Therefore, when I change the original code to this:

```
int fast_trunc_one(int i) {
int mantissa, exponent, sign, r;
mantissa = (i & 0x07fffff) | 0x800000;
exponent = 150 - ((i >> 23) & 0xff);
sign = i & 0x80000000;
if (exponent < 0) {
r = mantissa << -exponent;
} else {
r = mantissa >> exponent;
}
return (r ^ sign) + sign;
}
```

It produces the exact same assembly as the original `fast_trunc_one()`

. I'll spare you the assembly, but it is identical - register names and all.

**Step 2:** Mathematical reduction: `x + (y ^ x) = y`

`sign`

can only take one of two values, `0`

or `0x80000000`

.

- When
`x = 0`

, then`x + (y ^ x) = y`

then trivial holds. - Adding and xoring by
`0x80000000`

is the same. It flips the sign bit. Therefore`x + (y ^ x) = y`

also holds when`x = 0x80000000`

.

Therefore, `x + (y ^ x)`

reduces to `y`

. And the code simplifies to this:

```
int fast_trunc_one(int i) {
int mantissa, exponent, sign, r;
mantissa = (i & 0x07fffff) | 0x800000;
exponent = 150 - ((i >> 23) & 0xff);
sign = i & 0x80000000;
if (exponent < 0) {
r = (mantissa << -exponent);
} else {
r = (mantissa >> exponent);
}
return r;
}
```

Again, this compiles to the exact same assembly - register names and all.

This above version finally reduces to this:

```
int fast_trunc_one(int i) {
int mantissa, exponent;
mantissa = (i & 0x07fffff) | 0x800000;
exponent = 150 - ((i >> 23) & 0xff);
if (exponent < 0) {
return (mantissa << -exponent); /* diff */
} else {
return (mantissa >> exponent); /* diff */
}
}
```

which is pretty much exactly what GCC generates in the assembly.

So why doesn't the compiler optimize `fast_trunc_two()`

to the same thing?

The key part in `fast_trunc_one()`

is the `x + (y ^ x) = y`

optimization. In `fast_trunc_two()`

the `x + (y ^ x)`

expression is being split across the branch.

I suspect that might be enough to confuse GCC to not make this optimization. (It would need to hoist the `^ -sign`

out of the branch and merge it into the `r + sign`

at the end.)

For example, this produces the same assembly as `fast_trunc_one()`

:

```
int fast_trunc_two(int i) {
int mantissa, exponent, sign, r;
mantissa = (i & 0x07fffff) | 0x800000;
exponent = 150 - ((i >> 23) & 0xff);
sign = i & 0x80000000;
if (exponent < 0) {
r = ((mantissa << -exponent) ^ -sign) + sign; /* diff */
} else {
r = ((mantissa >> exponent) ^ -sign) + sign; /* diff */
}
return r; /* diff */
}
```