m.s. m.s. - 2 months ago 26
C++ Question

How to remove "noise" from GCC/clang assembly output?

I want to inspect the assembly output of applying

boost::variant
in my code in order to see which intermediate calls are optimized away.

When I compile the following example (with GCC 5.3 using
g++ -O3 -std=c++14 -S
), it seems as if the compiler optimizes away everything and directly returns 100:

(...)
main:
.LFB9320:
.cfi_startproc
movl $100, %eax
ret
.cfi_endproc
(...)





#include <boost/variant.hpp>

struct Foo
{
int get() { return 100; }
};

struct Bar
{
int get() { return 999; }
};

using Variant = boost::variant<Foo, Bar>;


int run(Variant v)
{
return boost::apply_visitor([](auto& x){return x.get();}, v);
}
int main()
{
Foo f;
return run(f);
}


However, the full assembly output contains much more than the above excerpt, which to me looks like it is never called. Is there a way to tell GCC/clang to remove all that "noise" and just output what is actually called when the program is ran?




full assembly output:

.file "main1.cpp"
.section .rodata.str1.8,"aMS",@progbits,1
.align 8
.LC0:
.string "/opt/boost/include/boost/variant/detail/forced_return.hpp"
.section .rodata.str1.1,"aMS",@progbits,1
.LC1:
.string "false"
.section .text.unlikely._ZN5boost6detail7variant13forced_returnIvEET_v,"axG",@progbits,_ZN5boost6detail7variant13forced_returnIvEET_v,comdat
.LCOLDB2:
.section .text._ZN5boost6detail7variant13forced_returnIvEET_v,"axG",@progbits,_ZN5boost6detail7variant13forced_returnIvEET_v,comdat
.LHOTB2:
.p2align 4,,15
.weak _ZN5boost6detail7variant13forced_returnIvEET_v
.type _ZN5boost6detail7variant13forced_returnIvEET_v, @function
_ZN5boost6detail7variant13forced_returnIvEET_v:
.LFB1197:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $_ZZN5boost6detail7variant13forced_returnIvEET_vE19__PRETTY_FUNCTION__, %ecx
movl $49, %edx
movl $.LC0, %esi
movl $.LC1, %edi
call __assert_fail
.cfi_endproc
.LFE1197:
.size _ZN5boost6detail7variant13forced_returnIvEET_v, .-_ZN5boost6detail7variant13forced_returnIvEET_v
.section .text.unlikely._ZN5boost6detail7variant13forced_returnIvEET_v,"axG",@progbits,_ZN5boost6detail7variant13forced_returnIvEET_v,comdat
.LCOLDE2:
.section .text._ZN5boost6detail7variant13forced_returnIvEET_v,"axG",@progbits,_ZN5boost6detail7variant13forced_returnIvEET_v,comdat
.LHOTE2:
.section .text.unlikely._ZN5boost6detail7variant13forced_returnIiEET_v,"axG",@progbits,_ZN5boost6detail7variant13forced_returnIiEET_v,comdat
.LCOLDB3:
.section .text._ZN5boost6detail7variant13forced_returnIiEET_v,"axG",@progbits,_ZN5boost6detail7variant13forced_returnIiEET_v,comdat
.LHOTB3:
.p2align 4,,15
.weak _ZN5boost6detail7variant13forced_returnIiEET_v
.type _ZN5boost6detail7variant13forced_returnIiEET_v, @function
_ZN5boost6detail7variant13forced_returnIiEET_v:
.LFB9757:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $_ZZN5boost6detail7variant13forced_returnIiEET_vE19__PRETTY_FUNCTION__, %ecx
movl $39, %edx
movl $.LC0, %esi
movl $.LC1, %edi
call __assert_fail
.cfi_endproc
.LFE9757:
.size _ZN5boost6detail7variant13forced_returnIiEET_v, .-_ZN5boost6detail7variant13forced_returnIiEET_v
.section .text.unlikely._ZN5boost6detail7variant13forced_returnIiEET_v,"axG",@progbits,_ZN5boost6detail7variant13forced_returnIiEET_v,comdat
.LCOLDE3:
.section .text._ZN5boost6detail7variant13forced_returnIiEET_v,"axG",@progbits,_ZN5boost6detail7variant13forced_returnIiEET_v,comdat
.LHOTE3:
.section .text.unlikely,"ax",@progbits
.LCOLDB4:
.text
.LHOTB4:
.p2align 4,,15
.globl _Z3runN5boost7variantI3FooJ3BarEEE
.type _Z3runN5boost7variantI3FooJ3BarEEE, @function
_Z3runN5boost7variantI3FooJ3BarEEE:
.LFB9310:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl (%rdi), %eax
cltd
xorl %edx, %eax
cmpl $19, %eax
ja .L7
jmp *.L9(,%rax,8)
.section .rodata
.align 8
.align 4
.L9:
.quad .L30
.quad .L10
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.quad .L7
.text
.p2align 4,,10
.p2align 3
.L7:
call _ZN5boost6detail7variant13forced_returnIiEET_v
.p2align 4,,10
.p2align 3
.L30:
movl $100, %eax
.L8:
addq $8, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L10:
.cfi_restore_state
movl $999, %eax
jmp .L8
.cfi_endproc
.LFE9310:
.size _Z3runN5boost7variantI3FooJ3BarEEE, .-_Z3runN5boost7variantI3FooJ3BarEEE
.section .text.unlikely
.LCOLDE4:
.text
.LHOTE4:
.globl _Z3runN5boost7variantI3FooI3BarEEE
.set _Z3runN5boost7variantI3FooI3BarEEE,_Z3runN5boost7variantI3FooJ3BarEEE
.section .text.unlikely
.LCOLDB5:
.section .text.startup,"ax",@progbits
.LHOTB5:
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB9320:
.cfi_startproc
movl $100, %eax
ret
.cfi_endproc
.LFE9320:
.size main, .-main
.section .text.unlikely
.LCOLDE5:
.section .text.startup
.LHOTE5:
.section .rodata
.align 32
.type _ZZN5boost6detail7variant13forced_returnIvEET_vE19__PRETTY_FUNCTION__, @object
.size _ZZN5boost6detail7variant13forced_returnIvEET_vE19__PRETTY_FUNCTION__, 58
_ZZN5boost6detail7variant13forced_returnIvEET_vE19__PRETTY_FUNCTION__:
.string "T boost::detail::variant::forced_return() [with T = void]"
.align 32
.type _ZZN5boost6detail7variant13forced_returnIiEET_vE19__PRETTY_FUNCTION__, @object
.size _ZZN5boost6detail7variant13forced_returnIiEET_vE19__PRETTY_FUNCTION__, 57
_ZZN5boost6detail7variant13forced_returnIiEET_vE19__PRETTY_FUNCTION__:
.string "T boost::detail::variant::forced_return() [with T = int]"
.ident "GCC: (Ubuntu 5.3.0-3ubuntu1~14.04) 5.3.0 20151204"
.section .note.GNU-stack,"",@progbits

Answer

Stripping out the .cfi directives, unused labels, and comment lines is a solved problem: the scripts behind Matt Godbolt's compiler explorer are open source on its github project. It can even do colour highlighting to match source lines to asm lines (using the debug info).

For local use, -fno-asynchronous-unwind-tables avoids .cfi directives. Possibly also useful: -fno-exceptions -fno-rtti.


But really, I'd recommend just using Godbolt directly! You can quickly flip between versions of gcc and clang to see if old or new compilers do something dumb. (Or what icc13 does.) There's even ARM / PPC / ARM64 gcc 4.8.

For C instead of C++, use -xc -std=gnu11 or something, since the compiler explorer site only provides g++ / clang++, not gcc / clang.


Useful compiler options for making asm for human consumption:

  • I'd recommend using -O3 -Wall -Wextra -fverbose-asm -march=haswell) for looking at code. (-fverbose-asm can just make the source look noisy, though, when all you get are numbered temporaries as names for the operands.) When you're fiddling with the source to see how it changes the asm, you definitely want compiler warnings enabled. You don't want to waste time scratching your head over the asm when the explanation is that you did something that deserves a warning in the source.

  • -ffast-math will get many libm functions to inline, some to a single instruction (esp. with SSE4 available for roundsd). Some will inline with just -fno-math-errno, or other "safer" parts of -ffast-math, without the parts that allow the compiler to round differently. If you have FP code, definitely look at it with/without -ffast-math. If you can't safely enable any of -ffast-math in your regular build, maybe you'll get an idea for a safe change you can make in the source to allow the same optimization without -ffast-math.

  • -O3 -fno-tree-vectorize will optimize without auto-vectorizing, if you want to compare with -O2 (which doesn't enable autovectorization on gcc, but does on clang IIRC).
  • clang unrolls loops by default, so -funroll-loops can be useful in complex functions. You can get a sense of "what the compiler did" without having to wade through the unrolled loops. (gcc enables -funroll-loops with -fprofile-use, but not with -O3). (This is a suggestion for human-readable code, not for code that would run faster.

To get a mix of source and asm, use gcc -Wa,-adhln -c -g foo.c | less to pass extra options to as. (More details in a blog post). Note that the output of this isn't valid assembler input, because the C source is there directly, not as an assembler comment. So don't call it a .s. A .lst might make sense if you want to save it to a file.

Godbolt's color highlighting serves a similar purpose, and is great at helping you see when multiple non-contiguous asm instructions come from the same source line. I haven't used that gcc listing command at all, so IDK how well it does, and how easy it is for the eye to see, in that case.

I like the high code density of godbolt's asm pane, so I don't think I'd like having source lines mixed in. At least not for simple functions. Maybe with a function that was too complex to get a handle on the overall structure of what the asm does...


And remember, when you want to just look at the asm, leave out the main() and the compile-time constants. You want to see the code for dealing with a function arg in a register, not for the code after constant-propagation turns it into return 42, or at least optimizes away some stuff.

Removing static and/or inline from functions will produce a stand-alone definition for them, as well as a definition for any callers, so you can just look at that.

Don't put your code in a function called main(). gcc knows that main is special and assumes it will only be called once, so it marks it as "cold" and optimizes it less.


The other thing you can do: If you did make a main(), you can run it and use a debugger. stepi (si) steps by instruction. See the bottom of the tag wiki for instructions. But remember that code might optimize away after inlining into main with compile-time-constant args.

__attribute__((noinline)) may help, on a function that you want to not be inlined. (And hopefully not be cloned to produce a special version for the constant args that main will call with).


For example, if you want to see how the compiler multiplies two integers: I put the following code on Godbolt, to get the asm (from gcc -O3 -march=haswell -fverbose-asm) for the wrong way and the right way to test this.

// the wrong way, which people often write when they're used to creating a runnable test-case with a main() and a printf
// or worse, people will actually look at the asm for such a main()
int constants() { int a = 10, b = 20; return a * b; }
    mov     eax, 200  #,
    ret                     # compiles the same as  return 200;  not interesting

// the right way: compiler doesn't know anything about the inputs
// so we get asm like what would happen when this inlines into a bigger function.
int variables(int a, int b) { return a * b; }
    mov     eax, edi  # D.2345, a
    imul    eax, esi        # D.2345, b
    ret

(This mix of asm and C was hand-crafted by copy-pasting the asm output from godbolt into the right place)

Comments