Use branch prediction hints to make trace events cheaper

Use GCC's __builtin_expect() branch prediction hint to tell the compiler
that trace events are likely to be disabled. This lets the compiler
optimize the trace event shim code embedded in instrumented functions by
moving it out of the critical path and, on architectures that support it,
by emitting static branch prediction hints for the CPU.
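
As a rough illustration of the technique, the shim's category-enabled
check can be wrapped in a hint along the following lines. This is a
minimal sketch: UNLIKELY, g_category_enabled, kEnabledMask and
AddTraceEventSketch are hypothetical names, not the real trace_event
internals.

  #include <cstdint>

  // Hint that the condition is expected to be false.
  #define UNLIKELY(x) __builtin_expect(!!(x), 0)

  extern const uint8_t* g_category_enabled;   // looked up lazily on first use
  void AddTraceEventSketch(const char* name);  // slow path: records the event

  constexpr uint8_t kEnabledMask = 0x5;

  inline void TraceEvent0Sketch(const char* name) {
    // Tracing is usually disabled, so let the compiler move the slow path
    // out of line and, where supported, emit a static prediction hint.
    if (UNLIKELY(*g_category_enabled & kEnabledMask)) {
      AddTraceEventSketch(name);
    }
  }

With the hint in place, the compiler tends to turn the enabled case into
a forward branch to a cold block at the end of the function, which is
what the "after" disassembly below shows.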

The exact performance impact of this change is difficult to quantify,
but the disassembly shows improvements to code locality. For example,
cc::ThreadProxy::CompositeAndReadback, which begins with a call to
TRACE_EVENT0(), looks like this before the change (x64):

00000000004bc2e0 <_ZN2cc11ThreadProxy20CompositeAndReadbackEPvRKN3gfx4RectE>:
  [... prologue ...]
  4bc30a:	48 8d 35 e7 e7 2d 00 	lea    0x2de7e7(%rip),%rsi        # 79aaf8 <.L.str>
  4bc311:	48 8d 15 8b 60 2e 00 	lea    0x2e608b(%rip),%rdx        # 7a23a3 <.L.str7>
  4bc318:	48 8d bc 24 f0 03 00 	lea    0x3f0(%rsp),%rdi
  4bc31f:	00

  /* Initialize scoped trace memory tracker. */
  4bc320:	e8 7b 19 0a 00       	callq  55dca0 <_ZN4base5debug17ScopedTraceMemory10InitializeEPKcS3_>

  /* Have we seen this trace event before? */
  4bc325:	48 8b 1d 94 78 3d 00 	mov    0x3d7894(%rip),%rbx        # 893bc0 <_ZZN2cc11ThreadProxy20CompositeAndReadbackEPvRKN3gfx4RectEE28trace_event_unique_atomic149>
  4bc32c:	48 85 db             	test   %rbx,%rbx

  /* No? Better initialize it then. */
  4bc32f:	75 16                	jne    4bc347 <_ZN2cc11ThreadProxy20CompositeAndReadbackEPvRKN3gfx4RectE+0x67>
  4bc331:	48 8d 3d c0 e7 2d 00 	lea    0x2de7c0(%rip),%rdi        # 79aaf8 <.L.str>
  4bc338:	e8 63 6f 09 00       	callq  5532a0 <_ZN4base5debug8TraceLog23GetCategoryGroupEnabledEPKc>
  4bc33d:	48 89 c3             	mov    %rax,%rbx
  4bc340:	48 89 1d 79 78 3d 00 	mov    %rbx,0x3d7879(%rip)        # 893bc0 <_ZZN2cc11ThreadProxy20CompositeAndReadbackEPvRKN3gfx4RectEE28trace_event_unique_atomic149>

  /* Trace event has been initialized. Is its category enabled? (atomic load) */
  4bc347:	48 c7 84 24 d0 03 00 	movq   $0x0,0x3d0(%rsp)
  4bc34e:	00 00 00 00 00
  4bc353:	f6 03 05             	testb  $0x5,(%rbx)

  /* Trace category is disabled => jump to rest of function. */
  4bc356:	0f 84 a8 00 00 00    	je     4bc404 <_ZN2cc11ThreadProxy20CompositeAndReadbackEPvRKN3gfx4RectE+0x124>

  /*
   * Trace category is enabled => compute timestamp and arguments and
   * append the event to trace log.
   */
  4bc35c:	e8 df cc 0b 00       	callq  579040 <_ZN4base14PlatformThread9CurrentIdEv>
  4bc361:	4d 89 e6             	mov    %r12,%r14
  4bc364:	41 89 c4             	mov    %eax,%r12d
  4bc367:	e8 a4 3d 0c 00       	callq  580110 <_ZN4base9TimeTicks22NowFromSystemTraceTimeEv>
  4bc36c:	48 89 84 24 80 00 00 	mov    %rax,0x80(%rsp)
  4bc373:	00
  4bc374:	e8 47 69 09 00       	callq  552cc0 <_ZN4base5debug8TraceLog11GetInstanceEv>
  4bc379:	48 8d 8c 24 80 00 00 	lea    0x80(%rsp),%rcx
  4bc380:	00
  4bc381:	48 89 0c 24          	mov    %rcx,(%rsp)
  4bc385:	c7 44 24 30 00 00 00 	movl   $0x0,0x30(%rsp)
  4bc38c:	00
  4bc38d:	48 c7 44 24 28 00 00 	movq   $0x0,0x28(%rsp)
  4bc394:	00 00
  4bc396:	48 c7 44 24 20 00 00 	movq   $0x0,0x20(%rsp)
  4bc39d:	00 00
  4bc39f:	48 c7 44 24 18 00 00 	movq   $0x0,0x18(%rsp)
  4bc3a6:	00 00
  4bc3a8:	48 c7 44 24 10 00 00 	movq   $0x0,0x10(%rsp)
  4bc3af:	00 00
  4bc3b1:	c7 44 24 08 00 00 00 	movl   $0x0,0x8(%rsp)
  4bc3b8:	00
  4bc3b9:	48 8d 2d e3 5f 2e 00 	lea    0x2e5fe3(%rip),%rbp        # 7a23a3 <.L.str7>
  4bc3c0:	be 58 00 00 00       	mov    $0x58,%esi
  4bc3c5:	45 31 c0             	xor    %r8d,%r8d
  4bc3c8:	48 89 c7             	mov    %rax,%rdi
  4bc3cb:	48 89 da             	mov    %rbx,%rdx
  4bc3ce:	48 89 e9             	mov    %rbp,%rcx
  4bc3d1:	45 89 e1             	mov    %r12d,%r9d
  4bc3d4:	4d 89 f4             	mov    %r14,%r12
  4bc3d7:	e8 24 a4 09 00       	callq  556800 <_ZN4base5debug8TraceLog37AddTraceEventWithThreadIdAndTimestampEcPKhPKcyiRKNS_9TimeTicksEiPS5_S3_PKyPK13scoped_refptrINS0_24ConvertableToTraceFormatEEh>

  [... actual function code ...]

After the change:

00000000004bcab0 <_ZN2cc11ThreadProxy20CompositeAndReadbackEPvRKN3gfx4RectE>:
  [... prologue ...]
  4bcad8:	48 8d 35 89 f0 2d 00 	lea    0x2df089(%rip),%rsi        # 79bb68 <.L.str>
  4bcadf:	48 8d 15 2d 69 2e 00 	lea    0x2e692d(%rip),%rdx        # 7a3413 <.L.str7>
  4bcae6:	48 8d bc 24 f0 03 00 	lea    0x3f0(%rsp),%rdi
  4bcaed:	00

  /* Initialize scoped trace memory tracker. */
  4bcaee:	e8 fd 20 0a 00       	callq  55ebf0 <_ZN4base5debug17ScopedTraceMemory10InitializeEPKcS3_>

  /* Have we seen this trace event before? */
  4bcaf3:	48 8b 1d c6 80 3d 00 	mov    0x3d80c6(%rip),%rbx        # 894bc0 <_ZZN2cc11ThreadProxy20CompositeAndReadbackEPvRKN3gfx4RectEE28trace_event_unique_atomic149>
  4bcafa:	48 85 db             	test   %rbx,%rbx

  /* No? Jump to the initialization code at the end of the function. */
  4bcafd:	0f 84 7e 03 00 00    	je     4bce81 <_ZN2cc11ThreadProxy20CompositeAndReadbackEPvRKN3gfx4RectE+0x3d1>

  /* Is the trace category enabled? (atomic load) */
  4bcb03:	48 c7 84 24 d0 03 00 	movq   $0x0,0x3d0(%rsp)
  4bcb0a:	00 00 00 00 00
  4bcb0f:	f6 03 05             	testb  $0x5,(%rbx)

  /* If so, jump to the event appending code at the end of the function. */
  4bcb12:	0f 85 84 03 00 00    	jne    4bce9c <_ZN2cc11ThreadProxy20CompositeAndReadbackEPvRKN3gfx4RectE+0x3ec>

  [... actual function code ...]

  [... code for initializing trace event ...]

  [... code for appending trace event ...]

In other words, the trace-event-related code has been moved out of the
main function's code flow, which should improve code cache utilization.

Note that this change currently has no effect on ARM, where
__builtin_expect() is a no-op due to -Os and -mthumb. Although
__builtin_expect() is a GCC extension, it is also supported by Clang.
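
For compilers that do not provide __builtin_expect(), a guarded fallback
keeps the macro portable. A hedged sketch, not Chromium's actual
definition:

  // Hypothetical portability guard; the real macro in the tree may differ.
  #if defined(__GNUC__) || defined(__clang__)
  #define UNLIKELY(x) __builtin_expect(!!(x), 0)
  #else
  #define UNLIKELY(x) (x)  // no hint available; plain condition
  #endif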

Review URL: https://codereview.chromium.org/160673002

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@251046 0039d316-1c4b-4281-b951-d872f2087c98
2 files changed