Adding automatic huge page support

Summary:
This patch enables automated hugify for Bolt.
When running Bolt against a binary with -hugify specified, Bolt will inject a call to a runtime library function at the entry of the binary. The runtime library calls madvise to map the hot code region into a 2M huge page. We support both new kernels with THP support and old kernels. For kernels with THP support we simply make a madvise call, while for old kernels, we first copy the code out, remap the memory with huge pages, and then copy the code back.
With this change, we no longer need to manually call into hugify_self and precompile it with --hot-text. Instead, we can simply combine the --hugify option with existing optimizations, and at runtime it will automatically move hot code into 2M pages.

Some details around the changes made:
1. Add a command-line option to support --hugify. --hugify will automatically turn on --hot-text to get the proper hot code symbols. However, running with both --hugify and --hot-text is not allowed, since --hot-text is used on binaries that have a precompiled call to hugify_self, which contradicts the purpose of --hugify.
2. Moved the common utility functions out of instr.cpp to common.h, which will also be used by hugify.cpp. Added a few new system call definitions.
3. Added a new class that inherits RuntimeLibrary, and implemented the necessary emit and link logic for hugify.
4. Added a simple test for hugify.

(cherry picked from FBD21384529)
diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
index 5b23f72..a8786c8 100644
--- a/bolt/runtime/CMakeLists.txt
+++ b/bolt/runtime/CMakeLists.txt
@@ -12,13 +12,21 @@
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.in
                ${CMAKE_CURRENT_BINARY_DIR}/config.h)
 
-add_library(bolt_rt STATIC
+add_library(bolt_rt_instr STATIC
   instr.cpp
   ${CMAKE_CURRENT_BINARY_DIR}/config.h
   )
+add_library(bolt_rt_hugify STATIC
+  hugify.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/config.h
+  )
 
 # Don't let the compiler think it can create calls to standard libs
-target_compile_options(bolt_rt PRIVATE -ffreestanding -fno-exceptions -fno-rtti)
-target_include_directories(bolt_rt PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+target_compile_options(bolt_rt_instr PRIVATE -ffreestanding -fno-exceptions -fno-rtti)
+target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+target_compile_options(bolt_rt_hugify PRIVATE -ffreestanding -fno-exceptions -fno-rtti)
+target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
 
-install(TARGETS bolt_rt DESTINATION lib)
+# Install both freestanding runtime archives.
+install(TARGETS bolt_rt_instr bolt_rt_hugify
+  DESTINATION lib)
diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h
new file mode 100644
index 0000000..816ad87
--- /dev/null
+++ b/bolt/runtime/common.h
@@ -0,0 +1,325 @@
+#include <cstddef>
+#include <cstdint>
+
+#include "config.h"
+#ifdef HAVE_ELF_H
+#include <elf.h>
+#endif
+
+#define SAVE_ALL                                                               \
+  "push %%rax\n"                                                               \
+  "push %%rbx\n"                                                               \
+  "push %%rcx\n"                                                               \
+  "push %%rdx\n"                                                               \
+  "push %%rdi\n"                                                               \
+  "push %%rsi\n"                                                               \
+  "push %%rbp\n"                                                               \
+  "push %%r8\n"                                                                \
+  "push %%r9\n"                                                                \
+  "push %%r10\n"                                                               \
+  "push %%r11\n"                                                               \
+  "push %%r12\n"                                                               \
+  "push %%r13\n"                                                               \
+  "push %%r14\n"                                                               \
+  "push %%r15\n"
+
+#define RESTORE_ALL                                                            \
+  "pop %%r15\n"                                                                \
+  "pop %%r14\n"                                                                \
+  "pop %%r13\n"                                                                \
+  "pop %%r12\n"                                                                \
+  "pop %%r11\n"                                                                \
+  "pop %%r10\n"                                                                \
+  "pop %%r9\n"                                                                 \
+  "pop %%r8\n"                                                                 \
+  "pop %%rbp\n"                                                                \
+  "pop %%rsi\n"                                                                \
+  "pop %%rdi\n"                                                                \
+  "pop %%rdx\n"                                                                \
+  "pop %%rcx\n"                                                                \
+  "pop %%rbx\n"                                                                \
+  "pop %%rax\n"
+
+// Anonymous namespace covering everything but our library entry point
+namespace {
+
+// We use a stack-allocated buffer for string manipulation in many pieces of
+// this code, including the code that prints each line of the fdata file. This
+// buffer needs to accommodate large function names, but shouldn't be arbitrarily
+// large (dynamically allocated) for simplicity of our memory space usage.
+constexpr uint32_t BufSize = 10240;
+
+// Declare some syscall wrappers we use throughout this code to avoid linking
+// against system libc.
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $2, %%rax\n"
+                       "syscall"
+                       : "=a"(ret)
+                       : "D"(pathname), "S"(flags), "d"(mode)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $1, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd), "S"(buf), "d"(count)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $8, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd), "S"(pos), "d"(whence)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __close(uint64_t fd) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $3, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(fd)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __madvise(void *addr, size_t length, int advice) {
+  int ret;
+  __asm__ __volatile__("movq $28, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(addr), "S"(length), "d"(advice)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+/* Length of the entries in `struct utsname' is 65.  */
+#define _UTSNAME_LENGTH 65
+
+struct utsname {
+  char sysname[_UTSNAME_LENGTH];  /* Operating system name (e.g., "Linux") */
+  char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined
+                      network" */
+  char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */
+  char version[_UTSNAME_LENGTH]; /* Operating system version */
+  char machine[_UTSNAME_LENGTH]; /* Hardware identifier */
+  char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */
+};
+
+int __uname(struct utsname *buf) {
+  int ret;
+  __asm__ __volatile__("movq $63, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(buf)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+struct timespec {
+  uint64_t tv_sec;  /* seconds */
+  uint64_t tv_nsec; /* nanoseconds */
+};
+
+uint64_t __nanosleep(const timespec *req, timespec *rem) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $35, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(req), "S"(rem)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int64_t __fork() {
+  uint64_t ret;
+  __asm__ __volatile__("movq $57, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       :
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
+             uint64_t fd, uint64_t offset) {
+  void *ret;
+  register uint64_t r8 asm("r8") = fd;
+  register uint64_t r9 asm("r9") = offset;
+  register uint64_t r10 asm("r10") = flags;
+  __asm__ __volatile__("movq $9, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8),
+                         "r"(r9)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+int __mprotect(void *addr, size_t len, int prot) {
+  int ret;
+  __asm__ __volatile__("movq $10, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(addr), "S"(len), "d"(prot)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __munmap(void *addr, uint64_t size) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $11, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(addr), "S"(size)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __getpid() {
+  uint64_t ret;
+  __asm__ __volatile__("movq $39, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       :
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __getppid() {
+  uint64_t ret;
+  __asm__ __volatile__("movq $110, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       :
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+uint64_t __exit(uint64_t code) {
+  uint64_t ret;
+  __asm__ __volatile__("movq $231, %%rax\n"
+                       "syscall\n"
+                       : "=a"(ret)
+                       : "D"(code)
+                       : "cc", "rcx", "r11", "memory");
+  return ret;
+}
+
+// Helper functions for writing strings to the .fdata file. We intentionally
+// avoid using libc names (lowercase memset) to make it clear it is our impl.
+
+/// Write number Num using Base (2 to 16) to the buffer in OutBuf, returns a
+/// pointer to the end of the string. No NUL terminator is written.
+char *intToStr(char *OutBuf, uint64_t Num, uint32_t Base) {
+  const char *Chars = "0123456789abcdef";
+  char Buf[64]; // worst case: a 64-bit value printed in base 2 is 64 digits
+  char *Ptr = Buf;
+  while (Num) {
+    *Ptr++ = *(Chars + (Num % Base));
+    Num /= Base;
+  }
+  if (Ptr == Buf) {
+    *OutBuf++ = '0';
+    return OutBuf;
+  }
+  while (Ptr != Buf) {
+    *OutBuf++ = *--Ptr;
+  }
+  return OutBuf;
+}
+
+/// Copy Str to OutBuf, returns a pointer to the end of the copied string
+char *strCopy(char *OutBuf, const char *Str, int32_t Size = BufSize) {
+  while (*Str) {
+    *OutBuf++ = *Str++;
+    if (--Size <= 0)
+      return OutBuf;
+  }
+  return OutBuf;
+}
+
+void memSet(char *Buf, char C, uint32_t Size) { // fill Buf[0, Size) with C
+  for (uint32_t I = 0; I < Size; ++I) // unsigned index matches Size's type
+    *Buf++ = C;
+}
+
+void *memCpy(void *Dest, const void *Src, size_t Len) {
+  char *d = static_cast<char *>(Dest);
+  const char *s = static_cast<const char *>(Src);
+  while (Len--)
+    *d++ = *s++;
+  return Dest;
+}
+
+uint32_t strLen(const char *Str) {
+  uint32_t Size = 0;
+  while (*Str++)
+    ++Size;
+  return Size;
+}
+
+void reportError(const char *Msg, uint64_t Size) {
+  __write(2, Msg, Size);
+  __exit(1);
+}
+
+void assert(bool Assertion, const char *Msg) {
+  if (Assertion)
+    return;
+  char Buf[BufSize];
+  char *Ptr = Buf;
+  Ptr = strCopy(Ptr, "Assertion failed: ");
+  Ptr = strCopy(Ptr, Msg, BufSize - 40);
+  Ptr = strCopy(Ptr, "\n");
+  reportError(Buf, Ptr - Buf);
+}
+
+void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) {
+  char Buf[BufSize];
+  char *Ptr = Buf;
+  Ptr = strCopy(Ptr, Msg, BufSize - 23);
+  Ptr = intToStr(Ptr, Num, Base);
+  Ptr = strCopy(Ptr, "\n");
+  __write(2, Buf, Ptr - Buf);
+}
+
+void report(const char *Msg) { __write(2, Msg, strLen(Msg)); }
+
+/// 1B mutex accessed by lock xchg
+class Mutex {
+  volatile bool InUse{false};
+public:
+  bool acquire() {
+    bool Result = true;
+    // Result feeds xchg and receives the old InUse, so it is in/out ("+r").
+    asm volatile("lock; xchg %0, %1" : "+m"(InUse), "+r"(Result) : : "cc");
+    return !Result;
+  }
+  void release() { InUse = false; }
+};
+
+/// RAII wrapper for Mutex
+class Lock {
+  Mutex &M;
+
+public:
+  Lock(Mutex &M) : M(M) {
+    while (!M.acquire()) {
+    }
+  }
+  ~Lock() { M.release(); }
+};
+
+inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
+  return (Value + Align - 1) / Align * Align;
+}
+} // anonymous namespace
diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp
new file mode 100644
index 0000000..3050907
--- /dev/null
+++ b/bolt/runtime/hugify.cpp
@@ -0,0 +1,174 @@
+//===-- hugify.cpp ----------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// This file contains code that is linked to the final binary with a function
+// that is called at program entry to put hot code into a huge page.
+//
+//===----------------------------------------------------------------------===//
+
+#include "common.h"
+#include <sys/mman.h>
+
+// Enables a very verbose logging to stderr useful when debugging
+//#define ENABLE_DEBUG
+
+// Function pointers to init routines in the binary, so we can resume
+// regular execution of the function that we hooked.
+extern void (*__bolt_hugify_init_ptr)();
+
+// The __hot_start and __hot_end symbols set by Bolt. We use them to figure
+// out the range for marking huge pages.
+extern uint64_t __hot_start;
+extern uint64_t __hot_end;
+
+namespace {
+#ifdef MADV_HUGEPAGE
+/// Starting from character at \p buf, find the longest consecutive sequence
+/// of digits (0-9) and convert it to uint32_t. The converted value
+/// is put into \p ret. \p end marks the end of the buffer to avoid buffer
+/// overflow. The function \returns whether a valid uint32_t value is found.
+/// \p buf will be updated to the next character right after the digits.
+bool scanUInt32(const char *&buf, const char *end, uint32_t &ret) {
+  uint64_t result = 0;
+  const char *oldBuf = buf;
+  while (buf < end && ((*buf) >= '0' && (*buf) <= '9')) {
+    if (result <= 0xFFFFFFFFu) // once past UINT32_MAX, freeze: it cannot wrap
+      result = result * 10 + (*buf) - '0';
+    ++buf;
+  }
+  if (oldBuf == buf || result > 0xFFFFFFFFu)
+    return false;
+  ret = static_cast<uint32_t>(result);
+  return true;
+}
+
+/// Check whether the kernel supports THP by checking the kernel version.
+/// Only fb kernel 5.2 and later support it.
+bool has_pagecache_thp_support() {
+  struct utsname u;
+  int ret = __uname(&u);
+  if (ret) {
+    return false;
+  }
+
+  const char *buf = u.release;
+#ifdef ENABLE_DEBUG
+  report("[hugify] uname release: ");
+  report(buf);
+  report("\n");
+#endif
+  const char *end = buf + strLen(buf);
+  uint32_t nums[5];
+  char delims[4][5] = {".", ".", "-", "_fbk"};
+  // release should be in the format: %d.%d.%d-%d_fbk%d
+  // they represent: major, minor, release, build, fbk.
+  for (int i = 0; i < 5; ++i) {
+    if (!scanUInt32(buf, end, nums[i])) {
+      return false;
+    }
+    if (i < 4) {
+      const char *ptr = delims[i];
+      while (*ptr != '\0') {
+        if (*ptr != *buf) {
+          return false;
+        }
+        ++ptr;
+        ++buf;
+      }
+    }
+  }
+  if (nums[0] > 5) {
+    // Major is > 5.
+    return true;
+  }
+  if (nums[0] < 5) {
+    // Major is < 5.
+    return false;
+  }
+  // minor > 2 || fbk >= 5.
+  return nums[1] > 2 || nums[4] >= 5;
+}
+
+/// Old-kernel path: copy [from, to) aside, remap that range as anonymous
+/// memory advised as huge pages, then copy the code back. The raw syscall
+/// wrappers return -errno (never -1/MAP_FAILED), so errors are range-checked.
+void hugify_for_old_kernel(uint8_t *from, uint8_t *to) {
+  size_t size = to - from;
+  uint8_t *mem = reinterpret_cast<uint8_t *>(
+      __mmap(0, size, 0x3 /* PROT_READ | PROT_WRITE*/,
+             0x22 /* MAP_PRIVATE | MAP_ANONYMOUS*/, -1, 0));
+  if (reinterpret_cast<uint64_t>(mem) >= static_cast<uint64_t>(-4095)) {
+    char msg[] = "Could not allocate memory for text move\n";
+    reportError(msg, sizeof(msg) - 1); // exclude the trailing NUL
+  }
+#ifdef ENABLE_DEBUG
+  reportNumber("Allocated temporary space: ", (uint64_t)mem, 16);
+#endif
+
+  // Copy the hot code to a temporary location.
+  memCpy(mem, from, size);
+
+  // Maps out the existing hot code.
+  void *remapped = __mmap(reinterpret_cast<uint64_t>(from), size,
+                          PROT_READ | PROT_WRITE | PROT_EXEC,
+                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+  if (reinterpret_cast<uint64_t>(remapped) >= static_cast<uint64_t>(-4095)) {
+    char msg[] = "failed to mmap memory for large page move terminating\n";
+    reportError(msg, sizeof(msg) - 1);
+  }
+
+  // Mark the hot code page to be huge page.
+  if (__madvise(from, size, MADV_HUGEPAGE) < 0) {
+    char msg[] = "failed to allocate large page\n";
+    reportError(msg, sizeof(msg) - 1);
+  }
+
+  // Copy the hot code back.
+  memCpy(from, mem, size);
+
+  // Change permission back to read+execute, ignore failure
+  __mprotect(from, size, PROT_READ | PROT_EXEC);
+  __munmap(mem, size);
+}
+#endif
+
+/// Ask the kernel to back the hot text, widened to 2MB boundaries, with huge
+/// pages; kernels without THP support take the copy/remap fallback instead.
+extern "C" void __bolt_hugify_self_impl() {
+#ifdef MADV_HUGEPAGE
+  uint8_t *hotStart = (uint8_t *)&__hot_start;
+  uint8_t *hotEnd = (uint8_t *)&__hot_end;
+  // Make sure the start and end are aligned with huge page address
+  const size_t hugePageBytes = 2L * 1024 * 1024;
+  uint8_t *from = hotStart - ((intptr_t)hotStart & (hugePageBytes - 1));
+  uint8_t *to = hotEnd + (hugePageBytes - 1);
+  to -= (intptr_t)to & (hugePageBytes - 1);
+#ifdef ENABLE_DEBUG
+  reportNumber("[hugify] hot start: ", (uint64_t)hotStart, 16);
+  reportNumber("[hugify] hot end: ", (uint64_t)hotEnd, 16);
+  reportNumber("[hugify] aligned huge page from: ", (uint64_t)from, 16);
+  reportNumber("[hugify] aligned huge page to: ", (uint64_t)to, 16);
+#endif
+  if (!has_pagecache_thp_support()) {
+    hugify_for_old_kernel(from, to);
+    return;
+  }
+  // The raw syscall wrapper yields -errno on failure, not -1.
+  if (__madvise(from, (to - from), MADV_HUGEPAGE) < 0) {
+    char msg[] = "failed to allocate large page\n";
+    // TODO: allow user to control the failure behavior.
+    reportError(msg, sizeof(msg) - 1); // exclude the trailing NUL
+  }
+#endif
+}
+} // anonymous namespace
+
+/// This is hooking ELF's entry, it needs to save all machine state.
+extern "C" __attribute((naked)) void __bolt_hugify_self() {
+  __asm__ __volatile__(SAVE_ALL "call __bolt_hugify_self_impl\n" RESTORE_ALL
+                                "jmp *__bolt_hugify_init_ptr(%%rip)\n" ::
+                                    :);
+}
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index 3985373..bf1fad8 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -44,12 +44,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cstdint>
-
-#include "config.h"
-#ifdef HAVE_ELF_H
-#include <elf.h>
-#endif
+#include "common.h"
 
 // Enables a very verbose logging to stderr useful when debugging
 //#define ENABLE_DEBUG
@@ -100,253 +95,8 @@
 extern void (*__bolt_instr_init_ptr)();
 extern void (*__bolt_instr_fini_ptr)();
 
-// Anonymous namespace covering everything but our library entry point
 namespace {
 
-// We use a stack-allocated buffer for string manipulation in many pieces of
-// this code, including the code that prints each line of the fdata file. This
-// buffer needs to accomodate large function names, but shouldn't be arbitrarily
-// large (dynamically allocated) for simplicity of our memory space usage.
-constexpr uint32_t BufSize = 10240;
-
-// Declare some syscall wrappers we use throughout this code to avoid linking
-// against system libc.
-uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
-  uint64_t ret;
-  __asm__ __volatile__ (
-          "movq $2, %%rax\n"
-          "syscall"
-          : "=a"(ret)
-          : "D"(pathname), "S"(flags), "d"(mode)
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
-  uint64_t ret;
-  __asm__ __volatile__ (
-          "movq $1, %%rax\n"
-          "syscall\n"
-          : "=a"(ret)
-          : "D"(fd), "S"(buf), "d"(count)
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
-  uint64_t ret;
-  __asm__ __volatile__ (
-          "movq $8, %%rax\n"
-          "syscall\n"
-          : "=a"(ret)
-          : "D"(fd), "S"(pos), "d"(whence)
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-int __close(uint64_t fd) {
-  uint64_t ret;
-  __asm__ __volatile__ (
-          "movq $3, %%rax\n"
-          "syscall\n"
-          : "=a"(ret)
-          : "D"(fd)
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-struct timespec {
-  uint64_t tv_sec; /* seconds */
-  uint64_t tv_nsec;  /* nanoseconds */
-};
-
-uint64_t __nanosleep(const timespec *req, timespec *rem) {
-  uint64_t ret;
-  __asm__ __volatile__ (
-          "movq $35, %%rax\n"
-          "syscall\n"
-          : "=a"(ret)
-          : "D"(req), "S"(rem)
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-int64_t __fork() {
-  uint64_t ret;
-  __asm__ __volatile__("movq $57, %%rax\n"
-                       "syscall\n"
-                       : "=a"(ret)
-                       :
-                       : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
-             uint64_t fd, uint64_t offset) {
-  void *ret;
-  register uint64_t r8 asm("r8") = fd;
-  register uint64_t r9 asm("r9") = offset;
-  register uint64_t r10 asm("r10") = flags;
-  __asm__ __volatile__ (
-          "movq $9, %%rax\n"
-          "syscall\n"
-          : "=a"(ret)
-          : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), "r"(r9)
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __munmap(void *addr, uint64_t size) {
-  uint64_t ret;
-  __asm__ __volatile__ (
-          "movq $11, %%rax\n"
-          "syscall\n"
-          : "=a"(ret)
-          : "D"(addr), "S"(size)
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __getpid() {
-  uint64_t ret;
-  __asm__ __volatile__ (
-          "movq $39, %%rax\n"
-          "syscall\n"
-          : "=a"(ret)
-          :
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __getppid() {
-  uint64_t ret;
-  __asm__ __volatile__ (
-          "movq $110, %%rax\n"
-          "syscall\n"
-          : "=a"(ret)
-          :
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-uint64_t __exit(uint64_t code) {
-  uint64_t ret;
-  __asm__ __volatile__ (
-          "movq $231, %%rax\n"
-          "syscall\n"
-          : "=a"(ret)
-          : "D"(code)
-          : "cc", "rcx", "r11", "memory");
-  return ret;
-}
-
-// Helper functions for writing strings to the .fdata file. We intentionally
-// avoid using libc names (lowercase memset) to make it clear it is our impl.
-
-/// Write number Num using Base to the buffer in OutBuf, returns a pointer to
-/// the end of the string.
-char *intToStr(char *OutBuf, uint64_t Num, uint32_t Base) {
-  const char *Chars = "0123456789abcdef";
-  char Buf[21];
-  char *Ptr = Buf;
-  while (Num) {
-    *Ptr++ = *(Chars + (Num % Base));
-    Num /= Base;
-  }
-  if (Ptr == Buf) {
-    *OutBuf++ = '0';
-    return OutBuf;
-  }
-  while (Ptr != Buf) {
-    *OutBuf++ = *--Ptr;
-  }
-  return OutBuf;
-}
-
-/// Copy Str to OutBuf, returns a pointer to the end of the copied string
-char *strCopy(char *OutBuf, const char *Str, int32_t Size = BufSize) {
-  while (*Str) {
-    *OutBuf++ = *Str++;
-    if (--Size <= 0)
-      return OutBuf;
-  }
-  return OutBuf;
-}
-
-void memSet(char *Buf, char C, uint32_t Size) {
-  for (int I = 0; I < Size; ++I)
-    *Buf++ = C;
-}
-
-uint32_t strLen(const char *Str) {
-  uint32_t Size = 0;
-  while (*Str++)
-    ++Size;
-  return Size;
-}
-
-void reportError(const char *Msg, uint64_t Size) {
-  __write(2, Msg, Size);
-  __exit(1);
-}
-
-void assert(bool Assertion, const char *Msg) {
-  if (Assertion)
-    return;
-  char Buf[BufSize];
-  char *Ptr = Buf;
-  Ptr = strCopy(Ptr, "Assertion failed: ");
-  Ptr = strCopy(Ptr, Msg, BufSize - 40);
-  Ptr = strCopy(Ptr, "\n");
-  reportError(Buf, Ptr - Buf);
-}
-
-void reportNumber(const char *Msg, uint64_t Num, uint32_t Base) {
-  char Buf[BufSize];
-  char *Ptr = Buf;
-  Ptr = strCopy(Ptr, Msg, BufSize - 23);
-  Ptr = intToStr(Ptr, Num, Base);
-  Ptr = strCopy(Ptr, "\n");
-  __write(2, Buf, Ptr - Buf);
-}
-
-void report(const char *Msg) {
-  __write(2, Msg, strLen(Msg));
-}
-
-/// 1B mutex accessed by lock xchg
-class Mutex {
-  volatile bool InUse{false};
-public:
-  bool acquire() {
-    bool Result = true;
-    asm volatile("lock; xchg %0, %1"
-                 : "+m"(InUse), "=r"(Result)
-                 :
-                 : "cc");
-    return !Result;
-  }
-  void release() {
-    InUse = false;
-  }
-};
-
-/// RAII wrapper for Mutex
-class Lock {
-  Mutex &M;
-public:
-  Lock(Mutex &M) : M(M) {
-    while (!M.acquire()) {}
-  }
-  ~Lock() {
-    M.release();
-  }
-};
-
-inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
-  return (Value + Align - 1) / Align * Align;
-}
-
 /// A simple allocator that mmaps a fixed size region and manages this space
 /// in a stack fashion, meaning you always deallocate the last element that
 /// was allocated. In practice, we don't need to deallocate individual elements.
@@ -359,6 +109,7 @@
     uint64_t Magic;
     uint64_t AllocSize;
   };
+
 public:
   void *allocate(uintptr_t Size) {
     Lock L(M);
@@ -371,7 +122,7 @@
       StackSize = 0;
     }
     Size = alignTo(Size + sizeof(EntryMetadata), 16);
-    uint8_t * AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
+    uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
     auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
     M->Magic = Magic;
     M->AllocSize = Size;
@@ -416,14 +167,10 @@
   }
 
   /// Set mmap reservation size (only relevant before first allocation)
-  void setMaxSize(uint64_t Size) {
-    MaxSize = Size;
-  }
+  void setMaxSize(uint64_t Size) { MaxSize = Size; }
 
   /// Set mmap reservation privacy (only relevant before first allocation)
-  void setShared(bool S) {
-    Shared = S;
-  }
+  void setShared(bool S) { Shared = S; }
 
   void destroy() {
     if (StackBase == nullptr)
@@ -443,15 +190,12 @@
 /// Used for allocating indirect call instrumentation counters. Initialized by
 /// __bolt_instr_setup, our initialization routine.
 BumpPtrAllocator GlobalAlloc;
-
 } // anonymous namespace
 
 // User-defined placement new operators. We only use those (as opposed to
 // overriding the regular operator new) so we can keep our allocator in the
 // stack instead of in a data section (global).
-void *operator new(uintptr_t Sz, BumpPtrAllocator &A) {
-  return A.allocate(Sz);
-}
+void *operator new(uintptr_t Sz, BumpPtrAllocator &A) { return A.allocate(Sz); }
 void *operator new(uintptr_t Sz, BumpPtrAllocator &A, char C) {
   auto *Ptr = reinterpret_cast<char *>(A.allocate(Sz));
   memSet(Ptr, C, Sz);
@@ -467,9 +211,7 @@
 }
 // Only called during exception unwinding (useless). We must manually dealloc.
 // C++ language weirdness
-void operator delete(void *Ptr, BumpPtrAllocator &A) {
-  A.deallocate(Ptr);
-}
+void operator delete(void *Ptr, BumpPtrAllocator &A) { A.deallocate(Ptr); }
 
 namespace {
 
@@ -1540,7 +1282,6 @@
   }
   return FD;
 }
-
 } // anonymous namespace
 
 /// Reset all counters in case you want to start profiling a new phase of your
@@ -1673,40 +1414,6 @@
   GlobalIndCallCounters[IndCallID].incrementVal(Target, GlobalAlloc);
 }
 
-#define SAVE_ALL                                                               \
-  "push %%rax\n"                                                               \
-  "push %%rbx\n"                                                               \
-  "push %%rcx\n"                                                               \
-  "push %%rdx\n"                                                               \
-  "push %%rdi\n"                                                               \
-  "push %%rsi\n"                                                               \
-  "push %%rbp\n"                                                               \
-  "push %%r8\n"                                                                \
-  "push %%r9\n"                                                                \
-  "push %%r10\n"                                                               \
-  "push %%r11\n"                                                               \
-  "push %%r12\n"                                                               \
-  "push %%r13\n"                                                               \
-  "push %%r14\n"                                                               \
-  "push %%r15\n"
-
-#define RESTORE_ALL                                                            \
-  "pop %%r15\n"                                                                \
-  "pop %%r14\n"                                                                \
-  "pop %%r13\n"                                                                \
-  "pop %%r12\n"                                                                \
-  "pop %%r11\n"                                                                \
-  "pop %%r10\n"                                                                \
-  "pop %%r9\n"                                                                 \
-  "pop %%r8\n"                                                                 \
-  "pop %%rbp\n"                                                                \
-  "pop %%rsi\n"                                                                \
-  "pop %%rdi\n"                                                                \
-  "pop %%rdx\n"                                                                \
-  "pop %%rcx\n"                                                                \
-  "pop %%rbx\n"                                                                \
-  "pop %%rax\n"
-
 /// We receive as in-stack arguments the identifier of the indirect call site
 /// as well as the target address for the call
 extern "C" __attribute((naked)) void __bolt_instr_indirect_call()