[sampling heap profiler] Speed up RecordFree by removing atomic locks.

The patch removes all the atomic-based locking from the RecordFree function.
To make this work we prohibit rehashing the samples map in place. Instead,
when the map comes close to the rehashing threshold, it is copied into a
larger map, which becomes the current one for all future accesses from the
RecordFree function. To resolve the race with threads that might currently
be running RecordFree, we keep all the previous copies of the map alive.
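
For illustration, the core of the scheme can be sketched as the following
standalone snippet. This is a simplified sketch, not the patched code itself:
std::atomic stands in for base::subtle, the caller-held profiler mutex is
only mentioned in comments, and all names are illustrative.

  #include <atomic>
  #include <memory>
  #include <stack>
  #include <unordered_map>

  struct Sample { size_t size = 0; };
  using SamplesMap = std::unordered_map<void*, Sample>;

  std::atomic<SamplesMap*> g_current_map{nullptr};
  std::stack<std::unique_ptr<SamplesMap>> g_old_maps;  // Keeps old maps alive.

  void InitSamplesMap() {
    auto initial = std::make_unique<SamplesMap>(64);
    g_current_map.store(initial.get(), std::memory_order_release);
    g_old_maps.push(std::move(initial));
  }

  // Writer side; callers are serialized by the profiler mutex. The published
  // map is never rehashed in place: once an insert could trigger a rehash, a
  // copy with twice the bucket count is published instead.
  SamplesMap& EnsureNoRehashingMap() {
    SamplesMap& samples = *g_current_map.load(std::memory_order_relaxed);
    size_t max_before_rehash = static_cast<size_t>(
        samples.bucket_count() * samples.max_load_factor());
    if (samples.size() + 2 < max_before_rehash)
      return samples;
    auto bigger = std::make_unique<SamplesMap>(
        samples.begin(), samples.end(), samples.bucket_count() * 2);
    g_current_map.store(bigger.get(), std::memory_order_release);
    g_old_maps.push(std::move(bigger));
    return *g_current_map.load(std::memory_order_relaxed);
  }

  // Reader side (the RecordFree fast path): no locks and no atomic
  // read-modify-write operations. The map a reader loaded stays allocated
  // even after a newer copy is published; in the real patch a hit is then
  // re-checked under the profiler lock (DoRecordFree) before removal.
  bool MaybeSampled(void* address) {
    const SamplesMap& samples = *g_current_map.load(std::memory_order_acquire);
    return samples.find(address) != samples.end();
  }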

Performance-wise, this speeds up the sampler 2x at a sampling interval of
128KB. It comes at the cost of using roughly twice as much memory for the
samples map.

BUG=803276

Change-Id: Icdcdfb7b168b10d9d83d1277328ccc7944d0d795
Reviewed-on: https://chromium-review.googlesource.com/1060088
Reviewed-by: Dmitry Gozman <[email protected]>
Reviewed-by: Primiano Tucci <[email protected]>
Commit-Queue: Alexei Filippov <[email protected]>
Cr-Commit-Position: refs/heads/master@{#561159}
diff --git a/base/sampling_heap_profiler/benchmark-octane.js b/base/sampling_heap_profiler/benchmark-octane.js
new file mode 100644
index 0000000..1e59af52
--- /dev/null
+++ b/base/sampling_heap_profiler/benchmark-octane.js
@@ -0,0 +1,66 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// To benchmark a specific version of Chrome set the CHROME_PATH environment
+// variable, e.g.:
+// $ CHROME_PATH=~/chromium/out/Release/chrome node benchmark-octane.js
+
+const puppeteer = require('puppeteer');
+
+let base_score;
+
+async function runOctane(samplingRate) {
+  const args = ['--enable-devtools-experiments'];
+  if (samplingRate)
+    args.push(`--sampling-heap-profiler=${samplingRate}`);
+  const browser = await puppeteer.launch({
+      executablePath: process.env.CHROME_PATH, args, headless: true});
+  try {
+    const page = await browser.newPage();
+    await page.goto('https://chromium.github.io/octane/');
+    await page.waitForSelector('#run-octane');  // Just in case.
+    await page.click('#run-octane');
+
+    const scoreDiv = await page.waitForSelector('#main-banner:only-child',
+        {timeout: 120000});
+    const scoreText = await page.evaluate(e => e.innerText, scoreDiv);
+    const match = /Score:\s*(\d+)/.exec(scoreText);
+    if (!match) {
+      console.log(`Error: cannot parse score from '${scoreText}'`);
+      return 0;
+    }
+    return parseInt(match[1]);
+  } finally {
+    await browser.close();
+  }
+}
+
+async function makeRuns(rate) {
+  console.log(`testing rate: ${rate}`);
+  let sum = 0;
+  let sum2 = 0;
+  const n = 10;
+  for (let i = 0; i < n; ++i) {
+    const score = await runOctane(rate);
+    console.log(score);
+    sum += score;
+    sum2 += score * score;
+  }
+  const mean = sum / n;
+  const stdev = Math.sqrt(sum2 / n - mean * mean);
+  console.log(`rate: ${rate}   mean: ${mean}   stdev: ${stdev}`);
+  return mean;
+}
+
+async function main() {
+  console.log(`Using ${process.env.CHROME_PATH || puppeteer.executablePath()}`);
+  const base_score = await makeRuns(0);
+  for (let rate = 32; rate <= 2048; rate *= 2) {
+    const score = await makeRuns(rate);
+    console.log(`slowdown: ${(100 - score / base_score * 100).toFixed(2)}%\n`);
+  }
+}
+
+main();
+
diff --git a/base/sampling_heap_profiler/sampling_heap_profiler.cc b/base/sampling_heap_profiler/sampling_heap_profiler.cc
index 3d7424bb..7f0e7e4 100644
--- a/base/sampling_heap_profiler/sampling_heap_profiler.cc
+++ b/base/sampling_heap_profiler/sampling_heap_profiler.cc
@@ -40,20 +40,12 @@
 // A positive value if profiling is running, otherwise it's zero.
 Atomic32 g_running;
 
-// Number of lock-free safe (not causing rehashing) accesses to samples_ map
-// currently being performed.
-Atomic32 g_operations_in_flight;
-
-// Controls if new incoming lock-free accesses are allowed.
-// When set to true, threads should not enter lock-free paths.
-Atomic32 g_fast_path_is_closed;
+// Pointer to the current |SamplingHeapProfiler::SamplesMap|.
+AtomicWord g_current_samples_map;
 
 // Sampling interval parameter, the mean value for intervals between samples.
 AtomicWord g_sampling_interval = kDefaultSamplingIntervalBytes;
 
-// Last generated sample ordinal number.
-uint32_t g_last_sample_ordinal = 0;
-
 void (*g_hooks_install_callback)();
 Atomic32 g_hooks_installed;
 
@@ -182,6 +174,10 @@
 
 SamplingHeapProfiler::SamplingHeapProfiler() {
   instance_ = this;
+  auto samples_map = std::make_unique<SamplesMap>(64);
+  base::subtle::NoBarrier_Store(
+      &g_current_samples_map, reinterpret_cast<AtomicWord>(samples_map.get()));
+  sample_maps_.push(std::move(samples_map));
 }
 
 // static
@@ -235,7 +231,7 @@
 uint32_t SamplingHeapProfiler::Start() {
   InstallAllocatorHooksOnce();
   base::subtle::Barrier_AtomicIncrement(&g_running, 1);
-  return g_last_sample_ordinal;
+  return last_sample_ordinal_;
 }
 
 void SamplingHeapProfiler::Stop() {
@@ -333,44 +329,19 @@
   entered_.Set(true);
   {
     base::AutoLock lock(mutex_);
-
-    Sample sample(size, total_allocated, ++g_last_sample_ordinal);
+    Sample sample(size, total_allocated, ++last_sample_ordinal_);
     RecordStackTrace(&sample, skip_frames);
-
-    if (MayRehashOnInsert()) {
-      // Close the fast path as inserting an element into samples_ may cause
-      // rehashing that invalidates iterators affecting all the concurrent
-      // readers.
-      base::subtle::Release_Store(&g_fast_path_is_closed, 1);
-      // Wait until all current readers leave.
-      while (base::subtle::Acquire_Load(&g_operations_in_flight)) {
-        while (base::subtle::NoBarrier_Load(&g_operations_in_flight)) {
-        }
-      }
-      samples_.emplace(address, std::move(sample));
-      // Open the fast path.
-      base::subtle::Release_Store(&g_fast_path_is_closed, 0);
-    } else {
-      samples_.emplace(address, std::move(sample));
-    }
-
     for (auto* observer : observers_)
       observer->SampleAdded(sample.ordinal, size, total_allocated);
+    EnsureNoRehashingMap().emplace(address, std::move(sample));
   }
-
   entered_.Set(false);
 }
 
 // static
 void SamplingHeapProfiler::RecordFree(void* address) {
-  bool maybe_sampled = true;  // Pessimistically assume allocation was sampled.
-  base::subtle::Barrier_AtomicIncrement(&g_operations_in_flight, 1);
-  if (LIKELY(!base::subtle::NoBarrier_Load(&g_fast_path_is_closed))) {
-    maybe_sampled =
-        instance_->samples_.find(address) != instance_->samples_.end();
-  }
-  base::subtle::Barrier_AtomicIncrement(&g_operations_in_flight, -1);
-  if (maybe_sampled)
+  const SamplesMap& samples = SamplingHeapProfiler::samples();
+  if (UNLIKELY(samples.find(address) != samples.end()))
     instance_->DoRecordFree(address);
 }
 
@@ -382,21 +353,43 @@
   entered_.Set(true);
   {
     base::AutoLock lock(mutex_);
-    auto it = samples_.find(address);
-    if (it != samples_.end()) {
-      for (auto* observer : observers_)
-        observer->SampleRemoved(it->second.ordinal);
-      samples_.erase(it);
-    }
+    SamplesMap& samples = this->samples();
+    auto it = samples.find(address);
+    CHECK(it != samples.end());
+    for (auto* observer : observers_)
+      observer->SampleRemoved(it->second.ordinal);
+    samples.erase(it);
   }
   entered_.Set(false);
 }
 
-bool SamplingHeapProfiler::MayRehashOnInsert() {
+SamplingHeapProfiler::SamplesMap& SamplingHeapProfiler::EnsureNoRehashingMap() {
+  // The function makes sure we never rehash the current map in place.
+  // Instead, if it comes close to the rehashing boundary, we allocate a map
+  // twice as large, copy the samples into it, and atomically switch new
+  // readers to use the new map.
+  // We still have to keep all the old maps alive to resolve the theoretical
+  // race with readers in |RecordFree| that have already obtained the map,
+  // but haven't yet managed to access it.
+  SamplesMap& samples = this->samples();
   size_t max_items_before_rehash =
-      std::floor(samples_.bucket_count() * samples_.max_load_factor());
+      static_cast<size_t>(samples.bucket_count() * samples.max_load_factor());
   // Conservatively use 2 instead of 1 to work around potential rounding errors.
-  return samples_.size() + 2 >= max_items_before_rehash;
+  bool may_rehash_on_insert = samples.size() + 2 >= max_items_before_rehash;
+  if (!may_rehash_on_insert)
+    return samples;
+  auto new_map = std::make_unique<SamplesMap>(samples.begin(), samples.end(),
+                                              samples.bucket_count() * 2);
+  base::subtle::Release_Store(&g_current_samples_map,
+                              reinterpret_cast<AtomicWord>(new_map.get()));
+  sample_maps_.push(std::move(new_map));
+  return this->samples();
+}
+
+// static
+SamplingHeapProfiler::SamplesMap& SamplingHeapProfiler::samples() {
+  return *reinterpret_cast<SamplesMap*>(
+      base::subtle::NoBarrier_Load(&g_current_samples_map));
 }
 
 // static
@@ -439,7 +432,7 @@
   std::vector<Sample> samples;
   {
     base::AutoLock lock(mutex_);
-    for (auto& it : samples_) {
+    for (auto& it : this->samples()) {
       Sample& sample = it.second;
       if (sample.ordinal > profile_id)
         samples.push_back(sample);
diff --git a/base/sampling_heap_profiler/sampling_heap_profiler.h b/base/sampling_heap_profiler/sampling_heap_profiler.h
index 3f2f227a..c579220 100644
--- a/base/sampling_heap_profiler/sampling_heap_profiler.h
+++ b/base/sampling_heap_profiler/sampling_heap_profiler.h
@@ -2,9 +2,11 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#ifndef BASE_SAMPLING_HEAP_PROFILER_SAMPLING_HEAP_PROFILER_H
-#define BASE_SAMPLING_HEAP_PROFILER_SAMPLING_HEAP_PROFILER_H
+#ifndef BASE_SAMPLING_HEAP_PROFILER_SAMPLING_HEAP_PROFILER_H_
+#define BASE_SAMPLING_HEAP_PROFILER_SAMPLING_HEAP_PROFILER_H_
 
+#include <memory>
+#include <stack>
 #include <unordered_map>
 #include <vector>
 
@@ -79,6 +81,8 @@
   static SamplingHeapProfiler* GetInstance();
 
  private:
+  using SamplesMap = std::unordered_map<void*, Sample>;
+
   SamplingHeapProfiler();
   ~SamplingHeapProfiler() = delete;
 
@@ -92,12 +96,14 @@
                      uint32_t skip_frames);
   void DoRecordFree(void* address);
   void RecordStackTrace(Sample*, uint32_t skip_frames);
-  bool MayRehashOnInsert();
+  SamplesMap& EnsureNoRehashingMap();
+  static SamplesMap& samples();
 
   base::ThreadLocalBoolean entered_;
   base::Lock mutex_;
-  std::unordered_map<void*, Sample> samples_;
+  std::stack<std::unique_ptr<SamplesMap>> sample_maps_;
   std::vector<SamplesObserver*> observers_;
+  uint32_t last_sample_ordinal_ = 0;
 
   static SamplingHeapProfiler* instance_;
 
@@ -108,4 +114,4 @@
 
 }  // namespace base
 
-#endif  // BASE_SAMPLING_HEAP_PROFILER_SAMPLING_HEAP_PROFILER_H
+#endif  // BASE_SAMPLING_HEAP_PROFILER_SAMPLING_HEAP_PROFILER_H_