From 1e0ee9095ce87a3cee0b44a120f6297ac672f5d0 Mon Sep 17 00:00:00 2001
From: Martin Ankerl <Martin.Ankerl@gmail.com>
Date: Fri, 29 Dec 2017 11:36:11 +0100
Subject: [PATCH] Use best-fit strategy in Arena, now O(log(n)) instead O(n)

This replaces the first-fit algorithm used in the Arena with a best-fit. According to "Dynamic Storage Allocation: A Survey and Critical Review", Wilson et. al. 1995, http://www.scs.stanford.edu/14wi-cs140/sched/readings/wilson.pdf, both startegies work well in practice.

The advantage of using best-fit is that we can switch the slow O(n) algorithm to O(log(n)) operations. Additionally, some previously O(log(n)) operations are now replaced with O(1) operations by using a hash map. The end effect is that the benchmark runs about 2.5 times faster on my machine:

old: BenchLockedPool, 5, 530, 5.25749, 0.00196938, 0.00199755, 0.00198172
new: BenchLockedPool, 5, 1300, 5.11313, 0.000781493, 0.000793314, 0.00078606

I've run all unit tests and benchmarks.
---
 src/bench/lockedpool.cpp   |  2 +-
 src/support/lockedpool.cpp | 70 ++++++++++++++++++++++++--------------
 src/support/lockedpool.h   | 19 ++++++++---
 3 files changed, 60 insertions(+), 31 deletions(-)
diff --git a/src/bench/lockedpool.cpp b/src/bench/lockedpool.cpp
index 914e37a2e..8b7fe5f60 100644
--- a/src/bench/lockedpool.cpp
+++ b/src/bench/lockedpool.cpp
@@ -43,4 +43,4 @@ static void BenchLockedPool(benchmark::State& state)
     addr.clear();
 }
 
-BENCHMARK(BenchLockedPool, 530);
+BENCHMARK(BenchLockedPool, 1300);
diff --git a/src/support/lockedpool.cpp b/src/support/lockedpool.cpp
index 98e869418..ddb84b652 100644
--- a/src/support/lockedpool.cpp
+++ b/src/support/lockedpool.cpp
@@ -47,7 +47,9 @@ Arena::Arena(void *base_in, size_t size_in, size_t alignment_in):
     base(static_cast<char*>(base_in)), end(static_cast<char*>(base_in) + size_in), alignment(alignment_in)
 {
     // Start with one free chunk that covers the entire arena
-    chunks_free.emplace(base, size_in);
+    auto it = size_to_free_chunk.emplace(size_in, base);
+    chunks_free.emplace(base, it);
+    chunks_free_end.emplace(base + size_in, it);
 }
 
 Arena::~Arena()
@@ -63,26 +65,30 @@ void* Arena::alloc(size_t size)
     if (size == 0)
         return nullptr;
 
-    // Pick a large enough free-chunk
-    auto it = std::find_if(chunks_free.begin(), chunks_free.end(),
-        [=](const std::map<char*, size_t>::value_type& chunk){ return chunk.second >= size; });
-    if (it == chunks_free.end())
+    // Pick a large enough free-chunk. Returns an iterator pointing to the first element that is not less than key.
+    // This allocation strategy is best-fit. According to "Dynamic Storage Allocation: A Survey and Critical Review",
+    // Wilson et. al. 1995, http://www.scs.stanford.edu/14wi-cs140/sched/readings/wilson.pdf, best-fit and first-fit
+    // policies seem to work well in practice.
+    auto sizePtrIt = size_to_free_chunk.lower_bound(size);
+    if (sizePtrIt == size_to_free_chunk.end())
         return nullptr;
 
     // Create the used-chunk, taking its space from the end of the free-chunk
-    auto alloced = chunks_used.emplace(it->first + it->second - size, size).first;
-    if (!(it->second -= size))
-        chunks_free.erase(it);
-    return reinterpret_cast<void*>(alloced->first);
-}
-
-/* extend the Iterator if other begins at its end */
-template <class Iterator, class Pair> bool extend(Iterator it, const Pair& other) {
-    if (it->first + it->second == other.first) {
-        it->second += other.second;
-        return true;
+    const size_t sizeRemaining = sizePtrIt->first - size;
+    auto alloced = chunks_used.emplace(sizePtrIt->second + sizeRemaining, size).first;
+    chunks_free_end.erase(sizePtrIt->second + sizePtrIt->first);
+    if (sizePtrIt->first == size) {
+        // whole chunk is used up
+        chunks_free.erase(sizePtrIt->second);
+    } else {
+        // still some memory left in the chunk
+        auto itRemaining = size_to_free_chunk.emplace(sizeRemaining, sizePtrIt->second);
+        chunks_free[sizePtrIt->second] = itRemaining;
+        chunks_free_end.emplace(sizePtrIt->second + sizeRemaining, itRemaining);
     }
-    return false;
+    size_to_free_chunk.erase(sizePtrIt);
+
+    return reinterpret_cast<void*>(alloced->first);
 }
 
 void Arena::free(void *ptr)
@@ -97,16 +103,30 @@ void Arena::free(void *ptr)
     if (i == chunks_used.end()) {
         throw std::runtime_error("Arena: invalid or double free");
     }
-    auto freed = *i;
+    std::pair<char*, size_t> freed = *i;
     chunks_used.erase(i);
 
-    // Add space to free map, coalescing contiguous chunks
-    auto next = chunks_free.upper_bound(freed.first);
-    auto prev = (next == chunks_free.begin()) ? chunks_free.end() : std::prev(next);
-    if (prev == chunks_free.end() || !extend(prev, freed))
-        prev = chunks_free.emplace_hint(next, freed);
-    if (next != chunks_free.end() && extend(prev, *next))
+    // Coalesc freed with previous chunk
+    auto prev = chunks_free_end.find(freed.first);
+    if (prev != chunks_free_end.end()) {
+        freed.first -= prev->second->first;
+        freed.second += prev->second->first;
+        size_to_free_chunk.erase(prev->second);
+        chunks_free_end.erase(prev);
+    }
+
+    // Coalesc freed with chunk after freed
+    auto next = chunks_free.find(freed.first + freed.second);
+    if (next != chunks_free.end()) {
+        freed.second += next->second->first;
+        size_to_free_chunk.erase(next->second);
         chunks_free.erase(next);
+    }
+
+    // Add/set space with coalesced free chunk
+    auto it = size_to_free_chunk.emplace(freed.second, freed.first);
+    chunks_free[freed.first] = it;
+    chunks_free_end[freed.first + freed.second] = it;
 }
 
 Arena::Stats Arena::stats() const
@@ -115,7 +135,7 @@ Arena::Stats Arena::stats() const
     for (const auto& chunk: chunks_used)
         r.used += chunk.second;
     for (const auto& chunk: chunks_free)
-        r.free += chunk.second;
+        r.free += chunk.second->first;
     r.total = r.used + r.free;
     return r;
 }
diff --git a/src/support/lockedpool.h b/src/support/lockedpool.h
index 834f0371e..3b6f82c6c 100644
--- a/src/support/lockedpool.h
+++ b/src/support/lockedpool.h
@@ -10,6 +10,7 @@
 #include <map>
 #include <mutex>
 #include <memory>
+#include <unordered_map>
 
 /**
  * OS-dependent allocation and deallocation of locked/pinned memory pages.
@@ -88,11 +89,19 @@ public:
      */
     bool addressInArena(void *ptr) const { return ptr >= base && ptr < end; }
 private:
-    /** Map of chunk address to chunk information. This class makes use of the
-     * sorted order to merge previous and next chunks during deallocation.
-     */
-    std::map<char*, size_t> chunks_free;
-    std::map<char*, size_t> chunks_used;
+    typedef std::multimap<size_t, char*> SizeToChunkSortedMap;
+    /** Map to enable O(log(n)) best-fit allocation, as it's sorted by size */
+    SizeToChunkSortedMap size_to_free_chunk;
+
+    typedef std::unordered_map<char*, SizeToChunkSortedMap::const_iterator> ChunkToSizeMap;
+    /** Map from begin of free chunk to its node in size_to_free_chunk */
+    ChunkToSizeMap chunks_free;
+    /** Map from end of free chunk to its node in size_to_free_chunk */
+    ChunkToSizeMap chunks_free_end;
+
+    /** Map from begin of used chunk to its size */
+    std::unordered_map<char*, size_t> chunks_used;
+
     /** Base address of arena */
     char* base;
     /** End address of arena */