From 27f202083c95d6c1edaad397bdfa7a5f41f23a0c Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 8 Jun 2023 17:13:52 +0800 Subject: [PATCH] Improve Termmap Indexing Performance +~30% (#2058) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update benchmark * Improve Termmap Indexing Performance +~30% This contains many small changes to improve Termmap performance. Most notably: * Specialized byte compare and equality versions, instead of glibc calls. * ExpUnrolledLinkedList to not contain inline items. Allow compare hash only via a feature flag compare_hash_only: 64bits should be enough with a good hash function to compare strings by their hashes instead of comparing the strings. Disabled by default CreateHashMap/alice/174693 time: [642.23 µs 643.80 µs 645.24 µs] thrpt: [258.20 MiB/s 258.78 MiB/s 259.41 MiB/s] change: time: [-14.429% -13.303% -12.348%] (p = 0.00 < 0.05) thrpt: [+14.088% +15.344% +16.862%] Performance has improved. CreateHashMap/alice_expull/174693 time: [877.03 µs 880.44 µs 884.67 µs] thrpt: [188.32 MiB/s 189.22 MiB/s 189.96 MiB/s] change: time: [-26.460% -26.274% -26.091%] (p = 0.00 < 0.05) thrpt: [+35.301% +35.637% +35.981%] Performance has improved. CreateHashMap/numbers_zipf/8000000 time: [9.1198 ms 9.1573 ms 9.1961 ms] thrpt: [829.64 MiB/s 833.15 MiB/s 836.57 MiB/s] change: time: [-35.229% -34.828% -34.384%] (p = 0.00 < 0.05) thrpt: [+52.403% +53.440% +54.390%] Performance has improved. * clippy * add bench for ids * inline(always) to inline whole block with bounds checks * cleanup --- src/indexer/segment_writer.rs | 2 +- stacker/Cargo.toml | 5 +- stacker/Performance.md | 14 ++ stacker/benches/crit_bench.rs | 43 ++++-- stacker/src/arena_hashmap.rs | 60 ++++++-- stacker/src/expull.rs | 264 ++++++++++++++++------------------ stacker/src/fastcmp.rs | 147 +++++++++++++++++++ stacker/src/fastcpy.rs | 117 +++++++++++++++ stacker/src/lib.rs | 3 + stacker/src/memory_arena.rs | 51 ++++--- 10 files changed, 531 insertions(+), 175 deletions(-) create mode 100644 stacker/Performance.md create mode 100644 stacker/src/fastcmp.rs create mode 100644 stacker/src/fastcpy.rs diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index c21d7d534..de9f951f3 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -457,7 +457,7 @@ mod tests { fn test_hashmap_size() { assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11); assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14); - assert_eq!(compute_initial_table_size(10_000_000).unwrap(), 1 << 18); + assert_eq!(compute_initial_table_size(15_000_000).unwrap(), 1 << 18); assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19); assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19); } diff --git a/stacker/Cargo.toml b/stacker/Cargo.toml index 32b8ebd4b..e65ddac87 100644 --- a/stacker/Cargo.toml +++ b/stacker/Cargo.toml @@ -7,6 +7,7 @@ license = "MIT" [dependencies] murmurhash32 = "0.3" common = { version = "0.5", path = "../common/", package = "tantivy-common" } +ahash = { version = "0.8.3", default-features = false, optional = true } [[bench]] harness = false @@ -20,8 +21,10 @@ path = "example/hashmap.rs" [dev-dependencies] rand = "0.8.5" zipf = "7.0.0" -criterion = "0.5.0" +criterion = { git = "https://github.com/PSeitz/criterion.rs/", rev = "e6f98ee"} # This fork includes stack randomization to reduce caching effects rustc-hash = "1.1.0" +proptest = "1.2.0" [features] +compare_hash_only = ["ahash"] # 
Compare hash only, not the key in the Hashmap unstable = [] # useful for benches. diff --git a/stacker/Performance.md b/stacker/Performance.md new file mode 100644 index 000000000..362553623 --- /dev/null +++ b/stacker/Performance.md @@ -0,0 +1,14 @@ + +# Notes + +- `extend_from_slice(&key)` calls memcpy, which is relatively slow, since most keys are relatively short. For now there's a specialized version to avoid memcpy calls. + Wild-copying 16 bytes in a loop is faster, but it would require an overflow guard on the caller side. (We could probably do that.) +- Comparing two slices of unknown length calls memcmp. Same as above, we can do a specialized version. + +fastcmp and fastcpy both employ the same trick to handle slices of odd length: e.g. 2 unconditional operations on 4 bytes each, instead of 3 conditional operations (one 4-byte, one 2-byte, one 1-byte). +[1, 2, 3, 4, 5, 6, 7] +[1, 2, 3, 4] + [4, 5, 6, 7] + +- Since the hashmap writes the value on every key insert/update, values like expull should be small. Therefore inlining of the values has been removed. +- Currently the first call to Expull will get a capacity of 0. It would be beneficial if it could be initialized with some memory, so that the first call doesn't have to allocate. But that would mean we don't have `Default` impls. diff --git a/stacker/benches/crit_bench.rs b/stacker/benches/crit_bench.rs index 4929835b3..7d29df235 100644 --- a/stacker/benches/crit_bench.rs +++ b/stacker/benches/crit_bench.rs @@ -15,11 +15,19 @@ fn bench_hashmap_throughput(c: &mut Criterion) { group.plot_config(plot_config); let input_bytes = ALICE.len() as u64; + let alice_terms_as_bytes: Vec<&[u8]> = ALICE .split_ascii_whitespace() .map(|el| el.as_bytes()) .collect(); + let alice_terms_as_bytes_with_docid: Vec<(u32, &[u8])> = ALICE + .split_ascii_whitespace() + .map(|el| el.as_bytes()) + .enumerate() + .map(|(docid, el)| (docid as u32, el)) + .collect(); + group.throughput(Throughput::Bytes(input_bytes)); group.bench_with_input( @@ -29,8 +37,8 @@ fn bench_hashmap_throughput(c: &mut Criterion) { ); group.bench_with_input( BenchmarkId::new("alice_expull".to_string(), input_bytes), - &alice_terms_as_bytes, - |b, i| b.iter(|| create_hash_map_with_expull(i.iter())), + &alice_terms_as_bytes_with_docid, + |b, i| b.iter(|| create_hash_map_with_expull(i.iter().cloned())), ); group.bench_with_input( @@ -48,11 +56,24 @@ fn bench_hashmap_throughput(c: &mut Criterion) { // numbers let input_bytes = 1_000_000 * 8 as u64; group.throughput(Throughput::Bytes(input_bytes)); + let numbers: Vec<[u8; 8]> = (0..1_000_000u64).map(|el| el.to_le_bytes()).collect(); group.bench_with_input( BenchmarkId::new("numbers".to_string(), input_bytes), - &(0..1_000_000u64), - |b, i| b.iter(|| create_hash_map(i.clone().map(|el| el.to_le_bytes()))), + &numbers, + |b, i| b.iter(|| create_hash_map(i.iter().cloned())), ); + + let numbers_with_doc: Vec<_> = numbers + .iter() + .enumerate() + .map(|(docid, el)| (docid as u32, el)) + .collect(); + + group.bench_with_input( + BenchmarkId::new("ids_expull".to_string(), input_bytes), + &numbers_with_doc, + |b, i| b.iter(|| create_hash_map_with_expull(i.iter().cloned())), ); // numbers zipf @@ -63,11 +84,14 @@ fn bench_hashmap_throughput(c: &mut Criterion) { let input_bytes = 1_000_000 * 8 as u64; group.throughput(Throughput::Bytes(input_bytes)); + let zipf_numbers: Vec<[u8; 8]> = (0..1_000_000u64) + .map(|_| zipf.sample(&mut rng).to_le_bytes()) + .collect(); group.bench_with_input( BenchmarkId::new("numbers_zipf".to_string(), input_bytes), -
&(0..1_000_000u64), - |b, i| b.iter(|| create_hash_map(i.clone().map(|_el| zipf.sample(&mut rng).to_le_bytes()))), + &zipf_numbers, + |b, i| b.iter(|| create_hash_map(i.iter().cloned())), ); group.finish(); @@ -102,14 +126,15 @@ fn create_hash_map<'a, T: AsRef<[u8]>>(terms: impl Iterator) -> ArenaH map } -fn create_hash_map_with_expull<'a, T: AsRef<[u8]>>(terms: impl Iterator) -> ArenaHashMap { - let terms = terms.enumerate(); +fn create_hash_map_with_expull<'a, T: AsRef<[u8]>>( + terms: impl Iterator, +) -> ArenaHashMap { let mut memory_arena = MemoryArena::default(); let mut map = ArenaHashMap::with_capacity(HASHMAP_SIZE); for (i, term) in terms { map.mutate_or_create(term.as_ref(), |val: Option| { if let Some(mut rec) = val { - rec.new_doc(i as u32, &mut memory_arena); + rec.new_doc(i, &mut memory_arena); rec } else { DocIdRecorder::default() diff --git a/stacker/src/arena_hashmap.rs b/stacker/src/arena_hashmap.rs index 98d5a4efe..254be2de3 100644 --- a/stacker/src/arena_hashmap.rs +++ b/stacker/src/arena_hashmap.rs @@ -2,6 +2,7 @@ use std::iter::{Cloned, Filter}; use std::mem; use super::{Addr, MemoryArena}; +use crate::fastcpy::fast_short_slice_copy; use crate::memory_arena::store; use crate::UnorderedId; @@ -12,8 +13,12 @@ pub fn compute_table_memory_size(capacity: usize) -> usize { capacity * mem::size_of::() } +#[cfg(not(feature = "compare_hash_only"))] type HashType = u32; +#[cfg(feature = "compare_hash_only")] +type HashType = u64; + /// `KeyValue` is the item stored in the hash table. /// The key is actually a `BytesRef` object stored in an external memory arena. /// The `value_addr` also points to an address in the memory arena. @@ -132,10 +137,21 @@ impl ArenaHashMap { } #[inline] + #[cfg(not(feature = "compare_hash_only"))] fn get_hash(&self, key: &[u8]) -> HashType { murmurhash32::murmurhash2(key) } + #[inline] + #[cfg(feature = "compare_hash_only")] + fn get_hash(&self, key: &[u8]) -> HashType { + /// Since we compare only the hash we need a high quality hash. + use std::hash::Hasher; + let mut hasher = ahash::AHasher::default(); + hasher.write(key); + hasher.finish() as HashType + } + #[inline] pub fn read(&self, addr: Addr) -> Item { self.memory_arena.read(addr) @@ -159,17 +175,19 @@ impl ArenaHashMap { #[inline] fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) { let data = self.memory_arena.slice_from(addr); - let (key_bytes_len_bytes, data) = data.split_at(2); + let key_bytes_len_bytes = unsafe { data.get_unchecked(..2) }; let key_bytes_len = u16::from_le_bytes(key_bytes_len_bytes.try_into().unwrap()); - let key_bytes: &[u8] = &data[..key_bytes_len as usize]; + let key_bytes: &[u8] = unsafe { data.get_unchecked(2..2 + key_bytes_len as usize) }; (key_bytes, addr.offset(2 + key_bytes_len as u32)) } #[inline] #[cfg(not(feature = "compare_hash_only"))] fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option { + use crate::fastcmp::fast_short_slice_compare; + let (stored_key, value_addr) = self.get_key_value(addr); - if stored_key == target_key { + if fast_short_slice_compare(stored_key, target_key) { Some(value_addr) } else { None @@ -178,6 +196,8 @@ impl ArenaHashMap { #[inline] #[cfg(feature = "compare_hash_only")] fn get_value_addr_if_key_match(&self, _target_key: &[u8], addr: Addr) -> Option { + // For the compare_hash_only feature, it would make sense to store the keys at a different + // memory location. Here they will just pollute the cache. 
let data = self.memory_arena.slice_from(addr); let key_bytes_len_bytes = &data[..2]; let key_bytes_len = u16::from_le_bytes(key_bytes_len_bytes.try_into().unwrap()); @@ -283,9 +303,9 @@ impl ArenaHashMap { } let hash = self.get_hash(key); let mut probe = self.probe(hash); + let mut bucket = probe.next_probe(); + let mut kv: KeyValue = self.table[bucket]; loop { - let bucket = probe.next_probe(); - let kv: KeyValue = self.table[bucket]; if kv.is_empty() { // The key does not exist yet. let val = updater(None); @@ -293,14 +313,16 @@ impl ArenaHashMap { let key_addr = self.memory_arena.allocate_space(num_bytes); { let data = self.memory_arena.slice_mut(key_addr, num_bytes); - data[..2].copy_from_slice(&(key.len() as u16).to_le_bytes()); + let key_len_bytes: [u8; 2] = (key.len() as u16).to_le_bytes(); + data[..2].copy_from_slice(&key_len_bytes); let stop = 2 + key.len(); - data[2..stop].copy_from_slice(key); + fast_short_slice_copy(key, &mut data[2..stop]); store(&mut data[stop..], val); } return self.set_bucket(hash, key_addr, bucket); - } else if kv.hash == hash { + } + if kv.hash == hash { if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) { let v = self.memory_arena.read(val_addr); let new_v = updater(Some(v)); @@ -308,6 +330,9 @@ impl ArenaHashMap { return kv.unordered_id; } } + // This allows fetching the next bucket before the loop jmp + bucket = probe.next_probe(); + kv = self.table[bucket]; } } } @@ -355,4 +380,23 @@ mod tests { assert_eq!(compute_previous_power_of_two(7), 4); assert_eq!(compute_previous_power_of_two(u64::MAX as usize), 1 << 63); } + + #[test] + fn test_many_terms() { + let mut terms: Vec = (0..20_000).map(|val| val.to_string()).collect(); + let mut hash_map: ArenaHashMap = ArenaHashMap::default(); + for term in terms.iter() { + hash_map.mutate_or_create(term.as_bytes(), |_opt_val: Option| 5u32); + } + let mut terms_back: Vec = hash_map + .iter() + .map(|(bytes, _, _)| String::from_utf8(bytes.to_vec()).unwrap()) + .collect(); + terms_back.sort(); + terms.sort(); + + for pos in 0..terms.len() { + assert_eq!(terms[pos], terms_back[pos]); + } + } } diff --git a/stacker/src/expull.rs b/stacker/src/expull.rs index 03fd564b6..cbda3b8e9 100644 --- a/stacker/src/expull.rs +++ b/stacker/src/expull.rs @@ -2,40 +2,10 @@ use std::mem; use common::serialize_vint_u32; -use crate::memory_arena::{load, store}; +use crate::fastcpy::fast_short_slice_copy; use crate::{Addr, MemoryArena}; -const MAX_BLOCK_LEN: u32 = 1u32 << 15; -const FIRST_BLOCK: usize = 16; -const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::(); - -enum CapacityResult { - Available(u32), - NeedAlloc(u32), -} - -fn len_to_capacity(len: u32) -> CapacityResult { - match len { - 0..=15 => CapacityResult::Available(FIRST_BLOCK as u32 - len), - 16..=MAX_BLOCK_LEN => { - let cap = 1 << (32u32 - (len - 1u32).leading_zeros()); - let available = cap - len; - if available == 0 { - CapacityResult::NeedAlloc(len) - } else { - CapacityResult::Available(available) - } - } - n => { - let available = n % MAX_BLOCK_LEN; - if available == 0 { - CapacityResult::NeedAlloc(MAX_BLOCK_LEN) - } else { - CapacityResult::Available(MAX_BLOCK_LEN - available) - } - } - } -} +const FIRST_BLOCK_NUM: u16 = 2; /// An exponential unrolled link. /// @@ -52,17 +22,33 @@ fn len_to_capacity(len: u32) -> CapacityResult { /// problem of selecting an adequate block size using a strategy similar to /// that of the `Vec` amortized resize strategy. /// -/// Data is stored in a linked list of blocks. 
The first block has a size of `4` +/// Data is stored in a linked list of blocks. The first block has a size of `8` /// and each block has a length of twice that of the previous block up to -/// `MAX_BLOCK_LEN = 32768`. +/// `MAX_BLOCK_LEN = 1<<15`. /// /// This strategy is a good trade off to handle numerous very rare terms /// and avoid wasting half of the memory for very frequent terms. #[derive(Debug, Clone, Copy)] pub struct ExpUnrolledLinkedList { - len: u32, + // u16, since the max size of each block is (1<<15) + remaining_cap: u16, + block_num: u16, + head: Addr, tail: Addr, - inlined_data: [u8; INLINED_BLOCK_LEN], +} + +impl Default for ExpUnrolledLinkedList { + fn default() -> Self { + Self { + // 0 to trigger an initial allocation. Init with MemoryArena would be better. + remaining_cap: 0, + block_num: FIRST_BLOCK_NUM, + head: Addr::null_pointer(), + tail: Addr::null_pointer(), + } + } } pub struct ExpUnrolledLinkedListWriter<'a> { @@ -70,32 +56,22 @@ pub struct ExpUnrolledLinkedListWriter<'a> { arena: &'a mut MemoryArena, } +#[inline] fn ensure_capacity<'a>( eull: &'a mut ExpUnrolledLinkedList, arena: &'a mut MemoryArena, -) -> &'a mut [u8] { - if eull.len <= FIRST_BLOCK as u32 { - // We are still hitting the inline block. - if eull.len < FIRST_BLOCK as u32 { - return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK]; - } - // We need to allocate a new block! - let new_block_addr: Addr = arena.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>()); - store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr); - eull.tail = new_block_addr; - return arena.slice_mut(eull.tail, FIRST_BLOCK); + allocate: u32, +) { + let new_block_addr: Addr = arena.allocate_space(allocate as usize + mem::size_of::<Addr>()); + // Check first write + if eull.head.is_null() { + eull.head = new_block_addr; + } else { + arena.write_at(eull.tail, new_block_addr); } - let len = match len_to_capacity(eull.len) { - CapacityResult::NeedAlloc(new_block_len) => { - let new_block_addr: Addr = - arena.allocate_space(new_block_len as usize + mem::size_of::<Addr>()); - arena.write_at(eull.tail, new_block_addr); - eull.tail = new_block_addr; - new_block_len - } - CapacityResult::Available(available) => available, - }; - arena.slice_mut(eull.tail, len as usize) + + eull.tail = new_block_addr; + eull.remaining_cap = allocate as u16; } impl<'a> ExpUnrolledLinkedListWriter<'a> { @@ -111,56 +87,63 @@ impl<'a> ExpUnrolledLinkedListWriter<'a> { while !buf.is_empty() { let add_len: usize; { - let output_buf = ensure_capacity(self.eull, self.arena); + if self.eull.remaining_cap == 0 { + // Double the next cap + self.eull.increment_num_blocks(); + let block_size = get_block_size(self.eull.block_num); + ensure_capacity(self.eull, self.arena, block_size as u32); + } + + let output_buf = self + .arena + .slice_mut(self.eull.tail, self.eull.remaining_cap as usize); add_len = buf.len().min(output_buf.len()); - output_buf[..add_len].copy_from_slice(&buf[..add_len]); + let output_buf = &mut output_buf[..add_len]; + let buf = &buf[..add_len]; + + fast_short_slice_copy(buf, output_buf); } - self.eull.len += add_len as u32; + self.eull.remaining_cap -= add_len as u16; self.eull.tail = self.eull.tail.offset(add_len as u32); buf = &buf[add_len..]; } } } -impl Default for ExpUnrolledLinkedList { - fn default() -> ExpUnrolledLinkedList { - ExpUnrolledLinkedList { - len: 0u32, - tail: Addr::null_pointer(), - inlined_data: [0u8; INLINED_BLOCK_LEN], - } - } +// The block size is 2^block_num, but at most 2^15 = 32k. +// The initial block size is 8: block_num starts at FIRST_BLOCK_NUM == 2 and is incremented to 3 before the first allocation. +#[inline] +fn get_block_size(block_num: u16) -> u16 { + 1 << block_num.min(15) } impl ExpUnrolledLinkedList { + pub fn increment_num_blocks(&mut self) { +
self.block_num += 1; + } + #[inline] pub fn writer<'a>(&'a mut self, arena: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> { ExpUnrolledLinkedListWriter { eull: self, arena } } pub fn read_to_end(&self, arena: &MemoryArena, output: &mut Vec) { - let len = self.len as usize; - if len <= FIRST_BLOCK { - output.extend_from_slice(&self.inlined_data[..len]); + let mut addr = self.head; + if addr.is_null() { return; } - output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]); - let mut cur = FIRST_BLOCK; - let mut addr = load(&self.inlined_data[FIRST_BLOCK..]); - loop { - let cap = match len_to_capacity(cur as u32) { - CapacityResult::Available(capacity) => capacity, - CapacityResult::NeedAlloc(capacity) => capacity, - } as usize; + let last_block_len = get_block_size(self.block_num) as usize - self.remaining_cap as usize; + + // Full Blocks + for block_num in FIRST_BLOCK_NUM + 1..self.block_num { + let cap = get_block_size(block_num) as usize; let data = arena.slice(addr, cap); - if cur + cap >= len { - output.extend_from_slice(&data[..(len - cur)]); - return; - } output.extend_from_slice(data); - cur += cap; addr = arena.read(addr.offset(cap as u32)); } + // Last Block + let data = arena.slice(addr, last_block_len); + output.extend_from_slice(data); } } @@ -169,10 +152,21 @@ mod tests { use common::{read_u32_vint, write_u32_vint}; use super::super::MemoryArena; - use super::{len_to_capacity, *}; + use super::*; #[test] - fn test_eull() { + fn test_eull_empty() { + let arena = MemoryArena::default(); + let stack = ExpUnrolledLinkedList::default(); + { + let mut buffer = Vec::new(); + stack.read_to_end(&arena, &mut buffer); + assert_eq!(&buffer[..], &[]); + } + } + + #[test] + fn test_eull1() { let mut arena = MemoryArena::default(); let mut stack = ExpUnrolledLinkedList::default(); stack.writer(&mut arena).extend_from_slice(&[1u8]); @@ -186,6 +180,35 @@ mod tests { } } + #[test] + fn test_eull_vint1() { + let mut arena = MemoryArena::default(); + let mut stack = ExpUnrolledLinkedList::default(); + stack.writer(&mut arena).extend_from_slice(&[1u8]); + stack.writer(&mut arena).extend_from_slice(&[2u8]); + stack.writer(&mut arena).extend_from_slice(&[3u8, 4u8]); + stack.writer(&mut arena).extend_from_slice(&[5u8]); + { + let mut buffer = Vec::new(); + stack.read_to_end(&arena, &mut buffer); + assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]); + } + } + + #[test] + fn test_eull_first_write_extends_cap() { + let mut arena = MemoryArena::default(); + let mut stack = ExpUnrolledLinkedList::default(); + stack + .writer(&mut arena) + .extend_from_slice(&[1u8, 2, 3, 4, 5, 6, 7, 8, 9]); + { + let mut buffer = Vec::new(); + stack.read_to_end(&arena, &mut buffer); + assert_eq!(&buffer[..], &[1u8, 2, 3, 4, 5, 6, 7, 8, 9]); + } + } + #[test] fn test_eull_long() { let mut arena = MemoryArena::default(); @@ -204,9 +227,18 @@ mod tests { assert_eq!(&result[..], &data[..]); } + #[test] + fn test_eull_limit() { + let mut eull = ExpUnrolledLinkedList::default(); + for _ in 0..100 { + eull.increment_num_blocks(); + } + assert_eq!(get_block_size(eull.block_num), 1 << 15); + } + #[test] fn test_eull_interlaced() { - let mut eull = MemoryArena::default(); + let mut arena = MemoryArena::default(); let mut stack = ExpUnrolledLinkedList::default(); let mut stack2 = ExpUnrolledLinkedList::default(); @@ -214,68 +246,20 @@ mod tests { let mut vec2: Vec = vec![]; for i in 0..9 { - stack.writer(&mut eull).write_u32_vint(i); + stack.writer(&mut arena).write_u32_vint(i); assert!(write_u32_vint(i, &mut 
vec1).is_ok()); if i % 2 == 0 { - stack2.writer(&mut eull).write_u32_vint(i); + stack2.writer(&mut arena).write_u32_vint(i); assert!(write_u32_vint(i, &mut vec2).is_ok()); } } let mut res1 = vec![]; let mut res2 = vec![]; - stack.read_to_end(&eull, &mut res1); - stack2.read_to_end(&eull, &mut res2); + stack.read_to_end(&arena, &mut res1); + stack2.read_to_end(&arena, &mut res2); assert_eq!(&vec1[..], &res1[..]); assert_eq!(&vec2[..], &res2[..]); } - - #[test] - fn test_jump_if_needed() { - let mut available = 16u32; - for i in 0..10_000_000 { - match len_to_capacity(i) { - CapacityResult::NeedAlloc(cap) => { - assert_eq!(available, 0, "Failed len={i}: Expected 0 got {cap}"); - available = cap; - } - CapacityResult::Available(cap) => { - assert_eq!( - available, cap, - "Failed len={i}: Expected {available} Got {cap}" - ); - } - } - available -= 1; - } - } - - #[test] - fn test_jump_if_needed_progression() { - let mut v = vec![]; - for i in 0.. { - if v.len() >= 10 { - break; - } - if let CapacityResult::NeedAlloc(cap) = len_to_capacity(i) { - v.push((i, cap)); - } - } - assert_eq!( - &v[..], - &[ - (16, 16), - (32, 32), - (64, 64), - (128, 128), - (256, 256), - (512, 512), - (1024, 1024), - (2048, 2048), - (4096, 4096), - (8192, 8192) - ] - ); - } } #[cfg(all(test, feature = "unstable"))] diff --git a/stacker/src/fastcmp.rs b/stacker/src/fastcmp.rs new file mode 100644 index 000000000..3097ca703 --- /dev/null +++ b/stacker/src/fastcmp.rs @@ -0,0 +1,147 @@ +/// fastcmp employs a trick to speed up the comparison of two byte slices. +/// Unlike a call to memcmp, it can also be inlined. +/// +/// E.g. equality of two slices of length 7 is checked in two steps, by comparing two overlapping 4-byte slices +/// unconditionally, instead of 3 conditional comparisons (4 bytes, 2 bytes, 1 byte). +/// [1, 2, 3, 4, 5, 6, 7] +/// [1, 2, 3, 4] +/// [4, 5, 6, 7] +/// +/// This method uses the XMM registers for byte slices bigger than 16, else regular registers. +#[inline] +pub fn fast_short_slice_compare(left: &[u8], right: &[u8]) -> bool { + let len = left.len(); + if len != right.len() { + return false; + } + + // This could be less-than-or-equal, but to make the job a little bit easier for the branch predictor + // we put length 8 into the bigger group (8-16 bytes), which compares two u64s, + // assuming that the range 8-16 is more common than 4-7. + + // This weird branching is done on purpose to get the best assembly. + // if len < 4 { + // .. + // if len < 8 + // will cause the assembly to be inlined instead of using jumps + if len < 8 { + if len >= 4 { + return double_check_trick::<4>(left, right); + } else { + return short_compare(left, right); + } + } + + if len > 16 { + return fast_nbyte_slice_compare::<16>(left, right); + } + + double_check_trick::<8>(left, right) +} + +// Note: The straightforward left.chunks_exact(SIZE).zip(right.chunks_exact(SIZE)) produces slower +// assembly +#[inline] +pub fn fast_nbyte_slice_compare<const SIZE: usize>(left: &[u8], right: &[u8]) -> bool { + let last = left.len() - left.len() % SIZE; + let mut i = 0; + loop { + if unsafe { left.get_unchecked(i..i + SIZE) != right.get_unchecked(i..i + SIZE) } { + return false; + } + i += SIZE; + if i >= last { + break; + } + } + unsafe { left.get_unchecked(left.len() - SIZE..) == right.get_unchecked(right.len() - SIZE..)
} +} + +#[inline(always)] +fn short_compare(left: &[u8], right: &[u8]) -> bool { + for (l, r) in left.iter().zip(right) { + if l != r { + return false; + } + } + true +} + +#[inline(always)] +fn double_check_trick(left: &[u8], right: &[u8]) -> bool { + left[0..SIZE] == right[0..SIZE] && left[left.len() - SIZE..] == right[right.len() - SIZE..] +} + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + + use super::*; + + #[test] + fn test_slice_compare_bytes_len_8() { + let a = &[1, 2, 3, 4, 5, 6, 7, 8]; + let b = &[1, 2, 3, 4, 5, 6, 7, 8]; + let c = &[1, 2, 3, 4, 5, 6, 7, 7]; + + assert!(fast_short_slice_compare(a, b)); + assert!(!fast_short_slice_compare(a, c)); + } + + #[test] + fn test_slice_compare_bytes_len_9() { + let a = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; + let b = &[1, 2, 3, 4, 5, 6, 7, 8, 9]; + let c = &[0, 2, 3, 4, 5, 6, 7, 8, 9]; + + assert!(fast_short_slice_compare(a, b)); + assert!(!fast_short_slice_compare(a, c)); + } + + #[test] + fn test_slice_compare_bytes_len_16() { + let a = &[1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b = &[1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let c = &[1, 2, 3, 4, 5, 6, 7, 7, 1, 2, 3, 4, 5, 6, 7, 8]; + + assert!(fast_short_slice_compare(a, b)); + assert!(!fast_short_slice_compare(a, c)); + } + + #[test] + fn test_slice_compare_bytes_short() { + let a = &[1, 2, 3, 4]; + let b = &[1, 2, 3, 4]; + + assert!(fast_short_slice_compare(a, b)); + + let a = &[1, 2, 3]; + let b = &[1, 2, 3]; + + assert!(fast_short_slice_compare(a, b)); + + let a = &[1, 2]; + let b = &[1, 2]; + + assert!(fast_short_slice_compare(a, b)); + } + + proptest! { + #[test] + fn test_fast_short_slice_compare(left in prop::collection::vec(any::(), 0..100), + right in prop::collection::vec(any::(), 0..100)) { + let result = fast_short_slice_compare(&left, &right); + let expected = left == right; + prop_assert_eq!(result, expected, "left: {:?}, right: {:?}", left, right); + } + + #[test] + fn test_fast_short_slice_compare_equal(left in prop::collection::vec(any::(), 0..100), + ) { + let result = fast_short_slice_compare(&left, &left); + let expected = left == left; + prop_assert_eq!(result, expected, "left: {:?}, right: {:?}", left, left); + } + + } +} diff --git a/stacker/src/fastcpy.rs b/stacker/src/fastcpy.rs new file mode 100644 index 000000000..6731f8b6f --- /dev/null +++ b/stacker/src/fastcpy.rs @@ -0,0 +1,117 @@ +/// Optimized copy for small sizes. All bounds checks are elided. +/// Avoids call to memcpy +/// Applies unbranched copy trick for sizes 8, 16, 32 +/// +/// src and dst must be num_bytes long. +#[inline] +pub fn fast_short_slice_copy(src: &[u8], dst: &mut [u8]) { + #[inline(never)] + #[cold] + #[track_caller] + fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! { + panic!( + "source slice length ({}) does not match destination slice length ({})", + src_len, dst_len, + ); + } + + if src.len() != dst.len() { + len_mismatch_fail(src.len(), dst.len()); + } + let len = src.len(); + + if src.is_empty() { + return; + } + + if len < 4 { + short_copy(src, dst); + return; + } + + if len < 8 { + double_copy_trick::<4>(src, dst); + return; + } + + if len <= 16 { + double_copy_trick::<8>(src, dst); + return; + } + + if len <= 32 { + double_copy_trick::<16>(src, dst); + return; + } + + /// The code will use the vmovdqu instruction to copy 32 bytes at a time. 
+ #[cfg(target_feature = "avx")] + { + if len <= 64 { + double_copy_trick::<32>(src, dst); + return; + } + } + + // For larger sizes we use the default, which calls memcpy + // memcpy does some virtual memory tricks to copy large chunks of memory. + // + // The theory should be that the checks above don't cost much relative to the copy call for + // larger copies. + // The bounds checks in `copy_from_slice` are elided. + dst.copy_from_slice(src); +} + +#[inline(always)] +fn short_copy(src: &[u8], dst: &mut [u8]) { + debug_assert_ne!(src.len(), 0); + debug_assert_eq!(src.len(), dst.len()); + let len = src.len(); + + // length 1-3 + dst[0] = src[0]; + if len >= 2 { + double_copy_trick::<2>(src, dst); + } +} + +#[inline(always)] +fn double_copy_trick(src: &[u8], dst: &mut [u8]) { + debug_assert!(src.len() >= SIZE); + debug_assert!(dst.len() >= SIZE); + dst[0..SIZE].copy_from_slice(&src[0..SIZE]); + dst[src.len() - SIZE..].copy_from_slice(&src[src.len() - SIZE..]); +} + +#[cfg(test)] +mod tests { + use super::*; + + fn copy_test() { + let src: Vec = (0..SIZE as u8).collect(); + let mut dst = [0u8; SIZE]; + fast_short_slice_copy(&src, &mut dst); + assert_eq!(src, dst); + } + + #[test] + fn copy_test_n() { + copy_test::<1>(); + copy_test::<2>(); + copy_test::<3>(); + copy_test::<4>(); + copy_test::<5>(); + copy_test::<6>(); + copy_test::<7>(); + copy_test::<8>(); + copy_test::<9>(); + copy_test::<10>(); + copy_test::<11>(); + copy_test::<31>(); + copy_test::<32>(); + copy_test::<33>(); + copy_test::<47>(); + copy_test::<48>(); + copy_test::<49>(); + } +} diff --git a/stacker/src/lib.rs b/stacker/src/lib.rs index a43fa4782..04ec3f414 100644 --- a/stacker/src/lib.rs +++ b/stacker/src/lib.rs @@ -5,6 +5,9 @@ extern crate test; mod arena_hashmap; mod expull; +#[allow(dead_code)] +mod fastcmp; +mod fastcpy; mod memory_arena; pub use self::arena_hashmap::{compute_table_memory_size, ArenaHashMap}; diff --git a/stacker/src/memory_arena.rs b/stacker/src/memory_arena.rs index b9e9d3770..f3ed5d4bb 100644 --- a/stacker/src/memory_arena.rs +++ b/stacker/src/memory_arena.rs @@ -74,7 +74,7 @@ impl Addr { } } -#[inline] +#[inline(always)] pub fn store(dest: &mut [u8], val: Item) { debug_assert_eq!(dest.len(), std::mem::size_of::()); unsafe { @@ -104,12 +104,6 @@ impl Default for MemoryArena { } impl MemoryArena { - fn add_page(&mut self) -> &mut Page { - let new_page_id = self.pages.len(); - self.pages.push(Page::new(new_page_id)); - &mut self.pages[new_page_id] - } - /// Returns an estimate in number of bytes /// of resident memory consumed by the `MemoryArena`. 
/// @@ -134,36 +128,58 @@ impl MemoryArena { pub fn read(&self, addr: Addr) -> Item { load(self.slice(addr, mem::size_of::())) } + #[inline] + fn get_page(&self, page_id: usize) -> &Page { + unsafe { self.pages.get_unchecked(page_id) } + } + #[inline] + fn get_page_mut(&mut self, page_id: usize) -> &mut Page { + unsafe { self.pages.get_unchecked_mut(page_id) } + } #[inline] pub fn slice(&self, addr: Addr, len: usize) -> &[u8] { - self.pages[addr.page_id()].slice(addr.page_local_addr(), len) + self.get_page(addr.page_id()) + .slice(addr.page_local_addr(), len) } #[inline] pub fn slice_from(&self, addr: Addr) -> &[u8] { - self.pages[addr.page_id()].slice_from(addr.page_local_addr()) + self.get_page(addr.page_id()) + .slice_from(addr.page_local_addr()) } #[inline] pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] { - self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len) + self.get_page_mut(addr.page_id()) + .slice_mut(addr.page_local_addr(), len) + } + + /// Add a page and allocate len on it. + /// Return the address + fn add_page(&mut self, len: usize) -> Addr { + let new_page_id = self.pages.len(); + let mut page = Page::new(new_page_id); + page.len = len; + self.pages.push(page); + Addr::new(new_page_id, 0) } /// Allocates `len` bytes and returns the allocated address. + #[inline] pub fn allocate_space(&mut self, len: usize) -> Addr { let page_id = self.pages.len() - 1; - if let Some(addr) = self.pages[page_id].allocate_space(len) { + if let Some(addr) = self.get_page_mut(page_id).allocate_space(len) { return addr; } - self.add_page().allocate_space(len).unwrap() + self.add_page(len) } } struct Page { page_id: usize, len: usize, - data: Box<[u8]>, + data: Box<[u8; PAGE_SIZE]>, } impl Page { @@ -171,7 +187,7 @@ impl Page { Page { page_id, len: 0, - data: vec![0u8; PAGE_SIZE].into_boxed_slice(), + data: vec![0u8; PAGE_SIZE].into_boxed_slice().try_into().unwrap(), } } @@ -182,7 +198,8 @@ impl Page { #[inline] fn slice(&self, local_addr: usize, len: usize) -> &[u8] { - &self.slice_from(local_addr)[..len] + let data = &self.slice_from(local_addr); + unsafe { data.get_unchecked(..len) } } #[inline] @@ -192,9 +209,11 @@ impl Page { #[inline] fn slice_mut(&mut self, local_addr: usize, len: usize) -> &mut [u8] { - &mut self.data[local_addr..][..len] + let data = &mut self.data[local_addr..]; + unsafe { data.get_unchecked_mut(..len) } } + #[inline] fn allocate_space(&mut self, len: usize) -> Option { if self.is_available(len) { let addr = Addr::new(self.page_id, self.len);