diff --git a/Cargo.toml b/Cargo.toml index e410fa6..1c9515f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ exclude = ["article/*"] [features] default = ["std"] std = [] +no-inlining = [] # Only relevant for throughput benchmarks bench-csv = [] bench-md = [] @@ -38,6 +39,7 @@ highway = "1.1.0" seahash = "4.1.0" metrohash = "1.0.6" fnv = "1.0.3" +aes_crypto = "1.2.0" [dev-dependencies.plotters] version = "0.3.5" diff --git a/README.md b/README.md index 3aabacb..7071bb3 100644 --- a/README.md +++ b/README.md @@ -64,9 +64,9 @@ All generated hashes for a given version of GxHash are stable, meaning that for The `std` feature flag enables the `HashMap`/`HashSet` container convenience type aliases. This is on by default. Disable to make the crate `no_std`: ```toml -[dependencies.gxhash] +[dependencies] ... -default-features = false +gxhash = { version = "3", default-features = false, features = ["inlined"] } ``` ### `hybrid` @@ -121,7 +121,7 @@ GxHash is a non-cryptographic hashing algorithm, thus it is not recommended to u - Minor for API changes/removal - Patch for new APIs, bug fixes and performance improvements -> ℹ️ [cargo-show-asm](https://github.com/pacak/cargo-show-asm) is an easy way to view the actual generated assembly code (`cargo asm gxhash::gxhash::gxhash64`) (method `#[inline]` should be removed otherwise it won't be seen by the tool) +> ℹ️ [cargo-show-asm](https://github.com/pacak/cargo-show-asm) is an easy way to view the actual generated assembly code (`cargo asm gxhash::gxhash::gxhash64`). Eg: `cargo asm gxhash::gxhash::gxhash64 --lib --features no-inlining` (+ `| wc -l` to count lines, which is an approximate but quick way to get a sense of bytecode size). > ℹ️ [AMD μProf](https://www.amd.com/en/developer/uprof.html) gives some useful insights on time spent per instruction. 
## Publication diff --git a/benches/hashset.rs b/benches/hashset.rs index a247a3a..ec7fd6b 100644 --- a/benches/hashset.rs +++ b/benches/hashset.rs @@ -22,13 +22,13 @@ fn benchmark(c: &mut Criterion, name: &str, value: T) { let mut group = c.benchmark_group(format!("HashSet/{}", name)); - let mut set = HashSet::<T>::new(); - group.bench_function("Default Hasher", |b| { + let mut set: HashSet::<T> = gxhash::HashSet::<T>::default(); + group.bench_function("GxHash", |b| { iterate(b, &value, &mut set); }); - let mut set: HashSet::<T> = gxhash::HashSet::<T>::default(); - group.bench_function("GxHash", |b| { + let mut set = HashSet::<T>::new(); + group.bench_function("Default Hasher", |b| { iterate(b, &value, &mut set); }); diff --git a/changes.txt b/changes.txt new file mode 100644 index 0000000..794bc12 --- /dev/null +++ b/changes.txt @@ -0,0 +1,39 @@ +# GxHash 3.X + +Bytecode: 201 +HashSet/u32/GxHash: 1.5724 ns +Throughput: + | 4 > 6278.58 + | 8 > 12620.74 + | 16 > 25315.13 + | 32 > 26450.76 + | 64 > 39590.37 + | 128 > 39402.75 + | 256 > 52222.14 + | 512 > 63567.70 + | 1024 > 71014.10 + | 2048 > 74969.55 + | 4096 > 80239.42 + | 8192 > 83975.67 + | 16384 > 82638.05 + | 32768 > 84528.13 + +# GxHash 4 (WIP) + +Bytecode: 190 +HashSet/u32/GxHash: 1.5426 ns +Throughput: + | 4 > 7360.11 + | 8 > 14769.95 + | 16 > 29555.45 + | 32 > 43083.63 + | 64 > 43083.63 + | 128 > 40690.10 + | 256 > 50511.85 + | 512 > 62827.61 + | 1024 > 70250.75 + | 2048 > 81630.13 + | 4096 > 87250.16 + | 8192 > 89831.86 + | 16384 > 88241.06 + | 32768 > 89616.23 \ No newline at end of file diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index 7fff5ed..fad3d1b 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -28,6 +28,7 @@ pub fn gxhash32(input: &[u8], seed: i64) -> u32 { /// let seed = 1234; /// println!("Hash is {:x}!", gxhash::gxhash64(&bytes, seed)); /// ``` +#[cfg(not(feature = "no-inlining"))] #[inline(always)] pub fn gxhash64(input: &[u8], seed: i64) -> u64 { unsafe { @@ -36,6 +37,15 @@ pub fn 
gxhash64(input: &[u8], seed: i64) -> u64 { } } +#[cfg(feature = "no-inlining")] +#[inline(never)] +pub fn gxhash64(input: &[u8], seed: i64) -> u64 { + unsafe { + let p = &gxhash(input, create_seed(seed)) as *const State as *const u64; + *p + } +} + /// Hashes an arbitrary stream of bytes to an u128. /// /// # Example /// @@ -67,23 +77,23 @@ pub(crate) use load_unaligned; #[inline(always)] pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State { - finalize(aes_encrypt(compress_all(input), seed)) + finalize(compress_all(input, seed)) } #[inline(always)] -pub(crate) unsafe fn compress_all(input: &[u8]) -> State { +pub(crate) unsafe fn compress_all(input: &[u8], seed: State) -> State { let len = input.len(); let mut ptr = input.as_ptr() as *const State; if len == 0 { - return create_empty(); + return seed; } if len <= VECTOR_SIZE { // Input fits on a single SIMD vector, however we might read beyond the input message // Thus we need this safe method that checks if it can safely read beyond or must copy - return get_partial(ptr, len); + return xor(get_partial(ptr, len), seed); } let mut hash_vector: State; @@ -102,6 +112,8 @@ pub(crate) unsafe fn compress_all(input: &[u8]) -> State { ptr = ptr.cast::<u8>().add(extra_bytes_count).cast(); } + hash_vector = xor(hash_vector, seed); + load_unaligned!(ptr, v0); if len > VECTOR_SIZE * 2 { @@ -152,6 +164,8 @@ unsafe fn compress_many(mut ptr: *const State, end: usize, hash_vector: State, l #[cfg(test)] mod tests { + use crate::gxhash; + use super::*; use rand::Rng; @@ -213,14 +227,64 @@ mod tests { assert_ne!(0, gxhash32(&[0u8; 1200], 0)); } + // #[test] + // fn is_stable() { + // assert_eq!(2533353535, gxhash32(&[0u8; 0], 0)); + // assert_eq!(4243413987, gxhash32(&[0u8; 1], 0)); + // assert_eq!(2401749549, gxhash32(&[0u8; 1000], 0)); + // assert_eq!(4156851105, gxhash32(&[42u8; 4242], 42)); + // assert_eq!(1981427771, gxhash32(&[42u8; 4242], -42)); + // assert_eq!(1156095992, gxhash32(b"Hello World", i64::MAX)); + // 
assert_eq!(540827083, gxhash32(b"Hello World", i64::MIN)); + // } + #[test] - fn is_stable() { - assert_eq!(2533353535, gxhash32(&[0u8; 0], 0)); - assert_eq!(4243413987, gxhash32(&[0u8; 1], 0)); - assert_eq!(2401749549, gxhash32(&[0u8; 1000], 0)); - assert_eq!(4156851105, gxhash32(&[42u8; 4242], 42)); - assert_eq!(1981427771, gxhash32(&[42u8; 4242], -42)); - assert_eq!(1156095992, gxhash32(b"Hello World", i64::MAX)); - assert_eq!(540827083, gxhash32(b"Hello World", i64::MIN)); + fn issue_83_multicollision() { + + let zero_key = aes_crypto::AesBlock::zero(); + + let mut s0 = [0u8; 192]; + let mut s1 = [0u8; 192]; + + s0[64] = 100; + s1[64] = 42; + + let v0 = aes_crypto::AesBlock::new(s0[64..64 + 16].try_into().unwrap()); + v0.enc(zero_key).store_to(&mut s0[64 + 32..]); + + let v0 = aes_crypto::AesBlock::new(s1[64..64 + 16].try_into().unwrap()); + v0.enc(zero_key).store_to(&mut s1[64 + 32..]); + + // Different strings. + assert!(s0 != s1); + + // Hashes must not collide, regardless of seed. + assert!(gxhash::gxhash128(&s0, 0) != gxhash::gxhash128(&s1, 0)); + assert!(gxhash::gxhash128(&s0, 0xdeadbeef) != gxhash::gxhash128(&s1, 0xdeadbeef)); + } + + #[test] + fn issue_83_multicollision_dec() { + + let zero_key = aes_crypto::AesBlock::zero(); + + let mut s0 = [0u8; 192]; + let mut s1 = [0u8; 192]; + + s0[64] = 100; + s1[64] = 42; + + let v0 = aes_crypto::AesBlock::new(s0[64..64 + 16].try_into().unwrap()); + v0.dec(zero_key).store_to(&mut s0[64 + 32..]); + + let v0 = aes_crypto::AesBlock::new(s1[64..64 + 16].try_into().unwrap()); + v0.dec(zero_key).store_to(&mut s1[64 + 32..]); + + // Different strings. + assert!(s0 != s1); + + // Hashes must not collide, regardless of seed. 
+ assert!(gxhash::gxhash128(&s0, 0) != gxhash::gxhash128(&s1, 0)); + assert!(gxhash::gxhash128(&s0, 0xdeadbeef) != gxhash::gxhash128(&s1, 0xdeadbeef)); } } diff --git a/src/gxhash/platform/arm.rs b/src/gxhash/platform/arm.rs index c7a550e..b23890c 100644 --- a/src/gxhash/platform/arm.rs +++ b/src/gxhash/platform/arm.rs @@ -64,6 +64,12 @@ pub unsafe fn aes_encrypt_last(data: State, keys: State) -> State { vreinterpretq_s8_u8(veorq_u8(encrypted, vreinterpretq_u8_s8(keys))) } +#[inline(always)] +// See https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +pub unsafe fn xor(a: State, b: State) -> State { + veorq_s8(a, b) +} + #[inline(always)] pub unsafe fn ld(array: *const u32) -> State { vreinterpretq_s8_u32(vld1q_u32(array)) @@ -72,10 +78,6 @@ pub unsafe fn ld(array: *const u32) -> State { #[inline(always)] pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector: State, len: usize) -> State { - // Disambiguation vectors - let mut t1: State = create_empty(); - let mut t2: State = create_empty(); - // Hash is processed in two separate 128-bit parallel lanes // This allows the same processing to be applied using 256-bit V-AES instrinsics // so that hashes are stable in both cases. 
@@ -86,8 +88,11 @@ pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector: crate::gxhash::load_unaligned!(ptr, v0, v1, v2, v3, v4, v5, v6, v7); - let mut tmp1 = aes_encrypt(v0, v2); - let mut tmp2 = aes_encrypt(v1, v3); + let mut tmp1 = aes_encrypt(v0, hash_vector); + let mut tmp2 = aes_encrypt(v1, hash_vector); + + tmp1 = aes_encrypt(tmp1, v2); + tmp2 = aes_encrypt(tmp2, v3); tmp1 = aes_encrypt(tmp1, v4); tmp2 = aes_encrypt(tmp2, v5); @@ -95,11 +100,8 @@ pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector: tmp1 = aes_encrypt(tmp1, v6); tmp2 = aes_encrypt(tmp2, v7); - t1 = vaddq_s8(t1, ld(KEYS.as_ptr())); - t2 = vaddq_s8(t2, ld(KEYS.as_ptr().offset(4))); - - lane1 = aes_encrypt_last(aes_encrypt(tmp1, t1), lane1); - lane2 = aes_encrypt_last(aes_encrypt(tmp2, t2), lane2); + lane1 = aes_encrypt_last(tmp1, lane1); + lane2 = aes_encrypt_last(tmp2, lane2); } // For 'Zeroes' test let len_vec = vreinterpretq_s8_u32(vdupq_n_u32(len as u32)); diff --git a/src/hasher.rs b/src/hasher.rs index 88ca99e..ad040fe 100644 --- a/src/hasher.rs +++ b/src/hasher.rs @@ -94,7 +94,7 @@ macro_rules! write { #[inline] fn $name(&mut self, value: $type) { self.state = unsafe { - aes_encrypt_last($load(value), aes_encrypt(self.state, ld(KEYS.as_ptr()))) + aes_encrypt(self.state, $load(value)) }; } } @@ -112,7 +112,7 @@ impl Hasher for GxHasher { #[inline] fn write(&mut self, bytes: &[u8]) { // Improvement: only compress at this stage and finalize in finish - self.state = unsafe { aes_encrypt_last(compress_all(bytes), aes_encrypt(self.state, ld(KEYS.as_ptr()))) }; + self.state = unsafe { compress_all(bytes, self.state) }; } write!(write_u8, u8, load_u8);