From dddc04aff7d826f9f95f6e6d3940b262b466082a Mon Sep 17 00:00:00 2001
From: "Michael G. Noll"
Date: Wed, 7 Jan 2015 12:00:45 +0100
Subject: [PATCH] GH-392: Improve hashing for CMS[BigInt]

---
 .../caliper/CMSHashingBenchmark.scala         | 98 +++++++++++++++++++
 .../com/twitter/algebird/CountMinSketch.scala | 25 +++--
 2 files changed, 115 insertions(+), 8 deletions(-)
 create mode 100644 algebird-caliper/src/test/scala/com/twitter/algebird/caliper/CMSHashingBenchmark.scala

diff --git a/algebird-caliper/src/test/scala/com/twitter/algebird/caliper/CMSHashingBenchmark.scala b/algebird-caliper/src/test/scala/com/twitter/algebird/caliper/CMSHashingBenchmark.scala
new file mode 100644
index 000000000..b4fc78088
--- /dev/null
+++ b/algebird-caliper/src/test/scala/com/twitter/algebird/caliper/CMSHashingBenchmark.scala
@@ -0,0 +1,98 @@
+package com.twitter.algebird.caliper
+
+import com.google.caliper.{Param, SimpleBenchmark}
+
+/**
+ * Benchmarks the hashing algorithms used by Count-Min sketch for CMS[BigInt].
+ *
+ * The input values are generated ahead of time to ensure that each trial uses the same input (and that the RNG is not
+ * influencing the runtime of the trials).
+ */
+// Once we can convince cappi (https://github.com/softprops/cappi) -- the sbt plugin we use to run
+// caliper benchmarks -- to work with the latest caliper 1.0-beta-1, we would:
+// - Let `CMSHashingBenchmark` extend `Benchmark` (instead of `SimpleBenchmark`)
+// - Annotate the `time*` methods with `@MacroBenchmark`.
+class CMSHashingBenchmark extends SimpleBenchmark {
+
+  /**
+   * The `a` parameter for CMS' default ("legacy") hashing algorithm: `h_i(x) = a_i * x + b_i (mod p)`.
+   */
+  @Param(Array("5123456"))
+  val a: Int = 0
+
+  /**
+   * The `b` parameter for CMS' default ("legacy") hashing algorithm: `h_i(x) = a_i * x + b_i (mod p)`.
+   *
+   * Algebird's CMS implementation hard-codes `b` to `0`.
+   */
+  @Param(Array("0"))
+  val b: Int = 0
+
+  /**
+   * Width of the counting table.
+   */
+  @Param(Array("11" /* eps = 0.271 */ , "544" /* eps = 0.005 */ , "2719" /* eps = 1E-3 */ , "271829" /* eps = 1E-5 */))
+  val width: Int = 0
+
+  /**
+   * Number of operations per benchmark repetition.
+   */
+  @Param(Array("100000"))
+  val operations: Int = 0
+
+  /**
+   * Maximum number of bits for randomly generated BigInt instances.
+   */
+  @Param(Array("128", "1024", "2048"))
+  val maxBits: Int = 0
+
+  var random: scala.util.Random = _
+  var inputs: Seq[BigInt] = _
+
+  override def setUp() {
+    random = new scala.util.Random
+    // Drawn randomly from a 2^maxBits address space; strict (no lazy view) so the RNG never runs while timing.
+    inputs = Vector.fill(operations)(scala.math.BigInt(maxBits, random))
+  }
+
+  private def murmurHashScala(a: Int, b: Int, width: Int)(x: BigInt) = {
+    val hash: Int = scala.util.hashing.MurmurHash3.arrayHash(x.toByteArray, a)
+    val h = {
+      // We only want positive integers for the subsequent modulo. This method mimics Java's Hashtable implementation,
+      // and it requires `hash` to be an `Int` = have 32 bits (to match with `0x7FFFFFFF`).
+      val positiveHash = hash & 0x7FFFFFFF
+      positiveHash % width
+    }
+    assert(h >= 0, "hash must not be negative")
+    h
+  }
+
+  private val PRIME_MODULUS = (1L << 31) - 1
+
+  private def brokenCurrentHash(a: Int, b: Int, width: Int)(x: BigInt) = {
+    val unModded: BigInt = (x * a) + b
+    val modded: BigInt = (unModded + (unModded >> 32)) & PRIME_MODULUS
+    val h = modded.toInt % width
+    assert(h >= 0, "hash must not be negative")
+    h
+  }
+
+  def timeBrokenCurrentHashWithRandomMaxBitsNumbers(operations: Int): Int = {
+    var dummy = 0
+    while (dummy < operations) {
+      inputs.foreach { input => brokenCurrentHash(a, b, width)(input)}
+      dummy += 1
+    }
+    dummy
+  }
+
+  def timeMurmurHashScalaWithRandomMaxBitsNumbers(operations: Int): Int = {
+    var dummy = 0
+    while (dummy < operations) {
+      inputs.foreach { input => murmurHashScala(a, b, width)(input)}
+      dummy += 1
+    }
+    dummy
+  }
+
+}
\ No newline at end of file
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala b/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala
index a2fdd0263..8e4deea06 100644
--- a/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala
+++ b/algebird-core/src/main/scala/com/twitter/algebird/CountMinSketch.scala
@@ -1047,7 +1047,7 @@ object CMSHasherImplicits {
 
   implicit object CMSHasherLong extends CMSHasher[Long] {
 
-    def hash(a: Int, b: Int, width: Int)(x: Long) = {
+    override def hash(a: Int, b: Int, width: Int)(x: Long) = {
       val unModded: Long = (x * a) + b
       // Apparently a super fast way of computing x mod 2^p-1
       // See page 149 of http://www.cs.princeton.edu/courses/archive/fall09/cos521/Handouts/universalclasses.pdf
@@ -1061,13 +1061,13 @@ object CMSHasherImplicits {
 
   implicit object CMSHasherShort extends CMSHasher[Short] {
 
-    def hash(a: Int, b: Int, width: Int)(x: Short) = CMSHasherInt.hash(a, b, width)(x)
+    override def hash(a: Int, b: Int, width: Int)(x: Short) = CMSHasherInt.hash(a, b, width)(x)
 
   }
 
   implicit object CMSHasherInt extends CMSHasher[Int] {
 
-    def hash(a: Int, b: Int, width: Int)(x: Int) = {
+    override def hash(a: Int, b: Int, width: Int)(x: Int) = {
       val unModded: Int = (x * a) + b
       val modded: Long = (unModded + (unModded >> 32)) & PRIME_MODULUS
       val h = modded.toInt % width
@@ -1079,14 +1079,23 @@ object CMSHasherImplicits {
 
   implicit object CMSHasherBigInt extends CMSHasher[BigInt] {
 
-    def hash(a: Int, b: Int, width: Int)(x: BigInt) = {
-      val unModded: BigInt = (x * a) + b
-      val modded: BigInt = (unModded + (unModded >> 32)) & PRIME_MODULUS
-      val h = modded.toInt % width
+    /**
+     * Implementation detail: This hash function is based upon Murmur3. Note that the original CMS paper requires
+     * `d` (depth) pair-wise independent hash functions; in the specific case of Murmur3 we argue that it is sufficient
+     * to pass `d` different seed values to Murmur3 to achieve a similar effect.
+     */
+    override def hash(a: Int, b: Int, width: Int)(x: BigInt) = {
+      val hash: Int = scala.util.hashing.MurmurHash3.arrayHash(x.toByteArray, a)
+      val h = {
+        // We only want positive integers for the subsequent modulo. This method mimics Java's Hashtable implementation,
+        // and it requires `hash` to be an `Int` = have 32 bits (to match with `0x7FFFFFFF`).
+        val positiveHash = hash & 0x7FFFFFFF
+        positiveHash % width
+      }
       assert(h >= 0, "hash must not be negative")
       h
     }
 
   }
 
-}
+}
\ No newline at end of file