From 7ab3384e5d2a77ff1f70a5b76f8cfa7b11c1d37f Mon Sep 17 00:00:00 2001
From: Marko Kreen <markokr@gmail.com>
Date: Thu, 8 Nov 2007 21:32:40 +0000
Subject: [PATCH] jenkins hash: optimize last memcpy

The compiler cannot optimize a variable-size memcpy,
so use a simple inlined version instead.

That makes this version always faster than
Jenkins' version on Core Duo.  On other CPUs
it's a +3% win, although still slower than Jenkins'.
---
 src/hash.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/hash.c b/src/hash.c
index d87e656..c153129 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -56,7 +56,17 @@
 	c ^= b; c -= rot(b,24); \
 } while (0)
 
-/* simple version - let compiler worry about memory access */
+/* for a small number of bytes the call to libc is a loss */
+static inline void simple_memcpy(void *dst, const void *src, size_t n)
+{
+	const uint8_t *s = src;
+	uint8_t *d = dst;
+
+	while (n--)
+		*d++ = *s++;
+}
+
+/* short version - let compiler worry about memory access */
 uint32_t lookup3_hash(const void *data, size_t len)
 {
 	uint32_t a, b, c;
@@ -78,7 +88,7 @@ uint32_t lookup3_hash(const void *data, size_t len)
 	}
 
 	buf[0] = buf[1] = buf[2] = 0;
-	memcpy(buf, p, len);
+	simple_memcpy(buf, p, len);
 	a += buf[0];
 	b += buf[1];
 	c += buf[2];
-- 
2.39.5