Fix several hash functions that were taking chintzy shortcuts instead of

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 1 Jun 2007 15:33:19 +0000 (15:33 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 1 Jun 2007 15:33:19 +0000 (15:33 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 1 Jun 2007 15:33:19 +0000 (15:33 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 1 Jun 2007 15:33:19 +0000 (15:33 +0000)
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c

index 33cdb9ce78ddc194f1420b3e8579732482a30c36..58898cf9dd176d7bb8ae2f0e2bc71b4f3a3b51b6 100644 (file)
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -1,7 +1,7 @@
  /*-------------------------------------------------------------------------
   *
   * hashfunc.c
- *       Comparison functions for hash access method.
+ *       Support functions for hash access method.
   *
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
@@ -12,8 +12,15 @@
   *
   * NOTES
   *       These functions are stored in pg_amproc.      For each operator class
- *       defined on hash tables, they compute the hash value of the argument.
+ *       defined for hash indexes, they compute the hash value of the argument.
   *
+ *       Additional hash functions appear in /utils/adt/ files for various
+ *       specialized datatypes.
+ *
+ *       It is expected that every bit of a hash function's 32-bit result is
+ *       as random as every other; failure to ensure this is likely to lead
+ *       to poor performance of hash joins, for example.  In most cases a hash
+ *       function should use hash_any() or its variant hash_uint32().
   *-------------------------------------------------------------------------
   */
  
@@ -26,19 +33,19 @@
  Datum
  hashchar(PG_FUNCTION_ARGS)
  {
-       PG_RETURN_UINT32(~((uint32) PG_GETARG_CHAR(0)));
+       return hash_uint32((int32) PG_GETARG_CHAR(0));
  }
  
  Datum
  hashint2(PG_FUNCTION_ARGS)
  {
-       PG_RETURN_UINT32(~((uint32) PG_GETARG_INT16(0)));
+       return hash_uint32((int32) PG_GETARG_INT16(0));
  }
  
  Datum
  hashint4(PG_FUNCTION_ARGS)
  {
-       PG_RETURN_UINT32(~PG_GETARG_UINT32(0));
+       return hash_uint32(PG_GETARG_INT32(0));
  }
  
  Datum
@@ -59,23 +66,23 @@ hashint8(PG_FUNCTION_ARGS)
  
         lohalf ^= (val >= 0) ? hihalf : ~hihalf;
  
-       PG_RETURN_UINT32(~lohalf);
+       return hash_uint32(lohalf);
  #else
         /* here if we can't count on "x >> 32" to work sanely */
-       PG_RETURN_UINT32(~((uint32) PG_GETARG_INT64(0)));
+       return hash_uint32((int32) PG_GETARG_INT64(0));
  #endif
  }
  
  Datum
  hashoid(PG_FUNCTION_ARGS)
  {
-       PG_RETURN_UINT32(~((uint32) PG_GETARG_OID(0)));
+       return hash_uint32((uint32) PG_GETARG_OID(0));
  }
  
  Datum
  hashenum(PG_FUNCTION_ARGS)
  {
-    PG_RETURN_UINT32(~((uint32) PG_GETARG_OID(0)));
+       return hash_uint32((uint32) PG_GETARG_OID(0));
  }
  
  Datum
@@ -283,6 +290,31 @@ hash_any(register const unsigned char *k, register int keylen)
                         /* case 0: nothing left to add */
         }
         mix(a, b, c);
+
+       /* report the result */
+       return UInt32GetDatum(c);
+}
+
+/*
+ * hash_uint32() -- hash a 32-bit value
+ *
+ * This has the same result (at least on little-endian machines) as
+ *             hash_any(&k, sizeof(uint32))
+ * but is faster and doesn't force the caller to store k into memory.
+ */
+Datum
+hash_uint32(uint32 k)
+{
+       register uint32 a,
+                               b,
+                               c;
+
+       a = 0x9e3779b9 + k;
+       b = 0x9e3779b9;
+       c = 3923095 + (uint32) sizeof(uint32);
+
+       mix(a, b, c);
+
         /* report the result */
         return UInt32GetDatum(c);
  }
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c

index 6840248eb6d1819e0abe01385fe248e2d0b7fceb..b46be62544e98aa6be93dcd6d96c65524c433853 100644 (file)
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -21,6 +21,7 @@
  #include "postgres.h"
  
  #include "nodes/bitmapset.h"
+#include "access/hash.h"
  
  
  #define WORDNUM(x)     ((x) / BITS_PER_BITMAPWORD)
@@ -769,36 +770,23 @@ bms_first_member(Bitmapset *a)
   *
   * Note: we must ensure that any two bitmapsets that are bms_equal() will
   * hash to the same value; in practice this means that trailing all-zero
- * words cannot affect the result.     The circular-shift-and-XOR hash method
- * used here has this property, so long as we work from back to front.
- *
- * Note: you might wonder why we bother with the circular shift; at first
- * glance a straight longitudinal XOR seems as good and much simpler.  The
- * reason is empirical: this gives a better distribution of hash values on
- * the bitmapsets actually generated by the planner.  A common way to have
- * multiword bitmapsets is "a JOIN b JOIN c JOIN d ...", which gives rise
- * to rangetables in which base tables and JOIN nodes alternate; so
- * bitmapsets of base table RT indexes tend to use only odd-numbered or only
- * even-numbered bits. A straight longitudinal XOR would preserve this
- * property, leading to a much smaller set of possible outputs than if
- * we include a shift.
+ * words must not affect the result.  Hence we strip those before applying
+ * hash_any().
   */
  uint32
  bms_hash_value(const Bitmapset *a)
  {
-       bitmapword      result = 0;
-       int                     wordnum;
+       int                     lastword;
  
-       if (a == NULL || a->nwords <= 0)
+       if (a == NULL)
                 return 0;                               /* All empty sets hash to 0 */
-       for (wordnum = a->nwords; --wordnum > 0;)
+       for (lastword = a->nwords; --lastword >= 0;)
         {
-               result ^= a->words[wordnum];
-               if (result & ((bitmapword) 1 << (BITS_PER_BITMAPWORD - 1)))
-                       result = (result << 1) | 1;
-               else
-                       result = (result << 1);
+               if (a->words[lastword] != 0)
+                       break;
         }
-       result ^= a->words[0];
-       return (uint32) result;
+       if (lastword < 0)
+               return 0;                               /* All empty sets hash to 0 */
+       return DatumGetUInt32(hash_any((const unsigned char *) a->words,
+                                                                  (lastword + 1) * sizeof(bitmapword)));
  }
diff --git a/src/backend/utils/hash/hashfn.c b/src/backend/utils/hash/hashfn.c

index 31da210cf182ec76c0fd33dec8eb5b2995e03224..3719f84f4023efd44aba5401d54c09dfd6bf3166 100644 (file)
--- a/src/backend/utils/hash/hashfn.c
+++ b/src/backend/utils/hash/hashfn.c
@@ -11,6 +11,12 @@
   * IDENTIFICATION
   *       $PostgreSQL$
   *
+ * NOTES
+ *       It is expected that every bit of a hash function's 32-bit result is
+ *       as random as every other; failure to ensure this is likely to lead
+ *       to poor performance of hash tables.  In most cases a hash
+ *       function should use hash_any() or its variant hash_uint32().
+ *
   *-------------------------------------------------------------------------
   */
  #include "postgres.h"
@@ -58,8 +64,7 @@ uint32
  oid_hash(const void *key, Size keysize)
  {
         Assert(keysize == sizeof(Oid));
-       /* We don't actually bother to do anything to the OID value ... */
-       return (uint32) *((const Oid *) key);
+       return DatumGetUInt32(hash_uint32((uint32) *((const Oid *) key)));
  }
  
  /*
diff --git a/src/include/access/hash.h b/src/include/access/hash.h

index 5895bec7592390837592a9780337073ea12527e1..aacde91083e01d9bd909de037418fa30d7ee2766 100644 (file)
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -265,6 +265,7 @@ extern Datum hashname(PG_FUNCTION_ARGS);
  extern Datum hashtext(PG_FUNCTION_ARGS);
  extern Datum hashvarlena(PG_FUNCTION_ARGS);
  extern Datum hash_any(register const unsigned char *k, register int keylen);
+extern Datum hash_uint32(uint32 k);
  
  /* private routines */
  
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h

index 58721f924227d01dfaa26071d0ef329a004f19f1..0303bcf84e3f569e71a09b34b14556cf83efdc08 100644 (file)
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
   */
  
  /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     200705211
+#define CATALOG_VERSION_NO     200706011
  
  #endif
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 1 Jun 2007 15:33:19 +0000 (15:33 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 1 Jun 2007 15:33:19 +0000 (15:33 +0000)
src/backend/access/hash/hashfunc.c		patch \| blob \| blame \| history
src/backend/nodes/bitmapset.c		patch \| blob \| blame \| history
src/backend/utils/hash/hashfn.c		patch \| blob \| blame \| history
src/include/access/hash.h		patch \| blob \| blame \| history
src/include/catalog/catversion.h		patch \| blob \| blame \| history