Change hash indexes to store only the hash code rather than the whole indexed value.

author Tom Lane <tgl@sss.pgh.pa.us>

Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml

index 947c470d26e44380f6bfa93b0fd28d494f3bb40f..40e046092c53370dba768ea65d0d3e36c0ef7c4c 100644 (file)
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -451,6 +451,13 @@
        <entry>Can an index of this type be clustered on?</entry>
       </row>
  
+     <row>
+      <entry><structfield>amkeytype</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-type"><structname>pg_type</structname></link>.oid</literal></entry>
+      <entry>Type of data stored in index, or zero if not a fixed type</entry>
+     </row>
+
       <row>
        <entry><structfield>aminsert</structfield></entry>
        <entry><type>regproc</type></entry>
@@ -6424,7 +6431,7 @@
       <row>
        <entry><structfield>sourceline</structfield></entry>
        <entry><type>text</type></entry>
-      <entry>Line number within the sourcefile the current value was set 
+      <entry>Line number within the sourcefile the current value was set
        from (NULL for values set in sources other than configuration files)
        </entry>
       </row>
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c

index 6a5c0009dbb63be5740782a2ce3bb95a40859e92..67d0691a00fba1f8c334cee188aa42216ccc3911 100644 (file)
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -79,12 +79,12 @@ hashbuild(PG_FUNCTION_ARGS)
          * then we'll thrash horribly.  To prevent that scenario, we can sort the
          * tuples by (expected) bucket number.  However, such a sort is useless
          * overhead when the index does fit in RAM.  We choose to sort if the
-        * initial index size exceeds effective_cache_size.
+        * initial index size exceeds NBuffers.
          *
          * NOTE: this test will need adjustment if a bucket is ever different
          * from one page.
          */
-       if (num_buckets >= (uint32) effective_cache_size)
+       if (num_buckets >= (uint32) NBuffers)
                 buildstate.spool = _h_spoolinit(index, num_buckets);
         else
                 buildstate.spool = NULL;
@@ -129,7 +129,7 @@ hashbuildCallback(Relation index,
         IndexTuple      itup;
  
         /* form an index tuple and point it at the heap tuple */
-       itup = index_form_tuple(RelationGetDescr(index), values, isnull);
+       itup = _hash_form_tuple(index, values, isnull);
         itup->t_tid = htup->t_self;
  
         /* Hash indexes don't index nulls, see notes in hashinsert */
@@ -153,8 +153,8 @@ hashbuildCallback(Relation index,
  /*
   *     hashinsert() -- insert an index tuple into a hash table.
   *
- *     Hash on the index tuple's key, find the appropriate location
- *     for the new tuple, and put it there.
+ *     Hash on the heap tuple's key, form an index tuple with hash code.
+ *     Find the appropriate location for the new tuple, and put it there.
   */
  Datum
  hashinsert(PG_FUNCTION_ARGS)
@@ -171,7 +171,7 @@ hashinsert(PG_FUNCTION_ARGS)
         IndexTuple      itup;
  
         /* generate an index tuple */
-       itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
+       itup = _hash_form_tuple(rel, values, isnull);
         itup->t_tid = *ht_ctid;
  
         /*
@@ -211,8 +211,8 @@ hashgettuple(PG_FUNCTION_ARGS)
         OffsetNumber offnum;
         bool            res;
  
-       /* Hash indexes are never lossy (at the moment anyway) */
-       scan->xs_recheck = false;
+       /* Hash indexes are always lossy since we store only the hash code */
+       scan->xs_recheck = true;
  
         /*
          * We hold pin but not lock on current buffer while outside the hash AM.
@@ -317,7 +317,8 @@ hashgetbitmap(PG_FUNCTION_ARGS)
                 /* Save tuple ID, and continue scanning */
                 if (add_tuple) 
                 {
-                       tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, false);
+                       /* Note we mark the tuple ID as requiring recheck */
+                       tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, true);
                         ntids++;
                 }
  
@@ -527,7 +528,7 @@ hashbulkdelete(PG_FUNCTION_ARGS)
          * each bucket.
          */
         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap =  HashPageGetMeta(BufferGetPage(metabuf));
         orig_maxbucket = metap->hashm_maxbucket;
         orig_ntuples = metap->hashm_ntuples;
         memcpy(&local_metapage, metap, sizeof(local_metapage));
@@ -629,7 +630,7 @@ loop_top:
  
         /* Write-lock metapage and check for split since we started */
         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
  
         if (cur_maxbucket != metap->hashm_maxbucket)
         {
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c

index 3eb226a8a5473cd06cb22c15b0fc4ac5066db50a..0048c3ce23a041fc69f1b2904111c188a76a315a 100644 (file)
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -43,18 +43,11 @@ _hash_doinsert(Relation rel, IndexTuple itup)
         bool            do_expand;
         uint32          hashkey;
         Bucket          bucket;
-       Datum           datum;
-       bool            isnull;
  
         /*
-        * Compute the hash key for the item.  We do this first so as not to need
-        * to hold any locks while running the hash function.
+        * Get the hash key for the item (it's stored in the index tuple itself).
          */
-       if (rel->rd_rel->relnatts != 1)
-               elog(ERROR, "hash indexes support only one index key");
-       datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
-       Assert(!isnull);
-       hashkey = _hash_datum2hashkey(rel, datum);
+       hashkey = _hash_get_indextuple_hashkey(itup);
  
         /* compute item size too */
         itemsz = IndexTupleDSize(*itup);
@@ -69,12 +62,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
  
         /* Read the metapage */
         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
  
         /*
          * Check whether the item can fit on a hash page at all. (Eventually, we
          * ought to try to apply TOAST methods if not.)  Note that at this point,
          * itemsz doesn't include the ItemId.
+        *
+        * XXX this is useless code if we are only storing hash keys.
          */
         if (itemsz > HashMaxItemSize((Page) metap))
                 ereport(ERROR,
@@ -197,11 +192,15 @@ _hash_pgaddtup(Relation rel,
  {
         OffsetNumber itup_off;
         Page            page;
+       uint32          hashkey;
  
         _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
         page = BufferGetPage(buf);
  
-       itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+       /* Find where to insert the tuple (preserving page's hashkey ordering) */
+       hashkey = _hash_get_indextuple_hashkey(itup);
+       itup_off = _hash_binsearch(page, hashkey);
+
         if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
                 == InvalidOffsetNumber)
                 elog(ERROR, "failed to add index item to \"%s\"",
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c

index cc10135d80fd160c803932f4f6fa96f75e6f6195..e17ae173e8ab85ad6d6ac7c8f6cb12d1d7903a33 100644 (file)
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -187,7 +187,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
         _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
  
         _hash_checkpage(rel, metabuf, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
  
         /* start search at hashm_firstfree */
         orig_firstfree = metap->hashm_firstfree;
@@ -450,7 +450,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
  
         /* Read the metapage so we can determine which bitmap page to use */
         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
  
         /* Identify which bit to set */
         ovflbitno = blkno_to_bitno(metap, ovflblkno);
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c

index ba6177c18ab043f58dbf374f9377312d5f7c831b..431bb2e204e91a6a6f5d29fe866410422b31c4d1 100644 (file)
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -348,11 +348,9 @@ _hash_metapinit(Relation rel, double num_tuples)
          * Determine the target fill factor (in tuples per bucket) for this index.
          * The idea is to make the fill factor correspond to pages about as full
          * as the user-settable fillfactor parameter says.      We can compute it
-        * exactly if the index datatype is fixed-width, but for var-width there's
-        * some guessing involved.
+        * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
          */
-       data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
-                                                                RelationGetDescr(rel)->attrs[0]->atttypmod);
+       data_width = sizeof(uint32);
         item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
                 sizeof(ItemIdData);             /* include the line pointer */
         ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
@@ -395,20 +393,18 @@ _hash_metapinit(Relation rel, double num_tuples)
         pageopaque->hasho_flag = LH_META_PAGE;
         pageopaque->hasho_page_id = HASHO_PAGE_ID;
  
-       metap = (HashMetaPage) pg;
+       metap = HashPageGetMeta(pg);
  
         metap->hashm_magic = HASH_MAGIC;
         metap->hashm_version = HASH_VERSION;
         metap->hashm_ntuples = 0;
         metap->hashm_nmaps = 0;
         metap->hashm_ffactor = ffactor;
-       metap->hashm_bsize = BufferGetPageSize(metabuf);
+       metap->hashm_bsize = HashGetMaxBitmapSize(pg);
         /* find largest bitmap array size that will fit in page size */
         for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
         {
-               if ((1 << i) <= (metap->hashm_bsize -
-                                                (MAXALIGN(sizeof(PageHeaderData)) +
-                                                 MAXALIGN(sizeof(HashPageOpaqueData)))))
+               if ((1 << i) <= metap->hashm_bsize)
                         break;
         }
         Assert(i > 0);
@@ -532,7 +528,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
         _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
  
         _hash_checkpage(rel, metabuf, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
  
         /*
          * Check to see if split is still needed; someone else might have already
@@ -774,8 +770,6 @@ _hash_splitbucket(Relation rel,
         Buffer          nbuf;
         BlockNumber oblkno;
         BlockNumber nblkno;
-       bool            null;
-       Datum           datum;
         HashPageOpaque oopaque;
         HashPageOpaque nopaque;
         IndexTuple      itup;
@@ -785,7 +779,6 @@ _hash_splitbucket(Relation rel,
         OffsetNumber omaxoffnum;
         Page            opage;
         Page            npage;
-       TupleDesc       itupdesc = RelationGetDescr(rel);
  
         /*
          * It should be okay to simultaneously write-lock pages from each bucket,
@@ -846,16 +839,11 @@ _hash_splitbucket(Relation rel,
                 }
  
                 /*
-                * Re-hash the tuple to determine which bucket it now belongs in.
-                *
-                * It is annoying to call the hash function while holding locks, but
-                * releasing and relocking the page for each tuple is unappealing too.
+                * Fetch the item's hash key (conveniently stored in the item)
+                * and determine which bucket it now belongs in.
                  */
                 itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
-               datum = index_getattr(itup, 1, itupdesc, &null);
-               Assert(!null);
-
-               bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
+               bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                                                           maxbucket, highmask, lowmask);
  
                 if (bucket == nbucket)
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c

index 258526be739ebf826477eb681fd96096da424974..8b77d2cc5335958a59a864d8530de58ed6bcb7dc 100644 (file)
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -178,6 +178,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
                 hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
                                                                                    cur->sk_subtype);
  
+       so->hashso_sk_hash = hashkey;
+
         /*
          * Acquire shared split lock so we can compute the target bucket safely
          * (see README).
@@ -186,7 +188,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
  
         /* Read the metapage */
         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
  
         /*
          * Compute the target bucket number, and convert to block number.
@@ -284,7 +286,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                 offnum = InvalidOffsetNumber;
  
         /*
-        * 'offnum' now points to the last tuple we have seen (if any).
+        * 'offnum' now points to the last tuple we examined (if any).
          *
          * continue to step through tuples until: 1) we get to the end of the
          * bucket chain or 2) we find a valid tuple.
@@ -297,25 +299,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                                 if (offnum != InvalidOffsetNumber)
                                         offnum = OffsetNumberNext(offnum);      /* move forward */
                                 else
-                                       offnum = FirstOffsetNumber; /* new page */
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch(page, so->hashso_sk_hash);
+                               }
  
-                               while (offnum > maxoff)
+                               for (;;)
                                 {
                                         /*
-                                        * either this page is empty (maxoff ==
-                                        * InvalidOffsetNumber) or we ran off the end.
+                                        * check if we're still in the range of items with
+                                        * the target hash key
+                                        */
+                                       if (offnum <= maxoff)
+                                       {
+                                               Assert(offnum >= FirstOffsetNumber);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;                          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * no (more) matching items on this page, so try the next page
                                          */
                                         _hash_readnext(rel, &buf, &page, &opaque);
                                         if (BufferIsValid(buf))
                                         {
                                                 maxoff = PageGetMaxOffsetNumber(page);
-                                               offnum = FirstOffsetNumber;
+                                               offnum = _hash_binsearch(page, so->hashso_sk_hash);
                                         }
                                         else
                                         {
                                                 /* end of bucket */
-                                               maxoff = offnum = InvalidOffsetNumber;
-                                               break;  /* exit while */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
                                         }
                                 }
                                 break;
@@ -324,22 +340,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                                 if (offnum != InvalidOffsetNumber)
                                         offnum = OffsetNumberPrev(offnum);      /* move back */
                                 else
-                                       offnum = maxoff;        /* new page */
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+                               }
  
-                               while (offnum < FirstOffsetNumber)
+                               for (;;)
                                 {
                                         /*
-                                        * either this page is empty (offnum ==
-                                        * InvalidOffsetNumber) or we ran off the end.
+                                        * check if we're still in the range of items with
+                                        * the target hash key
+                                        */
+                                       if (offnum >= FirstOffsetNumber)
+                                       {
+                                               Assert(offnum <= maxoff);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;                          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * no (more) matching items on this page, so try the previous page
                                          */
                                         _hash_readprev(rel, &buf, &page, &opaque);
                                         if (BufferIsValid(buf))
-                                               maxoff = offnum = PageGetMaxOffsetNumber(page);
+                                       {
+                                               maxoff = PageGetMaxOffsetNumber(page);
+                                               offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+                                       }
                                         else
                                         {
                                                 /* end of bucket */
-                                               maxoff = offnum = InvalidOffsetNumber;
-                                               break;  /* exit while */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
                                         }
                                 }
                                 break;
@@ -347,19 +380,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                         default:
                                 /* NoMovementScanDirection */
                                 /* this should not be reached */
+                               itup = NULL;
                                 break;
                 }
  
-               /* we ran off the end of the world without finding a match */
-               if (offnum == InvalidOffsetNumber)
+               if (itup == NULL)
                 {
+                       /* we ran off the end of the bucket without finding a match */
                         *bufP = so->hashso_curbuf = InvalidBuffer;
                         ItemPointerSetInvalid(current);
                         return false;
                 }
  
-               /* get ready to check this tuple */
-               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+               /* check the tuple quals, loop around if not met */
         } while (!_hash_checkqual(scan, itup));
  
         /* if we made it to here, we've found a valid tuple */
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c

index 6829097afea45b960b21d8bca922ddb0895f16ae..e8a3b88029237510dc4a8043a58a7118b135e719 100644 (file)
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -28,12 +28,21 @@
  bool
  _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
  {
+       /*
+        * Currently, we can't check any of the scan conditions since we do
+        * not have the original index entry value to supply to the sk_func.
+        * Always return true; we expect that hashgettuple already set the
+        * recheck flag to make the main indexscan code do it.
+        */
+#ifdef NOT_USED
         TupleDesc       tupdesc = RelationGetDescr(scan->indexRelation);
         ScanKey         key = scan->keyData;
         int                     scanKeySize = scan->numberOfKeys;
+#endif
  
         IncrIndexProcessed();
  
+#ifdef NOT_USED
         while (scanKeySize > 0)
         {
                 Datum           datum;
@@ -59,6 +68,7 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
                 key++;
                 scanKeySize--;
         }
+#endif
  
         return true;
  }
@@ -190,7 +200,7 @@ _hash_checkpage(Relation rel, Buffer buf, int flags)
          */
         if (flags == LH_META_PAGE)
         {
-               HashMetaPage metap = (HashMetaPage) page;
+               HashMetaPage metap = HashPageGetMeta(page);
  
                 if (metap->hashm_magic != HASH_MAGIC)
                         ereport(ERROR,
@@ -221,3 +231,123 @@ hashoptions(PG_FUNCTION_ARGS)
                 PG_RETURN_BYTEA_P(result);
         PG_RETURN_NULL();
  }
+
+/*
+ * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
+ */
+uint32
+_hash_get_indextuple_hashkey(IndexTuple itup)
+{
+       char       *attp;
+
+       /*
+        * We assume the hash key is the first attribute and can't be null,
+        * so this can be done crudely but very very cheaply ...
+        */
+       attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
+       return *((uint32 *) attp);
+}
+
+/*
+ * _hash_form_tuple - form an index tuple containing hash code only
+ */
+IndexTuple
+_hash_form_tuple(Relation index, Datum *values, bool *isnull)
+{
+       IndexTuple              itup;
+       uint32                  hashkey;
+       Datum                   hashkeydatum;
+       TupleDesc               hashdesc;
+
+       if (isnull[0])
+               hashkeydatum = (Datum) 0;
+       else
+       {
+               hashkey = _hash_datum2hashkey(index, values[0]);
+               hashkeydatum = UInt32GetDatum(hashkey);
+       }
+       hashdesc = RelationGetDescr(index);
+       Assert(hashdesc->natts == 1);
+       itup = index_form_tuple(hashdesc, &hashkeydatum, isnull);
+       return itup;
+}
+
+/*
+ * _hash_binsearch - Return the offset number in the page where the
+ *                                      specified hash value should be sought or inserted.
+ *
+ * We use binary search, relying on the assumption that the existing entries
+ * are ordered by hash key.
+ *
+ * Returns the offset of the first index entry having hashkey >= hash_value,
+ * or the page's max offset plus one if hash_value is greater than all
+ * existing hash keys in the page.  This is the appropriate place to start
+ * a search, or to insert a new item.
+ */
+OffsetNumber
+_hash_binsearch(Page page, uint32 hash_value)
+{
+       OffsetNumber    upper;
+       OffsetNumber    lower;
+
+       /* Loop invariant: lower <= desired place <= upper */
+       upper = PageGetMaxOffsetNumber(page) + 1;
+       lower = FirstOffsetNumber;
+
+       while (upper > lower)
+       {
+               OffsetNumber    off;
+               IndexTuple              itup;
+               uint32                  hashkey;
+
+               off = (upper + lower) / 2;
+               Assert(OffsetNumberIsValid(off));
+
+               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+               hashkey = _hash_get_indextuple_hashkey(itup);
+               if (hashkey < hash_value)
+                       lower = off + 1;
+               else
+                       upper = off;
+       }
+
+       return lower;
+}
+
+/*
+ * _hash_binsearch_last
+ *
+ * Same as above, except that if there are multiple matching items in the
+ * page, we return the offset of the last one instead of the first one,
+ * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
+ * This is handy for starting a new page in a backwards scan.
+ */
+OffsetNumber
+_hash_binsearch_last(Page page, uint32 hash_value)
+{
+       OffsetNumber    upper;
+       OffsetNumber    lower;
+
+       /* Loop invariant: lower <= desired place <= upper */
+       upper = PageGetMaxOffsetNumber(page);
+       lower = FirstOffsetNumber - 1;
+
+       while (upper > lower)
+       {
+               IndexTuple              itup;
+               OffsetNumber    off;
+               uint32                  hashkey;
+
+               off = (upper + lower + 1) / 2;
+               Assert(OffsetNumberIsValid(off));
+
+               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+               hashkey = _hash_get_indextuple_hashkey(itup);
+               if (hashkey > hash_value)
+                       upper = off - 1;
+               else
+                       lower = off;
+       }
+
+       return lower;
+}
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c

index 34ce0620104c64c5261b33278b23c5e1c812fa45..5986cb45cb87e12d556e0d8dfaeee0fe4bd5e9cf 100644 (file)
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -76,6 +76,7 @@ typedef struct
  /* non-export function prototypes */
  static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
                                                  IndexInfo *indexInfo,
+                                                Oid accessMethodObjectId,
                                                  Oid *classObjectId);
  static void InitializeAttributeOids(Relation indexRelation,
                                                 int numatts, Oid indexoid);
@@ -105,15 +106,28 @@ static Oid        IndexGetRelation(Oid indexId);
  static TupleDesc
  ConstructTupleDescriptor(Relation heapRelation,
                                                  IndexInfo *indexInfo,
+                                                Oid accessMethodObjectId,
                                                  Oid *classObjectId)
  {
         int                     numatts = indexInfo->ii_NumIndexAttrs;
         ListCell   *indexpr_item = list_head(indexInfo->ii_Expressions);
+       HeapTuple       amtuple;
+       Form_pg_am      amform;
         TupleDesc       heapTupDesc;
         TupleDesc       indexTupDesc;
         int                     natts;                  /* #atts in heap rel --- for error checks */
         int                     i;
  
+       /* We need access to the index AM's pg_am tuple */
+       amtuple = SearchSysCache(AMOID,
+                                                        ObjectIdGetDatum(accessMethodObjectId),
+                                                        0, 0, 0);
+       if (!HeapTupleIsValid(amtuple))
+               elog(ERROR, "cache lookup failed for access method %u",
+                        accessMethodObjectId);
+       amform = (Form_pg_am) GETSTRUCT(amtuple);
+
+       /* ... and to the table's tuple descriptor */
         heapTupDesc = RelationGetDescr(heapRelation);
         natts = RelationGetForm(heapRelation)->relnatts;
  
@@ -133,6 +147,7 @@ ConstructTupleDescriptor(Relation heapRelation,
                 Form_pg_attribute to = indexTupDesc->attrs[i];
                 HeapTuple       tuple;
                 Form_pg_type typeTup;
+               Form_pg_opclass opclassTup;
                 Oid                     keyType;
  
                 if (atnum != 0)
@@ -231,8 +246,8 @@ ConstructTupleDescriptor(Relation heapRelation,
                 to->attrelid = InvalidOid;
  
                 /*
-                * Check the opclass to see if it provides a keytype (overriding the
-                * attribute type).
+                * Check the opclass and index AM to see if either provides a keytype
+                * (overriding the attribute type).  Opclass takes precedence.
                  */
                 tuple = SearchSysCache(CLAOID,
                                                            ObjectIdGetDatum(classObjectId[i]),
@@ -240,7 +255,11 @@ ConstructTupleDescriptor(Relation heapRelation,
                 if (!HeapTupleIsValid(tuple))
                         elog(ERROR, "cache lookup failed for opclass %u",
                                  classObjectId[i]);
-               keyType = ((Form_pg_opclass) GETSTRUCT(tuple))->opckeytype;
+               opclassTup = (Form_pg_opclass) GETSTRUCT(tuple);
+               if (OidIsValid(opclassTup->opckeytype))
+                       keyType = opclassTup->opckeytype;
+               else
+                       keyType = amform->amkeytype;
                 ReleaseSysCache(tuple);
  
                 if (OidIsValid(keyType) && keyType != to->atttypid)
@@ -264,6 +283,8 @@ ConstructTupleDescriptor(Relation heapRelation,
                 }
         }
  
+       ReleaseSysCache(amtuple);
+
         return indexTupDesc;
  }
  
@@ -577,6 +598,7 @@ index_create(Oid heapRelationId,
          */
         indexTupDesc = ConstructTupleDescriptor(heapRelation,
                                                                                         indexInfo,
+                                                                                       accessMethodObjectId,
                                                                                         classObjectId);
  
         /*
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c

index 35e90c3abe7dcda0e2fabda45e70b8f0bc918e32..6b7c8868cc79d4ff5deef1332e47e0ba5f8e1e81 100644 (file)
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -101,7 +101,6 @@
  #include <limits.h>
  
  #include "access/genam.h"
-#include "access/hash.h"
  #include "access/nbtree.h"
  #include "catalog/pg_amop.h"
  #include "catalog/pg_operator.h"
@@ -353,7 +352,6 @@ struct Tuplesortstate
         bool            enforceUnique;  /* complain if we find duplicate tuples */
  
         /* These are specific to the index_hash subcase: */
-       FmgrInfo   *hash_proc;          /* call info for the hash function */
         uint32          hash_mask;              /* mask for sortable part of hash code */
  
         /*
@@ -689,13 +687,6 @@ tuplesort_begin_index_hash(Relation indexRel,
  
         state->indexRel = indexRel;
  
-       /*
-        * We look up the index column's hash function just once, to avoid
-        * chewing lots of cycles in repeated index_getprocinfo calls.  This
-        * assumes that our caller holds the index relation open throughout the
-        * sort, else the pointer obtained here might cease to be valid.
-        */
-       state->hash_proc = index_getprocinfo(indexRel, 1, HASHPROC);
         state->hash_mask = hash_mask;
  
         MemoryContextSwitchTo(oldcontext);
@@ -2821,11 +2812,6 @@ static int
  comparetup_index_hash(const SortTuple *a, const SortTuple *b,
                                           Tuplesortstate *state)
  {
-       /*
-        * It's slightly annoying to redo the hash function each time, although
-        * most hash functions ought to be cheap.  Is it worth having a variant
-        * tuple storage format so we can store the hash code?
-        */
         uint32          hash1;
         uint32          hash2;
         IndexTuple      tuple1;
@@ -2834,13 +2820,14 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
         /* Allow interrupting long sorts */
         CHECK_FOR_INTERRUPTS();
  
-       /* Compute hash codes and mask off bits we don't want to sort by */
+       /*
+        * Fetch hash keys and mask off bits we don't want to sort by.
+        * We know that the first column of the index tuple is the hash key.
+        */
         Assert(!a->isnull1);
-       hash1 = DatumGetUInt32(FunctionCall1(state->hash_proc, a->datum1))
-               & state->hash_mask;
+       hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
         Assert(!b->isnull1);
-       hash2 = DatumGetUInt32(FunctionCall1(state->hash_proc, b->datum1))
-               & state->hash_mask;
+       hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
  
         if (hash1 > hash2)
                 return 1;
diff --git a/src/include/access/hash.h b/src/include/access/hash.h

index 1e2b9887d81dee2d6a1564e06b75ef71f26cf482..bd4ec10db807e1a0904def6a30ed4925c811799a 100644 (file)
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -75,6 +75,9 @@ typedef HashPageOpaqueData *HashPageOpaque;
   */
  typedef struct HashScanOpaqueData
  {
+       /* Hash value of the scan key, ie, the hash key we seek */
+       uint32          hashso_sk_hash;
+
         /*
          * By definition, a hash scan should be examining only one bucket. We
          * record the bucket number here as soon as it is known.
@@ -111,7 +114,7 @@ typedef HashScanOpaqueData *HashScanOpaque;
  #define HASH_METAPAGE  0               /* metapage is always block 0 */
  
  #define HASH_MAGIC             0x6440640
-#define HASH_VERSION   1               /* new for Pg 7.4 */
+#define HASH_VERSION   2               /* 2 signifies only hash key value is stored */
  
  /*
   * Spares[] holds the number of overflow pages currently allocated at or
@@ -138,7 +141,6 @@ typedef HashScanOpaqueData *HashScanOpaque;
  
  typedef struct HashMetaPageData
  {
-       PageHeaderData hashm_phdr;      /* pad for page header (do not use) */
         uint32          hashm_magic;    /* magic no. for hash tables */
         uint32          hashm_version;  /* version ID */
         double          hashm_ntuples;  /* number of tuples stored in the table */
@@ -191,8 +193,16 @@ typedef HashMetaPageData *HashMetaPage;
  #define BMPGSZ_BIT(metap)              ((metap)->hashm_bmsize << BYTE_TO_BIT)
  #define BMPG_SHIFT(metap)              ((metap)->hashm_bmshift)
  #define BMPG_MASK(metap)               (BMPGSZ_BIT(metap) - 1)
-#define HashPageGetBitmap(pg) \
-       ((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData))))
+
+#define HashPageGetBitmap(page) \
+       ((uint32 *) PageGetContents(page))
+
+#define HashGetMaxBitmapSize(page) \
+       (PageGetPageSize((Page) page) - \
+        (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))
+
+#define HashPageGetMeta(page) \
+       ((HashMetaPage) PageGetContents(page))
  
  /*
   * The number of bits in an ovflpage bitmap word.
@@ -330,6 +340,11 @@ extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                                          uint32 highmask, uint32 lowmask);
  extern uint32 _hash_log2(uint32 num);
  extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
+extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
+extern IndexTuple _hash_form_tuple(Relation index,
+                                                                  Datum *values, bool *isnull);
+extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
+extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
  
  /* hash.c */
  extern void hash_redo(XLogRecPtr lsn, XLogRecord *record);
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h

index 76ff03064f0899ce1f68f884b7fe3ea1a73240f6..dc1c9a90c681ce915deea7b6d2ca5a2282d9a356 100644 (file)
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
   */
  
  /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     200809101
+#define CATALOG_VERSION_NO     200809151
  
  #endif
diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h

index 6b7440682e1ab3ec2ae8ef82239c3f48b50edf40..9eedbe4e62890117d0bb41db75535f2889ae9454 100644 (file)
--- a/src/include/catalog/pg_am.h
+++ b/src/include/catalog/pg_am.h
@@ -48,6 +48,7 @@ CATALOG(pg_am,2601)
         bool            amsearchnulls;  /* can AM search for NULL index entries? */
         bool            amstorage;              /* can storage type differ from column type? */
         bool            amclusterable;  /* does AM support cluster command? */
+       Oid                     amkeytype;              /* type of data in index, or InvalidOid */
         regproc         aminsert;               /* "insert this tuple" function */
         regproc         ambeginscan;    /* "start new scan" function */
         regproc         amgettuple;             /* "next valid tuple" function */
@@ -74,7 +75,7 @@ typedef FormData_pg_am *Form_pg_am;
   *             compiler constants for pg_am
   * ----------------
   */
-#define Natts_pg_am                                            24
+#define Natts_pg_am                                            25
  #define Anum_pg_am_amname                              1
  #define Anum_pg_am_amstrategies                        2
  #define Anum_pg_am_amsupport                   3
@@ -86,35 +87,36 @@ typedef FormData_pg_am *Form_pg_am;
  #define Anum_pg_am_amsearchnulls               9
  #define Anum_pg_am_amstorage                   10
  #define Anum_pg_am_amclusterable               11
-#define Anum_pg_am_aminsert                            12
-#define Anum_pg_am_ambeginscan                 13
-#define Anum_pg_am_amgettuple                  14
-#define Anum_pg_am_amgetbitmap                 15
-#define Anum_pg_am_amrescan                            16
-#define Anum_pg_am_amendscan                   17
-#define Anum_pg_am_ammarkpos                   18
-#define Anum_pg_am_amrestrpos                  19
-#define Anum_pg_am_ambuild                             20
-#define Anum_pg_am_ambulkdelete                        21
-#define Anum_pg_am_amvacuumcleanup             22
-#define Anum_pg_am_amcostestimate              23
-#define Anum_pg_am_amoptions                   24
+#define Anum_pg_am_amkeytype                   12
+#define Anum_pg_am_aminsert                            13
+#define Anum_pg_am_ambeginscan                 14
+#define Anum_pg_am_amgettuple                  15
+#define Anum_pg_am_amgetbitmap                 16
+#define Anum_pg_am_amrescan                            17
+#define Anum_pg_am_amendscan                   18
+#define Anum_pg_am_ammarkpos                   19
+#define Anum_pg_am_amrestrpos                  20
+#define Anum_pg_am_ambuild                             21
+#define Anum_pg_am_ambulkdelete                        22
+#define Anum_pg_am_amvacuumcleanup             23
+#define Anum_pg_am_amcostestimate              24
+#define Anum_pg_am_amoptions                   25
  
  /* ----------------
   *             initial contents of pg_am
   * ----------------
   */
  
-DATA(insert OID = 403 (  btree 5 1 t t t t t t f t btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
+DATA(insert OID = 403 (  btree 5 1 t t t t t t f t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
  DESCR("b-tree index access method");
  #define BTREE_AM_OID 403
-DATA(insert OID = 405 (  hash  1 1 f f f f f f f f hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
+DATA(insert OID = 405 (  hash  1 1 f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
  DESCR("hash index access method");
  #define HASH_AM_OID 405
-DATA(insert OID = 783 (  gist  0 7 f f t t t t t t gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
+DATA(insert OID = 783 (  gist  0 7 f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
  DESCR("GiST index access method");
  #define GIST_AM_OID 783
-DATA(insert OID = 2742 (  gin  0 5 f f t t f f t f gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
+DATA(insert OID = 2742 (  gin  0 5 f f t t f f t f 0 gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
  DESCR("GIN index access method");
  #define GIN_AM_OID 2742
  
diff --git a/src/include/catalog/pg_opclass.h b/src/include/catalog/pg_opclass.h

index 115069b4325cd6e6b3478a95b6f8d72b290a692a..9578c81034c3d701d388b6a8d09cf863030c40ce 100644 (file)
--- a/src/include/catalog/pg_opclass.h
+++ b/src/include/catalog/pg_opclass.h
@@ -123,13 +123,13 @@ DATA(insert (     403             macaddr_ops                     PGNSP PGUID 1984  829 t 0 ));
  DATA(insert (  405             macaddr_ops                     PGNSP PGUID 1985  829 t 0 ));
  /*
   * Here's an ugly little hack to save space in the system catalog indexes.
- * btree and hash don't ordinarily allow a storage type different from input
- * type; but cstring and name are the same thing except for trailing padding,
+ * btree doesn't ordinarily allow a storage type different from input type;
+ * but cstring and name are the same thing except for trailing padding,
   * and we can safely omit that within an index entry.  So we declare the
- * opclasses for name as using cstring storage type.
+ * btree opclass for name as using cstring storage type.
   */
  DATA(insert (  403             name_ops                        PGNSP PGUID 1986   19 t 2275 ));
-DATA(insert (  405             name_ops                        PGNSP PGUID 1987   19 t 2275 ));
+DATA(insert (  405             name_ops                        PGNSP PGUID 1987   19 t 0 ));
  DATA(insert (  403             numeric_ops                     PGNSP PGUID 1988 1700 t 0 ));
  DATA(insert (  405             numeric_ops                     PGNSP PGUID 1998 1700 t 0 ));
  DATA(insert OID = 1981 ( 403   oid_ops         PGNSP PGUID 1989   26 t 0 ));
author	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)
doc/src/sgml/catalogs.sgml		patch \| blob \| blame \| history
src/backend/access/hash/hash.c		patch \| blob \| blame \| history
src/backend/access/hash/hashinsert.c		patch \| blob \| blame \| history
src/backend/access/hash/hashovfl.c		patch \| blob \| blame \| history
src/backend/access/hash/hashpage.c		patch \| blob \| blame \| history
src/backend/access/hash/hashsearch.c		patch \| blob \| blame \| history
src/backend/access/hash/hashutil.c		patch \| blob \| blame \| history
src/backend/catalog/index.c		patch \| blob \| blame \| history
src/backend/utils/sort/tuplesort.c		patch \| blob \| blame \| history
src/include/access/hash.h		patch \| blob \| blame \| history
src/include/catalog/catversion.h		patch \| blob \| blame \| history
src/include/catalog/pg_am.h		patch \| blob \| blame \| history
src/include/catalog/pg_opclass.h		patch \| blob \| blame \| history