Change hash indexes to store only the hash code rather than the whole indexed
author Tom Lane <tgl@sss.pgh.pa.us>
Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)
value.  This means that hash index lookups are always lossy and have to be
rechecked when the heap is visited; however, the gain in index compactness
outweighs this when the indexed values are wide.  Also, we only need to
perform datatype comparisons when the hash codes match exactly, rather than
for every entry in the hash bucket; so it could also win for datatypes that
have expensive comparison functions.  A small additional win is gained by
keeping hash index pages sorted by hash code and using binary search to reduce
the number of index tuples we have to look at.

Xiao Meng

This commit also incorporates Zdenek Kotala's patch to isolate hash metapages
and hash bitmaps a bit better from the page header datastructures.

13 files changed:
doc/src/sgml/catalogs.sgml
src/backend/access/hash/hash.c
src/backend/access/hash/hashinsert.c
src/backend/access/hash/hashovfl.c
src/backend/access/hash/hashpage.c
src/backend/access/hash/hashsearch.c
src/backend/access/hash/hashutil.c
src/backend/catalog/index.c
src/backend/utils/sort/tuplesort.c
src/include/access/hash.h
src/include/catalog/catversion.h
src/include/catalog/pg_am.h
src/include/catalog/pg_opclass.h

index 947c470d26e44380f6bfa93b0fd28d494f3bb40f..40e046092c53370dba768ea65d0d3e36c0ef7c4c 100644 (file)
       <entry>Can an index of this type be clustered on?</entry>
      </row>
 
+     <row>
+      <entry><structfield>amkeytype</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-type"><structname>pg_type</structname></link>.oid</literal></entry>
+      <entry>Type of data stored in index, or zero if not a fixed type</entry>
+     </row>
+
      <row>
       <entry><structfield>aminsert</structfield></entry>
       <entry><type>regproc</type></entry>
      <row>
       <entry><structfield>sourceline</structfield></entry>
       <entry><type>text</type></entry>
-      <entry>Line number within the sourcefile the current value was set 
+      <entry>Line number within the sourcefile the current value was set
       from (NULL for values set in sources other than configuration files)
       </entry>
      </row>
index 6a5c0009dbb63be5740782a2ce3bb95a40859e92..67d0691a00fba1f8c334cee188aa42216ccc3911 100644 (file)
@@ -79,12 +79,12 @@ hashbuild(PG_FUNCTION_ARGS)
         * then we'll thrash horribly.  To prevent that scenario, we can sort the
         * tuples by (expected) bucket number.  However, such a sort is useless
         * overhead when the index does fit in RAM.  We choose to sort if the
-        * initial index size exceeds effective_cache_size.
+        * initial index size exceeds NBuffers.
         *
         * NOTE: this test will need adjustment if a bucket is ever different
         * from one page.
         */
-       if (num_buckets >= (uint32) effective_cache_size)
+       if (num_buckets >= (uint32) NBuffers)
                buildstate.spool = _h_spoolinit(index, num_buckets);
        else
                buildstate.spool = NULL;
@@ -129,7 +129,7 @@ hashbuildCallback(Relation index,
        IndexTuple      itup;
 
        /* form an index tuple and point it at the heap tuple */
-       itup = index_form_tuple(RelationGetDescr(index), values, isnull);
+       itup = _hash_form_tuple(index, values, isnull);
        itup->t_tid = htup->t_self;
 
        /* Hash indexes don't index nulls, see notes in hashinsert */
@@ -153,8 +153,8 @@ hashbuildCallback(Relation index,
 /*
  *     hashinsert() -- insert an index tuple into a hash table.
  *
- *     Hash on the index tuple's key, find the appropriate location
- *     for the new tuple, and put it there.
+ *     Hash on the heap tuple's key, form an index tuple with hash code.
+ *     Find the appropriate location for the new tuple, and put it there.
  */
 Datum
 hashinsert(PG_FUNCTION_ARGS)
@@ -171,7 +171,7 @@ hashinsert(PG_FUNCTION_ARGS)
        IndexTuple      itup;
 
        /* generate an index tuple */
-       itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
+       itup = _hash_form_tuple(rel, values, isnull);
        itup->t_tid = *ht_ctid;
 
        /*
@@ -211,8 +211,8 @@ hashgettuple(PG_FUNCTION_ARGS)
        OffsetNumber offnum;
        bool            res;
 
-       /* Hash indexes are never lossy (at the moment anyway) */
-       scan->xs_recheck = false;
+       /* Hash indexes are always lossy since we store only the hash code */
+       scan->xs_recheck = true;
 
        /*
         * We hold pin but not lock on current buffer while outside the hash AM.
@@ -317,7 +317,8 @@ hashgetbitmap(PG_FUNCTION_ARGS)
                /* Save tuple ID, and continue scanning */
                if (add_tuple) 
                {
-                       tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, false);
+                       /* Note we mark the tuple ID as requiring recheck */
+                       tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, true);
                        ntids++;
                }
 
@@ -527,7 +528,7 @@ hashbulkdelete(PG_FUNCTION_ARGS)
         * each bucket.
         */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap =  HashPageGetMeta(BufferGetPage(metabuf));
        orig_maxbucket = metap->hashm_maxbucket;
        orig_ntuples = metap->hashm_ntuples;
        memcpy(&local_metapage, metap, sizeof(local_metapage));
@@ -629,7 +630,7 @@ loop_top:
 
        /* Write-lock metapage and check for split since we started */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        if (cur_maxbucket != metap->hashm_maxbucket)
        {
index 3eb226a8a5473cd06cb22c15b0fc4ac5066db50a..0048c3ce23a041fc69f1b2904111c188a76a315a 100644 (file)
@@ -43,18 +43,11 @@ _hash_doinsert(Relation rel, IndexTuple itup)
        bool            do_expand;
        uint32          hashkey;
        Bucket          bucket;
-       Datum           datum;
-       bool            isnull;
 
        /*
-        * Compute the hash key for the item.  We do this first so as not to need
-        * to hold any locks while running the hash function.
+        * Get the hash key for the item (it's stored in the index tuple itself).
         */
-       if (rel->rd_rel->relnatts != 1)
-               elog(ERROR, "hash indexes support only one index key");
-       datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
-       Assert(!isnull);
-       hashkey = _hash_datum2hashkey(rel, datum);
+       hashkey = _hash_get_indextuple_hashkey(itup);
 
        /* compute item size too */
        itemsz = IndexTupleDSize(*itup);
@@ -69,12 +62,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 
        /* Read the metapage */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /*
         * Check whether the item can fit on a hash page at all. (Eventually, we
         * ought to try to apply TOAST methods if not.)  Note that at this point,
         * itemsz doesn't include the ItemId.
+        *
+        * XXX this is useless code if we are only storing hash keys.
         */
        if (itemsz > HashMaxItemSize((Page) metap))
                ereport(ERROR,
@@ -197,11 +192,15 @@ _hash_pgaddtup(Relation rel,
 {
        OffsetNumber itup_off;
        Page            page;
+       uint32          hashkey;
 
        _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
        page = BufferGetPage(buf);
 
-       itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+       /* Find where to insert the tuple (preserving page's hashkey ordering) */
+       hashkey = _hash_get_indextuple_hashkey(itup);
+       itup_off = _hash_binsearch(page, hashkey);
+
        if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
index cc10135d80fd160c803932f4f6fa96f75e6f6195..e17ae173e8ab85ad6d6ac7c8f6cb12d1d7903a33 100644 (file)
@@ -187,7 +187,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 
        _hash_checkpage(rel, metabuf, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /* start search at hashm_firstfree */
        orig_firstfree = metap->hashm_firstfree;
@@ -450,7 +450,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
 
        /* Read the metapage so we can determine which bitmap page to use */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /* Identify which bit to set */
        ovflbitno = blkno_to_bitno(metap, ovflblkno);
index ba6177c18ab043f58dbf374f9377312d5f7c831b..431bb2e204e91a6a6f5d29fe866410422b31c4d1 100644 (file)
@@ -348,11 +348,9 @@ _hash_metapinit(Relation rel, double num_tuples)
         * Determine the target fill factor (in tuples per bucket) for this index.
         * The idea is to make the fill factor correspond to pages about as full
         * as the user-settable fillfactor parameter says.      We can compute it
-        * exactly if the index datatype is fixed-width, but for var-width there's
-        * some guessing involved.
+        * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
         */
-       data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
-                                                                RelationGetDescr(rel)->attrs[0]->atttypmod);
+       data_width = sizeof(uint32);
        item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
                sizeof(ItemIdData);             /* include the line pointer */
        ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
@@ -395,20 +393,18 @@ _hash_metapinit(Relation rel, double num_tuples)
        pageopaque->hasho_flag = LH_META_PAGE;
        pageopaque->hasho_page_id = HASHO_PAGE_ID;
 
-       metap = (HashMetaPage) pg;
+       metap = HashPageGetMeta(pg);
 
        metap->hashm_magic = HASH_MAGIC;
        metap->hashm_version = HASH_VERSION;
        metap->hashm_ntuples = 0;
        metap->hashm_nmaps = 0;
        metap->hashm_ffactor = ffactor;
-       metap->hashm_bsize = BufferGetPageSize(metabuf);
+       metap->hashm_bsize = HashGetMaxBitmapSize(pg);
        /* find largest bitmap array size that will fit in page size */
        for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
        {
-               if ((1 << i) <= (metap->hashm_bsize -
-                                                (MAXALIGN(sizeof(PageHeaderData)) +
-                                                 MAXALIGN(sizeof(HashPageOpaqueData)))))
+               if ((1 << i) <= metap->hashm_bsize)
                        break;
        }
        Assert(i > 0);
@@ -532,7 +528,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 
        _hash_checkpage(rel, metabuf, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /*
         * Check to see if split is still needed; someone else might have already
@@ -774,8 +770,6 @@ _hash_splitbucket(Relation rel,
        Buffer          nbuf;
        BlockNumber oblkno;
        BlockNumber nblkno;
-       bool            null;
-       Datum           datum;
        HashPageOpaque oopaque;
        HashPageOpaque nopaque;
        IndexTuple      itup;
@@ -785,7 +779,6 @@ _hash_splitbucket(Relation rel,
        OffsetNumber omaxoffnum;
        Page            opage;
        Page            npage;
-       TupleDesc       itupdesc = RelationGetDescr(rel);
 
        /*
         * It should be okay to simultaneously write-lock pages from each bucket,
@@ -846,16 +839,11 @@ _hash_splitbucket(Relation rel,
                }
 
                /*
-                * Re-hash the tuple to determine which bucket it now belongs in.
-                *
-                * It is annoying to call the hash function while holding locks, but
-                * releasing and relocking the page for each tuple is unappealing too.
+                * Fetch the item's hash key (conveniently stored in the item)
+                * and determine which bucket it now belongs in.
                 */
                itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
-               datum = index_getattr(itup, 1, itupdesc, &null);
-               Assert(!null);
-
-               bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
+               bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                                                          maxbucket, highmask, lowmask);
 
                if (bucket == nbucket)
index 258526be739ebf826477eb681fd96096da424974..8b77d2cc5335958a59a864d8530de58ed6bcb7dc 100644 (file)
@@ -178,6 +178,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
                hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
                                                                                   cur->sk_subtype);
 
+       so->hashso_sk_hash = hashkey;
+
        /*
         * Acquire shared split lock so we can compute the target bucket safely
         * (see README).
@@ -186,7 +188,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 
        /* Read the metapage */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /*
         * Compute the target bucket number, and convert to block number.
@@ -284,7 +286,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                offnum = InvalidOffsetNumber;
 
        /*
-        * 'offnum' now points to the last tuple we have seen (if any).
+        * 'offnum' now points to the last tuple we examined (if any).
         *
         * continue to step through tuples until: 1) we get to the end of the
         * bucket chain or 2) we find a valid tuple.
@@ -297,25 +299,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                                if (offnum != InvalidOffsetNumber)
                                        offnum = OffsetNumberNext(offnum);      /* move forward */
                                else
-                                       offnum = FirstOffsetNumber; /* new page */
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch(page, so->hashso_sk_hash);
+                               }
 
-                               while (offnum > maxoff)
+                               for (;;)
                                {
                                        /*
-                                        * either this page is empty (maxoff ==
-                                        * InvalidOffsetNumber) or we ran off the end.
+                                        * check if we're still in the range of items with
+                                        * the target hash key
+                                        */
+                                       if (offnum <= maxoff)
+                                       {
+                                               Assert(offnum >= FirstOffsetNumber);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;                          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * ran off the end of this page, try the next
                                         */
                                        _hash_readnext(rel, &buf, &page, &opaque);
                                        if (BufferIsValid(buf))
                                        {
                                                maxoff = PageGetMaxOffsetNumber(page);
-                                               offnum = FirstOffsetNumber;
+                                               offnum = _hash_binsearch(page, so->hashso_sk_hash);
                                        }
                                        else
                                        {
                                                /* end of bucket */
-                                               maxoff = offnum = InvalidOffsetNumber;
-                                               break;  /* exit while */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
                                        }
                                }
                                break;
@@ -324,22 +340,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                                if (offnum != InvalidOffsetNumber)
                                        offnum = OffsetNumberPrev(offnum);      /* move back */
                                else
-                                       offnum = maxoff;        /* new page */
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+                               }
 
-                               while (offnum < FirstOffsetNumber)
+                               for (;;)
                                {
                                        /*
-                                        * either this page is empty (offnum ==
-                                        * InvalidOffsetNumber) or we ran off the end.
+                                        * check if we're still in the range of items with
+                                        * the target hash key
+                                        */
+                                       if (offnum >= FirstOffsetNumber)
+                                       {
+                                               Assert(offnum <= maxoff);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;                          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * ran off the end of this page, try the next
                                         */
                                        _hash_readprev(rel, &buf, &page, &opaque);
                                        if (BufferIsValid(buf))
-                                               maxoff = offnum = PageGetMaxOffsetNumber(page);
+                                       {
+                                               maxoff = PageGetMaxOffsetNumber(page);
+                                               offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+                                       }
                                        else
                                        {
                                                /* end of bucket */
-                                               maxoff = offnum = InvalidOffsetNumber;
-                                               break;  /* exit while */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
                                        }
                                }
                                break;
@@ -347,19 +380,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                        default:
                                /* NoMovementScanDirection */
                                /* this should not be reached */
+                               itup = NULL;
                                break;
                }
 
-               /* we ran off the end of the world without finding a match */
-               if (offnum == InvalidOffsetNumber)
+               if (itup == NULL)
                {
+                       /* we ran off the end of the bucket without finding a match */
                        *bufP = so->hashso_curbuf = InvalidBuffer;
                        ItemPointerSetInvalid(current);
                        return false;
                }
 
-               /* get ready to check this tuple */
-               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+               /* check the tuple quals, loop around if not met */
        } while (!_hash_checkqual(scan, itup));
 
        /* if we made it to here, we've found a valid tuple */
index 6829097afea45b960b21d8bca922ddb0895f16ae..e8a3b88029237510dc4a8043a58a7118b135e719 100644 (file)
 bool
 _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
 {
+       /*
+        * Currently, we can't check any of the scan conditions since we do
+        * not have the original index entry value to supply to the sk_func.
+        * Always return true; we expect that hashgettuple already set the
+        * recheck flag to make the main indexscan code do it.
+        */
+#ifdef NOT_USED
        TupleDesc       tupdesc = RelationGetDescr(scan->indexRelation);
        ScanKey         key = scan->keyData;
        int                     scanKeySize = scan->numberOfKeys;
+#endif
 
        IncrIndexProcessed();
 
+#ifdef NOT_USED
        while (scanKeySize > 0)
        {
                Datum           datum;
@@ -59,6 +68,7 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
                key++;
                scanKeySize--;
        }
+#endif
 
        return true;
 }
@@ -190,7 +200,7 @@ _hash_checkpage(Relation rel, Buffer buf, int flags)
         */
        if (flags == LH_META_PAGE)
        {
-               HashMetaPage metap = (HashMetaPage) page;
+               HashMetaPage metap = HashPageGetMeta(page);
 
                if (metap->hashm_magic != HASH_MAGIC)
                        ereport(ERROR,
@@ -221,3 +231,123 @@ hashoptions(PG_FUNCTION_ARGS)
                PG_RETURN_BYTEA_P(result);
        PG_RETURN_NULL();
 }
+
+/*
+ * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
+ */
+uint32
+_hash_get_indextuple_hashkey(IndexTuple itup)
+{
+       char       *attp;
+
+       /*
+        * We assume the hash key is the first attribute and can't be null,
+        * so this can be done crudely but very very cheaply ...
+        */
+       attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
+       return *((uint32 *) attp);
+}
+
+/*
+ * _hash_form_tuple - form an index tuple containing hash code only
+ */
+IndexTuple
+_hash_form_tuple(Relation index, Datum *values, bool *isnull)
+{
+       IndexTuple              itup;
+       uint32                  hashkey;
+       Datum                   hashkeydatum;
+       TupleDesc               hashdesc;
+
+       if (isnull[0])
+               hashkeydatum = (Datum) 0;
+       else
+       {
+               hashkey = _hash_datum2hashkey(index, values[0]);
+               hashkeydatum = UInt32GetDatum(hashkey);
+       }
+       hashdesc = RelationGetDescr(index);
+       Assert(hashdesc->natts == 1);
+       itup = index_form_tuple(hashdesc, &hashkeydatum, isnull);
+       return itup;
+}
+
+/*
+ * _hash_binsearch - Return the offset number in the page where the
+ *                                      specified hash value should be sought or inserted.
+ *
+ * We use binary search, relying on the assumption that the existing entries
+ * are ordered by hash key.
+ *
+ * Returns the offset of the first index entry having hashkey >= hash_value,
+ * or the page's max offset plus one if hash_value is greater than all
+ * existing hash keys in the page.  This is the appropriate place to start
+ * a search, or to insert a new item.
+ */
+OffsetNumber
+_hash_binsearch(Page page, uint32 hash_value)
+{
+       OffsetNumber    upper;
+       OffsetNumber    lower;
+
+       /* Loop invariant: lower <= desired place <= upper */
+       upper = PageGetMaxOffsetNumber(page) + 1;
+       lower = FirstOffsetNumber;
+
+       while (upper > lower)
+       {
+               OffsetNumber    off;
+               IndexTuple              itup;
+               uint32                  hashkey;
+
+               off = (upper + lower) / 2;
+               Assert(OffsetNumberIsValid(off));
+
+               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+               hashkey = _hash_get_indextuple_hashkey(itup);
+               if (hashkey < hash_value)
+                       lower = off + 1;
+               else
+                       upper = off;
+       }
+
+       return lower;
+}
+
+/*
+ * _hash_binsearch_last
+ *
+ * Same as above, except that if there are multiple matching items in the
+ * page, we return the offset of the last one instead of the first one,
+ * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
+ * This is handy for starting a new page in a backwards scan.
+ */
+OffsetNumber
+_hash_binsearch_last(Page page, uint32 hash_value)
+{
+       OffsetNumber    upper;
+       OffsetNumber    lower;
+
+       /* Loop invariant: lower <= desired place <= upper */
+       upper = PageGetMaxOffsetNumber(page);
+       lower = FirstOffsetNumber - 1;
+
+       while (upper > lower)
+       {
+               IndexTuple              itup;
+               OffsetNumber    off;
+               uint32                  hashkey;
+
+               off = (upper + lower + 1) / 2;
+               Assert(OffsetNumberIsValid(off));
+
+               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+               hashkey = _hash_get_indextuple_hashkey(itup);
+               if (hashkey > hash_value)
+                       upper = off - 1;
+               else
+                       lower = off;
+       }
+
+       return lower;
+}
index 34ce0620104c64c5261b33278b23c5e1c812fa45..5986cb45cb87e12d556e0d8dfaeee0fe4bd5e9cf 100644 (file)
@@ -76,6 +76,7 @@ typedef struct
 /* non-export function prototypes */
 static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
                                                 IndexInfo *indexInfo,
+                                                Oid accessMethodObjectId,
                                                 Oid *classObjectId);
 static void InitializeAttributeOids(Relation indexRelation,
                                                int numatts, Oid indexoid);
@@ -105,15 +106,28 @@ static Oid        IndexGetRelation(Oid indexId);
 static TupleDesc
 ConstructTupleDescriptor(Relation heapRelation,
                                                 IndexInfo *indexInfo,
+                                                Oid accessMethodObjectId,
                                                 Oid *classObjectId)
 {
        int                     numatts = indexInfo->ii_NumIndexAttrs;
        ListCell   *indexpr_item = list_head(indexInfo->ii_Expressions);
+       HeapTuple       amtuple;
+       Form_pg_am      amform;
        TupleDesc       heapTupDesc;
        TupleDesc       indexTupDesc;
        int                     natts;                  /* #atts in heap rel --- for error checks */
        int                     i;
 
+       /* We need access to the index AM's pg_am tuple */
+       amtuple = SearchSysCache(AMOID,
+                                                        ObjectIdGetDatum(accessMethodObjectId),
+                                                        0, 0, 0);
+       if (!HeapTupleIsValid(amtuple))
+               elog(ERROR, "cache lookup failed for access method %u",
+                        accessMethodObjectId);
+       amform = (Form_pg_am) GETSTRUCT(amtuple);
+
+       /* ... and to the table's tuple descriptor */
        heapTupDesc = RelationGetDescr(heapRelation);
        natts = RelationGetForm(heapRelation)->relnatts;
 
@@ -133,6 +147,7 @@ ConstructTupleDescriptor(Relation heapRelation,
                Form_pg_attribute to = indexTupDesc->attrs[i];
                HeapTuple       tuple;
                Form_pg_type typeTup;
+               Form_pg_opclass opclassTup;
                Oid                     keyType;
 
                if (atnum != 0)
@@ -231,8 +246,8 @@ ConstructTupleDescriptor(Relation heapRelation,
                to->attrelid = InvalidOid;
 
                /*
-                * Check the opclass to see if it provides a keytype (overriding the
-                * attribute type).
+                * Check the opclass and index AM to see if either provides a keytype
+                * (overriding the attribute type).  Opclass takes precedence.
                 */
                tuple = SearchSysCache(CLAOID,
                                                           ObjectIdGetDatum(classObjectId[i]),
@@ -240,7 +255,11 @@ ConstructTupleDescriptor(Relation heapRelation,
                if (!HeapTupleIsValid(tuple))
                        elog(ERROR, "cache lookup failed for opclass %u",
                                 classObjectId[i]);
-               keyType = ((Form_pg_opclass) GETSTRUCT(tuple))->opckeytype;
+               opclassTup = (Form_pg_opclass) GETSTRUCT(tuple);
+               if (OidIsValid(opclassTup->opckeytype))
+                       keyType = opclassTup->opckeytype;
+               else
+                       keyType = amform->amkeytype;
                ReleaseSysCache(tuple);
 
                if (OidIsValid(keyType) && keyType != to->atttypid)
@@ -264,6 +283,8 @@ ConstructTupleDescriptor(Relation heapRelation,
                }
        }
 
+       ReleaseSysCache(amtuple);
+
        return indexTupDesc;
 }
 
@@ -577,6 +598,7 @@ index_create(Oid heapRelationId,
         */
        indexTupDesc = ConstructTupleDescriptor(heapRelation,
                                                                                        indexInfo,
+                                                                                       accessMethodObjectId,
                                                                                        classObjectId);
 
        /*
index 35e90c3abe7dcda0e2fabda45e70b8f0bc918e32..6b7c8868cc79d4ff5deef1332e47e0ba5f8e1e81 100644 (file)
 #include <limits.h>
 
 #include "access/genam.h"
-#include "access/hash.h"
 #include "access/nbtree.h"
 #include "catalog/pg_amop.h"
 #include "catalog/pg_operator.h"
@@ -353,7 +352,6 @@ struct Tuplesortstate
        bool            enforceUnique;  /* complain if we find duplicate tuples */
 
        /* These are specific to the index_hash subcase: */
-       FmgrInfo   *hash_proc;          /* call info for the hash function */
        uint32          hash_mask;              /* mask for sortable part of hash code */
 
        /*
@@ -689,13 +687,6 @@ tuplesort_begin_index_hash(Relation indexRel,
 
        state->indexRel = indexRel;
 
-       /*
-        * We look up the index column's hash function just once, to avoid
-        * chewing lots of cycles in repeated index_getprocinfo calls.  This
-        * assumes that our caller holds the index relation open throughout the
-        * sort, else the pointer obtained here might cease to be valid.
-        */
-       state->hash_proc = index_getprocinfo(indexRel, 1, HASHPROC);
        state->hash_mask = hash_mask;
 
        MemoryContextSwitchTo(oldcontext);
@@ -2821,11 +2812,6 @@ static int
 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
                                          Tuplesortstate *state)
 {
-       /*
-        * It's slightly annoying to redo the hash function each time, although
-        * most hash functions ought to be cheap.  Is it worth having a variant
-        * tuple storage format so we can store the hash code?
-        */
        uint32          hash1;
        uint32          hash2;
        IndexTuple      tuple1;
@@ -2834,13 +2820,14 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
        /* Allow interrupting long sorts */
        CHECK_FOR_INTERRUPTS();
 
-       /* Compute hash codes and mask off bits we don't want to sort by */
+       /*
+        * Fetch hash keys and mask off bits we don't want to sort by.
+        * We know that the first column of the index tuple is the hash key.
+        */
        Assert(!a->isnull1);
-       hash1 = DatumGetUInt32(FunctionCall1(state->hash_proc, a->datum1))
-               & state->hash_mask;
+       hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
        Assert(!b->isnull1);
-       hash2 = DatumGetUInt32(FunctionCall1(state->hash_proc, b->datum1))
-               & state->hash_mask;
+       hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
 
        if (hash1 > hash2)
                return 1;
index 1e2b9887d81dee2d6a1564e06b75ef71f26cf482..bd4ec10db807e1a0904def6a30ed4925c811799a 100644 (file)
@@ -75,6 +75,9 @@ typedef HashPageOpaqueData *HashPageOpaque;
  */
 typedef struct HashScanOpaqueData
 {
+       /* Hash value of the scan key, ie, the hash key we seek */
+       uint32          hashso_sk_hash;
+
        /*
         * By definition, a hash scan should be examining only one bucket. We
         * record the bucket number here as soon as it is known.
@@ -111,7 +114,7 @@ typedef HashScanOpaqueData *HashScanOpaque;
 #define HASH_METAPAGE  0               /* metapage is always block 0 */
 
 #define HASH_MAGIC             0x6440640
-#define HASH_VERSION   1               /* new for Pg 7.4 */
+#define HASH_VERSION   2               /* 2 signifies only hash key value is stored */
 
 /*
  * Spares[] holds the number of overflow pages currently allocated at or
@@ -138,7 +141,6 @@ typedef HashScanOpaqueData *HashScanOpaque;
 
 typedef struct HashMetaPageData
 {
-       PageHeaderData hashm_phdr;      /* pad for page header (do not use) */
        uint32          hashm_magic;    /* magic no. for hash tables */
        uint32          hashm_version;  /* version ID */
        double          hashm_ntuples;  /* number of tuples stored in the table */
@@ -191,8 +193,16 @@ typedef HashMetaPageData *HashMetaPage;
 #define BMPGSZ_BIT(metap)              ((metap)->hashm_bmsize << BYTE_TO_BIT)
 #define BMPG_SHIFT(metap)              ((metap)->hashm_bmshift)
 #define BMPG_MASK(metap)               (BMPGSZ_BIT(metap) - 1)
-#define HashPageGetBitmap(pg) \
-       ((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData))))
+
+#define HashPageGetBitmap(page) \
+       ((uint32 *) PageGetContents(page))
+
+#define HashGetMaxBitmapSize(page) \
+       (PageGetPageSize((Page) (page)) - \
+        (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))
+
+#define HashPageGetMeta(page) \
+       ((HashMetaPage) PageGetContents(page))
 
 /*
  * The number of bits in an ovflpage bitmap word.
@@ -330,6 +340,11 @@ extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                                         uint32 highmask, uint32 lowmask);
 extern uint32 _hash_log2(uint32 num);
 extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
+extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
+extern IndexTuple _hash_form_tuple(Relation index,
+                                                                  Datum *values, bool *isnull);
+extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
+extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
 
 /* hash.c */
 extern void hash_redo(XLogRecPtr lsn, XLogRecord *record);
index 76ff03064f0899ce1f68f884b7fe3ea1a73240f6..dc1c9a90c681ce915deea7b6d2ca5a2282d9a356 100644 (file)
@@ -53,6 +53,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     200809101
+#define CATALOG_VERSION_NO     200809151
 
 #endif
index 6b7440682e1ab3ec2ae8ef82239c3f48b50edf40..9eedbe4e62890117d0bb41db75535f2889ae9454 100644 (file)
@@ -48,6 +48,7 @@ CATALOG(pg_am,2601)
        bool            amsearchnulls;  /* can AM search for NULL index entries? */
        bool            amstorage;              /* can storage type differ from column type? */
        bool            amclusterable;  /* does AM support cluster command? */
+       Oid                     amkeytype;              /* type of data in index, or InvalidOid */
        regproc         aminsert;               /* "insert this tuple" function */
        regproc         ambeginscan;    /* "start new scan" function */
        regproc         amgettuple;             /* "next valid tuple" function */
@@ -74,7 +75,7 @@ typedef FormData_pg_am *Form_pg_am;
  *             compiler constants for pg_am
  * ----------------
  */
-#define Natts_pg_am                                            24
+#define Natts_pg_am                                            25
 #define Anum_pg_am_amname                              1
 #define Anum_pg_am_amstrategies                        2
 #define Anum_pg_am_amsupport                   3
@@ -86,35 +87,36 @@ typedef FormData_pg_am *Form_pg_am;
 #define Anum_pg_am_amsearchnulls               9
 #define Anum_pg_am_amstorage                   10
 #define Anum_pg_am_amclusterable               11
-#define Anum_pg_am_aminsert                            12
-#define Anum_pg_am_ambeginscan                 13
-#define Anum_pg_am_amgettuple                  14
-#define Anum_pg_am_amgetbitmap                 15
-#define Anum_pg_am_amrescan                            16
-#define Anum_pg_am_amendscan                   17
-#define Anum_pg_am_ammarkpos                   18
-#define Anum_pg_am_amrestrpos                  19
-#define Anum_pg_am_ambuild                             20
-#define Anum_pg_am_ambulkdelete                        21
-#define Anum_pg_am_amvacuumcleanup             22
-#define Anum_pg_am_amcostestimate              23
-#define Anum_pg_am_amoptions                   24
+#define Anum_pg_am_amkeytype                   12
+#define Anum_pg_am_aminsert                            13
+#define Anum_pg_am_ambeginscan                 14
+#define Anum_pg_am_amgettuple                  15
+#define Anum_pg_am_amgetbitmap                 16
+#define Anum_pg_am_amrescan                            17
+#define Anum_pg_am_amendscan                   18
+#define Anum_pg_am_ammarkpos                   19
+#define Anum_pg_am_amrestrpos                  20
+#define Anum_pg_am_ambuild                             21
+#define Anum_pg_am_ambulkdelete                        22
+#define Anum_pg_am_amvacuumcleanup             23
+#define Anum_pg_am_amcostestimate              24
+#define Anum_pg_am_amoptions                   25
 
 /* ----------------
  *             initial contents of pg_am
  * ----------------
  */
 
-DATA(insert OID = 403 (  btree 5 1 t t t t t t f t btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
+DATA(insert OID = 403 (  btree 5 1 t t t t t t f t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
 DESCR("b-tree index access method");
 #define BTREE_AM_OID 403
-DATA(insert OID = 405 (  hash  1 1 f f f f f f f f hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
+DATA(insert OID = 405 (  hash  1 1 f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
 DESCR("hash index access method");
 #define HASH_AM_OID 405
-DATA(insert OID = 783 (  gist  0 7 f f t t t t t t gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
+DATA(insert OID = 783 (  gist  0 7 f f t t t t t t 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
 DESCR("GiST index access method");
 #define GIST_AM_OID 783
-DATA(insert OID = 2742 (  gin  0 5 f f t t f f t f gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
+DATA(insert OID = 2742 (  gin  0 5 f f t t f f t f 0 gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
 DESCR("GIN index access method");
 #define GIN_AM_OID 2742
 
index 115069b4325cd6e6b3478a95b6f8d72b290a692a..9578c81034c3d701d388b6a8d09cf863030c40ce 100644 (file)
@@ -123,13 +123,13 @@ DATA(insert (     403             macaddr_ops                     PGNSP PGUID 1984  829 t 0 ));
 DATA(insert (  405             macaddr_ops                     PGNSP PGUID 1985  829 t 0 ));
 /*
  * Here's an ugly little hack to save space in the system catalog indexes.
- * btree and hash don't ordinarily allow a storage type different from input
- * type; but cstring and name are the same thing except for trailing padding,
+ * btree doesn't ordinarily allow a storage type different from input type;
+ * but cstring and name are the same thing except for trailing padding,
  * and we can safely omit that within an index entry.  So we declare the
- * opclasses for name as using cstring storage type.
+ * btree opclass for name as using cstring storage type.
  */
 DATA(insert (  403             name_ops                        PGNSP PGUID 1986   19 t 2275 ));
-DATA(insert (  405             name_ops                        PGNSP PGUID 1987   19 t 2275 ));
+DATA(insert (  405             name_ops                        PGNSP PGUID 1987   19 t 0 ));
 DATA(insert (  403             numeric_ops                     PGNSP PGUID 1988 1700 t 0 ));
 DATA(insert (  405             numeric_ops                     PGNSP PGUID 1998 1700 t 0 ));
 DATA(insert OID = 1981 ( 403   oid_ops         PGNSP PGUID 1989   26 t 0 ));