author     Tom Lane <tgl@sss.pgh.pa.us>
           Thu, 20 Sep 2007 17:56:33 +0000 (17:56 +0000)
committer  Tom Lane <tgl@sss.pgh.pa.us>
           Thu, 20 Sep 2007 17:56:33 +0000 (17:56 +0000)

HOT updates.  When we update a tuple without changing any of its indexed
columns, and the new version can be stored on the same heap page, we no longer
generate extra index entries for the new version.  Instead, index searches
follow the HOT-chain links to ensure they find the correct tuple version.

In addition, this patch introduces the ability to "prune" dead tuples on a
per-page basis, without having to do a complete VACUUM pass to recover space.
VACUUM is still needed to clean up dead index entries, however.

Pavan Deolasee, with help from a bunch of other people.

65 files changed:
contrib/pgstattuple/pgstattuple.c
doc/src/sgml/catalogs.sgml
doc/src/sgml/monitoring.sgml
doc/src/sgml/ref/create_index.sgml
src/backend/access/gin/ginentrypage.c
src/backend/access/gin/ginvacuum.c
src/backend/access/gin/ginxlog.c
src/backend/access/gist/gist.c
src/backend/access/gist/gistutil.c
src/backend/access/gist/gistvacuum.c
src/backend/access/hash/hashinsert.c
src/backend/access/hash/hashovfl.c
src/backend/access/hash/hashpage.c
src/backend/access/heap/Makefile
src/backend/access/heap/README.HOT [new file with mode: 0644]
src/backend/access/heap/heapam.c
src/backend/access/heap/hio.c
src/backend/access/heap/pruneheap.c [new file with mode: 0644]
src/backend/access/heap/rewriteheap.c
src/backend/access/index/genam.c
src/backend/access/index/indexam.c
src/backend/access/nbtree/nbtinsert.c
src/backend/access/nbtree/nbtsort.c
src/backend/access/nbtree/nbtxlog.c
src/backend/catalog/index.c
src/backend/catalog/indexing.c
src/backend/catalog/system_views.sql
src/backend/catalog/toasting.c
src/backend/commands/indexcmds.c
src/backend/commands/sequence.c
src/backend/commands/vacuum.c
src/backend/commands/vacuumlazy.c
src/backend/executor/execMain.c
src/backend/executor/execUtils.c
src/backend/executor/nodeBitmapHeapscan.c
src/backend/executor/spi.c
src/backend/nodes/tidbitmap.c
src/backend/optimizer/plan/planner.c
src/backend/optimizer/util/plancat.c
src/backend/optimizer/util/var.c
src/backend/postmaster/pgstat.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/page/bufpage.c
src/backend/utils/adt/pgstatfuncs.c
src/backend/utils/cache/plancache.c
src/backend/utils/cache/relcache.c
src/include/access/heapam.h
src/include/access/htup.h
src/include/access/relscan.h
src/include/catalog/catversion.h
src/include/catalog/pg_attribute.h
src/include/catalog/pg_index.h
src/include/catalog/pg_proc.h
src/include/nodes/execnodes.h
src/include/nodes/plannodes.h
src/include/nodes/relation.h
src/include/optimizer/var.h
src/include/pgstat.h
src/include/storage/bufmgr.h
src/include/storage/bufpage.h
src/include/utils/plancache.h
src/include/utils/rel.h
src/include/utils/relcache.h
src/test/regress/expected/create_index.out
src/test/regress/expected/rules.out
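
Condensed from the heap_update() hunks at the end of this patch, the new
decision logic amounts to roughly the following (locking, WAL logging, and
TOAST handling omitted); hot_attrs is the indexed-column bitmap obtained
from RelationGetIndexAttrBitmap():

    if (newbuf == buffer)
    {
        /*
         * New version fits on the same page: HOT is possible only if no
         * indexed column changed.
         */
        if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup))
            use_hot_update = true;
    }
    else
    {
        /* Old page had no room; hint that it could use prune/defrag. */
        PageSetFull(dp);
    }

    if (use_hot_update)
    {
        HeapTupleSetHotUpdated(&oldtup);    /* old version: HEAP_HOT_UPDATED */
        HeapTupleSetHeapOnly(heaptup);      /* new version: HEAP_ONLY_TUPLE */
        HeapTupleSetHeapOnly(newtup);       /* keep caller's copy in sync */
    }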

diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c
index 126d0bcdac63e1deb40ca487f187366e80404333..83546d57c38c7df53928cf94db599c830af30884 100644
@@ -290,7 +290,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
                {
                        buffer = ReadBuffer(rel, block);
                        LockBuffer(buffer, BUFFER_LOCK_SHARE);
-                       stat.free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
+                       stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer));
                        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                        ReleaseBuffer(buffer);
                        block++;
@@ -301,7 +301,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
        while (block < nblocks)
        {
                buffer = ReadBuffer(rel, block);
-               stat.free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
+               stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer));
                ReleaseBuffer(buffer);
                block++;
        }
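
PageGetHeapFreeSpace() is new with this patch (bufpage.c is in the file list
above); its exact implementation is not shown in this excerpt.  Based on the
MaxHeapTuplesPerPage cap discussed in README.HOT below, a simplified sketch
of its intent is:

    /*
     * Simplified sketch only -- the real function lives in bufpage.c and
     * treats reusable line pointers more carefully.
     */
    Size
    PageGetHeapFreeSpaceSketch(Page page)
    {
        Size        space = PageGetFreeSpace(page);

        /*
         * A heap page is considered full once it already carries the maximum
         * number of line pointers, even if byte-wise free space remains, so
         * that pruning cannot bloat the line pointer array without bound.
         */
        if (space > 0 &&
            PageGetMaxOffsetNumber(page) >= MaxHeapTuplesPerPage)
            space = 0;

        return space;
    }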
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 94e5a5f4cc641ed76610ef7df1c4861137af4a69..e11e870e38d76e84852b1f63dbf0d7fd7aec5b12 100644
       </entry>
      </row>
 
+     <row>
+      <entry><structfield>indcheckxmin</structfield></entry>
+      <entry><type>bool</type></entry>
+      <entry></entry>
+      <entry>
+       If true, queries must not use the index until the <structfield>xmin</>
+       of this <structname>pg_index</> row is below their TransactionXmin
+       event horizon, because the table may contain broken HOT chains with
+       incompatible rows that they can see
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>indisready</structfield></entry>
+      <entry><type>bool</type></entry>
+      <entry></entry>
+      <entry>
+       If true, the index is currently ready for inserts.  False means the
+       index must be ignored by <command>INSERT</>/<command>UPDATE</>
+       operations
+      </entry>
+     </row>
+
      <row>
       <entry><structfield>indkey</structfield></entry>
       <entry><type>int2vector</type></entry>
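
The usability rule implied by indcheckxmin (and spelled out in README.HOT's
CREATE INDEX section further down) can be sketched as follows.  This is
illustrative only, not the planner code this patch actually adds:

    /*
     * Illustrative sketch: may the current transaction use this index,
     * given its pg_index tuple?  indisvalid covers an unfinished
     * CREATE INDEX CONCURRENTLY; indcheckxmin covers possibly-broken
     * HOT chains.
     */
    static bool
    index_usable_sketch(HeapTuple indexTuple, Form_pg_index indexForm)
    {
        if (!indexForm->indisvalid)
            return false;

        if (indexForm->indcheckxmin &&
            !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexTuple->t_data),
                                   TransactionXmin))
            return false;   /* rows of broken HOT chains may still be visible */

        return true;
    }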
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 175a6bf6dddb421cee9cddde73400c0eb607be21..fa9f6ec155dceec143aa118166faa41a055c6f31 100644
@@ -276,6 +276,8 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
       scans, number of index scans initiated (over all indexes
       belonging to the table), number of live rows fetched by index
       scans, numbers of row insertions, updates, and deletions,
+      number of row updates that were HOT (i.e., no separate index update),
+      numbers of live and dead rows,
       the last time the table was vacuumed manually,
       the last time it was vacuumed by the autovacuum daemon,
       the last time it was analyzed manually,
@@ -580,7 +582,7 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
       <entry><literal><function>pg_stat_get_tuples_updated</function>(<type>oid</type>)</literal></entry>
       <entry><type>bigint</type></entry>
       <entry>
-       Number of rows updated in table
+       Number of rows updated in table (includes HOT updates)
       </entry>
      </row>
 
@@ -592,6 +594,30 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
       </entry>
      </row>
 
+     <row>
+      <entry><literal><function>pg_stat_get_tuples_hot_updated</function>(<type>oid</type>)</literal></entry>
+      <entry><type>bigint</type></entry>
+      <entry>
+       Number of rows HOT-updated in table
+      </entry>
+     </row>
+
+     <row>
+      <entry><literal><function>pg_stat_get_live_tuples</function>(<type>oid</type>)</literal></entry>
+      <entry><type>bigint</type></entry>
+      <entry>
+       Number of live rows in table
+      </entry>
+     </row>
+
+     <row>
+      <entry><literal><function>pg_stat_get_dead_tuples</function>(<type>oid</type>)</literal></entry>
+      <entry><type>bigint</type></entry>
+      <entry>
+       Number of dead rows in table
+      </entry>
+     </row>
+
      <row>
       <entry><literal><function>pg_stat_get_blocks_fetched</function>(<type>oid</type>)</literal></entry>
       <entry><type>bigint</type></entry>
@@ -716,6 +742,18 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
       </entry>
      </row>
 
+     <row>
+      <entry><literal><function>pg_stat_get_backend_xact_start</function>(<type>integer</type>)</literal></entry>
+      <entry><type>timestamp with time zone</type></entry>
+      <entry>
+       The time at which the given server process' currently
+       executing transaction was started, but only if the
+       current user is a superuser or the same user as that of
+       the session being queried (and
+       <varname>stats_command_string</varname> is on)
+      </entry>
+     </row>
+
      <row>
       <entry><literal><function>pg_stat_get_backend_start</function>(<type>integer</type>)</literal></entry>
       <entry><type>timestamp with time zone</type></entry>
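
pg_stat_get_tuples_hot_updated() (implemented in pgstatfuncs.c, per the file
list) presumably mirrors its existing siblings.  A sketch by analogy with
pg_stat_get_tuples_updated(); the counter field name is assumed rather than
taken from this excerpt:

    Datum
    pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS)
    {
        Oid         relid = PG_GETARG_OID(0);
        int64       result;
        PgStat_StatTabEntry *tabentry;

        if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL)
            result = 0;
        else
            result = (int64) (tabentry->tuples_hot_updated); /* field name assumed */

        PG_RETURN_INT64(result);
    }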
diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index c341db255a7fee037b0bd3125860a1b3018dc18d..454cd7f39ff15386270089d567cb67f50d17b172 100644
@@ -329,7 +329,10 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] <replaceable class="parameter">name</re
    </para>
 
    <para>
-    If a problem arises during the second scan of the table, such as a
+    In a concurrent index build, the index is actually entered into the
+    system catalogs in one transaction, then the two table scans occur in a
+    second and third transaction.
+    If a problem arises while scanning the table, such as a
     uniqueness violation in a unique index, the <command>CREATE INDEX</>
     command will fail but leave behind an <quote>invalid</> index. This index
     will be ignored for querying purposes because it might be incomplete;
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c
index eff0ae610af74ec909ebc7583097fc5f9f49b43d..8e067f9f54c8606b2940d17587380de05ebc0859 100644
@@ -359,7 +359,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd
        *prdata = rdata;
        data.updateBlkno = entryPreparePage(btree, page, off);
 
-       placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false);
+       placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false);
        if (placed != off)
                elog(ERROR, "failed to add item to index page in \"%s\"",
                         RelationGetRelationName(btree->index));
@@ -488,7 +488,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
                        lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
                }
 
-               if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+               if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
                        elog(ERROR, "failed to add item to index page in \"%s\"",
                                 RelationGetRelationName(btree->index));
                ptr += MAXALIGN(IndexTupleSize(itup));
@@ -563,11 +563,11 @@ entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
        page = BufferGetPage(root);
 
        itup = ginPageGetLinkItup(lbuf);
-       if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+       if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
                elog(ERROR, "failed to add item to index root page");
 
        itup = ginPageGetLinkItup(rbuf);
-       if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+       if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
                elog(ERROR, "failed to add item to index root page");
 }
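
Every index-AM hunk in this patch makes the same mechanical change: a second
boolean, always false, is appended to each PageAddItem() call.  The bufpage.h
change is not shown here, so the parameter names below are assumptions;
presumably the new flag tells PageAddItem() whether the page is a heap page
subject to the MaxHeapTuplesPerPage cap described in README.HOT:

    /* Assumed post-patch signature -- names are illustrative, not from this diff */
    OffsetNumber
    PageAddItem(Page page, Item item, Size size, OffsetNumber offsetNumber,
                bool overwrite,     /* may replace an existing item id? */
                bool is_heap);      /* enforce heap line-pointer limit? */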
 
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index 0ae4f17a23008abf2f4911d4fb04f84a4ef87d2a..a02b83b9c76b0059e19eef3a0dfef73a6ad87ed7 100644
@@ -544,7 +544,7 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
                                itup = GinFormTuple(&gvs->ginstate, value, GinGetPosting(itup), newN);
                                PageIndexTupleDelete(tmppage, i);
 
-                               if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false) != i)
+                               if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i)
                                        elog(ERROR, "failed to add item to index page in \"%s\"",
                                                 RelationGetRelationName(gvs->index));
 
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index 74d30e4edfce50ce492a2265ca6b6de5a2167e5f..03e34fd9840e4eb8117b34e4d5fb5d699e823c43 100644
@@ -199,7 +199,7 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record)
 
                itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert));
 
-               if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false) == InvalidOffsetNumber)
+               if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false, false) == InvalidOffsetNumber)
                        elog(ERROR, "failed to add item to index page in %u/%u/%u",
                                 data->node.spcNode, data->node.dbNode, data->node.relNode);
 
@@ -281,7 +281,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
 
                for (i = 0; i < data->separator; i++)
                {
-                       if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+                       if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
                                elog(ERROR, "failed to add item to index page in %u/%u/%u",
                                  data->node.spcNode, data->node.dbNode, data->node.relNode);
                        itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
@@ -289,7 +289,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
 
                for (i = data->separator; i < data->nitem; i++)
                {
-                       if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+                       if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
                                elog(ERROR, "failed to add item to index page in %u/%u/%u",
                                  data->node.spcNode, data->node.dbNode, data->node.relNode);
                        itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
@@ -375,7 +375,7 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record)
 
                for (i = 0; i < data->nitem; i++)
                {
-                       if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+                       if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
                                elog(ERROR, "failed to add item to index page in %u/%u/%u",
                                  data->node.spcNode, data->node.dbNode, data->node.relNode);
                        itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 68af1d57da3112df7f177fa48c8b4de9bbd04f5b..5de2a0eec8bada21dedfebeb19e615c3fa5c216b 100644
@@ -366,7 +366,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate)
                        data = (char *) (ptr->list);
                        for (i = 0; i < ptr->block.num; i++)
                        {
-                               if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber)
+                               if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
                                        elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->r));
                                data += IndexTupleSize((IndexTuple) data);
                        }
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 1ddf0e97db4596b89243477937b7c9a6c616aa1f..4404d00a5c0e16b7fc861b909b5acbc0339d9737 100644
@@ -42,7 +42,7 @@ gistfillbuffer(Relation r, Page page, IndexTuple *itup,
        for (i = 0; i < len; i++)
        {
                l = PageAddItem(page, (Item) itup[i], IndexTupleSize(itup[i]),
-                                               off, false);
+                                               off, false, false);
                if (l == InvalidOffsetNumber)
                        elog(ERROR, "failed to add item to index page in \"%s\"",
                                 RelationGetRelationName(r));
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 7bba2dbd63511ffeb0a55c5ac8f9572ad2d7f73f..4874e945511195fba19d4f12a273ddecd574307e 100644
@@ -201,7 +201,7 @@ vacuumSplitPage(GistVacuum *gv, Page tempPage, Buffer buffer, IndexTuple *addon,
                data = (char *) (ptr->list);
                for (i = 0; i < ptr->block.num; i++)
                {
-                       if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber)
+                       if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
                                elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gv->index));
                        data += IndexTupleSize((IndexTuple) data);
                }
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index 7d4d55f1615edad940b5e50b48d5d27a882df51f..49327ec69a3ff564cef832920f574495b091062f 100644
@@ -200,7 +200,7 @@ _hash_pgaddtup(Relation rel,
        page = BufferGetPage(buf);
 
        itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
-       if (PageAddItem(page, (Item) itup, itemsize, itup_off, false)
+       if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                         RelationGetRelationName(rel));
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index b9ad162b25b30825190d3f2b76460b9ee2e9f48b..6857b133fede653a43b10c9f89adb9a0a61fdd2c 100644
@@ -684,7 +684,7 @@ _hash_squeezebucket(Relation rel,
                         * we have found room so insert on the "write" page.
                         */
                        woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
-                       if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false)
+                       if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false, false)
                                == InvalidOffsetNumber)
                                elog(ERROR, "failed to add index item to \"%s\"",
                                         RelationGetRelationName(rel));
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index b694e8800c3cc156e8770db69cd79470b1141994..34ac3cc6bbcdef91eb0b364cf2f5c34128c2fffc 100644
@@ -830,7 +830,7 @@ _hash_splitbucket(Relation rel,
                        }
 
                        noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
-                       if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false)
+                       if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false, false)
                                == InvalidOffsetNumber)
                                elog(ERROR, "failed to add index item to \"%s\"",
                                         RelationGetRelationName(rel));
diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile
index dd9ff2676ab37cdb046929ab8d1b2ec590fe1414..e8fbb8ffc473a39eebcf986ad2b9e65599a4c12f 100644
@@ -12,7 +12,7 @@ subdir = src/backend/access/heap
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = heapam.o hio.o rewriteheap.o syncscan.o tuptoaster.o
+OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o
 
 all: SUBSYS.o
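
The new pruneheap.c (added to OBJS above) carries the pruning heuristic
spelled out in the README that follows.  A rough sketch of the entry point
called from heapgetpage() further down in this patch -- everything except the
function name heap_page_prune_opt() and its call signature is illustrative:

    void
    heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
    {
        Page        page = BufferGetPage(buffer);
        Size        minfree;

        /* Only pages hinted as prunable (PD_PRUNABLE) are worth a look */
        if (!PageIsPrunable(page))          /* illustrative macro */
            return;

        /*
         * Prune when free space is below the larger of the fillfactor target
         * and BLCKSZ/10, or when an UPDATE recently failed to find room here.
         */
        minfree = RelationGetTargetPageFreeSpace(relation,
                                                 HEAP_DEFAULT_FILLFACTOR);
        minfree = Max(minfree, BLCKSZ / 10);

        if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
        {
            /* Never block: take the buffer cleanup lock only if it is free */
            if (ConditionalLockBufferForCleanup(buffer))
            {
                /* ... collapse dead HOT chain members and defragment ... */
                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
            }
        }
    }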
 
diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT
new file mode 100644
index 0000000..856662a
--- /dev/null
@@ -0,0 +1,489 @@
+$PostgreSQL$
+
+                           Heap Only Tuples (HOT)
+
+Introduction
+------------
+
+The Heap Only Tuple (HOT) feature eliminates redundant index entries and
+allows the re-use of space taken by DELETEd or obsoleted UPDATEd tuples
+without performing a table-wide vacuum.  It does this by allowing
+single-page vacuuming, also called "defragmentation".
+
+Note: there is a Glossary at the end of this document that may be helpful
+for first-time readers.
+
+
+Technical Challenges
+--------------------
+
+Page-at-a-time vacuuming is normally impractical because of the costs of
+finding and removing the index entries that link to the tuples to be
+reclaimed.  Standard vacuuming scans the indexes to ensure all such index
+entries are removed, amortizing the index scan cost across as many dead
+tuples as possible; this approach does not scale down well to the case of
+reclaiming just a few tuples.  In principle one could recompute the index
+keys and do standard index searches to find the index entries, but this is
+risky in the presence of possibly-buggy user-defined functions in
+functional indexes.  An allegedly immutable function that in fact is not
+immutable might prevent us from re-finding an index entry (and we cannot
+throw an error for not finding it, in view of the fact that dead index
+entries are sometimes reclaimed early).  That would lead to a seriously
+corrupt index, in the form of entries pointing to tuple slots that by now
+contain some unrelated content.  In any case we would prefer to be able
+to do vacuuming without invoking any user-written code.
+
+HOT solves this problem for a restricted but useful special case:
+where a tuple is repeatedly updated in ways that do not change its
+indexed columns.  (Here, "indexed column" means any column referenced
+at all in an index definition, including for example columns that are
+tested in a partial-index predicate but are not stored in the index.)
+
+An additional property of HOT is that it reduces index size by avoiding
+the creation of identically-keyed index entries.  This improves search
+speeds.
+
+
+Update Chains With a Single Index Entry
+---------------------------------------
+
+Without HOT, every version of a row in an update chain has its own index
+entries, even if all indexed columns are the same.  With HOT, a new tuple
+placed on the same page and with all indexed columns the same as its
+parent row version does not get new index entries.  This means there is
+only one index entry for the entire update chain on the heap page.
+An index-entry-less tuple is marked with the HEAP_ONLY_TUPLE flag.
+The prior row version is marked HEAP_HOT_UPDATED, and (as always in an
+update chain) its t_ctid field links forward to the newer version.
+
+For example:
+
+       Index points to 1
+       lp [1]  [2]
+
+       [111111111]->[2222222222]
+
+In the above diagram, the index points to line pointer 1, and tuple 1 is
+marked as HEAP_HOT_UPDATED.  Tuple 2 is a HOT tuple, meaning it has
+no index entry pointing to it, and is marked as HEAP_ONLY_TUPLE.
+Although tuple 2 is not directly referenced by the index, it can still be
+found by an index search: after traversing from the index to tuple 1,
+the index search proceeds forward to child tuples as long as it sees the
+HEAP_HOT_UPDATED flag set.  Since we restrict the HOT chain to lie within
+a single page, this requires no additional page fetches and doesn't
+introduce much performance penalty.
+
+Eventually, tuple 1 will no longer be visible to any transaction.
+At that point its space could be reclaimed, but its line pointer cannot,
+since the index still links to that line pointer and we still need to
+be able to find tuple 2 in an index search.  HOT handles this by turning
+line pointer 1 into a "redirecting line pointer", which links to tuple 2
+but has no actual tuple attached.  This state of affairs looks like
+
+       Index points to 1
+       lp [1]->[2]
+
+       [2222222222]
+
+If now the row is updated again, to version 3, the page looks like this:
+
+       Index points to 1
+       lp [1]->[2]  [3]
+
+       [2222222222]->[3333333333]
+
+At some later time when no transaction can see tuple 2 in its snapshot,
+tuple 2 and its line pointer can be pruned entirely:
+
+       Index points to 1
+       lp [1]------>[3]
+
+       [3333333333]
+
+This is safe because no index entry points to line pointer 2.  Subsequent
+insertions into the page can now recycle both line pointer 2 and the
+space formerly used by tuple 2.
+
+If an update changes any indexed column, or there is not room on the
+same page for the new tuple, then the HOT chain ends: the last member
+has a regular t_ctid link to the next version and is not marked
+HEAP_HOT_UPDATED.  (In principle we could continue a HOT chain across
+pages, but this would destroy the desired property of being able to
+reclaim space with just page-local manipulations.  Anyway, we don't
+want to have to chase through multiple heap pages to get from an index
+entry to the desired tuple, so it seems better to create a new index
+entry for the new tuple.)  If further updates occur, the next version
+could become the root of a new HOT chain.
+
+Line pointer 1 has to remain as long as there is any non-dead member of
+the chain on the page.  When there is not, it is marked "dead".
+This lets us reclaim the last child line pointer and associated tuple
+immediately.  The next regular VACUUM pass can reclaim the index entries
+pointing at the line pointer and then the line pointer itself.  Since a
+line pointer is small compared to a tuple, this does not represent an
+undue space cost.
+
+Note: we can use a "dead" line pointer for any DELETEd tuple,
+whether it was part of a HOT chain or not.  This allows space reclamation
+in advance of running VACUUM for plain DELETEs as well as HOT updates.
+
+The requirement for doing a HOT update is that none of the indexed
+columns are changed.  This is checked at execution time by comparing the
+binary representation of the old and new values.  We insist on bitwise
+equality rather than using datatype-specific equality routines.  The
+main reason to avoid the latter is that there might be multiple notions
+of equality for a datatype, and we don't know exactly which one is
+relevant for the indexes at hand.  We assume that bitwise equality
+guarantees equality for all purposes.
+
+
+Abort Cases
+-----------
+
+If a heap-only tuple's xmin is aborted, then it can be removed immediately:
+it was never visible to any other transaction, and all descendant row
+versions must be aborted as well.  Therefore we need not consider it part
+of a HOT chain.  By the same token, if a HOT-updated tuple's xmax is
+aborted, there is no need to follow the chain link.  However, there is a
+race condition here: the transaction that did the HOT update might abort
+between the time we inspect the HOT-updated tuple and the time we reach
+the descendant heap-only tuple.  It is conceivable that someone prunes
+the heap-only tuple before that, and even conceivable that the line pointer
+is re-used for another purpose.  Therefore, when following a HOT chain,
+it is always necessary to be prepared for the possibility that the
+linked-to item pointer is unused, dead, or redirected; and if it is a
+normal item pointer, we still have to check that XMIN of the tuple matches
+the XMAX of the tuple we left.  Otherwise we should assume that we have
+come to the end of the HOT chain.  Note that this sort of XMIN/XMAX
+matching is required when following ordinary update chains anyway.
+
+(Early versions of the HOT code assumed that holding pin on the page
+buffer while following a HOT link would prevent this type of problem,
+but checking XMIN/XMAX matching is a much more robust solution.)
+
+
+Index/Sequential Scans
+----------------------
+
+When doing an index scan, whenever we reach a HEAP_HOT_UPDATED tuple whose
+xmax is not aborted, we need to follow its t_ctid link and check that
+entry as well; possibly repeatedly until we reach the end of the HOT
+chain.  (When using an MVCC snapshot it is possible to optimize this a
+bit: there can be at most one visible tuple in the chain, so we can stop
+when we find it.  This rule does not work for non-MVCC snapshots, though.)
+
+Sequential scans do not need to pay attention to the HOT links because
+they scan every item pointer on the page anyway.  The same goes for a
+bitmap heap scan with a lossy bitmap.
+
+
+Pruning
+-------
+
+HOT pruning means updating item pointers so that HOT chains are
+reduced in length, by collapsing out line pointers for intermediate dead
+tuples.  Although this makes those line pointers available for re-use,
+it does not immediately make the space occupied by their tuples available.
+
+
+Defragmentation
+---------------
+
+Defragmentation centralizes unused space.  After we have converted root
+line pointers to redirected line pointers and pruned away any dead
+intermediate line pointers, the tuples they linked to are free space.
+But unless that space is adjacent to the central "hole" on the page
+(the pd_lower-to-pd_upper area) it cannot be used by tuple insertion.
+Defragmentation moves the surviving tuples to coalesce all the free
+space into one "hole".  This is done with the same PageRepairFragmentation
+function that regular VACUUM uses.
+
+
+When can/should we prune or defragment?
+---------------------------------------
+
+This is the most interesting question in HOT implementation, since there
+is no simple right answer: we must use heuristics to determine when it's
+most efficient to perform pruning and/or defragmenting.
+
+We cannot prune or defragment unless we can get a "buffer cleanup lock"
+on the target page; otherwise, pruning might destroy line pointers that
+other backends have live references to, and defragmenting might move
+tuples that other backends have live pointers to.  Thus the general
+approach must be to heuristically decide if we should try to prune
+or defragment, and if so try to acquire the buffer cleanup lock without
+blocking.  If we succeed we can proceed with our housekeeping work.
+If we cannot get the lock (which should not happen often, except under
+very heavy contention) then the housekeeping has to be postponed till
+some other time.  The worst-case consequence of this is only that an
+UPDATE cannot be made HOT but has to link to a new tuple version placed on
+some other page, for lack of centralized space on the original page.
+
+Ideally we would do defragmenting only when we are about to attempt
+heap_update on a HOT-safe tuple.  The difficulty with this approach
+is that the update query has certainly got a pin on the old tuple, and
+therefore our attempt to acquire a buffer cleanup lock will always fail.
+(This corresponds to the idea that we don't want to move the old tuple
+out from under where the query's HeapTuple pointer points.  It might
+be possible to finesse that, but it seems fragile.)
+
+Pruning, however, is potentially useful even when we are not about to
+insert a new tuple, since shortening a HOT chain reduces the cost of
+subsequent index searches.  However it is unclear that this gain is
+large enough to accept any extra maintenance burden for.
+
+The currently planned heuristic is to prune and defrag when first accessing
+a page that potentially has prunable tuples (flagged by the PD_PRUNABLE
+page hint bit) and that either has free space less than MAX(fillfactor
+target free space, BLCKSZ/10) *or* has recently had an UPDATE fail to
+find enough free space to store an updated tuple version.  (These rules
+are subject to change.)
+
+We have effectively implemented the "truncate dead tuples to just line
+pointer" idea that has been proposed and rejected before because of fear
+of line pointer bloat: we might end up with huge numbers of line pointers
+and just a few actual tuples on a page.  To limit the damage in the worst
+case, and to keep various work arrays as well as the bitmaps in bitmap
+scans reasonably sized, the maximum number of line pointers per page
+is arbitrarily capped at MaxHeapTuplesPerPage (the most tuples that
+could fit without HOT pruning).
+
+
+VACUUM
+------
+
+There is little change to regular vacuum.  It performs pruning to remove
+dead heap-only tuples, and cleans up any dead line pointers as if they were
+regular dead tuples.
+
+
+VACUUM FULL
+-----------
+
+VACUUM FULL performs an extra operation of collapsing out redirecting line
+pointers, by moving the first non-DEAD tuple of each HOT chain to the root
+position and clearing its heap-only-tuple flag.  This effectively changes
+the user-visible CTID of that tuple.  This would be completely unsafe
+during normal concurrent operation, but since VACUUM FULL takes full
+exclusive lock on the table, it should be OK.  (Note that VACUUM FULL has
+always felt free to change tuples' CTIDs by moving them across pages.)
+Eliminating redirection links means that the main body of VACUUM FULL
+doesn't have to deal with them, which seems a good thing since VACUUM FULL
+is horrendously complex already.
+
+When VACUUM FULL tries to move tuple chains, it does not distinguish regular
+and heap-only tuples, but just moves both types the same.  This is OK because
+it will move the entire non-DEAD tail of an update chain and remove index
+entries for each item moved.  At worst, we'll uselessly search for index
+entries matching the heap-only tuples included in the move.
+
+
+Statistics
+----------
+
+Currently, we count HOT updates the same as cold updates for statistics
+purposes, though there is an additional per-table counter that counts
+only HOT updates.  When a page pruning operation is able to remove a
+physical tuple by eliminating an intermediate heap-only tuple or
+replacing a physical root tuple by a redirect pointer, a decrement in
+the table's number of dead tuples is reported to pgstats, which may
+postpone autovacuuming.  Note that we do not count replacing a root tuple
+by a DEAD item pointer as decrementing n_dead_tuples; we still want
+autovacuum to run to clean up the index entries and DEAD item.
+
+This area probably needs further work ...
+
+
+CREATE INDEX
+------------
+
+CREATE INDEX presents a problem for HOT updates.  While the existing HOT
+chains all have the same index values for existing indexes, the columns
+in the new index might change within a pre-existing HOT chain, creating
+a "broken" chain that can't be indexed properly.
+
+To address this issue, regular (non-concurrent) CREATE INDEX makes the
+new index usable only by transactions newer than the CREATE INDEX
+command.  This prevents transactions that can see the inconsistent HOT
+chains from trying to use the new index and getting incorrect results.  
+New transactions can only see the rows visible after the index was
+created, hence the HOT chains are consistent for them.
+
+Entries in the new index point to root tuples (tuples with current index
+pointers) so that our index uses the same index pointers as all other
+indexes on the table.  However the row we want to index is actually at
+the *end* of the chain, ie, the most recent live tuple on the HOT chain.
+That is the one we compute the index entry values for, but the TID
+we put into the index is that of the root tuple.  Since transactions that
+will be allowed to use the new index cannot see any of the older tuple
+versions in the chain, the fact that they might not match the index entry
+isn't a problem.  (Such transactions will check the tuple visibility
+information of the older versions and ignore them, without ever looking at
+their contents, so the content inconsistency is OK.)  Subsequent updates
+to the live tuple will be allowed to extend the HOT chain only if they are
+HOT-safe for all the indexes.
+
+Because we have ShareLock on the table, any DELETE_IN_PROGRESS or
+INSERT_IN_PROGRESS tuples should have come from our own transaction.
+Therefore we can consider them committed since if the CREATE INDEX
+commits, they will be committed, and if it aborts the index is discarded.
+An exception to this is that early lock release is customary for system
+catalog updates, and so we might find such tuples when reindexing a system
+catalog.  In that case we deal with it by waiting for the source
+transaction to commit or roll back.  (We could do that for user tables
+too, but since the case is unexpected we prefer to throw an error.)
+
+Practically, we prevent old transactions from using the new index by
+setting pg_index.indcheckxmin to TRUE.  Queries are allowed to use such an
+index only after pg_index.xmin is below their TransactionXmin horizon,
+thereby ensuring that any incompatible rows in HOT chains are dead to them.
+(pg_index.xmin will be the XID of the CREATE INDEX transaction.  The reason
+for using xmin rather than a normal column is that the regular vacuum
+freezing mechanism will take care of converting xmin to FrozenTransactionId
+before it can wrap around.)
+
+This means in particular that the transaction creating the index will be
+unable to use the index.  We alleviate that problem somewhat by not setting
+indcheckxmin unless the table actually contains HOT chains with
+RECENTLY_DEAD members.  (In 8.4 we may be able to improve the situation,
+at least for non-serializable transactions, because we expect to be able to
+advance TransactionXmin intratransaction.)
+
+Another unpleasant consequence is that it is now risky to use SnapshotAny
+in an index scan: if the index was created more recently than the last
+vacuum, it's possible that some of the visited tuples do not match the
+index entry they are linked to.  This does not seem to be a fatal
+objection, since there are few users of SnapshotAny and most use seqscans.
+The only exception at this writing is CLUSTER, which is okay because it
+does not require perfect ordering of the indexscan readout (and especially
+so because CLUSTER tends to write recently-dead tuples out of order anyway).
+
+
+CREATE INDEX CONCURRENTLY
+-------------------------
+
+In the concurrent case we must take a different approach.  We create the
+pg_index entry immediately, before we scan the table.  The pg_index entry
+is marked as "not ready for inserts".  Then we commit and wait for any
+transactions which have the table open to finish.  This ensures that no
+new HOT updates will change the key value for our new index, because all
+transactions will see the existence of the index and will respect its
+constraint on which updates can be HOT.  Other transactions must include
+such an index when determining HOT-safety of updates, even though they
+must ignore it for both insertion and searching purposes.
+
+We must do this to avoid making incorrect index entries.  For example,
+suppose we are building an index on column X and we make an index entry for
+a non-HOT tuple with X=1.  Then some other backend, unaware that X is an
+indexed column, HOT-updates the row to have X=2, and commits.  We now have
+an index entry for X=1 pointing at a HOT chain whose live row has X=2.
+We could make an index entry with X=2 during the validation pass, but
+there is no nice way to get rid of the wrong entry with X=1.  So we must
+have the HOT-safety property enforced before we start to build the new
+index.
+
+After waiting for transactions which had the table open, we build the index
+for all rows that are valid in a fresh snapshot.  Any tuples visible in the
+snapshot will have only valid forward-growing HOT chains.  (They might have
+older HOT updates behind them which are broken, but this is OK for the same
+reason it's OK in a regular index build.)  As above, we point the index
+entry at the root of the HOT-update chain but we use the key value from the
+live tuple.
+
+We mark the index open for inserts (but still not ready for reads) then
+we again wait for transactions which have the table open.  Then we take
+a second reference snapshot and validate the index.  This searches for
+tuples missing from the index, and inserts any missing ones.  Again,
+the index entries have to have TIDs equal to HOT-chain root TIDs, but
+the value to be inserted is the one from the live tuple.
+
+Then we wait until every transaction that could have a snapshot older than
+the second reference snapshot is finished.  This ensures that nobody is
+alive any longer who could need to see any tuples that might be missing
+from the index, as well as ensuring that no one can see any inconsistent
+rows in a broken HOT chain (the first condition is stronger than the
+second).  Finally, we can mark the index valid for searches.
+
+
+Limitations and Restrictions
+----------------------------
+
+It is worth noting that HOT forever forecloses alternative approaches
+to vacuuming, specifically the recompute-the-index-keys approach alluded
+to in Technical Challenges above.  It'll be tough to recompute the index
+keys for a root line pointer you don't have data for anymore ...
+
+
+Glossary
+--------
+
+Broken HOT Chain
+
+       A HOT chain in which the key value for an index has changed.
+
+       This is not allowed to occur normally but if a new index is created
+       it can happen.  In that case various strategies are used to ensure
+       that no transaction for which the older tuples are visible can
+       use the index.
+
+Cold update
+
+       A normal, non-HOT update, in which index entries are made for
+       the new version of the tuple.
+
+Dead line pointer
+
+       A stub line pointer, that does not point to anything, but cannot
+       be removed or reused yet because there are index pointers to it.
+       Semantically same as a dead tuple.  It has state LP_DEAD.
+
+Heap-only tuple
+
+       A heap tuple with no index pointers, which can only be reached
+       from indexes indirectly through its ancestral root tuple.
+       Marked with HEAP_ONLY_TUPLE flag.
+
+HOT-safe
+
+       A proposed tuple update is said to be HOT-safe if it changes
+       none of the tuple's indexed columns.  It will only become an
+       actual HOT update if we can find room on the same page for
+       the new tuple version.
+
+HOT update
+
+       An UPDATE where the new tuple becomes a heap-only tuple, and no
+       new index entries are made.
+
+HOT-updated tuple
+
+       An updated tuple, for which the next tuple in the chain is a
+       heap-only tuple.  Marked with HEAP_HOT_UPDATED flag.
+
+Indexed column
+
+       A column used in an index definition.  The column might not
+       actually be stored in the index --- it could be used in a
+       functional index's expression, or used in a partial index
+       predicate.  HOT treats all these cases alike.
+
+Redirecting line pointer
+
+       A line pointer that points to another line pointer and has no
+       associated tuple.  It has the special lp_flags state LP_REDIRECT,
+       and lp_off is the OffsetNumber of the line pointer it links to.
+       This is used when a root tuple becomes dead but we cannot prune
+       the line pointer because there are non-dead heap-only tuples
+       further down the chain.
+
+Root tuple
+
+       The first tuple in a HOT update chain; the one that indexes point to.
+
+Update chain
+
+       A chain of updated tuples, in which each tuple's ctid points to
+       the next tuple in the chain. A HOT update chain is an update chain
+       (or portion of an update chain) that consists of a root tuple and
+       one or more heap-only tuples.  A complete update chain can contain
+       both HOT and non-HOT (cold) updated tuples.
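
The "bitwise equality" rule above is what HeapSatisfiesHOTUpdate() (declared
in the heapam.c hunk just below) has to enforce for every attribute in the
indexed-column bitmap.  A per-attribute sketch using the datumIsEqual()
helper whose header this patch starts including -- the helper name and the
handling of system attributes here are illustrative:

    static bool
    hot_attr_equals_sketch(Relation relation, int attrnum,
                           HeapTuple tup1, HeapTuple tup2)
    {
        TupleDesc   tupdesc = RelationGetDescr(relation);
        Form_pg_attribute att;
        Datum       value1,
                    value2;
        bool        isnull1,
                    isnull2;

        value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
        value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);

        /* NULL is considered equal only to NULL */
        if (isnull1 || isnull2)
            return isnull1 && isnull2;

        /*
         * Compare the raw datums byte by byte; no datatype-specific equality
         * operator is consulted, per the rule stated above.
         */
        att = tupdesc->attrs[attrnum - 1];
        return datumIsEqual(value1, value2, att->attbyval, att->attlen);
    }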
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 9f1a7d63848d0c86399b2477bf1ea4faf87b1ce9..ca0d11abfb3b2e4b8f044d9ef7612efe3536e2ed 100644
@@ -52,6 +52,7 @@
 #include "pgstat.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
+#include "utils/datum.h"
 #include "utils/inval.h"
 #include "utils/lsyscache.h"
 #include "utils/relcache.h"
@@ -64,6 +65,8 @@ static HeapScanDesc heap_beginscan_internal(Relation relation,
                                                                                        bool is_bitmapscan);
 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
                   ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move);
+static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
+                                          HeapTuple oldtup, HeapTuple newtup);
 
 
 /* ----------------------------------------------------------------
@@ -183,6 +186,11 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
        buffer = scan->rs_cbuf;
        snapshot = scan->rs_snapshot;
 
+       /*
+        * Prune and repair fragmentation for the whole page, if possible.
+        */
+       heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+
        /*
         * We must hold share lock on the buffer content while examining tuple
         * visibility.  Afterwards, however, the tuples we have found to be
@@ -316,7 +324,7 @@ heapgettup(HeapScanDesc scan,
                         * forward scanners.
                         */
                        scan->rs_syncscan = false;
-                       /* start from last page of the scan */ 
+                       /* start from last page of the scan */
                        if (scan->rs_startblock > 0)
                                page = scan->rs_startblock - 1;
                        else
@@ -368,6 +376,7 @@ heapgettup(HeapScanDesc scan,
                dp = (Page) BufferGetPage(scan->rs_cbuf);
                lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
                lpp = PageGetItemId(dp, lineoff);
+               Assert(ItemIdIsNormal(lpp));
 
                tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
                tuple->t_len = ItemIdGetLength(lpp);
@@ -583,7 +592,7 @@ heapgettup_pagemode(HeapScanDesc scan,
                         * forward scanners.
                         */
                        scan->rs_syncscan = false;
-                       /* start from last page of the scan */ 
+                       /* start from last page of the scan */
                        if (scan->rs_startblock > 0)
                                page = scan->rs_startblock - 1;
                        else
@@ -632,6 +641,7 @@ heapgettup_pagemode(HeapScanDesc scan,
                dp = (Page) BufferGetPage(scan->rs_cbuf);
                lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
                lpp = PageGetItemId(dp, lineoff);
+               Assert(ItemIdIsNormal(lpp));
 
                tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
                tuple->t_len = ItemIdGetLength(lpp);
@@ -1246,6 +1256,9 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction)
  * for statistical purposes.  (This could be the heap rel itself, an
  * associated index, or NULL to not count the fetch at all.)
  *
+ * heap_fetch does not follow HOT chains: only the exact TID requested will
+ * be fetched.
+ *
  * It is somewhat inconsistent that we ereport() on invalid block number but
  * return false on invalid item number.  There are a couple of reasons though.
  * One is that the caller can relatively easily check the block number for
@@ -1389,6 +1402,143 @@ heap_release_fetch(Relation relation,
        return false;
 }
 
+/*
+ *     heap_hot_search_buffer  - search HOT chain for tuple satisfying snapshot
+ *
+ * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
+ * of a HOT chain), and buffer is the buffer holding this tuple.  We search
+ * for the first chain member satisfying the given snapshot.  If one is
+ * found, we update *tid to reference that tuple's offset number, and
+ * return TRUE.  If no match, return FALSE without modifying *tid.
+ *
+ * If all_dead is not NULL, we check non-visible tuples to see if they are
+ * globally dead; *all_dead is set TRUE if all members of the HOT chain
+ * are vacuumable, FALSE if not.
+ *
+ * Unlike heap_fetch, the caller must already have pin and (at least) share
+ * lock on the buffer; it is still pinned/locked at exit.  Also unlike
+ * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
+ */
+bool
+heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
+                                          bool *all_dead)
+{
+       Page dp = (Page) BufferGetPage(buffer);
+       TransactionId prev_xmax = InvalidTransactionId;
+       OffsetNumber offnum;
+       bool at_chain_start;
+
+       if (all_dead)
+               *all_dead = true;
+
+       Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
+       offnum = ItemPointerGetOffsetNumber(tid);
+       at_chain_start = true;
+
+       /* Scan through possible multiple members of HOT-chain */
+       for (;;)
+       {
+               ItemId lp;
+               HeapTupleData heapTuple;
+
+               /* check for bogus TID */
+               if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
+                       break;
+
+               lp = PageGetItemId(dp, offnum);
+
+               /* check for unused, dead, or redirected items */
+               if (!ItemIdIsNormal(lp))
+               {
+                       /* We should only see a redirect at start of chain */
+                       if (ItemIdIsRedirected(lp) && at_chain_start)
+                       {
+                               /* Follow the redirect */
+                               offnum = ItemIdGetRedirect(lp);
+                               at_chain_start = false;
+                               continue;
+                       }
+                       /* else must be end of chain */
+                       break;
+               }
+
+               heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
+               heapTuple.t_len = ItemIdGetLength(lp);
+
+               /*
+                * Shouldn't see a HEAP_ONLY tuple at chain start.
+                */
+               if (at_chain_start && HeapTupleIsHeapOnly(&heapTuple))
+                       break;
+
+               /*
+                * The xmin should match the previous xmax value, else chain is broken.
+                */
+               if (TransactionIdIsValid(prev_xmax) &&
+                       !TransactionIdEquals(prev_xmax,
+                                                                HeapTupleHeaderGetXmin(heapTuple.t_data)))
+                       break;
+
+               /* If it's visible per the snapshot, we must return it */
+               if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
+               {
+                       ItemPointerSetOffsetNumber(tid, offnum);
+                       if (all_dead)
+                               *all_dead = false;
+                       return true;
+               }
+
+               /*
+                * If we can't see it, maybe no one else can either.  At caller
+                * request, check whether all chain members are dead to all
+                * transactions.
+                */
+               if (all_dead && *all_dead &&
+                       HeapTupleSatisfiesVacuum(heapTuple.t_data, RecentGlobalXmin,
+                                                                        buffer) != HEAPTUPLE_DEAD)
+                       *all_dead = false;
+
+               /*
+                * Check to see if HOT chain continues past this tuple; if so
+                * fetch the next offnum and loop around.
+                */
+               if (HeapTupleIsHotUpdated(&heapTuple))
+               {
+                       Assert(ItemPointerGetBlockNumber(&heapTuple.t_data->t_ctid) ==
+                                  ItemPointerGetBlockNumber(tid));
+                       offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid);
+                       at_chain_start = false;
+                       prev_xmax = HeapTupleHeaderGetXmax(heapTuple.t_data);
+               }
+               else
+                       break;                  /* end of chain */
+       }
+
+       return false;
+}
+
+/*
+ *     heap_hot_search         - search HOT chain for tuple satisfying snapshot
+ *
+ * This has the same API as heap_hot_search_buffer, except that the caller
+ * does not provide the buffer containing the page, rather we access it
+ * locally.
+ */
+bool
+heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
+                               bool *all_dead)
+{
+       bool    result;
+       Buffer  buffer;
+
+       buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
+       LockBuffer(buffer, BUFFER_LOCK_SHARE);
+       result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead);
+       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+       ReleaseBuffer(buffer);
+       return result;
+}
+
 /*
  *     heap_get_latest_tid -  get the latest tid of a specified tuple
  *
@@ -1594,6 +1744,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
        }
 
        tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
+       tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
        tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
        HeapTupleHeaderSetXmin(tup->t_data, xid);
        HeapTupleHeaderSetCmin(tup->t_data, cid);
@@ -1628,6 +1779,17 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 
        RelationPutHeapTuple(relation, buffer, heaptup);
 
+       /*
+        * XXX Should we set PageSetPrunable on this page ?
+        *
+        * The inserting transaction may eventually abort thus making this tuple
+        * DEAD and hence available for pruning. Though we don't want to optimize
+        * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
+        * aborted tuple will never be pruned until next vacuum is triggered.
+        *
+        * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
+        */
+
        MarkBufferDirty(buffer);
 
        /* XLOG stuff */
@@ -1904,12 +2066,21 @@ l1:
 
        START_CRIT_SECTION();
 
+       /*
+        * If this transaction commits, the tuple will become DEAD sooner or
+        * later. Set hint bit that this page is a candidate for pruning.  If
+        * the transaction finally aborts, the subsequent page pruning will be
+        * a no-op and the hint will be cleared.
+        */
+       PageSetPrunable((Page) dp);
+
        /* store transaction information of xact deleting the tuple */
        tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
                                                           HEAP_XMAX_INVALID |
                                                           HEAP_XMAX_IS_MULTI |
                                                           HEAP_IS_LOCKED |
                                                           HEAP_MOVED);
+       HeapTupleHeaderClearHotUpdated(tp.t_data);
        HeapTupleHeaderSetXmax(tp.t_data, xid);
        HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
        /* Make sure there is no forward chain link in t_ctid */
@@ -2045,7 +2216,8 @@ simple_heap_delete(Relation relation, ItemPointer tid)
  *
  * On success, the header fields of *newtup are updated to match the new
  * stored tuple; in particular, newtup->t_self is set to the TID where the
- * new tuple was inserted.     However, any TOAST changes in the new tuple's
+ * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
+ * update was done.  However, any TOAST changes in the new tuple's
  * data are not reflected into *newtup.
  *
  * In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
@@ -2060,6 +2232,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 {
        HTSU_Result result;
        TransactionId xid = GetCurrentTransactionId();
+       Bitmapset  *hot_attrs;
        ItemId          lp;
        HeapTupleData oldtup;
        HeapTuple       heaptup;
@@ -2072,9 +2245,24 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
                                pagefree;
        bool            have_tuple_lock = false;
        bool            iscombo;
+       bool            use_hot_update = false;
 
        Assert(ItemPointerIsValid(otid));
 
+       /*
+        * Fetch the list of attributes to be checked for HOT update.  This is
+        * wasted effort if we fail to update or have to put the new tuple on
+        * a different page.  But we must compute the list before obtaining
+        * buffer lock --- in the worst case, if we are doing an update on one
+        * of the relevant system catalogs, we could deadlock if we try to
+        * fetch the list later.  In any case, the relcache caches the data
+        * so this is usually pretty cheap.
+        *
+        * Note that we get a copy here, so we need not worry about relcache
+        * flush happening midway through.
+        */
+       hot_attrs = RelationGetIndexAttrBitmap(relation);
+
        buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
@@ -2208,6 +2396,7 @@ l2:
                UnlockReleaseBuffer(buffer);
                if (have_tuple_lock)
                        UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+               bms_free(hot_attrs);
                return result;
        }
 
@@ -2227,6 +2416,7 @@ l2:
        }
 
        newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
+       newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
        newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);
        HeapTupleHeaderSetXmin(newtup->t_data, xid);
        HeapTupleHeaderSetCmin(newtup->t_data, cid);
@@ -2261,17 +2451,20 @@ l2:
                                          HeapTupleHasExternal(newtup) ||
                                          newtup->t_len > TOAST_TUPLE_THRESHOLD);
 
-       pagefree = PageGetFreeSpace((Page) dp);
+       pagefree = PageGetHeapFreeSpace((Page) dp);
 
        newtupsize = MAXALIGN(newtup->t_len);
 
        if (need_toast || newtupsize > pagefree)
        {
+               /* Clear obsolete visibility flags ... */
                oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
                                                                           HEAP_XMAX_INVALID |
                                                                           HEAP_XMAX_IS_MULTI |
                                                                           HEAP_IS_LOCKED |
                                                                           HEAP_MOVED);
+               HeapTupleClearHotUpdated(&oldtup);
+               /* ... and store info about transaction updating this tuple */
                HeapTupleHeaderSetXmax(oldtup.t_data, xid);
                HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
                /* temporarily make it look not-updated */
@@ -2324,7 +2517,7 @@ l2:
                        /* Re-acquire the lock on the old tuple's page. */
                        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
                        /* Re-check using the up-to-date free space */
-                       pagefree = PageGetFreeSpace((Page) dp);
+                       pagefree = PageGetHeapFreeSpace((Page) dp);
                        if (newtupsize > pagefree)
                        {
                                /*
@@ -2357,18 +2550,66 @@ l2:
         * one pin is held.
         */
 
+       if (newbuf == buffer)
+       {
+               /*
+                * Since the new tuple is going into the same page, we might be able
+                * to do a HOT update.  Check whether any of the indexed columns
+                * have been changed; if not, a HOT update is possible.
+                */
+               if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup))
+                       use_hot_update = true;
+       }
+       else
+       {
+               /* Set a hint that the old page could use prune/defrag */
+               PageSetFull(dp);
+       }
+
        /* NO EREPORT(ERROR) from here till changes are logged */
        START_CRIT_SECTION();
 
+       /*
+        * If this transaction commits, the old tuple will become DEAD sooner
+        * or later.  Set a hint bit indicating that this page is a candidate
+        * for pruning.  If the transaction finally aborts, the subsequent page
+        * pruning will be a no-op and the hint will be cleared.
+        *
+        * XXX Should we set hint on newbuf as well?  If the transaction
+        * aborts, there would be a prunable tuple in the newbuf; but for now
+        * we choose not to optimize for aborts.  Note that heap_xlog_update
+        * must be kept in sync if this changes.
+        */
+       PageSetPrunable(dp);
+
+       if (use_hot_update)
+       {
+               /* Mark the old tuple as HOT-updated */
+               HeapTupleSetHotUpdated(&oldtup);
+               /* And mark the new tuple as heap-only */
+               HeapTupleSetHeapOnly(heaptup);
+               /* Mark the caller's copy too, in case different from heaptup */
+               HeapTupleSetHeapOnly(newtup);
+       }
+       else
+       {
+               /* Make sure tuples are correctly marked as not-HOT */
+               HeapTupleClearHotUpdated(&oldtup);
+               HeapTupleClearHeapOnly(heaptup);
+               HeapTupleClearHeapOnly(newtup);
+       }
+
        RelationPutHeapTuple(relation, newbuf, heaptup);        /* insert new tuple */
 
        if (!already_marked)
        {
+               /* Clear obsolete visibility flags ... */
                oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
                                                                           HEAP_XMAX_INVALID |
                                                                           HEAP_XMAX_IS_MULTI |
                                                                           HEAP_IS_LOCKED |
                                                                           HEAP_MOVED);
+               /* ... and store info about transaction updating this tuple */
                HeapTupleHeaderSetXmax(oldtup.t_data, xid);
                HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
        }
@@ -2427,7 +2668,7 @@ l2:
        if (have_tuple_lock)
                UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
 
-       pgstat_count_heap_update(relation);
+       pgstat_count_heap_update(relation, use_hot_update);
 
        /*
         * If heaptup is a private copy, release it.  Don't forget to copy t_self
@@ -2439,9 +2680,119 @@ l2:
                heap_freetuple(heaptup);
        }
 
+       bms_free(hot_attrs);
+
        return HeapTupleMayBeUpdated;
 }
 
+/*
+ * Check whether the specified attribute's value is the same in both the
+ * given tuples.  Subroutine for HeapSatisfiesHOTUpdate.
+ */
+static bool
+heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
+                                          HeapTuple tup1, HeapTuple tup2)
+{
+       Datum value1, value2;
+       bool isnull1, isnull2;
+       Form_pg_attribute att;
+
+       /*
+        * If it's a whole-tuple reference, say "not equal".  It's not really
+        * worth supporting this case, since it could only succeed after a
+        * no-op update, which is hardly a case worth optimizing for.
+        */
+       if (attrnum == 0)
+               return false;
+
+       /*
+        * Likewise, automatically say "not equal" for any system attribute
+        * other than OID and tableOID; we cannot expect these to be consistent
+        * in a HOT chain, or even to be set correctly yet in the new tuple.
+        */
+       if (attrnum < 0)
+       {
+               if (attrnum != ObjectIdAttributeNumber &&
+                       attrnum != TableOidAttributeNumber)
+                       return false;
+       }
+
+       /*
+        * Extract the corresponding values.  XXX this is pretty inefficient
+        * if there are many indexed columns.  Should HeapSatisfiesHOTUpdate
+        * do a single heap_deform_tuple call on each tuple, instead?  But
+        * that doesn't work for system columns ...
+        */
+       value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
+       value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
+
+       /*
+        * If one value is NULL and the other is not, then they are certainly
+        * not equal.
+        */
+       if (isnull1 != isnull2)
+               return false;
+
+       /*
+        * If both are NULL, they can be considered equal.
+        */
+       if (isnull1)
+               return true;
+
+       /*
+        * We do simple binary comparison of the two datums.  This may be overly
+        * strict because there can be multiple binary representations for the
+        * same logical value.  But we should be OK as long as there are no false
+        * positives.  Using a type-specific equality operator is messy because
+        * there could be multiple notions of equality in different operator
+        * classes; furthermore, we cannot safely invoke user-defined functions
+        * while holding exclusive buffer lock.
+        */
+       if (attrnum <= 0)
+       {
+               /* The only allowed system columns are OIDs, so do this */
+               return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
+       }
+       else
+       {
+               Assert(attrnum <= tupdesc->natts);
+               att     = tupdesc->attrs[attrnum - 1];
+               return datumIsEqual(value1, value2, att->attbyval, att->attlen);
+       }
+}
+
+/*
+ * Check if the old and new tuples represent a HOT-safe update. To be able
+ * to do a HOT update, we must not have changed any columns used in index
+ * definitions.
+ *
+ * The set of attributes to be checked is passed in (we dare not try to
+ * compute it while holding exclusive buffer lock...)  NOTE that hot_attrs
+ * is destructively modified!  That is OK since this is invoked at most once
+ * by heap_update().
+ *
+ * Returns true if safe to do HOT update.
+ */
+static bool
+HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
+                                          HeapTuple oldtup, HeapTuple newtup)
+{
+       int attrnum;
+
+       while ((attrnum = bms_first_member(hot_attrs)) >= 0)
+       {
+               /* Adjust for system attributes */
+               attrnum += FirstLowInvalidHeapAttributeNumber;
+
+               /* If the attribute value has changed, we can't do HOT update */
+               if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum,
+                                                                       oldtup, newtup))
+                       return false;
+       }
+
+       return true;
+}
+
 /*
  *     simple_heap_update - replace a tuple
  *
@@ -2865,6 +3216,7 @@ l3:
         * avoids possibly generating a useless combo CID.
         */
        tuple->t_data->t_infomask = new_infomask;
+       HeapTupleHeaderClearHotUpdated(tuple->t_data);
        HeapTupleHeaderSetXmax(tuple->t_data, xid);
        /* Make sure there is no forward chain link in t_ctid */
        tuple->t_data->t_ctid = *tid;
@@ -3110,6 +3462,7 @@ recheck_xmax:
                         */
                        tuple->t_infomask &= ~HEAP_XMAX_COMMITTED;
                        tuple->t_infomask |= HEAP_XMAX_INVALID;
+                       HeapTupleHeaderClearHotUpdated(tuple);
                        changed = true;
                }
        }
@@ -3245,21 +3598,29 @@ heap_restrpos(HeapScanDesc scan)
  * Perform XLogInsert for a heap-clean operation.  Caller must already
  * have modified the buffer and marked it dirty.
  *
- * Note: for historical reasons, the entries in the unused[] array should
- * be zero-based tuple indexes, not one-based.
+ * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
+ * zero-based tuple indexes.  Now they are one-based like other uses
+ * of OffsetNumber.
  */
 XLogRecPtr
-log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
+log_heap_clean(Relation reln, Buffer buffer,
+                          OffsetNumber *redirected, int nredirected,
+                          OffsetNumber *nowdead, int ndead,
+                          OffsetNumber *nowunused, int nunused,
+                          bool redirect_move)
 {
        xl_heap_clean xlrec;
+       uint8           info;
        XLogRecPtr      recptr;
-       XLogRecData rdata[2];
+       XLogRecData rdata[4];
 
        /* Caller should not call me on a temp relation */
        Assert(!reln->rd_istemp);
 
        xlrec.node = reln->rd_node;
        xlrec.block = BufferGetBlockNumber(buffer);
+       xlrec.nredirected = nredirected;
+       xlrec.ndead = ndead;
 
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfHeapClean;
@@ -3267,14 +3628,17 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
        rdata[0].next = &(rdata[1]);
 
        /*
-        * The unused-offsets array is not actually in the buffer, but pretend
-        * that it is.  When XLogInsert stores the whole buffer, the offsets array
-        * need not be stored too.
+        * The OffsetNumber arrays are not actually in the buffer, but we pretend
+        * that they are.  When XLogInsert stores the whole buffer, the offset
+        * arrays need not be stored too.  Note that even if all three arrays
+        * are empty, we want to expose the buffer as a candidate for whole-page
+        * storage, since this record type implies a defragmentation operation
+        * even if no item pointers changed state.
         */
-       if (uncnt > 0)
+       if (nredirected > 0)
        {
-               rdata[1].data = (char *) unused;
-               rdata[1].len = uncnt * sizeof(OffsetNumber);
+               rdata[1].data = (char *) redirected;
+               rdata[1].len = nredirected * sizeof(OffsetNumber) * 2;
        }
        else
        {
@@ -3283,9 +3647,38 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
        }
        rdata[1].buffer = buffer;
        rdata[1].buffer_std = true;
-       rdata[1].next = NULL;
+       rdata[1].next = &(rdata[2]);
+
+       if (ndead > 0)
+       {
+               rdata[2].data = (char *) nowdead;
+               rdata[2].len = ndead * sizeof(OffsetNumber);
+       }
+       else
+       {
+               rdata[2].data = NULL;
+               rdata[2].len = 0;
+       }
+       rdata[2].buffer = buffer;
+       rdata[2].buffer_std = true;
+       rdata[2].next = &(rdata[3]);
+
+       if (nunused > 0)
+       {
+               rdata[3].data = (char *) nowunused;
+               rdata[3].len = nunused * sizeof(OffsetNumber);
+       }
+       else
+       {
+               rdata[3].data = NULL;
+               rdata[3].len = 0;
+       }
+       rdata[3].buffer = buffer;
+       rdata[3].buffer_std = true;
+       rdata[3].next = NULL;
 
-       recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CLEAN, rdata);
+       info = redirect_move ? XLOG_HEAP2_CLEAN_MOVE : XLOG_HEAP2_CLEAN;
+       recptr = XLogInsert(RM_HEAP2_ID, info, rdata);
 
        return recptr;
 }
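
The heap-clean record assembled above packs three offset arrays back to back:
nredirected (from, to) pairs, then the now-dead offsets, then the now-unused
offsets; heap_xlog_clean reads them back in that order, using the nredirected
and ndead counts stored in xl_heap_clean and treating whatever remains as the
now-unused list.  A minimal standalone sketch of that layout (hypothetical
stand-in types, not the real xl_heap_clean/XLogRecData machinery):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint16_t OffsetNumber;          /* stand-in for the real typedef */

/* Pack the three arrays the way the heap-clean record body lays them out. */
static size_t
pack_clean_body(char *buf,
                const OffsetNumber *redirected, int nredirected,
                const OffsetNumber *nowdead, int ndead,
                const OffsetNumber *nowunused, int nunused)
{
    size_t      len = 0;

    /* redirected[] holds (from, to) pairs, hence two offsets per entry */
    memcpy(buf + len, redirected, nredirected * 2 * sizeof(OffsetNumber));
    len += nredirected * 2 * sizeof(OffsetNumber);

    memcpy(buf + len, nowdead, ndead * sizeof(OffsetNumber));
    len += ndead * sizeof(OffsetNumber);

    memcpy(buf + len, nowunused, nunused * sizeof(OffsetNumber));
    len += nunused * sizeof(OffsetNumber);

    return len;
}

int
main(void)
{
    OffsetNumber redirected[] = {1, 4};     /* item 1 now redirects to item 4 */
    OffsetNumber nowdead[] = {2};
    OffsetNumber nowunused[] = {3, 5};
    char         buf[64];

    /* one redirect pair (2 offsets) + 1 dead + 2 unused = 5 offsets -> 10 bytes */
    printf("record body is %zu bytes\n",
           pack_clean_body(buf, redirected, 1, nowdead, 1, nowunused, 2));
    return 0;
}
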
@@ -3293,8 +3686,6 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
 /*
  * Perform XLogInsert for a heap-freeze operation.  Caller must already
  * have modified the buffer and marked it dirty.
- *
- * Unlike log_heap_clean(), the offsets[] entries are one-based.
  */
 XLogRecPtr
 log_heap_freeze(Relation reln, Buffer buffer,
@@ -3363,17 +3754,28 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
        }                       xlhdr;
        int                     hsize = SizeOfHeapHeader;
        xl_heap_update xlrec;
+       uint8           info;
        XLogRecPtr      recptr;
        XLogRecData rdata[4];
        Page            page = BufferGetPage(newbuf);
-       uint8           info = (move) ? XLOG_HEAP_MOVE : XLOG_HEAP_UPDATE;
 
        /* Caller should not call me on a temp relation */
        Assert(!reln->rd_istemp);
 
+       if (move)
+       {
+               Assert(!HeapTupleIsHeapOnly(newtup));
+               info = XLOG_HEAP_MOVE;
+       }
+       else if (HeapTupleIsHeapOnly(newtup))
+               info = XLOG_HEAP_HOT_UPDATE;
+       else
+               info = XLOG_HEAP_UPDATE;
+
        xlrec.target.node = reln->rd_node;
        xlrec.target.tid = from;
        xlrec.newtid = newtup->t_self;
+
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfHeapUpdate;
        rdata[0].buffer = InvalidBuffer;
@@ -3489,13 +3891,21 @@ log_newpage(RelFileNode *rnode, BlockNumber blkno, Page page)
        return recptr;
 }
 
+/*
+ * Handles CLEAN and CLEAN_MOVE record types
+ */
 static void
-heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
+heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move)
 {
        xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
        Relation        reln;
        Buffer          buffer;
        Page            page;
+       OffsetNumber *offnum;
+       OffsetNumber *end;
+       int nredirected;
+       int ndead;
+       int i;
 
        if (record->xl_info & XLR_BKP_BLOCK_1)
                return;
@@ -3512,25 +3922,63 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
                return;
        }
 
-       if (record->xl_len > SizeOfHeapClean)
-       {
-               OffsetNumber *unused;
-               OffsetNumber *unend;
-               ItemId          lp;
+       nredirected = xlrec->nredirected;
+       ndead = xlrec->ndead;
+       offnum = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
+       end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
 
-               unused = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
-               unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+       /* Update all redirected or moved line pointers */
+       for (i = 0; i < nredirected; i++)
+       {
+               OffsetNumber fromoff = *offnum++;
+               OffsetNumber tooff = *offnum++;
+               ItemId  fromlp = PageGetItemId(page, fromoff);
 
-               while (unused < unend)
+               if (clean_move)
                {
-                       /* unused[] entries are zero-based */
-                       lp = PageGetItemId(page, *unused + 1);
-                       ItemIdSetUnused(lp);
-                       unused++;
+                       /* Physically move the "to" item to the "from" slot */
+                       ItemId  tolp = PageGetItemId(page, tooff);
+                       HeapTupleHeader htup;
+
+                       *fromlp = *tolp;
+                       ItemIdSetUnused(tolp);
+
+                       /* We also have to clear the tuple's heap-only bit */
+                       Assert(ItemIdIsNormal(fromlp));
+                       htup = (HeapTupleHeader) PageGetItem(page, fromlp);
+                       Assert(HeapTupleHeaderIsHeapOnly(htup));
+                       HeapTupleHeaderClearHeapOnly(htup);
+               }
+               else
+               {
+                       /* Just insert a REDIRECT link at fromoff */
+                       ItemIdSetRedirect(fromlp, tooff);
                }
        }
 
-       PageRepairFragmentation(page, NULL);
+       /* Update all now-dead line pointers */
+       for (i = 0; i < ndead; i++)
+       {
+               OffsetNumber off = *offnum++;
+               ItemId  lp = PageGetItemId(page, off);
+
+               ItemIdSetDead(lp);
+       }
+
+       /* Update all now-unused line pointers */
+       while (offnum < end)
+       {
+               OffsetNumber off = *offnum++;
+               ItemId  lp = PageGetItemId(page, off);
+
+               ItemIdSetUnused(lp);
+       }
+
+       /*
+        * Finally, repair any fragmentation, and update the page's hint bit
+        * about whether it has free line pointers.
+        */
+       PageRepairFragmentation(page);
 
        PageSetLSN(page, lsn);
        PageSetTLI(page, ThisTimeLineID);
@@ -3655,8 +4103,13 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
                                                  HEAP_XMAX_IS_MULTI |
                                                  HEAP_IS_LOCKED |
                                                  HEAP_MOVED);
+       HeapTupleHeaderClearHotUpdated(htup);
        HeapTupleHeaderSetXmax(htup, record->xl_xid);
        HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
+
+       /* Mark the page as a candidate for pruning */
+       PageSetPrunable(page);
+
        /* Make sure there is no forward chain link in t_ctid */
        htup->t_ctid = xlrec->target.tid;
        PageSetLSN(page, lsn);
@@ -3736,7 +4189,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
        HeapTupleHeaderSetCmin(htup, FirstCommandId);
        htup->t_ctid = xlrec->target.tid;
 
-       offnum = PageAddItem(page, (Item) htup, newlen, offnum, true);
+       offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
        if (offnum == InvalidOffsetNumber)
                elog(PANIC, "heap_insert_redo: failed to add tuple");
        PageSetLSN(page, lsn);
@@ -3746,10 +4199,10 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
 }
 
 /*
- * Handles UPDATE & MOVE
+ * Handles UPDATE, HOT_UPDATE & MOVE
  */
 static void
-heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
+heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update)
 {
        xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
        Relation        reln = XLogOpenRelation(xlrec->target.node);
@@ -3808,6 +4261,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
                                                          HEAP_XMIN_INVALID |
                                                          HEAP_MOVED_IN);
                htup->t_infomask |= HEAP_MOVED_OFF;
+               HeapTupleHeaderClearHotUpdated(htup);
                HeapTupleHeaderSetXvac(htup, record->xl_xid);
                /* Make sure there is no forward chain link in t_ctid */
                htup->t_ctid = xlrec->target.tid;
@@ -3819,12 +4273,19 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
                                                          HEAP_XMAX_IS_MULTI |
                                                          HEAP_IS_LOCKED |
                                                          HEAP_MOVED);
+               if (hot_update)
+                       HeapTupleHeaderSetHotUpdated(htup);
+               else
+                       HeapTupleHeaderClearHotUpdated(htup);
                HeapTupleHeaderSetXmax(htup, record->xl_xid);
                HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
                /* Set forward chain link in t_ctid */
                htup->t_ctid = xlrec->newtid;
        }
 
+       /* Mark the page as a candidate for pruning */
+       PageSetPrunable(page);
+
        /*
         * this test is ugly, but necessary to avoid thinking that insert change
         * is already applied
@@ -3914,7 +4375,7 @@ newsame:;
        /* Make sure there is no forward chain link in t_ctid */
        htup->t_ctid = xlrec->newtid;
 
-       offnum = PageAddItem(page, (Item) htup, newlen, offnum, true);
+       offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
        if (offnum == InvalidOffsetNumber)
                elog(PANIC, "heap_update_redo: failed to add tuple");
        PageSetLSN(page, lsn);
@@ -3971,6 +4432,7 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
                htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
        else
                htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
+       HeapTupleHeaderClearHotUpdated(htup);
        HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
        HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
        /* Make sure there is no forward chain link in t_ctid */
@@ -4039,25 +4501,35 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record)
 {
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
 
-       info &= XLOG_HEAP_OPMASK;
-       if (info == XLOG_HEAP_INSERT)
-               heap_xlog_insert(lsn, record);
-       else if (info == XLOG_HEAP_DELETE)
-               heap_xlog_delete(lsn, record);
-       else if (info == XLOG_HEAP_UPDATE)
-               heap_xlog_update(lsn, record, false);
-       else if (info == XLOG_HEAP_MOVE)
-               heap_xlog_update(lsn, record, true);
-       else if (info == XLOG_HEAP_CLEAN)
-               heap_xlog_clean(lsn, record);
-       else if (info == XLOG_HEAP_NEWPAGE)
-               heap_xlog_newpage(lsn, record);
-       else if (info == XLOG_HEAP_LOCK)
-               heap_xlog_lock(lsn, record);
-       else if (info == XLOG_HEAP_INPLACE)
-               heap_xlog_inplace(lsn, record);
-       else
-               elog(PANIC, "heap_redo: unknown op code %u", info);
+       switch (info & XLOG_HEAP_OPMASK)
+       {
+               case XLOG_HEAP_INSERT:
+                       heap_xlog_insert(lsn, record);
+                       break;
+               case XLOG_HEAP_DELETE:
+                       heap_xlog_delete(lsn, record);
+                       break;
+               case XLOG_HEAP_UPDATE:
+                       heap_xlog_update(lsn, record, false, false);
+                       break;
+               case XLOG_HEAP_MOVE:
+                       heap_xlog_update(lsn, record, true, false);
+                       break;
+               case XLOG_HEAP_HOT_UPDATE:
+                       heap_xlog_update(lsn, record, false, true);
+                       break;
+               case XLOG_HEAP_NEWPAGE:
+                       heap_xlog_newpage(lsn, record);
+                       break;
+               case XLOG_HEAP_LOCK:
+                       heap_xlog_lock(lsn, record);
+                       break;
+               case XLOG_HEAP_INPLACE:
+                       heap_xlog_inplace(lsn, record);
+                       break;
+               default:
+                       elog(PANIC, "heap_redo: unknown op code %u", info);
+       }
 }
 
 void
@@ -4065,11 +4537,20 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
 {
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
 
-       info &= XLOG_HEAP_OPMASK;
-       if (info == XLOG_HEAP2_FREEZE)
-               heap_xlog_freeze(lsn, record);
-       else
-               elog(PANIC, "heap2_redo: unknown op code %u", info);
+       switch (info & XLOG_HEAP_OPMASK)
+       {
+               case XLOG_HEAP2_FREEZE:
+                       heap_xlog_freeze(lsn, record);
+                       break;
+               case XLOG_HEAP2_CLEAN:
+                       heap_xlog_clean(lsn, record, false);
+                       break;
+               case XLOG_HEAP2_CLEAN_MOVE:
+                       heap_xlog_clean(lsn, record, true);
+                       break;
+               default:
+                       elog(PANIC, "heap2_redo: unknown op code %u", info);
+       }
 }
 
 static void
@@ -4130,13 +4611,18 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
                                                 ItemPointerGetBlockNumber(&(xlrec->newtid)),
                                                 ItemPointerGetOffsetNumber(&(xlrec->newtid)));
        }
-       else if (info == XLOG_HEAP_CLEAN)
+       else if (info == XLOG_HEAP_HOT_UPDATE)
        {
-               xl_heap_clean *xlrec = (xl_heap_clean *) rec;
+               xl_heap_update *xlrec = (xl_heap_update *) rec;
 
-               appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
-                                                xlrec->node.spcNode, xlrec->node.dbNode,
-                                                xlrec->node.relNode, xlrec->block);
+               if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? */
+                       appendStringInfo(buf, "hot_update(init): ");
+               else
+                       appendStringInfo(buf, "hot_update: ");
+               out_target(buf, &(xlrec->target));
+               appendStringInfo(buf, "; new %u/%u",
+                                                ItemPointerGetBlockNumber(&(xlrec->newtid)),
+                                                ItemPointerGetOffsetNumber(&(xlrec->newtid)));
        }
        else if (info == XLOG_HEAP_NEWPAGE)
        {
@@ -4187,6 +4673,22 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
                                                 xlrec->node.relNode, xlrec->block,
                                                 xlrec->cutoff_xid);
        }
+       else if (info == XLOG_HEAP2_CLEAN)
+       {
+               xl_heap_clean *xlrec = (xl_heap_clean *) rec;
+
+               appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
+                                                xlrec->node.spcNode, xlrec->node.dbNode,
+                                                xlrec->node.relNode, xlrec->block);
+       }
+       else if (info == XLOG_HEAP2_CLEAN_MOVE)
+       {
+               xl_heap_clean *xlrec = (xl_heap_clean *) rec;
+
+               appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u",
+                                                xlrec->node.spcNode, xlrec->node.dbNode,
+                                                xlrec->node.relNode, xlrec->block);
+       }
        else
                appendStringInfo(buf, "UNKNOWN");
 }
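
Stepping back from the heapam.c hunks: the new HeapSatisfiesHOTUpdate() /
heap_tuple_attr_equals() logic boils down to "the update is HOT-safe only if
no indexed column changed value", with NULLs compared as described in the
comments above.  A minimal standalone sketch of that decision, using
hypothetical string rows and a plain list of indexed column numbers in place
of tuples, descriptors, and the hot_attrs bitmap:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NCOLS 4

/* Treat two NULLs as equal, a NULL and a non-NULL as unequal, and compare
 * the rest byte-wise, mirroring the strict binary comparison above. */
static bool
column_equal(const char *a, const char *b)
{
    if ((a == NULL) != (b == NULL))
        return false;
    if (a == NULL)
        return true;
    return strcmp(a, b) == 0;
}

/* Returns true only if none of the indexed columns changed value. */
static bool
update_is_hot_safe(const char *oldrow[NCOLS], const char *newrow[NCOLS],
                   const int *indexed_cols, int nindexed)
{
    for (int i = 0; i < nindexed; i++)
    {
        int col = indexed_cols[i];

        if (!column_equal(oldrow[col], newrow[col]))
            return false;
    }
    return true;
}

int
main(void)
{
    const char *oldrow[NCOLS] = {"42", "alice", "2007-09-20", "active"};
    const char *newrow[NCOLS] = {"42", "alice", "2007-09-20", "retired"};
    int         indexed_cols[] = {0, 1};    /* only columns 0 and 1 are indexed */

    /* Only a non-indexed column changed, so the update could be HOT. */
    printf("HOT-safe: %s\n",
           update_is_hot_safe(oldrow, newrow, indexed_cols, 2) ? "yes" : "no");
    return 0;
}
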
index fa8a2afbba49834b552fba75d6e434cea28234ff..1d48879b858826b254e29df7211b766fedaa68cd 100644 (file)
@@ -41,7 +41,7 @@ RelationPutHeapTuple(Relation relation,
        pageHeader = BufferGetPage(buffer);
 
        offnum = PageAddItem(pageHeader, (Item) tuple->t_data,
-                                                tuple->t_len, InvalidOffsetNumber, false);
+                                                tuple->t_len, InvalidOffsetNumber, false, true);
 
        if (offnum == InvalidOffsetNumber)
                elog(PANIC, "failed to add tuple to page");
@@ -218,7 +218,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
                 * we're done.
                 */
                pageHeader = (Page) BufferGetPage(buffer);
-               pageFreeSpace = PageGetFreeSpace(pageHeader);
+               pageFreeSpace = PageGetHeapFreeSpace(pageHeader);
                if (len + saveFreeSpace <= pageFreeSpace)
                {
                        /* use this page as future insert target, too */
@@ -311,7 +311,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
 
        PageInit(pageHeader, BufferGetPageSize(buffer), 0);
 
-       if (len > PageGetFreeSpace(pageHeader))
+       if (len > PageGetHeapFreeSpace(pageHeader))
        {
                /* We should not get here given the test at the top */
                elog(PANIC, "tuple is too big: size %lu", (unsigned long) len);
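
The hio.c changes above make RelationGetBufferForTuple() measure free space
with PageGetHeapFreeSpace(), and its "len + saveFreeSpace <= pageFreeSpace"
test is what enforces the fillfactor reservation when picking a target page.
A minimal arithmetic sketch of that test (the 8 KB BLCKSZ and the sample
numbers are illustrative assumptions, not values taken from this patch):

#include <stdbool.h>
#include <stdio.h>

#define BLCKSZ 8192

/* Space the fillfactor setting asks us to keep free for future updates. */
static size_t
target_free_space(int fillfactor)
{
    return (size_t) (BLCKSZ * (100 - fillfactor) / 100);
}

/* The page is usable only if the tuple fits without eating the reservation. */
static bool
tuple_fits(size_t tuple_len, size_t page_free, int fillfactor)
{
    return tuple_len + target_free_space(fillfactor) <= page_free;
}

int
main(void)
{
    /* With fillfactor 90, roughly 819 bytes stay reserved on each page. */
    printf("200-byte tuple, 1200 bytes free: %s\n",
           tuple_fits(200, 1200, 90) ? "fits" : "look elsewhere");
    printf("200-byte tuple, 900 bytes free:  %s\n",
           tuple_fits(200, 900, 90) ? "fits" : "look elsewhere");
    return 0;
}
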
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
new file mode 100644 (file)
index 0000000..85cc4e7
--- /dev/null
@@ -0,0 +1,702 @@
+/*-------------------------------------------------------------------------
+ *
+ * pruneheap.c
+ *       heap page pruning and HOT-chain management code
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *       $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/transam.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "utils/inval.h"
+
+
+/* Local functions */
+static int     heap_prune_chain(Relation relation, Buffer buffer,
+                                                        OffsetNumber rootoffnum,
+                                                        TransactionId OldestXmin,
+                                                        OffsetNumber *redirected, int *nredirected,
+                                                        OffsetNumber *nowdead, int *ndead,
+                                                        OffsetNumber *nowunused, int *nunused,
+                                                        bool redirect_move);
+static void heap_prune_record_redirect(OffsetNumber *redirected,
+                       int *nredirected,
+                       OffsetNumber offnum,
+                       OffsetNumber rdoffnum);
+static void heap_prune_record_dead(OffsetNumber *nowdead, int *ndead,
+                       OffsetNumber offnum);
+static void heap_prune_record_unused(OffsetNumber *nowunused, int *nunused,
+                       OffsetNumber offnum);
+
+
+/*
+ * Optionally prune and repair fragmentation in the specified page.
+ *
+ * This is an opportunistic function.  It will perform housekeeping
+ * only if the page heuristically looks like a candidate for pruning and we
+ * can acquire the buffer cleanup lock without blocking.
+ *
+ * Note: this is called quite often.  It's important that it fall out quickly
+ * if there's not any use in pruning.
+ *
+ * Caller must have pin on the buffer, and must *not* have a lock on it.
+ *
+ * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
+ * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
+ */
+void
+heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
+{
+       PageHeader      dp = (PageHeader) BufferGetPage(buffer);
+       Size            minfree;
+
+       /*
+        * Let's see if we really need pruning.
+        *
+        * Forget it if the page is not hinted to contain anything prunable.
+        */
+       if (!PageIsPrunable(dp))
+               return;
+
+       /*
+        * We prune when a previous UPDATE failed to find enough space on the
+        * page for a new tuple version, or when free space falls below the
+        * relation's fill-factor target (but not less than 10%).
+        *
+        * Checking free space here is questionable since we aren't holding
+        * any lock on the buffer; in the worst case we could get a bogus
+        * answer.  It's unlikely to be *seriously* wrong, though, since
+        * reading either pd_lower or pd_upper is probably atomic.  Avoiding
+        * taking a lock seems better than sometimes getting a wrong answer
+        * in what is after all just a heuristic estimate.
+        */
+       minfree = RelationGetTargetPageFreeSpace(relation,
+                                                                                        HEAP_DEFAULT_FILLFACTOR);
+       minfree = Max(minfree, BLCKSZ / 10);
+
+       if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree)
+       {
+               /* OK, try to get exclusive buffer lock */
+               if (!ConditionalLockBufferForCleanup(buffer))
+                       return;
+
+               /*
+                * Now that we have buffer lock, get accurate information about the
+                * page's free space, and recheck the heuristic about whether to prune.
+                */
+               if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree)
+               {
+                       /* OK to prune (though not to remove redirects) */
+                       (void) heap_page_prune(relation, buffer, OldestXmin, false, true);
+               }
+
+               /* And release buffer lock */
+               LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+       }
+}
+
+
+/*
+ * Prune and repair fragmentation in the specified page.
+ *
+ * Caller must have pin and buffer cleanup lock on the page.
+ *
+ * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
+ * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
+ *
+ * If redirect_move is set, we remove redirecting line pointers by
+ * updating the root line pointer to point directly to the first non-dead
+ * tuple in the chain.  NOTE: eliminating the redirect changes the first
+ * tuple's effective CTID, and is therefore unsafe except within VACUUM FULL.
+ * The only reason we support this capability at all is that by using it,
+ * VACUUM FULL need not cope with LP_REDIRECT items at all; which seems a
+ * good thing since VACUUM FULL is overly complicated already.
+ *
+ * If report_stats is true then we send the number of reclaimed heap-only
+ * tuples to pgstats.  (This must be FALSE during vacuum, since vacuum will
+ * send its own new total to pgstats, and we don't want this delta applied
+ * on top of that.)
+ *
+ * Returns the number of tuples deleted from the page.
+ */
+int
+heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
+                               bool redirect_move, bool report_stats)
+{
+       int                             ndeleted = 0;
+       Page                    page = BufferGetPage(buffer);
+       OffsetNumber    offnum,
+                                       maxoff;
+       OffsetNumber    redirected[MaxHeapTuplesPerPage * 2];
+       OffsetNumber    nowdead[MaxHeapTuplesPerPage];
+       OffsetNumber    nowunused[MaxHeapTuplesPerPage];
+       int                             nredirected = 0;
+       int                             ndead = 0;
+       int                             nunused = 0;
+
+       START_CRIT_SECTION();
+
+       /*
+        * Mark the page as clear of prunable tuples. If we find a tuple which
+        * may soon become prunable, we shall set the hint again.  Also clear
+        * the "page is full" flag, since there's no point in repeating the
+        * prune/defrag process until something else happens to the page.
+        */
+       PageClearPrunable(page);
+       PageClearFull(page);
+
+       /* Scan the page */
+       maxoff = PageGetMaxOffsetNumber(page);
+       for (offnum = FirstOffsetNumber;
+                offnum <= maxoff;
+                offnum = OffsetNumberNext(offnum))
+       {
+               ItemId itemid = PageGetItemId(page, offnum);
+
+               /* Nothing to do if slot is empty or already dead */
+               if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
+                       continue;
+
+               /* Process this item or chain of items */
+               ndeleted += heap_prune_chain(relation, buffer, offnum,
+                                                                        OldestXmin,
+                                                                        redirected, &nredirected,
+                                                                        nowdead, &ndead,
+                                                                        nowunused, &nunused,
+                                                                        redirect_move);
+       }
+
+       /* Have we pruned any items? */
+       if (nredirected > 0 || ndead > 0 || nunused > 0)
+       {
+               /*
+                * Repair page fragmentation, and update the page's hint bit about
+                * whether it has free line pointers.
+                */
+               PageRepairFragmentation((Page) page);
+
+               MarkBufferDirty(buffer);
+
+               /*
+                * Emit a WAL HEAP_CLEAN or HEAP_CLEAN_MOVE record showing what we did
+                */
+               if (!relation->rd_istemp)
+               {
+                       XLogRecPtr      recptr;
+
+                       recptr = log_heap_clean(relation, buffer,
+                                                                       redirected, nredirected,
+                                                                       nowdead, ndead,
+                                                                       nowunused, nunused,
+                                                                       redirect_move);
+                       PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
+                       PageSetLSN(BufferGetPage(buffer), recptr);
+               }
+       }
+
+       END_CRIT_SECTION();
+
+       /*
+        * If requested, report the number of tuples reclaimed to pgstats.
+        * This is ndeleted minus ndead, because we don't want to count a now-DEAD
+        * root item as a deletion for this purpose.
+        */
+       if (report_stats && ndeleted > ndead)
+               pgstat_update_heap_dead_tuples(relation, ndeleted - ndead);
+
+       /*
+        * XXX Should we update the FSM information of this page ?
+        *
+        * There are two schools of thought here. We may not want to update
+        * FSM information so that the page is not used for unrelated
+        * UPDATEs/INSERTs and any free space in this page will remain
+        * available for further UPDATEs in *this* page, thus improving
+        * chances for doing HOT updates.
+        *
+        * But for a large table and where a page does not receive further
+        * UPDATEs for a long time, we might waste this space by not
+        * updating the FSM information. The relation may get extended and
+        * fragmented further.
+        *
+        * One possibility is to leave "fillfactor" worth of space in this
+        * page and update FSM with the remaining space.
+        *
+        * In any case, the current FSM implementation doesn't accept
+        * one-page-at-a-time updates, so this is all academic for now.
+        */
+
+       return ndeleted;
+}
+
+
+/*
+ * Prune specified item pointer or a HOT chain originating at that item.
+ *
+ * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
+ * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
+ * chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
+ * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
+ * DEAD; the OldestXmin test is just too coarse to detect it.
+ *
+ * The root line pointer is redirected to the tuple immediately after the
+ * latest DEAD tuple.  If all tuples in the chain are DEAD, the root line
+ * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
+ * tuple, which we treat as a chain of length 1.)
+ *
+ * OldestXmin is the cutoff XID used to identify dead tuples.
+ *
+ * Redirected items are added to the redirected[] array (two entries per
+ * redirection); items set to LP_DEAD state are added to nowdead[]; and
+ * items set to LP_UNUSED state are added to nowunused[].  (These arrays
+ * will be used to generate a WAL record after all chains are pruned.)
+ *
+ * If redirect_move is true, we get rid of redirecting line pointers.
+ *
+ * Returns the number of tuples deleted from the page.
+ */
+static int
+heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
+                                TransactionId OldestXmin,
+                                OffsetNumber *redirected, int *nredirected,
+                                OffsetNumber *nowdead, int *ndead,
+                                OffsetNumber *nowunused, int *nunused,
+                                bool redirect_move)
+{
+       int                             ndeleted = 0;
+       Page                    dp = (Page) BufferGetPage(buffer);
+       TransactionId   priorXmax = InvalidTransactionId;
+       ItemId                  rootlp;
+       HeapTupleHeader htup;
+       OffsetNumber    latestdead = InvalidOffsetNumber,
+                                       maxoff = PageGetMaxOffsetNumber(dp),
+                                       offnum;
+       OffsetNumber    chainitems[MaxHeapTuplesPerPage];
+       int                             nchain = 0,
+                                       i;
+
+       rootlp = PageGetItemId(dp, rootoffnum);
+
+       /*
+        * If it's a heap-only tuple, then it is not the start of a HOT chain.
+        */
+       if (ItemIdIsNormal(rootlp))
+       {
+               htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
+               if (HeapTupleHeaderIsHeapOnly(htup))
+               {
+                       /*
+                        * If the tuple is DEAD and doesn't chain to anything else, mark it
+                        * unused immediately.  (If it does chain, we can only remove it as
+                        * part of pruning its chain.)
+                        *
+                        * We need this primarily to handle aborted HOT updates, that is,
+                        * XMIN_INVALID heap-only tuples.  Those might not be linked to
+                        * by any chain, since the parent tuple might be re-updated before
+                        * any pruning occurs.  So we have to be able to reap them
+                        * separately from chain-pruning.
+                        *
+                        * Note that we might first arrive at a dead heap-only tuple
+                        * either here or while following a chain below.  Whichever path
+                        * gets there first will mark the tuple unused.
+                        */
+                       if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer)
+                               == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
+                       {
+                               ItemIdSetUnused(rootlp);
+                               heap_prune_record_unused(nowunused, nunused, rootoffnum);
+                               ndeleted++;
+                       }
+
+                       /* Nothing more to do */
+                       return ndeleted;
+               }
+       }
+
+       /* Start from the root tuple */
+       offnum = rootoffnum;
+
+       /* while not end of the chain */
+       for (;;)
+       {
+               ItemId                  lp;
+               bool                    tupdead,
+                                               recent_dead;
+
+               /* Some sanity checks */
+               if (offnum < FirstOffsetNumber || offnum > maxoff)
+                       break;
+
+               lp = PageGetItemId(dp, offnum);
+
+               if (!ItemIdIsUsed(lp))
+                       break;
+
+               /*
+                * If we are looking at the redirected root line pointer,
+                * jump to the first normal tuple in the chain.  If we find
+                * a redirect somewhere else, stop --- it must not be the same chain.
+                */
+               if (ItemIdIsRedirected(lp))
+               {
+                       if (nchain > 0)
+                               break;                  /* not at start of chain */
+                       chainitems[nchain++] = offnum;
+                       offnum = ItemIdGetRedirect(rootlp);
+                       continue;
+               }
+
+               /*
+                * Likewise, a dead item pointer can't be part of the chain.
+                * (We already eliminated the case of a dead root tuple outside
+                * this function.)
+                */
+               if (ItemIdIsDead(lp))
+                       break;
+
+               Assert(ItemIdIsNormal(lp));
+               htup = (HeapTupleHeader) PageGetItem(dp, lp);
+
+               /*
+                * Check the tuple XMIN against prior XMAX, if any
+                */
+               if (TransactionIdIsValid(priorXmax) &&
+                       !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
+                       break;
+
+               /*
+                * OK, this tuple is indeed a member of the chain.
+                */
+               chainitems[nchain++] = offnum;
+
+               /*
+                * Check tuple's visibility status.
+                */
+               tupdead = recent_dead = false;
+
+               switch (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer))
+               {
+                       case HEAPTUPLE_DEAD:
+                               tupdead = true;
+                               break;
+
+                       case HEAPTUPLE_RECENTLY_DEAD:
+                               recent_dead = true;
+                               /*
+                                * This tuple may soon become DEAD.  Re-set the hint bit so
+                                * that the page is reconsidered for pruning in the future.
+                                */
+                               PageSetPrunable(dp);
+                               break;
+
+                       case HEAPTUPLE_DELETE_IN_PROGRESS:
+                               /*
+                                * This tuple may soon become DEAD.  Re-set the hint bit so
+                                * that the page is reconsidered for pruning in the future.
+                                */
+                               PageSetPrunable(dp);
+                               break;
+
+                       case HEAPTUPLE_LIVE:
+                       case HEAPTUPLE_INSERT_IN_PROGRESS:
+                               /*
+                                * If we wanted to optimize for aborts, we might consider
+                                * marking the page prunable when we see INSERT_IN_PROGRESS.
+                                * But we don't.  See related decisions about when to mark
+                                * the page prunable in heapam.c.
+                                */
+                               break;
+
+                       default:
+                               elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+                               break;
+               }
+
+               /*
+                * Remember the last DEAD tuple seen.  We will advance past
+                * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
+                * but we can't advance past anything else.  (XXX is it really worth
+                * continuing to scan beyond RECENTLY_DEAD?  The case where we will
+                * find another DEAD tuple is a fairly unusual corner case.)
+                */
+               if (tupdead)
+                       latestdead = offnum;
+               else if (!recent_dead)
+                       break;
+
+               /*
+                * If the tuple is not HOT-updated, then we are at the end of this
+                * HOT-update chain.
+                */
+               if (!HeapTupleHeaderIsHotUpdated(htup))
+                       break;
+
+               /*
+                * Advance to next chain member.
+                */
+               Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
+                          BufferGetBlockNumber(buffer));
+               offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+               priorXmax = HeapTupleHeaderGetXmax(htup);
+       }
+
+       /*
+        * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
+        * the DEAD tuples at the start of the chain are removed and the root line
+        * pointer is appropriately redirected.
+        */
+       if (OffsetNumberIsValid(latestdead))
+       {
+               /*
+                * Mark as unused each intermediate item that we are able to remove
+                * from the chain.
+                *
+                * When the previous item is the last dead tuple seen, we are at
+                * the right candidate for redirection.
+                */
+               for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
+               {
+                       ItemId lp = PageGetItemId(dp, chainitems[i]);
+
+                       ItemIdSetUnused(lp);
+                       heap_prune_record_unused(nowunused, nunused, chainitems[i]);
+                       ndeleted++;
+               }
+
+               /*
+                * If the root entry had been a normal tuple, we are deleting it,
+                * so count it in the result.  But changing a redirect (even to
+                * DEAD state) doesn't count.
+                */
+               if (ItemIdIsNormal(rootlp))
+                       ndeleted++;
+
+               /*
+                * If the DEAD tuple is at the end of the chain, the entire chain is
+                * dead and the root line pointer can be marked dead.  Otherwise
+                * just redirect the root to the correct chain member.
+                */
+               if (i >= nchain)
+               {
+                       ItemIdSetDead(rootlp);
+                       heap_prune_record_dead(nowdead, ndead, rootoffnum);
+               }
+               else
+               {
+                       ItemIdSetRedirect(rootlp, chainitems[i]);
+                       heap_prune_record_redirect(redirected, nredirected,
+                                                                          rootoffnum,
+                                                                          chainitems[i]);
+               }
+       }
+       else if (nchain < 2 && ItemIdIsRedirected(rootlp))
+       {
+               /*
+                * We found a redirect item that doesn't point to a valid follow-on
+                * item.  This can happen if the loop in heap_page_prune caused us
+                * to visit the dead successor of a redirect item before visiting
+                * the redirect item.  We can clean up by setting the redirect item
+                * to DEAD state.
+                */
+               ItemIdSetDead(rootlp);
+               heap_prune_record_dead(nowdead, ndead, rootoffnum);
+       }
+
+       /*
+        * If requested, eliminate LP_REDIRECT items by moving tuples.  Note that
+        * if the root item is LP_REDIRECT and doesn't point to a valid follow-on
+        * item, we already killed it above.
+        */
+       if (redirect_move && ItemIdIsRedirected(rootlp))
+       {
+               OffsetNumber firstoffnum = ItemIdGetRedirect(rootlp);
+               ItemId firstlp = PageGetItemId(dp, firstoffnum);
+               HeapTupleData   firsttup;
+
+               Assert(ItemIdIsNormal(firstlp));
+               /* Set up firsttup to reference the tuple at its existing CTID */
+               firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp);
+               firsttup.t_len = ItemIdGetLength(firstlp);
+               ItemPointerSet(&firsttup.t_self,
+                                          BufferGetBlockNumber(buffer),
+                                          firstoffnum);
+               firsttup.t_tableOid = RelationGetRelid(relation);
+
+               /*
+                * Mark the tuple for invalidation.  Needed because we're changing
+                * its CTID.
+                */
+               CacheInvalidateHeapTuple(relation, &firsttup);
+
+               /*
+                * Change heap-only status of the tuple because after the line
+                * pointer manipulation, it's no longer a heap-only tuple, but is
+                * directly pointed to by index entries.
+                */
+               Assert(HeapTupleIsHeapOnly(&firsttup));
+               HeapTupleClearHeapOnly(&firsttup);
+
+               /* Now move the item pointer */
+               *rootlp = *firstlp;
+               ItemIdSetUnused(firstlp);
+
+               /*
+                * If latestdead is valid, we have already recorded the redirection
+                * above.  Otherwise, do it now.
+                *
+                * We don't record firstlp in the nowunused[] array, since the
+                * redirection entry is enough to tell heap_xlog_clean what to do.
+                */
+               if (!OffsetNumberIsValid(latestdead))
+                       heap_prune_record_redirect(redirected, nredirected, rootoffnum,
+                                                                          firstoffnum);
+       }
+
+       return ndeleted;
+}
+
+
+/* Record newly-redirected item pointer */
+static void
+heap_prune_record_redirect(OffsetNumber *redirected, int *nredirected,
+                       OffsetNumber offnum, OffsetNumber rdoffnum)
+{
+       Assert(*nredirected < MaxHeapTuplesPerPage);
+       redirected[*nredirected * 2] = offnum;
+       redirected[*nredirected * 2 + 1] = rdoffnum;
+       (*nredirected)++;
+}
+
+/* Record newly-dead item pointer */
+static void
+heap_prune_record_dead(OffsetNumber *nowdead, int *ndead,
+                                          OffsetNumber offnum)
+{
+       Assert(*ndead < MaxHeapTuplesPerPage);
+       nowdead[*ndead] = offnum;
+       (*ndead)++;
+}
+
+/* Record newly-unused item pointer */
+static void
+heap_prune_record_unused(OffsetNumber *nowunused, int *nunused,
+                                                OffsetNumber offnum)
+{
+       Assert(*nunused < MaxHeapTuplesPerPage);
+       nowunused[*nunused] = offnum;
+       (*nunused)++;
+}
+
+
+/*
+ * For all items in this page, find their respective root line pointers.
+ * If item k is part of a HOT-chain with root at item j, then we set
+ * root_offsets[k - 1] = j.
+ *
+ * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
+ * We zero out all unused entries.
+ *
+ * The function must be called with at least share lock on the buffer, to
+ * prevent concurrent prune operations.
+ *
+ * Note: The information collected here is valid only as long as the caller
+ * holds a pin on the buffer.  Once the pin is released, a tuple might be
+ * pruned and reused by a completely unrelated tuple.
+ */
+void
+heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
+{
+       OffsetNumber    offnum, maxoff;
+
+       MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
+
+       maxoff = PageGetMaxOffsetNumber(page);
+       for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++)
+       {
+               ItemId                  lp = PageGetItemId(page, offnum);
+               HeapTupleHeader htup;
+               OffsetNumber    nextoffnum;
+               TransactionId   priorXmax;
+
+               /* skip unused and dead items */
+               if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
+                       continue;
+
+               if (ItemIdIsNormal(lp))
+               {
+                       htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+                       /*
+                        * Check if this tuple is part of a HOT-chain rooted at some other
+                        * tuple. If so, skip it for now; we'll process it when we find
+                        * its root.
+                        */
+                       if (HeapTupleHeaderIsHeapOnly(htup))
+                               continue;
+
+                       /*
+                        * This is either a plain tuple or the root of a HOT-chain.
+                        * Remember it in the mapping.
+                        */
+                       root_offsets[offnum - 1] = offnum;
+
+                       /* If it's not the start of a HOT-chain, we're done with it */
+                       if (!HeapTupleHeaderIsHotUpdated(htup))
+                               continue;
+
+                       /* Set up to scan the HOT-chain */
+                       nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+                       priorXmax = HeapTupleHeaderGetXmax(htup);
+               }
+               else
+               {
+                       /* Must be a redirect item. We do not set its root_offsets entry */
+                       Assert(ItemIdIsRedirected(lp));
+                       /* Set up to scan the HOT-chain */
+                       nextoffnum = ItemIdGetRedirect(lp);
+                       priorXmax = InvalidTransactionId;
+               }
+
+               /*
+                * Now follow the HOT-chain and collect other tuples in the chain.
+                *
+                * Note: Even though this is a nested loop, the overall cost is O(N),
+                * because each tuple on the page is visited at most twice: once by the
+                * outer loop and once while chasing a HOT-chain.
+                */
+               for (;;)
+               {
+                       lp = PageGetItemId(page, nextoffnum);
+
+                       /* Check for broken chains */
+                       if (!ItemIdIsNormal(lp))
+                               break;
+
+                       htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+                       if (TransactionIdIsValid(priorXmax) &&
+                               !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
+                               break;
+
+                       /* Remember the root line pointer for this item */
+                       root_offsets[nextoffnum - 1] = offnum;
+
+                       /* Advance to next chain member, if any */
+                       if (!HeapTupleHeaderIsHotUpdated(htup))
+                               break;
+
+                       nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+                       priorXmax = HeapTupleHeaderGetXmax(htup);
+               }
+       }
+}
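
heap_get_root_tuples() gives callers the mapping needed to index heap-only tuples under their chain root's TID.  A minimal usage sketch, mirroring what IndexBuildHeapScan does later in this patch (buf and tuple stand for a pinned buffer and a tuple read from it):

    OffsetNumber    root_offsets[MaxHeapTuplesPerPage];
    ItemPointerData root_tid;
    OffsetNumber    offnum;

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    heap_get_root_tuples(BufferGetPage(buf), root_offsets);
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    /* Remap a heap-only tuple's TID onto its HOT-chain root */
    root_tid = tuple->t_self;
    offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
    if (HeapTupleIsHeapOnly(tuple))
        ItemPointerSetOffsetNumber(&root_tid, root_offsets[offnum - 1]);
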
index e4578ea3aa29c9ee873ceb522c08b6a1b6231d2e..18647c883a3d9f35fdf19a4d8b5b09788e66a035 100644 (file)
@@ -320,12 +320,14 @@ rewrite_heap_tuple(RewriteState state,
         * Copy the original tuple's visibility information into new_tuple.
         *
         * XXX we might later need to copy some t_infomask2 bits, too?
+        * Right now, we intentionally clear the HOT status bits.
         */
        memcpy(&new_tuple->t_data->t_choice.t_heap,
                   &old_tuple->t_data->t_choice.t_heap,
                   sizeof(HeapTupleFields));
 
        new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK;
+       new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK;
        new_tuple->t_data->t_infomask |=
                old_tuple->t_data->t_infomask & HEAP_XACT_MASK;
 
@@ -593,7 +595,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
        /* Now we can check to see if there's enough free space already. */
        if (state->rs_buffer_valid)
        {
-               pageFreeSpace = PageGetFreeSpace(page);
+               pageFreeSpace = PageGetHeapFreeSpace(page);
 
                if (len + saveFreeSpace > pageFreeSpace)
                {
@@ -628,7 +630,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 
        /* And now we can insert the tuple into the page */
        newoff = PageAddItem(page, (Item) heaptup->t_data, len,
-                                                InvalidOffsetNumber, false);
+                                                InvalidOffsetNumber, false, true);
        if (newoff == InvalidOffsetNumber)
                elog(ERROR, "failed to add tuple");
 
index 136a69c8ee624a97e01cc7656104bb8614a256ae..c25de341b3050fa56087dc4cd327b1a3d9444652 100644 (file)
@@ -21,6 +21,7 @@
 
 #include "access/genam.h"
 #include "access/heapam.h"
+#include "access/transam.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 
@@ -95,6 +96,9 @@ RelationGetIndexScan(Relation indexRelation,
        ItemPointerSetInvalid(&scan->xs_ctup.t_self);
        scan->xs_ctup.t_data = NULL;
        scan->xs_cbuf = InvalidBuffer;
+       scan->xs_prev_xmax = InvalidTransactionId;
+       scan->xs_next_hot = InvalidOffsetNumber;
+       scan->xs_hot_dead = false;
 
        /*
         * Let the AM fill in the key and any opaque data it wants.
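
The three initializations above rely on scan-state fields that this patch adds to IndexScanDescData in access/relscan.h (not shown in this excerpt).  Judging from their use in index_getnext() below, the additions presumably look roughly like this:

    /* assumed additions to struct IndexScanDescData */
    TransactionId xs_prev_xmax;  /* xmax of previous HOT-chain member, for chain validation */
    OffsetNumber  xs_next_hot;   /* next HOT-chain member to visit, or InvalidOffsetNumber */
    bool          xs_hot_dead;   /* true if all chain members seen so far are dead */
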
index 1074ab237da75992e62eb1086890ff6b8243a68f..7920e14c3d93914fe3fa56e64db0d2c101510f43 100644 (file)
@@ -64,6 +64,7 @@
 
 #include "access/genam.h"
 #include "access/heapam.h"
+#include "access/transam.h"
 #include "pgstat.h"
 #include "utils/relcache.h"
 
@@ -313,6 +314,8 @@ index_rescan(IndexScanDesc scan, ScanKey key)
                scan->xs_cbuf = InvalidBuffer;
        }
 
+       scan->xs_next_hot = InvalidOffsetNumber;
+
        scan->kill_prior_tuple = false;         /* for safety */
 
        FunctionCall2(procedure,
@@ -370,6 +373,14 @@ index_markpos(IndexScanDesc scan)
  * NOTE: this only restores the internal scan state of the index AM.
  * The current result tuple (scan->xs_ctup) doesn't change.  See comments
  * for ExecRestrPos().
+ *
+ * NOTE: in the presence of HOT chains, mark/restore only works correctly
+ * if the scan's snapshot is MVCC-safe; that ensures that there's at most one
+ * returnable tuple in each HOT chain, and so restoring the prior state at the
+ * granularity of the index AM is sufficient.  Since the only current user
+ * of mark/restore functionality is nodeMergejoin.c, this effectively means
+ * that merge-join plans only work for MVCC snapshots.  This could be fixed
+ * if necessary, but for now it seems unimportant.
  * ----------------
  */
 void
@@ -377,9 +388,13 @@ index_restrpos(IndexScanDesc scan)
 {
        FmgrInfo   *procedure;
 
+       Assert(IsMVCCSnapshot(scan->xs_snapshot));
+
        SCAN_CHECKS;
        GET_SCAN_PROCEDURE(amrestrpos);
 
+       scan->xs_next_hot = InvalidOffsetNumber;
+
        scan->kill_prior_tuple = false;         /* for safety */
 
        FunctionCall1(procedure, PointerGetDatum(scan));
@@ -398,72 +413,224 @@ HeapTuple
 index_getnext(IndexScanDesc scan, ScanDirection direction)
 {
        HeapTuple       heapTuple = &scan->xs_ctup;
+       ItemPointer     tid = &heapTuple->t_self;
        FmgrInfo   *procedure;
 
        SCAN_CHECKS;
        GET_SCAN_PROCEDURE(amgettuple);
 
-       /* just make sure this is false... */
-       scan->kill_prior_tuple = false;
+       /*
+        * We always reset xs_hot_dead; if we are here then either we are just
+        * starting the scan, or we previously returned a visible tuple, and in
+        * either case it's inappropriate to kill the prior index entry.
+        */
+       scan->xs_hot_dead = false;
 
        for (;;)
        {
-               bool            found;
+               OffsetNumber offnum;
+               bool at_chain_start;
+               Page dp;
 
-               /*
-                * The AM's gettuple proc finds the next tuple matching the scan keys.
-                */
-               found = DatumGetBool(FunctionCall2(procedure,
-                                                                                  PointerGetDatum(scan),
-                                                                                  Int32GetDatum(direction)));
+               if (scan->xs_next_hot != InvalidOffsetNumber)
+               {
+                       /*
+                        * We are resuming scan of a HOT chain after having returned
+                        * an earlier member.  Must still hold pin on current heap page.
+                        */
+                       Assert(BufferIsValid(scan->xs_cbuf));
+                       Assert(ItemPointerGetBlockNumber(tid) ==
+                                  BufferGetBlockNumber(scan->xs_cbuf));
+                       Assert(TransactionIdIsValid(scan->xs_prev_xmax));
+                       offnum = scan->xs_next_hot;
+                       at_chain_start = false;
+                       scan->xs_next_hot = InvalidOffsetNumber;
+               }
+               else
+               {
+                       bool            found;
+                       Buffer          prev_buf;
+
+                       /*
+                        * If we scanned a whole HOT chain and found only dead tuples,
+                        * tell index AM to kill its entry for that TID.
+                        */
+                       scan->kill_prior_tuple = scan->xs_hot_dead;
+
+                       /*
+                        * The AM's gettuple proc finds the next index entry matching the
+                        * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid).
+                        */
+                       found = DatumGetBool(FunctionCall2(procedure,
+                                                                                          PointerGetDatum(scan),
+                                                                                          Int32GetDatum(direction)));
+
+                       /* Reset kill flag immediately for safety */
+                       scan->kill_prior_tuple = false;
+
+                       /* If we're out of index entries, break out of outer loop */
+                       if (!found)
+                               break;
+
+                       pgstat_count_index_tuples(scan->indexRelation, 1);
+
+                       /* Switch to correct buffer if we don't have it already */
+                       prev_buf = scan->xs_cbuf;
+                       scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
+                                                                                                scan->heapRelation,
+                                                                                        ItemPointerGetBlockNumber(tid));
+
+                       /*
+                        * Prune page, but only if we weren't already on this page
+                        */
+                       if (prev_buf != scan->xs_cbuf)
+                               heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
+                                                                       RecentGlobalXmin);
+
+                       /* Prepare to scan HOT chain starting at index-referenced offnum */
+                       offnum = ItemPointerGetOffsetNumber(tid);
+                       at_chain_start = true;
+
+                       /* We don't know what the first tuple's xmin should be */
+                       scan->xs_prev_xmax = InvalidTransactionId;
+
+                       /* Initialize flag to detect if all entries are dead */
+                       scan->xs_hot_dead = true;
+               }
+
+               /* Obtain share-lock on the buffer so we can examine visibility */
+               LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
 
-               /* Reset kill flag immediately for safety */
-               scan->kill_prior_tuple = false;
+               dp = (Page) BufferGetPage(scan->xs_cbuf);
 
-               if (!found)
+               /* Scan through possible multiple members of HOT-chain */
+               for (;;)
                {
-                       /* Release any held pin on a heap page */
-                       if (BufferIsValid(scan->xs_cbuf))
-                       {
-                               ReleaseBuffer(scan->xs_cbuf);
-                               scan->xs_cbuf = InvalidBuffer;
-                       }
-                       return NULL;            /* failure exit */
-               }
+                       ItemId lp;
+                       ItemPointer ctid;
 
-               pgstat_count_index_tuples(scan->indexRelation, 1);
+                       /* check for bogus TID */
+                       if (offnum < FirstOffsetNumber ||
+                               offnum > PageGetMaxOffsetNumber(dp))
+                               break;
 
-               /*
-                * Fetch the heap tuple and see if it matches the snapshot.
-                */
-               if (heap_release_fetch(scan->heapRelation, scan->xs_snapshot,
-                                                          heapTuple, &scan->xs_cbuf, true,
-                                                          scan->indexRelation))
-                       break;
+                       lp = PageGetItemId(dp, offnum);
 
-               /* Skip if no undeleted tuple at this location */
-               if (heapTuple->t_data == NULL)
-                       continue;
+                       /* check for unused, dead, or redirected items */
+                       if (!ItemIdIsNormal(lp))
+                       {
+                               /* We should only see a redirect at start of chain */
+                               if (ItemIdIsRedirected(lp) && at_chain_start)
+                               {
+                                       /* Follow the redirect */
+                                       offnum = ItemIdGetRedirect(lp);
+                                       at_chain_start = false;
+                                       continue;
+                               }
+                               /* else must be end of chain */
+                               break;
+                       }
 
-               /*
-                * If we can't see it, maybe no one else can either.  Check to see if
-                * the tuple is dead to all transactions.  If so, signal the index AM
-                * to not return it on future indexscans.
-                *
-                * We told heap_release_fetch to keep a pin on the buffer, so we can
-                * re-access the tuple here.  But we must re-lock the buffer first.
-                */
-               LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
+                       /*
+                        * We must initialize all of *heapTuple (ie, scan->xs_ctup)
+                        * since it is returned to the executor on success.
+                        */
+                       heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
+                       heapTuple->t_len = ItemIdGetLength(lp);
+                       ItemPointerSetOffsetNumber(tid, offnum);
+                       heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation);
+                       ctid = &heapTuple->t_data->t_ctid;
+
+                       /*
+                        * Shouldn't see a HEAP_ONLY tuple at chain start.  (This test
+                        * should be unnecessary, since the chain root can't be removed
+                        * while we have pin on the index entry, but let's make it anyway.)
+                        */
+                       if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
+                               break;
+
+                       /*
+                        * The xmin should match the previous xmax value, else chain is
+                        * broken.  (Note: this test is not optional because it protects
+                        * us against the case where the prior chain member's xmax
+                        * aborted since we looked at it.)
+                        */
+                       if (TransactionIdIsValid(scan->xs_prev_xmax) &&
+                               !TransactionIdEquals(scan->xs_prev_xmax,
+                                                                HeapTupleHeaderGetXmin(heapTuple->t_data)))
+                               break;
+
+                       /* If it's visible per the snapshot, we must return it */
+                       if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
+                                                                                        scan->xs_cbuf))
+                       {
+                               /*
+                                * If the snapshot is MVCC, we know that it could accept
+                                * at most one member of the HOT chain, so we can skip
+                                * examining any more members.  Otherwise, check for
+                                * continuation of the HOT-chain, and set state for next time.
+                                */
+                               if (IsMVCCSnapshot(scan->xs_snapshot))
+                                       scan->xs_next_hot = InvalidOffsetNumber;
+                               else if (HeapTupleIsHotUpdated(heapTuple))
+                               {
+                                       Assert(ItemPointerGetBlockNumber(ctid) ==
+                                                  ItemPointerGetBlockNumber(tid));
+                                       scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid);
+                                       scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
+                               }
+                               else
+                                       scan->xs_next_hot = InvalidOffsetNumber;
+
+                               LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
+
+                               pgstat_count_heap_fetch(scan->indexRelation);
+
+                               return heapTuple;
+                       }
 
-               if (HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
-                                                                        scan->xs_cbuf) == HEAPTUPLE_DEAD)
-                       scan->kill_prior_tuple = true;
+                       /*
+                        * If we can't see it, maybe no one else can either.  Check to see
+                        * if the tuple is dead to all transactions.  If we find that all
+                        * the tuples in the HOT chain are dead, we'll signal the index AM
+                        * to not return that TID on future indexscans.
+                        */
+                       if (scan->xs_hot_dead &&
+                               HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
+                                                                                scan->xs_cbuf) != HEAPTUPLE_DEAD)
+                               scan->xs_hot_dead = false;
+
+                       /*
+                        * Check to see if HOT chain continues past this tuple; if so
+                        * fetch the next offnum (we don't bother storing it into
+                        * xs_next_hot, but must store xs_prev_xmax), and loop around.
+                        */
+                       if (HeapTupleIsHotUpdated(heapTuple))
+                       {
+                               Assert(ItemPointerGetBlockNumber(ctid) ==
+                                          ItemPointerGetBlockNumber(tid));
+                               offnum = ItemPointerGetOffsetNumber(ctid);
+                               at_chain_start = false;
+                               scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
+                       }
+                       else
+                               break;                  /* end of chain */
+               } /* loop over a single HOT chain */
 
                LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
+
+               /* Loop around to ask index AM for another TID */
+               scan->xs_next_hot = InvalidOffsetNumber;
+       }
+
+       /* Release any held pin on a heap page */
+       if (BufferIsValid(scan->xs_cbuf))
+       {
+               ReleaseBuffer(scan->xs_cbuf);
+               scan->xs_cbuf = InvalidBuffer;
        }
 
-       /* Success exit */
-       return heapTuple;
+       return NULL;                            /* failure exit */
 }
 
 /* ----------------
index 8f8f7b63b4926841558931bd72da388b71903b3d..8c29331d824d01b684468d638c979832e6e8bd29 100644 (file)
@@ -193,8 +193,6 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
         */
        for (;;)
        {
-               HeapTupleData htup;
-               Buffer          hbuffer;
                ItemId          curitemid;
                IndexTuple      curitup;
                BlockNumber nblkno;
@@ -223,6 +221,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
                         */
                        if (!ItemIdIsDead(curitemid))
                        {
+                               ItemPointerData htid;
+                               bool all_dead;
+
                                /*
                                 * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
                                 * how we handling NULLs - and so we must not use _bt_compare
@@ -234,17 +235,20 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
 
                                /* okay, we gotta fetch the heap tuple ... */
                                curitup = (IndexTuple) PageGetItem(page, curitemid);
-                               htup.t_self = curitup->t_tid;
-                               if (heap_fetch(heapRel, &SnapshotDirty, &htup, &hbuffer,
-                                                          true, NULL))
+                               htid = curitup->t_tid;
+
+                               /*
+                                * We check the whole HOT-chain to see if there is any tuple
+                                * that satisfies SnapshotDirty.  This is necessary because
+                                * we have just a single index entry for the entire chain.
+                                */
+                               if (heap_hot_search(&htid, heapRel, &SnapshotDirty, &all_dead))
                                {
                                        /* it is a duplicate */
                                        TransactionId xwait =
                                        (TransactionIdIsValid(SnapshotDirty.xmin)) ?
                                        SnapshotDirty.xmin : SnapshotDirty.xmax;
 
-                                       ReleaseBuffer(hbuffer);
-
                                        /*
                                         * If this tuple is being updated by other transaction
                                         * then we have to wait for its commit/abort.
@@ -263,15 +267,22 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
                                         * is itself now committed dead --- if so, don't complain.
                                         * This is a waste of time in normal scenarios but we must
                                         * do it to support CREATE INDEX CONCURRENTLY.
+                                        *
+                                        * We must follow HOT-chains here because during a
+                                        * concurrent index build, we insert the root TID even
+                                        * though the actual tuple may be somewhere in the
+                                        * HOT-chain.  While following the chain we might not stop
+                                        * at the exact tuple that triggered the insert, but that's
+                                        * OK because if we find a live tuple anywhere in this
+                                        * chain, we have a unique-key conflict.  The other live
+                                        * tuple is not part of this chain because it had a
+                                        * different index entry.
                                         */
-                                       htup.t_self = itup->t_tid;
-                                       if (heap_fetch(heapRel, SnapshotSelf, &htup, &hbuffer,
-                                                                  false, NULL))
+                                       htid = itup->t_tid;
+                                       if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
                                        {
                                                /* Normal case --- it's still live */
-                                               ReleaseBuffer(hbuffer);
                                        }
-                                       else if (htup.t_data != NULL)
+                                       else
                                        {
                                                /*
                                                 * It's been deleted, so no error, and no need to
@@ -279,39 +290,27 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
                                                 */
                                                break;
                                        }
-                                       else
-                                       {
-                                               /* couldn't find the tuple?? */
-                                               elog(ERROR, "failed to fetch tuple being inserted");
-                                       }
 
                                        ereport(ERROR,
                                                        (errcode(ERRCODE_UNIQUE_VIOLATION),
                                        errmsg("duplicate key value violates unique constraint \"%s\"",
                                                   RelationGetRelationName(rel))));
                                }
-                               else if (htup.t_data != NULL)
+                               else if (all_dead)
                                {
                                        /*
-                                        * Hmm, if we can't see the tuple, maybe it can be marked
-                                        * killed.      This logic should match index_getnext and
-                                        * btgettuple.
+                                        * The conflicting tuple (or whole HOT chain) is dead to
+                                        * everyone, so we may as well mark the index entry
+                                        * killed.
                                         */
-                                       LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
-                                       if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
-                                                                                                hbuffer) == HEAPTUPLE_DEAD)
-                                       {
-                                               ItemIdMarkDead(curitemid);
-                                               opaque->btpo_flags |= BTP_HAS_GARBAGE;
-                                               /* be sure to mark the proper buffer dirty... */
-                                               if (nbuf != InvalidBuffer)
-                                                       SetBufferCommitInfoNeedsSave(nbuf);
-                                               else
-                                                       SetBufferCommitInfoNeedsSave(buf);
-                                       }
-                                       LockBuffer(hbuffer, BUFFER_LOCK_UNLOCK);
+                                       ItemIdMarkDead(curitemid);
+                                       opaque->btpo_flags |= BTP_HAS_GARBAGE;
+                                       /* be sure to mark the proper buffer dirty... */
+                                       if (nbuf != InvalidBuffer)
+                                               SetBufferCommitInfoNeedsSave(nbuf);
+                                       else
+                                               SetBufferCommitInfoNeedsSave(buf);
                                }
-                               ReleaseBuffer(hbuffer);
                        }
                }
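
heap_hot_search() is added elsewhere in this patch (heapam.c, not shown in this excerpt).  Inferred from the two call sites above, its contract is roughly the following; treat the comment as a reading of the call sites rather than the patch's own documentation:

    /*
     * Search the HOT chain starting at *tid (normally a chain root) for a
     * member satisfying the given snapshot; returns true if one is found.
     * If all_dead is not NULL, *all_dead is set true when every member of
     * the chain is dead to all transactions, letting the caller mark the
     * corresponding index entry killed.
     */
    extern bool heap_hot_search(ItemPointer tid, Relation relation,
                                Snapshot snapshot, bool *all_dead);
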
 
@@ -840,7 +839,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
                itemsz = ItemIdGetLength(itemid);
                item = (IndexTuple) PageGetItem(origpage, itemid);
                if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
-                                               false) == InvalidOffsetNumber)
+                                               false, false) == InvalidOffsetNumber)
                        elog(PANIC, "failed to add hikey to the right sibling");
                rightoff = OffsetNumberNext(rightoff);
        }
@@ -865,7 +864,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
                item = (IndexTuple) PageGetItem(origpage, itemid);
        }
        if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
-                                       false) == InvalidOffsetNumber)
+                                       false, false) == InvalidOffsetNumber)
                elog(PANIC, "failed to add hikey to the left sibling");
        leftoff = OffsetNumberNext(leftoff);
 
@@ -1700,7 +1699,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
         * benefit of _bt_restore_page().
         */
        if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY,
-                                       false) == InvalidOffsetNumber)
+                                       false, false) == InvalidOffsetNumber)
                elog(PANIC, "failed to add leftkey to new root page");
        pfree(new_item);
 
@@ -1718,7 +1717,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
         * insert the right page pointer into the new root page.
         */
        if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY,
-                                       false) == InvalidOffsetNumber)
+                                       false, false) == InvalidOffsetNumber)
                elog(PANIC, "failed to add rightkey to new root page");
        pfree(new_item);
 
@@ -1805,7 +1804,7 @@ _bt_pgaddtup(Relation rel,
        }
 
        if (PageAddItem(page, (Item) itup, itemsize, itup_off,
-                                       false) == InvalidOffsetNumber)
+                                       false, false) == InvalidOffsetNumber)
                elog(PANIC, "failed to add item to the %s for \"%s\"",
                         where, RelationGetRelationName(rel));
 }
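
The mechanical edits above reflect a widened PageAddItem() signature in storage/bufpage.h (the header change is outside this excerpt).  From these call sites, and from raw_heap_insert() passing true earlier in the diff, the extra boolean presumably distinguishes heap pages, which need HOT-aware item placement, from index pages.  A sketch of the assumed prototype:

    /* assumed new prototype; index AMs pass false for the final argument */
    extern OffsetNumber PageAddItem(Page page, Item item, Size size,
                                    OffsetNumber offsetNumber,
                                    bool overwrite, bool is_heap);
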
index bacf52d48ba8accd6787397160c162e9328576eb..c09897366d1e6982451c71d9d2cc1751f2a74b06 100644 (file)
@@ -400,7 +400,7 @@ _bt_sortaddtup(Page page,
        }
 
        if (PageAddItem(page, (Item) itup, itemsize, itup_off,
-                                       false) == InvalidOffsetNumber)
+                                       false, false) == InvalidOffsetNumber)
                elog(ERROR, "failed to add item to the index page");
 }
 
index ab0c37caa66405f12ecc8ac35d21f7ea113d06c8..5c1f9fd5fa96b303ece6f10b270e715924c0d4c4 100644 (file)
@@ -141,8 +141,8 @@ _bt_restore_page(Page page, char *from, int len)
                memcpy(&itupdata, from, sizeof(IndexTupleData));
                itemsz = IndexTupleDSize(itupdata);
                itemsz = MAXALIGN(itemsz);
-               if (PageAddItem(page, (Item) from, itemsz,
-                                               FirstOffsetNumber, false) == InvalidOffsetNumber)
+               if (PageAddItem(page, (Item) from, itemsz, FirstOffsetNumber,
+                                               false, false) == InvalidOffsetNumber)
                        elog(PANIC, "_bt_restore_page: cannot add item to page");
                from += itemsz;
        }
@@ -238,7 +238,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
                        {
                                if (PageAddItem(page, (Item) datapos, datalen,
                                                        ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
-                                                               false) == InvalidOffsetNumber)
+                                                               false, false) == InvalidOffsetNumber)
                                        elog(PANIC, "btree_insert_redo: failed to add item");
 
                                PageSetLSN(page, lsn);
@@ -389,7 +389,7 @@ btree_xlog_split(bool onleft, bool isroot,
                                if (onleft)
                                {
                                        if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
-                                                                       false) == InvalidOffsetNumber)
+                                                                       false, false) == InvalidOffsetNumber)
                                                elog(PANIC, "failed to add new item to left page after split");
                                }
 
@@ -398,7 +398,7 @@ btree_xlog_split(bool onleft, bool isroot,
                                hiItem = PageGetItem(rpage, hiItemId);
 
                                if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
-                                                               P_HIKEY, false) == InvalidOffsetNumber)
+                                                               P_HIKEY, false, false) == InvalidOffsetNumber)
                                        elog(PANIC, "failed to add high key to left page after split");
 
                                /* Fix opaque fields */
index b2db23bd7442a7bd4f2ea7b26bda8efb8dc36b83..258c914ae5b3d7a604e06e55ee2ae4706035608d 100644 (file)
@@ -410,6 +410,9 @@ UpdateIndexRelation(Oid indexoid,
        values[Anum_pg_index_indisprimary - 1] = BoolGetDatum(primary);
        values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false);
        values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid);
+       values[Anum_pg_index_indcheckxmin - 1] = BoolGetDatum(false);
+       /* we set isvalid and isready the same way */
+       values[Anum_pg_index_indisready - 1] = BoolGetDatum(isvalid);
        values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey);
        values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass);
        values[Anum_pg_index_indoption - 1] = PointerGetDatum(indoption);
@@ -944,7 +947,11 @@ BuildIndexInfo(Relation index)
 
        /* other info */
        ii->ii_Unique = indexStruct->indisunique;
-       ii->ii_Concurrent = false;      /* assume normal case */
+       ii->ii_ReadyForInserts = indexStruct->indisready;
+
+       /* initialize index-build state to default */
+       ii->ii_Concurrent = false;
+       ii->ii_BrokenHotChain = false;
 
        return ii;
 }
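
BuildIndexInfo() now copies indisready and initializes two HOT-related build flags, so IndexInfo in nodes/execnodes.h evidently gains matching fields (not shown here); presumably something like:

    /* assumed additions to struct IndexInfo */
    bool        ii_ReadyForInserts;  /* may new tuples be inserted into the index? */
    bool        ii_BrokenHotChain;   /* did the build see potentially broken HOT chains? */
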
@@ -1308,6 +1315,35 @@ index_build(Relation heapRelation,
                                                                                 PointerGetDatum(indexInfo)));
        Assert(PointerIsValid(stats));
 
+       /*
+        * If we found any potentially broken HOT chains, mark the index as
+        * not being usable until the current transaction is below the event
+        * horizon.  See src/backend/access/heap/README.HOT for discussion.
+        */
+       if (indexInfo->ii_BrokenHotChain)
+       {
+               Oid indexId = RelationGetRelid(indexRelation);
+               Relation pg_index;
+               HeapTuple indexTuple;
+               Form_pg_index indexForm;
+
+               pg_index = heap_open(IndexRelationId, RowExclusiveLock);
+
+               indexTuple = SearchSysCacheCopy(INDEXRELID,
+                                                                               ObjectIdGetDatum(indexId),
+                                                                               0, 0, 0);
+               if (!HeapTupleIsValid(indexTuple))
+                       elog(ERROR, "cache lookup failed for index %u", indexId);
+               indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
+
+               indexForm->indcheckxmin = true;
+               simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
+               CatalogUpdateIndexes(pg_index, indexTuple);
+
+               heap_freetuple(indexTuple);
+               heap_close(pg_index, RowExclusiveLock);
+       }
+
        /*
         * Update heap and index pg_class rows
         */
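
Setting indcheckxmin keeps the index out of use until no running transaction could still depend on the broken HOT chains.  The consumer-side test lives in code outside this excerpt; as an illustration only, it would amount to comparing the pg_index row's xmin against the current xmin horizon:

    /* illustrative check only, using the pg_index tuple for this index */
    if (indexForm->indcheckxmin &&
        !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexTuple->t_data),
                               TransactionXmin))
        continue;    /* index is too new for this transaction; ignore it */
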
@@ -1346,6 +1382,11 @@ index_build(Relation heapRelation,
  * must keep track of the number of index tuples; we don't do so here because
  * the AM might reject some of the tuples for its own reasons, such as being
  * unable to store NULLs.
+ *
+ * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect
+ * any potentially broken HOT chains.  Currently, we set this if there are
+ * any RECENTLY_DEAD entries in a HOT chain, without trying very hard to
+ * detect whether they're really incompatible with the chain tip.
  */
 double
 IndexBuildHeapScan(Relation heapRelation,
@@ -1365,6 +1406,8 @@ IndexBuildHeapScan(Relation heapRelation,
        ExprContext *econtext;
        Snapshot        snapshot;
        TransactionId OldestXmin;
+       BlockNumber root_blkno = InvalidBlockNumber;
+       OffsetNumber root_offsets[MaxHeapTuplesPerPage];
 
        /*
         * sanity checks
@@ -1427,15 +1470,47 @@ IndexBuildHeapScan(Relation heapRelation,
 
                CHECK_FOR_INTERRUPTS();
 
+               /*
+                * When dealing with a HOT-chain of updated tuples, we want to
+                * index the values of the live tuple (if any), but index it
+                * under the TID of the chain's root tuple.  This approach is
+                * necessary to preserve the HOT-chain structure in the heap.
+                * So we need to be able to find the root item offset for every
+                * tuple that's in a HOT-chain.  When first reaching a new page
+                * of the relation, call heap_get_root_tuples() to build a map
+                * of root item offsets on the page.
+                *
+                * It might look unsafe to use this information across buffer
+                * lock/unlock.  However, we hold ShareLock on the table so no
+                * ordinary insert/update/delete should occur; and we hold pin on
+                * the buffer continuously while visiting the page, so no pruning
+                * operation can occur either.
+                *
+                * Note the implied assumption that there is no more than one live
+                * tuple per HOT-chain ...
+                */
+               if (scan->rs_cblock != root_blkno)
+               {
+                       Page page = BufferGetPage(scan->rs_cbuf);
+
+                       LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
+                       heap_get_root_tuples(page, root_offsets);
+                       LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+                       root_blkno = scan->rs_cblock;
+               }
+
                if (snapshot == SnapshotAny)
                {
                        /* do our own time qual check */
                        bool            indexIt;
 
+               recheck:
                        /*
                         * We could possibly get away with not locking the buffer here,
                         * since caller should hold ShareLock on the relation, but let's
-                        * be conservative about it.
+                        * be conservative about it.  (This remark is still correct
+                        * even with HOT-pruning: our pin on the buffer prevents pruning.)
                         */
                        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 
@@ -1458,10 +1533,29 @@ IndexBuildHeapScan(Relation heapRelation,
                                         * If tuple is recently deleted then we must index it
                                         * anyway to preserve MVCC semantics.  (Pre-existing
                                         * transactions could try to use the index after we finish
-                                        * building it, and may need to see such tuples.) Exclude
-                                        * it from unique-checking, however.
+                                        * building it, and may need to see such tuples.)
+                                        *
+                                        * However, if it was HOT-updated then we must only index
+                                        * the live tuple at the end of the HOT-chain.  Since this
+                                        * breaks semantics for pre-existing snapshots, mark
+                                        * the index as unusable for them.
+                                        *
+                                        * If we've already decided that the index will be unsafe
+                                        * for old snapshots, we may as well stop indexing
+                                        * recently-dead tuples, since there's no longer any
+                                        * point.
                                         */
-                                       indexIt = true;
+                                       if (HeapTupleIsHotUpdated(heapTuple))
+                                       {
+                                               indexIt = false;
+                                               /* mark the index as unsafe for old snapshots */
+                                               indexInfo->ii_BrokenHotChain = true;
+                                       }
+                                       else if (indexInfo->ii_BrokenHotChain)
+                                               indexIt = false;
+                                       else
+                                               indexIt = true;
+                                       /* In any case, exclude the tuple from unique-checking */
                                        tupleIsAlive = false;
                                        break;
                                case HEAPTUPLE_INSERT_IN_PROGRESS:
@@ -1473,12 +1567,31 @@ IndexBuildHeapScan(Relation heapRelation,
                                         * followed by CREATE INDEX within a transaction.)      An
                                         * exception occurs when reindexing a system catalog,
                                         * because we often release lock on system catalogs before
-                                        * committing.
+                                        * committing.  In that case we wait for the inserting
+                                        * transaction to finish and check again.  (We could do
+                                        * that on user tables too, but since the case is not
+                                        * expected it seems better to throw an error.)
                                         */
                                        if (!TransactionIdIsCurrentTransactionId(
-                                                                  HeapTupleHeaderGetXmin(heapTuple->t_data))
-                                               && !IsSystemRelation(heapRelation))
-                                               elog(ERROR, "concurrent insert in progress");
+                                                                  HeapTupleHeaderGetXmin(heapTuple->t_data)))
+                                       {
+                                               if (!IsSystemRelation(heapRelation))
+                                                       elog(ERROR, "concurrent insert in progress");
+                                               else
+                                               {
+                                                       /*
+                                                        * Must drop the lock on the buffer before we wait
+                                                        */
+                                                       TransactionId xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
+                                                       LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+                                                       XactLockTableWait(xwait);
+                                                       goto recheck;
+                                               }
+                                       }
+                                       /*
+                                        * We must index such tuples, since if the index build
+                                        * commits then they're good.
+                                        */
                                        indexIt = true;
                                        tupleIsAlive = true;
                                        break;
@@ -1491,19 +1604,48 @@ IndexBuildHeapScan(Relation heapRelation,
                                         * followed by CREATE INDEX within a transaction.)      An
                                         * exception occurs when reindexing a system catalog,
                                         * because we often release lock on system catalogs before
-                                        * committing.
+                                        * committing.  In that case we wait for the deleting
+                                        * transaction to finish and check again.  (We could do
+                                        * that on user tables too, but since the case is not
+                                        * expected it seems better to throw an error.)
                                         */
                                        Assert(!(heapTuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
                                        if (!TransactionIdIsCurrentTransactionId(
-                                                                  HeapTupleHeaderGetXmax(heapTuple->t_data))
-                                               && !IsSystemRelation(heapRelation))
-                                               elog(ERROR, "concurrent delete in progress");
-                                       indexIt = true;
+                                                                  HeapTupleHeaderGetXmax(heapTuple->t_data)))
+                                       {
+                                               if (!IsSystemRelation(heapRelation))
+                                                       elog(ERROR, "concurrent delete in progress");
+                                               else
+                                               {
+                                                       /*
+                                                        * Must drop the lock on the buffer before we wait
+                                                        */
+                                                       TransactionId xwait = HeapTupleHeaderGetXmax(heapTuple->t_data);
+                                                       LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+                                                       XactLockTableWait(xwait);
+                                                       goto recheck;
+                                               }
+                                       }
+                                       /*
+                                        * Otherwise, we have to treat these tuples just like
+                                        * RECENTLY_DEAD ones.
+                                        */
+                                       if (HeapTupleIsHotUpdated(heapTuple))
+                                       {
+                                               indexIt = false;
+                                               /* mark the index as unsafe for old snapshots */
+                                               indexInfo->ii_BrokenHotChain = true;
+                                       }
+                                       else if (indexInfo->ii_BrokenHotChain)
+                                               indexIt = false;
+                                       else
+                                               indexIt = true;
+                                       /* In any case, exclude the tuple from unique-checking */
                                        tupleIsAlive = false;
                                        break;
                                default:
                                        elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
-                                       indexIt = tupleIsAlive = false;         /* keep compiler quiet */
+                                       indexIt = tupleIsAlive = false; /* keep compiler quiet */
                                        break;
                        }
 
@@ -1552,9 +1694,33 @@ IndexBuildHeapScan(Relation heapRelation,
                 * pass the values[] and isnull[] arrays, instead.
                 */
 
-               /* Call the AM's callback routine to process the tuple */
-               callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
-                                callback_state);
+               if (HeapTupleIsHeapOnly(heapTuple))
+               {
+                       /*
+                        * For a heap-only tuple, pretend its TID is that of the root.
+                        * See src/backend/access/heap/README.HOT for discussion.
+                        */
+                       HeapTupleData   rootTuple;
+                       OffsetNumber    offnum;
+
+                       rootTuple = *heapTuple;
+                       offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
+
+                       Assert(OffsetNumberIsValid(root_offsets[offnum - 1]));
+
+                       ItemPointerSetOffsetNumber(&rootTuple.t_self,
+                                                                          root_offsets[offnum - 1]);
+
+                       /* Call the AM's callback routine to process the tuple */
+                       callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive,
+                                        callback_state);
+               }
+               else
+               {
+                       /* Call the AM's callback routine to process the tuple */
+                       callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
+                                        callback_state);
+               }
        }
 
        heap_endscan(scan);
@@ -1574,8 +1740,15 @@ IndexBuildHeapScan(Relation heapRelation,
 /*
  * validate_index - support code for concurrent index builds
  *
- * We do a concurrent index build by first building the index normally via
- * index_create(), while holding a weak lock that allows concurrent
+ * We do a concurrent index build by first inserting the catalog entry for the
+ * index via index_create(), marking it not indisready and not indisvalid.
+ * Then we commit our transaction and start a new one, then we wait for all
+ * transactions that could have been modifying the table to terminate.  Now
+ * we know that any subsequently-started transactions will see the index and
+ * honor its constraints on HOT updates; so while existing HOT-chains might
+ * be broken with respect to the index, no currently live tuple will have an
+ * incompatible HOT update done to it.  We now build the index normally via
+ * index_build(), while holding a weak lock that allows concurrent
  * insert/update/delete.  Also, we index only tuples that are valid
  * as of the start of the scan (see IndexBuildHeapScan), whereas a normal
  * build takes care to include recently-dead tuples.  This is OK because
@@ -1586,11 +1759,10 @@ IndexBuildHeapScan(Relation heapRelation,
  * if we used HeapTupleSatisfiesVacuum).  This leaves us with an index that
  * does not contain any tuples added to the table while we built the index.
  *
- * Next, we commit the transaction so that the index becomes visible to other
- * backends, but it is marked not "indisvalid" to prevent the planner from
- * relying on it for indexscans.  Then we wait for all transactions that
- * could have been modifying the table to terminate.  At this point we
- * know that any subsequently-started transactions will see the index and
+ * Next, we mark the index "indisready" (but still not "indisvalid") and
+ * commit the second transaction and start a third.  Again we wait for all
+ * transactions that could have been modifying the table to terminate.  Now
+ * we know that any subsequently-started transactions will see the index and
  * insert their new tuples into it.  We then take a new reference snapshot
  * which is passed to validate_index().  Any tuples that are valid according
  * to this snap, but are not in the index, must be added to the index.
@@ -1610,7 +1782,7 @@ IndexBuildHeapScan(Relation heapRelation,
  * Building a unique index this way is tricky: we might try to insert a
  * tuple that is already dead or is in process of being deleted, and we
  * mustn't have a uniqueness failure against an updated version of the same
- * row.  We can check the tuple to see if it's already dead and tell
+ * row.  We could try to check the tuple to see if it's already dead and tell
  * index_insert() not to do the uniqueness check, but that still leaves us
  * with a race condition against an in-progress update.  To handle that,
  * we expect the index AM to recheck liveness of the to-be-inserted tuple
@@ -1620,7 +1792,8 @@ IndexBuildHeapScan(Relation heapRelation,
  * were alive at the time of the reference snapshot are gone; this is
  * necessary to be sure there are none left with a serializable snapshot
  * older than the reference (and hence possibly able to see tuples we did
- * not index). Then we mark the index valid and commit.
+ * not index). Then we mark the index "indisvalid" and commit.  Subsequent
+ * transactions will be able to use it for queries.
  *
  * Doing two full table scans is a brute-force strategy.  We could try to be
  * cleverer, eg storing new tuples in a special area of the table (perhaps
@@ -1727,6 +1900,9 @@ validate_index_heapscan(Relation heapRelation,
        TupleTableSlot *slot;
        EState     *estate;
        ExprContext *econtext;
+       BlockNumber root_blkno = InvalidBlockNumber;
+       OffsetNumber    root_offsets[MaxHeapTuplesPerPage];
+       bool                    in_index[MaxHeapTuplesPerPage];
 
        /* state variables for the merge */
        ItemPointer indexcursor = NULL;
@@ -1768,39 +1944,86 @@ validate_index_heapscan(Relation heapRelation,
        while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
        {
                ItemPointer heapcursor = &heapTuple->t_self;
+               ItemPointerData rootTuple;
+               OffsetNumber    root_offnum;
 
                CHECK_FOR_INTERRUPTS();
 
                state->htups += 1;
 
+               /*
+                * As commented in IndexBuildHeapScan, we should index heap-only tuples
+                * under the TIDs of their root tuples; so when we advance onto a new
+                * heap page, build a map of root item offsets on the page.
+                *
+                * This complicates merging against the tuplesort output: we will
+                * visit the live tuples in order by their offsets, but the root
+                * offsets that we need to compare against the index contents might
+                * be ordered differently.  So we might have to "look back" within
+                * the tuplesort output, but only within the current page.  We handle
+                * that by keeping a bool array in_index[] showing all the
+                * already-passed-over tuplesort output TIDs of the current page.
+                * We clear that array here, when advancing onto a new heap page.
+                */
+               if (scan->rs_cblock != root_blkno)
+               {
+                       Page page = BufferGetPage(scan->rs_cbuf);
+
+                       LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
+                       heap_get_root_tuples(page, root_offsets);
+                       LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+                       memset(in_index, 0, sizeof(in_index));
+
+                       root_blkno = scan->rs_cblock;
+               }
+
+               /* Convert actual tuple TID to root TID */
+               rootTuple = *heapcursor;
+               root_offnum = ItemPointerGetOffsetNumber(heapcursor);
+
+               if (HeapTupleIsHeapOnly(heapTuple))
+               {
+                       root_offnum = root_offsets[root_offnum - 1];
+                       Assert(OffsetNumberIsValid(root_offnum));
+                       ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
+               }
+
                /*
                 * "merge" by skipping through the index tuples until we find or pass
-                * the current heap tuple.
+                * the current root tuple.
                 */
                while (!tuplesort_empty &&
                           (!indexcursor ||
-                               ItemPointerCompare(indexcursor, heapcursor) < 0))
+                               ItemPointerCompare(indexcursor, &rootTuple) < 0))
                {
                        Datum           ts_val;
                        bool            ts_isnull;
 
                        if (indexcursor)
+                       {
+                               /*
+                                * Remember index items seen earlier on the current heap page
+                                */
+                               if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
+                                       in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
                                pfree(indexcursor);
+                       }
+
                        tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
                                                                                                  &ts_val, &ts_isnull);
                        Assert(tuplesort_empty || !ts_isnull);
                        indexcursor = (ItemPointer) DatumGetPointer(ts_val);
                }
 
-               if (tuplesort_empty ||
-                       ItemPointerCompare(indexcursor, heapcursor) > 0)
+               /*
+                * If the tuplesort has overshot *and* we didn't see a match earlier,
+                * then this tuple is missing from the index, so insert it.
+                */
+               if ((tuplesort_empty ||
+                        ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
+                       !in_index[root_offnum - 1])
                {
-                       /*
-                        * We've overshot which means this heap tuple is missing from the
-                        * index, so insert it.
-                        */
-                       bool            check_unique;
-
                        MemoryContextReset(econtext->ecxt_per_tuple_memory);
 
                        /* Set up for predicate or expression evaluation */
@@ -1827,40 +2050,30 @@ validate_index_heapscan(Relation heapRelation,
                                                   values,
                                                   isnull);
 
-                       /*
-                        * If the tuple is already committed dead, we still have to put it
-                        * in the index (because some xacts might be able to see it), but
-                        * we might as well suppress uniqueness checking. This is just an
-                        * optimization because the index AM is not supposed to raise a
-                        * uniqueness failure anyway.
-                        */
-                       if (indexInfo->ii_Unique)
-                       {
-                               /* must lock buffer to call HeapTupleSatisfiesVisibility */
-                               LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
-
-                               if (HeapTupleSatisfiesVisibility(heapTuple, SnapshotNow,
-                                                                                                scan->rs_cbuf))
-                                       check_unique = true;
-                               else
-                                       check_unique = false;
-
-                               LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
-                       }
-                       else
-                               check_unique = false;
-
                        /*
                         * You'd think we should go ahead and build the index tuple here,
                         * but some index AMs want to do further processing on the data
                         * first. So pass the values[] and isnull[] arrays, instead.
                         */
+
+                       /*
+                        * If the tuple is already committed dead, you might think we
+                        * could suppress uniqueness checking, but this is no longer
+                        * true in the presence of HOT, because the insert is actually
+                        * a proxy for a uniqueness check on the whole HOT-chain.  That
+                        * is, the tuple we have here could be dead because it was already
+                        * HOT-updated, and if so the updating transaction will not have
+                        * thought it should insert index entries.  The index AM will
+                        * check the whole HOT-chain and correctly detect a conflict
+                        * if there is one.
+                        */
+
                        index_insert(indexRelation,
                                                 values,
                                                 isnull,
-                                                heapcursor,
+                                                &rootTuple,
                                                 heapRelation,
-                                                check_unique);
+                                                indexInfo->ii_Unique);
 
                        state->tups_inserted += 1;
                }
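
The merge loop above has to cope with heap-only tuples whose index entries live under the TID of their HOT-chain root, which can be an earlier offset on the same page, so the comparison order is not strictly ascending. A minimal standalone sketch of that look-back merge for a single page, using plain integer offsets (heap[], root_of[], indexed[] and find_missing() are invented for illustration; root_of[] plays the role of the heap_get_root_tuples() output):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ITEMS 16

/*
 * Simplified model of the merge in validate_index_heapscan(), restricted to
 * one heap page.  heap[] lists live tuple offsets in physical order,
 * root_of[] maps each offset to its HOT-chain root (identity for non-HOT
 * tuples), and indexed[] lists the offsets already present in the index, in
 * sorted order.  Because a heap-only tuple maps to an earlier root offset,
 * the merge may need to "look back"; the in_index[] array makes that possible.
 */
static void
find_missing(const int *heap, const int *root_of, int nheap,
             const int *indexed, int nindexed)
{
    bool    in_index[MAX_ITEMS + 1] = {false};
    int     icur = 0;           /* cursor into indexed[] */

    for (int i = 0; i < nheap; i++)
    {
        int     root = root_of[heap[i]];

        /* advance the index cursor up to or past the root offset */
        while (icur < nindexed && indexed[icur] < root)
        {
            in_index[indexed[icur]] = true;     /* remember what we passed */
            icur++;
        }

        if (icur < nindexed && indexed[icur] == root)
            continue;                           /* found in the index */
        if (in_index[root])
            continue;                           /* seen earlier (look-back) */

        printf("offset %d (root %d) is missing from the index\n",
               heap[i], root);
    }
}

int
main(void)
{
    /*
     * Offset 3 is a heap-only tuple whose chain root is offset 1; the index
     * already has entries for roots 1 and 2, but not for offset 4.
     */
    int heap[]    = {2, 3, 4};
    int root_of[] = {0, 1, 2, 1, 4};
    int indexed[] = {1, 2};

    find_missing(heap, root_of, 3, indexed, 2);
    return 0;
}

In the real code the look-back array is reset whenever the scan advances to a new heap page, since root offsets can only point within the same page.
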
@@ -1983,9 +2196,9 @@ reindex_index(Oid indexId)
        ResetReindexProcessing();
 
        /*
-        * If the index is marked invalid (ie, it's from a failed CREATE INDEX
-        * CONCURRENTLY), we can now mark it valid.  This allows REINDEX to be
-        * used to clean up in such cases.
+        * If the index is marked invalid or not ready (ie, it's from a failed
+        * CREATE INDEX CONCURRENTLY), we can now mark it valid.  This allows
+        * REINDEX to be used to clean up in such cases.
         */
        pg_index = heap_open(IndexRelationId, RowExclusiveLock);
 
@@ -1996,9 +2209,10 @@ reindex_index(Oid indexId)
                elog(ERROR, "cache lookup failed for index %u", indexId);
        indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
 
-       if (!indexForm->indisvalid)
+       if (!indexForm->indisvalid || !indexForm->indisready)
        {
                indexForm->indisvalid = true;
+               indexForm->indisready = true;
                simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
                CatalogUpdateIndexes(pg_index, indexTuple);
        }
index e4867c3985a26c1692f9a5c1ebfa42ca4552f36b..e30da413993e2e982b3fc6a9c5484fc7d032051a 100644 (file)
@@ -78,6 +78,10 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple)
        Datum           values[INDEX_MAX_KEYS];
        bool            isnull[INDEX_MAX_KEYS];
 
+       /* HOT update does not require index inserts */
+       if (HeapTupleIsHeapOnly(heapTuple))
+               return;
+
        /*
         * Get information from the state structure.  Fall out if nothing to do.
         */
@@ -101,6 +105,10 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple)
 
                indexInfo = indexInfoArray[i];
 
+               /* If the index is marked as read-only, ignore it */
+               if (!indexInfo->ii_ReadyForInserts)
+                       continue;
+
                /*
                 * Expressional and partial indexes on system catalogs are not
                 * supported
index b1ea8f059d751ba13c7dfae83c598ba320738782..5be4405a25ff3f314bba9ba5947fed629fe2dc8e 100644 (file)
@@ -207,6 +207,7 @@ CREATE VIEW pg_stat_all_tables AS
             pg_stat_get_tuples_inserted(C.oid) AS n_tup_ins, 
             pg_stat_get_tuples_updated(C.oid) AS n_tup_upd, 
             pg_stat_get_tuples_deleted(C.oid) AS n_tup_del,
+            pg_stat_get_tuples_hot_updated(C.oid) AS n_tup_hot_upd,
             pg_stat_get_live_tuples(C.oid) AS n_live_tup, 
             pg_stat_get_dead_tuples(C.oid) AS n_dead_tup,
             pg_stat_get_last_vacuum_time(C.oid) as last_vacuum,
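
With n_tup_hot_upd exposed next to n_tup_upd, a monitoring client can watch what fraction of a table's updates were HOT. A small libpq sketch that simply reads the view as defined above (the connection string is a placeholder; build with something like cc hot_ratio.c -lpq):

#include <stdio.h>
#include <libpq-fe.h>

int
main(void)
{
    /* conninfo is a placeholder; adjust for your environment */
    PGconn     *conn = PQconnectdb("dbname=postgres");
    PGresult   *res;

    if (PQstatus(conn) != CONNECTION_OK)
    {
        fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
        PQfinish(conn);
        return 1;
    }

    /* Compare total updates against HOT updates, per table */
    res = PQexec(conn,
                 "SELECT relname, n_tup_upd, n_tup_hot_upd "
                 "FROM pg_stat_all_tables "
                 "WHERE n_tup_upd > 0 "
                 "ORDER BY n_tup_upd DESC");

    if (PQresultStatus(res) == PGRES_TUPLES_OK)
    {
        for (int i = 0; i < PQntuples(res); i++)
            printf("%s: %s updates, %s HOT\n",
                   PQgetvalue(res, i, 0),
                   PQgetvalue(res, i, 1),
                   PQgetvalue(res, i, 2));
    }
    else
        fprintf(stderr, "query failed: %s", PQerrorMessage(conn));

    PQclear(res);
    PQfinish(conn);
    return 0;
}
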
index 54194ebd533689f061b85ff10638f2c528057768..6c9373bfec51e8038b9071ac6010c399d812412b 100644 (file)
@@ -225,7 +225,9 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid)
        indexInfo->ii_Predicate = NIL;
        indexInfo->ii_PredicateState = NIL;
        indexInfo->ii_Unique = true;
+       indexInfo->ii_ReadyForInserts = true;
        indexInfo->ii_Concurrent = false;
+       indexInfo->ii_BrokenHotChain = false;
 
        classObjectId[0] = OID_BTREE_OPS_OID;
        classObjectId[1] = INT4_BTREE_OPS_OID;
index a7c364a8c542b4736bf7f1f375633f372c0119a9..e6b465802974c9a997b5fa7c84b2c4e90cd7ac58 100644 (file)
@@ -119,6 +119,7 @@ DefineIndex(RangeVar *heapRelation,
        Oid                     namespaceId;
        Oid                     tablespaceId;
        Relation        rel;
+       Relation        indexRelation;
        HeapTuple       tuple;
        Form_pg_am      accessMethodForm;
        bool            amcanorder;
@@ -420,7 +421,10 @@ DefineIndex(RangeVar *heapRelation,
        indexInfo->ii_Predicate = make_ands_implicit(predicate);
        indexInfo->ii_PredicateState = NIL;
        indexInfo->ii_Unique = unique;
+       /* In a concurrent build, mark it not-ready-for-inserts */
+       indexInfo->ii_ReadyForInserts = !concurrent;
        indexInfo->ii_Concurrent = concurrent;
+       indexInfo->ii_BrokenHotChain = false;
 
        classObjectId = (Oid *) palloc(numberOfAttributes * sizeof(Oid));
        coloptions = (int16 *) palloc(numberOfAttributes * sizeof(int16));
@@ -439,23 +443,38 @@ DefineIndex(RangeVar *heapRelation,
                                  primary ? "PRIMARY KEY" : "UNIQUE",
                                  indexRelationName, RelationGetRelationName(rel))));
 
-       /* save lockrelid for below, then close rel */
+       /* save lockrelid and locktag for below, then close rel */
        heaprelid = rel->rd_lockInfo.lockRelId;
+       SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId);
        heap_close(rel, NoLock);
 
+       if (!concurrent)
+       {
+               indexRelationId =
+                       index_create(relationId, indexRelationName, indexRelationId,
+                                                indexInfo, accessMethodId, tablespaceId, classObjectId,
+                                                coloptions, reloptions, primary, isconstraint,
+                                                allowSystemTableMods, skip_build, concurrent);
+
+               return;                                 /* We're done, in the standard case */
+       }
+
+       /*
+        * For a concurrent build, we next insert the catalog entry and add
+        * constraints.  We don't build the index just yet; we must first make
+        * the catalog entry so that the new index is visible to updating
+        * transactions.  That will prevent them from making incompatible HOT
+        * updates.  The new index will be marked not indisready and not
+        * indisvalid, so that no one else tries to either insert into it or use
+        * it for queries.  We pass skip_build = true to prevent the build.
+        */
        indexRelationId =
                index_create(relationId, indexRelationName, indexRelationId,
                                         indexInfo, accessMethodId, tablespaceId, classObjectId,
                                         coloptions, reloptions, primary, isconstraint,
-                                        allowSystemTableMods, skip_build, concurrent);
-
-       if (!concurrent)
-               return;                                 /* We're done, in the standard case */
+                                        allowSystemTableMods, true, concurrent);
 
        /*
-        * Phase 2 of concurrent index build (see comments for validate_index()
-        * for an overview of how this works)
-        *
         * We must commit our current transaction so that the index becomes
         * visible; then start another.  Note that all the data structures we just
         * built are lost in the commit.  The only data we keep past here are the
@@ -476,6 +495,9 @@ DefineIndex(RangeVar *heapRelation,
        StartTransactionCommand();
 
        /*
+        * Phase 2 of concurrent index build (see comments for validate_index()
+        * for an overview of how this works)
+        *
         * Now we must wait until no running transaction could have the table open
         * with the old list of indexes.  To do this, inquire which xacts
         * currently would conflict with ShareLock on the table -- ie, which ones
@@ -494,7 +516,91 @@ DefineIndex(RangeVar *heapRelation,
         * check for that.  Also, prepared xacts are not reported, which is
         * fine since they certainly aren't going to do anything more.
         */
-       SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId);
+       old_lockholders = GetLockConflicts(&heaplocktag, ShareLock);
+
+       while (VirtualTransactionIdIsValid(*old_lockholders))
+       {
+               VirtualXactLockTableWait(*old_lockholders);
+               old_lockholders++;
+       }
+
+       /*
+        * At this moment we are sure that there are no transactions with the
+        * table open for write that don't have this new index in their list of
+        * indexes.  We have waited out all the existing transactions and any new
+        * transaction will have the new index in its list, but the index is still
+        * marked as "not-ready-for-inserts".  However, the index is
+        * consulted when deciding HOT-safety.  This arrangement ensures that no new HOT
+        * chains can be created where the new tuple and the old tuple in the
+        * chain have different index keys.
+        *
+        * We now take a new snapshot, and build the index using all tuples that
+        * are visible in this snapshot.  We can be sure that any HOT updates
+        * to these tuples will be compatible with the index, since any updates
+        * made by transactions that didn't know about the index are now committed
+        * or rolled back.  Thus, each visible tuple is either the end of its
+        * HOT-chain or the extension of the chain is HOT-safe for this index.
+        */
+
+       /* Open and lock the parent heap relation */
+       rel = heap_openrv(heapRelation, ShareUpdateExclusiveLock);
+
+       /* And the target index relation */
+       indexRelation = index_open(indexRelationId, RowExclusiveLock);
+
+       /* Set ActiveSnapshot since functions in the indexes may need it */
+       ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
+
+       /* We have to re-build the IndexInfo struct, since it was lost in commit */
+       indexInfo = BuildIndexInfo(indexRelation);
+       Assert(!indexInfo->ii_ReadyForInserts);
+       indexInfo->ii_Concurrent = true;
+       indexInfo->ii_BrokenHotChain = false;
+
+       /* Now build the index */
+       index_build(rel, indexRelation, indexInfo, primary);
+
+       /* Close both the relations, but keep the locks */
+       heap_close(rel, NoLock);
+       index_close(indexRelation, NoLock);
+
+       /*
+        * Update the pg_index row to mark the index as ready for inserts.
+        * Once we commit this transaction, any new transactions that
+        * open the table must insert new entries into the index for insertions
+        * and non-HOT updates.
+        */
+       pg_index = heap_open(IndexRelationId, RowExclusiveLock);
+
+       indexTuple = SearchSysCacheCopy(INDEXRELID,
+                                                                       ObjectIdGetDatum(indexRelationId),
+                                                                       0, 0, 0);
+       if (!HeapTupleIsValid(indexTuple))
+               elog(ERROR, "cache lookup failed for index %u", indexRelationId);
+       indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
+
+       Assert(!indexForm->indisready);
+       Assert(!indexForm->indisvalid);
+
+       indexForm->indisready = true;
+
+       simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
+       CatalogUpdateIndexes(pg_index, indexTuple);
+
+       heap_close(pg_index, RowExclusiveLock);
+
+       /*
+        * Commit this transaction to make the indisready update visible.
+        */
+       CommitTransactionCommand();
+       StartTransactionCommand();
+
+       /*
+        * Phase 3 of concurrent index build
+        *
+        * We once again wait until no transaction can have the table open
+        * while still treating the index as not ready for inserts.
+        */
        old_lockholders = GetLockConflicts(&heaplocktag, ShareLock);
 
        while (VirtualTransactionIdIsValid(*old_lockholders))
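
The reason a not-yet-ready index must still be consulted is the HOT-safety test: an update can be carried out as a HOT update only if no column used by any index on the table has changed and the new version fits on the same heap page. A simplified sketch of that decision, with column sets modeled as plain bitmasks (hot_update_possible() and its inputs are illustrative, not the backend's actual API):

#include <stdbool.h>
#include <stddef.h>

/*
 * Sketch of the HOT-safety test implied by the comments above: an UPDATE can
 * be performed as a HOT update only if (a) no column used by any index on
 * the table changed -- including indexes that exist in the catalog but are
 * not yet ready or valid -- and (b) the new tuple version fits on the same
 * heap page as the old one.  Column sets are modeled as bitmasks here; the
 * real code works with the relation's index attribute sets.
 */
static bool
hot_update_possible(unsigned changed_cols,
                    const unsigned *index_cols, int nindexes,
                    size_t new_tuple_len, size_t page_free_space)
{
    for (int i = 0; i < nindexes; i++)
    {
        if (changed_cols & index_cols[i])
            return false;       /* an indexed column changed */
    }
    return new_tuple_len <= page_free_space;    /* must stay on same page */
}

int
main(void)
{
    unsigned    idx_cols[] = {0x1, 0x6};        /* two indexes */

    /* changing only column 0x8: HOT is possible if the new version fits */
    return hot_update_possible(0x8, idx_cols, 2, 64, 128) ? 0 : 1;
}
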
@@ -505,7 +611,7 @@ DefineIndex(RangeVar *heapRelation,
 
        /*
         * Now take the "reference snapshot" that will be used by validate_index()
-        * to filter candidate tuples.  Beware!  There might be still snapshots
+        * to filter candidate tuples.  Beware!  There might still be snapshots
         * in use that treat some transaction as in-progress that our reference
         * snapshot treats as committed.  If such a recently-committed transaction
         * deleted tuples in the table, we will not include them in the index; yet
@@ -560,7 +666,7 @@ DefineIndex(RangeVar *heapRelation,
                elog(ERROR, "cache lookup failed for index %u", indexRelationId);
        indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
 
-       Assert(indexForm->indexrelid = indexRelationId);
+       Assert(indexForm->indisready);
        Assert(!indexForm->indisvalid);
 
        indexForm->indisvalid = true;
@@ -575,7 +681,8 @@ DefineIndex(RangeVar *heapRelation,
         * relcache entries for the index itself, but we should also send a
         * relcache inval on the parent table to force replanning of cached plans.
         * Otherwise existing sessions might fail to use the new index where it
-        * would be useful.
+        * would be useful.  (Note that our earlier commits did not create
+        * reasons to replan; relcache flush on the index itself was sufficient.)
         */
        CacheInvalidateRelcacheByRelid(heaprelid.relId);
 
index bdee553bcae0524182e03a50335d37f850844b51..ce591787f3cf07aa8c8816f270706f62fd7885a0 100644 (file)
@@ -1281,7 +1281,7 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record)
        itemsz = record->xl_len - sizeof(xl_seq_rec);
        itemsz = MAXALIGN(itemsz);
        if (PageAddItem(page, (Item) item, itemsz,
-                                       FirstOffsetNumber, false) == InvalidOffsetNumber)
+                                       FirstOffsetNumber, false, false) == InvalidOffsetNumber)
                elog(PANIC, "seq_redo: failed to add item to page");
 
        PageSetLSN(page, lsn);
index 81083ea604db0a00bdd3a61992703f8e808ba28d..a7f0376c69961505fccffc5f99ad676d628b504c 100644 (file)
@@ -124,10 +124,11 @@ typedef VTupleMoveData *VTupleMove;
 typedef struct VRelStats
 {
        /* miscellaneous statistics */
-       BlockNumber rel_pages;
-       double          rel_tuples;
-       Size            min_tlen;
-       Size            max_tlen;
+       BlockNumber rel_pages;          /* pages in relation */
+       double          rel_tuples;             /* tuples that remain after vacuuming */
+       double          rel_indexed_tuples;             /* indexed tuples that remain */
+       Size            min_tlen;               /* min surviving tuple size */
+       Size            max_tlen;               /* max surviving tuple size */
        bool            hasindex;
        /* vtlinks array for tuple chain following - sorted by new_tid */
        int                     num_vtlinks;
@@ -1177,6 +1178,7 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
        vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
        vacrelstats->rel_pages = 0;
        vacrelstats->rel_tuples = 0;
+       vacrelstats->rel_indexed_tuples = 0;
        vacrelstats->hasindex = false;
 
        /* scan the heap */
@@ -1195,13 +1197,13 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
                {
                        for (i = 0; i < nindexes; i++)
                                vacuum_index(&vacuum_pages, Irel[i],
-                                                        vacrelstats->rel_tuples, 0);
+                                                        vacrelstats->rel_indexed_tuples, 0);
                }
                else
                {
                        /* just scan indexes to update statistic */
                        for (i = 0; i < nindexes; i++)
-                               scan_index(Irel[i], vacrelstats->rel_tuples);
+                               scan_index(Irel[i], vacrelstats->rel_indexed_tuples);
                }
        }
 
@@ -1256,6 +1258,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
        BlockNumber empty_pages,
                                empty_end_pages;
        double          num_tuples,
+                               num_indexed_tuples,
                                tups_vacuumed,
                                nkeep,
                                nunused;
@@ -1278,7 +1281,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                                        relname)));
 
        empty_pages = empty_end_pages = 0;
-       num_tuples = tups_vacuumed = nkeep = nunused = 0;
+       num_tuples = num_indexed_tuples = tups_vacuumed = nkeep = nunused = 0;
        free_space = 0;
 
        nblocks = RelationGetNumberOfBlocks(onerel);
@@ -1313,9 +1316,13 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                 * background writer will try to write the page if it's already marked
                 * dirty.  To ensure that invalid data doesn't get written to disk, we
                 * must take exclusive buffer lock wherever we potentially modify
-                * pages.
+                * pages.  In fact, we insist on cleanup lock so that we can safely
+                * call heap_page_prune().  (This might be overkill, since the bgwriter
+                * pays no attention to individual tuples, but on the other hand it's
+                * unlikely that the bgwriter has this particular page pinned at this
+                * instant.  So violating the coding rule would buy us little anyway.)
                 */
-               LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+               LockBufferForCleanup(buf);
 
                vacpage->blkno = blkno;
                vacpage->offsets_used = 0;
@@ -1356,6 +1363,21 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                        continue;
                }
 
+               /* 
+                * Prune all HOT-update chains in this page.
+                *
+                * We use the redirect_move option so that redirecting line pointers
+                * get collapsed out; this allows us to not worry about them below.
+                *
+                * We count tuples removed by the pruning step as removed by VACUUM.
+                */
+               tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
+                                                                                true, false);
+
+               /*
+                * Now scan the page to collect vacuumable items and check for
+                * tuples requiring freezing.
+                */
                nfrozen = 0;
                notup = true;
                maxoff = PageGetMaxOffsetNumber(page);
@@ -1369,7 +1391,9 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
 
                        /*
                         * Collect un-used items too - it's possible to have indexes
-                        * pointing here after crash.
+                        * pointing here after crash.  (That's an ancient comment and
+                        * is likely obsolete with WAL, but we might as well continue
+                        * to check for such problems.)
                         */
                        if (!ItemIdIsUsed(itemid))
                        {
@@ -1378,6 +1402,23 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                                continue;
                        }
 
+                       /*
+                        * DEAD item pointers are to be vacuumed normally; but we don't
+                        * count them in tups_vacuumed, else we'd be double-counting
+                        * (at least in the common case where heap_page_prune() just
+                        * freed up a non-HOT tuple).
+                        */
+                       if (ItemIdIsDead(itemid))
+                       {
+                               vacpage->offsets[vacpage->offsets_free++] = offnum;
+                               continue;
+                       }
+
+                       /* Shouldn't have any redirected items anymore */
+                       if (!ItemIdIsNormal(itemid))
+                               elog(ERROR, "relation \"%s\" TID %u/%u: unexpected redirect item",
+                                        relname, blkno, offnum);
+
                        tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
                        tuple.t_len = ItemIdGetLength(itemid);
                        ItemPointerSet(&(tuple.t_self), blkno, offnum);
@@ -1410,12 +1451,45 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                                        }
                                        break;
                                case HEAPTUPLE_DEAD:
-                                       tupgone = true;         /* we can delete the tuple */
                                        /*
-                                        * We need not require XMIN_COMMITTED or XMAX_COMMITTED to
-                                        * be set, since we will remove the tuple without any
-                                        * further examination of its hint bits.
+                                        * Ordinarily, DEAD tuples would have been removed by
+                                        * heap_page_prune(), but it's possible that the tuple
+                                        * state changed since heap_page_prune() looked.  In
+                                        * particular an INSERT_IN_PROGRESS tuple could have
+                                        * changed to DEAD if the inserter aborted.  So this
+                                        * cannot be considered an error condition, though it
+                                        * does suggest that someone released a lock early.
+                                        *
+                                        * If the tuple is HOT-updated then it must only be
+                                        * removed by a prune operation; so we keep it as if it
+                                        * were RECENTLY_DEAD, and abandon shrinking. (XXX is it
+                                        * worth trying to make the shrinking code smart enough
+                                        * to handle this?  It's an unusual corner case.)
+                                        *
+                                        * DEAD heap-only tuples can safely be removed if they
+                                        * aren't themselves HOT-updated, although this is a bit
+                                        * inefficient since we'll uselessly try to remove
+                                        * index entries for them.
                                         */
+                                       if (HeapTupleIsHotUpdated(&tuple))
+                                       {
+                                               nkeep += 1;
+                                               if (do_shrinking)
+                                                       ereport(LOG,
+                                                                       (errmsg("relation \"%s\" TID %u/%u: dead HOT-updated tuple --- cannot shrink relation",
+                                                                                       relname, blkno, offnum)));
+                                               do_shrinking = false;
+                                       }
+                                       else
+                                       {
+                                               tupgone = true;         /* we can delete the tuple */
+                                               /*
+                                                * We need not require XMIN_COMMITTED or
+                                                * XMAX_COMMITTED to be set, since we will remove the
+                                                * tuple without any further examination of its hint
+                                                * bits.
+                                                */
+                                       }
                                        break;
                                case HEAPTUPLE_RECENTLY_DEAD:
 
@@ -1530,6 +1604,8 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                        else
                        {
                                num_tuples += 1;
+                               if (!HeapTupleIsHeapOnly(&tuple))
+                                       num_indexed_tuples += 1;
                                notup = false;
                                if (tuple.t_len < min_tlen)
                                        min_tlen = tuple.t_len;
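
The HEAPTUPLE_DEAD handling above reduces to a small decision: a DEAD tuple that was HOT-updated can only be reclaimed by pruning, so it is kept (and page shrinking is abandoned), while any other DEAD tuple is removed outright. A compilable sketch of just that branch (the enums and classify() are invented stand-ins for the HeapTupleSatisfiesVacuum() result and surrounding flags):

#include <stdbool.h>

/* Simplified stand-ins for the vacuum visibility results used above */
typedef enum
{
    TUP_DEAD,
    TUP_RECENTLY_DEAD,
    TUP_LIVE
} TupState;

typedef enum
{
    ACTION_REMOVE,              /* collect for removal (tupgone) */
    ACTION_KEEP,                /* count in nkeep, leave for a later prune */
    ACTION_RETAIN               /* ordinary live/recently-dead handling */
} VacAction;

/*
 * Sketch of the DEAD-tuple branch above: a DEAD tuple that is still the
 * HOT-updated member of a chain may only be removed by pruning, so VACUUM
 * FULL keeps it and gives up on shrinking; any other DEAD tuple is removed.
 */
static VacAction
classify(TupState state, bool hot_updated, bool *abandon_shrinking)
{
    if (state != TUP_DEAD)
        return ACTION_RETAIN;

    if (hot_updated)
    {
        *abandon_shrinking = true;      /* do_shrinking = false above */
        return ACTION_KEEP;
    }
    return ACTION_REMOVE;
}

int
main(void)
{
    bool    abandon = false;

    return (classify(TUP_DEAD, true, &abandon) == ACTION_KEEP &&
            abandon &&
            classify(TUP_DEAD, false, &abandon) == ACTION_REMOVE) ? 0 : 1;
}
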
@@ -1549,7 +1625,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                if (tempPage != NULL)
                {
                        /* Some tuples are removable; figure free space after removal */
-                       PageRepairFragmentation(tempPage, NULL);
+                       PageRepairFragmentation(tempPage);
                        vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, tempPage);
                        pfree(tempPage);
                        do_reap = true;
@@ -1558,7 +1634,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
                {
                        /* Just use current available space */
                        vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
-                       /* Need to reap the page if it has LP_UNUSED line pointers */
+                       /* Need to reap the page if it has UNUSED or DEAD line pointers */
                        do_reap = (vacpage->offsets_free > 0);
                }
 
@@ -1621,6 +1697,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
 
        /* save stats in the rel list for use later */
        vacrelstats->rel_tuples = num_tuples;
+       vacrelstats->rel_indexed_tuples = num_indexed_tuples;
        vacrelstats->rel_pages = nblocks;
        if (num_tuples == 0)
                min_tlen = max_tlen = 0;
@@ -1720,6 +1797,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                num_fraged_pages,
                                vacuumed_pages;
        int                     keep_tuples = 0;
+       int                     keep_indexed_tuples = 0;
        PGRUsage        ru0;
 
        pg_rusage_init(&ru0);
@@ -1845,6 +1923,16 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                        if (!ItemIdIsUsed(itemid))
                                continue;
 
+                       if (ItemIdIsDead(itemid))
+                       {
+                               /* just remember it for vacuum_page() */
+                               vacpage->offsets[vacpage->offsets_free++] = offnum;
+                               continue;
+                       }
+
+                       /* Shouldn't have any redirected items now */
+                       Assert(ItemIdIsNormal(itemid));
+
                        tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
                        tuple_len = tuple.t_len = ItemIdGetLength(itemid);
                        ItemPointerSet(&(tuple.t_self), blkno, offnum);
@@ -1906,12 +1994,28 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                        if (i >= vacpage->offsets_free)         /* not found */
                                        {
                                                vacpage->offsets[vacpage->offsets_free++] = offnum;
+                                               /*
+                                                * If this is not a heap-only tuple, there must be an
+                                                * index entry for this item which will be removed in
+                                                * the index cleanup. Decrement the keep_indexed_tuples
+                                                * count to remember this.
+                                                */
+                                               if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
+                                                       keep_indexed_tuples--;
                                                keep_tuples--;
                                        }
                                }
                                else
                                {
                                        vacpage->offsets[vacpage->offsets_free++] = offnum;
+                                       /*
+                                        * If this is not a heap-only tuple, there must be an
+                                        * index entry for this item which will be removed in
+                                        * the index cleanup. Decrement the keep_indexed_tuples
+                                        * count to remember this.
+                                        */
+                                       if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
+                                               keep_indexed_tuples--;
                                        keep_tuples--;
                                }
                                continue;
@@ -2028,7 +2132,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                                break;
                                        }
                                        nextItemid = PageGetItemId(nextPage, nextOffnum);
-                                       if (!ItemIdIsUsed(nextItemid))
+                                       if (!ItemIdIsNormal(nextItemid))
                                        {
                                                ReleaseBuffer(nextBuf);
                                                break;
@@ -2166,7 +2270,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                        Pitemid = PageGetItemId(Ppage,
                                                                   ItemPointerGetOffsetNumber(&(tp.t_self)));
                                        /* this can't happen since we saw tuple earlier: */
-                                       if (!ItemIdIsUsed(Pitemid))
+                                       if (!ItemIdIsNormal(Pitemid))
                                                elog(ERROR, "parent itemid marked as unused");
                                        PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
 
@@ -2268,6 +2372,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                                                         dst_buffer, dst_page, destvacpage,
                                                                         &ec, &Ctid, vtmove[ti].cleanVpd);
 
+                                       /*
+                                        * If the tuple we are moving is a heap-only tuple,
+                                        * this move will generate an additional index entry,
+                                        * so increment the rel_indexed_tuples count.
+                                        */ 
+                                       if (HeapTupleHeaderIsHeapOnly(tuple.t_data))
+                                               vacrelstats->rel_indexed_tuples++;
+
                                        num_moved++;
                                        if (destvacpage->blkno > last_move_dest_block)
                                                last_move_dest_block = destvacpage->blkno;
@@ -2280,7 +2392,31 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                                vacpage->offsets[vacpage->offsets_free++] =
                                                        ItemPointerGetOffsetNumber(&(tuple.t_self));
                                        else
+                                       {
+                                               /*
+                                                * When we move tuple chains, we may need to move
+                                                * tuples from a block that we haven't yet scanned in
+                                                * the outer walk-along-the-relation loop. Note that we
+                                                * can't be moving a tuple from a block that we have
+                                                * already scanned because if such a tuple exists, then
+                                                * we must have moved the chain along with that tuple
+                                                * when we scanned that block. IOW the test of
+                                                * when we scanned that block.  In other words, the test of
+                                                * looking at right now is in a block which is yet to
+                                                * be scanned.
+                                                *
+                                                * We maintain two counters to correctly count the
+                                                * moved-off tuples from blocks that are not yet
+                                                * scanned (keep_tuples) and how many of them have
+                                                * index pointers (keep_indexed_tuples).  The main
+                                                * reason to track the latter is to help verify
+                                                * that indexes have the expected number of entries
+                                                * when all the dust settles.
+                                                */
+                                               if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
+                                                       keep_indexed_tuples++;
                                                keep_tuples++;
+                                       }
 
                                        ReleaseBuffer(dst_buffer);
                                        ReleaseBuffer(Cbuf);
@@ -2328,6 +2464,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                        move_plain_tuple(onerel, buf, page, &tuple,
                                                         dst_buffer, dst_page, dst_vacpage, &ec);
 
+                       /*
+                        * If the tuple we are moving is a heap-only tuple,
+                        * this move will generate an additional index entry,
+                        * so increment the rel_indexed_tuples count.
+                        */
+                       if (HeapTupleHeaderIsHeapOnly(tuple.t_data))
+                               vacrelstats->rel_indexed_tuples++;
+
                        num_moved++;
                        if (dst_vacpage->blkno > last_move_dest_block)
                                last_move_dest_block = dst_vacpage->blkno;
@@ -2361,6 +2505,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
 
                                if (!ItemIdIsUsed(itemid))
                                        continue;
+                               /* Shouldn't be any DEAD or REDIRECT items anymore */
+                               Assert(ItemIdIsNormal(itemid));
+
                                htup = (HeapTupleHeader) PageGetItem(page, itemid);
                                if (htup->t_infomask & HEAP_XMIN_COMMITTED)
                                        continue;
@@ -2389,6 +2536,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                        {
                                                vacpage->offsets[vacpage->offsets_free++] = off;
                                                Assert(keep_tuples > 0);
+                                               /*
+                                                * If this is not a heap-only tuple, there must be an
+                                                * index entry for this item which will be removed in
+                                                * the index cleanup. Decrement the keep_indexed_tuples
+                                                * count to remember this.
+                                                */
+                                               if (!HeapTupleHeaderIsHeapOnly(htup))
+                                                       keep_indexed_tuples--;
                                                keep_tuples--;
                                        }
                                }
@@ -2396,6 +2551,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                                {
                                        vacpage->offsets[vacpage->offsets_free++] = off;
                                        Assert(keep_tuples > 0);
+                                       if (!HeapTupleHeaderIsHeapOnly(htup))
+                                               keep_indexed_tuples--;
                                        keep_tuples--;
                                }
                        }
@@ -2529,11 +2686,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                         * page during chain moves but not been scanned over subsequently.
                         * The tuple ids of these tuples are not recorded as free offsets
                         * for any VacPage, so they will not be cleared from the indexes.
+                        * keep_indexed_tuples is the portion of these that are expected
+                        * to have index entries.
                         */
                        Assert(keep_tuples >= 0);
                        for (i = 0; i < nindexes; i++)
                                vacuum_index(&Nvacpagelist, Irel[i],
-                                                        vacrelstats->rel_tuples, keep_tuples);
+                                                        vacrelstats->rel_indexed_tuples,
+                                                        keep_indexed_tuples);
                }
 
                /*
@@ -2551,7 +2711,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                        OffsetNumber unused[MaxOffsetNumber];
                        OffsetNumber offnum,
                                                maxoff;
-                       int                     uncnt;
+                       int                     uncnt = 0;
                        int                     num_tuples = 0;
 
                        buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy);
@@ -2567,6 +2727,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
 
                                if (!ItemIdIsUsed(itemid))
                                        continue;
+                               /* Shouldn't be any DEAD or REDIRECT items anymore */
+                               Assert(ItemIdIsNormal(itemid));
+
                                htup = (HeapTupleHeader) PageGetItem(page, itemid);
                                if (htup->t_infomask & HEAP_XMIN_COMMITTED)
                                        continue;
@@ -2584,12 +2747,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
 
                                ItemIdSetUnused(itemid);
                                num_tuples++;
+
+                               unused[uncnt++] = offnum;
                        }
                        Assert(vacpage->offsets_free == num_tuples);
 
                        START_CRIT_SECTION();
 
-                       uncnt = PageRepairFragmentation(page, unused);
+                       PageRepairFragmentation(page);
 
                        MarkBufferDirty(buf);
 
@@ -2598,7 +2763,10 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
                        {
                                XLogRecPtr      recptr;
 
-                               recptr = log_heap_clean(onerel, buf, unused, uncnt);
+                               recptr = log_heap_clean(onerel, buf,
+                                                                               NULL, 0, NULL, 0,
+                                                                               unused, uncnt,
+                                                                               false);
                                PageSetLSN(page, recptr);
                                PageSetTLI(page, ThisTimeLineID);
                        }
@@ -2706,15 +2874,17 @@ move_chain_tuple(Relation rel,
 
        /*
         * Update the state of the copied tuple, and store it on the destination
-        * page.
+        * page.  The copied tuple is never part of a HOT chain.
         */
        newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
                                                                   HEAP_XMIN_INVALID |
                                                                   HEAP_MOVED_OFF);
        newtup.t_data->t_infomask |= HEAP_MOVED_IN;
+       HeapTupleHeaderClearHotUpdated(newtup.t_data);
+       HeapTupleHeaderClearHeapOnly(newtup.t_data);
        HeapTupleHeaderSetXvac(newtup.t_data, myXID);
        newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
-                                                InvalidOffsetNumber, false);
+                                                InvalidOffsetNumber, false, true);
        if (newoff == InvalidOffsetNumber)
                elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain",
                         (unsigned long) tuple_len, dst_vacpage->blkno);
@@ -2809,17 +2979,19 @@ move_plain_tuple(Relation rel,
        START_CRIT_SECTION();
 
        /*
-        * Mark new tuple as MOVED_IN by me.
+        * Mark new tuple as MOVED_IN by me; also mark it not HOT.
         */
        newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
                                                                   HEAP_XMIN_INVALID |
                                                                   HEAP_MOVED_OFF);
        newtup.t_data->t_infomask |= HEAP_MOVED_IN;
+       HeapTupleHeaderClearHotUpdated(newtup.t_data);
+       HeapTupleHeaderClearHeapOnly(newtup.t_data);
        HeapTupleHeaderSetXvac(newtup.t_data, myXID);
 
        /* add tuple to the page */
        newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
-                                                InvalidOffsetNumber, false);
+                                                InvalidOffsetNumber, false, true);
        if (newoff == InvalidOffsetNumber)
                elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
                         (unsigned long) tuple_len,
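
Both move_chain_tuple() and move_plain_tuple() clear the moved copy's HOT flags because a HOT chain can never span pages; once a version has been relocated it needs an index entry of its own. A tiny sketch of that flag handling with a made-up flags word (the real bits live in the tuple header and are cleared with the macros shown above):

#include <stdio.h>

/* Hypothetical flag bits standing in for the real tuple-header flags */
#define FLAG_HEAP_ONLY      0x01    /* tuple has no index entry of its own */
#define FLAG_HOT_UPDATED    0x02    /* tuple was HOT-updated in place */

/*
 * Sketch of the flag handling in move_chain_tuple()/move_plain_tuple():
 * a tuple copied to a different page can no longer be part of a HOT chain,
 * so both flags are cleared and the caller must create an index entry for
 * the new copy.
 */
static unsigned
prepare_moved_copy(unsigned flags)
{
    return flags & ~(FLAG_HEAP_ONLY | FLAG_HOT_UPDATED);
}

int
main(void)
{
    unsigned    flags = FLAG_HEAP_ONLY | FLAG_HOT_UPDATED;

    printf("before move: 0x%02x, after move: 0x%02x\n",
           flags, prepare_moved_copy(flags));
    return 0;
}
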
@@ -2934,6 +3106,9 @@ update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages,
 
                        if (!ItemIdIsUsed(itemid))
                                continue;
+                       /* Shouldn't be any DEAD or REDIRECT items anymore */
+                       Assert(ItemIdIsNormal(itemid));
+
                        htup = (HeapTupleHeader) PageGetItem(page, itemid);
                        if (htup->t_infomask & HEAP_XMIN_COMMITTED)
                                continue;
@@ -3019,10 +3194,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
 static void
 vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
 {
-       OffsetNumber unused[MaxOffsetNumber];
-       int                     uncnt;
        Page            page = BufferGetPage(buffer);
-       ItemId          itemid;
        int                     i;
 
        /* There shouldn't be any tuples moved onto the page yet! */
@@ -3032,11 +3204,12 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
 
        for (i = 0; i < vacpage->offsets_free; i++)
        {
-               itemid = PageGetItemId(page, vacpage->offsets[i]);
+               ItemId          itemid = PageGetItemId(page, vacpage->offsets[i]);
+
                ItemIdSetUnused(itemid);
        }
 
-       uncnt = PageRepairFragmentation(page, unused);
+       PageRepairFragmentation(page);
 
        MarkBufferDirty(buffer);
 
@@ -3045,7 +3218,10 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
        {
                XLogRecPtr      recptr;
 
-               recptr = log_heap_clean(onerel, buffer, unused, uncnt);
+               recptr = log_heap_clean(onerel, buffer,
+                                                               NULL, 0, NULL, 0,
+                                                               vacpage->offsets, vacpage->offsets_free,
+                                                               false);
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
        }
@@ -3527,8 +3703,7 @@ enough_space(VacPage vacpage, Size len)
 static Size
 PageGetFreeSpaceWithFillFactor(Relation relation, Page page)
 {
-       PageHeader      pd = (PageHeader) page;
-       Size            freespace = pd->pd_upper - pd->pd_lower;
+       Size            freespace = PageGetHeapFreeSpace(page);
        Size            targetfree;
 
        targetfree = RelationGetTargetPageFreeSpace(relation,
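
The PageGetFreeSpace() calls in the vacuum code give way to PageGetHeapFreeSpace(), which reports no usable space once a page has reached the heap line-pointer limit with no reusable line pointer, so callers never try to put another heap tuple there. A simplified standalone model of that rule (the struct and constant are invented; the real function inspects the page header itself):

#include <stdbool.h>
#include <stddef.h>

#define MAX_HEAP_ITEMS  32      /* stand-in for MaxHeapTuplesPerPage */

/* Toy page summary; the real function looks at the page header directly */
typedef struct PageSummary
{
    size_t  raw_free_bytes;     /* pd_upper - pd_lower equivalent */
    int     line_pointers;      /* total line pointers on the page */
    bool    has_free_lp;        /* at least one line pointer is reusable */
} PageSummary;

/*
 * Sketch of the PageGetHeapFreeSpace() idea: report no usable space once a
 * page has hit its line-pointer limit with nothing reusable, even if raw
 * byte space remains, so callers never add another heap tuple there.
 */
static size_t
heap_free_space(const PageSummary *p)
{
    if (p->line_pointers >= MAX_HEAP_ITEMS && !p->has_free_lp)
        return 0;
    return p->raw_free_bytes;
}

int
main(void)
{
    PageSummary full = {200, MAX_HEAP_ITEMS, false};
    PageSummary open = {200, 10, false};

    return (heap_free_space(&full) == 0 &&
            heap_free_space(&open) == 200) ? 0 : 1;
}
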
index 0f325a73a4c8169e5474639c51eb48c0ba02df9b..7d9e3fb421edc700eab06371b60a91f7d3c83c7a 100644 (file)
@@ -326,8 +326,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 
                buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
 
-               /* Initially, we only need shared access to the buffer */
-               LockBuffer(buf, BUFFER_LOCK_SHARE);
+               /* We need buffer cleanup lock so that we can prune HOT chains. */
+               LockBufferForCleanup(buf);
 
                page = BufferGetPage(buf);
 
@@ -341,11 +341,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                         * We have to be careful here because we could be looking at a
                         * page that someone has just added to the relation and not yet
                         * been able to initialize (see RelationGetBufferForTuple). To
-                        * interlock against that, release the buffer read lock (which we
-                        * must do anyway) and grab the relation extension lock before
-                        * re-locking in exclusive mode.  If the page is still
-                        * uninitialized by then, it must be left over from a crashed
-                        * backend, and we can initialize it.
+                        * protect against that, release the buffer lock, grab the
+                        * relation extension lock momentarily, and re-lock the buffer.
+                        * If the page is still uninitialized by then, it must be left
+                        * over from a crashed backend, and we can initialize it.
                         *
                         * We don't really need the relation lock when this is a new or
                         * temp relation, but it's probably not worth the code space to
@@ -357,7 +356,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                        LockRelationForExtension(onerel, ExclusiveLock);
                        UnlockRelationForExtension(onerel, ExclusiveLock);
-                       LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+                       LockBufferForCleanup(buf);
                        if (PageIsNew(page))
                        {
                                ereport(WARNING,
@@ -366,7 +365,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                                PageInit(page, BufferGetPageSize(buf), 0);
                                empty_pages++;
                                lazy_record_free_space(vacrelstats, blkno,
-                                                                          PageGetFreeSpace(page));
+                                                                          PageGetHeapFreeSpace(page));
                        }
                        MarkBufferDirty(buf);
                        UnlockReleaseBuffer(buf);
@@ -377,11 +376,23 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                {
                        empty_pages++;
                        lazy_record_free_space(vacrelstats, blkno,
-                                                                  PageGetFreeSpace(page));
+                                                                  PageGetHeapFreeSpace(page));
                        UnlockReleaseBuffer(buf);
                        continue;
                }
 
+               /* 
+                * Prune all HOT-update chains in this page.
+                *
+                * We count tuples removed by the pruning step as removed by VACUUM.
+                */
+               tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
+                                                                                false, false);
+
+               /*
+                * Now scan the page to collect vacuumable items and check for
+                * tuples requiring freezing.
+                */
                nfrozen = 0;
                hastup = false;
                prev_dead_count = vacrelstats->num_dead_tuples;
@@ -394,22 +405,64 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 
                        itemid = PageGetItemId(page, offnum);
 
+                       /* Unused items require no processing, but we count 'em */
                        if (!ItemIdIsUsed(itemid))
                        {
                                nunused += 1;
                                continue;
                        }
 
+                       /* Redirect items mustn't be touched */
+                       if (ItemIdIsRedirected(itemid))
+                       {
+                               hastup = true;  /* this page won't be truncatable */
+                               continue;
+                       }
+
+                       ItemPointerSet(&(tuple.t_self), blkno, offnum);
+
+                       /*
+                        * DEAD item pointers are to be vacuumed normally; but we don't
+                        * count them in tups_vacuumed, else we'd be double-counting
+                        * (at least in the common case where heap_page_prune() just
+                        * freed up a non-HOT tuple).
+                        */
+                       if (ItemIdIsDead(itemid))
+                       {
+                               lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
+                               continue;
+                       }
+
+                       Assert(ItemIdIsNormal(itemid));
+
                        tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
                        tuple.t_len = ItemIdGetLength(itemid);
-                       ItemPointerSet(&(tuple.t_self), blkno, offnum);
 
                        tupgone = false;
 
                        switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
                        {
                                case HEAPTUPLE_DEAD:
-                                       tupgone = true;         /* we can delete the tuple */
+                                       /*
+                                        * Ordinarily, DEAD tuples would have been removed by
+                                        * heap_page_prune(), but it's possible that the tuple
+                                        * state changed since heap_page_prune() looked.  In
+                                        * particular an INSERT_IN_PROGRESS tuple could have
+                                        * changed to DEAD if the inserter aborted.  So this
+                                        * cannot be considered an error condition.
+                                        *
+                                        * If the tuple is HOT-updated then it must only be
+                                        * removed by a prune operation; so we keep it just as
+                                        * if it were RECENTLY_DEAD.  Also, if it's a heap-only
+                                        * tuple, we choose to keep it, because it'll be a
+                                        * lot cheaper to get rid of it in the next pruning pass
+                                        * than to treat it like an indexed tuple.
+                                        */
+                                       if (HeapTupleIsHotUpdated(&tuple) ||
+                                               HeapTupleIsHeapOnly(&tuple))
+                                               nkeep += 1;
+                                       else
+                                               tupgone = true;         /* we can delete the tuple */
                                        break;
                                case HEAPTUPLE_LIVE:
                                        /* Tuple is good --- but let's do some validity checks */
@@ -449,11 +502,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 
                                /*
                                 * Each non-removable tuple must be checked to see if it
-                                * needs freezing.  If we already froze anything, then
-                                * we've already switched the buffer lock to exclusive.
+                                * needs freezing.  Note we already have exclusive buffer lock.
                                 */
                                if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
-                                                                         (nfrozen > 0) ? InvalidBuffer : buf))
+                                                                         InvalidBuffer))
                                        frozen[nfrozen++] = offnum;
                        }
                }                                               /* scan along page */
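The item-pointer classification in the lazy_scan_heap hunk above is the core of what pruning changes for VACUUM: a line pointer can now be unused, redirected, dead-but-possibly-still-indexed, or a normal tuple. A minimal classification sketch, assuming only the item-id macros used in the hunk (ItemIdIsUsed, ItemIdIsRedirected, ItemIdIsDead, ItemIdIsNormal); the function and counter names are invented for illustration and are not part of the patch.

static void
count_lp_states(Page page, int *nunused, int *nredirected, int *ndead, int *nnormal)
{
	OffsetNumber offnum;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

	*nunused = *nredirected = *ndead = *nnormal = 0;
	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++)
	{
		ItemId		itemid = PageGetItemId(page, offnum);

		if (!ItemIdIsUsed(itemid))
			(*nunused)++;		/* free slot, reusable by PageAddItem */
		else if (ItemIdIsRedirected(itemid))
			(*nredirected)++;	/* points at the current root of a HOT chain */
		else if (ItemIdIsDead(itemid))
			(*ndead)++;			/* pruned tuple; index entries may still exist */
		else
		{
			Assert(ItemIdIsNormal(itemid));
			(*nnormal)++;		/* ordinary tuple with storage */
		}
	}
}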
@@ -485,9 +537,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                if (nindexes == 0 &&
                        vacrelstats->num_dead_tuples > 0)
                {
-                       /* Trade in buffer share lock for super-exclusive lock */
-                       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-                       LockBufferForCleanup(buf);
                        /* Remove tuples from heap */
                        lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
                        /* Forget the now-vacuumed tuples, and press on */
@@ -505,7 +554,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                if (vacrelstats->num_dead_tuples == prev_dead_count)
                {
                        lazy_record_free_space(vacrelstats, blkno,
-                                                                  PageGetFreeSpace(page));
+                                                                  PageGetHeapFreeSpace(page));
                }
 
                /* Remember the location of the last page with nonremovable tuples */
@@ -598,7 +647,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
                /* Now that we've compacted the page, record its available space */
                page = BufferGetPage(buf);
                lazy_record_free_space(vacrelstats, tblk,
-                                                          PageGetFreeSpace(page));
+                                                          PageGetHeapFreeSpace(page));
                UnlockReleaseBuffer(buf);
                npages++;
        }
@@ -615,7 +664,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
  *     lazy_vacuum_page() -- free dead tuples on a page
  *                                      and repair its fragmentation.
  *
- * Caller must hold pin and lock on the buffer.
+ * Caller must hold pin and buffer cleanup lock on the buffer.
  *
  * tupindex is the index in vacrelstats->dead_tuples of the first dead
  * tuple for this page.  We assume the rest follow sequentially.
@@ -625,10 +674,9 @@ static int
 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
                                 int tupindex, LVRelStats *vacrelstats)
 {
-       OffsetNumber unused[MaxOffsetNumber];
-       int                     uncnt;
        Page            page = BufferGetPage(buffer);
-       ItemId          itemid;
+       OffsetNumber unused[MaxOffsetNumber];
+       int                     uncnt = 0;
 
        START_CRIT_SECTION();
 
@@ -636,6 +684,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
        {
                BlockNumber tblk;
                OffsetNumber toff;
+               ItemId          itemid;
 
                tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
                if (tblk != blkno)
@@ -643,9 +692,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
                toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
                itemid = PageGetItemId(page, toff);
                ItemIdSetUnused(itemid);
+               unused[uncnt++] = toff;
        }
 
-       uncnt = PageRepairFragmentation(page, unused);
+       PageRepairFragmentation(page);
 
        MarkBufferDirty(buffer);
 
@@ -654,7 +704,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
        {
                XLogRecPtr      recptr;
 
-               recptr = log_heap_clean(onerel, buffer, unused, uncnt);
+               recptr = log_heap_clean(onerel, buffer,
+                                                               NULL, 0, NULL, 0,
+                                                               unused, uncnt,
+                                                               false);
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
        }
@@ -980,7 +1033,7 @@ lazy_record_dead_tuple(LVRelStats *vacrelstats,
        /*
         * The array shouldn't overflow under normal behavior, but perhaps it
         * could if we are given a really small maintenance_work_mem. In that
-        * case, just forget the last few tuples.
+        * case, just forget the last few tuples (we'll get 'em next time).
         */
        if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
        {
index 5c09ba86dd11ffe0faed194602d995325027ab69..74588d23c685c465c9e9043256dfbfdef7087a48 100644 (file)
@@ -1813,8 +1813,10 @@ lreplace:;
         *
         * Note: heap_update returns the tid (location) of the new tuple in the
         * t_self field.
+        *
+        * If it's a HOT update, we mustn't insert new index entries.
         */
-       if (resultRelInfo->ri_NumIndices > 0)
+       if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple))
                ExecInsertIndexTuples(slot, &(tuple->t_self), estate, false);
 
        /* AFTER ROW UPDATE Triggers */
index db0e70b67fb0c5598107eb937463e7ed4e7d4259..e25f0ad0ed7b9ffcac4caffdd06e3d2a08a07385 100644 (file)
@@ -981,6 +981,10 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
  *             stuff as it only exists here because the genam stuff
  *             doesn't provide the functionality needed by the
  *             executor.. -cim 9/27/89
+ *
+ *             CAUTION: this must not be called for a HOT update.
+ *             We can't defend against that here for lack of info.
+ *             Should we change the API to make it safer?
  * ----------------------------------------------------------------
  */
 void
@@ -1029,6 +1033,10 @@ ExecInsertIndexTuples(TupleTableSlot *slot,
 
                indexInfo = indexInfoArray[i];
 
+               /* If the index is marked as read-only, ignore it */
+               if (!indexInfo->ii_ReadyForInserts)
+                       continue;
+
                /* Check for partial index */
                if (indexInfo->ii_Predicate != NIL)
                {
index c830e82f147dc2583e4b54d50e6b525fc8f1e1d9..7ef4e0734d2748c76245684246d7711adf751cef 100644 (file)
@@ -240,12 +240,7 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
        BlockNumber page = tbmres->blockno;
        Buffer          buffer;
        Snapshot        snapshot;
-       Page            dp;
        int                     ntup;
-       int                     curslot;
-       int                     minslot;
-       int                     maxslot;
-       int                     maxoff;
 
        /*
         * Acquire pin on the target heap page, trading in any pin we held before.
@@ -258,6 +253,13 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
        buffer = scan->rs_cbuf;
        snapshot = scan->rs_snapshot;
 
+       ntup = 0;
+
+       /*
+        * Prune and repair fragmentation for the whole page, if possible.
+        */
+       heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+
        /*
         * We must hold share lock on the buffer content while examining tuple
         * visibility.  Afterwards, however, the tuples we have found to be
@@ -265,71 +267,51 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
         */
        LockBuffer(buffer, BUFFER_LOCK_SHARE);
 
-       dp = (Page) BufferGetPage(buffer);
-       maxoff = PageGetMaxOffsetNumber(dp);
-
        /*
-        * Determine how many entries we need to look at on this page. If the
-        * bitmap is lossy then we need to look at each physical item pointer;
-        * otherwise we just look through the offsets listed in tbmres.
+        * We need two separate strategies for lossy and non-lossy cases.
         */
        if (tbmres->ntuples >= 0)
        {
-               /* non-lossy case */
-               minslot = 0;
-               maxslot = tbmres->ntuples - 1;
-       }
-       else
-       {
-               /* lossy case */
-               minslot = FirstOffsetNumber;
-               maxslot = maxoff;
-       }
-
-       ntup = 0;
-       for (curslot = minslot; curslot <= maxslot; curslot++)
-       {
-               OffsetNumber targoffset;
-               ItemId          lp;
-               HeapTupleData loctup;
-               bool            valid;
-
-               if (tbmres->ntuples >= 0)
-               {
-                       /* non-lossy case */
-                       targoffset = tbmres->offsets[curslot];
-               }
-               else
-               {
-                       /* lossy case */
-                       targoffset = (OffsetNumber) curslot;
-               }
-
                /*
-                * We'd better check for out-of-range offnum in case of VACUUM since
-                * the TID was obtained.
+                * Bitmap is non-lossy, so we just look through the offsets listed in
+                * tbmres; but we have to follow any HOT chain starting at each such
+                * offset.
                 */
-               if (targoffset < FirstOffsetNumber || targoffset > maxoff)
-                       continue;
+               int curslot;
 
-               lp = PageGetItemId(dp, targoffset);
+               for (curslot = 0; curslot < tbmres->ntuples; curslot++)
+               {
+                       OffsetNumber offnum = tbmres->offsets[curslot];
+                       ItemPointerData tid;
 
+                       ItemPointerSet(&tid, page, offnum);
+                       if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL))
+                               scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
+               }
+       }
+       else
+       {
                /*
-                * Must check for deleted tuple.
+                * Bitmap is lossy, so we must examine each item pointer on the page.
+                * But we can ignore HOT chains, since we'll check each tuple anyway.
                 */
-               if (!ItemIdIsNormal(lp))
-                       continue;
+               Page            dp = (Page) BufferGetPage(buffer);
+               OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
+               OffsetNumber offnum;
 
-               /*
-                * check time qualification of tuple, remember it if valid
-                */
-               loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
-               loctup.t_len = ItemIdGetLength(lp);
-               ItemPointerSet(&(loctup.t_self), page, targoffset);
+               for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++)
+               {
+                       ItemId          lp;
+                       HeapTupleData loctup;
 
-               valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
-               if (valid)
-                       scan->rs_vistuples[ntup++] = targoffset;
+                       lp = PageGetItemId(dp, offnum);
+                       if (!ItemIdIsNormal(lp))
+                               continue;
+                       loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
+                       loctup.t_len = ItemIdGetLength(lp);
+                       if (HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer))
+                               scan->rs_vistuples[ntup++] = offnum;
+               }
        }
 
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
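The non-lossy branch above shows the executor-side pattern: a TID taken from the bitmap is treated as the root of a HOT chain and handed to heap_hot_search_buffer(), which reports the visible chain member, if any. As a sketch of the same idea through the relation-level wrapper whose prototype is added to heapam.h below, here is a hypothetical probe function; the interpretation of all_dead (every chain member is dead, so the index entry could be marked killed) is an assumption based on the xs_hot_dead comment added to relscan.h.

static bool
probe_hot_chain(Relation heapRel, ItemPointer root_tid, Snapshot snapshot)
{
	ItemPointerData tid = *root_tid;	/* may be updated to the visible member */
	bool		all_dead = false;
	bool		found;

	found = heap_hot_search(&tid, heapRel, snapshot, &all_dead);

	if (!found && all_dead)
	{
		/*
		 * No member of the chain is visible and none can become visible:
		 * the index entry pointing at root_tid is a candidate for killing.
		 */
	}
	return found;
}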
index 2281b1415dde4998f76675c9f6aecddb0ca86251..d17ec0032eadb75d692631e62b9c351a5245dd09 100644 (file)
@@ -1407,6 +1407,7 @@ _SPI_prepare_plan(const char *src, SPIPlanPtr plan)
                plansource->num_params = nargs;
                plansource->fully_planned = true;
                plansource->fixed_result = false;
+               /* no need to set search_path, generation or saved_xmin */
                plansource->resultDesc = PlanCacheComputeResultDesc(stmt_list);
                plansource->plan = cplan;
 
@@ -1973,6 +1974,7 @@ _SPI_copy_plan(SPIPlanPtr plan, MemoryContext parentcxt)
                newsource->num_params = newplan->nargs;
                newsource->fully_planned = plansource->fully_planned;
                newsource->fixed_result = plansource->fixed_result;
+               /* no need to worry about search_path, generation or saved_xmin */
                if (plansource->resultDesc)
                        newsource->resultDesc = CreateTupleDescCopy(plansource->resultDesc);
                newsource->plan = newcplan;
index df7512b9d2ceb9bea60cb02a694c7ab2ce28b076..b8c435d9d76018de8d31c40f1688a65d8b724fd7 100644 (file)
@@ -32,6 +32,7 @@
 #include <limits.h>
 
 #include "access/htup.h"
+#include "nodes/bitmapset.h"
 #include "nodes/tidbitmap.h"
 #include "storage/bufpage.h"
 #include "utils/hsearch.h"
@@ -61,9 +62,7 @@
  */
 #define PAGES_PER_CHUNK  (BLCKSZ / 32)
 
-/* The bitmap unit size can be adjusted by changing these declarations: */
-#define BITS_PER_BITMAPWORD 32
-typedef uint32 bitmapword;             /* must be an unsigned type */
+/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
 
 #define WORDNUM(x)     ((x) / BITS_PER_BITMAPWORD)
 #define BITNUM(x)      ((x) % BITS_PER_BITMAPWORD)
index 5fe85770c5addf8f6f012c756da612f5b327824f..9b296df5258f435b54d5f41e75e14a0c3f73aaf1 100644 (file)
@@ -134,6 +134,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
        glob->subrtables = NIL;
        glob->rewindPlanIDs = NULL;
        glob->finalrtable = NIL;
+       glob->transientPlan = false;
 
        /* Determine what fraction of the plan is likely to be scanned */
        if (cursorOptions & CURSOR_OPT_FAST_PLAN)
@@ -183,6 +184,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
 
        result->commandType = parse->commandType;
        result->canSetTag = parse->canSetTag;
+       result->transientPlan = glob->transientPlan;
        result->planTree = top_plan;
        result->rtable = glob->finalrtable;
        result->resultRelations = root->resultRelations;
index 60d6ace71fbc23850408de8ed5faf1b284f54c68..597ecef32a87fef70c52ed6ded35e0d31e01605a 100644 (file)
@@ -19,6 +19,7 @@
 
 #include "access/genam.h"
 #include "access/heapam.h"
+#include "access/transam.h"
 #include "catalog/pg_inherits.h"
 #include "nodes/makefuncs.h"
 #include "optimizer/clauses.h"
@@ -164,6 +165,20 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
                                continue;
                        }
 
+                       /*
+                        * If the index is valid, but cannot yet be used, ignore it;
+                        * but mark the plan we are generating as transient.
+                        * See src/backend/access/heap/README.HOT for discussion.
+                        */
+                       if (index->indcheckxmin &&
+                               !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data),
+                                                                          TransactionXmin))
+                       {
+                               root->glob->transientPlan = true;
+                               index_close(indexRelation, NoLock);
+                               continue;
+                       }
+
                        info = makeNode(IndexOptInfo);
 
                        info->indexoid = index->indexrelid;
index 11714a041de45db1056cec3a6ab7337a79cfba83..fdeca3bfacc79e882165b752effc912102b8678e 100644 (file)
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "access/htup.h"
 #include "optimizer/clauses.h"
 #include "optimizer/prep.h"
 #include "optimizer/var.h"
@@ -54,6 +55,7 @@ typedef struct
 
 static bool pull_varnos_walker(Node *node,
                                   pull_varnos_context *context);
+static bool pull_varattnos_walker(Node *node, Bitmapset **varattnos);
 static bool contain_var_reference_walker(Node *node,
                                                         contain_var_reference_context *context);
 static bool contain_var_clause_walker(Node *node, void *context);
@@ -134,6 +136,47 @@ pull_varnos_walker(Node *node, pull_varnos_context *context)
                                                                  (void *) context);
 }
 
+/*
+ * pull_varattnos
+ *             Find all the distinct attribute numbers present in an expression tree,
+ *             and add them to the initial contents of *varattnos.
+ *             Only Vars that reference RTE 1 of rtable level zero are considered.
+ *
+ * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that
+ * we can include system attributes (e.g., OID) in the bitmap representation.
+ *
+ * Currently, this does not support subqueries or expressions containing
+ * references to multiple tables; that is not needed, since it is only applied
+ * to index expressions and predicates.
+ */
+void
+pull_varattnos(Node *node, Bitmapset **varattnos)
+{
+       (void) pull_varattnos_walker(node, varattnos);
+}
+
+static bool
+pull_varattnos_walker(Node *node, Bitmapset **varattnos)
+{
+       if (node == NULL)
+               return false;
+       if (IsA(node, Var))
+       {
+               Var                *var = (Var *) node;
+
+               Assert(var->varno == 1);
+               *varattnos = bms_add_member(*varattnos,
+                                               var->varattno - FirstLowInvalidHeapAttributeNumber);
+               return false;
+       }
+       /* Should not find a subquery or subplan */
+       Assert(!IsA(node, Query));
+       Assert(!is_subplan(node));
+
+       return expression_tree_walker(node, pull_varattnos_walker,
+                                                                 (void *) varattnos);
+}
+
 
 /*
  *             contain_var_reference
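A short usage sketch for pull_varattnos(): gather the columns referenced by an index's expressions and predicate, then test one column for membership, using the same FirstLowInvalidHeapAttributeNumber offset described above. The helper is invented for illustration; plain key columns would still have to be added from ii_KeyAttrNumbers, which is what RelationGetIndexAttrBitmap() does in the relcache changes further down.

static bool
index_exprs_use_column(IndexInfo *indexInfo, AttrNumber attno)
{
	Bitmapset  *attrs_used = NULL;
	bool		result;

	pull_varattnos((Node *) indexInfo->ii_Expressions, &attrs_used);
	pull_varattnos((Node *) indexInfo->ii_Predicate, &attrs_used);

	result = bms_is_member(attno - FirstLowInvalidHeapAttributeNumber,
						   attrs_used);
	bms_free(attrs_used);
	return result;
}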
index 6aa9fdb28639c58e6a6e1701d05340d1bd5f6edc..04ac6c76922a984c1d8cccfa46eb45c99fee836b 100644 (file)
@@ -1294,7 +1294,7 @@ pgstat_count_heap_insert(Relation rel)
  * pgstat_count_heap_update - count a tuple update
  */
 void
-pgstat_count_heap_update(Relation rel)
+pgstat_count_heap_update(Relation rel, bool hot)
 {
        PgStat_TableStatus *pgstat_info = rel->pgstat_info;
 
@@ -1304,6 +1304,9 @@ pgstat_count_heap_update(Relation rel)
 
                /* t_tuples_updated is nontransactional, so just advance it */
                pgstat_info->t_counts.t_tuples_updated++;
+               /* ditto for the hot_update counter */
+               if (hot)
+                       pgstat_info->t_counts.t_tuples_hot_updated++;
 
                /* We have to log the transactional effect at the proper level */
                if (pgstat_info->trans == NULL ||
@@ -1340,6 +1343,23 @@ pgstat_count_heap_delete(Relation rel)
        }
 }
 
+/*
+ * pgstat_update_heap_dead_tuples - update dead-tuples count
+ *
+ * The semantics of this are that we are reporting the nontransactional
+ * recovery of "delta" dead tuples; so t_new_dead_tuples decreases
+ * rather than increases, and the change goes straight into the per-table
+ * counter, not into transactional state.
+ */
+void
+pgstat_update_heap_dead_tuples(Relation rel, int delta)
+{
+       PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+       if (pgstat_collect_tuplelevel && pgstat_info != NULL)
+               pgstat_info->t_counts.t_new_dead_tuples -= delta;
+}
+
 
 /* ----------
  * AtEOXact_PgStat
@@ -2901,6 +2921,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
                        tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
                        tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
                        tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->tuples_hot_updated = tabmsg[i].t_counts.t_tuples_hot_updated;
                        tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
                        tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
                        tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
@@ -2923,6 +2944,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
                        tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
                        tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
                        tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
+                       tabentry->tuples_hot_updated += tabmsg[i].t_counts.t_tuples_hot_updated;
                        tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
                        tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
                        tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
@@ -2931,6 +2953,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 
                /* Clamp n_live_tuples in case of negative new_live_tuples */
                tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
+               /* Likewise for n_dead_tuples */
+               tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
 
                /*
                 * Add per-table stats to the per-database entry, too.
@@ -3115,6 +3139,7 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
        else
                tabentry->vacuum_timestamp = msg->m_vacuumtime;
        tabentry->n_live_tuples = msg->m_tuples;
+       /* Resetting dead_tuples to 0 is an approximation ... */
        tabentry->n_dead_tuples = 0;
        if (msg->m_analyze)
        {
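A sketch of the call pattern the new statistics hooks imply; the enclosing function is hypothetical, since in the patch these calls live in the heap-update and pruning code paths.

static void
report_update_stats(Relation rel, bool use_hot_update, int ndead_reclaimed)
{
	/* every update is counted; a HOT update also bumps the HOT counter */
	pgstat_count_heap_update(rel, use_hot_update);

	/*
	 * Dead tuples reclaimed by pruning (outside VACUUM) reduce the
	 * dead-tuple estimate nontransactionally.
	 */
	if (ndead_reclaimed > 0)
		pgstat_update_heap_dead_tuples(rel, ndead_reclaimed);
}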
index fd6ad150680c18debe3e0eff7652f0e08497f57a..1ba34c84341dc7ae945d7a386779a32375788db4 100644 (file)
@@ -2066,6 +2066,55 @@ LockBufferForCleanup(Buffer buffer)
        }
 }
 
+/*
+ * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
+ *
+ * We won't loop, but just check once to see if the pin count is OK.  If
+ * not, return FALSE with no lock held.
+ */ 
+bool
+ConditionalLockBufferForCleanup(Buffer buffer)
+{
+       volatile BufferDesc *bufHdr;
+
+       Assert(BufferIsValid(buffer));
+
+       if (BufferIsLocal(buffer))
+       {
+               /* There should be exactly one pin */
+               Assert(LocalRefCount[-buffer - 1] > 0);
+               if (LocalRefCount[-buffer - 1] != 1)
+                       return false;
+               /* Nobody else to wait for */
+               return true;
+       }
+
+       /* There should be exactly one local pin */
+       Assert(PrivateRefCount[buffer - 1] > 0);
+       if (PrivateRefCount[buffer - 1] != 1)
+               return false;
+
+       /* Try to acquire lock */
+       if (!ConditionalLockBuffer(buffer))
+               return false;
+
+       bufHdr = &BufferDescriptors[buffer - 1];
+       LockBufHdr(bufHdr);
+       Assert(bufHdr->refcount > 0);
+       if (bufHdr->refcount == 1)
+       {
+               /* Successfully acquired exclusive lock with pincount 1 */
+               UnlockBufHdr(bufHdr);
+               return true;
+       }
+
+       /* Failed, so release the lock */
+       UnlockBufHdr(bufHdr);
+       LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+       return false;
+}
+
+
 /*
  *     Functions for buffer I/O handling
  *
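A sketch of the intended use of ConditionalLockBufferForCleanup(), roughly what heap_page_prune_opt() is expected to do: pruning is strictly optional for the caller, so if the cleanup lock is not immediately available the page is simply skipped. The real function presumably adds heuristics about when pruning is worth attempting; this shows only the locking skeleton.

static void
try_opportunistic_prune(Relation rel, Buffer buffer, TransactionId OldestXmin)
{
	/* need exclusive lock with pin count 1, but never wait for it */
	if (!ConditionalLockBufferForCleanup(buffer))
		return;

	/* no redirect_move; do report reclaimed tuples to the stats machinery */
	(void) heap_page_prune(rel, buffer, OldestXmin, false, true);

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}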
index b90ee8fd93bf979e4ca1b4f9b82783af292a6880..884bd79f6513556041525b780980a3d74c1a1ce1 100644 (file)
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "access/htup.h"
 #include "storage/bufpage.h"
 
 
@@ -108,6 +109,9 @@ PageHeaderIsValid(PageHeader page)
  *     If offsetNumber is not valid, then assign one by finding the first
  *     one that is both unused and deallocated.
  *
+ *     If is_heap is true, we enforce that there can't be more than
+ *     MaxHeapTuplesPerPage line pointers on the page.
+ *
  *     !!! EREPORT(ERROR) IS DISALLOWED HERE !!!
  */
 OffsetNumber
@@ -115,7 +119,8 @@ PageAddItem(Page page,
                        Item item,
                        Size size,
                        OffsetNumber offsetNumber,
-                       bool overwrite)
+                       bool overwrite,
+                       bool is_heap)
 {
        PageHeader      phdr = (PageHeader) page;
        Size            alignedSize;
@@ -200,6 +205,12 @@ PageAddItem(Page page,
                return InvalidOffsetNumber;
        }
 
+       if (is_heap && offsetNumber > MaxHeapTuplesPerPage)
+       {
+               elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page");
+               return InvalidOffsetNumber;
+       }
+
        /*
         * Compute new lower and upper pointers for page, see if it'll fit.
         *
@@ -315,11 +326,10 @@ itemoffcompare(const void *itemidp1, const void *itemidp2)
  *
  * This routine is usable for heap pages only, but see PageIndexMultiDelete.
  *
- * Returns number of unused line pointers on page.     If "unused" is not NULL
- * then the unused[] array is filled with indexes of unused line pointers.
+ * As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated.
  */
-int
-PageRepairFragmentation(Page page, OffsetNumber *unused)
+void
+PageRepairFragmentation(Page page)
 {
        Offset          pd_lower = ((PageHeader) page)->pd_lower;
        Offset          pd_upper = ((PageHeader) page)->pd_upper;
@@ -329,7 +339,7 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
        ItemId          lp;
        int                     nline,
                                nstorage,
-                               nused;
+                               nunused;
        int                     i;
        Size            totallen;
        Offset          upper;
@@ -352,13 +362,12 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
                                                pd_lower, pd_upper, pd_special)));
 
        nline = PageGetMaxOffsetNumber(page);
-       nused = nstorage = 0;
-       for (i = 0; i < nline; i++)
+       nunused = nstorage = 0;
+       for (i = FirstOffsetNumber; i <= nline; i++)
        {
-               lp = PageGetItemId(page, i + 1);
+               lp = PageGetItemId(page, i);
                if (ItemIdIsUsed(lp))
                {
-                       nused++;
                        if (ItemIdHasStorage(lp))
                                nstorage++;
                }
@@ -366,9 +375,7 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
                {
                        /* Unused entries should have lp_len = 0, but make sure */
                        ItemIdSetUnused(lp);
-                       /* Report to caller if asked for */
-                       if (unused)
-                               unused[i - nused] = (OffsetNumber) i;
+                       nunused++;
                }
        }
 
@@ -431,18 +438,19 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
        }
 
        /* Set hint bit for PageAddItem */
-       if (nused < nline)
+       if (nunused > 0)
                PageSetHasFreeLinePointers(page);
        else
                PageClearHasFreeLinePointers(page);
-
-       return (nline - nused);
 }
 
 /*
  * PageGetFreeSpace
  *             Returns the size of the free (allocatable) space on a page,
  *             reduced by the space needed for a new line pointer.
+ *
+ * Note: this should usually only be used on index pages.  Use
+ * PageGetHeapFreeSpace on heap pages.
  */
 Size
 PageGetFreeSpace(Page page)
@@ -465,7 +473,8 @@ PageGetFreeSpace(Page page)
 
 /*
  * PageGetExactFreeSpace
- *             Returns the size of the free (allocatable) space on a page.
+ *             Returns the size of the free (allocatable) space on a page,
+ *             without any consideration for adding/removing line pointers.
  */
 Size
 PageGetExactFreeSpace(Page page)
@@ -483,6 +492,73 @@ PageGetExactFreeSpace(Page page)
 }
 
 
+/*
+ * PageGetHeapFreeSpace
+ *             Returns the size of the free (allocatable) space on a page,
+ *             reduced by the space needed for a new line pointer.
+ *
+ * The difference between this and PageGetFreeSpace is that this will return
+ * zero if there are already MaxHeapTuplesPerPage line pointers in the page
+ * and none are free.  We use this to enforce that no more than
+ * MaxHeapTuplesPerPage line pointers are created on a heap page.  (Although
+ * no more tuples than that could fit anyway, in the presence of redirected
+ * or dead line pointers it'd be possible to have too many line pointers.
+ * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit
+ * on the number of line pointers, we make this extra check.)
+ */
+Size
+PageGetHeapFreeSpace(Page page)
+{
+       Size                    space;
+
+       space = PageGetFreeSpace(page);
+       if (space > 0)
+       {
+               OffsetNumber    offnum, nline;
+
+               /*
+                * Are there already MaxHeapTuplesPerPage line pointers in the page?
+                */
+               nline = PageGetMaxOffsetNumber(page);
+               if (nline >= MaxHeapTuplesPerPage)
+               {
+                       if (PageHasFreeLinePointers((PageHeader) page))
+                       {
+                               /*
+                                * Since this is just a hint, we must confirm that there is
+                                * indeed a free line pointer
+                                */
+                               for (offnum = FirstOffsetNumber; offnum <= nline; offnum++)
+                               {
+                                       ItemId  lp = PageGetItemId(page, offnum);
+
+                                       if (!ItemIdIsUsed(lp))
+                                               break;
+                               }
+
+                               if (offnum > nline)
+                               {
+                                       /*
+                                        * The hint is wrong, but we can't clear it here since
+                                        * we don't have the ability to mark the page dirty.
+                                        */
+                                       space = 0;
+                               }
+                       }
+                       else
+                       {
+                               /*
+                                * Although the hint might be wrong, PageAddItem will believe
+                                * it anyway, so we must believe it too.
+                                */
+                               space = 0;
+                       }
+               }
+       }
+       return space;
+}
+
+
 /*
  * PageIndexTupleDelete
  *
index 70a92c96d7806dcfb211e9a01b96cf9dd7e5d298..a5e21a91732cf38917516bc117c4dd163db9c17a 100644 (file)
@@ -28,6 +28,7 @@ extern Datum pg_stat_get_tuples_fetched(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_tuples_inserted(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_tuples_updated(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_tuples_deleted(PG_FUNCTION_ARGS);
+extern Datum pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_dead_tuples(PG_FUNCTION_ARGS);
 extern Datum pg_stat_get_blocks_fetched(PG_FUNCTION_ARGS);
@@ -169,6 +170,22 @@ pg_stat_get_tuples_deleted(PG_FUNCTION_ARGS)
 }
 
 
+Datum
+pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS)
+{
+       Oid                     relid = PG_GETARG_OID(0);
+       int64           result;
+       PgStat_StatTabEntry *tabentry;
+
+       if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL)
+               result = 0;
+       else
+               result = (int64) (tabentry->tuples_hot_updated);
+
+       PG_RETURN_INT64(result);
+}
+
+
 Datum
 pg_stat_get_live_tuples(PG_FUNCTION_ARGS)
 { 
index be161ff4de68bc6ab51a6cfec2582a8798393e29..0cbbd12c07625a61f3995e0a6cdf1c73f084e010 100644 (file)
@@ -40,6 +40,7 @@
 #include "postgres.h"
 
 #include "utils/plancache.h"
+#include "access/transam.h"
 #include "catalog/namespace.h"
 #include "executor/executor.h"
 #include "optimizer/clauses.h"
@@ -79,6 +80,7 @@ static void ScanQueryForRelids(Query *parsetree,
                                                           void *arg);
 static bool ScanQueryWalker(Node *node, ScanQueryWalkerContext *context);
 static bool rowmark_member(List *rowMarks, int rt_index);
+static bool plan_list_is_transient(List *stmt_list);
 static void PlanCacheCallback(Datum arg, Oid relid);
 static void InvalRelid(Oid relid, LOCKMODE lockmode,
                                           InvalRelidContext *context);
@@ -322,6 +324,13 @@ StoreCachedPlan(CachedPlanSource *plansource,
        plan->stmt_list = stmt_list;
        plan->fully_planned = plansource->fully_planned;
        plan->dead = false;
+       if (plansource->fully_planned && plan_list_is_transient(stmt_list))
+       {
+               Assert(TransactionIdIsNormal(TransactionXmin));
+               plan->saved_xmin = TransactionXmin;
+       }
+       else
+               plan->saved_xmin = InvalidTransactionId;
        plan->refcount = 1;                     /* for the parent's link */
        plan->generation = ++(plansource->generation);
        plan->context = plan_context;
@@ -411,6 +420,15 @@ RevalidateCachedPlan(CachedPlanSource *plansource, bool useResOwner)
                else
                        AcquirePlannerLocks(plan->stmt_list, true);
 
+               /*
+                * If the plan was transient, check to see if TransactionXmin has
+                * advanced, and if so invalidate the plan.
+                */
+               if (!plan->dead &&
+                       TransactionIdIsValid(plan->saved_xmin) &&
+                       !TransactionIdEquals(plan->saved_xmin, TransactionXmin))
+                       plan->dead = true;
+
                /*
                 * By now, if any invalidation has happened, PlanCacheCallback
                 * will have marked the plan dead.
@@ -789,6 +807,28 @@ rowmark_member(List *rowMarks, int rt_index)
        return false;
 }
 
+/*
+ * plan_list_is_transient: check if any of the plans in the list are transient.
+ */
+static bool
+plan_list_is_transient(List *stmt_list)
+{
+       ListCell   *lc;
+
+       foreach(lc, stmt_list)
+       {
+               PlannedStmt *plannedstmt = (PlannedStmt *) lfirst(lc);
+               
+               if (!IsA(plannedstmt, PlannedStmt))
+                       continue;                       /* Ignore utility statements */
+
+               if (plannedstmt->transientPlan)
+                       return true;
+       }       
+
+       return false;
+}
+
 /*
  * PlanCacheComputeResultDesc: given a list of either fully-planned statements
  * or Queries, determine the result tupledesc it will produce.  Returns NULL
index e8e485c16ea835e33cedf4883d2bf19f5feff2ed..870c5644572df6578108fbb4c51a11078d4bb6c8 100644 (file)
@@ -34,6 +34,7 @@
 #include "access/reloptions.h"
 #include "access/xact.h"
 #include "catalog/catalog.h"
+#include "catalog/index.h"
 #include "catalog/indexing.h"
 #include "catalog/namespace.h"
 #include "catalog/pg_amop.h"
@@ -51,6 +52,7 @@
 #include "optimizer/clauses.h"
 #include "optimizer/planmain.h"
 #include "optimizer/prep.h"
+#include "optimizer/var.h"
 #include "rewrite/rewriteDefine.h"
 #include "storage/fd.h"
 #include "storage/smgr.h"
@@ -1658,6 +1660,10 @@ RelationReloadIndexInfo(Relation relation)
                index = (Form_pg_index) GETSTRUCT(tuple);
 
                relation->rd_index->indisvalid = index->indisvalid;
+               relation->rd_index->indcheckxmin = index->indcheckxmin;
+               relation->rd_index->indisready = index->indisready;
+               HeapTupleHeaderSetXmin(relation->rd_indextuple->t_data,
+                                                          HeapTupleHeaderGetXmin(tuple->t_data));
 
                ReleaseSysCache(tuple);
        }
@@ -1762,6 +1768,7 @@ RelationClearRelation(Relation relation, bool rebuild)
        if (relation->rd_options)
                pfree(relation->rd_options);
        list_free(relation->rd_indexlist);
+       bms_free(relation->rd_indexattr);
        if (relation->rd_indexcxt)
                MemoryContextDelete(relation->rd_indexcxt);
 
@@ -2969,6 +2976,7 @@ RelationSetIndexList(Relation relation, List *indexIds, Oid oidIndex)
        relation->rd_indexvalid = 2;    /* mark list as forced */
        /* must flag that we have a forced index list */
        need_eoxact_work = true;
+       /* we deliberately do not change rd_indexattr */
 }
 
 /*
@@ -3140,6 +3148,91 @@ RelationGetIndexPredicate(Relation relation)
        return result;
 }
 
+/*
+ * RelationGetIndexAttrBitmap -- get a bitmap of index attribute numbers
+ *
+ * The result has a bit set for each attribute used anywhere in the index
+ * definitions of all the indexes on this relation.  (This includes not only
+ * simple index keys, but attributes used in expressions and partial-index
+ * predicates.)
+ *
+ * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that
+ * we can include system attributes (e.g., OID) in the bitmap representation.
+ *
+ * The returned result is palloc'd in the caller's memory context and should
+ * be bms_free'd when not needed anymore.
+ */
+Bitmapset *
+RelationGetIndexAttrBitmap(Relation relation)
+{
+       Bitmapset       *indexattrs;
+       List            *indexoidlist;
+       ListCell        *l;
+       MemoryContext oldcxt;
+
+       /* Quick exit if we already computed the result. */
+       if (relation->rd_indexattr != NULL)
+               return bms_copy(relation->rd_indexattr);
+
+       /* Fast path if definitely no indexes */
+       if (!RelationGetForm(relation)->relhasindex)
+               return NULL;
+
+       /*
+        * Get cached list of index OIDs
+        */
+       indexoidlist = RelationGetIndexList(relation);
+
+       /* Fall out if no indexes (but relhasindex was set) */
+       if (indexoidlist == NIL)
+               return NULL;
+
+       /*
+        * For each index, add referenced attributes to indexattrs.
+        */
+       indexattrs = NULL;
+       foreach(l, indexoidlist)
+       {
+               Oid                     indexOid = lfirst_oid(l);
+               Relation        indexDesc;
+               IndexInfo  *indexInfo;
+               int             i;
+
+               indexDesc = index_open(indexOid, AccessShareLock);
+
+               /* Extract index key information from the index's pg_index row */
+               indexInfo = BuildIndexInfo(indexDesc);
+
+               /* Collect simple attribute references */
+               for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
+               {
+                       int attrnum = indexInfo->ii_KeyAttrNumbers[i];
+
+                       if (attrnum != 0)
+                               indexattrs = bms_add_member(indexattrs,
+                                               attrnum - FirstLowInvalidHeapAttributeNumber);
+               }
+
+               /* Collect all attributes used in expressions, too */
+               pull_varattnos((Node *) indexInfo->ii_Expressions, &indexattrs);
+
+               /* Collect all attributes in the index predicate, too */
+               pull_varattnos((Node *) indexInfo->ii_Predicate, &indexattrs);
+
+               index_close(indexDesc, AccessShareLock);
+       }
+
+       list_free(indexoidlist);
+
+       /* Now save a copy of the bitmap in the relcache entry. */
+       oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
+       relation->rd_indexattr = bms_copy(indexattrs);
+       MemoryContextSwitchTo(oldcxt);
+
+       /* We return our original working copy for caller to play with */
+       return indexattrs;
+}
+
 
 /*
  *     load_relcache_init_file, write_relcache_init_file
@@ -3465,6 +3558,7 @@ load_relcache_init_file(void)
                        rel->rd_refcnt = 0;
                rel->rd_indexvalid = 0;
                rel->rd_indexlist = NIL;
+               rel->rd_indexattr = NULL;
                rel->rd_oidindex = InvalidOid;
                rel->rd_createSubid = InvalidSubTransactionId;
                rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
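An illustration of what the new relcache bitmap is for (a sketch only, not the actual heap_update() logic, which presumably compares old and new values of the indexed columns): an update qualifies for HOT only if none of the modified columns is used by any index on the relation.

static bool
update_could_be_hot(Relation rel, Bitmapset *modified_attrs)
{
	Bitmapset  *indexattrs = RelationGetIndexAttrBitmap(rel);
	bool		hot_ok;

	/*
	 * modified_attrs is assumed to use the same offset convention as the
	 * relcache bitmap: attno - FirstLowInvalidHeapAttributeNumber.
	 */
	hot_ok = !bms_overlap(modified_attrs, indexattrs);

	bms_free(indexattrs);
	return hot_ok;
}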
index 2188192791c14f279d29b1a4c99a437fb3f2f071..8a1d632ac4a3374593815377eb4dbc0c7c1cb3c2 100644 (file)
@@ -153,6 +153,10 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot,
 extern bool heap_release_fetch(Relation relation, Snapshot snapshot,
                                   HeapTuple tuple, Buffer *userbuf, bool keep_buf,
                                   Relation stats_relation);
+extern bool heap_hot_search_buffer(ItemPointer tid, Buffer buffer,
+                                                                  Snapshot snapshot, bool *all_dead);
+extern bool heap_hot_search(ItemPointer tid, Relation relation,
+                                                       Snapshot snapshot, bool *all_dead);
 
 extern void heap_get_latest_tid(Relation relation, Snapshot snapshot,
                                        ItemPointer tid);
@@ -183,6 +187,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid,
 extern void heap_markpos(HeapScanDesc scan);
 extern void heap_restrpos(HeapScanDesc scan);
 
+extern void heap_sync(Relation relation);
+
 extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr);
 extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec);
 extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr);
@@ -192,7 +198,10 @@ extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
                          ItemPointerData from,
                          Buffer newbuf, HeapTuple newtup);
 extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
-                          OffsetNumber *unused, int uncnt);
+                          OffsetNumber *redirected, int nredirected,
+                          OffsetNumber *nowdead, int ndead,
+                          OffsetNumber *nowunused, int nunused,
+                          bool redirect_move);
 extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
                                                                  TransactionId cutoff_xid,
                                                                  OffsetNumber *offsets, int offcnt);
@@ -240,7 +249,13 @@ extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup);
 extern HeapTuple heap_addheader(int natts, bool withoid,
                           Size structlen, void *structure);
 
-extern void heap_sync(Relation relation);
+/* in heap/pruneheap.c */
+extern void heap_page_prune_opt(Relation relation, Buffer buffer,
+                                                               TransactionId OldestXmin);
+extern int     heap_page_prune(Relation relation, Buffer buffer,
+                                                       TransactionId OldestXmin,
+                                                       bool redirect_move, bool report_stats);
+extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 
 /* in heap/syncscan.c */
 extern void ss_report_location(Relation rel, BlockNumber location);
index bc529f1bfdfdbdb5cc36e4808cf4182983ac874c..d95fe7f96d2ff25bf48cde63b80c7608b09bcaea 100644 (file)
@@ -184,8 +184,12 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
 /*
  * information stored in t_infomask2:
  */
-#define HEAP_NATTS_MASK                        0x7FF   /* 11 bits for number of attributes */
-/* bits 0xF800 are currently unused */
+#define HEAP_NATTS_MASK                        0x07FF  /* 11 bits for number of attributes */
+/* bits 0x3800 are available */
+#define HEAP_HOT_UPDATED               0x4000  /* tuple was HOT-updated */
+#define HEAP_ONLY_TUPLE                        0x8000  /* this is heap-only tuple */
+
+#define HEAP2_XACT_MASK                        0xC000  /* visibility-related bits */
 
 /*
  * HeapTupleHeader accessor macros
@@ -201,7 +205,7 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
 
 #define HeapTupleHeaderSetXmin(tup, xid) \
 ( \
-       TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmin) \
+       (tup)->t_choice.t_heap.t_xmin = (xid) \
 )
 
 #define HeapTupleHeaderGetXmax(tup) \
@@ -211,7 +215,7 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
 
 #define HeapTupleHeaderSetXmax(tup, xid) \
 ( \
-       TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmax) \
+       (tup)->t_choice.t_heap.t_xmax = (xid) \
 )
 
 /*
@@ -255,7 +259,7 @@ do { \
 #define HeapTupleHeaderSetXvac(tup, xid) \
 do { \
        Assert((tup)->t_infomask & HEAP_MOVED); \
-       TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_field3.t_xvac); \
+       (tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \
 } while (0)
 
 #define HeapTupleHeaderGetDatumLength(tup) \
@@ -298,6 +302,43 @@ do { \
        *((Oid *) ((char *)(tup) + (tup)->t_hoff - sizeof(Oid))) = (oid); \
 } while (0)
 
+/*
+ * Note that we stop considering a tuple HOT-updated as soon as either its
+ * inserting transaction or its would-be updater is known aborted.  For best
+ * efficiency, check tuple visibility before using this macro, so that the
+ * INVALID bits will be as up to date as possible.
+ */
+#define HeapTupleHeaderIsHotUpdated(tup) \
+( \
+       ((tup)->t_infomask2 & HEAP_HOT_UPDATED) != 0 && \
+       ((tup)->t_infomask & (HEAP_XMIN_INVALID | HEAP_XMAX_INVALID)) == 0 \
+)
+
+#define HeapTupleHeaderSetHotUpdated(tup) \
+( \
+       (tup)->t_infomask2 |= HEAP_HOT_UPDATED \
+)
+
+#define HeapTupleHeaderClearHotUpdated(tup) \
+( \
+       (tup)->t_infomask2 &= ~HEAP_HOT_UPDATED \
+)
+
+#define HeapTupleHeaderIsHeapOnly(tup) \
+( \
+       (tup)->t_infomask2 & HEAP_ONLY_TUPLE \
+)
+
+#define HeapTupleHeaderSetHeapOnly(tup) \
+( \
+       (tup)->t_infomask2 |= HEAP_ONLY_TUPLE \
+)
+
+#define HeapTupleHeaderClearHeapOnly(tup) \
+( \
+       (tup)->t_infomask2 &= ~HEAP_ONLY_TUPLE \
+)
+
 #define HeapTupleHeaderGetNatts(tup) \
        ((tup)->t_infomask2 & HEAP_NATTS_MASK)
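With the two new infomask2 bits, a HOT chain lives entirely within one page: the root is reached through the index (or a redirect line pointer), each HOT-updated member points at its successor via t_ctid, and every member after the root is a heap-only tuple. A simplified walking sketch, ignoring the xmin/xmax linkage checks and the redirect/dead line-pointer handling that real chain-following code must perform:

static void
walk_hot_chain(Page page, OffsetNumber root_offnum)
{
	OffsetNumber offnum = root_offnum;

	for (;;)
	{
		ItemId		lp = PageGetItemId(page, offnum);
		HeapTupleHeader htup;

		if (!ItemIdIsNormal(lp))
			break;				/* unused/redirect/dead: not handled here */

		htup = (HeapTupleHeader) PageGetItem(page, lp);

		/* ... examine this tuple version ... */

		if (!HeapTupleHeaderIsHotUpdated(htup))
			break;				/* end of the chain */

		/* the successor is a HEAP_ONLY_TUPLE at this offset on the same page */
		offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
	}
}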
 
@@ -331,6 +372,11 @@ do { \
  * fit on one heap page.  (Note that indexes could have more, because they
  * use a smaller tuple header.)  We arrive at the divisor because each tuple
  * must be maxaligned, and it must have an associated item pointer.
+ *
+ * Note: with HOT, there could theoretically be more line pointers (not actual
+ * tuples) than this on a heap page.  However, we constrain the number of line
+ * pointers to this anyway, to avoid excessive line-pointer bloat and to avoid
+ * requiring increases in the size of work arrays.
  */
 #define MaxHeapTuplesPerPage   \
        ((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \
@@ -484,6 +530,24 @@ typedef HeapTupleData *HeapTuple;
 #define HeapTupleHasExternal(tuple) \
                (((tuple)->t_data->t_infomask & HEAP_HASEXTERNAL) != 0)
 
+#define HeapTupleIsHotUpdated(tuple) \
+               HeapTupleHeaderIsHotUpdated((tuple)->t_data)
+
+#define HeapTupleSetHotUpdated(tuple) \
+               HeapTupleHeaderSetHotUpdated((tuple)->t_data)
+
+#define HeapTupleClearHotUpdated(tuple) \
+               HeapTupleHeaderClearHotUpdated((tuple)->t_data)
+
+#define HeapTupleIsHeapOnly(tuple) \
+               HeapTupleHeaderIsHeapOnly((tuple)->t_data)
+
+#define HeapTupleSetHeapOnly(tuple) \
+               HeapTupleHeaderSetHeapOnly((tuple)->t_data)
+
+#define HeapTupleClearHeapOnly(tuple) \
+               HeapTupleHeaderClearHeapOnly((tuple)->t_data)
+
 #define HeapTupleGetOid(tuple) \
                HeapTupleHeaderGetOid((tuple)->t_data)
 
@@ -497,27 +561,30 @@ typedef HeapTupleData *HeapTuple;
  * XLOG allows to store some information in high 4 bits of log
  * record xl_info field.  We use 3 for opcode and one for init bit.
  */
-#define XLOG_HEAP_INSERT       0x00
-#define XLOG_HEAP_DELETE       0x10
-#define XLOG_HEAP_UPDATE       0x20
-#define XLOG_HEAP_MOVE         0x30
-#define XLOG_HEAP_CLEAN                0x40
-#define XLOG_HEAP_NEWPAGE      0x50
-#define XLOG_HEAP_LOCK         0x60
-#define XLOG_HEAP_INPLACE      0x70
-#define XLOG_HEAP_OPMASK       0x70
+#define XLOG_HEAP_INSERT               0x00
+#define XLOG_HEAP_DELETE               0x10
+#define XLOG_HEAP_UPDATE               0x20
+#define XLOG_HEAP_MOVE                 0x30
+#define XLOG_HEAP_HOT_UPDATE   0x40
+#define XLOG_HEAP_NEWPAGE              0x50
+#define XLOG_HEAP_LOCK                 0x60
+#define XLOG_HEAP_INPLACE              0x70
+
+#define XLOG_HEAP_OPMASK               0x70
 /*
  * When we insert 1st item on new page in INSERT/UPDATE
  * we can (and we do) restore entire page in redo
  */
-#define XLOG_HEAP_INIT_PAGE 0x80
+#define XLOG_HEAP_INIT_PAGE    0x80
 /*
  * We ran out of opcodes, so heapam.c now has a second RmgrId.  These opcodes
  * are associated with RM_HEAP2_ID, but are not logically different from
  * the ones above associated with RM_HEAP_ID.  We apply XLOG_HEAP_OPMASK,
  * although currently XLOG_HEAP_INIT_PAGE is not used for any of these.
  */
-#define XLOG_HEAP2_FREEZE      0x00
+#define XLOG_HEAP2_FREEZE              0x00
+#define XLOG_HEAP2_CLEAN               0x10
+#define XLOG_HEAP2_CLEAN_MOVE  0x20
 
 /*
  * All what we need to find changed tuple
@@ -569,7 +636,7 @@ typedef struct xl_heap_insert
 
 #define SizeOfHeapInsert       (offsetof(xl_heap_insert, target) + SizeOfHeapTid)
 
-/* This is what we need to know about update|move */
+/* This is what we need to know about update|move|hot_update */
 typedef struct xl_heap_update
 {
        xl_heaptid      target;                 /* deleted tuple id */
@@ -580,15 +647,34 @@ typedef struct xl_heap_update
 
 #define SizeOfHeapUpdate       (offsetof(xl_heap_update, newtid) + SizeOfIptrData)
 
-/* This is what we need to know about vacuum page cleanup */
+/*
+ * This is what we need to know about vacuum page cleanup/redirect
+ *
+ * The array of OffsetNumbers following the fixed part of the record contains:
+ *     * for each redirected item: the item offset, then the offset redirected to
+ *     * for each now-dead item: the item offset
+ *     * for each now-unused item: the item offset
+ * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused.
+ * Note that nunused is not explicitly stored, but may be found by reference
+ * to the total record length.
+ *
+ * If the opcode is CLEAN_MOVE instead of CLEAN, then each redirection pair
+ * should be interpreted as physically moving the "to" item pointer to the
+ * "from" slot, rather than placing a redirection item in the "from" slot.
+ * The moved pointers should be replaced by LP_UNUSED items (there will not
+ * be explicit entries in the "now-unused" list for this).  Also, the
+ * HEAP_ONLY bit in the moved tuples must be turned off.
+ */
 typedef struct xl_heap_clean
 {
        RelFileNode node;
        BlockNumber block;
-       /* UNUSED OFFSET NUMBERS FOLLOW AT THE END */
+       uint16          nredirected;
+       uint16          ndead;
+       /* OFFSET NUMBERS FOLLOW */
 } xl_heap_clean;
 
-#define SizeOfHeapClean (offsetof(xl_heap_clean, block) + sizeof(BlockNumber))
+#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16))
 
 /* This is for replacing a page's contents in toto */
 /* NB: this is used for indexes as well as heaps */
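Given the xl_heap_clean layout documented above, a decoding sketch for its offset-number payload follows. How the byte count of the payload is obtained from the WAL record header is left out, so datalen (bytes of offset data after the fixed-size struct) is an assumed input here.

static void
decode_heap_clean(xl_heap_clean *xlrec, Size datalen)
{
	OffsetNumber *offsets = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
	OffsetNumber *redirected = offsets;
	OffsetNumber *nowdead = redirected + 2 * xlrec->nredirected;
	OffsetNumber *nowunused = nowdead + xlrec->ndead;
	int			nunused;
	int			i;

	/* nunused is implicit: whatever remains after redirect pairs and dead items */
	nunused = (int) (datalen / sizeof(OffsetNumber))
		- 2 * xlrec->nredirected - xlrec->ndead;
	Assert(nunused >= 0);

	for (i = 0; i < xlrec->nredirected; i++)
	{
		OffsetNumber from = redirected[2 * i];
		OffsetNumber to = redirected[2 * i + 1];

		/*
		 * For CLEAN, "from" becomes a redirect pointing at "to"; for
		 * CLEAN_MOVE, the tuple at "to" is physically moved into "from".
		 */
		(void) from;
		(void) to;
	}
	/* nowdead[0..ndead-1] and nowunused[0..nunused-1] list the remaining items */
	(void) nowunused;
}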
index 23620da18cfb54e1bea9949e9b614ebc05cff8aa..1add1cfc1c69d652e7853754bb22a1d9800910fc 100644 (file)
@@ -82,6 +82,9 @@ typedef struct IndexScanDescData
        HeapTupleData xs_ctup;          /* current heap tuple, if any */
        Buffer          xs_cbuf;                /* current heap buffer in scan, if any */
        /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
+       TransactionId xs_prev_xmax;     /* previous HOT chain member's XMAX, if any */
+       OffsetNumber xs_next_hot;       /* next member of HOT chain, if any */
+       bool            xs_hot_dead;    /* T if all members of HOT chain are dead */
 } IndexScanDescData;
 
 typedef IndexScanDescData *IndexScanDesc;
index ce010c56b164f6dfc21c8b9b0d029291bd7bd35c..20234ad09588fa4d0fa1a93952a36a6792e10e85 100644 (file)
@@ -53,6 +53,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     200709181
+#define CATALOG_VERSION_NO     200709201
 
 #endif
index d1167782fe75a68b76ab246911034a5255e2b94d..9ad4794980f55cd5fed43a5a1fb678205bdf53ad 100644 (file)
@@ -471,10 +471,12 @@ DATA(insert ( 1259 tableoid                       26 0  4  -7 0 -1 -1 t p i t f f t 0));
 { 0, {"indisprimary"},         16, -1, 1, 5, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
 { 0, {"indisclustered"},       16, -1, 1, 6, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
 { 0, {"indisvalid"},           16, -1, 1, 7, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
-{ 0, {"indkey"},                       22, -1, -1, 8, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
-{ 0, {"indclass"},                     30, -1, -1, 9, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
-{ 0, {"indoption"},                    22, -1, -1, 10, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
-{ 0, {"indexprs"},                     25, -1, -1, 11, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }, \
-{ 0, {"indpred"},                      25, -1, -1, 12, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }
+{ 0, {"indcheckxmin"},         16, -1, 1, 8, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
+{ 0, {"indisready"},           16, -1, 1, 9, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
+{ 0, {"indkey"},                       22, -1, -1, 10, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
+{ 0, {"indclass"},                     30, -1, -1, 11, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
+{ 0, {"indoption"},                    22, -1, -1, 12, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
+{ 0, {"indexprs"},                     25, -1, -1, 13, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }, \
+{ 0, {"indpred"},                      25, -1, -1, 14, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }
 
 #endif   /* PG_ATTRIBUTE_H */
index 286a54304d9590a32a1129433b9546285a1ec11b..7590640ee4d2cdeb727de22ae6c0ac5d77619ba1 100644 (file)
@@ -42,6 +42,8 @@ CATALOG(pg_index,2610) BKI_WITHOUT_OIDS
        bool            indisprimary;   /* is this index for primary key? */
        bool            indisclustered; /* is this the index last clustered by? */
        bool            indisvalid;             /* is this index valid for use by queries? */
+       bool            indcheckxmin;   /* must we wait for xmin to be old? */
+       bool            indisready;             /* is this index ready for inserts? */
 
        /* VARIABLE LENGTH FIELDS: */
        int2vector      indkey;                 /* column numbers of indexed cols, or 0 */
@@ -65,7 +67,7 @@ typedef FormData_pg_index *Form_pg_index;
  *             compiler constants for pg_index
  * ----------------
  */
-#define Natts_pg_index                                 12
+#define Natts_pg_index                                 14
 #define Anum_pg_index_indexrelid               1
 #define Anum_pg_index_indrelid                 2
 #define Anum_pg_index_indnatts                 3
@@ -73,11 +75,13 @@ typedef FormData_pg_index *Form_pg_index;
 #define Anum_pg_index_indisprimary             5
 #define Anum_pg_index_indisclustered   6
 #define Anum_pg_index_indisvalid               7
-#define Anum_pg_index_indkey                   8
-#define Anum_pg_index_indclass                 9
-#define Anum_pg_index_indoption                        10
-#define Anum_pg_index_indexprs                 11
-#define Anum_pg_index_indpred                  12
+#define Anum_pg_index_indcheckxmin             8
+#define Anum_pg_index_indisready               9
+#define Anum_pg_index_indkey                   10
+#define Anum_pg_index_indclass                 11
+#define Anum_pg_index_indoption                        12
+#define Anum_pg_index_indexprs                 13
+#define Anum_pg_index_indpred                  14
 
 /*
  * Index AMs that support ordered scans must support these two indoption
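
The two new pg_index columns serve CREATE INDEX CONCURRENTLY and HOT together: indisready marks an index that inserts must maintain even though queries cannot use it yet, while indcheckxmin asks the planner not to use the index until the pg_index row's xmin is older than the query's TransactionXmin, since the index may have been built while broken HOT chains were still visible to some snapshots. A minimal sketch of those two tests follows; DemoIndex, index_usable and friends are hypothetical stand-ins, and the plain integer comparison ignores XID wraparound, which the backend's real transaction-ID comparison handles.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t TransactionId;

    /* Hypothetical simplified view of a pg_index row plus its tuple's xmin. */
    typedef struct DemoIndex
    {
        bool          indisvalid;   /* usable by queries? */
        bool          indisready;   /* must inserts maintain it? */
        bool          indcheckxmin; /* usable only once its xmin is old enough? */
        TransactionId xmin;         /* xmin of the pg_index row */
    } DemoIndex;

    /* Should INSERT/UPDATE maintain this index? */
    static bool
    index_needs_inserts(const DemoIndex *idx)
    {
        return idx->indisready;
    }

    /* May the planner use this index for the current query? */
    static bool
    index_usable(const DemoIndex *idx, TransactionId transaction_xmin)
    {
        if (!idx->indisvalid)
            return false;
        if (idx->indcheckxmin && idx->xmin >= transaction_xmin)
            return false;       /* built too recently; wait for old snapshots to end */
        return true;
    }

    int
    main(void)
    {
        DemoIndex idx = {true, true, true, 500};

        printf("maintain=%d usable(horizon 400)=%d usable(horizon 600)=%d\n",
               index_needs_inserts(&idx),
               index_usable(&idx, 400),
               index_usable(&idx, 600));
        return 0;
    }
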
index 60a7a60babf9738ca86d4f8e0d041203392650df..cc1350f9f6dcc25d3d268a05810cd915a7d1b47f 100644 (file)
@@ -2873,6 +2873,8 @@ DATA(insert OID = 1932 (  pg_stat_get_tuples_updated      PGNSP PGUID 12 1 0 f f t f
 DESCR("statistics: number of tuples updated");
 DATA(insert OID = 1933 (  pg_stat_get_tuples_deleted   PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_tuples_deleted - _null_ _null_ ));
 DESCR("statistics: number of tuples deleted");
+DATA(insert OID = 1972 (  pg_stat_get_tuples_hot_updated PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_tuples_hot_updated - _null_ _null_ ));
+DESCR("statistics: number of tuples hot updated");
 DATA(insert OID = 2878 (  pg_stat_get_live_tuples      PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_live_tuples - _null_ _null_ ));
 DESCR("statistics: number of live tuples");
 DATA(insert OID = 2879 (  pg_stat_get_dead_tuples      PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_dead_tuples - _null_ _null_ ));
index 5b07d1f15bd3f4f39021b0c971a506d61a7f8af7..b9330a5d71d74b7c1efa3bd248200f517837c07a 100644 (file)
  *             Predicate                       partial-index predicate, or NIL if none
  *             PredicateState          exec state for predicate, or NIL if none
  *             Unique                          is it a unique index?
+ *             ReadyForInserts         is it valid for inserts?
  *             Concurrent                      are we doing a concurrent index build?
+ *             BrokenHotChain          did we detect any broken HOT chains?
+ *
+ * ii_Concurrent and ii_BrokenHotChain are used only during index build;
+ * they're conventionally set to false otherwise.
  * ----------------
  */
 typedef struct IndexInfo
@@ -50,7 +55,9 @@ typedef struct IndexInfo
        List       *ii_Predicate;       /* list of Expr */
        List       *ii_PredicateState;          /* list of ExprState */
        bool            ii_Unique;
+       bool            ii_ReadyForInserts;
        bool            ii_Concurrent;
+       bool            ii_BrokenHotChain;
 } IndexInfo;
 
 /* ----------------
index bab429f5ad62f6a8d3047e9dff2a9f04898a5454..7886ee1afb5bcdf0d0b6147b3fd08eb82ec5eebf 100644 (file)
@@ -39,6 +39,8 @@ typedef struct PlannedStmt
 
        bool            canSetTag;              /* do I set the command result tag? */
 
+       bool            transientPlan;  /* redo plan when TransactionXmin changes? */
+
        struct Plan *planTree;          /* tree of Plan nodes */
 
        List       *rtable;                     /* list of RangeTblEntry nodes */
index 7311565f6669853ac3baf5971734d912cc0de39a..3ef493d5cdfd3433a8cdd070abd706ce99f1c9c3 100644 (file)
@@ -71,6 +71,8 @@ typedef struct PlannerGlobal
        Bitmapset  *rewindPlanIDs;      /* indices of subplans that require REWIND */
 
        List       *finalrtable;        /* "flat" rangetable for executor */
+
+       bool            transientPlan;  /* redo plan when TransactionXmin changes? */
 } PlannerGlobal;
 
 /* macro for fetching the Plan associated with a SubPlan node */
index 22d9217f6d98c1318bf710a9dae73d1a45302510..26f3c99d16e91dcf868ca1d4dd5395d574eb1b06 100644 (file)
@@ -18,6 +18,7 @@
 
 
 extern Relids pull_varnos(Node *node);
+extern void pull_varattnos(Node *node, Bitmapset **varattnos);
 extern bool contain_var_reference(Node *node, int varno, int varattno,
                                          int levelsup);
 extern bool contain_var_clause(Node *node);
index fd618faeb7f6f94e70f38d6be3fed819b2aa7c50..292b250fe0771bf7933d98e81aee0e85d5ac9a65 100644 (file)
@@ -55,10 +55,10 @@ typedef int64 PgStat_Counter;
  * the index AM, while tuples_fetched is the number of tuples successfully
  * fetched by heap_fetch under the control of simple indexscans for this index.
  *
- * tuples_inserted/tuples_updated/tuples_deleted count attempted actions,
+ * tuples_inserted/updated/deleted/hot_updated count attempted actions,
  * regardless of whether the transaction committed.  new_live_tuples and
  * new_dead_tuples are properly adjusted depending on commit or abort.
- * Note that new_live_tuples can be negative!
+ * Note that new_live_tuples and new_dead_tuples can be negative!
  * ----------
  */
 typedef struct PgStat_TableCounts
@@ -71,6 +71,7 @@ typedef struct PgStat_TableCounts
        PgStat_Counter t_tuples_inserted;
        PgStat_Counter t_tuples_updated;
        PgStat_Counter t_tuples_deleted;
+       PgStat_Counter t_tuples_hot_updated;
 
        PgStat_Counter t_new_live_tuples;
        PgStat_Counter t_new_dead_tuples;
@@ -323,7 +324,7 @@ typedef union PgStat_Msg
  * ------------------------------------------------------------
  */
 
-#define PGSTAT_FILE_FORMAT_ID  0x01A5BC96
+#define PGSTAT_FILE_FORMAT_ID  0x01A5BC97
 
 /* ----------
  * PgStat_StatDBEntry                  The collector's data per database
@@ -367,6 +368,7 @@ typedef struct PgStat_StatTabEntry
        PgStat_Counter tuples_inserted;
        PgStat_Counter tuples_updated;
        PgStat_Counter tuples_deleted;
+       PgStat_Counter tuples_hot_updated;
 
        PgStat_Counter n_live_tuples;
        PgStat_Counter n_dead_tuples;
@@ -545,8 +547,9 @@ extern void pgstat_initstats(Relation rel);
        } while (0)
 
 extern void pgstat_count_heap_insert(Relation rel);
-extern void pgstat_count_heap_update(Relation rel);
+extern void pgstat_count_heap_update(Relation rel, bool hot);
 extern void pgstat_count_heap_delete(Relation rel);
+extern void pgstat_update_heap_dead_tuples(Relation rel, int delta);
 
 extern void AtEOXact_PgStat(bool isCommit);
 extern void AtEOSubXact_PgStat(bool isCommit, int nestDepth);
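
pgstat_count_heap_update() now takes a flag saying whether the update was HOT, and a matching per-table counter is carried through the stats machinery, while pgstat_update_heap_dead_tuples() lets page pruning report dead tuples it reclaimed outside of VACUUM. A small standalone sketch of the counting rules, using hypothetical cut-down types (DemoTableCounts and the helper names are not the real PgStat structs or functions):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t Counter;

    /* Hypothetical cut-down version of the per-table counters. */
    typedef struct DemoTableCounts
    {
        Counter tuples_updated;
        Counter tuples_hot_updated;
        Counter new_dead_tuples;
    } DemoTableCounts;

    /* Every update counts; HOT updates additionally bump the HOT counter. */
    static void
    count_heap_update(DemoTableCounts *t, bool hot)
    {
        t->tuples_updated++;
        if (hot)
            t->tuples_hot_updated++;
    }

    /*
     * Pruning removes dead tuples without a VACUUM, so the dead-tuple estimate
     * is adjusted downward -- one reason new_dead_tuples can go negative.
     */
    static void
    report_pruned_dead_tuples(DemoTableCounts *t, int ndeleted)
    {
        t->new_dead_tuples -= ndeleted;
    }

    int
    main(void)
    {
        DemoTableCounts t = {0, 0, 0};

        count_heap_update(&t, true);        /* a HOT update */
        count_heap_update(&t, false);       /* a regular, index-touching update */
        report_pruned_dead_tuples(&t, 1);   /* one dead tuple pruned away */

        printf("upd=%lld hot_upd=%lld dead_delta=%lld\n",
               (long long) t.tuples_updated,
               (long long) t.tuples_hot_updated,
               (long long) t.new_dead_tuples);
        return 0;
    }
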
index 807194861d9c0f5ef8a166dd2ac9aebe3a0c48ca..6a5f42cb7c22e1a3bb4c950d64db076165ea21a2 100644 (file)
@@ -156,6 +156,7 @@ extern void UnlockBuffers(void);
 extern void LockBuffer(Buffer buffer, int mode);
 extern bool ConditionalLockBuffer(Buffer buffer);
 extern void LockBufferForCleanup(Buffer buffer);
+extern bool ConditionalLockBufferForCleanup(Buffer buffer);
 
 extern void AbortBufferIO(void);
 
index 786714806b37f0ba34d1610bab501a65bfc122da..d63846c679cbd8907defb273cc0d480b71e00d14 100644 (file)
@@ -140,10 +140,21 @@ typedef PageHeaderData *PageHeader;
  * PD_HAS_FREE_LINES is set if there are any LP_UNUSED line pointers before
  * pd_lower.  This should be considered a hint rather than the truth, since
  * changes to it are not WAL-logged.
+ *
+ * PD_PRUNABLE is set if there are any prunable tuples in the page.
+ * This should be considered a hint rather than the truth, since
+ * the transaction which generates a prunable tuple may or may not commit.
+ * Also there is a lag before a tuple is declared dead.
+ *
+ * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
+ * page for its new tuple version; this suggests that a prune is needed.
+ * Again, this is just a hint.
  */
 #define PD_HAS_FREE_LINES      0x0001  /* are there any unused line pointers? */
+#define PD_PRUNABLE                    0x0002  /* are there any prunable tuples? */
+#define PD_PAGE_FULL           0x0004  /* not enough free space for new tuple? */
 
-#define PD_VALID_FLAG_BITS     0x0001  /* OR of all valid pd_flags bits */
+#define PD_VALID_FLAG_BITS     0x0007  /* OR of all valid pd_flags bits */
 
 /*
  * Page layout version number 0 is for pre-7.3 Postgres releases.
@@ -337,6 +348,20 @@ typedef PageHeaderData *PageHeader;
 #define PageClearHasFreeLinePointers(page) \
        (((PageHeader) (page))->pd_flags &= ~PD_HAS_FREE_LINES)
 
+#define PageIsPrunable(page) \
+       (((PageHeader) (page))->pd_flags & PD_PRUNABLE)
+#define PageSetPrunable(page) \
+       (((PageHeader) (page))->pd_flags |= PD_PRUNABLE)
+#define PageClearPrunable(page) \
+       (((PageHeader) (page))->pd_flags &= ~PD_PRUNABLE)
+
+#define PageIsFull(page) \
+       (((PageHeader) (page))->pd_flags & PD_PAGE_FULL)
+#define PageSetFull(page) \
+       (((PageHeader) (page))->pd_flags |= PD_PAGE_FULL)
+#define PageClearFull(page) \
+       (((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL)
+
 
 /* ----------------------------------------------------------------
  *             extern declarations
@@ -346,12 +371,13 @@ typedef PageHeaderData *PageHeader;
 extern void PageInit(Page page, Size pageSize, Size specialSize);
 extern bool PageHeaderIsValid(PageHeader page);
 extern OffsetNumber PageAddItem(Page page, Item item, Size size,
-                       OffsetNumber offsetNumber, bool overwrite);
+                       OffsetNumber offsetNumber, bool overwrite, bool is_heap);
 extern Page PageGetTempPage(Page page, Size specialSize);
 extern void PageRestoreTempPage(Page tempPage, Page oldPage);
-extern int     PageRepairFragmentation(Page page, OffsetNumber *unused);
+extern void PageRepairFragmentation(Page page);
 extern Size PageGetFreeSpace(Page page);
 extern Size PageGetExactFreeSpace(Page page);
+extern Size PageGetHeapFreeSpace(Page page);
 extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
 extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
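
The new pd_flags bits are pure hints, and the PageIsPrunable/PageSetPrunable/PageIsFull macros above are simple bit operations on the page header's flag word. The standalone program below mirrors them against a hypothetical minimal header (DemoPageHeader) instead of the real Page/PageHeader types, just to show the intended set/test/clear cycle around an UPDATE and a subsequent prune.

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical minimal page header carrying only the flag word. */
    typedef struct DemoPageHeader
    {
        uint16_t pd_flags;
    } DemoPageHeader;

    #define PD_HAS_FREE_LINES  0x0001
    #define PD_PRUNABLE        0x0002
    #define PD_PAGE_FULL       0x0004
    #define PD_VALID_FLAG_BITS 0x0007

    #define PageIsPrunable(p)    ((p)->pd_flags & PD_PRUNABLE)
    #define PageSetPrunable(p)   ((p)->pd_flags |= PD_PRUNABLE)
    #define PageClearPrunable(p) ((p)->pd_flags &= ~PD_PRUNABLE)

    #define PageIsFull(p)        ((p)->pd_flags & PD_PAGE_FULL)
    #define PageSetFull(p)       ((p)->pd_flags |= PD_PAGE_FULL)
    #define PageClearFull(p)     ((p)->pd_flags &= ~PD_PAGE_FULL)

    int
    main(void)
    {
        DemoPageHeader page = {0};

        PageSetPrunable(&page);     /* an UPDATE/DELETE left a soon-to-be-dead tuple */
        PageSetFull(&page);         /* an UPDATE could not fit its new version here */
        printf("prunable=%d full=%d flags=0x%04x\n",
               PageIsPrunable(&page) != 0, PageIsFull(&page) != 0, page.pd_flags);

        PageClearPrunable(&page);   /* pruning reclaimed the dead tuples */
        PageClearFull(&page);
        printf("after prune: flags=0x%04x (valid mask 0x%04x)\n",
               page.pd_flags, PD_VALID_FLAG_BITS);
        return 0;
    }
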
 
index dae720fba8a5e4fe23c1c318ae22583975abd392..5fc925a6675785eee27dba2a71a10e780d596273 100644 (file)
@@ -75,6 +75,8 @@ typedef struct CachedPlan
        List       *stmt_list;          /* list of statement or Query nodes */
        bool            fully_planned;  /* do we cache planner or rewriter output? */
        bool            dead;                   /* if true, do not use */
+       TransactionId saved_xmin;       /* if valid, replan when TransactionXmin
+                                                                * changes from this value */
        int                     refcount;               /* count of live references to this struct */
        int                     generation;             /* counter, starting at 1, for replans */
        MemoryContext context;          /* context containing this CachedPlan */
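
saved_xmin ties the plan cache into the transient-plan machinery: a plan the planner marks transient (for instance because an indcheckxmin index was too new to use) remembers the TransactionXmin it was built under, and once TransactionXmin changes from that value the cached plan is discarded and rebuilt. A minimal sketch of that validity test, with hypothetical stand-in types (DemoCachedPlan, plan_still_valid) and XID wraparound ignored:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t TransactionId;
    #define InvalidTransactionId ((TransactionId) 0)

    /* Hypothetical cut-down cached plan: only what this check needs. */
    typedef struct DemoCachedPlan
    {
        bool          dead;         /* invalidated by DDL etc. */
        TransactionId saved_xmin;   /* if valid, replan when TransactionXmin moves */
    } DemoCachedPlan;

    /* A transient plan is reusable only while TransactionXmin is unchanged. */
    static bool
    plan_still_valid(const DemoCachedPlan *plan, TransactionId transaction_xmin)
    {
        if (plan->dead)
            return false;
        if (plan->saved_xmin != InvalidTransactionId &&
            plan->saved_xmin != transaction_xmin)
            return false;
        return true;
    }

    int
    main(void)
    {
        DemoCachedPlan plan = {false, 700};

        printf("same xmin: %d, xmin advanced: %d\n",
               plan_still_valid(&plan, 700),
               plan_still_valid(&plan, 750));
        return 0;
    }
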
index 8268c337a12af187c13d97203aac09efb06cf325..69eee3605875c81752a523ab673e267fc9d978ab 100644 (file)
@@ -19,6 +19,7 @@
 #include "catalog/pg_class.h"
 #include "catalog/pg_index.h"
 #include "fmgr.h"
+#include "nodes/bitmapset.h"
 #include "rewrite/prs2lock.h"
 #include "storage/block.h"
 #include "storage/relfilenode.h"
@@ -145,6 +146,7 @@ typedef struct RelationData
        TupleDesc       rd_att;                 /* tuple descriptor */
        Oid                     rd_id;                  /* relation's object id */
        List       *rd_indexlist;       /* list of OIDs of indexes on relation */
+       Bitmapset  *rd_indexattr;       /* identifies columns used in indexes */
        Oid                     rd_oidindex;    /* OID of unique index on OID, if any */
        LockInfoData rd_lockInfo;       /* lock mgr's info for locking relation */
        RuleLock   *rd_rules;           /* rewrite rules */
index ff215694ed9d1c658ed13c55a205455be71a4237..bf8ff55d29bf16030f9ff8d3c1ce1a8a81c0d7f6 100644 (file)
@@ -29,6 +29,7 @@ extern List *RelationGetIndexList(Relation relation);
 extern Oid     RelationGetOidIndex(Relation relation);
 extern List *RelationGetIndexExpressions(Relation relation);
 extern List *RelationGetIndexPredicate(Relation relation);
+extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation);
 
 extern void RelationSetIndexList(Relation relation,
                                         List *indexIds, Oid oidIndex);
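
RelationGetIndexAttrBitmap() (cached in rd_indexattr above) gives the heap update path a quick way to decide HOT eligibility: collect the columns any index on the relation uses, compare them with the columns the UPDATE actually changed, and only if the two sets are disjoint can the new version avoid new index entries. The sketch below uses a plain 32-bit mask as a stand-in for a Bitmapset, and hot_update_possible is a hypothetical helper; the real decision additionally requires room for the new tuple on the same heap page.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for a column-number bitmap (columns 1..32 only). */
    typedef uint32_t DemoAttrSet;

    #define ATTR_BIT(attno) ((DemoAttrSet) 1 << ((attno) - 1))

    /* HOT is possible only if no indexed column was modified. */
    static bool
    hot_update_possible(DemoAttrSet indexed_attrs, DemoAttrSet modified_attrs)
    {
        return (indexed_attrs & modified_attrs) == 0;
    }

    int
    main(void)
    {
        DemoAttrSet indexed = ATTR_BIT(1) | ATTR_BIT(3);   /* indexes cover columns 1 and 3 */

        printf("update touching column 2 only -> HOT? %d\n",
               hot_update_possible(indexed, ATTR_BIT(2)));
        printf("update touching columns 2 and 3 -> HOT? %d\n",
               hot_update_possible(indexed, ATTR_BIT(2) | ATTR_BIT(3)));
        return 0;
    }
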
index adec0e6c8470fdd22b518dbb89a14fe8c4487d1b..3483ba155545f5a370fa5209d00130aaf16f8785 100644 (file)
@@ -415,6 +415,7 @@ Table "public.concur_heap"
  f2     | text | 
 Indexes:
     "concur_index2" UNIQUE, btree (f1)
+    "concur_index3" UNIQUE, btree (f2) INVALID
     "concur_index1" btree (f2, f1)
     "concur_index4" btree (f2) WHERE f1 = 'a'::text
     "concur_index5" btree (f2) WHERE f1 = 'x'::text
index bafce821eba910eebf4dc16279127682f5e67431..3fc65ea23500e24458abab4b735b359683b81245 100644 (file)
@@ -1291,13 +1291,13 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem
  pg_shadow                | SELECT pg_authid.rolname AS usename, pg_authid.oid AS usesysid, pg_authid.rolcreatedb AS usecreatedb, pg_authid.rolsuper AS usesuper, pg_authid.rolcatupdate AS usecatupd, pg_authid.rolpassword AS passwd, (pg_authid.rolvaliduntil)::abstime AS valuntil, pg_authid.rolconfig AS useconfig FROM pg_authid WHERE pg_authid.rolcanlogin;
  pg_stat_activity         | SELECT d.oid AS datid, d.datname, pg_stat_get_backend_pid(s.backendid) AS procpid, pg_stat_get_backend_userid(s.backendid) AS usesysid, u.rolname AS usename, pg_stat_get_backend_activity(s.backendid) AS current_query, pg_stat_get_backend_waiting(s.backendid) AS waiting, pg_stat_get_backend_xact_start(s.backendid) AS xact_start, pg_stat_get_backend_activity_start(s.backendid) AS query_start, pg_stat_get_backend_start(s.backendid) AS backend_start, pg_stat_get_backend_client_addr(s.backendid) AS client_addr, pg_stat_get_backend_client_port(s.backendid) AS client_port FROM pg_database d, (SELECT pg_stat_get_backend_idset() AS backendid) s, pg_authid u WHERE ((pg_stat_get_backend_dbid(s.backendid) = d.oid) AND (pg_stat_get_backend_userid(s.backendid) = u.oid));
  pg_stat_all_indexes      | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, pg_stat_get_numscans(i.oid) AS idx_scan, pg_stat_get_tuples_returned(i.oid) AS idx_tup_read, pg_stat_get_tuples_fetched(i.oid) AS idx_tup_fetch FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"]));
- pg_stat_all_tables       | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname;
+ pg_stat_all_tables       | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(c.oid) AS n_tup_hot_upd, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname;
  pg_stat_bgwriter         | SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints_timed, pg_stat_get_bgwriter_requested_checkpoints() AS checkpoints_req, pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint, pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean, pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean;
  pg_stat_database         | SELECT d.oid AS datid, d.datname, pg_stat_get_db_numbackends(d.oid) AS numbackends, pg_stat_get_db_xact_commit(d.oid) AS xact_commit, pg_stat_get_db_xact_rollback(d.oid) AS xact_rollback, (pg_stat_get_db_blocks_fetched(d.oid) - pg_stat_get_db_blocks_hit(d.oid)) AS blks_read, pg_stat_get_db_blocks_hit(d.oid) AS blks_hit, pg_stat_get_db_tuples_returned(d.oid) AS tup_returned, pg_stat_get_db_tuples_fetched(d.oid) AS tup_fetched, pg_stat_get_db_tuples_inserted(d.oid) AS tup_inserted, pg_stat_get_db_tuples_updated(d.oid) AS tup_updated, pg_stat_get_db_tuples_deleted(d.oid) AS tup_deleted FROM pg_database d;
  pg_stat_sys_indexes      | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE ((pg_stat_all_indexes.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_indexes.schemaname ~ '^pg_toast'::text));
- pg_stat_sys_tables       | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text));
+ pg_stat_sys_tables       | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text));
  pg_stat_user_indexes     | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE ((pg_stat_all_indexes.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_indexes.schemaname !~ '^pg_toast'::text));
- pg_stat_user_tables      | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
+ pg_stat_user_tables      | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
  pg_statio_all_indexes    | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, (pg_stat_get_blocks_fetched(i.oid) - pg_stat_get_blocks_hit(i.oid)) AS idx_blks_read, pg_stat_get_blocks_hit(i.oid) AS idx_blks_hit FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"]));
  pg_statio_all_sequences  | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS blks_read, pg_stat_get_blocks_hit(c.oid) AS blks_hit FROM (pg_class c LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = 'S'::"char");
  pg_statio_all_tables     | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS heap_blks_read, pg_stat_get_blocks_hit(c.oid) AS heap_blks_hit, (sum((pg_stat_get_blocks_fetched(i.indexrelid) - pg_stat_get_blocks_hit(i.indexrelid))))::bigint AS idx_blks_read, (sum(pg_stat_get_blocks_hit(i.indexrelid)))::bigint AS idx_blks_hit, (pg_stat_get_blocks_fetched(t.oid) - pg_stat_get_blocks_hit(t.oid)) AS toast_blks_read, pg_stat_get_blocks_hit(t.oid) AS toast_blks_hit, (pg_stat_get_blocks_fetched(x.oid) - pg_stat_get_blocks_hit(x.oid)) AS tidx_blks_read, pg_stat_get_blocks_hit(x.oid) AS tidx_blks_hit FROM ((((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_class t ON ((c.reltoastrelid = t.oid))) LEFT JOIN pg_class x ON ((t.reltoastidxid = x.oid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname, t.oid, x.oid;