Fix bug in following update chain when locking a heap tuple
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Tue, 23 Dec 2025 11:37:16 +0000 (13:37 +0200)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Tue, 23 Dec 2025 11:37:23 +0000 (13:37 +0200)
After waiting for a concurrent updater to finish, heap_lock_tuple()
followed the update chain to lock all tuple versions. However, when
stepping from the initial tuple to the next one, it failed to check
that the next tuple's XMIN matches the initial tuple's XMAX. That's an
important check whenever following an update chain, and the recursive
part that follows the chain did it, but the initial step missed it.
Without the check, if the updating transaction aborts and the updated
tuple is vacuumed away and replaced by an unrelated tuple, the
unrelated tuple might get incorrectly locked.

Author: Jasper Smit <jasper.smit@servicenow.com>
Discussion: https://www.postgresql.org/message-id/CAOG+RQ74x0q=kgBBQ=mezuvOeZBfSxM1qu_o0V28bwDz3dHxLw@mail.gmail.com
Backpatch-through: 14
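
For context, the continuity check that must guard every step along an
update chain looks roughly like the following. This is a condensed,
illustrative sketch of the logic in heap_lock_updated_tuple_rec, not the
exact code; variable names are illustrative, and priorXmax stands for the
updater XID taken from the previous tuple version's xmax:

    /* After fetching the tuple version that the prior tuple's ctid points to: */
    if (TransactionIdIsValid(priorXmax) &&
        !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data), priorXmax))
    {
        /*
         * The tuple at this TID is not the successor we were chasing: the
         * original successor is gone (e.g. vacuumed away after the updater
         * aborted) and the slot now holds an unrelated tuple.  Stop here
         * instead of locking the unrelated tuple.
         */
        return TM_Ok;
    }

The recursive chain-follower already did this; the fix below makes the
initial step from the locked tuple to its first successor apply the same
check, by passing the initial tuple's infomask and raw xmax down to
heap_lock_updated_tuple().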

src/backend/access/heap/heapam.c
src/test/modules/injection_points/Makefile
src/test/modules/injection_points/expected/heap_lock_update.out [new file with mode: 0644]
src/test/modules/injection_points/meson.build
src/test/modules/injection_points/specs/heap_lock_update.spec [new file with mode: 0644]

index d3dcea06bdfa5f8fe65894b38bf4cc2d026e3955..255c43da856f54f7519a1a5a022029b557c045be 100644 (file)
@@ -85,8 +85,11 @@ static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
                                      LockTupleMode mode, bool is_update,
                                      TransactionId *result_xmax, uint16 *result_infomask,
                                      uint16 *result_infomask2);
-static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
-                                        ItemPointer ctid, TransactionId xid,
+static TM_Result heap_lock_updated_tuple(Relation rel,
+                                        uint16 prior_infomask,
+                                        TransactionId prior_rawxmax,
+                                        const ItemPointerData *prior_ctid,
+                                        TransactionId xid,
                                         LockTupleMode mode);
 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
                                   uint16 *new_infomask2);
@@ -4783,11 +4786,13 @@ l3:
                 * If there are updates, follow the update chain; bail out if
                 * that cannot be done.
                 */
-               if (follow_updates && updated)
+               if (follow_updates && updated &&
+                   !ItemPointerEquals(&tuple->t_self, &t_ctid))
                {
                    TM_Result   res;
 
-                   res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+                   res = heap_lock_updated_tuple(relation,
+                                                 infomask, xwait, &t_ctid,
                                                  GetCurrentTransactionId(),
                                                  mode);
                    if (res != TM_Ok)
@@ -5030,11 +5035,13 @@ l3:
            }
 
            /* if there are updates, follow the update chain */
-           if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
+           if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
+               !ItemPointerEquals(&tuple->t_self, &t_ctid))
            {
                TM_Result   res;
 
-               res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+               res = heap_lock_updated_tuple(relation,
+                                             infomask, xwait, &t_ctid,
                                              GetCurrentTransactionId(),
                                              mode);
                if (res != TM_Ok)
@@ -5688,7 +5695,8 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
  * version as well.
  */
 static TM_Result
-heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
+heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax,
+                           const ItemPointerData *tid, TransactionId xid,
                            LockTupleMode mode)
 {
    TM_Result   result;
@@ -5701,7 +5709,6 @@ heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
                old_infomask2;
    TransactionId xmax,
                new_xmax;
-   TransactionId priorXmax = InvalidTransactionId;
    bool        cleared_all_frozen = false;
    bool        pinned_desired_page;
    Buffer      vmbuffer = InvalidBuffer;
@@ -6015,7 +6022,10 @@ out_unlocked:
  *     Follow update chain when locking an updated tuple, acquiring locks (row
  *     marks) on the updated versions.
  *
- * The initial tuple is assumed to be already locked.
+ * 'prior_infomask', 'prior_raw_xmax' and 'prior_ctid' are the corresponding
+ * fields from the initial tuple.  We will lock the tuples starting from the
+ * one that 'prior_ctid' points to.  Note: This function does not lock the
+ * initial tuple itself.
  *
  * This function doesn't check visibility, it just unconditionally marks the
  * tuple(s) as locked.  If any tuple in the updated chain is being deleted
@@ -6033,16 +6043,22 @@ out_unlocked:
  * levels, because that would lead to a serializability failure.
  */
 static TM_Result
-heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
+heap_lock_updated_tuple(Relation rel,
+                       uint16 prior_infomask,
+                       TransactionId prior_raw_xmax,
+                       const ItemPointerData *prior_ctid,
                        TransactionId xid, LockTupleMode mode)
 {
+   INJECTION_POINT("heap_lock_updated_tuple", NULL);
+
    /*
-    * If the tuple has not been updated, or has moved into another partition
-    * (effectively a delete) stop here.
+    * If the tuple has moved into another partition (effectively a delete)
+    * stop here.
     */
-   if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
-       !ItemPointerEquals(&tuple->t_self, ctid))
+   if (!ItemPointerIndicatesMovedPartitions(prior_ctid))
    {
+       TransactionId prior_xmax;
+
        /*
         * If this is the first possibly-multixact-able operation in the
         * current transaction, set my per-backend OldestMemberMXactId
@@ -6054,7 +6070,9 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
         */
        MultiXactIdSetOldestMember();
 
-       return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
+       prior_xmax = (prior_infomask & HEAP_XMAX_IS_MULTI) ?
+           MultiXactIdGetUpdateXid(prior_raw_xmax, prior_infomask) : prior_raw_xmax;
+       return heap_lock_updated_tuple_rec(rel, prior_xmax, prior_ctid, xid, mode);
    }
 
    /* nothing to lock */
index fc82cd67f6cd607bb31df5bf18dcff4ac45a91d6..2e49d30f942453534f291821d6d69d06aa08021d 100644 (file)
@@ -14,7 +14,10 @@ PGFILEDESC = "injection_points - facility for injection points"
 REGRESS = injection_points hashagg reindex_conc vacuum
 REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
 
-ISOLATION = basic inplace syscache-update-pruned
+ISOLATION = basic \
+       inplace \
+       syscache-update-pruned \
+       heap_lock_update
 
 TAP_TESTS = 1
 
diff --git a/src/test/modules/injection_points/expected/heap_lock_update.out b/src/test/modules/injection_points/expected/heap_lock_update.out
new file mode 100644 (file)
index 0000000..1ec8d87
--- /dev/null
@@ -0,0 +1,83 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1begin s1update s2lock s1abort vacuum reinsert wake
+step s1begin: BEGIN;
+step s1update: UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid;
+ctid 
+-----
+(1,2)
+(1 row)
+
+step s2lock: select * from t where id = 1 for update; <waiting ...>
+step s1abort: ABORT;
+step vacuum: VACUUM t;
+step reinsert: 
+   INSERT INTO t VALUES (10001) RETURNING ctid;
+   UPDATE t SET id = 10002 WHERE id = 10001 RETURNING ctid;
+
+ctid 
+-----
+(1,2)
+(1 row)
+
+ctid 
+-----
+(1,3)
+(1 row)
+
+step wake: 
+   SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+   SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+ <waiting ...>
+step s2lock: <... completed>
+id
+--
+ 1
+(1 row)
+
+step wake: <... completed>
+
+starting permutation: s1begin s1update s2lock s1abort vacuum reinsert_and_lock wake
+step s1begin: BEGIN;
+step s1update: UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid;
+ctid 
+-----
+(1,2)
+(1 row)
+
+step s2lock: select * from t where id = 1 for update; <waiting ...>
+step s1abort: ABORT;
+step vacuum: VACUUM t;
+step reinsert_and_lock: 
+   BEGIN;
+   INSERT INTO t VALUES (10001) RETURNING ctid;
+   SELECT ctid, * FROM t WHERE id = 1 FOR UPDATE;
+   COMMIT;
+   UPDATE t SET id = 10002 WHERE id = 10001 returning ctid;
+
+ctid 
+-----
+(1,2)
+(1 row)
+
+ctid |id
+-----+--
+(0,1)| 1
+(1 row)
+
+ctid 
+-----
+(1,3)
+(1 row)
+
+step wake: 
+   SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+   SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+ <waiting ...>
+step s2lock: <... completed>
+id
+--
+ 1
+(1 row)
+
+step wake: <... completed>
index ce778ccf9ac45ac43b26e384d9ea084b23e77f39..a73e1fe6c349b8e949f33ada7d71373db3aa133c 100644 (file)
@@ -48,6 +48,7 @@ tests += {
       'basic',
       'inplace',
       'syscache-update-pruned',
+      'heap_lock_update',
     ],
     'runningcheck': false, # see syscache-update-pruned
   },
diff --git a/src/test/modules/injection_points/specs/heap_lock_update.spec b/src/test/modules/injection_points/specs/heap_lock_update.spec
new file mode 100644 (file)
index 0000000..b3992a1
--- /dev/null
@@ -0,0 +1,117 @@
+# Test race condition in tuple locking
+#
+# This is a reproducer for the bug reported at:
+# https://www.postgresql.org/message-id/CAOG%2BRQ74x0q%3DkgBBQ%3DmezuvOeZBfSxM1qu_o0V28bwDz3dHxLw%40mail.gmail.com
+#
+# The bug was that when following an update chain when locking tuples,
+# we sometimes failed to check that the xmin on the next tuple matched
+# the prior's xmax. If the updated tuple version was vacuumed away and
+# the slot was reused for an unrelated tuple, we'd incorrectly follow
+# and lock the unrelated tuple.
+
+
+# Set up a test table with enough rows to fill a page. We need the
+# UPDATE used in the test to put the new tuple on a different page,
+# because otherwise the VACUUM cannot remove the aborted tuple because
+# we hold a pin on the first page.
+#
+# The exact number of rows inserted doesn't otherwise matter, but we
+# arrange things in a deterministic fashion so that the last inserted
+# tuple goes to (1,1), and the updated and aborted tuple goes to
+# (1,2). That way we can just memorize those ctids in the expected
+# output, to verify that the test exercises the scenario we want.
+setup
+{
+   CREATE EXTENSION injection_points;
+
+   CREATE TABLE t (id int PRIMARY KEY);
+   do $$
+       DECLARE
+           i int;
+           tid tid;
+       BEGIN
+           FOR i IN 1..5000 LOOP
+               INSERT INTO t VALUES (i) RETURNING ctid INTO tid;
+               IF tid = '(1,1)' THEN
+                   RETURN;
+               END IF;
+           END LOOP;
+           RAISE 'expected to insert tuple to (1,1)';
+      END;
+   $$;
+}
+teardown
+{
+   DROP TABLE t;
+   DROP EXTENSION injection_points;
+}
+
+session s1
+step s1begin   { BEGIN; }
+step s1update  { UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid; }
+step s1abort   { ABORT; }
+step vacuum        { VACUUM t; }
+
+# Insert a new tuple, and update it.
+step reinsert  {
+   INSERT INTO t VALUES (10001) RETURNING ctid;
+   UPDATE t SET id = 10002 WHERE id = 10001 RETURNING ctid;
+}
+
+# Same as the 'reinsert' step, but for extra confusion, we also stamp
+# the original tuple with the same 'xmax' as the re-inserted one.
+step reinsert_and_lock {
+   BEGIN;
+   INSERT INTO t VALUES (10001) RETURNING ctid;
+   SELECT ctid, * FROM t WHERE id = 1 FOR UPDATE;
+   COMMIT;
+   UPDATE t SET id = 10002 WHERE id = 10001 returning ctid;
+}
+
+step wake  {
+   SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+   SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+}
+
+session s2
+setup  {
+   SELECT FROM injection_points_set_local();
+   SELECT FROM injection_points_attach('heap_lock_updated_tuple', 'wait');
+}
+step s2lock    { select * from t where id = 1 for update; }
+
+permutation
+   # Begin transaction, update a row. Because of how we set up the
+   # test table, the updated tuple lands at (1,2)
+   s1begin
+   s1update
+
+   # While the updating transaction is open, start a new session that
+   # tries to lock the row. This blocks on the open transaction.
+   s2lock
+
+   # Abort the updating transaction. This unblocks session 2, but it
+   # will immediately hit the injection point and block on that.
+   s1abort
+   # Vacuum away the updated, aborted tuple.
+   vacuum
+
+   # Insert a new tuple. It lands at the same location where the
+   # updated tuple was.
+   reinsert
+
+   # Let the locking transaction continue. It should lock the
+   # original tuple, ignoring the re-inserted tuple.
+   wake(s2lock)
+
+# Variant where the re-inserted tuple is also locked by the inserting
+# transaction. This failed an earlier version of the fix during
+# development.
+permutation
+   s1begin
+   s1update
+   s2lock
+   s1abort
+   vacuum
+   reinsert_and_lock
+   wake(s2lock)