Fix bug in following update chain when locking a heap tuple
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Tue, 23 Dec 2025 11:37:16 +0000 (13:37 +0200)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Tue, 23 Dec 2025 11:37:23 +0000 (13:37 +0200)
After waiting for a concurrent updater to finish, heap_lock_tuple()
followed the update chain to lock all tuple versions. However, when
stepping from the initial tuple to the next one, it failed to check
that the next tuple's XMIN matches the initial tuple's XMAX. That's an
important check whenever following an update chain, and the recursive
part that follows the chain did it, but the initial step missed it.
Without the check, if the updating transaction aborts and the updated
tuple is vacuumed away and replaced by an unrelated tuple, the
unrelated tuple might get incorrectly locked.

Author: Jasper Smit <jasper.smit@servicenow.com>
Discussion: https://www.postgresql.org/message-id/CAOG+RQ74x0q=kgBBQ=mezuvOeZBfSxM1qu_o0V28bwDz3dHxLw@mail.gmail.com
Backpatch-through: 14
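
For context, the continuity check that must guard every step along an
update chain looks roughly like the following. This is a condensed,
illustrative sketch of the logic in heap_lock_updated_tuple_rec, not the
exact code; variable names are illustrative, and priorXmax stands for the
updater XID taken from the previous tuple version's xmax:

    /* After fetching the tuple version that the prior tuple's ctid points to: */
    if (TransactionIdIsValid(priorXmax) &&
        !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data), priorXmax))
    {
        /*
         * The tuple at this TID is not the successor we were chasing: the
         * original successor is gone (e.g. vacuumed away after the updater
         * aborted) and the slot now holds an unrelated tuple.  Stop here
         * instead of locking the unrelated tuple.
         */
        return TM_Ok;
    }

The recursive chain-follower already did this; the fix below makes the
initial step from the locked tuple to its first successor apply the same
check, by passing the initial tuple's infomask and raw xmax down to
heap_lock_updated_tuple().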

src/backend/access/heap/heapam.c
src/test/modules/injection_points/Makefile
src/test/modules/injection_points/expected/heap_lock_update.out [new file with mode: 0644]
src/test/modules/injection_points/meson.build
src/test/modules/injection_points/specs/heap_lock_update.spec [new file with mode: 0644]

index d3dcea06bdfa5f8fe65894b38bf4cc2d026e3955..255c43da856f54f7519a1a5a022029b557c045be 100644 (file)
@@ -85,8 +85,11 @@ static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
                                      LockTupleMode mode, bool is_update,
                                      TransactionId *result_xmax, uint16 *result_infomask,
                                      uint16 *result_infomask2);
-static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
-                                        ItemPointer ctid, TransactionId xid,
+static TM_Result heap_lock_updated_tuple(Relation rel,
+                                        uint16 prior_infomask,
+                                        TransactionId prior_rawxmax,
+                                        const ItemPointerData *prior_ctid,
+                                        TransactionId xid,
                                         LockTupleMode mode);
 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
                                   uint16 *new_infomask2);
@@ -4783,11 +4786,13 @@ l3:
                 * If there are updates, follow the update chain; bail out if
                 * that cannot be done.
                 */
-               if (follow_updates && updated)
+               if (follow_updates && updated &&
+                   !ItemPointerEquals(&tuple->t_self, &t_ctid))
                {
                    TM_Result   res;
 
-                   res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+                   res = heap_lock_updated_tuple(relation,
+                                                 infomask, xwait, &t_ctid,
                                                  GetCurrentTransactionId(),
                                                  mode);
                    if (res != TM_Ok)
@@ -5030,11 +5035,13 @@ l3:
            }
 
            /* if there are updates, follow the update chain */
-           if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
+           if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
+               !ItemPointerEquals(&tuple->t_self, &t_ctid))
            {
                TM_Result   res;
 
-               res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+               res = heap_lock_updated_tuple(relation,
+                                             infomask, xwait, &t_ctid,
                                              GetCurrentTransactionId(),
                                              mode);
                if (res != TM_Ok)
@@ -5688,7 +5695,8 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
  * version as well.
  */
 static TM_Result
-heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
+heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax,
+                           const ItemPointerData *tid, TransactionId xid,
                            LockTupleMode mode)
 {
    TM_Result   result;
@@ -5701,7 +5709,6 @@ heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
                old_infomask2;
    TransactionId xmax,
                new_xmax;
-   TransactionId priorXmax = InvalidTransactionId;
    bool        cleared_all_frozen = false;
    bool        pinned_desired_page;
    Buffer      vmbuffer = InvalidBuffer;
@@ -6015,7 +6022,10 @@ out_unlocked:
  *     Follow update chain when locking an updated tuple, acquiring locks (row
  *     marks) on the updated versions.
  *
- * The initial tuple is assumed to be already locked.
+ * 'prior_infomask', 'prior_raw_xmax' and 'prior_ctid' are the corresponding
+ * fields from the initial tuple.  We will lock the tuples starting from the
+ * one that 'prior_ctid' points to.  Note: This function does not lock the
+ * initial tuple itself.
  *
  * This function doesn't check visibility, it just unconditionally marks the
  * tuple(s) as locked.  If any tuple in the updated chain is being deleted
@@ -6033,16 +6043,22 @@ out_unlocked:
  * levels, because that would lead to a serializability failure.
  */
 static TM_Result
-heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
+heap_lock_updated_tuple(Relation rel,
+                       uint16 prior_infomask,
+                       TransactionId prior_raw_xmax,
+                       const ItemPointerData *prior_ctid,
                        TransactionId xid, LockTupleMode mode)
 {
+   INJECTION_POINT("heap_lock_updated_tuple", NULL);
+
    /*
-    * If the tuple has not been updated, or has moved into another partition
-    * (effectively a delete) stop here.
+    * If the tuple has moved into another partition (effectively a delete)
+    * stop here.
     */
-   if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
-       !ItemPointerEquals(&tuple->t_self, ctid))
+   if (!ItemPointerIndicatesMovedPartitions(prior_ctid))
    {
+       TransactionId prior_xmax;
+
        /*
         * If this is the first possibly-multixact-able operation in the
         * current transaction, set my per-backend OldestMemberMXactId
@@ -6054,7 +6070,9 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
         */
        MultiXactIdSetOldestMember();
 
-       return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
+       prior_xmax = (prior_infomask & HEAP_XMAX_IS_MULTI) ?
+           MultiXactIdGetUpdateXid(prior_raw_xmax, prior_infomask) : prior_raw_xmax;
+       return heap_lock_updated_tuple_rec(rel, prior_xmax, prior_ctid, xid, mode);
    }
 
    /* nothing to lock */
index fc82cd67f6cd607bb31df5bf18dcff4ac45a91d6..2e49d30f942453534f291821d6d69d06aa08021d 100644 (file)
@@ -14,7 +14,10 @@ PGFILEDESC = "injection_points - facility for injection points"
 REGRESS = injection_points hashagg reindex_conc vacuum
 REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
 
-ISOLATION = basic inplace syscache-update-pruned
+ISOLATION = basic \
+       inplace \
+       syscache-update-pruned \
+       heap_lock_update
 
 TAP_TESTS = 1
 
diff --git a/src/test/modules/injection_points/expected/heap_lock_update.out b/src/test/modules/injection_points/expected/heap_lock_update.out
new file mode 100644 (file)
index 0000000..1ec8d87
--- /dev/null
@@ -0,0 +1,83 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1begin s1update s2lock s1abort vacuum reinsert wake
+step s1begin: BEGIN;
+step s1update: UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid;
+ctid 
+-----
+(1,2)
+(1 row)
+
+step s2lock: select * from t where id = 1 for update; <waiting ...>
+step s1abort: ABORT;
+step vacuum: VACUUM t;
+step reinsert: 
+   INSERT INTO t VALUES (10001) RETURNING ctid;
+   UPDATE t SET id = 10002 WHERE id = 10001 RETURNING ctid;
+
+ctid 
+-----
+(1,2)
+(1 row)
+
+ctid 
+-----
+(1,3)
+(1 row)
+
+step wake: 
+   SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+   SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+ <waiting ...>
+step s2lock: <... completed>
+id
+--
+ 1
+(1 row)
+
+step wake: <... completed>
+
+starting permutation: s1begin s1update s2lock s1abort vacuum reinsert_and_lock wake
+step s1begin: BEGIN;
+step s1update: UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid;
+ctid 
+-----
+(1,2)
+(1 row)
+
+step s2lock: select * from t where id = 1 for update; <waiting ...>
+step s1abort: ABORT;
+step vacuum: VACUUM t;
+step reinsert_and_lock: 
+   BEGIN;
+   INSERT INTO t VALUES (10001) RETURNING ctid;
+   SELECT ctid, * FROM t WHERE id = 1 FOR UPDATE;
+   COMMIT;
+   UPDATE t SET id = 10002 WHERE id = 10001 returning ctid;
+
+ctid 
+-----
+(1,2)
+(1 row)
+
+ctid |id
+-----+--
+(0,1)| 1
+(1 row)
+
+ctid 
+-----
+(1,3)
+(1 row)
+
+step wake: 
+   SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+   SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+ <waiting ...>
+step s2lock: <... completed>
+id
+--
+ 1
+(1 row)
+
+step wake: <... completed>
index ce778ccf9ac45ac43b26e384d9ea084b23e77f39..a73e1fe6c349b8e949f33ada7d71373db3aa133c 100644 (file)
@@ -48,6 +48,7 @@ tests += {
       'basic',
       'inplace',
       'syscache-update-pruned',
+      'heap_lock_update',
     ],
     'runningcheck': false, # see syscache-update-pruned
   },
diff --git a/src/test/modules/injection_points/specs/heap_lock_update.spec b/src/test/modules/injection_points/specs/heap_lock_update.spec
new file mode 100644 (file)
index 0000000..b3992a1
--- /dev/null
@@ -0,0 +1,117 @@
+# Test race condition in tuple locking
+#
+# This is a reproducer for the bug reported at:
+# https://www.postgresql.org/message-id/CAOG%2BRQ74x0q%3DkgBBQ%3DmezuvOeZBfSxM1qu_o0V28bwDz3dHxLw%40mail.gmail.com
+#
+# The bug was that when following an update chain when locking tuples,
+# we sometimes failed to check that the xmin on the next tuple matched
+# the prior's xmax. If the updated tuple version was vacuumed away and
+# the slot was reused for an unrelated tuple, we'd incorrectly follow
+# and lock the unrelated tuple.
+
+
+# Set up a test table with enough rows to fill a page. We need the
+# UPDATE used in the test to put the new tuple on a different page,
+# because otherwise the VACUUM cannot remove the aborted tuple because
+# we hold a pin on the first page.
+#
+# The exact number of rows inserted doesn't otherwise matter, but we
+# arrange things in a deterministic fashion so that the last inserted
+# tuple goes to (1,1), and the updated and aborted tuple goes to
+# (1,2). That way we can just memorize those ctids in the expected
+# output, to verify that the test exercises the scenario we want.
+setup
+{
+   CREATE EXTENSION injection_points;
+
+   CREATE TABLE t (id int PRIMARY KEY);
+   do $$
+       DECLARE
+           i int;
+           tid tid;
+       BEGIN
+           FOR i IN 1..5000 LOOP
+               INSERT INTO t VALUES (i) RETURNING ctid INTO tid;
+               IF tid = '(1,1)' THEN
+                   RETURN;
+               END IF;
+           END LOOP;
+           RAISE 'expected to insert tuple to (1,1)';
+      END;
+   $$;
+}
+teardown
+{
+   DROP TABLE t;
+   DROP EXTENSION injection_points;
+}
+
+session s1
+step s1begin   { BEGIN; }
+step s1update  { UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid; }
+step s1abort   { ABORT; }
+step vacuum        { VACUUM t; }
+
+# Insert a new tuple, and update it.
+step reinsert  {
+   INSERT INTO t VALUES (10001) RETURNING ctid;
+   UPDATE t SET id = 10002 WHERE id = 10001 RETURNING ctid;
+}
+
+# Same as the 'reinsert' step, but for extra confusion, we also stamp
+# the original tuple with the same 'xmax' as the re-inserted one.
+step reinsert_and_lock {
+   BEGIN;
+   INSERT INTO t VALUES (10001) RETURNING ctid;
+   SELECT ctid, * FROM t WHERE id = 1 FOR UPDATE;
+   COMMIT;
+   UPDATE t SET id = 10002 WHERE id = 10001 returning ctid;
+}
+
+step wake  {
+   SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+   SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+}
+
+session s2
+setup  {
+   SELECT FROM injection_points_set_local();
+   SELECT FROM injection_points_attach('heap_lock_updated_tuple', 'wait');
+}
+step s2lock    { select * from t where id = 1 for update; }
+
+permutation
+   # Begin transaction, update a row. Because of how we set up the
+   # test table, the updated tuple lands at (1,2)
+   s1begin
+   s1update
+
+   # While the updating transaction is open, start a new session that
+   # tries to lock the row. This blocks on the open transaction.
+   s2lock
+
+   # Abort the updating transaction. This unblocks session 2, but it
+   # will immediately hit the injection point and block on that.
+   s1abort
+   # Vacuum away the updated, aborted tuple.
+   vacuum
+
+   # Insert a new tuple. It lands at the same location where the
+   # updated tuple was.
+   reinsert
+
+   # Let the locking transaction continue. It should lock the
+   # original tuple, ignoring the re-inserted tuple.
+   wake(s2lock)
+
+# Variant where the re-inserted tuple is also locked by the inserting
+# transaction. This failed an earlier version of the fix during
+# development.
+permutation
+   s1begin
+   s1update
+   s2lock
+   s1abort
+   vacuum
+   reinsert_and_lock
+   wake(s2lock)