CSNs

author Heikki Linnakangas <heikki.linnakangas@iki.fi>

Mon, 22 Aug 2016 11:00:57 +0000 (14:00 +0300)

committer Heikki Linnakangas <heikki.linnakangas@iki.fi>

Mon, 22 Aug 2016 18:21:58 +0000 (21:21 +0300)
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 22 Aug 2016 11:00:57 +0000 (14:00 +0300)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Mon, 22 Aug 2016 18:21:58 +0000 (21:21 +0300)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml

index 169a385a9cc9decdfadfde16a86ec95eaea7e397..7d6927e0acf272af0e84024fe9f77bdbd603d722 100644 (file)
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -17123,10 +17123,6 @@ SELECT collation for ('foo' COLLATE "de_DE");
      <primary>txid_current_snapshot</primary>
     </indexterm>
  
-   <indexterm>
-    <primary>txid_snapshot_xip</primary>
-   </indexterm>
-
     <indexterm>
      <primary>txid_snapshot_xmax</primary>
     </indexterm>
@@ -17164,11 +17160,6 @@ SELECT collation for ('foo' COLLATE "de_DE");
         <entry><type>txid_snapshot</type></entry>
         <entry>get current snapshot</entry>
        </row>
-      <row>
-       <entry><literal><function>txid_snapshot_xip(<parameter>txid_snapshot</parameter>)</function></literal></entry>
-       <entry><type>setof bigint</type></entry>
-       <entry>get in-progress transaction IDs in snapshot</entry>
-      </row>
        <row>
         <entry><literal><function>txid_snapshot_xmax(<parameter>txid_snapshot</parameter>)</function></literal></entry>
         <entry><type>bigint</type></entry>
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c

index c63dfa0bafc606ea3dc1ee9c7427d92c28ed09d4..a01edd75d9d1c64ed25c849d905e7164e49a6fc4 100644 (file)
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3714,9 +3714,8 @@ l2:
                                 update_xact = InvalidTransactionId;
  
                         /*
-                        * There was no UPDATE in the MultiXact; or it aborted. No
-                        * TransactionIdIsInProgress() call needed here, since we called
-                        * MultiXactIdWait() above.
+                        * There was no UPDATE in the MultiXact; or it aborted. It cannot
+                        * be in-progress anymore, since we called MultiXactIdWait() above.
                          */
                         if (!TransactionIdIsValid(update_xact) ||
                                 TransactionIdDidAbort(update_xact))
@@ -5271,7 +5270,7 @@ heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
   * either here, or within MultiXactIdExpand.
   *
   * There is a similar race condition possible when the old xmax was a regular
- * TransactionId.  We test TransactionIdIsInProgress again just to narrow the
+ * TransactionId.  We test TransactionIdGetStatus again just to narrow the
   * window, but it's still possible to end up creating an unnecessary
   * MultiXactId.  Fortunately this is harmless.
   */
@@ -5282,6 +5281,7 @@ compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
                                                   TransactionId *result_xmax, uint16 *result_infomask,
                                                   uint16 *result_infomask2)
  {
+       TransactionIdStatus xidstatus;
         TransactionId new_xmax;
         uint16          new_infomask,
                                 new_infomask2;
@@ -5417,7 +5417,7 @@ l5:
                 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
                 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
         }
-       else if (TransactionIdIsInProgress(xmax))
+       else if ((xidstatus = TransactionIdGetStatus(xmax)) == XID_INPROGRESS)
         {
                 /*
                  * If the XMAX is a valid, in-progress TransactionId, then we need to
@@ -5446,8 +5446,9 @@ l5:
                                 /*
                                  * LOCK_ONLY can be present alone only when a page has been
                                  * upgraded by pg_upgrade.  But in that case,
-                                * TransactionIdIsInProgress() should have returned false.  We
-                                * assume it's no longer locked in this case.
+                                * TransactionIdGetStatus() should not have returned
+                                * XID_INPROGRESS.  We assume it's no longer locked in this
+                                * case.
                                  */
                                 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
                                 old_infomask |= HEAP_XMAX_INVALID;
@@ -5500,7 +5501,7 @@ l5:
                 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
         }
         else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
-                        TransactionIdDidCommit(xmax))
+                        xidstatus == XID_COMMITTED)
         {
                 /*
                  * It's a committed update, so we gotta preserve him as updater of the
@@ -5529,7 +5530,7 @@ l5:
                 /*
                  * Can get here iff the locking/updating transaction was running when
                  * the infomask was extracted from the tuple, but finished before
-                * TransactionIdIsInProgress got to run.  Deal with it as if there was
+                * TransactionIdGetStatus got to run.  Deal with it as if there was
                  * no locker at all in the first place.
                  */
                 old_infomask |= HEAP_XMAX_INVALID;
@@ -5560,15 +5561,11 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
                                                    LockTupleMode mode, bool *needwait)
  {
         MultiXactStatus wantedstatus;
+       TransactionIdStatus xidstatus;
  
         *needwait = false;
         wantedstatus = get_mxact_status_for_lock(mode, false);
  
-       /*
-        * Note: we *must* check TransactionIdIsInProgress before
-        * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an
-        * explanation.
-        */
         if (TransactionIdIsCurrentTransactionId(xid))
         {
                 /*
@@ -5577,7 +5574,9 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
                  */
                 return HeapTupleSelfUpdated;
         }
-       else if (TransactionIdIsInProgress(xid))
+       xidstatus = TransactionIdGetStatus(xid);
+
+       if (xidstatus == XID_INPROGRESS)
         {
                 /*
                  * If the locking transaction is running, what we do depends on
@@ -5597,9 +5596,9 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
                  */
                 return HeapTupleMayBeUpdated;
         }
-       else if (TransactionIdDidAbort(xid))
+       else if (xidstatus == XID_ABORTED)
                 return HeapTupleMayBeUpdated;
-       else if (TransactionIdDidCommit(xid))
+       else if (xidstatus == XID_COMMITTED)
         {
                 /*
                  * The other transaction committed.  If it was only a locker, then the
@@ -5612,7 +5611,7 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
                  * Note: the reason we worry about ISUPDATE here is because as soon as
                  * a transaction ends, all its locks are gone and meaningless, and
                  * thus we can ignore them; whereas its updates persist.  In the
-                * TransactionIdIsInProgress case, above, we don't need to check
+                * XID_INPROGRESS case, above, we don't need to check
                  * because we know the lock is still "alive" and thus a conflict needs
                  * always be checked.
                  */
@@ -5626,9 +5625,7 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
  
                 return HeapTupleMayBeUpdated;
         }
-
-       /* Not in progress, not aborted, not committed -- must have crashed */
-       return HeapTupleMayBeUpdated;
+       return 0; /* not reached */
  }
  
  
@@ -6372,7 +6369,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
                          */
                         if (TransactionIdPrecedes(xid, cutoff_xid))
                         {
-                               Assert(!TransactionIdDidCommit(xid));
+                               Assert(TransactionIdGetStatus(xid) == XID_ABORTED);
                                 *flags |= FRM_INVALIDATE_XMAX;
                                 xid = InvalidTransactionId;             /* not strictly necessary */
                         }
@@ -6443,6 +6440,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
                 if (ISUPDATE_from_mxstatus(members[i].status))
                 {
                         TransactionId xid = members[i].xid;
+                       TransactionIdStatus xidstatus;
  
                         /*
                          * It's an update; should we keep it?  If the transaction is known
@@ -6450,18 +6448,14 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
                          * Note that an updater older than cutoff_xid cannot possibly be
                          * committed, because HeapTupleSatisfiesVacuum would have returned
                          * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
-                        *
-                        * As with all tuple visibility routines, it's critical to test
-                        * TransactionIdIsInProgress before TransactionIdDidCommit,
-                        * because of race conditions explained in detail in tqual.c.
                          */
-                       if (TransactionIdIsCurrentTransactionId(xid) ||
-                               TransactionIdIsInProgress(xid))
+                       xidstatus = TransactionIdGetStatus(xid);
+                       if (xidstatus == XID_INPROGRESS)
                         {
                                 Assert(!TransactionIdIsValid(update_xid));
                                 update_xid = xid;
                         }
-                       else if (TransactionIdDidCommit(xid))
+                       else if (xidstatus == XID_COMMITTED)
                         {
                                 /*
                                  * The transaction committed, so we can tell caller to set
@@ -6499,8 +6493,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
                 else
                 {
                         /* We only keep lockers if they are still running */
-                       if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
-                               TransactionIdIsInProgress(members[i].xid))
+                       if (TransactionIdGetStatus(members[i].xid) == XID_INPROGRESS)
                         {
                                 /* running locker cannot possibly be older than the cutoff */
                                 Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
@@ -6974,6 +6967,7 @@ DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
                 {
                         TransactionId memxid;
                         LOCKMODE        memlockmode;
+                       TransactionIdStatus     xidstatus;
  
                         memlockmode = LOCKMODE_from_mxstatus(members[i].status);
  
@@ -6986,16 +6980,18 @@ DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
                         if (TransactionIdIsCurrentTransactionId(memxid))
                                 continue;
  
+                       xidstatus = TransactionIdGetStatus(memxid);
+
                         if (ISUPDATE_from_mxstatus(members[i].status))
                         {
                                 /* ignore aborted updaters */
-                               if (TransactionIdDidAbort(memxid))
+                               if (xidstatus == XID_ABORTED)
                                         continue;
                         }
                         else
                         {
                                 /* ignore lockers-only that are no longer in progress */
-                               if (!TransactionIdIsInProgress(memxid))
+                               if (xidstatus != XID_INPROGRESS)
                                         continue;
                         }
  
@@ -7075,7 +7071,7 @@ Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
                         if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
                                                                          LOCKMODE_from_mxstatus(status)))
                         {
-                               if (remaining && TransactionIdIsInProgress(memxid))
+                               if (remaining && TransactionIdGetStatus(memxid) == XID_INPROGRESS)
                                         remain++;
                                 continue;
                         }
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README

index 067d15c803916e3524e992dbbf4dbf2e40559fac..92b76aa8faacebdb66b23508b636bebc5581e2e7 100644 (file)
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -321,6 +321,9 @@ older than RecentGlobalXmin.  As collateral damage, this implementation
  also waits for running XIDs with no snapshots and for snapshots taken
  until the next transaction to allocate an XID commits.
  
+XXX: now that we use CSNs as snapshots, it would be more
+straightforward to use something based on CSNs instead of RecentGlobalXmin.
+
  Reclaiming a page doesn't actually change its state on disk --- we simply
  record it in the shared-memory free space map, from which it will be
  handed out the next time a new page is needed for a page split.  The
diff --git a/src/backend/access/rmgrdesc/standbydesc.c b/src/backend/access/rmgrdesc/standbydesc.c

index 13797a3d2f49b94ded08e97c754611ff2805d1bc..267ac5e51954227b3b71a107f39cbc5c5335445a 100644 (file)
--- a/src/backend/access/rmgrdesc/standbydesc.c
+++ b/src/backend/access/rmgrdesc/standbydesc.c
@@ -19,21 +19,10 @@
  static void
  standby_desc_running_xacts(StringInfo buf, xl_running_xacts *xlrec)
  {
-       int                     i;
-
         appendStringInfo(buf, "nextXid %u latestCompletedXid %u oldestRunningXid %u",
                                          xlrec->nextXid,
                                          xlrec->latestCompletedXid,
                                          xlrec->oldestRunningXid);
-       if (xlrec->xcnt > 0)
-       {
-               appendStringInfo(buf, "; %d xacts:", xlrec->xcnt);
-               for (i = 0; i < xlrec->xcnt; i++)
-                       appendStringInfo(buf, " %u", xlrec->xids[i]);
-       }
-
-       if (xlrec->subxid_overflow)
-               appendStringInfoString(buf, "; subxid ovf");
  }
  
  void
diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c

index 91d27d0654ea36816bdfba1f8e54f6da7a0afcf3..a9c7bc0fa0eafc3ed7cedfa17d8c902ce65bdd2c 100644 (file)
--- a/src/backend/access/rmgrdesc/xactdesc.c
+++ b/src/backend/access/rmgrdesc/xactdesc.c
@@ -255,17 +255,6 @@ xact_desc_abort(StringInfo buf, uint8 info, xl_xact_abort *xlrec)
         }
  }
  
-static void
-xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec)
-{
-       int                     i;
-
-       appendStringInfoString(buf, "subxacts:");
-
-       for (i = 0; i < xlrec->nsubxacts; i++)
-               appendStringInfo(buf, " %u", xlrec->xsub[i]);
-}
-
  void
  xact_desc(StringInfo buf, XLogReaderState *record)
  {
@@ -285,18 +274,6 @@ xact_desc(StringInfo buf, XLogReaderState *record)
  
                 xact_desc_abort(buf, XLogRecGetInfo(record), xlrec);
         }
-       else if (info == XLOG_XACT_ASSIGNMENT)
-       {
-               xl_xact_assignment *xlrec = (xl_xact_assignment *) rec;
-
-               /*
-                * Note that we ignore the WAL record's xid, since we're more
-                * interested in the top-level xid that issued the record and which
-                * xids are being reported here.
-                */
-               appendStringInfo(buf, "xtop %u: ", xlrec->xtop);
-               xact_desc_assignment(buf, xlrec);
-       }
  }
  
  const char *
@@ -321,9 +298,6 @@ xact_identify(uint8 info)
                 case XLOG_XACT_ABORT_PREPARED:
                         id = "ABORT_PREPARED";
                         break;
-               case XLOG_XACT_ASSIGNMENT:
-                       id = "ASSIGNMENT";
-                       break;
         }
  
         return id;
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile

index 16fbe47269a9be93ac6a283b93da54aa4b82454e..fea6d28e3336cf354d18c742655a7916c0468584 100644 (file)
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -12,8 +12,8 @@ subdir = src/backend/access/transam
  top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
-OBJS = clog.o commit_ts.o generic_xlog.o multixact.o parallel.o rmgr.o slru.o \
-       subtrans.o timeline.o transam.o twophase.o twophase_rmgr.o varsup.o \
+OBJS = clog.o commit_ts.o csnlog.o generic_xlog.o multixact.o parallel.o rmgr.o slru.o \
+       timeline.o transam.o twophase.o twophase_rmgr.o varsup.o \
         xact.o xlog.o xlogarchive.o xlogfuncs.o \
         xloginsert.o xlogreader.o xlogutils.o
  
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README

index 4ae4715339e707ab5ed879a628aa96b73002ef43..51b0d166be1003e86697379302e41197912066fb 100644 (file)
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -244,44 +244,24 @@ transaction Y as committed, then snapshot A must consider transaction Y as
  committed".
  
  What we actually enforce is strict serialization of commits and rollbacks
-with snapshot-taking: we do not allow any transaction to exit the set of
-running transactions while a snapshot is being taken.  (This rule is
-stronger than necessary for consistency, but is relatively simple to
-enforce, and it assists with some other issues as explained below.)  The
-implementation of this is that GetSnapshotData takes the ProcArrayLock in
-shared mode (so that multiple backends can take snapshots in parallel),
-but ProcArrayEndTransaction must take the ProcArrayLock in exclusive mode
-while clearing MyPgXact->xid at transaction end (either commit or abort).
-(To reduce context switching, when multiple transactions commit nearly
-simultaneously, we have one backend take ProcArrayLock and clear the XIDs
-of multiple processes at once.)
-
-ProcArrayEndTransaction also holds the lock while advancing the shared
-latestCompletedXid variable.  This allows GetSnapshotData to use
-latestCompletedXid + 1 as xmax for its snapshot: there can be no
-transaction >= this xid value that the snapshot needs to consider as
-completed.
-
-In short, then, the rule is that no transaction may exit the set of
-currently-running transactions between the time we fetch latestCompletedXid
-and the time we finish building our snapshot.  However, this restriction
-only applies to transactions that have an XID --- read-only transactions
-can end without acquiring ProcArrayLock, since they don't affect anyone
-else's snapshot nor latestCompletedXid.
-
-Transaction start, per se, doesn't have any interlocking with these
-considerations, since we no longer assign an XID immediately at transaction
-start.  But when we do decide to allocate an XID, GetNewTransactionId must
-store the new XID into the shared ProcArray before releasing XidGenLock.
-This ensures that all top-level XIDs <= latestCompletedXid are either
-present in the ProcArray, or not running anymore.  (This guarantee doesn't
-apply to subtransaction XIDs, because of the possibility that there's not
-room for them in the subxid array; instead we guarantee that they are
-present or the overflow flag is set.)  If a backend released XidGenLock
-before storing its XID into MyPgXact, then it would be possible for another
-backend to allocate and commit a later XID, causing latestCompletedXid to
-pass the first backend's XID, before that value became visible in the
-ProcArray.  That would break GetOldestXmin, as discussed below.
+with snapshot-taking. Each commit is assigned a Commit Sequence Number, or
+CSN for short, using a monotonically increasing counter. A snapshot is
+represented by the value of the CSN counter, at the time the snapshot was
+taken. All (committed) transactions with a CSN <= the snapshot's CSN are
+considered as visible to the snapshot.
+
+When checking the visibility of a tuple, we need to look up the CSN
+of the xmin/xmax. For that purpose, we store the CSN of each
+transaction in the Commit Sequence Number log (csnlog).
+
+So, a snapshot is simply a CSN, such that all transactions that committed
+before that CSN are visible, and everything later is still considered as
+in-progress. However, to avoid consulting the csnlog every time the visibility
+of a tuple is checked, we also record a lower and upper bound of the XIDs
+considered visible by the snapshot, in SnapshotData. When a snapshot is
+taken, xmax is set to the current nextXid value; any transaction that begins
+after the snapshot is surely still running. The xmin is tracked lazily in
+shared memory, by AdvanceRecentGlobalXmin().
  
  We allow GetNewTransactionId to store the XID into MyPgXact->xid (or the
  subxid array) without taking ProcArrayLock.  This was once necessary to
@@ -293,48 +273,34 @@ once, rather than assume they can read it multiple times and get the same
  answer each time.  (Use volatile-qualified pointers when doing this, to
  ensure that the C compiler does exactly what you tell it to.)
  
-Another important activity that uses the shared ProcArray is GetOldestXmin,
-which must determine a lower bound for the oldest xmin of any active MVCC
-snapshot, system-wide.  Each individual backend advertises the smallest
-xmin of its own snapshots in MyPgXact->xmin, or zero if it currently has no
+Another important activity that uses the shared ProcArray is GetOldestSnapshot,
+which must determine a lower bound for the oldest of any active MVCC
+snapshots, system-wide.  Each individual backend advertises the earliest
+of its own snapshots in MyPgXact->snapshotcsn, or zero if it currently has no
  live snapshots (eg, if it's between transactions or hasn't yet set a
-snapshot for a new transaction).  GetOldestXmin takes the MIN() of the
-valid xmin fields.  It does this with only shared lock on ProcArrayLock,
-which means there is a potential race condition against other backends
-doing GetSnapshotData concurrently: we must be certain that a concurrent
-backend that is about to set its xmin does not compute an xmin less than
-what GetOldestXmin returns.  We ensure that by including all the active
-XIDs into the MIN() calculation, along with the valid xmins.  The rule that
-transactions can't exit without taking exclusive ProcArrayLock ensures that
-concurrent holders of shared ProcArrayLock will compute the same minimum of
-currently-active XIDs: no xact, in particular not the oldest, can exit
-while we hold shared ProcArrayLock.  So GetOldestXmin's view of the minimum
-active XID will be the same as that of any concurrent GetSnapshotData, and
-so it can't produce an overestimate.  If there is no active transaction at
-all, GetOldestXmin returns latestCompletedXid + 1, which is a lower bound
-for the xmin that might be computed by concurrent or later GetSnapshotData
-calls.  (We know that no XID less than this could be about to appear in
-the ProcArray, because of the XidGenLock interlock discussed above.)
-
-GetSnapshotData also performs an oldest-xmin calculation (which had better
-match GetOldestXmin's) and stores that into RecentGlobalXmin, which is used
-for some tuple age cutoff checks where a fresh call of GetOldestXmin seems
-too expensive.  Note that while it is certain that two concurrent
-executions of GetSnapshotData will compute the same xmin for their own
-snapshots, as argued above, it is not certain that they will arrive at the
-same estimate of RecentGlobalXmin.  This is because we allow XID-less
-transactions to clear their MyPgXact->xmin asynchronously (without taking
-ProcArrayLock), so one execution might see what had been the oldest xmin,
-and another not.  This is OK since RecentGlobalXmin need only be a valid
-lower bound.  As noted above, we are already assuming that fetch/store
-of the xid fields is atomic, so assuming it for xmin as well is no extra
-risk.
-
-
-pg_clog and pg_subtrans
+snapshot for a new transaction).  GetOldestSnapshot takes the MIN() of the
+snapshots.
+
+For freezing tuples, vacuum needs to know the oldest XID that is still
+considered running by any active transaction. That is, the oldest XID still
+considered running by the oldest active snapshot, as returned by
+GetOldestSnapshotCSN(). This value is somewhat expensive to calculate, so
+the most recently calculated value is kept in shared memory
+(SharedVariableCache->recentXmin), and is recalculated lazily by the
+AdvanceRecentGlobalXmin() function. AdvanceRecentGlobalXmin() first scans
+the proc array, and makes note of the oldest active XID. That XID - 1 will
+become the new xmin. It then waits until all currently active snapshots have
+finished. Any snapshot that begins later will see the xmin as finished, so
+after all the active snapshots have finished, xmin will be visible to
+everyone. However, AdvanceRecentGlobalXmin() does not actually block waiting
+for anything; instead it contains a state machine that advances if possible,
+when AdvanceRecentGlobalXmin() is called. AdvanceRecentGlobalXmin() is
+called periodically by the WAL writer, so that it doesn't get very stale.
+
+pg_clog and pg_csnlog
  -----------------------
  
-pg_clog and pg_subtrans are permanent (on-disk) storage of transaction related
+pg_clog and pg_csnlog are permanent (on-disk) storage of transaction related
  information.  There is a limited number of pages of each kept in memory, so
  in many cases there is no need to actually read from disk.  However, if
  there's a long running transaction or a backend sitting idle with an open
@@ -343,21 +309,10 @@ from disk.  They also allow information to be permanent across server restarts.
  
  pg_clog records the commit status for each transaction that has been assigned
  an XID.  A transaction can be in progress, committed, aborted, or
-"sub-committed".  This last state means that it's a subtransaction that's no
-longer running, but its parent has not updated its state yet.  It is not
-necessary to update a subtransaction's transaction status to subcommit, so we
-can just defer it until main transaction commit.  The main role of marking
-transactions as sub-committed is to provide an atomic commit protocol when
-transaction status is spread across multiple clog pages. As a result, whenever
-transaction status spreads across multiple pages we must use a two-phase commit
-protocol: the first phase is to mark the subtransactions as sub-committed, then
-we mark the top level transaction and all its subtransactions committed (in
-that order).  Thus, subtransactions that have not aborted appear as in-progress
-even when they have already finished, and the subcommit status appears as a
-very short transitory state during main transaction commit.  Subtransaction
-abort is always marked in clog as soon as it occurs.  When the transaction
-status all fit in a single CLOG page, we atomically mark them all as committed
-without bothering with the intermediate sub-commit state.
+"committing". For committed transactions, the clog stores the commit WAL
+record's LSN. The "committing" state means that the transaction is just about
+to write its commit WAL record, or just did so, but it hasn't yet updated the
+clog with the record's LSN.
  
  Savepoints are implemented using subtransactions.  A subtransaction is a
  transaction inside a transaction; its commit or abort status is not only
@@ -370,7 +325,7 @@ transaction.
  The "subtransaction parent" (pg_subtrans) mechanism records, for each
  transaction with an XID, the TransactionId of its parent transaction.  This
  information is stored as soon as the subtransaction is assigned an XID.
-Top-level transactions do not have a parent, so they leave their pg_subtrans
+Top-level transactions do not have a parent, so they leave their pg_csnlog
  entries set to the default value of zero (InvalidTransactionId).
  
  pg_subtrans is used to check whether the transaction in question is still
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c

index 263447679b8991ff35796d421bb790e6d2f5dd28..0c382d15dd689572bda91280dd22d25059aaa736 100644 (file)
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -33,6 +33,7 @@
  #include "postgres.h"
  
  #include "access/clog.h"
+#include "access/mvccvars.h"
  #include "access/slru.h"
  #include "access/transam.h"
  #include "access/xlog.h"
@@ -84,17 +85,15 @@ static int  ZeroCLOGPage(int pageno, bool writeXlog);
  static bool CLOGPagePrecedes(int page1, int page2);
  static void WriteZeroPageXlogRec(int pageno);
  static void WriteTruncateXlogRec(int pageno);
-static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
-                                                  TransactionId *subxids, XidStatus status,
+static void CLogSetPageStatus(TransactionId xid, int nsubxids,
+                                                  TransactionId *subxids, CLogXidStatus status,
                                                    XLogRecPtr lsn, int pageno);
-static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status,
+static void CLogSetStatusBit(TransactionId xid, CLogXidStatus status,
                                                   XLogRecPtr lsn, int slotno);
-static void set_status_by_pages(int nsubxids, TransactionId *subxids,
-                                       XidStatus status, XLogRecPtr lsn);
  
  
  /*
- * TransactionIdSetTreeStatus
+ * CLogSetTreeStatus
   *
   * Record the final state of transaction entries in the commit log for
   * a transaction and its subtransaction tree. Take care to ensure this is
@@ -112,30 +111,13 @@ static void set_status_by_pages(int nsubxids, TransactionId *subxids,
   * caller guarantees the commit record is already flushed in that case.  It
   * should be InvalidXLogRecPtr for abort cases, too.
   *
- * In the commit case, atomicity is limited by whether all the subxids are in
- * the same CLOG page as xid.  If they all are, then the lock will be grabbed
- * only once, and the status will be set to committed directly.  Otherwise
- * we must
- *      1. set sub-committed all subxids that are not on the same page as the
- *             main xid
- *      2. atomically set committed the main xid and the subxids on the same page
- *      3. go over the first bunch again and set them committed
- * Note that as far as concurrent checkers are concerned, main transaction
- * commit as a whole is still atomic.
- *
- * Example:
- *             TransactionId t commits and has subxids t1, t2, t3, t4
- *             t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
- *             1. update pages2-3:
- *                                     page2: set t2,t3 as sub-committed
- *                                     page3: set t4 as sub-committed
- *             2. update page1:
- *                                     set t1 as sub-committed,
- *                                     then set t as committed,
-                                       then set t1 as committed
- *             3. update pages2-3:
- *                                     page2: set t2,t3 as committed
- *                                     page3: set t4 as committed
+ * The atomicity is limited by whether all the subxids are in the same CLOG
+ * page as xid.  If they all are, then the lock will be grabbed only once,
+ * and the status will be set to committed directly.  Otherwise there is
+ * a window in which the parent will be seen as committed, while (some of) the
+ * children are still seen as in-progress. That's OK with the current use,
+ * as visibility checking code will not rely on the CLOG for recent
+ * transactions (CSNLOG will be used instead).
   *
   * NB: this is a low-level routine and is NOT the preferred entry point
   * for most uses; functions in transam.c are the intended callers.
@@ -145,102 +127,45 @@ static void set_status_by_pages(int nsubxids, TransactionId *subxids,
   * cache yet.
   */
  void
-TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
-                                       TransactionId *subxids, XidStatus status, XLogRecPtr lsn)
+CLogSetTreeStatus(TransactionId xid, int nsubxids,
+                                 TransactionId *subxids, CLogXidStatus status, XLogRecPtr lsn)
  {
-       int                     pageno = TransactionIdToPage(xid);              /* get page of parent */
+       TransactionId topXid;
+       int                     pageno;
         int                     i;
+       int                     offset;
  
-       Assert(status == TRANSACTION_STATUS_COMMITTED ||
-                  status == TRANSACTION_STATUS_ABORTED);
-
-       /*
-        * See how many subxids, if any, are on the same page as the parent, if
-        * any.
-        */
-       for (i = 0; i < nsubxids; i++)
-       {
-               if (TransactionIdToPage(subxids[i]) != pageno)
-                       break;
-       }
+       Assert(status == CLOG_XID_STATUS_COMMITTED ||
+                  status == CLOG_XID_STATUS_ABORTED);
  
         /*
-        * Do all items fit on a single page?
+        * Update the clog page-by-page. On first iteration, we will set the
+        * status of the top-XID, and any subtransactions on the same page.
          */
-       if (i == nsubxids)
-       {
-               /*
-                * Set the parent and all subtransactions in a single call
-                */
-               TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn,
-                                                                  pageno);
-       }
-       else
-       {
-               int                     nsubxids_on_first_page = i;
-
-               /*
-                * If this is a commit then we care about doing this correctly (i.e.
-                * using the subcommitted intermediate status).  By here, we know
-                * we're updating more than one page of clog, so we must mark entries
-                * that are *not* on the first page so that they show as subcommitted
-                * before we then return to update the status to fully committed.
-                *
-                * To avoid touching the first page twice, skip marking subcommitted
-                * for the subxids on that first page.
-                */
-               if (status == TRANSACTION_STATUS_COMMITTED)
-                       set_status_by_pages(nsubxids - nsubxids_on_first_page,
-                                                               subxids + nsubxids_on_first_page,
-                                                               TRANSACTION_STATUS_SUB_COMMITTED, lsn);
-
-               /*
-                * Now set the parent and subtransactions on same page as the parent,
-                * if any
-                */
-               pageno = TransactionIdToPage(xid);
-               TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status,
-                                                                  lsn, pageno);
-
-               /*
-                * Now work through the rest of the subxids one clog page at a time,
-                * starting from the second page onwards, like we did above.
-                */
-               set_status_by_pages(nsubxids - nsubxids_on_first_page,
-                                                       subxids + nsubxids_on_first_page,
-                                                       status, lsn);
-       }
-}
-
-/*
- * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
- * transactions, chunking in the separate CLOG pages involved. We never
- * pass the whole transaction tree to this function, only subtransactions
- * that are on different pages to the top level transaction id.
- */
-static void
-set_status_by_pages(int nsubxids, TransactionId *subxids,
-                                       XidStatus status, XLogRecPtr lsn)
-{
-       int                     pageno = TransactionIdToPage(subxids[0]);
-       int                     offset = 0;
-       int                     i = 0;
-
-       while (i < nsubxids)
+       pageno = TransactionIdToPage(xid);              /* get page of parent */
+       topXid = xid;
+       offset = 0;
+       i = 0;
+       for (;;)
         {
                 int                     num_on_page = 0;
  
-               while (TransactionIdToPage(subxids[i]) == pageno && i < nsubxids)
+               while (i < nsubxids && TransactionIdToPage(subxids[i]) == pageno)
                 {
                         num_on_page++;
                         i++;
                 }
  
-               TransactionIdSetPageStatus(InvalidTransactionId,
-                                                                  num_on_page, subxids + offset,
-                                                                  status, lsn, pageno);
+               CLogSetPageStatus(topXid,
+                                                 num_on_page, subxids + offset,
+                                                 status, lsn, pageno);
+
+               if (i == nsubxids)
+                       break;
+
                 offset = i;
                 pageno = TransactionIdToPage(subxids[offset]);
+               topXid = InvalidTransactionId;
         }
  }
  
@@ -248,19 +173,18 @@ set_status_by_pages(int nsubxids, TransactionId *subxids,
   * Record the final state of transaction entries in the commit log for
   * all entries on a single page.  Atomic only on this page.
   *
- * Otherwise API is same as TransactionIdSetTreeStatus()
+ * Otherwise API is same as CLogSetTreeStatus()
   */
  static void
-TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
-                                                  TransactionId *subxids, XidStatus status,
-                                                  XLogRecPtr lsn, int pageno)
+CLogSetPageStatus(TransactionId xid, int nsubxids,
+                                 TransactionId *subxids, CLogXidStatus status,
+                                 XLogRecPtr lsn, int pageno)
  {
         int                     slotno;
         int                     i;
  
-       Assert(status == TRANSACTION_STATUS_COMMITTED ||
-                  status == TRANSACTION_STATUS_ABORTED ||
-                  (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
+       Assert(status == CLOG_XID_STATUS_COMMITTED ||
+                  status == CLOG_XID_STATUS_ABORTED);
  
         LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
  
@@ -275,38 +199,15 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
          */
         slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);
  
-       /*
-        * Set the main transaction id, if any.
-        *
-        * If we update more than one xid on this page while it is being written
-        * out, we might find that some of the bits go to disk and others don't.
-        * If we are updating commits on the page with the top-level xid that
-        * could break atomicity, so we subcommit the subxids first before we mark
-        * the top-level commit.
-        */
+       /* Set the main transaction id, if any. */
         if (TransactionIdIsValid(xid))
-       {
-               /* Subtransactions first, if needed ... */
-               if (status == TRANSACTION_STATUS_COMMITTED)
-               {
-                       for (i = 0; i < nsubxids; i++)
-                       {
-                               Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
-                               TransactionIdSetStatusBit(subxids[i],
-                                                                                 TRANSACTION_STATUS_SUB_COMMITTED,
-                                                                                 lsn, slotno);
-                       }
-               }
-
-               /* ... then the main transaction */
-               TransactionIdSetStatusBit(xid, status, lsn, slotno);
-       }
+               CLogSetStatusBit(xid, status, lsn, slotno);
  
         /* Set the subtransactions */
         for (i = 0; i < nsubxids; i++)
         {
                 Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
-               TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
+               CLogSetStatusBit(subxids[i], status, lsn, slotno);
         }
  
         ClogCtl->shared->page_dirty[slotno] = true;
@@ -320,7 +221,7 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
   * Must be called with CLogControlLock held
   */
  static void
-TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno)
+CLogSetStatusBit(TransactionId xid, CLogXidStatus status, XLogRecPtr lsn, int slotno)
  {
         int                     byteno = TransactionIdToByte(xid);
         int                     bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
@@ -331,23 +232,13 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
         byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
         curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
  
-       /*
-        * When replaying transactions during recovery we still need to perform
-        * the two phases of subcommit and then commit. However, some transactions
-        * are already correctly marked, so we just treat those as a no-op which
-        * allows us to keep the following Assert as restrictive as possible.
-        */
-       if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED &&
-               curval == TRANSACTION_STATUS_COMMITTED)
-               return;
-
         /*
          * Current state change should be from 0 or subcommitted to target state
          * or we should already be there when replaying changes during recovery.
          */
         Assert(curval == 0 ||
-                  (curval == TRANSACTION_STATUS_SUB_COMMITTED &&
-                       status != TRANSACTION_STATUS_IN_PROGRESS) ||
+                  (curval == CLOG_XID_STATUS_SUB_COMMITTED &&
+                       status != CLOG_XID_STATUS_IN_PROGRESS) ||
                    curval == status);
  
         /* note this assumes exclusive access to the clog page */
@@ -388,8 +279,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
   * NB: this is a low-level routine and is NOT the preferred entry point
   * for most uses; TransactionLogFetch() in transam.c is the intended caller.
   */
-XidStatus
-TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
+CLogXidStatus
+CLogGetStatus(TransactionId xid, XLogRecPtr *lsn)
  {
         int                     pageno = TransactionIdToPage(xid);
         int                     byteno = TransactionIdToByte(xid);
@@ -397,7 +288,7 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
         int                     slotno;
         int                     lsnindex;
         char       *byteptr;
-       XidStatus       status;
+       CLogXidStatus   status;
  
         /* lock is acquired by SimpleLruReadPage_ReadOnly */
  
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c

index e330105217d83c4bb7075cccb207a07d8d91d252..0e7aba12fa696248c8139ff57dd7d0c370217dd4 100644 (file)
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -26,6 +26,7 @@
  
  #include "access/commit_ts.h"
  #include "access/htup_details.h"
+#include "access/mvccvars.h"
  #include "access/slru.h"
  #include "access/transam.h"
  #include "catalog/pg_type.h"
diff --git a/src/backend/access/transam/csnlog.c b/src/backend/access/transam/csnlog.c

new file mode 100644 (file)

index 0000000..23ad93c
--- /dev/null
+++ b/src/backend/access/transam/csnlog.c
@@ -0,0 +1,599 @@
+/*-------------------------------------------------------------------------
+ *
+ * csnlog.c
+ *             Tracking Commit-Sequence-Numbers and in-progress subtransactions
+ *
+ * The pg_csnlog manager is a pg_clog-like manager that stores the commit
+ * sequence number, or parent transaction Id, for each transaction.  It is
+ * a fundamental part of MVCC.
+ *
+ * The csnlog serves two purposes:
+ *
+ * 1. While a transaction is in progress, it stores the parent transaction
+ * Id for each in-progress subtransaction. A main transaction has a parent
+ * of InvalidTransactionId, and each subtransaction has its immediate
+ * parent. The tree can easily be walked from child to parent, but not in
+ * the opposite direction.
+ *
+ * 2. After a transaction has committed, it stores the Commit Sequence
+ * Number of the commit.
+ *
+ * We can use the same structure for both, because we don't care about the
+ * parent-child relationships of subtransactions after commit.
+ *
+ * This code is based on clog.c, but the robustness requirements
+ * are completely different from pg_clog, because we only need to remember
+ * pg_csnlog information for currently-open and recently committed
+ * transactions.  Thus, there is no need to preserve data over a crash and
+ * restart.
+ *
+ * There are no XLOG interactions since we do not care about preserving
+ * data across crashes.  During database startup, we simply force the
+ * currently-active page of CSNLOG to zeroes.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/csnlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/csnlog.h"
+#include "access/mvccvars.h"
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "utils/snapmgr.h"
+
+/*
+ * Defines for CSNLOG page sizes.  A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * CSNLOG page numbering also wraps around at 0xFFFFFFFF/CSNLOG_XACTS_PER_PAGE,
+ * and CSNLOG segment numbering at
+ * 0xFFFFFFFF/CSNLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateCSNLOG (see CSNLOGPagePrecedes).
+ */
+
+/* We store one CommitSeqNo for each xid */
+#define CSNLOG_XACTS_PER_PAGE (BLCKSZ / sizeof(CommitSeqNo))
+
+#define TransactionIdToPage(xid)       ((xid) / (TransactionId) CSNLOG_XACTS_PER_PAGE)
+#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CSNLOG_XACTS_PER_PAGE)
+
+/*
+ * Link to shared-memory data structures for CSNLOG control
+ */
+static SlruCtlData CsnlogCtlData;
+
+#define CsnlogCtl (&CsnlogCtlData)
+
+
+static int     ZeroCSNLOGPage(int pageno);
+static bool CSNLOGPagePrecedes(int page1, int page2);
+static void CSNLogSetPageStatus(TransactionId xid, int nsubxids,
+                                                  TransactionId *subxids,
+                                                  CommitSeqNo csn, int pageno);
+static void CSNLogSetCSN(TransactionId xid, CommitSeqNo csn, int slotno);
+
+/*
+ * CSNLogSetCommitSeqNo
+ *
+ * Record the CSN of a transaction and its subtransaction tree in the
+ * csnlog. Take care to ensure this is efficient, and as atomic as
+ * possible.
+ *
+ * xid is a single xid to set status for. This will typically be the
+ * top level transactionid for a top level commit or abort. It can
+ * also be a subtransaction when we record transaction aborts.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid. In various cases nsubxids may be zero.
+ *
+ * csn is the value stored for xid and all subxids.  InvalidCommitSeqNo is
+ * rejected with an ERROR, except in bootstrap mode (see the check below).
+ *
+ * Note: This doesn't guarantee atomicity. The caller can use the
+ * COMMITSEQNO_COMMITTING special value for that.
+ */
+void
+CSNLogSetCommitSeqNo(TransactionId xid, int nsubxids,
+                                        TransactionId *subxids, CommitSeqNo csn)
+{
+       int                     pageno;
+       int                     i = 0;
+       int                     offset = 0;
+
+       if (csn == InvalidCommitSeqNo || xid == BootstrapTransactionId)
+       {
+               if (IsBootstrapProcessingMode())
+                       csn = COMMITSEQNO_FROZEN;       /* stored as frozen instead */
+               else
+                       elog(ERROR, "cannot mark transaction committed without CSN");
+       }
+
+       pageno = TransactionIdToPage(xid);              /* get page of parent */
+       for (;;)
+       {
+               int                     num_on_page = 0;
+
+               while (i < nsubxids && TransactionIdToPage(subxids[i]) == pageno)
+               {
+                       num_on_page++;
+                       i++;
+               }
+
+               CSNLogSetPageStatus(xid,
+                                                       num_on_page, subxids + offset,
+                                                       csn, pageno);
+               if (i >= nsubxids)
+                       break;
+
+               offset = i;
+               pageno = TransactionIdToPage(subxids[offset]);
+               xid = InvalidTransactionId;     /* parent was set with the first page */
+       }
+}
+
+/*
+ * Record the final state of transaction entries in the csn log for
+ * all entries on a single page.  Atomic only on this page.
+ *
+ * Otherwise API is same as CSNLogSetCommitSeqNo()
+ */
+static void
+CSNLogSetPageStatus(TransactionId xid, int nsubxids,
+                                                  TransactionId *subxids,
+                                                  CommitSeqNo csn, int pageno)
+{
+       int                     slotno;
+       int                     i;
+
+       LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+
+       slotno = SimpleLruReadPage(CsnlogCtl, pageno, true, xid);
+
+       /* Subtransactions on this page first ... */
+       for (i = 0; i < nsubxids; i++)
+       {
+               Assert(CsnlogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+               CSNLogSetCSN(subxids[i],        csn, slotno);
+       }
+
+       /* ... then the main transaction, if the caller passed one */
+       if (TransactionIdIsValid(xid))
+               CSNLogSetCSN(xid, csn, slotno);
+
+       CsnlogCtl->shared->page_dirty[slotno] = true;
+
+       LWLockRelease(CSNLogControlLock);
+}
+
+
+
+/*
+ * Record the parent of a subtransaction in the csnlog.
+ *
+ * In some cases we may need to overwrite an existing value.
+ */
+void
+SubTransSetParent(TransactionId xid, TransactionId parent, bool overwriteOK)
+{
+       int                     pageno = TransactionIdToPage(xid);
+       int                     entryno = TransactionIdToPgIndex(xid);
+       int                     slotno;
+       CommitSeqNo *ptr;
+       CommitSeqNo newcsn;
+
+       Assert(TransactionIdIsValid(parent));
+
+       newcsn = CSN_SUBTRANS_BIT | (uint64) parent;    /* tag value as a parent link */
+
+       LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+
+       slotno = SimpleLruReadPage(CsnlogCtl, pageno, true, xid);
+       ptr = (CommitSeqNo *) CsnlogCtl->shared->page_buffer[slotno];
+       ptr += entryno;
+
+       /* Must be in-progress, or already this same link if overwriteOK */
+       Assert(*ptr == COMMITSEQNO_INPROGRESS ||
+                  (*ptr == newcsn && overwriteOK));
+
+       *ptr = newcsn;
+
+       CsnlogCtl->shared->page_dirty[slotno] = true;
+
+       LWLockRelease(CSNLogControlLock);
+}
+
+/*
+ * Interrogate the parent of a transaction in the csnlog.
+ */
+TransactionId
+SubTransGetParent(TransactionId xid)
+{
+       CommitSeqNo csn;
+
+       csn = CSNLogGetCommitSeqNo(xid);
+
+       if (COMMITSEQNO_IS_SUBTRANS(csn))
+               return (TransactionId) (csn & 0xFFFFFFFF);      /* low 32 bits: parent xid */
+       else
+               return InvalidTransactionId;    /* entry is not a parent link */
+}
+
+/*
+ * SubTransGetTopmostTransaction
+ *
+ * Returns the topmost transaction of the given transaction id.
+ *
+ * Because we cannot look back further than TransactionXmin, it is possible
+ * that this function will lie and return an intermediate subtransaction ID
+ * instead of the true topmost parent ID.  This is OK, because in practice
+ * we only care about detecting whether the topmost parent is still running
+ * or is part of a current snapshot's list of still-running transactions.
+ * Therefore, any XID before TransactionXmin is as good as any other.
+ */
+TransactionId
+SubTransGetTopmostTransaction(TransactionId xid)
+{
+       TransactionId parentXid = xid,
+                               previousXid = xid;
+
+       /* Can't ask about stuff that might not be around anymore */
+       Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
+
+       while (TransactionIdIsValid(parentXid))
+       {
+               previousXid = parentXid;
+               if (TransactionIdPrecedes(parentXid, TransactionXmin))
+                       break;          /* must not look up anything older than this */
+               parentXid = SubTransGetParent(parentXid);
+       }
+
+       Assert(TransactionIdIsValid(previousXid));
+
+       return previousXid;
+}
+
+
+
+
+/*
+ * Sets the csnlog entry of a single transaction.
+ *
+ * Must be called with CSNLogControlLock held
+ */
+static void
+CSNLogSetCSN(TransactionId xid, CommitSeqNo csn, int slotno)
+{
+       int                     entryno = TransactionIdToPgIndex(xid);
+       CommitSeqNo *ptr;
+
+       ptr = ((CommitSeqNo *) CsnlogCtl->shared->page_buffer[slotno]) + entryno;
+
+       /*
+        * The entry must still be unresolved (in-progress, committing, or a
+        * subtransaction parent link), or already hold the same value.
+        */
+       Assert(COMMITSEQNO_IS_INPROGRESS(*ptr) ||
+                  COMMITSEQNO_IS_COMMITTING(*ptr) ||
+                  COMMITSEQNO_IS_SUBTRANS(*ptr) ||
+                  *ptr == csn);
+
+       *ptr = csn;
+}
+
+/*
+ * Interrogate the state of a transaction in the csnlog.
+ *
+ * Returns the raw CommitSeqNo stored for xid.  Depending on the state of
+ * the transaction this is a real commit sequence number, one of the
+ * special COMMITSEQNO_* values, or a parent link written by
+ * SubTransSetParent() (test with COMMITSEQNO_IS_SUBTRANS()).
+ *
+ * The permanent xids are answered without reading the log:
+ * InvalidTransactionId reports COMMITSEQNO_ABORTED, while
+ * FrozenTransactionId and BootstrapTransactionId report COMMITSEQNO_FROZEN.
+ *
+ * For normal xids the caller must not ask about anything older than
+ * TransactionXmin, since that part of the log may no longer be around.
+ */
+CommitSeqNo
+CSNLogGetCommitSeqNo(TransactionId xid)
+{
+       int                     pageno = TransactionIdToPage(xid);
+       int                     entryno = TransactionIdToPgIndex(xid);
+       int                     slotno;
+       CommitSeqNo *ptr;
+       CommitSeqNo csn;
+
+       /* Permanent xids first: the TransactionXmin check is not meant for them */
+       if (!TransactionIdIsNormal(xid))
+       {
+               if (xid == InvalidTransactionId)
+                       return COMMITSEQNO_ABORTED;
+               if (xid == FrozenTransactionId || xid == BootstrapTransactionId)
+                       return COMMITSEQNO_FROZEN;
+       }
+
+       /* Can't ask about stuff that might not be around anymore */
+       Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
+
+       /* lock is acquired by SimpleLruReadPage_ReadOnly */
+       slotno = SimpleLruReadPage_ReadOnly(CsnlogCtl, pageno, xid);
+       ptr = ((CommitSeqNo *) CsnlogCtl->shared->page_buffer[slotno]) + entryno;
+
+       csn = *ptr;
+
+       LWLockRelease(CSNLogControlLock);
+
+       return csn;
+}
+
+/*
+ * Number of shared CSNLOG buffers: NBuffers / 512, clamped to 4..32.
+ */
+Size
+CSNLOGShmemBuffers(void)
+{
+       return Min(32, Max(4, NBuffers / 512));
+}
+
+/*
+ * Initialization of shared memory for CSNLOG
+ */
+Size
+CSNLOGShmemSize(void)
+{
+       return SimpleLruShmemSize(CSNLOGShmemBuffers(), 0);     /* same args as SimpleLruInit below */
+}
+
+void
+CSNLOGShmemInit(void)
+{
+       CsnlogCtl->PagePrecedes = CSNLOGPagePrecedes;   /* page-age comparator */
+       SimpleLruInit(CsnlogCtl, "CSNLOG Ctl", CSNLOGShmemBuffers(), 0,
+                                 CSNLogControlLock, "pg_csnlog", LWTRANCHE_CSNLOG_BUFFERS);
+}
+
+/*
+ * This func must be called ONCE on system install.  It creates
+ * the initial CSNLOG segment.  (The pg_csnlog directory is assumed to
+ * have been created by initdb, and CSNLOGShmemInit must have been
+ * called already.)
+ */
+void
+BootStrapCSNLOG(void)
+{
+       int                     slotno;
+
+       LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+
+       /* Create and zero the first page of the csnlog */
+       slotno = ZeroCSNLOGPage(0);
+
+       /* Make sure it's written out */
+       SimpleLruWritePage(CsnlogCtl, slotno);
+       Assert(!CsnlogCtl->shared->page_dirty[slotno]);
+
+       LWLockRelease(CSNLogControlLock);
+}
+
+/*
+ * Initialize (or reinitialize) a page of CSNLOG to zeroes.
+ * No XLOG record is emitted; csnlog data need not survive a crash.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroCSNLOGPage(int pageno)
+{
+       return SimpleLruZeroPage(CsnlogCtl, pageno);
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup,
+ * after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ *
+ * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
+ * if there are none.
+ */
+void
+StartupCSNLOG(TransactionId oldestActiveXID)
+{
+       int                     startPage;
+       int                     endPage;
+
+       /*
+        * Since we don't expect pg_csnlog to be valid across crashes, we
+        * initialize the currently-active page(s) to zeroes during startup.
+        * Whenever we advance into a new page, ExtendCSNLOG will likewise zero
+        * the new page without regard to whatever was previously on disk.
+        */
+       LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+
+       startPage = TransactionIdToPage(oldestActiveXID);
+       endPage = TransactionIdToPage(ShmemVariableCache->nextXid);
+
+       while (startPage != endPage)
+       {
+               (void) ZeroCSNLOGPage(startPage);
+               startPage++;
+               /* must account for wraparound */
+               if (startPage > TransactionIdToPage(MaxTransactionId))
+                       startPage = 0;
+       }
+       (void) ZeroCSNLOGPage(startPage);       /* finally endPage itself */
+
+       LWLockRelease(CSNLogControlLock);
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend shutdown
+ */
+void
+ShutdownCSNLOG(void)
+{
+       /*
+        * Flush dirty CSNLOG pages to disk
+        *
+        * This is not actually necessary from a correctness point of view. We do
+        * it merely as a debugging aid.
+        */
+       TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(false);
+       SimpleLruFlush(CsnlogCtl, false);
+       TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(false);
+}
+
+/*
+ * This must be called ONCE at the end of startup/recovery.
+ */
+void
+TrimCSNLOG(void)
+{
+       TransactionId xid = ShmemVariableCache->nextXid;
+       int                     pageno = TransactionIdToPage(xid);
+
+       LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+
+       /*
+        * Re-Initialize our idea of the latest page number.
+        */
+       CsnlogCtl->shared->latest_page_number = pageno;
+
+       /*
+        * Zero out the remainder of the current csnlog page.  Under normal
+        * circumstances it should be zeroes already, but it seems at least
+        * theoretically possible that XLOG replay will have settled on a nextXID
+        * value that is less than the last XID actually used and marked by the
+        * previous database lifecycle (since subtransaction commit writes clog
+        * but makes no WAL entry).  Let's just be safe. (We need not worry about
+        * pages beyond the current one, since those will be zeroed when first
+        * used.  For the same reason, there is no need to do anything when
+        * nextXid is exactly at a page boundary; and it's likely that the
+        * "current" page doesn't exist yet in that case.)
+        */
+       if (TransactionIdToPgIndex(xid) != 0)
+       {
+               int                     entryno = TransactionIdToPgIndex(xid);
+               int                     byteno = entryno * sizeof(CommitSeqNo);
+               int                     slotno;
+               char       *byteptr;
+
+               slotno = SimpleLruReadPage(CsnlogCtl, pageno, false, xid);
+
+               byteptr = CsnlogCtl->shared->page_buffer[slotno] + byteno;
+
+               /* Zero the entries from nextXid to the end of the page */
+               MemSet(byteptr, 0, BLCKSZ - byteno);
+
+               CsnlogCtl->shared->page_dirty[slotno] = true;
+       }
+
+       LWLockRelease(CSNLogControlLock);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointCSNLOG(void)
+{
+       /*
+        * Flush dirty CSNLOG pages to disk
+        *
+        * This is not actually necessary from a correctness point of view. We do
+        * it merely to improve the odds that writing of dirty pages is done by
+        * the checkpoint process and not by backends.
+        */
+       TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(true);
+       SimpleLruFlush(CsnlogCtl, true);
+       TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(true);
+}
+
+
+/*
+ * Make sure that CSNLOG has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock.  We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty csnlog page to make room
+ * in shared memory.
+ */
+void
+ExtendCSNLOG(TransactionId newestXact)
+{
+       int                     pageno;
+
+       /*
+        * No work except at first XID of a page.  But beware: just after
+        * wraparound, the first XID of page zero is FirstNormalTransactionId.
+        */
+       if (TransactionIdToPgIndex(newestXact) != 0 &&
+               !TransactionIdEquals(newestXact, FirstNormalTransactionId))
+               return;
+
+       pageno = TransactionIdToPage(newestXact);
+
+       LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE);
+
+       /* Zero the page; no XLOG entry is made for csnlog pages */
+       ZeroCSNLOGPage(pageno);
+
+       LWLockRelease(CSNLogControlLock);
+}
+
+
+/*
+ * Remove all CSNLOG segments before the one holding the passed transaction ID
+ *
+ * This is normally called during checkpoint, with oldestXact being the
+ * oldest TransactionXmin of any running transaction.
+ */
+void
+TruncateCSNLOG(TransactionId oldestXact)
+{
+       int                     cutoffPage;
+
+       /*
+        * The cutoff point is the start of the segment containing oldestXact. We
+        * pass the *page* containing oldestXact to SimpleLruTruncate.
+        */
+       cutoffPage = TransactionIdToPage(oldestXact);   /* this page is kept */
+
+       SimpleLruTruncate(CsnlogCtl, cutoffPage);
+}
+
+
+/*
+ * Decide which of two CSNLOG page numbers is "older" for truncation purposes.
+ *
+ * We need to use comparison of TransactionIds here in order to do the right
+ * thing with wraparound XID arithmetic.  However, if we are asked about
+ * page number zero, we don't want to hand InvalidTransactionId to
+ * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
+ * offset both xids by FirstNormalTransactionId to avoid that.
+ */
+static bool
+CSNLOGPagePrecedes(int page1, int page2)
+{
+       TransactionId xid1;
+       TransactionId xid2;
+
+       xid1 = ((TransactionId) page1) * CSNLOG_XACTS_PER_PAGE;
+       xid1 += FirstNormalTransactionId;
+       xid2 = ((TransactionId) page2) * CSNLOG_XACTS_PER_PAGE;
+       xid2 += FirstNormalTransactionId;
+
+       return TransactionIdPrecedes(xid1, xid2);
+}
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c

index 0c8c17af33c5b8321ccdea64367bae55923a317b..9cc6d3dffda36041c5b819be56d25da0e06064d7 100644 (file)
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -69,6 +69,7 @@
  #include "postgres.h"
  
  #include "access/multixact.h"
+#include "access/mvccvars.h"
  #include "access/slru.h"
  #include "access/transam.h"
  #include "access/twophase.h"
@@ -513,9 +514,11 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
  
         for (i = 0, j = 0; i < nmembers; i++)
         {
-               if (TransactionIdIsInProgress(members[i].xid) ||
+               TransactionIdStatus xidstatus = TransactionIdGetStatus(members[i].xid);
+
+               if (xidstatus == XID_INPROGRESS ||
                         (ISUPDATE_from_mxstatus(members[i].status) &&
-                        TransactionIdDidCommit(members[i].xid)))
+                        xidstatus == XID_COMMITTED))
                 {
                         newMembers[j].xid = members[i].xid;
                         newMembers[j++].status = members[i].status;
@@ -590,7 +593,7 @@ MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
          */
         for (i = 0; i < nmembers; i++)
         {
-               if (TransactionIdIsInProgress(members[i].xid))
+               if (TransactionIdGetStatus(members[i].xid) == XID_INPROGRESS)
                 {
                         debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
                                                 i, members[i].xid);
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c

deleted file mode 100644 (file)

index 908fe2d..0000000
--- a/src/backend/access/transam/subtrans.c
+++ /dev/null
@@ -1,382 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * subtrans.c
- *             PostgreSQL subtransaction-log manager
- *
- * The pg_subtrans manager is a pg_clog-like manager that stores the parent
- * transaction Id for each transaction.  It is a fundamental part of the
- * nested transactions implementation.  A main transaction has a parent
- * of InvalidTransactionId, and each subtransaction has its immediate parent.
- * The tree can easily be walked from child to parent, but not in the
- * opposite direction.
- *
- * This code is based on clog.c, but the robustness requirements
- * are completely different from pg_clog, because we only need to remember
- * pg_subtrans information for currently-open transactions.  Thus, there is
- * no need to preserve data over a crash and restart.
- *
- * There are no XLOG interactions since we do not care about preserving
- * data across crashes.  During database startup, we simply force the
- * currently-active page of SUBTRANS to zeroes.
- *
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- * src/backend/access/transam/subtrans.c
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-
-#include "access/slru.h"
-#include "access/subtrans.h"
-#include "access/transam.h"
-#include "pg_trace.h"
-#include "utils/snapmgr.h"
-
-
-/*
- * Defines for SubTrans page sizes.  A page is the same BLCKSZ as is used
- * everywhere else in Postgres.
- *
- * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
- * SubTrans page numbering also wraps around at
- * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE, and segment numbering at
- * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
- * explicit notice of that fact in this module, except when comparing segment
- * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes) and zeroing
- * them in StartupSUBTRANS.
- */
-
-/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
-
-#define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE)
-#define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE)
-
-
-/*
- * Link to shared-memory data structures for SUBTRANS control
- */
-static SlruCtlData SubTransCtlData;
-
-#define SubTransCtl  (&SubTransCtlData)
-
-
-static int     ZeroSUBTRANSPage(int pageno);
-static bool SubTransPagePrecedes(int page1, int page2);
-
-
-/*
- * Record the parent of a subtransaction in the subtrans log.
- *
- * In some cases we may need to overwrite an existing value.
- */
-void
-SubTransSetParent(TransactionId xid, TransactionId parent, bool overwriteOK)
-{
-       int                     pageno = TransactionIdToPage(xid);
-       int                     entryno = TransactionIdToEntry(xid);
-       int                     slotno;
-       TransactionId *ptr;
-
-       Assert(TransactionIdIsValid(parent));
-
-       LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
-
-       slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
-       ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
-       ptr += entryno;
-
-       /* Current state should be 0 */
-       Assert(*ptr == InvalidTransactionId ||
-                  (*ptr == parent && overwriteOK));
-
-       *ptr = parent;
-
-       SubTransCtl->shared->page_dirty[slotno] = true;
-
-       LWLockRelease(SubtransControlLock);
-}
-
-/*
- * Interrogate the parent of a transaction in the subtrans log.
- */
-TransactionId
-SubTransGetParent(TransactionId xid)
-{
-       int                     pageno = TransactionIdToPage(xid);
-       int                     entryno = TransactionIdToEntry(xid);
-       int                     slotno;
-       TransactionId *ptr;
-       TransactionId parent;
-
-       /* Can't ask about stuff that might not be around anymore */
-       Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
-
-       /* Bootstrap and frozen XIDs have no parent */
-       if (!TransactionIdIsNormal(xid))
-               return InvalidTransactionId;
-
-       /* lock is acquired by SimpleLruReadPage_ReadOnly */
-
-       slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
-       ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
-       ptr += entryno;
-
-       parent = *ptr;
-
-       LWLockRelease(SubtransControlLock);
-
-       return parent;
-}
-
-/*
- * SubTransGetTopmostTransaction
- *
- * Returns the topmost transaction of the given transaction id.
- *
- * Because we cannot look back further than TransactionXmin, it is possible
- * that this function will lie and return an intermediate subtransaction ID
- * instead of the true topmost parent ID.  This is OK, because in practice
- * we only care about detecting whether the topmost parent is still running
- * or is part of a current snapshot's list of still-running transactions.
- * Therefore, any XID before TransactionXmin is as good as any other.
- */
-TransactionId
-SubTransGetTopmostTransaction(TransactionId xid)
-{
-       TransactionId parentXid = xid,
-                               previousXid = xid;
-
-       /* Can't ask about stuff that might not be around anymore */
-       Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
-
-       while (TransactionIdIsValid(parentXid))
-       {
-               previousXid = parentXid;
-               if (TransactionIdPrecedes(parentXid, TransactionXmin))
-                       break;
-               parentXid = SubTransGetParent(parentXid);
-       }
-
-       Assert(TransactionIdIsValid(previousXid));
-
-       return previousXid;
-}
-
-
-/*
- * Initialization of shared memory for SUBTRANS
- */
-Size
-SUBTRANSShmemSize(void)
-{
-       return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
-}
-
-void
-SUBTRANSShmemInit(void)
-{
-       SubTransCtl->PagePrecedes = SubTransPagePrecedes;
-       SimpleLruInit(SubTransCtl, "subtrans", NUM_SUBTRANS_BUFFERS, 0,
-                                 SubtransControlLock, "pg_subtrans",
-                                 LWTRANCHE_SUBTRANS_BUFFERS);
-       /* Override default assumption that writes should be fsync'd */
-       SubTransCtl->do_fsync = false;
-}
-
-/*
- * This func must be called ONCE on system install.  It creates
- * the initial SUBTRANS segment.  (The SUBTRANS directory is assumed to
- * have been created by the initdb shell script, and SUBTRANSShmemInit
- * must have been called already.)
- *
- * Note: it's not really necessary to create the initial segment now,
- * since slru.c would create it on first write anyway.  But we may as well
- * do it to be sure the directory is set up correctly.
- */
-void
-BootStrapSUBTRANS(void)
-{
-       int                     slotno;
-
-       LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
-
-       /* Create and zero the first page of the subtrans log */
-       slotno = ZeroSUBTRANSPage(0);
-
-       /* Make sure it's written out */
-       SimpleLruWritePage(SubTransCtl, slotno);
-       Assert(!SubTransCtl->shared->page_dirty[slotno]);
-
-       LWLockRelease(SubtransControlLock);
-}
-
-/*
- * Initialize (or reinitialize) a page of SUBTRANS to zeroes.
- *
- * The page is not actually written, just set up in shared memory.
- * The slot number of the new page is returned.
- *
- * Control lock must be held at entry, and will be held at exit.
- */
-static int
-ZeroSUBTRANSPage(int pageno)
-{
-       return SimpleLruZeroPage(SubTransCtl, pageno);
-}
-
-/*
- * This must be called ONCE during postmaster or standalone-backend startup,
- * after StartupXLOG has initialized ShmemVariableCache->nextXid.
- *
- * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
- * if there are none.
- */
-void
-StartupSUBTRANS(TransactionId oldestActiveXID)
-{
-       int                     startPage;
-       int                     endPage;
-
-       /*
-        * Since we don't expect pg_subtrans to be valid across crashes, we
-        * initialize the currently-active page(s) to zeroes during startup.
-        * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero
-        * the new page without regard to whatever was previously on disk.
-        */
-       LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
-
-       startPage = TransactionIdToPage(oldestActiveXID);
-       endPage = TransactionIdToPage(ShmemVariableCache->nextXid);
-
-       while (startPage != endPage)
-       {
-               (void) ZeroSUBTRANSPage(startPage);
-               startPage++;
-               /* must account for wraparound */
-               if (startPage > TransactionIdToPage(MaxTransactionId))
-                       startPage = 0;
-       }
-       (void) ZeroSUBTRANSPage(startPage);
-
-       LWLockRelease(SubtransControlLock);
-}
-
-/*
- * This must be called ONCE during postmaster or standalone-backend shutdown
- */
-void
-ShutdownSUBTRANS(void)
-{
-       /*
-        * Flush dirty SUBTRANS pages to disk
-        *
-        * This is not actually necessary from a correctness point of view. We do
-        * it merely as a debugging aid.
-        */
-       TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(false);
-       SimpleLruFlush(SubTransCtl, false);
-       TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(false);
-}
-
-/*
- * Perform a checkpoint --- either during shutdown, or on-the-fly
- */
-void
-CheckPointSUBTRANS(void)
-{
-       /*
-        * Flush dirty SUBTRANS pages to disk
-        *
-        * This is not actually necessary from a correctness point of view. We do
-        * it merely to improve the odds that writing of dirty pages is done by
-        * the checkpoint process and not by backends.
-        */
-       TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true);
-       SimpleLruFlush(SubTransCtl, true);
-       TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
-}
-
-
-/*
- * Make sure that SUBTRANS has room for a newly-allocated XID.
- *
- * NB: this is called while holding XidGenLock.  We want it to be very fast
- * most of the time; even when it's not so fast, no actual I/O need happen
- * unless we're forced to write out a dirty subtrans page to make room
- * in shared memory.
- */
-void
-ExtendSUBTRANS(TransactionId newestXact)
-{
-       int                     pageno;
-
-       /*
-        * No work except at first XID of a page.  But beware: just after
-        * wraparound, the first XID of page zero is FirstNormalTransactionId.
-        */
-       if (TransactionIdToEntry(newestXact) != 0 &&
-               !TransactionIdEquals(newestXact, FirstNormalTransactionId))
-               return;
-
-       pageno = TransactionIdToPage(newestXact);
-
-       LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
-
-       /* Zero the page */
-       ZeroSUBTRANSPage(pageno);
-
-       LWLockRelease(SubtransControlLock);
-}
-
-
-/*
- * Remove all SUBTRANS segments before the one holding the passed transaction ID
- *
- * This is normally called during checkpoint, with oldestXact being the
- * oldest TransactionXmin of any running transaction.
- */
-void
-TruncateSUBTRANS(TransactionId oldestXact)
-{
-       int                     cutoffPage;
-
-       /*
-        * The cutoff point is the start of the segment containing oldestXact. We
-        * pass the *page* containing oldestXact to SimpleLruTruncate.  We step
-        * back one transaction to avoid passing a cutoff page that hasn't been
-        * created yet in the rare case that oldestXact would be the first item on
-        * a page and oldestXact == next XID.  In that case, if we didn't subtract
-        * one, we'd trigger SimpleLruTruncate's wraparound detection.
-        */
-       TransactionIdRetreat(oldestXact);
-       cutoffPage = TransactionIdToPage(oldestXact);
-
-       SimpleLruTruncate(SubTransCtl, cutoffPage);
-}
-
-
-/*
- * Decide which of two SUBTRANS page numbers is "older" for truncation purposes.
- *
- * We need to use comparison of TransactionIds here in order to do the right
- * thing with wraparound XID arithmetic.  However, if we are asked about
- * page number zero, we don't want to hand InvalidTransactionId to
- * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
- * offset both xids by FirstNormalTransactionId to avoid that.
- */
-static bool
-SubTransPagePrecedes(int page1, int page2)
-{
-       TransactionId xid1;
-       TransactionId xid2;
-
-       xid1 = ((TransactionId) page1) * SUBTRANS_XACTS_PER_PAGE;
-       xid1 += FirstNormalTransactionId;
-       xid2 = ((TransactionId) page2) * SUBTRANS_XACTS_PER_PAGE;
-       xid2 += FirstNormalTransactionId;
-
-       return TransactionIdPrecedes(xid1, xid2);
-}
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c

index 1eba49a94b321b83185f0a6f9ac4ab1914ad6c63..e4ce0d51490e1992a68861bc4a488b20309aa78c 100644 (file)
--- a/src/backend/access/transam/transam.c
+++ b/src/backend/access/transam/transam.c
@@ -3,6 +3,15 @@
   * transam.c
   *       postgres transaction log interface routines
   *
+ * This module contains high level functions for managing the status
+ * of transactions. It sits on top of two lower level structures: the
+ * CLOG, and the CSNLOG. The CLOG is a permanent on-disk structure that
+ * tracks the committed/aborted status for each transaction ID. The CSNLOG
+ * tracks *when* each transaction ID committed (or aborted). The CSNLOG
+ * is used when checking the status of recent transactions that might still
+ * be in-progress, and it is reset at server startup. The CLOG is used for
+ * older transactions that are known to have completed (or crashed).
+ *
   * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
@@ -10,56 +19,49 @@
   * IDENTIFICATION
   *       src/backend/access/transam/transam.c
   *
- * NOTES
- *       This file contains the high level access-method interface to the
- *       transaction system.
- *
   *-------------------------------------------------------------------------
   */
  
  #include "postgres.h"
  
  #include "access/clog.h"
+#include "access/csnlog.h"
+#include "access/mvccvars.h"
  #include "access/subtrans.h"
  #include "access/transam.h"
+#include "storage/lmgr.h"
  #include "utils/snapmgr.h"
  
  /*
- * Single-item cache for results of TransactionLogFetch.  It's worth having
+ * Single-item cache for results of TransactionIdGetCommitSeqNo.  It's worth
+ * having
   * such a cache because we frequently find ourselves repeatedly checking the
   * same XID, for example when scanning a table just after a bulk insert,
   * update, or delete.
   */
  static TransactionId cachedFetchXid = InvalidTransactionId;
-static XidStatus cachedFetchXidStatus;
-static XLogRecPtr cachedCommitLSN;
+static CommitSeqNo cachedCSN;
  
-/* Local functions */
-static XidStatus TransactionLogFetch(TransactionId transactionId);
-
-
-/* ----------------------------------------------------------------
- *             Postgres log access method interface
- *
- *             TransactionLogFetch
- * ----------------------------------------------------------------
+/*
+ * Also have a (separate) cache for CLogGetCommitLSN()
   */
+static TransactionId cachedLSNFetchXid = InvalidTransactionId;
+static XLogRecPtr cachedCommitLSN;
  
  /*
- * TransactionLogFetch --- fetch commit status of specified transaction id
+ * TransactionIdGetCommitSeqNo --- fetch CSN of specified transaction id
   */
-static XidStatus
-TransactionLogFetch(TransactionId transactionId)
+CommitSeqNo
+TransactionIdGetCommitSeqNo(TransactionId transactionId)
  {
-       XidStatus       xidstatus;
-       XLogRecPtr      xidlsn;
+       CommitSeqNo     csn;
  
         /*
          * Before going to the commit log manager, check our single item cache to
          * see if we didn't just check the transaction status a moment ago.
          */
         if (TransactionIdEquals(transactionId, cachedFetchXid))
-               return cachedFetchXidStatus;
+               return cachedCSN;
  
         /*
          * Also, check to see if the transaction ID is a permanent one.
@@ -67,53 +69,63 @@ TransactionLogFetch(TransactionId transactionId)
         if (!TransactionIdIsNormal(transactionId))
         {
                 if (TransactionIdEquals(transactionId, BootstrapTransactionId))
-                       return TRANSACTION_STATUS_COMMITTED;
+                       return COMMITSEQNO_FROZEN;
                 if (TransactionIdEquals(transactionId, FrozenTransactionId))
-                       return TRANSACTION_STATUS_COMMITTED;
-               return TRANSACTION_STATUS_ABORTED;
+                       return COMMITSEQNO_FROZEN;
+               return COMMITSEQNO_ABORTED;
         }
  
         /*
-        * Get the transaction status.
+        * If the XID is older than TransactionXmin, check the clog. Otherwise
+        * check the csnlog.
          */
-       xidstatus = TransactionIdGetStatus(transactionId, &xidlsn);
+       Assert(TransactionIdIsValid(TransactionXmin));
+       if (TransactionIdPrecedes(transactionId, TransactionXmin))
+       {
+               XLogRecPtr lsn;
+
+               if (CLogGetStatus(transactionId, &lsn) == CLOG_XID_STATUS_COMMITTED)
+                       csn = COMMITSEQNO_FROZEN;
+               else
+                       csn = COMMITSEQNO_ABORTED;
+       }
+       else
+       {
+               csn = CSNLogGetCommitSeqNo(transactionId);
+
+               if (csn == COMMITSEQNO_COMMITTING)
+               {
+                       /*
+                        * If the transaction is committing at this very instant, and
+                        * hasn't set its CSN yet, wait for it to finish doing so.
+                        *
+                        * XXX: Alternatively, we could wait on the heavy-weight lock on
+                        * the XID. That'd make TransactionIdCommitTree() slightly
+                        * cheaper, as it wouldn't need to acquire CommitSeqNoLock (even
+                        * in shared mode).
+                        */
+                       LWLockAcquire(CommitSeqNoLock, LW_EXCLUSIVE);
+                       LWLockRelease(CommitSeqNoLock);
+
+                       csn = CSNLogGetCommitSeqNo(transactionId);
+                       Assert(csn != COMMITSEQNO_COMMITTING);
+               }
+       }
  
         /*
-        * Cache it, but DO NOT cache status for unfinished or sub-committed
-        * transactions!  We only cache status that is guaranteed not to change.
+        * Cache it, but DO NOT cache status for unfinished transactions!
+        * We only cache status that is guaranteed not to change.
          */
-       if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS &&
-               xidstatus != TRANSACTION_STATUS_SUB_COMMITTED)
+       if (COMMITSEQNO_IS_COMMITTED(csn) ||
+               COMMITSEQNO_IS_ABORTED(csn))
         {
                 cachedFetchXid = transactionId;
-               cachedFetchXidStatus = xidstatus;
-               cachedCommitLSN = xidlsn;
+               cachedCSN = csn;
         }
  
-       return xidstatus;
+       return csn;
  }
  
-/* ----------------------------------------------------------------
- *                                             Interface functions
- *
- *             TransactionIdDidCommit
- *             TransactionIdDidAbort
- *             ========
- *                these functions test the transaction status of
- *                a specified transaction id.
- *
- *             TransactionIdCommitTree
- *             TransactionIdAsyncCommitTree
- *             TransactionIdAbortTree
- *             ========
- *                these functions set the transaction status of the specified
- *                transaction tree.
- *
- * See also TransactionIdIsInProgress, which once was in this module
- * but now lives in procarray.c.
- * ----------------------------------------------------------------
- */
-
  /*
   * TransactionIdDidCommit
   *             True iff transaction associated with the identifier did commit.
@@ -124,50 +136,14 @@ TransactionLogFetch(TransactionId transactionId)
  bool                                                   /* true if given transaction committed */
  TransactionIdDidCommit(TransactionId transactionId)
  {
-       XidStatus       xidstatus;
+       CommitSeqNo csn;
  
-       xidstatus = TransactionLogFetch(transactionId);
+       csn = TransactionIdGetCommitSeqNo(transactionId);
  
-       /*
-        * If it's marked committed, it's committed.
-        */
-       if (xidstatus == TRANSACTION_STATUS_COMMITTED)
+       if (COMMITSEQNO_IS_COMMITTED(csn))
                 return true;
-
-       /*
-        * If it's marked subcommitted, we have to check the parent recursively.
-        * However, if it's older than TransactionXmin, we can't look at
-        * pg_subtrans; instead assume that the parent crashed without cleaning up
-        * its children.
-        *
-        * Originally we Assert'ed that the result of SubTransGetParent was not
-        * zero. However with the introduction of prepared transactions, there can
-        * be a window just after database startup where we do not have complete
-        * knowledge in pg_subtrans of the transactions after TransactionXmin.
-        * StartupSUBTRANS() has ensured that any missing information will be
-        * zeroed.  Since this case should not happen under normal conditions, it
-        * seems reasonable to emit a WARNING for it.
-        */
-       if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED)
-       {
-               TransactionId parentXid;
-
-               if (TransactionIdPrecedes(transactionId, TransactionXmin))
-                       return false;
-               parentXid = SubTransGetParent(transactionId);
-               if (!TransactionIdIsValid(parentXid))
-               {
-                       elog(WARNING, "no pg_subtrans entry for subcommitted XID %u",
-                                transactionId);
-                       return false;
-               }
-               return TransactionIdDidCommit(parentXid);
-       }
-
-       /*
-        * It's not committed.
-        */
-       return false;
+       else
+               return false;
  }
  
  /*
@@ -180,70 +156,35 @@ TransactionIdDidCommit(TransactionId transactionId)
  bool                                                   /* true if given transaction aborted */
  TransactionIdDidAbort(TransactionId transactionId)
  {
-       XidStatus       xidstatus;
+       CommitSeqNo csn;
  
-       xidstatus = TransactionLogFetch(transactionId);
+       csn = TransactionIdGetCommitSeqNo(transactionId);
  
-       /*
-        * If it's marked aborted, it's aborted.
-        */
-       if (xidstatus == TRANSACTION_STATUS_ABORTED)
+       if (COMMITSEQNO_IS_ABORTED(csn))
                 return true;
-
-       /*
-        * If it's marked subcommitted, we have to check the parent recursively.
-        * However, if it's older than TransactionXmin, we can't look at
-        * pg_subtrans; instead assume that the parent crashed without cleaning up
-        * its children.
-        */
-       if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED)
-       {
-               TransactionId parentXid;
-
-               if (TransactionIdPrecedes(transactionId, TransactionXmin))
-                       return true;
-               parentXid = SubTransGetParent(transactionId);
-               if (!TransactionIdIsValid(parentXid))
-               {
-                       /* see notes in TransactionIdDidCommit */
-                       elog(WARNING, "no pg_subtrans entry for subcommitted XID %u",
-                                transactionId);
-                       return true;
-               }
-               return TransactionIdDidAbort(parentXid);
-       }
-
-       /*
-        * It's not aborted.
-        */
-       return false;
+       else
+               return false;
  }
  
  /*
- * TransactionIdIsKnownCompleted
- *             True iff transaction associated with the identifier is currently
- *             known to have either committed or aborted.
- *
- * This does NOT look into pg_clog but merely probes our local cache
- * (and so it's not named TransactionIdDidComplete, which would be the
- * appropriate name for a function that worked that way).  The intended
- * use is just to short-circuit TransactionIdIsInProgress calls when doing
- * repeated tqual.c checks for the same XID.  If this isn't extremely fast
- * then it will be counterproductive.
+ * Returns the status of the transaction.
   *
- * Note:
- *             Assumes transaction identifier is valid.
+ * Note that this treats a crashed transaction as still in-progress,
+ * until it falls off the xmin horizon.
   */
-bool
-TransactionIdIsKnownCompleted(TransactionId transactionId)
+TransactionIdStatus
+TransactionIdGetStatus(TransactionId xid)
  {
-       if (TransactionIdEquals(transactionId, cachedFetchXid))
-       {
-               /* If it's in the cache at all, it must be completed. */
-               return true;
-       }
+       CommitSeqNo csn;
+
+       csn = TransactionIdGetCommitSeqNo(xid);
  
-       return false;
+       if (COMMITSEQNO_IS_COMMITTED(csn))
+               return XID_COMMITTED;
+       else if (COMMITSEQNO_IS_ABORTED(csn))
+               return XID_ABORTED;
+       else
+               return XID_INPROGRESS;
  }
  
  /*
@@ -252,28 +193,80 @@ TransactionIdIsKnownCompleted(TransactionId transactionId)
   *
   * "xid" is a toplevel transaction commit, and the xids array contains its
   * committed subtransactions.
- *
- * This commit operation is not guaranteed to be atomic, but if not, subxids
- * are correctly marked subcommit first.
   */
  void
  TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids)
  {
-       TransactionIdSetTreeStatus(xid, nxids, xids,
-                                                          TRANSACTION_STATUS_COMMITTED,
-                                                          InvalidXLogRecPtr);
+       TransactionIdAsyncCommitTree(xid, nxids, xids, InvalidXLogRecPtr);
  }
  
  /*
   * TransactionIdAsyncCommitTree
- *             Same as above, but for async commits.  The commit record LSN is needed.
+ *             Same as above, but for async commits.
+ *
+ * "xid" is a toplevel transaction commit, and the xids array contains its
+ * committed subtransactions.
   */
  void
  TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids,
                                                          XLogRecPtr lsn)
  {
-       TransactionIdSetTreeStatus(xid, nxids, xids,
-                                                          TRANSACTION_STATUS_COMMITTED, lsn);
+       CommitSeqNo csn;
+       TransactionId latestXid;
+       TransactionId currentLatestCompletedXid;
+
+       latestXid = TransactionIdLatest(xid, nxids, xids);
+
+       /*
+        * Grab the CommitSeqNoLock, in shared mode. This is only used to
+        * provide a way for a concurrent transaction to wait for us to
+        * complete (see TransactionIdGetCommitSeqNo()).
+        *
+        * XXX: We could reduce the time the lock is held, by only setting
+        * the CSN on the top-XID while holding the lock, and updating the
+        * sub-XIDs later. But it doesn't matter much, because we're only
+        * holding it in shared mode, and it's rare for it to be acquired
+        * in exclusive mode.
+        */
+       LWLockAcquire(CommitSeqNoLock, LW_SHARED);
+
+       /*
+        * First update latestCompletedXid to cover this xid. We do this before
+        * assigning a CSN, so that if someone acquires a new snapshot at the same
+        * time, the xmax it computes is sure to cover our XID.
+        */
+       currentLatestCompletedXid = pg_atomic_read_u32(&ShmemVariableCache->latestCompletedXid);
+       while (TransactionIdFollows(latestXid, currentLatestCompletedXid))
+       {
+               if (pg_atomic_compare_exchange_u32(&ShmemVariableCache->latestCompletedXid,
+                                                                                  &currentLatestCompletedXid,
+                                                                                  latestXid))
+                       break;
+       }
+
+       /*
+        * Mark our top transaction id as commit-in-progress.
+        */
+       CSNLogSetCommitSeqNo(xid, 0, NULL, COMMITSEQNO_COMMITTING);
+
+       /* Get our CSN and increment */
+       csn = pg_atomic_fetch_add_u64(&ShmemVariableCache->nextCommitSeqNo, 1);
+       Assert(csn >= COMMITSEQNO_FIRST_NORMAL);
+
+       /* Stamp this XID (and sub-XIDs) with the CSN */
+       CSNLogSetCommitSeqNo(xid, nxids, xids, csn);
+
+       LWLockRelease(CommitSeqNoLock);
+
+       /*
+        * Also update the CLOG. This doesn't need to happen atomically with
+        * updating the CSN log, because no-one will look at the CLOG until
+        * GlobalXmin has advanced past our XID, and that can't happen until
+        * we clear the XID from the proc array.
+        */
+       CLogSetTreeStatus(xid, nxids, xids,
+                                         CLOG_XID_STATUS_COMMITTED,
+                                         lsn);
  }
  
  /*
@@ -289,8 +282,23 @@ TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids,
  void
  TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids)
  {
-       TransactionIdSetTreeStatus(xid, nxids, xids,
-                                                          TRANSACTION_STATUS_ABORTED, InvalidXLogRecPtr);
+       TransactionId latestXid;
+       TransactionId currentLatestCompletedXid;
+
+       latestXid = TransactionIdLatest(xid, nxids, xids);
+
+       currentLatestCompletedXid = pg_atomic_read_u32(&ShmemVariableCache->latestCompletedXid);
+       while (TransactionIdFollows(latestXid, currentLatestCompletedXid))
+       {
+               if (pg_atomic_compare_exchange_u32(&ShmemVariableCache->latestCompletedXid,
+                                                                                  &currentLatestCompletedXid,
+                                                                                  latestXid))
+                       break;
+       }
+
+       CSNLogSetCommitSeqNo(xid, nxids, xids, COMMITSEQNO_ABORTED);
+       CLogSetTreeStatus(xid, nxids, xids,
+                                         CLOG_XID_STATUS_ABORTED, InvalidCommitSeqNo);
  }
  
  /*
@@ -409,7 +417,7 @@ TransactionIdGetCommitLSN(TransactionId xid)
          * checking TransactionLogFetch's cache will usually succeed and avoid an
          * extra trip to shared memory.
          */
-       if (TransactionIdEquals(xid, cachedFetchXid))
+       if (TransactionIdEquals(xid, cachedLSNFetchXid))
                 return cachedCommitLSN;
  
         /* Special XIDs are always known committed */
@@ -419,7 +427,10 @@ TransactionIdGetCommitLSN(TransactionId xid)
         /*
          * Get the transaction status.
          */
-       (void) TransactionIdGetStatus(xid, &result);
+       (void) CLogGetStatus(xid, &result);
+
+       cachedLSNFetchXid = xid;
+       cachedCommitLSN = result;
  
         return result;
  }
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c

index 9f55adcaf5ef0e50fe1ea93e4e725c5c7b794215..3aa91572d52b8172e9e3dd3eec1c30b750b011bb 100644 (file)
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -22,7 +22,7 @@
   *             transaction in prepared state with the same GID.
   *
   *             A global transaction (gxact) also has dummy PGXACT and PGPROC; this is
- *             what keeps the XID considered running by TransactionIdIsInProgress.
+ *             what keeps the XID considered running by the functions in procarray.c.
   *             It is also convenient as a PGPROC to hook the gxact's locks to.
   *
   *             Information to recover prepared transactions in case of crash is
@@ -60,6 +60,7 @@
  
  #include "access/commit_ts.h"
  #include "access/htup_details.h"
+#include "access/mvccvars.h"
  #include "access/subtrans.h"
  #include "access/transam.h"
  #include "access/twophase.h"
@@ -414,6 +415,7 @@ MarkAsPreparing(TransactionId xid, const char *gid,
         proc->lxid = (LocalTransactionId) xid;
         pgxact->xid = xid;
         pgxact->xmin = InvalidTransactionId;
+       pgxact->snapshotcsn = InvalidCommitSeqNo;
         pgxact->delayChkpt = false;
         pgxact->vacuumFlags = 0;
         proc->pid = 0;
@@ -426,9 +428,6 @@ MarkAsPreparing(TransactionId xid, const char *gid,
         proc->waitProcLock = NULL;
         for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
                 SHMQueueInit(&(proc->myProcLocks[i]));
-       /* subxid data must be filled later by GXactLoadSubxactData */
-       pgxact->overflowed = false;
-       pgxact->nxids = 0;
  
         gxact->prepared_at = prepared_at;
         /* initialize LSN to InvalidXLogRecPtr */
@@ -455,34 +454,6 @@ MarkAsPreparing(TransactionId xid, const char *gid,
         return gxact;
  }
  
-/*
- * GXactLoadSubxactData
- *
- * If the transaction being persisted had any subtransactions, this must
- * be called before MarkAsPrepared() to load information into the dummy
- * PGPROC.
- */
-static void
-GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts,
-                                        TransactionId *children)
-{
-       PGPROC     *proc = &ProcGlobal->allProcs[gxact->pgprocno];
-       PGXACT     *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno];
-
-       /* We need no extra lock since the GXACT isn't valid yet */
-       if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS)
-       {
-               pgxact->overflowed = true;
-               nsubxacts = PGPROC_MAX_CACHED_SUBXIDS;
-       }
-       if (nsubxacts > 0)
-       {
-               memcpy(proc->subxids.xids, children,
-                          nsubxacts * sizeof(TransactionId));
-               pgxact->nxids = nsubxacts;
-       }
-}
-
  /*
   * MarkAsPrepared
   *             Mark the GXACT as fully valid, and enter it into the global ProcArray.
@@ -497,7 +468,7 @@ MarkAsPrepared(GlobalTransaction gxact)
         LWLockRelease(TwoPhaseStateLock);
  
         /*
-        * Put it into the global ProcArray so TransactionIdIsInProgress considers
+        * Put it into the global ProcArray so GetOldestActiveTransactionId() considers
          * the XID as still running.
          */
         ProcArrayAdd(&ProcGlobal->allProcs[gxact->pgprocno]);
@@ -992,8 +963,6 @@ StartPrepare(GlobalTransaction gxact)
         if (hdr.nsubxacts > 0)
         {
                 save_state_data(children, hdr.nsubxacts * sizeof(TransactionId));
-               /* While we have the child-xact data, stuff it in the gxact too */
-               GXactLoadSubxactData(gxact, hdr.nsubxacts, children);
         }
         if (hdr.ncommitrels > 0)
         {
@@ -1079,7 +1048,7 @@ EndPrepare(GlobalTransaction gxact)
          * NB: a side effect of this is to make a dummy ProcArray entry for the
          * prepared XID.  This must happen before we clear the XID from MyPgXact,
          * else there is a window where the XID is not running according to
-        * TransactionIdIsInProgress, and onlookers would be entitled to assume
+        * GetOldestActiveTransactionId, and onlookers would be entitled to assume
          * the xact crashed.  Instead we have a window where the same XID appears
          * twice in ProcArray, which is OK.
          */
@@ -1328,7 +1297,6 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
         char       *buf;
         char       *bufptr;
         TwoPhaseFileHeader *hdr;
-       TransactionId latestXid;
         TransactionId *children;
         RelFileNode *commitrels;
         RelFileNode *abortrels;
@@ -1373,14 +1341,11 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
         invalmsgs = (SharedInvalidationMessage *) bufptr;
         bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage));
  
-       /* compute latestXid among all children */
-       latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children);
-
         /*
          * The order of operations here is critical: make the XLOG entry for
          * commit or abort, then mark the transaction committed or aborted in
          * pg_clog, then remove its PGPROC from the global ProcArray (which means
-        * TransactionIdIsInProgress will stop saying the prepared xact is in
+        * GetOldestActiveTransactionId() will stop saying the prepared xact is in
          * progress), then run the post-commit or post-abort callbacks. The
          * callbacks will release the locks the transaction held.
          */
@@ -1395,7 +1360,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
                                                                            hdr->nsubxacts, children,
                                                                            hdr->nabortrels, abortrels);
  
-       ProcArrayRemove(proc, latestXid);
+       ProcArrayRemove(proc);
  
         /*
          * In case we fail while running the callbacks, mark the gxact invalid so
@@ -1841,7 +1806,7 @@ StandbyRecoverPreparedTransactions(bool overwriteOK)
                         xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
  
                         /* Already processed? */
-                       if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+                       if (TransactionIdGetStatus(xid) != XID_INPROGRESS)
                         {
                                 ereport(WARNING,
                                                 (errmsg("removing stale two-phase state file \"%s\"",
@@ -1926,7 +1891,7 @@ RecoverPreparedTransactions(void)
                         xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
  
                         /* Already processed? */
-                       if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+                       if (TransactionIdGetStatus(xid) != XID_INPROGRESS)
                         {
                                 ereport(WARNING,
                                                 (errmsg("removing stale two-phase state file \"%s\"",
@@ -1965,9 +1930,13 @@ RecoverPreparedTransactions(void)
                          * It's possible that SubTransSetParent has been set before, if
                          * the prepared transaction generated xid assignment records. Test
                          * here must match one used in AssignTransactionId().
+                        *
+                        * FIXME: I think this now always needs to be true. Or false?
                          */
+#ifdef FIXME
                         if (InHotStandby && (hdr->nsubxacts >= PGPROC_MAX_CACHED_SUBXIDS ||
                                                                  XLogLogicalInfoActive()))
+#endif
                                 overwriteOK = true;
  
                         /*
@@ -1987,7 +1956,6 @@ RecoverPreparedTransactions(void)
                                                                         hdr->prepared_at,
                                                                         hdr->owner, hdr->database);
                         gxact->ondisk = true;
-                       GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids);
                         MarkAsPrepared(gxact);
  
                         /*
@@ -2089,7 +2057,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
         /* Flush XLOG to disk */
         XLogFlush(recptr);
  
-       /* Mark the transaction committed in pg_clog */
+       /* Mark the transaction committed in pg_clog and pg_csnlog */
         TransactionIdCommitTree(xid, nchildren, children);
  
         /* Checkpoint can proceed now */
@@ -2127,7 +2095,7 @@ RecordTransactionAbortPrepared(TransactionId xid,
          * Catch the scenario where we aborted partway through
          * RecordTransactionCommitPrepared ...
          */
-       if (TransactionIdDidCommit(xid))
+       if (TransactionIdGetStatus(xid) == XID_COMMITTED)
                 elog(PANIC, "cannot abort transaction %u, it was already committed",
                          xid);
  
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c

index 2f7e645ace8d85c28489024af956fc194bd18bf7..d6a91254877700187c904776d3d67ede60a38528 100644 (file)
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -15,6 +15,8 @@
  
  #include "access/clog.h"
  #include "access/commit_ts.h"
+#include "access/csnlog.h"
+#include "access/mvccvars.h"
  #include "access/subtrans.h"
  #include "access/transam.h"
  #include "access/xact.h"
@@ -169,8 +171,8 @@ GetNewTransactionId(bool isSubXact)
          * Extend pg_subtrans and pg_commit_ts too.
          */
         ExtendCLOG(xid);
+       ExtendCSNLOG(xid);
         ExtendCommitTs(xid);
-       ExtendSUBTRANS(xid);
  
         /*
          * Now advance the nextXid counter.  This must not happen until after we
@@ -200,17 +202,8 @@ GetNewTransactionId(bool isSubXact)
          * A solution to the atomic-store problem would be to give each PGXACT its
          * own spinlock used only for fetching/storing that PGXACT's xid and
          * related fields.
-        *
-        * If there's no room to fit a subtransaction XID into PGPROC, set the
-        * cache-overflowed flag instead.  This forces readers to look in
-        * pg_subtrans to map subtransaction XIDs up to top-level XIDs. There is a
-        * race-condition window, in that the new XID will not appear as running
-        * until its parent link has been placed into pg_subtrans. However, that
-        * will happen before anyone could possibly have a reason to inquire about
-        * the status of the XID, so it seems OK.  (Snapshots taken during this
-        * window *will* include the parent XID, so they will deliver the correct
-        * answer later on when someone does have a reason to inquire.)
          */
+       if (!isSubXact)
         {
                 /*
                  * Use volatile pointer to prevent code rearrangement; other backends
@@ -219,23 +212,9 @@ GetNewTransactionId(bool isSubXact)
                  * nxids before filling the array entry.  Note we are assuming that
                  * TransactionId and int fetch/store are atomic.
                  */
-               volatile PGPROC *myproc = MyProc;
                 volatile PGXACT *mypgxact = MyPgXact;
  
-               if (!isSubXact)
-                       mypgxact->xid = xid;
-               else
-               {
-                       int                     nxids = mypgxact->nxids;
-
-                       if (nxids < PGPROC_MAX_CACHED_SUBXIDS)
-                       {
-                               myproc->subxids.xids[nxids] = xid;
-                               mypgxact->nxids = nxids + 1;
-                       }
-                       else
-                               mypgxact->overflowed = true;
-               }
+               mypgxact->xid = xid;
         }
  
         LWLockRelease(XidGenLock);
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c

index 23f36ead7e54e86d0ef1e33a63aa60ade10f8b8f..c33e5d37db424f1b2f7d1119b52045ee509b5d3d 100644 (file)
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -20,8 +20,10 @@
  #include <time.h>
  #include <unistd.h>
  
+#include "access/clog.h"
  #include "access/commit_ts.h"
  #include "access/multixact.h"
+#include "access/mvccvars.h"
  #include "access/parallel.h"
  #include "access/subtrans.h"
  #include "access/transam.h"
@@ -183,7 +185,6 @@ typedef struct TransactionStateData
         int                     prevSecContext; /* previous SecurityRestrictionContext */
         bool            prevXactReadOnly;               /* entry-time xact r/o state */
         bool            startedInRecovery;              /* did we start in recovery? */
-       bool            didLogXid;              /* has xid been included in WAL record? */
         int                     parallelModeLevel;              /* Enter/ExitParallelMode counter */
         struct TransactionStateData *parent;            /* back link to parent */
  } TransactionStateData;
@@ -214,18 +215,10 @@ static TransactionStateData TopTransactionStateData = {
         0,                                                      /* previous SecurityRestrictionContext */
         false,                                          /* entry-time xact r/o state */
         false,                                          /* startedInRecovery */
-       false,                                          /* didLogXid */
         0,                                                      /* parallelMode */
         NULL                                            /* link to parent state block */
  };
  
-/*
- * unreportedXids holds XIDs of all subtransactions that have not yet been
- * reported in an XLOG_XACT_ASSIGNMENT record.
- */
-static int     nUnreportedXids;
-static TransactionId unreportedXids[PGPROC_MAX_CACHED_SUBXIDS];
-
  static TransactionState CurrentTransactionState = &TopTransactionStateData;
  
  /*
@@ -309,7 +302,7 @@ static void CleanupTransaction(void);
  static void CheckTransactionChain(bool isTopLevel, bool throwError,
                                           const char *stmtType);
  static void CommitTransaction(void);
-static TransactionId RecordTransactionAbort(bool isSubXact);
+static void RecordTransactionAbort(bool isSubXact);
  static void StartTransaction(void);
  
  static void StartSubTransaction(void);
@@ -433,19 +426,6 @@ GetCurrentTransactionIdIfAny(void)
         return CurrentTransactionState->transactionId;
  }
  
-/*
- *     MarkCurrentTransactionIdLoggedIfAny
- *
- * Remember that the current xid - if it is assigned - now has been wal logged.
- */
-void
-MarkCurrentTransactionIdLoggedIfAny(void)
-{
-       if (TransactionIdIsValid(CurrentTransactionState->transactionId))
-               CurrentTransactionState->didLogXid = true;
-}
-
-
  /*
   *     GetStableLatestTransactionId
   *
@@ -487,7 +467,6 @@ AssignTransactionId(TransactionState s)
  {
         bool            isSubXact = (s->parent != NULL);
         ResourceOwner currentOwner;
-       bool            log_unknown_top = false;
  
         /* Assert that caller didn't screw up */
         Assert(!TransactionIdIsValid(s->transactionId));
@@ -538,18 +517,14 @@ AssignTransactionId(TransactionState s)
          * superfluously log something. That can happen when an xid is included
          * somewhere inside a wal record, but not in XLogRecord->xl_xid, like in
          * xl_standby_locks.
+        *
+        * FIXME: didLogXid and the whole xact_assignment stuff is no more. We
+        * no longer need it for subtransactions. Do we still need it for this
+        * logical stuff?
          */
-       if (isSubXact && XLogLogicalInfoActive() &&
-               !TopTransactionStateData.didLogXid)
-               log_unknown_top = true;
  
         /*
          * Generate a new Xid and record it in PG_PROC and pg_subtrans.
-        *
-        * NB: we must make the subtrans entry BEFORE the Xid appears anywhere in
-        * shared storage other than PG_PROC; because if there's no room for it in
-        * PG_PROC, the subtrans entry is needed to ensure that other backends see
-        * the Xid as "running".  See GetNewTransactionId.
          */
         s->transactionId = GetNewTransactionId(isSubXact);
         if (!isSubXact)
@@ -584,59 +559,6 @@ AssignTransactionId(TransactionState s)
         }
         PG_END_TRY();
         CurrentResourceOwner = currentOwner;
-
-       /*
-        * Every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within each
-        * top-level transaction we issue a WAL record for the assignment. We
-        * include the top-level xid and all the subxids that have not yet been
-        * reported using XLOG_XACT_ASSIGNMENT records.
-        *
-        * This is required to limit the amount of shared memory required in a hot
-        * standby server to keep track of in-progress XIDs. See notes for
-        * RecordKnownAssignedTransactionIds().
-        *
-        * We don't keep track of the immediate parent of each subxid, only the
-        * top-level transaction that each subxact belongs to. This is correct in
-        * recovery only because aborted subtransactions are separately WAL
-        * logged.
-        *
-        * This is correct even for the case where several levels above us didn't
-        * have an xid assigned as we recursed up to them beforehand.
-        */
-       if (isSubXact && XLogStandbyInfoActive())
-       {
-               unreportedXids[nUnreportedXids] = s->transactionId;
-               nUnreportedXids++;
-
-               /*
-                * ensure this test matches similar one in
-                * RecoverPreparedTransactions()
-                */
-               if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS ||
-                       log_unknown_top)
-               {
-                       xl_xact_assignment xlrec;
-
-                       /*
-                        * xtop is always set by now because we recurse up transaction
-                        * stack to the highest unassigned xid and then come back down
-                        */
-                       xlrec.xtop = GetTopTransactionId();
-                       Assert(TransactionIdIsValid(xlrec.xtop));
-                       xlrec.nsubxacts = nUnreportedXids;
-
-                       XLogBeginInsert();
-                       XLogRegisterData((char *) &xlrec, MinSizeOfXactAssignment);
-                       XLogRegisterData((char *) unreportedXids,
-                                                        nUnreportedXids * sizeof(TransactionId));
-
-                       (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT);
-
-                       nUnreportedXids = 0;
-                       /* mark top, not current xact as having been logged */
-                       TopTransactionStateData.didLogXid = true;
-               }
-       }
  }
  
  /*
@@ -1117,17 +1039,13 @@ AtSubStart_ResourceOwner(void)
  /*
   *     RecordTransactionCommit
   *
- * Returns latest XID among xact and its children, or InvalidTransactionId
- * if the xact has no XID.  (We compute that here just because it's easier.)
- *
   * If you change this function, see RecordTransactionCommitPrepared also.
   */
-static TransactionId
+static void
  RecordTransactionCommit(void)
  {
         TransactionId xid = GetTopTransactionIdIfAny();
         bool            markXidCommitted = TransactionIdIsValid(xid);
-       TransactionId latestXid = InvalidTransactionId;
         int                     nrels;
         RelFileNode *rels;
         int                     nchildren;
@@ -1290,7 +1208,7 @@ RecordTransactionCommit(void)
                 XLogFlush(XactLastRecEnd);
  
                 /*
-                * Now we may update the CLOG, if we wrote a COMMIT record above
+                * Now we may update the CLOG and CSNLOG, if we wrote a COMMIT record above
                  */
                 if (markXidCommitted)
                         TransactionIdCommitTree(xid, nchildren, children);
@@ -1316,7 +1234,8 @@ RecordTransactionCommit(void)
                  * flushed before the CLOG may be updated.
                  */
                 if (markXidCommitted)
-                       TransactionIdAsyncCommitTree(xid, nchildren, children, XactLastRecEnd);
+                       TransactionIdAsyncCommitTree(xid, nchildren, children,
+                                                                                XactLastRecEnd);
         }
  
         /*
@@ -1329,9 +1248,6 @@ RecordTransactionCommit(void)
                 END_CRIT_SECTION();
         }
  
-       /* Compute latestXid while we have the child XIDs handy */
-       latestXid = TransactionIdLatest(xid, nchildren, children);
-
         /*
          * Wait for synchronous replication, if required. Similar to the decision
          * above about using committing asynchronously we only want to wait if
@@ -1353,8 +1269,6 @@ cleanup:
         /* Clean up local data */
         if (rels)
                 pfree(rels);
-
-       return latestXid;
  }
  
  
@@ -1522,15 +1436,11 @@ AtSubCommit_childXids(void)
  
  /*
   *     RecordTransactionAbort
- *
- * Returns latest XID among xact and its children, or InvalidTransactionId
- * if the xact has no XID.  (We compute that here just because it's easier.)
   */
-static TransactionId
+static void
  RecordTransactionAbort(bool isSubXact)
  {
         TransactionId xid = GetCurrentTransactionIdIfAny();
-       TransactionId latestXid;
         int                     nrels;
         RelFileNode *rels;
         int                     nchildren;
@@ -1548,7 +1458,7 @@ RecordTransactionAbort(bool isSubXact)
                 /* Reset XactLastRecEnd until the next transaction writes something */
                 if (!isSubXact)
                         XactLastRecEnd = 0;
-               return InvalidTransactionId;
+               return;
         }
  
         /*
@@ -1611,18 +1521,6 @@ RecordTransactionAbort(bool isSubXact)
  
         END_CRIT_SECTION();
  
-       /* Compute latestXid while we have the child XIDs handy */
-       latestXid = TransactionIdLatest(xid, nchildren, children);
-
-       /*
-        * If we're aborting a subtransaction, we can immediately remove failed
-        * XIDs from PGPROC's cache of running child XIDs.  We do that here for
-        * subxacts, because we already have the child XID array at hand.  For
-        * main xacts, the equivalent happens just after this function returns.
-        */
-       if (isSubXact)
-               XidCacheRemoveRunningXids(xid, nchildren, children, latestXid);
-
         /* Reset XactLastRecEnd until the next transaction writes something */
         if (!isSubXact)
                 XactLastRecEnd = 0;
@@ -1630,8 +1528,6 @@ RecordTransactionAbort(bool isSubXact)
         /* And clean up local data */
         if (rels)
                 pfree(rels);
-
-       return latestXid;
  }
  
  /*
@@ -1857,12 +1753,6 @@ StartTransaction(void)
         currentCommandId = FirstCommandId;
         currentCommandIdUsed = false;
  
-       /*
-        * initialize reported xid accounting
-        */
-       nUnreportedXids = 0;
-       s->didLogXid = false;
-
         /*
          * must initialize resource-management stuff first
          */
@@ -1940,7 +1830,6 @@ static void
  CommitTransaction(void)
  {
         TransactionState s = CurrentTransactionState;
-       TransactionId latestXid;
         bool            is_parallel_worker;
  
         is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS);
@@ -2040,16 +1929,10 @@ CommitTransaction(void)
                  * We need to mark our XIDs as committed in pg_clog.  This is where we
                  * durably commit.
                  */
-               latestXid = RecordTransactionCommit();
+               RecordTransactionCommit();
         }
         else
         {
-               /*
-                * We must not mark our XID committed; the parallel master is
-                * responsible for that.
-                */
-               latestXid = InvalidTransactionId;
-
                 /*
                  * Make sure the master will know about any WAL we wrote before it
                  * commits.
@@ -2064,7 +1947,7 @@ CommitTransaction(void)
          * must be done _before_ releasing locks we hold and _after_
          * RecordTransactionCommit.
          */
-       ProcArrayEndTransaction(MyProc, latestXid);
+       ProcArrayEndTransaction(MyProc);
  
         /*
          * This is all post-commit cleanup.  Note that if an error is raised here,
@@ -2447,7 +2330,6 @@ static void
  AbortTransaction(void)
  {
         TransactionState s = CurrentTransactionState;
-       TransactionId latestXid;
         bool            is_parallel_worker;
  
         /* Prevent cancel/die interrupt while cleaning up */
@@ -2549,11 +2431,9 @@ AbortTransaction(void)
          * record.
          */
         if (!is_parallel_worker)
-               latestXid = RecordTransactionAbort(false);
+               RecordTransactionAbort(false);
         else
         {
-               latestXid = InvalidTransactionId;
-
                 /*
                  * Since the parallel master won't get our value of XactLastRecEnd in
                  * this case, we nudge WAL-writer ourselves in this case.  See related
@@ -2569,7 +2449,7 @@ AbortTransaction(void)
          * must be done _before_ releasing locks we hold and _after_
          * RecordTransactionAbort.
          */
-       ProcArrayEndTransaction(MyProc, latestXid);
+       ProcArrayEndTransaction(MyProc);
  
         /*
          * Post-abort cleanup.  See notes in CommitTransaction() concerning
@@ -5375,9 +5255,12 @@ xact_redo_commit(xl_xact_parsed_commit *parsed,
         if (standbyState == STANDBY_DISABLED)
         {
                 /*
-                * Mark the transaction committed in pg_clog.
+                * Mark the transaction committed in pg_clog. We don't bother updating
+                * pg_csnlog during replay.
                  */
-               TransactionIdCommitTree(xid, parsed->nsubxacts, parsed->subxacts);
+               CLogSetTreeStatus(xid, parsed->nsubxacts, parsed->subxacts,
+                                                 CLOG_XID_STATUS_COMMITTED,
+                                                 InvalidXLogRecPtr);
         }
         else
         {
@@ -5401,14 +5284,7 @@ xact_redo_commit(xl_xact_parsed_commit *parsed,
                  * bits set on changes made by transactions that haven't yet
                  * recovered. It's unlikely but it's good to be safe.
                  */
-               TransactionIdAsyncCommitTree(
-                                                         xid, parsed->nsubxacts, parsed->subxacts, lsn);
-
-               /*
-                * We must mark clog before we update the ProcArray.
-                */
-               ExpireTreeKnownAssignedTransactionIds(
-                                                 xid, parsed->nsubxacts, parsed->subxacts, max_xid);
+               TransactionIdAsyncCommitTree(xid, parsed->nsubxacts, parsed->subxacts, lsn);
  
                 /*
                  * Send any cache invalidations attached to the commit. We must
@@ -5530,8 +5406,13 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid)
  
         if (standbyState == STANDBY_DISABLED)
         {
-               /* Mark the transaction aborted in pg_clog, no need for async stuff */
-               TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts);
+               /*
+                * Mark the transaction aborted in pg_clog, no need for async stuff or
+                * to update pg_csnlog.
+                */
+               CLogSetTreeStatus(xid, parsed->nsubxacts, parsed->subxacts,
+                                                 CLOG_XID_STATUS_ABORTED,
+                                                 InvalidXLogRecPtr);
         }
         else
         {
@@ -5549,12 +5430,6 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid)
                 /* Mark the transaction aborted in pg_clog, no need for async stuff */
                 TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts);
  
-               /*
-                * We must update the ProcArray after we have marked clog.
-                */
-               ExpireTreeKnownAssignedTransactionIds(
-                                                 xid, parsed->nsubxacts, parsed->subxacts, max_xid);
-
                 /*
                  * There are no flat files that need updating, nor invalidation
                  * messages to send or undo.
@@ -5635,14 +5510,6 @@ xact_redo(XLogReaderState *record)
                 RecreateTwoPhaseFile(XLogRecGetXid(record),
                                                   XLogRecGetData(record), XLogRecGetDataLen(record));
         }
-       else if (info == XLOG_XACT_ASSIGNMENT)
-       {
-               xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record);
-
-               if (standbyState >= STANDBY_INITIALIZED)
-                       ProcArrayApplyXidAssignment(xlrec->xtop,
-                                                                               xlrec->nsubxacts, xlrec->xsub);
-       }
         else
                 elog(PANIC, "xact_redo: unknown op code %u", info);
  }
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index f13f9c1fa5e271709a137d517533284c84253afd..308398154c7fa44f1d8db19e3cd8e5c22982df40 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -23,7 +23,9 @@
  
  #include "access/clog.h"
  #include "access/commit_ts.h"
+#include "access/csnlog.h"
  #include "access/multixact.h"
+#include "access/mvccvars.h"
  #include "access/rewriteheap.h"
  #include "access/subtrans.h"
  #include "access/timeline.h"
@@ -1022,8 +1024,6 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn)
          */
         WALInsertLockRelease();
  
-       MarkCurrentTransactionIdLoggedIfAny();
-
         END_CRIT_SECTION();
  
         /*
@@ -4766,6 +4766,7 @@ BootStrapXLOG(void)
         uint64          sysidentifier;
         struct timeval tv;
         pg_crc32c       crc;
+       TransactionId latestCompletedXid;
  
         /*
          * Select a hopefully-unique system identifier code for this installation.
@@ -4820,6 +4821,14 @@ BootStrapXLOG(void)
         ShmemVariableCache->nextXid = checkPoint.nextXid;
         ShmemVariableCache->nextOid = checkPoint.nextOid;
         ShmemVariableCache->oidCount = 0;
+
+       pg_atomic_write_u64(&ShmemVariableCache->nextCommitSeqNo, COMMITSEQNO_FIRST_NORMAL);
+       latestCompletedXid = checkPoint.nextXid;
+       TransactionIdRetreat(latestCompletedXid);
+       pg_atomic_write_u32(&ShmemVariableCache->latestCompletedXid, latestCompletedXid);
+       pg_atomic_write_u32(&ShmemVariableCache->oldestActiveXid, checkPoint.nextXid);
+       pg_atomic_write_u32(&ShmemVariableCache->globalXmin, checkPoint.nextXid);
+
         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
@@ -4912,8 +4921,8 @@ BootStrapXLOG(void)
  
         /* Bootstrap the commit log, too */
         BootStrapCLOG();
+       BootStrapCSNLOG();
         BootStrapCommitTs();
-       BootStrapSUBTRANS();
         BootStrapMultiXact();
  
         pfree(buffer);
@@ -5930,6 +5939,7 @@ StartupXLOG(void)
         XLogPageReadPrivate private;
         bool            fast_promoted = false;
         struct stat st;
+       TransactionId latestCompletedXid;
  
         /*
          * Read control file and check XLOG status looks valid.
@@ -6346,6 +6356,13 @@ StartupXLOG(void)
         XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
         XLogCtl->ckptXid = checkPoint.nextXid;
  
+       pg_atomic_write_u64(&ShmemVariableCache->nextCommitSeqNo, COMMITSEQNO_FIRST_NORMAL);
+       latestCompletedXid = checkPoint.nextXid;
+       TransactionIdRetreat(latestCompletedXid);
+       pg_atomic_write_u32(&ShmemVariableCache->latestCompletedXid, latestCompletedXid);
+       pg_atomic_write_u32(&ShmemVariableCache->oldestActiveXid, checkPoint.nextXid);
+       pg_atomic_write_u32(&ShmemVariableCache->globalXmin, checkPoint.nextXid);
+
         /*
          * Initialize replication slots, before there's a chance to remove
          * required resources.
@@ -6588,15 +6605,15 @@ StartupXLOG(void)
                         Assert(TransactionIdIsValid(oldestActiveXID));
  
                         /* Tell procarray about the range of xids it has to deal with */
-                       ProcArrayInitRecovery(ShmemVariableCache->nextXid);
+                       ProcArrayInitRecovery(oldestActiveXID, ShmemVariableCache->nextXid);
  
                         /*
-                        * Startup commit log and subtrans only.  MultiXact and commit
+                        * Startup commit log and csnlog only.  MultiXact and commit
                          * timestamp have already been started up and other SLRUs are not
                          * maintained during recovery and need not be started yet.
                          */
                         StartupCLOG();
-                       StartupSUBTRANS(oldestActiveXID);
+                       StartupCSNLOG(oldestActiveXID);
  
                         /*
                          * If we're beginning at a shutdown checkpoint, we know that
@@ -6607,7 +6624,6 @@ StartupXLOG(void)
                         if (wasShutdown)
                         {
                                 RunningTransactionsData running;
-                               TransactionId latestCompletedXid;
  
                                 /*
                                  * Construct a RunningTransactions snapshot representing a
@@ -6615,16 +6631,8 @@ StartupXLOG(void)
                                  * alive. We're never overflowed at this point because all
                                  * subxids are listed with their parent prepared transactions.
                                  */
-                               running.xcnt = nxids;
-                               running.subxcnt = 0;
-                               running.subxid_overflow = false;
                                 running.nextXid = checkPoint.nextXid;
                                 running.oldestRunningXid = oldestActiveXID;
-                               latestCompletedXid = checkPoint.nextXid;
-                               TransactionIdRetreat(latestCompletedXid);
-                               Assert(TransactionIdIsNormal(latestCompletedXid));
-                               running.latestCompletedXid = latestCompletedXid;
-                               running.xids = xids;
  
                                 ProcArrayApplyRecoveryInfo(&running);
  
@@ -7358,20 +7366,22 @@ StartupXLOG(void)
         /* start the archive_timeout timer running */
         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
  
-       /* also initialize latestCompletedXid, to nextXid - 1 */
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-       ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
-       TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
-       LWLockRelease(ProcArrayLock);
+       /* also initialize latestCompletedXid, to nextXid - 1, and oldestActiveXid */
+       latestCompletedXid = ShmemVariableCache->nextXid;
+       TransactionIdRetreat(latestCompletedXid);
+       pg_atomic_write_u32(&ShmemVariableCache->latestCompletedXid,
+                                               latestCompletedXid);
+       pg_atomic_write_u32(&ShmemVariableCache->oldestActiveXid,
+                                               oldestActiveXID);
  
         /*
-        * Start up the commit log and subtrans, if not already done for hot
+        * Start up the commit log and csnlog, if not already done for hot
          * standby.  (commit timestamps are started below, if necessary.)
          */
         if (standbyState == STANDBY_DISABLED)
         {
                 StartupCLOG();
-               StartupSUBTRANS(oldestActiveXID);
+               StartupCSNLOG(oldestActiveXID);
         }
  
         /*
@@ -7975,8 +7985,8 @@ ShutdownXLOG(int code, Datum arg)
                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
         }
         ShutdownCLOG();
+       ShutdownCSNLOG();
         ShutdownCommitTs();
-       ShutdownSUBTRANS();
         ShutdownMultiXact();
  }
  
@@ -8546,14 +8556,14 @@ CreateCheckPoint(int flags)
                 PreallocXlogFiles(recptr);
  
         /*
-        * Truncate pg_subtrans if possible.  We can throw away all data before
+        * Truncate pg_csnlog if possible.  We can throw away all data before
          * the oldest XMIN of any running transaction.  No future transaction will
-        * attempt to reference any pg_subtrans entry older than that (see Asserts
-        * in subtrans.c).  During recovery, though, we mustn't do this because
-        * StartupSUBTRANS hasn't been called yet.
+        * attempt to reference any pg_csnlog entry older than that (see Asserts
+        * in csnlog.c).  During recovery, though, we mustn't do this because
+        * StartupCSNLOG hasn't been called yet.
          */
         if (!RecoveryInProgress())
-               TruncateSUBTRANS(GetOldestXmin(NULL, false));
+               TruncateCSNLOG(GetOldestXmin(NULL, false));
  
         /* Real work is done, but log and update stats before releasing lock. */
         LogCheckpointEnd(false);
@@ -8629,13 +8639,12 @@ static void
  CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
  {
         CheckPointCLOG();
+       CheckPointCSNLOG();
         CheckPointCommitTs();
-       CheckPointSUBTRANS();
         CheckPointMultiXact();
         CheckPointPredicate();
         CheckPointRelationMap();
         CheckPointReplicationSlots();
-       CheckPointSnapBuild();
         CheckPointLogicalRewriteHeap();
         CheckPointBuffers(flags);       /* performs all required fsyncs */
         CheckPointReplicationOrigin();
@@ -8885,14 +8894,14 @@ CreateRestartPoint(int flags)
         }
  
         /*
-        * Truncate pg_subtrans if possible.  We can throw away all data before
+        * Truncate pg_csnlog if possible.  We can throw away all data before
          * the oldest XMIN of any running transaction.  No future transaction will
-        * attempt to reference any pg_subtrans entry older than that (see Asserts
-        * in subtrans.c).  When hot standby is disabled, though, we mustn't do
-        * this because StartupSUBTRANS hasn't been called yet.
+        * attempt to reference any pg_csnlog entry older than that (see Asserts
+        * in csnlog.c).  When hot standby is disabled, though, we mustn't do
+        * this because StartupCSNLOG hasn't been called yet.
          */
         if (EnableHotStandby)
-               TruncateSUBTRANS(GetOldestXmin(NULL, false));
+               TruncateCSNLOG(GetOldestXmin(NULL, false));
  
         /* Real work is done, but log and update before releasing lock. */
         LogCheckpointEnd(true);
@@ -9271,7 +9280,6 @@ xlog_redo(XLogReaderState *record)
                         TransactionId *xids;
                         int                     nxids;
                         TransactionId oldestActiveXID;
-                       TransactionId latestCompletedXid;
                         RunningTransactionsData running;
  
                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
@@ -9282,16 +9290,8 @@ xlog_redo(XLogReaderState *record)
                          * never overflowed at this point because all subxids are listed
                          * with their parent prepared transactions.
                          */
-                       running.xcnt = nxids;
-                       running.subxcnt = 0;
-                       running.subxid_overflow = false;
                         running.nextXid = checkPoint.nextXid;
                         running.oldestRunningXid = oldestActiveXID;
-                       latestCompletedXid = checkPoint.nextXid;
-                       TransactionIdRetreat(latestCompletedXid);
-                       Assert(TransactionIdIsNormal(latestCompletedXid));
-                       running.latestCompletedXid = latestCompletedXid;
-                       running.xids = xids;
  
                         ProcArrayApplyRecoveryInfo(&running);
  
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c

index e997b574ca9eaf93514817464b3a77467d1a5371..d47bdd07e34ba1ce13473a24fd4a9e5b1e550b76 100644 (file)
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -64,6 +64,7 @@
  #include "parser/parse_expr.h"
  #include "parser/parse_relation.h"
  #include "storage/predicate.h"
+#include "storage/procarray.h"
  #include "storage/smgr.h"
  #include "utils/acl.h"
  #include "utils/builtins.h"
@@ -895,7 +896,7 @@ AddNewRelationTuple(Relation pg_class_desc,
                  * We know that no xacts older than RecentXmin are still running, so
                  * that will do.
                  */
-               new_rel_reltup->relfrozenxid = RecentXmin;
+               new_rel_reltup->relfrozenxid = GetOldestActiveTransactionId();
  
                 /*
                  * Similarly, initialize the minimum Multixact to the first value that
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c

index 716f1c33183da6e2cc0d50bf463098bfcbf213a6..fb77e5f85df2d84453690e8e8c9d2ab27000e291 100644 (file)
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -1928,27 +1928,21 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
                 /* Ignore messages destined for other databases */
                 if (qe->dboid == MyDatabaseId)
                 {
-                       if (TransactionIdIsInProgress(qe->xid))
+                       TransactionIdStatus xidstatus = TransactionIdGetStatus(qe->xid);
+
+                       if (xidstatus == XID_INPROGRESS)
                         {
                                 /*
                                  * The source transaction is still in progress, so we can't
                                  * process this message yet.  Break out of the loop, but first
                                  * back up *current so we will reprocess the message next
-                                * time.  (Note: it is unlikely but not impossible for
-                                * TransactionIdDidCommit to fail, so we can't really avoid
-                                * this advance-then-back-up behavior when dealing with an
-                                * uncommitted message.)
-                                *
-                                * Note that we must test TransactionIdIsInProgress before we
-                                * test TransactionIdDidCommit, else we might return a message
-                                * from a transaction that is not yet visible to snapshots;
-                                * compare the comments at the head of tqual.c.
+                                * time.
                                  */
                                 *current = thisentry;
                                 reachedStop = true;
                                 break;
                         }
-                       else if (TransactionIdDidCommit(qe->xid))
+                       else if (xidstatus == XID_COMMITTED)
                         {
                                 /* qe->data is the null-terminated channel name */
                                 char       *channel = qe->data;
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c

index 6cddcbd02c380f7243cbf97ffeb41347d5a15e5b..4bcacfbe747a6a8ad49dd95fe3e41b7eb80acd02 100644 (file)
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -32,6 +32,7 @@
  #include "parser/parse_relation.h"
  #include "rewrite/rewriteHandler.h"
  #include "storage/lmgr.h"
+#include "storage/procarray.h"
  #include "storage/smgr.h"
  #include "tcop/tcopprot.h"
  #include "utils/builtins.h"
@@ -820,7 +821,8 @@ static void
  refresh_by_heap_swap(Oid matviewOid, Oid OIDNewHeap, char relpersistence)
  {
         finish_heap_swap(matviewOid, OIDNewHeap, false, false, true, true,
-                                        RecentXmin, ReadNextMultiXactId(), relpersistence);
+                                        GetOldestActiveTransactionId(), ReadNextMultiXactId(),
+                                        relpersistence);
  }
  
  
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c

index 86e98148c1667e1b5cf04146e4945f1f5b5c8b42..38ca2d37c5a09823e3bee0223b75bb4bb985f7b9 100644 (file)
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -82,6 +82,7 @@
  #include "storage/lmgr.h"
  #include "storage/lock.h"
  #include "storage/predicate.h"
+#include "storage/procarray.h"
  #include "storage/smgr.h"
  #include "utils/acl.h"
  #include "utils/builtins.h"
@@ -1219,7 +1220,7 @@ ExecuteTruncate(TruncateStmt *stmt)
                          * deletion at commit.
                          */
                         RelationSetNewRelfilenode(rel, rel->rd_rel->relpersistence,
-                                                                         RecentXmin, minmulti);
+                                                                         GetOldestActiveTransactionId(), minmulti);
                         if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
                                 heap_create_init_fork(rel);
  
@@ -1233,7 +1234,7 @@ ExecuteTruncate(TruncateStmt *stmt)
                         {
                                 rel = relation_open(toast_relid, AccessExclusiveLock);
                                 RelationSetNewRelfilenode(rel, rel->rd_rel->relpersistence,
-                                                                                 RecentXmin, minmulti);
+                                                                                 GetOldestActiveTransactionId(), minmulti);
                                 if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
                                         heap_create_init_fork(rel);
                                 heap_close(rel, NoLock);
@@ -3868,7 +3869,7 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode)
                         finish_heap_swap(tab->relid, OIDNewHeap,
                                                          false, false, true,
                                                          !OidIsValid(tab->newTableSpace),
-                                                        RecentXmin,
+                                                        GetOldestActiveTransactionId(),
                                                          ReadNextMultiXactId(),
                                                          persistence);
                 }
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c

index 46cd5ba1f2ded5b68ce5246384d19f4c53ecca58..ff1a2427d912404da5a5dfd5aa363f1276efe489 100644 (file)
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -165,7 +165,6 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor
  static void
  DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
  {
-       SnapBuild  *builder = ctx->snapshot_builder;
         uint8           info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK;
  
         ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(buf->record),
@@ -176,8 +175,6 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
                         /* this is also used in END_OF_RECOVERY checkpoints */
                 case XLOG_CHECKPOINT_SHUTDOWN:
                 case XLOG_END_OF_RECOVERY:
-                       SnapBuildSerializationPoint(builder, buf->origptr);
-
                         break;
                 case XLOG_CHECKPOINT_ONLINE:
  
@@ -217,8 +214,11 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
          * ok not to call ReorderBufferProcessXid() in that case, except in the
          * assignment case there'll not be any later records with the same xid;
          * and in the assignment case we'll not decode those xacts.
+        *
+        * FIXME: the assignment record is no more. I don't understand the above
+        * comment. Can it be just removed?
          */
-       if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+       if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
                 return;
  
         switch (info)
@@ -259,23 +259,6 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
                                 DecodeAbort(ctx, buf, &parsed, xid);
                                 break;
                         }
-               case XLOG_XACT_ASSIGNMENT:
-                       {
-                               xl_xact_assignment *xlrec;
-                               int                     i;
-                               TransactionId *sub_xid;
-
-                               xlrec = (xl_xact_assignment *) XLogRecGetData(r);
-
-                               sub_xid = &xlrec->xsub[0];
-
-                               for (i = 0; i < xlrec->nsubxacts; i++)
-                               {
-                                       ReorderBufferAssignChild(reorder, xlrec->xtop,
-                                                                                        *(sub_xid++), buf->origptr);
-                               }
-                               break;
-                       }
                 case XLOG_XACT_PREPARE:
  
                         /*
@@ -354,7 +337,7 @@ DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
         ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr);
  
         /* no point in doing anything yet */
-       if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+       if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
                 return;
  
         switch (info)
@@ -409,7 +392,7 @@ DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
         ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr);
  
         /* no point in doing anything yet */
-       if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+       if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
                 return;
  
         switch (info)
@@ -502,7 +485,7 @@ DecodeLogicalMsgOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
         ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr);
  
         /* No point in doing anything yet. */
-       if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+       if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
                 return;
  
         message = (xl_logical_message *) XLogRecGetData(r);
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c

index ecf9a03318044a438b6549a7555faf1b0c723603..9dd658cc33fc7ed3b913b289ec49156e5957797d 100644 (file)
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -113,7 +113,6 @@ CheckLogicalDecodingRequirements(void)
  static LogicalDecodingContext *
  StartupDecodingContext(List *output_plugin_options,
                                            XLogRecPtr start_lsn,
-                                          TransactionId xmin_horizon,
                                            XLogPageReadCB read_page,
                                            LogicalOutputPluginWriterPrepareWrite prepare_write,
                                            LogicalOutputPluginWriterWrite do_write)
@@ -173,7 +172,7 @@ StartupDecodingContext(List *output_plugin_options,
  
         ctx->reorder = ReorderBufferAllocate();
         ctx->snapshot_builder =
-               AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn);
+               AllocateSnapshotBuilder(ctx->reorder, start_lsn);
  
         ctx->reorder->private_data = ctx;
  
@@ -216,7 +215,6 @@ CreateInitDecodingContext(char *plugin,
                                                   LogicalOutputPluginWriterPrepareWrite prepare_write,
                                                   LogicalOutputPluginWriterWrite do_write)
  {
-       TransactionId xmin_horizon = InvalidTransactionId;
         ReplicationSlot *slot;
         LogicalDecodingContext *ctx;
         MemoryContext old_context;
@@ -280,16 +278,10 @@ CreateInitDecodingContext(char *plugin,
  
         LWLockRelease(ProcArrayLock);
  
-       /*
-        * tell the snapshot builder to only assemble snapshot once reaching the
-        * running_xact's record with the respective xmin.
-        */
-       xmin_horizon = slot->data.catalog_xmin;
-
         ReplicationSlotMarkDirty();
         ReplicationSlotSave();
  
-       ctx = StartupDecodingContext(NIL, InvalidXLogRecPtr, xmin_horizon,
+       ctx = StartupDecodingContext(NIL, InvalidXLogRecPtr,
                                                                  read_page, prepare_write, do_write);
  
         /* call output plugin initialization callback */
@@ -379,7 +371,7 @@ CreateDecodingContext(XLogRecPtr start_lsn,
         }
  
         ctx = StartupDecodingContext(output_plugin_options,
-                                                                start_lsn, InvalidTransactionId,
+                                                                start_lsn,
                                                                  read_page, prepare_write, do_write);
  
         /* call output plugin initialization callback */
@@ -749,12 +741,12 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
  }
  
  /*
- * Set the required catalog xmin horizon for historic snapshots in the current
- * replication slot.
+ * Set the oldest snapshot required for historic catalog lookups in the
+ * current replication slot.
   *
- * Note that in the most cases, we won't be able to immediately use the xmin
- * to increase the xmin horizon: we need to wait till the client has confirmed
- * receiving current_lsn with LogicalConfirmReceivedLocation().
+ * Note that in most cases, we won't be able to immediately use the
+ * snapshot to advance the oldest snapshot: we need to wait until the client
+ * has confirmed receiving current_lsn with LogicalConfirmReceivedLocation().
   */
  void
  LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin)
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c

index 213ce34674ced87e1c010ed617e4458cee975668..bc744d215661d212828bedd447143f0b2d95d6b7 100644 (file)
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -1275,7 +1275,6 @@ ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
         Size            size;
  
         size = sizeof(SnapshotData) +
-               sizeof(TransactionId) * orig_snap->xcnt +
                 sizeof(TransactionId) * (txn->nsubtxns + 1);
  
         snap = MemoryContextAllocZero(rb->context, size);
@@ -1284,36 +1283,33 @@ ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
         snap->copied = true;
         snap->active_count = 1;         /* mark as active so nobody frees it */
         snap->regd_count = 0;
-       snap->xip = (TransactionId *) (snap + 1);
-
-       memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
  
         /*
          * snap->subxip contains all txids that belong to our transaction which we
          * need to check via cmin/cmax. Thats why we store the toplevel
          * transaction in there as well.
          */
-       snap->subxip = snap->xip + snap->xcnt;
-       snap->subxip[i++] = txn->xid;
+       snap->this_xip = (TransactionId *) (snap + 1);
+       snap->this_xip[i++] = txn->xid;
  
         /*
          * nsubxcnt isn't decreased when subtransactions abort, so count manually.
          * Since it's an upper boundary it is safe to use it for the allocation
          * above.
          */
-       snap->subxcnt = 1;
+       snap->this_xcnt = 1;
  
         dlist_foreach(iter, &txn->subtxns)
         {
                 ReorderBufferTXN *sub_txn;
  
                 sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
-               snap->subxip[i++] = sub_txn->xid;
-               snap->subxcnt++;
+               snap->this_xip[i++] = sub_txn->xid;
+               snap->this_xcnt++;
         }
  
         /* sort so we can bsearch() later */
-       qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
+       qsort(snap->this_xip, snap->this_xcnt, sizeof(TransactionId), xidComparator);
  
         /* store the specified current CommandId */
         snap->curcid = cid;
@@ -1389,6 +1385,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
         }
  
         snapshot_now = txn->base_snapshot;
+       Assert(snapshot_now->snapshotcsn != InvalidCommitSeqNo);
  
         /* build data to be able to lookup the CommandIds of catalog tuples */
         ReorderBufferBuildTupleCidHash(rb, txn);
@@ -2277,10 +2274,7 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
  
                                 snap = change->data.snapshot;
  
-                               sz += sizeof(SnapshotData) +
-                                       sizeof(TransactionId) * snap->xcnt +
-                                       sizeof(TransactionId) * snap->subxcnt
-                                       ;
+                               sz += sizeof(SnapshotData);
  
                                 /* make sure we have enough space */
                                 ReorderBufferSerializeReserve(rb, sz);
@@ -2290,20 +2284,6 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
  
                                 memcpy(data, snap, sizeof(SnapshotData));
                                 data += sizeof(SnapshotData);
-
-                               if (snap->xcnt)
-                               {
-                                       memcpy(data, snap->xip,
-                                                  sizeof(TransactionId) * snap->xcnt);
-                                       data += sizeof(TransactionId) * snap->xcnt;
-                               }
-
-                               if (snap->subxcnt)
-                               {
-                                       memcpy(data, snap->subxip,
-                                                  sizeof(TransactionId) * snap->subxcnt);
-                                       data += sizeof(TransactionId) * snap->subxcnt;
-                               }
                                 break;
                         }
                 case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
@@ -2563,24 +2543,16 @@ ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
                         }
                 case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
                         {
-                               Snapshot        oldsnap;
                                 Snapshot        newsnap;
                                 Size            size;
  
-                               oldsnap = (Snapshot) data;
-
-                               size = sizeof(SnapshotData) +
-                                       sizeof(TransactionId) * oldsnap->xcnt +
-                                       sizeof(TransactionId) * (oldsnap->subxcnt + 0);
+                               size = sizeof(SnapshotData);
  
                                 change->data.snapshot = MemoryContextAllocZero(rb->context, size);
  
                                 newsnap = change->data.snapshot;
  
                                 memcpy(newsnap, data, size);
-                               newsnap->xip = (TransactionId *)
-                                       (((char *) newsnap) + sizeof(SnapshotData));
-                               newsnap->subxip = newsnap->xip + newsnap->xcnt;
                                 newsnap->copied = true;
                                 break;
                         }
@@ -3230,7 +3202,7 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
                         continue;
  
                 /* not for our transaction */
-               if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
+               if (!TransactionIdInArray(f_mapped_xid, snapshot->this_xip, snapshot->this_xcnt))
                         continue;
  
                 /* ok, relevant, queue for apply */
@@ -3258,7 +3230,7 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
                 RewriteMappingFile *f = files_a[off];
  
                 elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
-                        snapshot->subxip[0]);
+                        snapshot->this_xip[0]);
                 ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
                 pfree(f);
         }
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c

index b5fa3dbbc0bb7a4be872087933fb386348a88477..3ce4e0e375a50dfa96acc7c660b026ba9b32b8bc 100644 (file)
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -152,90 +152,24 @@ struct SnapBuild
         /* all transactions >= than this are uncommitted */
         TransactionId xmax;
  
+       /* this determines the state of transactions between xmin and xmax */
+       CommitSeqNo snapshotcsn;
+
         /*
          * Don't replay commits from an LSN < this LSN. This can be set externally
          * but it will also be advanced (never retreat) from within snapbuild.c.
          */
         XLogRecPtr      start_decoding_at;
  
-       /*
-        * Don't start decoding WAL until the "xl_running_xacts" information
-        * indicates there are no running xids with an xid smaller than this.
-        */
-       TransactionId initial_xmin_horizon;
-
         /*
          * Snapshot that's valid to see the catalog state seen at this moment.
          */
         Snapshot        snapshot;
  
-       /*
-        * LSN of the last location we are sure a snapshot has been serialized to.
-        */
-       XLogRecPtr      last_serialized_snapshot;
-
         /*
          * The reorderbuffer we need to update with usable snapshots et al.
          */
         ReorderBuffer *reorder;
-
-       /*
-        * Information about initially running transactions
-        *
-        * When we start building a snapshot there already may be transactions in
-        * progress.  Those are stored in running.xip.  We don't have enough
-        * information about those to decode their contents, so until they are
-        * finished (xcnt=0) we cannot switch to a CONSISTENT state.
-        */
-       struct
-       {
-               /*
-                * As long as running.xcnt all XIDs < running.xmin and > running.xmax
-                * have to be checked whether they still are running.
-                */
-               TransactionId xmin;
-               TransactionId xmax;
-
-               size_t          xcnt;           /* number of used xip entries */
-               size_t          xcnt_space; /* allocated size of xip */
-               TransactionId *xip;             /* running xacts array, xidComparator-sorted */
-       }                       running;
-
-       /*
-        * Array of transactions which could have catalog changes that committed
-        * between xmin and xmax.
-        */
-       struct
-       {
-               /* number of committed transactions */
-               size_t          xcnt;
-
-               /* available space for committed transactions */
-               size_t          xcnt_space;
-
-               /*
-                * Until we reach a CONSISTENT state, we record commits of all
-                * transactions, not just the catalog changing ones. Record when that
-                * changes so we know we cannot export a snapshot safely anymore.
-                */
-               bool            includes_all_transactions;
-
-               /*
-                * Array of committed transactions that have modified the catalog.
-                *
-                * As this array is frequently modified we do *not* keep it in
-                * xidComparator order. Instead we sort the array when building &
-                * distributing a snapshot.
-                *
-                * TODO: It's unclear whether that reasoning has much merit. Every
-                * time we add something here after becoming consistent will also
-                * require distributing a snapshot. Storing them sorted would
-                * potentially also make it easier to purge (but more complicated wrt
-                * wraparound?). Should be improved if sorting while building the
-                * snapshot shows up in profiles.
-                */
-               TransactionId *xip;
-       }                       committed;
  };
  
  /*
@@ -245,15 +179,6 @@ struct SnapBuild
  static ResourceOwner SavedResourceOwnerDuringExport = NULL;
  static bool ExportInProgress = false;
  
-/* transaction state manipulation functions */
-static void SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid);
-
-/* ->running manipulation */
-static bool SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid);
-
-/* ->committed manipulation */
-static void SnapBuildPurgeCommittedTxn(SnapBuild *builder);
-
  /* snapshot building/manipulation/distribution functions */
  static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid);
  
@@ -263,13 +188,6 @@ static void SnapBuildSnapIncRefcount(Snapshot snap);
  
  static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn);
  
-/* xlog reading helper functions for SnapBuildProcessRecord */
-static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
-
-/* serialization functions */
-static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
-static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
-
  
  /*
   * Allocate a new snapshot builder.
@@ -279,7 +197,6 @@ static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
   */
  SnapBuild *
  AllocateSnapshotBuilder(ReorderBuffer *reorder,
-                                               TransactionId xmin_horizon,
                                                 XLogRecPtr start_lsn)
  {
         MemoryContext context;
@@ -301,13 +218,6 @@ AllocateSnapshotBuilder(ReorderBuffer *reorder,
         builder->reorder = reorder;
         /* Other struct members initialized by zeroing via palloc0 above */
  
-       builder->committed.xcnt = 0;
-       builder->committed.xcnt_space = 128;            /* arbitrary number */
-       builder->committed.xip =
-               palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
-       builder->committed.includes_all_transactions = true;
-
-       builder->initial_xmin_horizon = xmin_horizon;
         builder->start_decoding_at = start_lsn;
  
         MemoryContextSwitchTo(oldcontext);
@@ -345,7 +255,6 @@ SnapBuildFreeSnapshot(Snapshot snap)
  
         /* make sure nobody modified our snapshot */
         Assert(snap->curcid == FirstCommandId);
-       Assert(!snap->suboverflowed);
         Assert(!snap->takenDuringRecovery);
         Assert(snap->regd_count == 0);
  
@@ -403,7 +312,6 @@ SnapBuildSnapDecRefcount(Snapshot snap)
  
         /* make sure nobody modified our snapshot */
         Assert(snap->curcid == FirstCommandId);
-       Assert(!snap->suboverflowed);
         Assert(!snap->takenDuringRecovery);
  
         Assert(snap->regd_count == 0);
@@ -433,10 +341,9 @@ SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid)
         Snapshot        snapshot;
         Size            ssize;
  
-       Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
+       Assert(builder->state >= SNAPBUILD_CONSISTENT);
  
         ssize = sizeof(SnapshotData)
-               + sizeof(TransactionId) * builder->committed.xcnt
                 + sizeof(TransactionId) * 1 /* toplevel xid */ ;
  
         snapshot = MemoryContextAllocZero(builder->context, ssize);
@@ -444,52 +351,34 @@ SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid)
         snapshot->satisfies = HeapTupleSatisfiesHistoricMVCC;
  
         /*
-        * We misuse the original meaning of SnapshotData's xip and subxip fields
-        * to make the more fitting for our needs.
-        *
-        * In the 'xip' array we store transactions that have to be treated as
-        * committed. Since we will only ever look at tuples from transactions
-        * that have modified the catalog it's more efficient to store those few
-        * that exist between xmin and xmax (frequently there are none).
-        *
          * Snapshots that are used in transactions that have modified the catalog
-        * also use the 'subxip' array to store their toplevel xid and all the
+        * use the 'this_xip' array to store their toplevel xid and all the
          * subtransaction xids so we can recognize when we need to treat rows as
-        * visible that are not in xip but still need to be visible. Subxip only
+        * visible that would not normally be visible by the CSN test. this_xip only
          * gets filled when the transaction is copied into the context of a
          * catalog modifying transaction since we otherwise share a snapshot
          * between transactions. As long as a txn hasn't modified the catalog it
          * doesn't need to treat any uncommitted rows as visible, so there is no
          * need for those xids.
          *
-        * Both arrays are qsort'ed so that we can use bsearch() on them.
+        * The this_xip array is qsort'ed so that we can use bsearch() on it.
          */
         Assert(TransactionIdIsNormal(builder->xmin));
         Assert(TransactionIdIsNormal(builder->xmax));
+       Assert(builder->snapshotcsn != InvalidCommitSeqNo);
  
         snapshot->xmin = builder->xmin;
         snapshot->xmax = builder->xmax;
-
-       /* store all transactions to be treated as committed by this snapshot */
-       snapshot->xip =
-               (TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
-       snapshot->xcnt = builder->committed.xcnt;
-       memcpy(snapshot->xip,
-                  builder->committed.xip,
-                  builder->committed.xcnt * sizeof(TransactionId));
-
-       /* sort so we can bsearch() */
-       qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
+       snapshot->snapshotcsn = builder->snapshotcsn;
  
         /*
-        * Initially, subxip is empty, i.e. it's a snapshot to be used by
+        * Initially, this_xip is empty, i.e. it's a snapshot to be used by
          * transactions that don't modify the catalog. Will be filled by
          * ReorderBufferCopySnap() if necessary.
          */
-       snapshot->subxcnt = 0;
-       snapshot->subxip = NULL;
+       snapshot->this_xcnt = 0;
+       snapshot->this_xip = NULL;
  
-       snapshot->suboverflowed = false;
         snapshot->takenDuringRecovery = false;
         snapshot->copied = false;
         snapshot->curcid = FirstCommandId;
@@ -515,19 +404,13 @@ SnapBuildExportSnapshot(SnapBuild *builder)
  {
         Snapshot        snap;
         char       *snapname;
-       TransactionId xid;
-       TransactionId *newxip;
-       int                     newxcnt = 0;
  
         if (builder->state != SNAPBUILD_CONSISTENT)
                 elog(ERROR, "cannot export a snapshot before reaching a consistent state");
  
-       if (!builder->committed.includes_all_transactions)
-               elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore");
-
         /* so we don't overwrite the existing value */
-       if (TransactionIdIsValid(MyPgXact->xmin))
-               elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid");
+       if (TransactionIdIsValid(MyPgXact->snapshotcsn))
+               elog(ERROR, "cannot export a snapshot when MyPgXact->snapshotcsn already is valid");
  
         if (IsTransactionOrTransactionBlock())
                 elog(ERROR, "cannot export a snapshot from within a transaction");
@@ -553,42 +436,7 @@ SnapBuildExportSnapshot(SnapBuild *builder)
          * mechanism. Due to that we can do this without locks, we're only
          * changing our own value.
          */
-       MyPgXact->xmin = snap->xmin;
-
-       /* allocate in transaction context */
-       newxip = (TransactionId *)
-               palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount());
-
-       /*
-        * snapbuild.c builds transactions in an "inverted" manner, which means it
-        * stores committed transactions in ->xip, not ones in progress. Build a
-        * classical snapshot by marking all non-committed transactions as
-        * in-progress. This can be expensive.
-        */
-       for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
-       {
-               void       *test;
-
-               /*
-                * Check whether transaction committed using the decoding snapshot
-                * meaning of ->xip.
-                */
-               test = bsearch(&xid, snap->xip, snap->xcnt,
-                                          sizeof(TransactionId), xidComparator);
-
-               if (test == NULL)
-               {
-                       if (newxcnt >= GetMaxSnapshotXidCount())
-                               elog(ERROR, "snapshot too large");
-
-                       newxip[newxcnt++] = xid;
-               }
-
-               TransactionIdAdvance(xid);
-       }
-
-       snap->xcnt = newxcnt;
-       snap->xip = newxip;
+       MyPgXact->snapshotcsn = snap->snapshotcsn;
  
         /*
          * now that we've built a plain snapshot, use the normal mechanisms for
@@ -597,10 +445,10 @@ SnapBuildExportSnapshot(SnapBuild *builder)
         snapname = ExportSnapshot(snap);
  
         ereport(LOG,
-                       (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID",
-               "exported logical decoding snapshot: \"%s\" with %u transaction IDs",
-                                                  snap->xcnt,
-                                                  snapname, snap->xcnt)));
+                       (errmsg("exported logical decoding snapshot: \"%s\" at %X/%X",
+                                       snapname,
+                                       (uint32) (snap->snapshotcsn >> 32),
+                                       (uint32) snap->snapshotcsn)));
         return snapname;
  }
  
@@ -658,16 +506,7 @@ SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
          * We can't handle data in transactions if we haven't built a snapshot
          * yet, so don't store them.
          */
-       if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
-               return false;
-
-       /*
-        * No point in keeping track of changes in transactions that we don't have
-        * enough information about to decode. This means that they started before
-        * we got into the SNAPBUILD_FULL_SNAPSHOT state.
-        */
-       if (builder->state < SNAPBUILD_CONSISTENT &&
-               SnapBuildTxnIsRunning(builder, xid))
+       if (builder->state < SNAPBUILD_CONSISTENT)
                 return false;
  
         /*
@@ -735,38 +574,6 @@ SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
         ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
  }
  
-/*
- * Check whether `xid` is currently 'running'.
- *
- * Running transactions in our parlance are transactions which we didn't
- * observe from the start so we can't properly decode their contents. They
- * only exist after we freshly started from an < CONSISTENT snapshot.
- */
-static bool
-SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid)
-{
-       Assert(builder->state < SNAPBUILD_CONSISTENT);
-       Assert(TransactionIdIsNormal(builder->running.xmin));
-       Assert(TransactionIdIsNormal(builder->running.xmax));
-
-       if (builder->running.xcnt &&
-               NormalTransactionIdFollows(xid, builder->running.xmin) &&
-               NormalTransactionIdPrecedes(xid, builder->running.xmax))
-       {
-               TransactionId *search =
-               bsearch(&xid, builder->running.xip, builder->running.xcnt_space,
-                               sizeof(TransactionId), xidComparator);
-
-               if (search != NULL)
-               {
-                       Assert(*search == xid);
-                       return true;
-               }
-       }
-
-       return false;
-}
-
  /*
   * Add a new Snapshot to all transactions we're decoding that currently are
   * in-progress so they can see new catalog contents made by the transaction
@@ -818,133 +625,6 @@ SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
         }
  }
  
-/*
- * Keep track of a new catalog changing transaction that has committed.
- */
-static void
-SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
-{
-       Assert(TransactionIdIsValid(xid));
-
-       if (builder->committed.xcnt == builder->committed.xcnt_space)
-       {
-               builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
-
-               elog(DEBUG1, "increasing space for committed transactions to %u",
-                        (uint32) builder->committed.xcnt_space);
-
-               builder->committed.xip = repalloc(builder->committed.xip,
-                                         builder->committed.xcnt_space * sizeof(TransactionId));
-       }
-
-       /*
-        * TODO: It might make sense to keep the array sorted here instead of
-        * doing it every time we build a new snapshot. On the other hand this
-        * gets called repeatedly when a transaction with subtransactions commits.
-        */
-       builder->committed.xip[builder->committed.xcnt++] = xid;
-}
-
-/*
- * Remove knowledge about transactions we treat as committed that are smaller
- * than ->xmin. Those won't ever get checked via the ->committed array but via
- * the clog machinery, so we don't need to waste memory on them.
- */
-static void
-SnapBuildPurgeCommittedTxn(SnapBuild *builder)
-{
-       int                     off;
-       TransactionId *workspace;
-       int                     surviving_xids = 0;
-
-       /* not ready yet */
-       if (!TransactionIdIsNormal(builder->xmin))
-               return;
-
-       /* TODO: Neater algorithm than just copying and iterating? */
-       workspace =
-               MemoryContextAlloc(builder->context,
-                                                  builder->committed.xcnt * sizeof(TransactionId));
-
-       /* copy xids that still are interesting to workspace */
-       for (off = 0; off < builder->committed.xcnt; off++)
-       {
-               if (NormalTransactionIdPrecedes(builder->committed.xip[off],
-                                                                               builder->xmin))
-                       ;                                       /* remove */
-               else
-                       workspace[surviving_xids++] = builder->committed.xip[off];
-       }
-
-       /* copy workspace back to persistent state */
-       memcpy(builder->committed.xip, workspace,
-                  surviving_xids * sizeof(TransactionId));
-
-       elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
-                (uint32) builder->committed.xcnt, (uint32) surviving_xids,
-                builder->xmin, builder->xmax);
-       builder->committed.xcnt = surviving_xids;
-
-       pfree(workspace);
-}
-
-/*
- * Common logic for SnapBuildAbortTxn and SnapBuildCommitTxn dealing with
- * keeping track of the amount of running transactions.
- */
-static void
-SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid)
-{
-       if (builder->state == SNAPBUILD_CONSISTENT)
-               return;
-
-       /*
-        * NB: This handles subtransactions correctly even if we started from
-        * suboverflowed xl_running_xacts because we only keep track of toplevel
-        * transactions. Since the latter are always allocated before their
-        * subxids and since they end at the same time it's sufficient to deal
-        * with them here.
-        */
-       if (SnapBuildTxnIsRunning(builder, xid))
-       {
-               Assert(builder->running.xcnt > 0);
-
-               if (!--builder->running.xcnt)
-               {
-                       /*
-                        * None of the originally running transaction is running anymore,
-                        * so our incrementaly built snapshot now is consistent.
-                        */
-                       ereport(LOG,
-                                 (errmsg("logical decoding found consistent point at %X/%X",
-                                                 (uint32) (lsn >> 32), (uint32) lsn),
-                                  errdetail("Transaction ID %u finished; no more running transactions.",
-                                                        xid)));
-                       builder->state = SNAPBUILD_CONSISTENT;
-               }
-       }
-}
-
-/*
- * Abort a transaction, throw away all state we kept.
- */
-void
-SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn,
-                                 TransactionId xid,
-                                 int nsubxacts, TransactionId *subxacts)
-{
-       int                     i;
-
-       for (i = 0; i < nsubxacts; i++)
-       {
-               TransactionId subxid = subxacts[i];
-
-               SnapBuildEndTxn(builder, lsn, subxid);
-       }
-
-       SnapBuildEndTxn(builder, lsn, xid);
-}
-
  /*
   * Handle everything that needs to be done when a transaction commits
   */
@@ -955,10 +635,8 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
         int                     nxact;
  
         bool            forced_timetravel = false;
-       bool            sub_needs_timetravel = false;
-       bool            top_needs_timetravel = false;
  
-       TransactionId xmax = xid;
+       TransactionId xmax;
  
         /*
          * If we couldn't observe every change of a transaction because it was
@@ -984,93 +662,36 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
                 elog(DEBUG1, "forced to assume catalog changes for xid %u because it was running too early", xid);
         }
  
+       xmax = builder->xmax;
+
+       if (NormalTransactionIdFollows(xid, xmax))
+               xmax = xid;
+       if (!forced_timetravel)
+       {
+               if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
+                       forced_timetravel = true;
+       }
         for (nxact = 0; nxact < nsubxacts; nxact++)
         {
                 TransactionId subxid = subxacts[nxact];
  
-               /*
-                * make sure txn is not tracked in running txn's anymore, switch state
-                */
-               SnapBuildEndTxn(builder, lsn, subxid);
+               if (NormalTransactionIdFollows(subxid, xmax))
+                       xmax = subxid;
  
-               /*
-                * If we're forcing timetravel we also need visibility information
-                * about subtransaction, so keep track of subtransaction's state.
-                */
-               if (forced_timetravel)
+               if (!forced_timetravel)
                 {
-                       SnapBuildAddCommittedTxn(builder, subxid);
-                       if (NormalTransactionIdFollows(subxid, xmax))
-                               xmax = subxid;
+                       if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid))
+                               forced_timetravel = true;
                 }
-
-               /*
-                * Add subtransaction to base snapshot if it DDL, we don't distinguish
-                * to toplevel transactions there.
-                */
-               else if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid))
-               {
-                       sub_needs_timetravel = true;
-
-                       elog(DEBUG1, "found subtransaction %u:%u with catalog changes.",
-                                xid, subxid);
-
-                       SnapBuildAddCommittedTxn(builder, subxid);
-
-                       if (NormalTransactionIdFollows(subxid, xmax))
-                               xmax = subxid;
-               }
-       }
-
-       /*
-        * Make sure toplevel txn is not tracked in running txn's anymore, switch
-        * state to consistent if possible.
-        */
-       SnapBuildEndTxn(builder, lsn, xid);
-
-       if (forced_timetravel)
-       {
-               elog(DEBUG2, "forced transaction %u to do timetravel.", xid);
-
-               SnapBuildAddCommittedTxn(builder, xid);
         }
-       /* add toplevel transaction to base snapshot */
-       else if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
-       {
-               elog(DEBUG2, "found top level transaction %u, with catalog changes!",
-                        xid);
  
-               top_needs_timetravel = true;
-               SnapBuildAddCommittedTxn(builder, xid);
-       }
-       else if (sub_needs_timetravel)
-       {
-               /* mark toplevel txn as timetravel as well */
-               SnapBuildAddCommittedTxn(builder, xid);
-       }
+       builder->xmax = xmax;
+       /* We use the commit record's LSN as the snapshot's CSN */
+       builder->snapshotcsn = (CommitSeqNo) lsn;
  
         /* if there's any reason to build a historic snapshot, do so now */
-       if (forced_timetravel || top_needs_timetravel || sub_needs_timetravel)
+       if (forced_timetravel)
         {
-               /*
-                * Adjust xmax of the snapshot builder, we only do that for committed,
-                * catalog modifying, transactions, everything else isn't interesting
-                * for us since we'll never look at the respective rows.
-                */
-               if (!TransactionIdIsValid(builder->xmax) ||
-                       TransactionIdFollowsOrEquals(xmax, builder->xmax))
-               {
-                       builder->xmax = xmax;
-                       TransactionIdAdvance(builder->xmax);
-               }
-
-               /*
-                * If we haven't built a complete snapshot yet there's no need to hand
-                * it out, it wouldn't (and couldn't) be used anyway.
-                */
-               if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
-                       return;
-
                 /*
                  * Decrease the snapshot builder's refcount of the old snapshot, note
                  * that it still will be used if it has been handed out to the
@@ -1095,11 +716,12 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
                 /* add a new Snapshot to all currently running transactions */
                 SnapBuildDistributeNewCatalogSnapshot(builder, lsn);
         }
-       else
-       {
-               /* record that we cannot export a general snapshot anymore */
-               builder->committed.includes_all_transactions = false;
-       }
+}
+
+void
+SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
+                                 int nsubxacts, TransactionId *subxacts)
+{
  }
  
  
@@ -1118,40 +740,17 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact
  {
         ReorderBufferTXN *txn;
  
-       /*
-        * If we're not consistent yet, inspect the record to see whether it
-        * allows to get closer to being consistent. If we are consistent, dump
-        * our snapshot so others or we, after a restart, can use it.
-        */
-       if (builder->state < SNAPBUILD_CONSISTENT)
-       {
-               /* returns false if there's no point in performing cleanup just yet */
-               if (!SnapBuildFindSnapshot(builder, lsn, running))
-                       return;
-       }
-       else
-               SnapBuildSerialize(builder, lsn);
-
         /*
          * Update range of interesting xids based on the running xacts
-        * information. We don't increase ->xmax using it, because once we are in
-        * a consistent state we can do that ourselves and much more efficiently
-        * so, because we only need to do it for catalog transactions since we
-        * only ever look at those.
-        *
-        * NB: Because of that xmax can be lower than xmin, because we only
-        * increase xmax when a catalog modifying transaction commits. While odd
-        * looking, it's correct and actually more efficient this way since we hit
-        * fast paths in tqual.c.
+        * information.
          */
         builder->xmin = running->oldestRunningXid;
+       builder->xmax = running->nextXid;
+       builder->snapshotcsn = (CommitSeqNo) lsn;
  
-       /* Remove transactions we don't need to keep track off anymore */
-       SnapBuildPurgeCommittedTxn(builder);
-
-       elog(DEBUG3, "xmin: %u, xmax: %u, oldestrunning: %u",
-                builder->xmin, builder->xmax,
-                running->oldestRunningXid);
+       elog(DEBUG3, "xmin: %u, xmax: %u",
+                builder->xmin, builder->xmax);
+       Assert(lsn != InvalidXLogRecPtr);
  
         /*
          * Inrease shared memory limits, so vacuum can work on tuples we prevented
@@ -1171,12 +770,8 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact
          * beginning. That point is where we can restart from.
          */
  
-       /*
-        * Can't know about a serialized snapshot's location if we're not
-        * consistent.
-        */
         if (builder->state < SNAPBUILD_CONSISTENT)
-               return;
+               builder->state = SNAPBUILD_CONSISTENT;
  
         txn = ReorderBufferGetOldestTXN(builder->reorder);
  
@@ -1186,732 +781,4 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact
          */
         if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
                 LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
-
-       /*
-        * No in-progress transaction, can reuse the last serialized snapshot if
-        * we have one.
-        */
-       else if (txn == NULL &&
-               builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
-                        builder->last_serialized_snapshot != InvalidXLogRecPtr)
-               LogicalIncreaseRestartDecodingForSlot(lsn,
-                                                                                 builder->last_serialized_snapshot);
-}
-
-
-/*
- * Build the start of a snapshot that's capable of decoding the catalog.
- *
- * Helper function for SnapBuildProcessRunningXacts() while we're not yet
- * consistent.
- *
- * Returns true if there is a point in performing internal maintenance/cleanup
- * using the xl_running_xacts record.
- */
-static bool
-SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
-{
-       /* ---
-        * Build catalog decoding snapshot incrementally using information about
-        * the currently running transactions. There are several ways to do that:
-        *
-        * a) There were no running transactions when the xl_running_xacts record
-        *        was inserted, jump to CONSISTENT immediately. We might find such a
-        *        state we were waiting for b) and c).
-        *
-        * b) Wait for all toplevel transactions that were running to end. We
-        *        simply track the number of in-progress toplevel transactions and
-        *        lower it whenever one commits or aborts. When that number
-        *        (builder->running.xcnt) reaches zero, we can go from FULL_SNAPSHOT
-        *        to CONSISTENT.
-        *        NB: We need to search running.xip when seeing a transaction's end to
-        *        make sure it's a toplevel transaction and it's been one of the
-        *        initially running ones.
-        *        Interestingly, in contrast to HS, this allows us not to care about
-        *        subtransactions - and by extension suboverflowed xl_running_xacts -
-        *        at all.
-        *
-        * c) This (in a previous run) or another decoding slot serialized a
-        *        snapshot to disk that we can use.
-        * ---
-        */
-
-       /*
-        * xl_running_xact record is older than what we can use, we might not have
-        * all necessary catalog rows anymore.
-        */
-       if (TransactionIdIsNormal(builder->initial_xmin_horizon) &&
-               NormalTransactionIdPrecedes(running->oldestRunningXid,
-                                                                       builder->initial_xmin_horizon))
-       {
-               ereport(DEBUG1,
-                               (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
-                                                                (uint32) (lsn >> 32), (uint32) lsn),
-               errdetail_internal("initial xmin horizon of %u vs the snapshot's %u",
-                                builder->initial_xmin_horizon, running->oldestRunningXid)));
-               return true;
-       }
-
-       /*
-        * a) No transaction were running, we can jump to consistent.
-        *
-        * NB: We might have already started to incrementally assemble a snapshot,
-        * so we need to be careful to deal with that.
-        */
-       if (running->xcnt == 0)
-       {
-               if (builder->start_decoding_at == InvalidXLogRecPtr ||
-                       builder->start_decoding_at <= lsn)
-                       /* can decode everything after this */
-                       builder->start_decoding_at = lsn + 1;
-
-               /* As no transactions were running xmin/xmax can be trivially set. */
-               builder->xmin = running->nextXid;               /* < are finished */
-               builder->xmax = running->nextXid;               /* >= are running */
-
-               /* so we can safely use the faster comparisons */
-               Assert(TransactionIdIsNormal(builder->xmin));
-               Assert(TransactionIdIsNormal(builder->xmax));
-
-               /* no transactions running now */
-               builder->running.xcnt = 0;
-               builder->running.xmin = InvalidTransactionId;
-               builder->running.xmax = InvalidTransactionId;
-
-               builder->state = SNAPBUILD_CONSISTENT;
-
-               ereport(LOG,
-                               (errmsg("logical decoding found consistent point at %X/%X",
-                                               (uint32) (lsn >> 32), (uint32) lsn),
-                                errdetail("There are no running transactions.")));
-
-               return false;
-       }
-       /* c) valid on disk state */
-       else if (SnapBuildRestore(builder, lsn))
-       {
-               /* there won't be any state to cleanup */
-               return false;
-       }
-
-       /*
-        * b) first encounter of a useable xl_running_xacts record. If we had
-        * found one earlier we would either track running transactions (i.e.
-        * builder->running.xcnt != 0) or be consistent (this function wouldn't
-        * get called).
-        */
-       else if (!builder->running.xcnt)
-       {
-               int                     off;
-
-               /*
-                * We only care about toplevel xids as those are the ones we
-                * definitely see in the wal stream. As snapbuild.c tracks committed
-                * instead of running transactions we don't need to know anything
-                * about uncommitted subtransactions.
-                */
-
-               /*
-                * Start with an xmin/xmax that's correct for future, when all the
-                * currently running transactions have finished. We'll update both
-                * while waiting for the pending transactions to finish.
-                */
-               builder->xmin = running->nextXid;               /* < are finished */
-               builder->xmax = running->nextXid;               /* >= are running */
-
-               /* so we can safely use the faster comparisons */
-               Assert(TransactionIdIsNormal(builder->xmin));
-               Assert(TransactionIdIsNormal(builder->xmax));
-
-               builder->running.xcnt = running->xcnt;
-               builder->running.xcnt_space = running->xcnt;
-               builder->running.xip =
-                       MemoryContextAlloc(builder->context,
-                                                          builder->running.xcnt * sizeof(TransactionId));
-               memcpy(builder->running.xip, running->xids,
-                          builder->running.xcnt * sizeof(TransactionId));
-
-               /* sort so we can do a binary search */
-               qsort(builder->running.xip, builder->running.xcnt,
-                         sizeof(TransactionId), xidComparator);
-
-               builder->running.xmin = builder->running.xip[0];
-               builder->running.xmax = builder->running.xip[running->xcnt - 1];
-
-               /* makes comparisons cheaper later */
-               TransactionIdRetreat(builder->running.xmin);
-               TransactionIdAdvance(builder->running.xmax);
-
-               builder->state = SNAPBUILD_FULL_SNAPSHOT;
-
-               ereport(LOG,
-                       (errmsg("logical decoding found initial starting point at %X/%X",
-                                       (uint32) (lsn >> 32), (uint32) lsn),
-                        errdetail_plural("%u transaction needs to finish.",
-                                                         "%u transactions need to finish.",
-                                                         builder->running.xcnt,
-                                                         (uint32) builder->running.xcnt)));
-
-               /*
-                * Iterate through all xids, wait for them to finish.
-                *
-                * This isn't required for the correctness of decoding, but to allow
-                * isolationtester to notice that we're currently waiting for
-                * something.
-                */
-               for (off = 0; off < builder->running.xcnt; off++)
-               {
-                       TransactionId xid = builder->running.xip[off];
-
-                       /*
-                        * Upper layers should prevent that we ever need to wait on
-                        * ourselves. Check anyway, since failing to do so would either
-                        * result in an endless wait or an Assert() failure.
-                        */
-                       if (TransactionIdIsCurrentTransactionId(xid))
-                               elog(ERROR, "waiting for ourselves");
-
-                       XactLockTableWait(xid, NULL, NULL, XLTW_None);
-               }
-
-               /* nothing could have built up so far, so don't perform cleanup */
-               return false;
-       }
-
-       /*
-        * We already started to track running xacts and need to wait for all
-        * in-progress ones to finish. We fall through to the normal processing of
-        * records so incremental cleanup can be performed.
-        */
-       return true;
-}
-
-
-/* -----------------------------------
- * Snapshot serialization support
- * -----------------------------------
- */
-
-/*
- * We store current state of struct SnapBuild on disk in the following manner:
- *
- * struct SnapBuildOnDisk;
- * TransactionId * running.xcnt_space;
- * TransactionId * committed.xcnt; (*not xcnt_space*)
- *
- */
-typedef struct SnapBuildOnDisk
-{
-       /* first part of this struct needs to be version independent */
-
-       /* data not covered by checksum */
-       uint32          magic;
-       pg_crc32c       checksum;
-
-       /* data covered by checksum */
-
-       /* version, in case we want to support pg_upgrade */
-       uint32          version;
-       /* how large is the on disk data, excluding the constant sized part */
-       uint32          length;
-
-       /* version dependent part */
-       SnapBuild       builder;
-
-       /* variable amount of TransactionIds follows */
-} SnapBuildOnDisk;
-
-#define SnapBuildOnDiskConstantSize \
-       offsetof(SnapBuildOnDisk, builder)
-#define SnapBuildOnDiskNotChecksummedSize \
-       offsetof(SnapBuildOnDisk, version)
-
-#define SNAPBUILD_MAGIC 0x51A1E001
-#define SNAPBUILD_VERSION 2
-
-/*
- * Store/Load a snapshot from disk, depending on the snapshot builder's state.
- *
- * Supposed to be used by external (i.e. not snapbuild.c) code that just read
- * a record that's a potential location for a serialized snapshot.
- */
-void
-SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
-{
-       if (builder->state < SNAPBUILD_CONSISTENT)
-               SnapBuildRestore(builder, lsn);
-       else
-               SnapBuildSerialize(builder, lsn);
-}
-
-/*
- * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
- * been done by another decoding process.
- */
-static void
-SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
-{
-       Size            needed_length;
-       SnapBuildOnDisk *ondisk;
-       char       *ondisk_c;
-       int                     fd;
-       char            tmppath[MAXPGPATH];
-       char            path[MAXPGPATH];
-       int                     ret;
-       struct stat stat_buf;
-       Size            sz;
-
-       Assert(lsn != InvalidXLogRecPtr);
-       Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr ||
-                  builder->last_serialized_snapshot <= lsn);
-
-       /*
-        * no point in serializing if we cannot continue to work immediately after
-        * restoring the snapshot
-        */
-       if (builder->state < SNAPBUILD_CONSISTENT)
-               return;
-
-       /*
-        * We identify snapshots by the LSN they are valid for. We don't need to
-        * include timelines in the name as each LSN maps to exactly one timeline
-        * unless the user used pg_resetxlog or similar. If a user did so, there's
-        * no hope continuing to decode anyway.
-        */
-       sprintf(path, "pg_logical/snapshots/%X-%X.snap",
-                       (uint32) (lsn >> 32), (uint32) lsn);
-
-       /*
-        * first check whether some other backend already has written the snapshot
-        * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
-        * as a valid state. Everything else is an unexpected error.
-        */
-       ret = stat(path, &stat_buf);
-
-       if (ret != 0 && errno != ENOENT)
-               ereport(ERROR,
-                               (errmsg("could not stat file \"%s\": %m", path)));
-
-       else if (ret == 0)
-       {
-               /*
-                * somebody else has already serialized to this point, don't overwrite
-                * but remember location, so we don't need to read old data again.
-                *
-                * To be sure it has been synced to disk after the rename() from the
-                * tempfile filename to the real filename, we just repeat the fsync.
-                * That ought to be cheap because in most scenarios it should already
-                * be safely on disk.
-                */
-               fsync_fname(path, false);
-               fsync_fname("pg_logical/snapshots", true);
-
-               builder->last_serialized_snapshot = lsn;
-               goto out;
-       }
-
-       /*
-        * there is an obvious race condition here between the time we stat(2) the
-        * file and us writing the file. But we rename the file into place
-        * atomically and all files created need to contain the same data anyway,
-        * so this is perfectly fine, although a bit of a resource waste. Locking
-        * seems like pointless complication.
-        */
-       elog(DEBUG1, "serializing snapshot to %s", path);
-
-       /* to make sure only we will write to this tempfile, include pid */
-       sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%u.tmp",
-                       (uint32) (lsn >> 32), (uint32) lsn, MyProcPid);
-
-       /*
-        * Unlink temporary file if it already exists, needs to have been before a
-        * crash/error since we won't enter this function twice from within a
-        * single decoding slot/backend and the temporary file contains the pid of
-        * the current process.
-        */
-       if (unlink(tmppath) != 0 && errno != ENOENT)
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not remove file \"%s\": %m", path)));
-
-       needed_length = sizeof(SnapBuildOnDisk) +
-               sizeof(TransactionId) * builder->running.xcnt_space +
-               sizeof(TransactionId) * builder->committed.xcnt;
-
-       ondisk_c = MemoryContextAllocZero(builder->context, needed_length);
-       ondisk = (SnapBuildOnDisk *) ondisk_c;
-       ondisk->magic = SNAPBUILD_MAGIC;
-       ondisk->version = SNAPBUILD_VERSION;
-       ondisk->length = needed_length;
-       INIT_CRC32C(ondisk->checksum);
-       COMP_CRC32C(ondisk->checksum,
-                               ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
-                       SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
-       ondisk_c += sizeof(SnapBuildOnDisk);
-
-       memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
-       /* NULL-ify memory-only data */
-       ondisk->builder.context = NULL;
-       ondisk->builder.snapshot = NULL;
-       ondisk->builder.reorder = NULL;
-       ondisk->builder.running.xip = NULL;
-       ondisk->builder.committed.xip = NULL;
-
-       COMP_CRC32C(ondisk->checksum,
-                               &ondisk->builder,
-                               sizeof(SnapBuild));
-
-       /* copy running xacts */
-       sz = sizeof(TransactionId) * builder->running.xcnt_space;
-       memcpy(ondisk_c, builder->running.xip, sz);
-       COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
-       ondisk_c += sz;
-
-       /* copy committed xacts */
-       sz = sizeof(TransactionId) * builder->committed.xcnt;
-       memcpy(ondisk_c, builder->committed.xip, sz);
-       COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
-       ondisk_c += sz;
-
-       FIN_CRC32C(ondisk->checksum);
-
-       /* we have valid data now, open tempfile and write it there */
-       fd = OpenTransientFile(tmppath,
-                                                  O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
-                                                  S_IRUSR | S_IWUSR);
-       if (fd < 0)
-               ereport(ERROR,
-                               (errmsg("could not open file \"%s\": %m", path)));
-
-       if ((write(fd, ondisk, needed_length)) != needed_length)
-       {
-               CloseTransientFile(fd);
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not write to file \"%s\": %m", tmppath)));
-       }
-
-       /*
-        * fsync the file before renaming so that even if we crash after this we
-        * have either a fully valid file or nothing.
-        *
-        * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
-        * some noticeable overhead since it's performed synchronously during
-        * decoding?
-        */
-       if (pg_fsync(fd) != 0)
-       {
-               CloseTransientFile(fd);
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not fsync file \"%s\": %m", tmppath)));
-       }
-       CloseTransientFile(fd);
-
-       fsync_fname("pg_logical/snapshots", true);
-
-       /*
-        * We may overwrite the work from some other backend, but that's ok, our
-        * snapshot is valid as well, we'll just have done some superfluous work.
-        */
-       if (rename(tmppath, path) != 0)
-       {
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not rename file \"%s\" to \"%s\": %m",
-                                               tmppath, path)));
-       }
-
-       /* make sure we persist */
-       fsync_fname(path, false);
-       fsync_fname("pg_logical/snapshots", true);
-
-       /*
-        * Now there's no way we can loose the dumped state anymore, remember this
-        * as a serialization point.
-        */
-       builder->last_serialized_snapshot = lsn;
-
-out:
-       ReorderBufferSetRestartPoint(builder->reorder,
-                                                                builder->last_serialized_snapshot);
-}
-
-/*
- * Restore a snapshot into 'builder' if previously one has been stored at the
- * location indicated by 'lsn'. Returns true if successful, false otherwise.
- */
-static bool
-SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
-{
-       SnapBuildOnDisk ondisk;
-       int                     fd;
-       char            path[MAXPGPATH];
-       Size            sz;
-       int                     readBytes;
-       pg_crc32c       checksum;
-
-       /* no point in loading a snapshot if we're already there */
-       if (builder->state == SNAPBUILD_CONSISTENT)
-               return false;
-
-       sprintf(path, "pg_logical/snapshots/%X-%X.snap",
-                       (uint32) (lsn >> 32), (uint32) lsn);
-
-       fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
-
-       if (fd < 0 && errno == ENOENT)
-               return false;
-       else if (fd < 0)
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not open file \"%s\": %m", path)));
-
-       /* ----
-        * Make sure the snapshot had been stored safely to disk, that's normally
-        * cheap.
-        * Note that we do not need PANIC here, nobody will be able to use the
-        * slot without fsyncing, and saving it won't succeed without an fsync()
-        * either...
-        * ----
-        */
-       fsync_fname(path, false);
-       fsync_fname("pg_logical/snapshots", true);
-
-
-       /* read statically sized portion of snapshot */
-       readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize);
-       if (readBytes != SnapBuildOnDiskConstantSize)
-       {
-               CloseTransientFile(fd);
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not read file \"%s\", read %d of %d: %m",
-                                               path, readBytes, (int) SnapBuildOnDiskConstantSize)));
-       }
-
-       if (ondisk.magic != SNAPBUILD_MAGIC)
-               ereport(ERROR,
-                               (errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u",
-                                               path, ondisk.magic, SNAPBUILD_MAGIC)));
-
-       if (ondisk.version != SNAPBUILD_VERSION)
-               ereport(ERROR,
-                               (errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u",
-                                               path, ondisk.version, SNAPBUILD_VERSION)));
-
-       INIT_CRC32C(checksum);
-       COMP_CRC32C(checksum,
-                               ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize,
-                       SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
-
-       /* read SnapBuild */
-       readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild));
-       if (readBytes != sizeof(SnapBuild))
-       {
-               CloseTransientFile(fd);
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not read file \"%s\", read %d of %d: %m",
-                                               path, readBytes, (int) sizeof(SnapBuild))));
-       }
-       COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild));
-
-       /* restore running xacts information */
-       sz = sizeof(TransactionId) * ondisk.builder.running.xcnt_space;
-       ondisk.builder.running.xip = MemoryContextAllocZero(builder->context, sz);
-       readBytes = read(fd, ondisk.builder.running.xip, sz);
-       if (readBytes != sz)
-       {
-               CloseTransientFile(fd);
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not read file \"%s\", read %d of %d: %m",
-                                               path, readBytes, (int) sz)));
-       }
-       COMP_CRC32C(checksum, ondisk.builder.running.xip, sz);
-
-       /* restore committed xacts information */
-       sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt;
-       ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz);
-       readBytes = read(fd, ondisk.builder.committed.xip, sz);
-       if (readBytes != sz)
-       {
-               CloseTransientFile(fd);
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not read file \"%s\", read %d of %d: %m",
-                                               path, readBytes, (int) sz)));
-       }
-       COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz);
-
-       CloseTransientFile(fd);
-
-       FIN_CRC32C(checksum);
-
-       /* verify checksum of what we've read */
-       if (!EQ_CRC32C(checksum, ondisk.checksum))
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u",
-                                               path, checksum, ondisk.checksum)));
-
-       /*
-        * ok, we now have a sensible snapshot here, figure out if it has more
-        * information than we have.
-        */
-
-       /*
-        * We are only interested in consistent snapshots for now, comparing
-        * whether one incomplete snapshot is more "advanced" seems to be
-        * unnecessarily complex.
-        */
-       if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
-               goto snapshot_not_interesting;
-
-       /*
-        * Don't use a snapshot that requires an xmin that we cannot guarantee to
-        * be available.
-        */
-       if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon))
-               goto snapshot_not_interesting;
-
-
-       /* ok, we think the snapshot is sensible, copy over everything important */
-       builder->xmin = ondisk.builder.xmin;
-       builder->xmax = ondisk.builder.xmax;
-       builder->state = ondisk.builder.state;
-
-       builder->committed.xcnt = ondisk.builder.committed.xcnt;
-       /* We only allocated/stored xcnt, not xcnt_space xids ! */
-       /* don't overwrite preallocated xip, if we don't have anything here */
-       if (builder->committed.xcnt > 0)
-       {
-               pfree(builder->committed.xip);
-               builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
-               builder->committed.xip = ondisk.builder.committed.xip;
-       }
-       ondisk.builder.committed.xip = NULL;
-
-       builder->running.xcnt = ondisk.builder.running.xcnt;
-       if (builder->running.xip)
-               pfree(builder->running.xip);
-       builder->running.xcnt_space = ondisk.builder.running.xcnt_space;
-       builder->running.xip = ondisk.builder.running.xip;
-
-       /* our snapshot is not interesting anymore, build a new one */
-       if (builder->snapshot != NULL)
-       {
-               SnapBuildSnapDecRefcount(builder->snapshot);
-       }
-       builder->snapshot = SnapBuildBuildSnapshot(builder, InvalidTransactionId);
-       SnapBuildSnapIncRefcount(builder->snapshot);
-
-       ReorderBufferSetRestartPoint(builder->reorder, lsn);
-
-       Assert(builder->state == SNAPBUILD_CONSISTENT);
-
-       ereport(LOG,
-                       (errmsg("logical decoding found consistent point at %X/%X",
-                                       (uint32) (lsn >> 32), (uint32) lsn),
-                        errdetail("Logical decoding will begin using saved snapshot.")));
-       return true;
-
-snapshot_not_interesting:
-       if (ondisk.builder.running.xip != NULL)
-               pfree(ondisk.builder.running.xip);
-       if (ondisk.builder.committed.xip != NULL)
-               pfree(ondisk.builder.committed.xip);
-       return false;
-}
-
-/*
- * Remove all serialized snapshots that are not required anymore because no
- * slot can need them. This doesn't actually have to run during a checkpoint,
- * but it's a convenient point to schedule this.
- *
- * NB: We run this during checkpoints even if logical decoding is disabled so
- * we cleanup old slots at some point after it got disabled.
- */
-void
-CheckPointSnapBuild(void)
-{
-       XLogRecPtr      cutoff;
-       XLogRecPtr      redo;
-       DIR                *snap_dir;
-       struct dirent *snap_de;
-       char            path[MAXPGPATH];
-
-       /*
-        * We start of with a minimum of the last redo pointer. No new replication
-        * slot will start before that, so that's a safe upper bound for removal.
-        */
-       redo = GetRedoRecPtr();
-
-       /* now check for the restart ptrs from existing slots */
-       cutoff = ReplicationSlotsComputeLogicalRestartLSN();
-
-       /* don't start earlier than the restart lsn */
-       if (redo < cutoff)
-               cutoff = redo;
-
-       snap_dir = AllocateDir("pg_logical/snapshots");
-       while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
-       {
-               uint32          hi;
-               uint32          lo;
-               XLogRecPtr      lsn;
-               struct stat statbuf;
-
-               if (strcmp(snap_de->d_name, ".") == 0 ||
-                       strcmp(snap_de->d_name, "..") == 0)
-                       continue;
-
-               snprintf(path, MAXPGPATH, "pg_logical/snapshots/%s", snap_de->d_name);
-
-               if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
-               {
-                       elog(DEBUG1, "only regular files expected: %s", path);
-                       continue;
-               }
-
-               /*
-                * temporary filenames from SnapBuildSerialize() include the LSN and
-                * everything but are postfixed by .$pid.tmp. We can just remove them
-                * the same as other files because there can be none that are
-                * currently being written that are older than cutoff.
-                *
-                * We just log a message if a file doesn't fit the pattern, it's
-                * probably some editors lock/state file or similar...
-                */
-               if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
-               {
-                       ereport(LOG,
-                                       (errmsg("could not parse file name \"%s\"", path)));
-                       continue;
-               }
-
-               lsn = ((uint64) hi) << 32 | lo;
-
-               /* check whether we still need it */
-               if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
-               {
-                       elog(DEBUG1, "removing snapbuild snapshot %s", path);
-
-                       /*
-                        * It's not particularly harmful, though strange, if we can't
-                        * remove the file here. Don't prevent the checkpoint from
-                        * completing, that'd be cure worse than the disease.
-                        */
-                       if (unlink(path) < 0)
-                       {
-                               ereport(LOG,
-                                               (errcode_for_file_access(),
-                                                errmsg("could not remove file \"%s\": %m",
-                                                               path)));
-                               continue;
-                       }
-               }
-       }
-       FreeDir(snap_dir);
  }
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c

index c04b17fa8ead59f8190a3fa7d880197e9cff9235..b2d447aaa7ad4a515c478da5f30fac24e45eb6b8 100644 (file)
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -16,10 +16,10 @@
  
  #include "access/clog.h"
  #include "access/commit_ts.h"
+#include "access/csnlog.h"
  #include "access/heapam.h"
  #include "access/multixact.h"
  #include "access/nbtree.h"
-#include "access/subtrans.h"
  #include "access/twophase.h"
  #include "commands/async.h"
  #include "miscadmin.h"
@@ -120,8 +120,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
                 size = add_size(size, ProcGlobalShmemSize());
                 size = add_size(size, XLOGShmemSize());
                 size = add_size(size, CLOGShmemSize());
+               size = add_size(size, CSNLOGShmemSize());
                 size = add_size(size, CommitTsShmemSize());
-               size = add_size(size, SUBTRANSShmemSize());
                 size = add_size(size, TwoPhaseShmemSize());
                 size = add_size(size, BackgroundWorkerShmemSize());
                 size = add_size(size, MultiXactShmemSize());
@@ -204,8 +204,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
          */
         XLOGShmemInit();
         CLOGShmemInit();
+       CSNLOGShmemInit();
         CommitTsShmemInit();
-       SUBTRANSShmemInit();
         MultiXactShmemInit();
         InitBufferPool();
  
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c

index e5d487dbb74cc8cd1b29ffc95e251c5a43813cc4..570d2729113e7a596eaf98e660c489af1a244b97 100644 (file)
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -13,24 +13,14 @@
   * See notes in src/backend/access/transam/README.
   *
   * The process arrays now also include structures representing prepared
- * transactions.  The xid and subxids fields of these are valid, as are the
+ * transactions.  The xid fields of these are valid, as are the
   * myProcLocks lists.  They can be distinguished from regular backend PGPROCs
   * at need by checking for pid == 0.
   *
- * During hot standby, we also keep a list of XIDs representing transactions
- * that are known to be running in the master (or more precisely, were running
- * as of the current point in the WAL stream).  This list is kept in the
- * KnownAssignedXids array, and is updated by watching the sequence of
- * arriving XIDs.  This is necessary because if we leave those XIDs out of
- * snapshots taken for standby queries, then they will appear to be already
- * complete, leading to MVCC failures.  Note that in hot standby, the PGPROC
- * array represents standby processes, which by definition are not running
- * transactions that have XIDs.
- *
- * It is perhaps possible for a backend on the master to terminate without
- * writing an abort record for its transaction.  While that shouldn't really
- * happen, it would tie up KnownAssignedXids indefinitely, so we protect
- * ourselves by pruning the array when a valid list of running XIDs arrives.
+ * During hot standby, we update latestCompletedXid, oldestActiveXid, and
+ * latestObservedXid, as we replay transaction commit/abort and standby WAL
+ * records. Note that in hot standby, the PGPROC array represents standby
+ * processes, which by definition are not running transactions that have XIDs.
   *
   * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
@@ -46,7 +36,8 @@
  #include <signal.h>
  
  #include "access/clog.h"
-#include "access/subtrans.h"
+#include "access/csnlog.h"
+#include "access/mvccvars.h"
  #include "access/transam.h"
  #include "access/twophase.h"
  #include "access/xact.h"
@@ -67,24 +58,6 @@ typedef struct ProcArrayStruct
         int                     numProcs;               /* number of valid procs entries */
         int                     maxProcs;               /* allocated size of procs array */
  
-       /*
-        * Known assigned XIDs handling
-        */
-       int                     maxKnownAssignedXids;   /* allocated size of array */
-       int                     numKnownAssignedXids;   /* current # of valid entries */
-       int                     tailKnownAssignedXids;  /* index of oldest valid element */
-       int                     headKnownAssignedXids;  /* index of newest element, + 1 */
-       slock_t         known_assigned_xids_lck;                /* protects head/tail pointers */
-
-       /*
-        * Highest subxid that has been removed from KnownAssignedXids array to
-        * prevent overflow; or InvalidTransactionId if none.  We track this for
-        * similar reasons to tracking overflowing cached subxids in PGXACT
-        * entries.  Must hold exclusive ProcArrayLock to change this, and shared
-        * lock to read it.
-        */
-       TransactionId lastOverflowedXid;
-
         /* oldest xmin of any replication slot */
         TransactionId replication_slot_xmin;
         /* oldest catalog xmin of any replication slot */
@@ -100,79 +73,15 @@ static PGPROC *allProcs;
  static PGXACT *allPgXact;
  
  /*
- * Bookkeeping for tracking emulated transactions in recovery
+ * Bookkeeping for tracking transactions in recovery
   */
-static TransactionId *KnownAssignedXids;
-static bool *KnownAssignedXidsValid;
  static TransactionId latestObservedXid = InvalidTransactionId;
  
  /* LWLock tranche for backend locks */
  static LWLockTranche ProcLWLockTranche;
  
-/*
- * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
- * the highest xid that might still be running that we don't have in
- * KnownAssignedXids.
- */
-static TransactionId standbySnapshotPendingXmin;
-
-#ifdef XIDCACHE_DEBUG
-
-/* counters for XidCache measurement */
-static long xc_by_recent_xmin = 0;
-static long xc_by_known_xact = 0;
-static long xc_by_my_xact = 0;
-static long xc_by_latest_xid = 0;
-static long xc_by_main_xid = 0;
-static long xc_by_child_xid = 0;
-static long xc_by_known_assigned = 0;
-static long xc_no_overflow = 0;
-static long xc_slow_answer = 0;
-
-#define xc_by_recent_xmin_inc()                (xc_by_recent_xmin++)
-#define xc_by_known_xact_inc()         (xc_by_known_xact++)
-#define xc_by_my_xact_inc()                    (xc_by_my_xact++)
-#define xc_by_latest_xid_inc()         (xc_by_latest_xid++)
-#define xc_by_main_xid_inc()           (xc_by_main_xid++)
-#define xc_by_child_xid_inc()          (xc_by_child_xid++)
-#define xc_by_known_assigned_inc()     (xc_by_known_assigned++)
-#define xc_no_overflow_inc()           (xc_no_overflow++)
-#define xc_slow_answer_inc()           (xc_slow_answer++)
-
-static void DisplayXidCache(void);
-#else                                                  /* !XIDCACHE_DEBUG */
-
-#define xc_by_recent_xmin_inc()                ((void) 0)
-#define xc_by_known_xact_inc()         ((void) 0)
-#define xc_by_my_xact_inc()                    ((void) 0)
-#define xc_by_latest_xid_inc()         ((void) 0)
-#define xc_by_main_xid_inc()           ((void) 0)
-#define xc_by_child_xid_inc()          ((void) 0)
-#define xc_by_known_assigned_inc()     ((void) 0)
-#define xc_no_overflow_inc()           ((void) 0)
-#define xc_slow_answer_inc()           ((void) 0)
-#endif   /* XIDCACHE_DEBUG */
-
-/* Primitives for KnownAssignedXids array handling for standby */
-static void KnownAssignedXidsCompress(bool force);
-static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
-                                        bool exclusive_lock);
-static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
-static bool KnownAssignedXidExists(TransactionId xid);
-static void KnownAssignedXidsRemove(TransactionId xid);
-static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
-                                                       TransactionId *subxids);
-static void KnownAssignedXidsRemovePreceding(TransactionId xid);
-static int     KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
-static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
-                                                          TransactionId *xmin,
-                                                          TransactionId xmax);
-static TransactionId KnownAssignedXidsGetOldestXmin(void);
-static void KnownAssignedXidsDisplay(int trace_level);
-static void KnownAssignedXidsReset(void);
-static inline void ProcArrayEndTransactionInternal(PGPROC *proc,
-                                                               PGXACT *pgxact, TransactionId latestXid);
-static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid);
+static void AdvanceOldestActiveXid(TransactionId myXid);
+static void AdvanceGlobalXmin(TransactionId myXmin);
  
  /*
   * Report shared-memory space needed by CreateSharedProcArray.
@@ -188,31 +97,6 @@ ProcArrayShmemSize(void)
         size = offsetof(ProcArrayStruct, pgprocnos);
         size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS));
  
-       /*
-        * During Hot Standby processing we have a data structure called
-        * KnownAssignedXids, created in shared memory. Local data structures are
-        * also created in various backends during GetSnapshotData(),
-        * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
-        * main structures created in those functions must be identically sized,
-        * since we may at times copy the whole of the data structures around. We
-        * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
-        *
-        * Ideally we'd only create this structure if we were actually doing hot
-        * standby in the current run, but we don't know that yet at the time
-        * shared memory is being set up.
-        */
-#define TOTAL_MAX_CACHED_SUBXIDS \
-       ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
-
-       if (EnableHotStandby)
-       {
-               size = add_size(size,
-                                               mul_size(sizeof(TransactionId),
-                                                                TOTAL_MAX_CACHED_SUBXIDS));
-               size = add_size(size,
-                                               mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
-       }
-
         return size;
  }
  
@@ -240,31 +124,11 @@ CreateSharedProcArray(void)
                 procArray->numProcs = 0;
                 procArray->maxProcs = PROCARRAY_MAXPROCS;
                 procArray->replication_slot_xmin = InvalidTransactionId;
-               procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
-               procArray->numKnownAssignedXids = 0;
-               procArray->tailKnownAssignedXids = 0;
-               procArray->headKnownAssignedXids = 0;
-               SpinLockInit(&procArray->known_assigned_xids_lck);
-               procArray->lastOverflowedXid = InvalidTransactionId;
         }
  
         allProcs = ProcGlobal->allProcs;
         allPgXact = ProcGlobal->allPgXact;
  
-       /* Create or attach to the KnownAssignedXids arrays too, if needed */
-       if (EnableHotStandby)
-       {
-               KnownAssignedXids = (TransactionId *)
-                       ShmemInitStruct("KnownAssignedXids",
-                                                       mul_size(sizeof(TransactionId),
-                                                                        TOTAL_MAX_CACHED_SUBXIDS),
-                                                       &found);
-               KnownAssignedXidsValid = (bool *)
-                       ShmemInitStruct("KnownAssignedXidsValid",
-                                                       mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
-                                                       &found);
-       }
-
         /* Register and initialize fields of ProcLWLockTranche */
         ProcLWLockTranche.name = "proc";
         ProcLWLockTranche.array_base = (char *) (ProcGlobal->allProcs) +
@@ -326,43 +190,15 @@ ProcArrayAdd(PGPROC *proc)
  
  /*
   * Remove the specified PGPROC from the shared array.
- *
- * When latestXid is a valid XID, we are removing a live 2PC gxact from the
- * array, and thus causing it to appear as "not running" anymore.  In this
- * case we must advance latestCompletedXid.  (This is essentially the same
- * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
- * the ProcArrayLock only once, and don't damage the content of the PGPROC;
- * twophase.c depends on the latter.)
   */
  void
-ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
+ProcArrayRemove(PGPROC *proc)
  {
         ProcArrayStruct *arrayP = procArray;
         int                     index;
  
-#ifdef XIDCACHE_DEBUG
-       /* dump stats at backend shutdown, but not prepared-xact end */
-       if (proc->pid != 0)
-               DisplayXidCache();
-#endif
-
         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
  
-       if (TransactionIdIsValid(latestXid))
-       {
-               Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
-
-               /* Advance global latestCompletedXid while holding the lock */
-               if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
-                                                                 latestXid))
-                       ShmemVariableCache->latestCompletedXid = latestXid;
-       }
-       else
-       {
-               /* Shouldn't be trying to remove a live transaction here */
-               Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
-       }
-
         for (index = 0; index < arrayP->numProcs; index++)
         {
                 if (arrayP->pgprocnos[index] == proc->pgprocno)
@@ -391,208 +227,41 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
   * commit/abort must already be reported to WAL and pg_clog.
   *
   * proc is currently always MyProc, but we pass it explicitly for flexibility.
- * latestXid is the latest Xid among the transaction's main XID and
- * subtransactions, or InvalidTransactionId if it has no XID.  (We must ask
- * the caller to pass latestXid, instead of computing it from the PGPROC's
- * contents, because the subxid information in the PGPROC might be
- * incomplete.)
   */
  void
-ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
+ProcArrayEndTransaction(PGPROC *proc)
  {
         PGXACT     *pgxact = &allPgXact[proc->pgprocno];
+       TransactionId myXid;
+       TransactionId myXmin;
  
-       if (TransactionIdIsValid(latestXid))
-       {
-               /*
-                * We must lock ProcArrayLock while clearing our advertised XID, so
-                * that we do not exit the set of "running" transactions while someone
-                * else is taking a snapshot.  See discussion in
-                * src/backend/access/transam/README.
-                */
-               Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
-
-               /*
-                * If we can immediately acquire ProcArrayLock, we clear our own XID
-                * and release the lock.  If not, use group XID clearing to improve
-                * efficiency.
-                */
-               if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE))
-               {
-                       ProcArrayEndTransactionInternal(proc, pgxact, latestXid);
-                       LWLockRelease(ProcArrayLock);
-               }
-               else
-                       ProcArrayGroupClearXid(proc, latestXid);
-       }
-       else
-       {
-               /*
-                * If we have no XID, we don't need to lock, since we won't affect
-                * anyone else's calculation of a snapshot.  We might change their
-                * estimate of global xmin, but that's OK.
-                */
-               Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
-
-               proc->lxid = InvalidLocalTransactionId;
-               pgxact->xmin = InvalidTransactionId;
-               /* must be cleared with xid/xmin: */
-               pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
-               pgxact->delayChkpt = false;             /* be sure this is cleared in abort */
-               proc->recoveryConflictPending = false;
-
-               Assert(pgxact->nxids == 0);
-               Assert(pgxact->overflowed == false);
-       }
-}
+       myXid = pgxact->xid;
+       myXmin = pgxact->xmin;
  
-/*
- * Mark a write transaction as no longer running.
- *
- * We don't do any locking here; caller must handle that.
- */
-static inline void
-ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
-                                                               TransactionId latestXid)
-{
+       /* A shared lock is enough to modify our own fields */
+       LWLockAcquire(ProcArrayLock, LW_SHARED);
         pgxact->xid = InvalidTransactionId;
         proc->lxid = InvalidLocalTransactionId;
         pgxact->xmin = InvalidTransactionId;
-       /* must be cleared with xid/xmin: */
+       pgxact->snapshotcsn = InvalidCommitSeqNo;
+       /* must be cleared with xid/xmin/snapshotcsn: */
         pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
         pgxact->delayChkpt = false; /* be sure this is cleared in abort */
         proc->recoveryConflictPending = false;
  
-       /* Clear the subtransaction-XID cache too while holding the lock */
-       pgxact->nxids = 0;
-       pgxact->overflowed = false;
-
-       /* Also advance global latestCompletedXid while holding the lock */
-       if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
-                                                         latestXid))
-               ShmemVariableCache->latestCompletedXid = latestXid;
-}
-
-/*
- * ProcArrayGroupClearXid -- group XID clearing
- *
- * When we cannot immediately acquire ProcArrayLock in exclusive mode at
- * commit time, add ourselves to a list of processes that need their XIDs
- * cleared.  The first process to add itself to the list will acquire
- * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal
- * on behalf of all group members.  This avoids a great deal of contention
- * around ProcArrayLock when many processes are trying to commit at once,
- * since the lock need not be repeatedly handed off from one committing
- * process to the next.
- */
-static void
-ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid)
-{
-       volatile PROC_HDR *procglobal = ProcGlobal;
-       uint32          nextidx;
-       uint32          wakeidx;
-       int                     extraWaits = -1;
-
-       /* We should definitely have an XID to clear. */
-       Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
-
-       /* Add ourselves to the list of processes needing a group XID clear. */
-       proc->procArrayGroupMember = true;
-       proc->procArrayGroupMemberXid = latestXid;
-       while (true)
-       {
-               nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst);
-               pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx);
-
-               if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst,
-                                                                                  &nextidx,
-                                                                                  (uint32) proc->pgprocno))
-                       break;
-       }
-
-       /*
-        * If the list was not empty, the leader will clear our XID.  It is
-        * impossible to have followers without a leader because the first process
-        * that has added itself to the list will always have nextidx as
-        * INVALID_PGPROCNO.
-        */
-       if (nextidx != INVALID_PGPROCNO)
-       {
-               /* Sleep until the leader clears our XID. */
-               for (;;)
-               {
-                       /* acts as a read barrier */
-                       PGSemaphoreLock(&proc->sem);
-                       if (!proc->procArrayGroupMember)
-                               break;
-                       extraWaits++;
-               }
-
-               Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO);
-
-               /* Fix semaphore count for any absorbed wakeups */
-               while (extraWaits-- > 0)
-                       PGSemaphoreUnlock(&proc->sem);
-               return;
-       }
-
-       /* We are the leader.  Acquire the lock on behalf of everyone. */
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-       /*
-        * Now that we've got the lock, clear the list of processes waiting for
-        * group XID clearing, saving a pointer to the head of the list.  Trying
-        * to pop elements one at a time could lead to an ABA problem.
-        */
-       while (true)
-       {
-               nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst);
-               if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst,
-                                                                                  &nextidx,
-                                                                                  INVALID_PGPROCNO))
-                       break;
-       }
-
-       /* Remember head of list so we can perform wakeups after dropping lock. */
-       wakeidx = nextidx;
-
-       /* Walk the list and clear all XIDs. */
-       while (nextidx != INVALID_PGPROCNO)
-       {
-               PGPROC     *proc = &allProcs[nextidx];
-               PGXACT     *pgxact = &allPgXact[nextidx];
-
-               ProcArrayEndTransactionInternal(proc, pgxact, proc->procArrayGroupMemberXid);
-
-               /* Move to next proc in list. */
-               nextidx = pg_atomic_read_u32(&proc->procArrayGroupNext);
-       }
-
-       /* We're done with the lock now. */
         LWLockRelease(ProcArrayLock);
  
+       /* If we were the oldest active XID, advance oldestActiveXid */
+       if (TransactionIdIsValid(myXid))
+               AdvanceOldestActiveXid(myXid);
+
         /*
-        * Now that we've released the lock, go back and wake everybody up.  We
-        * don't do this under the lock so as to keep lock hold times to a
-        * minimum.  The system calls we need to perform to wake other processes
-        * up are probably much slower than the simple memory writes we did while
-        * holding the lock.
+        * Likewise, if we had the oldest xmin, advance GlobalXmin. (There
+        * can be multiple transactions with the same xmin, so this
+        * might be futile.)
          */
-       while (wakeidx != INVALID_PGPROCNO)
-       {
-               PGPROC     *proc = &allProcs[wakeidx];
-
-               wakeidx = pg_atomic_read_u32(&proc->procArrayGroupNext);
-               pg_atomic_write_u32(&proc->procArrayGroupNext, INVALID_PGPROCNO);
-
-               /* ensure all previous writes are visible before follower continues. */
-               pg_write_barrier();
-
-               proc->procArrayGroupMember = false;
-
-               if (proc != MyProc)
-                       PGSemaphoreUnlock(&proc->sem);
-       }
+       if (TransactionIdIsValid(myXmin))
+               AdvanceGlobalXmin(myXmin);
  }
  
  /*
@@ -617,38 +286,46 @@ ProcArrayClearTransaction(PGPROC *proc)
         pgxact->xid = InvalidTransactionId;
         proc->lxid = InvalidLocalTransactionId;
         pgxact->xmin = InvalidTransactionId;
+       pgxact->snapshotcsn = InvalidCommitSeqNo;
         proc->recoveryConflictPending = false;
  
         /* redundant, but just in case */
         pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
         pgxact->delayChkpt = false;
  
-       /* Clear the subtransaction-XID cache too */
-       pgxact->nxids = 0;
-       pgxact->overflowed = false;
+       /*
+        * We don't need to update oldestActiveXid, because the gxact entry in
+        * the procarray is still running with the same XID.
+        *
+        * FIXME: Do we need to advance GlobalXmin, though? Does a gxact have a
+        * valid xmin?
+        */
  }
  
  /*
   * ProcArrayInitRecovery -- initialize recovery xid mgmt environment
   *
- * Remember up to where the startup process initialized the CLOG and subtrans
+ * Remember up to where the startup process initialized the CLOG and CSNLOG
   * so we can ensure it's initialized gaplessly up to the point where necessary
   * while in recovery.
   */
  void
-ProcArrayInitRecovery(TransactionId initializedUptoXID)
+ProcArrayInitRecovery(TransactionId oldestActiveXID, TransactionId initializedUptoXID)
  {
         Assert(standbyState == STANDBY_INITIALIZED);
         Assert(TransactionIdIsNormal(initializedUptoXID));
  
         /*
-        * we set latestObservedXid to the xid SUBTRANS has been initialized up
+        * we set latestObservedXid to the xid CSNLOG has been initialized up
          * to, so we can extend it from that point onwards in
          * RecordKnownAssignedTransactionIds, and when we get consistent in
          * ProcArrayApplyRecoveryInfo().
          */
         latestObservedXid = initializedUptoXID;
         TransactionIdRetreat(latestObservedXid);
+
+       /* also initialize oldestActiveXid */
+       pg_atomic_write_u32(&ShmemVariableCache->oldestActiveXid, oldestActiveXID);
  }
  
  /*
@@ -669,20 +346,11 @@ ProcArrayInitRecovery(TransactionId initializedUptoXID)
  void
  ProcArrayApplyRecoveryInfo(RunningTransactions running)
  {
-       TransactionId *xids;
-       int                     nxids;
         TransactionId nextXid;
-       int                     i;
  
         Assert(standbyState >= STANDBY_INITIALIZED);
         Assert(TransactionIdIsValid(running->nextXid));
         Assert(TransactionIdIsValid(running->oldestRunningXid));
-       Assert(TransactionIdIsNormal(running->latestCompletedXid));
-
-       /*
-        * Remove stale transactions, if any.
-        */
-       ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
  
         /*
          * Remove stale locks, if any.
@@ -690,7 +358,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
          * Locks are always assigned to the toplevel xid so we don't need to care
          * about subxcnt/subxids (and by extension not about ->suboverflowed).
          */
-       StandbyReleaseOldLocks(running->xcnt, running->xids);
+       StandbyReleaseOldLocks(running->oldestRunningXid);
  
         /*
          * If our snapshot is already valid, nothing else to do...
@@ -698,51 +366,6 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
         if (standbyState == STANDBY_SNAPSHOT_READY)
                 return;
  
-       /*
-        * If our initial RunningTransactionsData had an overflowed snapshot then
-        * we knew we were missing some subxids from our snapshot. If we continue
-        * to see overflowed snapshots then we might never be able to start up, so
-        * we make another test to see if our snapshot is now valid. We know that
-        * the missing subxids are equal to or earlier than nextXid. After we
-        * initialise we continue to apply changes during recovery, so once the
-        * oldestRunningXid is later than the nextXid from the initial snapshot we
-        * know that we no longer have missing information and can mark the
-        * snapshot as valid.
-        */
-       if (standbyState == STANDBY_SNAPSHOT_PENDING)
-       {
-               /*
-                * If the snapshot isn't overflowed or if its empty we can reset our
-                * pending state and use this snapshot instead.
-                */
-               if (!running->subxid_overflow || running->xcnt == 0)
-               {
-                       /*
-                        * If we have already collected known assigned xids, we need to
-                        * throw them away before we apply the recovery snapshot.
-                        */
-                       KnownAssignedXidsReset();
-                       standbyState = STANDBY_INITIALIZED;
-               }
-               else
-               {
-                       if (TransactionIdPrecedes(standbySnapshotPendingXmin,
-                                                                         running->oldestRunningXid))
-                       {
-                               standbyState = STANDBY_SNAPSHOT_READY;
-                               elog(trace_recovery(DEBUG1),
-                                        "recovery snapshots are now enabled");
-                       }
-                       else
-                               elog(trace_recovery(DEBUG1),
-                                 "recovery snapshot waiting for non-overflowed snapshot or "
-                               "until oldest active xid on standby is at least %u (now %u)",
-                                        standbySnapshotPendingXmin,
-                                        running->oldestRunningXid);
-                       return;
-               }
-       }
-
         Assert(standbyState == STANDBY_INITIALIZED);
  
         /*
@@ -753,78 +376,10 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
          */
  
         /*
-        * Nobody else is running yet, but take locks anyhow
-        */
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-       /*
-        * KnownAssignedXids is sorted so we cannot just add the xids, we have to
-        * sort them first.
-        *
-        * Some of the new xids are top-level xids and some are subtransactions.
-        * We don't call SubtransSetParent because it doesn't matter yet. If we
-        * aren't overflowed then all xids will fit in snapshot and so we don't
-        * need subtrans. If we later overflow, an xid assignment record will add
-        * xids to subtrans. If RunningXacts is overflowed then we don't have
-        * enough information to correctly update subtrans anyway.
-        */
-
-       /*
-        * Allocate a temporary array to avoid modifying the array passed as
-        * argument.
-        */
-       xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt));
-
-       /*
-        * Add to the temp array any xids which have not already completed.
-        */
-       nxids = 0;
-       for (i = 0; i < running->xcnt + running->subxcnt; i++)
-       {
-               TransactionId xid = running->xids[i];
-
-               /*
-                * The running-xacts snapshot can contain xids that were still visible
-                * in the procarray when the snapshot was taken, but were already
-                * WAL-logged as completed. They're not running anymore, so ignore
-                * them.
-                */
-               if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
-                       continue;
-
-               xids[nxids++] = xid;
-       }
-
-       if (nxids > 0)
-       {
-               if (procArray->numKnownAssignedXids != 0)
-               {
-                       LWLockRelease(ProcArrayLock);
-                       elog(ERROR, "KnownAssignedXids is not empty");
-               }
-
-               /*
-                * Sort the array so that we can add them safely into
-                * KnownAssignedXids.
-                */
-               qsort(xids, nxids, sizeof(TransactionId), xidComparator);
-
-               /*
-                * Add the sorted snapshot into KnownAssignedXids
-                */
-               for (i = 0; i < nxids; i++)
-                       KnownAssignedXidsAdd(xids[i], xids[i], true);
-
-               KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
-       }
-
-       pfree(xids);
-
-       /*
-        * latestObservedXid is at least set to the point where SUBTRANS was
+        * latestObservedXid is at least set to the point where CSNLOG was
          * started up to (c.f. ProcArrayInitRecovery()) or to the biggest xid
-        * RecordKnownAssignedTransactionIds() was called for.  Initialize
-        * subtrans from thereon, up to nextXid - 1.
+        * RecordKnownAssignedTransactionIds() (FIXME: gone!) was called for.  Initialize
+        * csnlog from thereon, up to nextXid - 1.
          *
          * We need to duplicate parts of RecordKnownAssignedTransactionId() here,
          * because we've just added xids to the known assigned xids machinery that
@@ -834,52 +389,11 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
         TransactionIdAdvance(latestObservedXid);
         while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
         {
-               ExtendSUBTRANS(latestObservedXid);
+               ExtendCSNLOG(latestObservedXid);
                 TransactionIdAdvance(latestObservedXid);
         }
         TransactionIdRetreat(latestObservedXid);        /* = running->nextXid - 1 */
  
-       /* ----------
-        * Now we've got the running xids we need to set the global values that
-        * are used to track snapshots as they evolve further.
-        *
-        * - latestCompletedXid which will be the xmax for snapshots
-        * - lastOverflowedXid which shows whether snapshots overflow
-        * - nextXid
-        *
-        * If the snapshot overflowed, then we still initialise with what we know,
-        * but the recovery snapshot isn't fully valid yet because we know there
-        * are some subxids missing. We don't know the specific subxids that are
-        * missing, so conservatively assume the last one is latestObservedXid.
-        * ----------
-        */
-       if (running->subxid_overflow)
-       {
-               standbyState = STANDBY_SNAPSHOT_PENDING;
-
-               standbySnapshotPendingXmin = latestObservedXid;
-               procArray->lastOverflowedXid = latestObservedXid;
-       }
-       else
-       {
-               standbyState = STANDBY_SNAPSHOT_READY;
-
-               standbySnapshotPendingXmin = InvalidTransactionId;
-       }
-
-       /*
-        * If a transaction wrote a commit record in the gap between taking and
-        * logging the snapshot then latestCompletedXid may already be higher than
-        * the value from the snapshot, so check before we use the incoming value.
-        */
-       if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
-                                                         running->latestCompletedXid))
-               ShmemVariableCache->latestCompletedXid = running->latestCompletedXid;
-
-       Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
-
-       LWLockRelease(ProcArrayLock);
-
         /*
          * ShmemVariableCache->nextXid must be beyond any observed xid.
          *
@@ -898,366 +412,202 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
  
         Assert(TransactionIdIsValid(ShmemVariableCache->nextXid));
  
-       KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
-       if (standbyState == STANDBY_SNAPSHOT_READY)
-               elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled");
-       else
-               elog(trace_recovery(DEBUG1),
-                        "recovery snapshot waiting for non-overflowed snapshot or "
-                        "until oldest active xid on standby is at least %u (now %u)",
-                        standbySnapshotPendingXmin,
-                        running->oldestRunningXid);
+       standbyState = STANDBY_SNAPSHOT_READY;
+       elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled");
  }
  
  /*
- * ProcArrayApplyXidAssignment
- *             Process an XLOG_XACT_ASSIGNMENT WAL record
+ * TransactionIdIsActive -- is xid the top-level XID of an active backend?
+ *
+ * This ignores prepared transactions and subtransactions, since that's not
+ * needed for current uses.
   */
-void
-ProcArrayApplyXidAssignment(TransactionId topxid,
-                                                       int nsubxids, TransactionId *subxids)
+bool
+TransactionIdIsActive(TransactionId xid)
  {
-       TransactionId max_xid;
+       bool            result = false;
+       ProcArrayStruct *arrayP = procArray;
         int                     i;
  
-       Assert(standbyState >= STANDBY_INITIALIZED);
-
-       max_xid = TransactionIdLatest(topxid, nsubxids, subxids);
-
-       /*
-        * Mark all the subtransactions as observed.
-        *
-        * NOTE: This will fail if the subxid contains too many previously
-        * unobserved xids to fit into known-assigned-xids. That shouldn't happen
-        * as the code stands, because xid-assignment records should never contain
-        * more than PGPROC_MAX_CACHED_SUBXIDS entries.
-        */
-       RecordKnownAssignedTransactionIds(max_xid);
+       LWLockAcquire(ProcArrayLock, LW_SHARED);
  
-       /*
-        * Notice that we update pg_subtrans with the top-level xid, rather than
-        * the parent xid. This is a difference between normal processing and
-        * recovery, yet is still correct in all cases. The reason is that
-        * subtransaction commit is not marked in clog until commit processing, so
-        * all aborted subtransactions have already been clearly marked in clog.
-        * As a result we are able to refer directly to the top-level
-        * transaction's state rather than skipping through all the intermediate
-        * states in the subtransaction tree. This should be the first time we
-        * have attempted to SubTransSetParent().
-        */
-       for (i = 0; i < nsubxids; i++)
-               SubTransSetParent(subxids[i], topxid, false);
+       for (i = 0; i < arrayP->numProcs; i++)
+       {
+               int                     pgprocno = arrayP->pgprocnos[i];
+               volatile PGPROC *proc = &allProcs[pgprocno];
+               volatile PGXACT *pgxact = &allPgXact[pgprocno];
+               TransactionId pxid;
  
-       /* KnownAssignedXids isn't maintained yet, so we're done for now */
-       if (standbyState == STANDBY_INITIALIZED)
-               return;
+               /* Fetch xid just once - see GetNewTransactionId */
+               pxid = pgxact->xid;
  
-       /*
-        * Uses same locking as transaction commit
-        */
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+               if (!TransactionIdIsValid(pxid))
+                       continue;
  
-       /*
-        * Remove subxids from known-assigned-xacts.
-        */
-       KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
+               if (proc->pid == 0)
+                       continue;                       /* ignore prepared transactions */
  
-       /*
-        * Advance lastOverflowedXid to be at least the last of these subxids.
-        */
-       if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid))
-               procArray->lastOverflowedXid = max_xid;
+               if (TransactionIdEquals(pxid, xid))
+               {
+                       result = true;
+                       break;
+               }
+       }
  
         LWLockRelease(ProcArrayLock);
+
+       return result;
  }
  
  /*
- * TransactionIdIsInProgress -- is given transaction running in some backend
- *
- * Aside from some shortcuts such as checking RecentXmin and our own Xid,
- * there are four possibilities for finding a running transaction:
- *
- * 1. The given Xid is a main transaction Id.  We will find this out cheaply
- * by looking at the PGXACT struct for each backend.
- *
- * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
- * We can find this out cheaply too.
+ * AdvanceOldestActiveXid --
   *
- * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
- * if the Xid is running on the master.
- *
- * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
- * if that is running according to PGXACT or KnownAssignedXids.  This is the
- * slowest way, but sadly it has to be done always if the others failed,
- * unless we see that the cached subxact sets are complete (none have
- * overflowed).
- *
- * ProcArrayLock has to be held while we do 1, 2, 3.  If we save the top Xids
- * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
- * This buys back some concurrency (and we can't retrieve the main Xids from
- * PGXACT again anyway; see GetNewTransactionId).
+ * Advance oldestActiveXid. 'myXid' is our own XID, which is known to be
+ * finished now; we do nothing unless it is the current oldestActiveXid.
   */
-bool
-TransactionIdIsInProgress(TransactionId xid)
+static void
+AdvanceOldestActiveXid(TransactionId myXid)
  {
-       static TransactionId *xids = NULL;
-       int                     nxids = 0;
-       ProcArrayStruct *arrayP = procArray;
-       TransactionId topxid;
-       int                     i,
-                               j;
+       TransactionId nextXid;
+       TransactionId xid;
+       TransactionId oldValue;
  
-       /*
-        * Don't bother checking a transaction older than RecentXmin; it could not
-        * possibly still be running.  (Note: in particular, this guarantees that
-        * we reject InvalidTransactionId, FrozenTransactionId, etc as not
-        * running.)
-        */
-       if (TransactionIdPrecedes(xid, RecentXmin))
-       {
-               xc_by_recent_xmin_inc();
-               return false;
-       }
+       oldValue = pg_atomic_read_u32(&ShmemVariableCache->oldestActiveXid);
  
-       /*
-        * We may have just checked the status of this transaction, so if it is
-        * already known to be completed, we can fall out without any access to
-        * shared memory.
-        */
-       if (TransactionIdIsKnownCompleted(xid))
-       {
-               xc_by_known_xact_inc();
-               return false;
-       }
+       /* Quick exit if we were not the oldest active XID. */
+       if (myXid != oldValue)
+               return;
  
-       /*
-        * Also, we can handle our own transaction (and subtransactions) without
-        * any access to shared memory.
-        */
-       if (TransactionIdIsCurrentTransactionId(xid))
-       {
-               xc_by_my_xact_inc();
-               return true;
-       }
+       xid = myXid;
+       TransactionIdAdvance(xid);
  
-       /*
-        * If first time through, get workspace to remember main XIDs in. We
-        * malloc it permanently to avoid repeated palloc/pfree overhead.
-        */
-       if (xids == NULL)
+       for (;;)
         {
                 /*
-                * In hot standby mode, reserve enough space to hold all xids in the
-                * known-assigned list. If we later finish recovery, we no longer need
-                * the bigger array, but we don't bother to shrink it.
+                * Current nextXid is the upper bound, if there are no transactions
+                * active at all.
                  */
-               int                     maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
+               /* assume we can read nextXid atomically without holding XidGenLock. */
+               nextXid = ShmemVariableCache->nextXid;
  
-               xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
-               if (xids == NULL)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                        errmsg("out of memory")));
-       }
+               /* Scan the CSN Log for the next in-progress xid */
+               while (TransactionIdPrecedes(xid, nextXid) &&
+                          TransactionIdGetStatus(xid) != XID_INPROGRESS)
+                       TransactionIdAdvance(xid);
  
-       LWLockAcquire(ProcArrayLock, LW_SHARED);
+               Assert(xid >= pg_atomic_read_u32(&ShmemVariableCache->globalXmin));
  
-       /*
-        * Now that we have the lock, we can check latestCompletedXid; if the
-        * target Xid is after that, it's surely still running.
-        */
-       if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid))
-       {
-               LWLockRelease(ProcArrayLock);
-               xc_by_latest_xid_inc();
-               return true;
-       }
-
-       /* No shortcuts, gotta grovel through the array */
-       for (i = 0; i < arrayP->numProcs; i++)
-       {
-               int                     pgprocno = arrayP->pgprocnos[i];
-               volatile PGPROC *proc = &allProcs[pgprocno];
-               volatile PGXACT *pgxact = &allPgXact[pgprocno];
-               TransactionId pxid;
-
-               /* Ignore my own proc --- dealt with it above */
-               if (proc == MyProc)
-                       continue;
-
-               /* Fetch xid just once - see GetNewTransactionId */
-               pxid = pgxact->xid;
-
-               if (!TransactionIdIsValid(pxid))
-                       continue;
-
-               /*
-                * Step 1: check the main Xid
-                */
-               if (TransactionIdEquals(pxid, xid))
-               {
-                       LWLockRelease(ProcArrayLock);
-                       xc_by_main_xid_inc();
-                       return true;
-               }
-
-               /*
-                * We can ignore main Xids that are younger than the target Xid, since
-                * the target could not possibly be their child.
-                */
-               if (TransactionIdPrecedes(xid, pxid))
-                       continue;
-
-               /*
-                * Step 2: check the cached child-Xids arrays
-                */
-               for (j = pgxact->nxids - 1; j >= 0; j--)
+               if (xid == oldValue)
                 {
-                       /* Fetch xid just once - see GetNewTransactionId */
-                       TransactionId cxid = proc->subxids.xids[j];
-
-                       if (TransactionIdEquals(cxid, xid))
-                       {
-                               LWLockRelease(ProcArrayLock);
-                               xc_by_child_xid_inc();
-                               return true;
-                       }
+                       /* nothing more to do */
+                       break;
                 }
  
                 /*
-                * Save the main Xid for step 4.  We only need to remember main Xids
-                * that have uncached children.  (Note: there is no race condition
-                * here because the overflowed flag cannot be cleared, only set, while
-                * we hold ProcArrayLock.  So we can't miss an Xid that we need to
-                * worry about.)
+                * Update oldestActiveXid with that value.
                  */
-               if (pgxact->overflowed)
-                       xids[nxids++] = pxid;
-       }
-
-       /*
-        * Step 3: in hot standby mode, check the known-assigned-xids list.  XIDs
-        * in the list must be treated as running.
-        */
-       if (RecoveryInProgress())
-       {
-               /* none of the PGXACT entries should have XIDs in hot standby mode */
-               Assert(nxids == 0);
-
-               if (KnownAssignedXidExists(xid))
+               if (!pg_atomic_compare_exchange_u32(&ShmemVariableCache->oldestActiveXid,
+                                                                                       &oldValue,
+                                                                                       xid))
                 {
-                       LWLockRelease(ProcArrayLock);
-                       xc_by_known_assigned_inc();
-                       return true;
+                       /*
+                        * Someone beat us to it. This can happen if we hit the race
+                        * condition described below. That's OK. We're no longer the oldest active
+                        * XID in that case, so we're done.
+                        */
+                       Assert(TransactionIdFollows(oldValue, myXid));
+                       break;
                 }
  
                 /*
-                * If the KnownAssignedXids overflowed, we have to check pg_subtrans
-                * too.  Fetch all xids from KnownAssignedXids that are lower than
-                * xid, since if xid is a subtransaction its parent will always have a
-                * lower value.  Note we will collect both main and subXIDs here, but
-                * there's no help for it.
+                * We're not necessarily done yet. It's possible that the XID that we saw
+                * as still running committed just before we updated oldestActiveXid.
+                * She didn't see herself as the oldest transaction, so she wouldn't
+                * update oldestActiveXid. Loop back to check that the XID we saw as
+                * the oldest in-progress one is still in-progress, and if not, update
+                * oldestActiveXid again, on behalf of that transaction.
                  */
-               if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
-                       nxids = KnownAssignedXidsGet(xids, xid);
-       }
-
-       LWLockRelease(ProcArrayLock);
-
-       /*
-        * If none of the relevant caches overflowed, we know the Xid is not
-        * running without even looking at pg_subtrans.
-        */
-       if (nxids == 0)
-       {
-               xc_no_overflow_inc();
-               return false;
-       }
-
-       /*
-        * Step 4: have to check pg_subtrans.
-        *
-        * At this point, we know it's either a subtransaction of one of the Xids
-        * in xids[], or it's not running.  If it's an already-failed
-        * subtransaction, we want to say "not running" even though its parent may
-        * still be running.  So first, check pg_clog to see if it's been aborted.
-        */
-       xc_slow_answer_inc();
-
-       if (TransactionIdDidAbort(xid))
-               return false;
-
-       /*
-        * It isn't aborted, so check whether the transaction tree it belongs to
-        * is still running (or, more precisely, whether it was running when we
-        * held ProcArrayLock).
-        */
-       topxid = SubTransGetTopmostTransaction(xid);
-       Assert(TransactionIdIsValid(topxid));
-       if (!TransactionIdEquals(topxid, xid))
-       {
-               for (i = 0; i < nxids; i++)
-               {
-                       if (TransactionIdEquals(xids[i], topxid))
-                               return true;
-               }
+               oldValue = xid;
         }
-
-       return false;
  }
  
  /*
- * TransactionIdIsActive -- is xid the top-level XID of an active backend?
+ * AdvanceGlobalXmin --
   *
- * This differs from TransactionIdIsInProgress in that it ignores prepared
- * transactions, as well as transactions running on the master if we're in
- * hot standby.  Also, we ignore subtransactions since that's not needed
- * for current uses.
+ * Advance GlobalXmin.
   */
-bool
-TransactionIdIsActive(TransactionId xid)
+static void
+AdvanceGlobalXmin(TransactionId myXmin)
  {
-       bool            result = false;
+       TransactionId newGlobalXmin;
+       TransactionId currentGlobalXmin;
         ProcArrayStruct *arrayP = procArray;
-       int                     i;
+       int                     index;
  
-       /*
-        * Don't bother checking a transaction older than RecentXmin; it could not
-        * possibly still be running.
-        */
-       if (TransactionIdPrecedes(xid, RecentXmin))
-               return false;
+       /* Quick exit if we were not the oldest xmin */
+       if (myXmin != pg_atomic_read_u32(&ShmemVariableCache->globalXmin))
+               return;
  
         LWLockAcquire(ProcArrayLock, LW_SHARED);
  
-       for (i = 0; i < arrayP->numProcs; i++)
+       /*
+        * We initialize the MIN() calculation with oldestActiveXid. This
+        * is a lower bound for the XIDs that might appear in the ProcArray later,
+        * and so protects us against overestimating the result due to future
+        * additions.
+        */
+       newGlobalXmin = pg_atomic_read_u32(&ShmemVariableCache->oldestActiveXid);
+       Assert(TransactionIdIsNormal(newGlobalXmin));
+
+       for (index = 0; index < arrayP->numProcs; index++)
         {
-               int                     pgprocno = arrayP->pgprocnos[i];
-               volatile PGPROC *proc = &allProcs[pgprocno];
+               int                     pgprocno = arrayP->pgprocnos[index];
                 volatile PGXACT *pgxact = &allPgXact[pgprocno];
-               TransactionId pxid;
  
                 /* Fetch xid just once - see GetNewTransactionId */
-               pxid = pgxact->xid;
+               TransactionId xid = pgxact->xid;
  
-               if (!TransactionIdIsValid(pxid))
+               /*
+                * Backend is doing logical decoding which manages xmin separately,
+                * check below.
+                */
+               if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
                         continue;
  
-               if (proc->pid == 0)
-                       continue;                       /* ignore prepared transactions */
+               if (pgxact->vacuumFlags & PROC_IN_VACUUM)
+                       continue;
  
-               if (TransactionIdEquals(pxid, xid))
-               {
-                       result = true;
-                       break;
-               }
+               /* First consider the transaction's own Xid, if any */
+               if (TransactionIdIsNormal(xid) &&
+                       TransactionIdPrecedes(xid, newGlobalXmin))
+                       newGlobalXmin = xid;
+
+               /*
+                * Also consider the transaction's Xmin, if set.
+                *
+                * We must check both Xid and Xmin because a transaction might
+                * have an Xmin but not (yet) an Xid; conversely, if it has an
+                * Xid, that could determine some not-yet-set Xmin.
+                */
+               xid = pgxact->xmin; /* Fetch just once */
+               if (TransactionIdIsNormal(xid) &&
+                       TransactionIdPrecedes(xid, newGlobalXmin))
+                       newGlobalXmin = xid;
         }
  
-       LWLockRelease(ProcArrayLock);
+       for (;;)
+       {
+               currentGlobalXmin = pg_atomic_read_u32(&ShmemVariableCache->globalXmin);
+               if (!TransactionIdFollows(newGlobalXmin, currentGlobalXmin))
+                       break;  /* someone else computed a higher value */
  
-       return result;
-}
+               if (pg_atomic_compare_exchange_u32(&ShmemVariableCache->globalXmin,
+                                                                                  &currentGlobalXmin, newGlobalXmin))
+                       break;  /* we updated the value successfully. */
+       }
  
+       LWLockRelease(ProcArrayLock);
+}
  
  /*
   * GetOldestXmin -- returns oldest transaction that was running
@@ -1276,7 +626,7 @@ TransactionIdIsActive(TransactionId xid)
   * ignore concurrently running lazy VACUUMs because (a) they must be working
   * on other tables, and (b) they don't need to do snapshot-based lookups.
   *
- * This is also used to determine where to truncate pg_subtrans.  For that
+ * This is also used to determine where to truncate pg_csnlog. For that
   * backends in all databases have to be considered, so rel = NULL has to be
   * passed in.
   *
@@ -1306,6 +656,10 @@ TransactionIdIsActive(TransactionId xid)
   * The return value is also adjusted with vacuum_defer_cleanup_age, so
   * increasing that setting on the fly is another easy way to make
   * GetOldestXmin() move backwards, with no consequences for data integrity.
+ *
+ *
+ * XXX: We track GlobalXmin in shared memory now. Would it make sense to
+ * have GetOldestXmin() just return that? At least for the rel == NULL case.
   */
  TransactionId
  GetOldestXmin(Relation rel, bool ignoreVacuum)
@@ -1336,7 +690,7 @@ GetOldestXmin(Relation rel, bool ignoreVacuum)
          * and so protects us against overestimating the result due to future
          * additions.
          */
-       result = ShmemVariableCache->latestCompletedXid;
+       result = pg_atomic_read_u32(&ShmemVariableCache->latestCompletedXid);
         Assert(TransactionIdIsNormal(result));
         TransactionIdAdvance(result);
  
@@ -1386,27 +740,10 @@ GetOldestXmin(Relation rel, bool ignoreVacuum)
         replication_slot_xmin = procArray->replication_slot_xmin;
         replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
  
-       if (RecoveryInProgress())
-       {
-               /*
-                * Check to see whether KnownAssignedXids contains an xid value older
-                * than the main procarray.
-                */
-               TransactionId kaxmin = KnownAssignedXidsGetOldestXmin();
-
-               LWLockRelease(ProcArrayLock);
+       LWLockRelease(ProcArrayLock);
  
-               if (TransactionIdIsNormal(kaxmin) &&
-                       TransactionIdPrecedes(kaxmin, result))
-                       result = kaxmin;
-       }
-       else
+       if (!RecoveryInProgress())
         {
-               /*
-                * No other information needed, so release the lock immediately.
-                */
-               LWLockRelease(ProcArrayLock);
-
                 /*
                  * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age,
                  * being careful not to generate a "permanent" XID.
@@ -1449,277 +786,169 @@ GetOldestXmin(Relation rel, bool ignoreVacuum)
  }
  
  /*
- * GetMaxSnapshotXidCount -- get max size for snapshot XID array
- *
- * We have to export this for use by snapmgr.c.
- */
-int
-GetMaxSnapshotXidCount(void)
-{
-       return procArray->maxProcs;
-}
  
-/*
- * GetMaxSnapshotSubxidCount -- get max size for snapshot sub-XID array
- *
- * We have to export this for use by snapmgr.c.
- */
-int
-GetMaxSnapshotSubxidCount(void)
-{
-       return TOTAL_MAX_CACHED_SUBXIDS;
-}
+oldestActiveXid
+       oldest XID that's currently in-progress
+
+GlobalXmin
+       oldest XID that's *seen* by any active snapshot as still in-progress
+
+latestCompletedXid
+       latest XID that has committed.
+
+CSN
+       current CSN
+
+
+
+Get snapshot:
+
+1. LWLockAcquire(ProcArrayLock, LW_SHARED)
+2. Read oldestActiveXid. Store it in MyProc->xmin
+3. Read CSN
+4. LWLockRelease(ProcArrayLock)
+
+End-of-xact:
+
+1. LWLockAcquire(ProcArrayLock, LW_SHARED)
+2. Reset MyProc->xmin, xid and CSN
+3. Was my XID == oldestActiveXid? If so, advance oldestActiveXid.
+4. Was my xmin == GlobalXmin? If so, advance GlobalXmin.
+5. LWLockRelease(ProcArrayLock)
+
+AdvanceGlobalXmin:
+
+1. LWLockAcquire(ProcArrayLock, LW_SHARED)
+2. Read current oldestActiveXid. That's the upper bound. If a transaction
+   begins now, that's the xmin it would get.
+3. Scan ProcArray, for the smallest xmin.
+4. Set that as the new GlobalXmin.
+5. LWLockRelease(ProcArrayLock)
+
+AdvanceOldestActiveXid:
+
+Two alternatives: scan the csnlog or scan the procarray. Scanning the
+procarray is tricky: it's possible that a backend has just read nextXid,
+but not set it in MyProc->xid yet.
+
+
+*/
+
+
  
  /*
- * GetSnapshotData -- returns information about running transactions.
- *
- * The returned snapshot includes xmin (lowest still-running xact ID),
- * xmax (highest completed xact ID + 1), and a list of running xact IDs
- * in the range xmin <= xid < xmax.  It is used as follows:
- *             All xact IDs < xmin are considered finished.
- *             All xact IDs >= xmax are considered still running.
- *             For an xact ID xmin <= xid < xmax, consult list to see whether
- *             it is considered running or not.
+ * GetSnapshotData -- returns an MVCC snapshot.
+ *
+ * The crux of the returned snapshot is the current Commit-Sequence-Number.
+ * All transactions that committed before the CSN are considered
+ * as visible to the snapshot, and all transactions that committed at or
+ * later are considered as still-in-progress.
+ *
+ * The returned snapshot also includes xmin (lowest still-running xact ID),
+ * and xmax (highest completed xact ID + 1). They can be used to avoid
+ * the more expensive check against the CSN:
+ *             All xact IDs < xmin are known to be finished.
+ *             All xact IDs >= xmax are known to be still running.
+ *             For an xact ID xmin <= xid < xmax, consult the CSNLOG to see
+ *             whether its CSN is before or after the snapshot's CSN.
+ *
   * This ensures that the set of transactions seen as "running" by the
   * current xact will not change after it takes the snapshot.
   *
- * All running top-level XIDs are included in the snapshot, except for lazy
- * VACUUM processes.  We also try to include running subtransaction XIDs,
- * but since PGPROC has only a limited cache area for subxact XIDs, full
- * information may not be available.  If we find any overflowed subxid arrays,
- * we have to mark the snapshot's subxid data as overflowed, and extra work
- * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
- * in tqual.c).
- *
   * We also update the following backend-global variables:
   *             TransactionXmin: the oldest xmin of any snapshot in use in the
- *                     current transaction (this is the same as MyPgXact->xmin).
- *             RecentXmin: the xmin computed for the most recent snapshot.  XIDs
- *                     older than this are known not running any more.
+ *                     current transaction.
   *             RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
- *                     running transactions, except those running LAZY VACUUM).  This is
- *                     the same computation done by GetOldestXmin(true, true).
+ *                     running transactions, except those running LAZY VACUUM). This
+ *                     can be used to opportunistically remove old dead tuples.
   *             RecentGlobalDataXmin: the global xmin for non-catalog tables
   *                     >= RecentGlobalXmin
- *
- * Note: this function should probably not be called with an argument that's
- * not statically allocated (see xip allocation below).
   */
  Snapshot
  GetSnapshotData(Snapshot snapshot)
  {
-       ProcArrayStruct *arrayP = procArray;
         TransactionId xmin;
         TransactionId xmax;
         TransactionId globalxmin;
-       int                     index;
-       int                     count = 0;
-       int                     subcount = 0;
-       bool            suboverflowed = false;
+       CommitSeqNo snapshotcsn;
+       bool            takenDuringRecovery;
         volatile TransactionId replication_slot_xmin = InvalidTransactionId;
         volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
  
         Assert(snapshot != NULL);
  
         /*
-        * Allocating space for maxProcs xids is usually overkill; numProcs would
-        * be sufficient.  But it seems better to do the malloc while not holding
-        * the lock, so we can't look at numProcs.  Likewise, we allocate much
-        * more subxip storage than is probably needed.
-        *
-        * This does open a possibility for avoiding repeated malloc/free: since
-        * maxProcs does not change at runtime, we can simply reuse the previous
-        * xip arrays if any.  (This relies on the fact that all callers pass
-        * static SnapshotData structs.)
-        */
-       if (snapshot->xip == NULL)
-       {
-               /*
-                * First call for this snapshot. Snapshot is same size whether or not
-                * we are in recovery, see later comments.
-                */
-               snapshot->xip = (TransactionId *)
-                       malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId));
-               if (snapshot->xip == NULL)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                        errmsg("out of memory")));
-               Assert(snapshot->subxip == NULL);
-               snapshot->subxip = (TransactionId *)
-                       malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId));
-               if (snapshot->subxip == NULL)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                        errmsg("out of memory")));
-       }
-
-       /*
-        * It is sufficient to get shared lock on ProcArrayLock, even if we are
-        * going to set MyPgXact->xmin.
+        * A shared lock is enough to modify my own entry
          */
         LWLockAcquire(ProcArrayLock, LW_SHARED);
  
-       /* xmax is always latestCompletedXid + 1 */
-       xmax = ShmemVariableCache->latestCompletedXid;
-       Assert(TransactionIdIsNormal(xmax));
-       TransactionIdAdvance(xmax);
-
-       /* initialize xmin calculation with xmax */
-       globalxmin = xmin = xmax;
+       takenDuringRecovery = RecoveryInProgress();
  
-       snapshot->takenDuringRecovery = RecoveryInProgress();
+       /* Anything older than oldestActiveXid is surely finished by now. */
+       xmin = pg_atomic_read_u32(&ShmemVariableCache->oldestActiveXid);
  
-       if (!snapshot->takenDuringRecovery)
+       /* Announce my xmin, to hold back GlobalXmin. */
+       if (!TransactionIdIsValid(MyPgXact->xmin))
         {
-               int                *pgprocnos = arrayP->pgprocnos;
-               int                     numProcs;
-
-               /*
-                * Spin over procArray checking xid, xmin, and subxids.  The goal is
-                * to gather all active xids, find the lowest xmin, and try to record
-                * subxids.
-                */
-               numProcs = arrayP->numProcs;
-               for (index = 0; index < numProcs; index++)
-               {
-                       int                     pgprocno = pgprocnos[index];
-                       volatile PGXACT *pgxact = &allPgXact[pgprocno];
-                       TransactionId xid;
+               TransactionId oldestActiveXid;
  
-                       /*
-                        * Backend is doing logical decoding which manages xmin
-                        * separately, check below.
-                        */
-                       if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
-                               continue;
-
-                       /* Ignore procs running LAZY VACUUM */
-                       if (pgxact->vacuumFlags & PROC_IN_VACUUM)
-                               continue;
-
-                       /* Update globalxmin to be the smallest valid xmin */
-                       xid = pgxact->xmin; /* fetch just once */
-                       if (TransactionIdIsNormal(xid) &&
-                               NormalTransactionIdPrecedes(xid, globalxmin))
-                               globalxmin = xid;
-
-                       /* Fetch xid just once - see GetNewTransactionId */
-                       xid = pgxact->xid;
-
-                       /*
-                        * If the transaction has no XID assigned, we can skip it; it
-                        * won't have sub-XIDs either.  If the XID is >= xmax, we can also
-                        * skip it; such transactions will be treated as running anyway
-                        * (and any sub-XIDs will also be >= xmax).
-                        */
-                       if (!TransactionIdIsNormal(xid)
-                               || !NormalTransactionIdPrecedes(xid, xmax))
-                               continue;
-
-                       /*
-                        * We don't include our own XIDs (if any) in the snapshot, but we
-                        * must include them in xmin.
-                        */
-                       if (NormalTransactionIdPrecedes(xid, xmin))
-                               xmin = xid;
-                       if (pgxact == MyPgXact)
-                               continue;
-
-                       /* Add XID to snapshot. */
-                       snapshot->xip[count++] = xid;
-
-                       /*
-                        * Save subtransaction XIDs if possible (if we've already
-                        * overflowed, there's no point).  Note that the subxact XIDs must
-                        * be later than their parent, so no need to check them against
-                        * xmin.  We could filter against xmax, but it seems better not to
-                        * do that much work while holding the ProcArrayLock.
-                        *
-                        * The other backend can add more subxids concurrently, but cannot
-                        * remove any.  Hence it's important to fetch nxids just once.
-                        * Should be safe to use memcpy, though.  (We needn't worry about
-                        * missing any xids added concurrently, because they must postdate
-                        * xmax.)
-                        *
-                        * Again, our own XIDs are not included in the snapshot.
-                        */
-                       if (!suboverflowed)
-                       {
-                               if (pgxact->overflowed)
-                                       suboverflowed = true;
-                               else
-                               {
-                                       int                     nxids = pgxact->nxids;
+               MyPgXact->xmin = xmin;
  
-                                       if (nxids > 0)
-                                       {
-                                               volatile PGPROC *proc = &allProcs[pgprocno];
-
-                                               memcpy(snapshot->subxip + subcount,
-                                                          (void *) proc->subxids.xids,
-                                                          nxids * sizeof(TransactionId));
-                                               subcount += nxids;
-                                       }
-                               }
-                       }
-               }
-       }
-       else
-       {
                 /*
-                * We're in hot standby, so get XIDs from KnownAssignedXids.
-                *
-                * We store all xids directly into subxip[]. Here's why:
-                *
-                * In recovery we don't know which xids are top-level and which are
-                * subxacts, a design choice that greatly simplifies xid processing.
-                *
-                * It seems like we would want to try to put xids into xip[] only, but
-                * that is fairly small. We would either need to make that bigger or
-                * to increase the rate at which we WAL-log xid assignment; neither is
-                * an appealing choice.
+                * Recheck, if oldestActiveXid advanced after we read it.
                  *
-                * We could try to store xids into xip[] first and then into subxip[]
-                * if there are too many xids. That only works if the snapshot doesn't
-                * overflow because we do not search subxip[] in that case. A simpler
-                * way is to just store all xids in the subxact array because this is
-                * by far the bigger array. We just leave the xip array empty.
+                * This protects against a race condition with AdvanceGlobalXmin().
+                * If a transaction ends and runs AdvanceGlobalXmin(), just after we fetch
+                * oldestActiveXid, but before we set MyPgXact->xmin, it's possible
+                * that AdvanceGlobalXmin() computed a new GlobalXmin that doesn't
+                * cover the xmin that we got. To fix that, check oldestActiveXid
+                * again, after setting xmin. Redoing it once is enough, we don't need
+                * to loop, because the (stale) xmin that we set prevents the same
+                * race condition from advancing oldestXid again.
                  *
-                * Either way we need to change the way XidInMVCCSnapshot() works
-                * depending upon when the snapshot was taken, or change normal
-                * snapshot processing so it matches.
-                *
-                * Note: It is possible for recovery to end before we finish taking
-                * the snapshot, and for newly assigned transaction ids to be added to
-                * the ProcArray.  xmax cannot change while we hold ProcArrayLock, so
-                * those newly added transaction ids would be filtered away, so we
-                * need not be concerned about them.
+                * For a brief moment, we can have the situation that our xmin is
+                * lower than GlobalXmin, but it's OK because we don't use that xmin
+                * until we've re-checked and corrected it if necessary.
+                */
+               /*
+                * memory barrier to make sure that setting the xmin in our PGPROC entry
+                * is made visible to others, before the read below.
                  */
-               subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
-                                                                                                 xmax);
+               pg_memory_barrier();
+
+               oldestActiveXid  = pg_atomic_read_u32(&ShmemVariableCache->oldestActiveXid);
+               if (oldestActiveXid != xmin)
+               {
+                       xmin = oldestActiveXid;
+
+                       MyPgXact->xmin = xmin;
+               }
  
-               if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
-                       suboverflowed = true;
+               TransactionXmin = xmin;
         }
  
+       /*
+        * Get the current snapshot CSN, and copy that to my PGPROC entry. This
+        * serializes us with any concurrent commits.
+        */
+       snapshotcsn = pg_atomic_read_u64(&ShmemVariableCache->nextCommitSeqNo);
+       if (MyPgXact->snapshotcsn == InvalidCommitSeqNo)
+               MyPgXact->snapshotcsn = snapshotcsn;
+
+       /* Also get xmax. It is always latestCompletedXid + 1. */
+       xmax = pg_atomic_read_u32(&ShmemVariableCache->latestCompletedXid);
+       Assert(TransactionIdIsNormal(xmax));
+       TransactionIdAdvance(xmax);
+
+       /* Also read GlobalXmin. */
+       globalxmin = pg_atomic_read_u32(&ShmemVariableCache->globalXmin);
  
         /* fetch into volatile var while ProcArrayLock is held */
         replication_slot_xmin = procArray->replication_slot_xmin;
         replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
  
-       if (!TransactionIdIsValid(MyPgXact->xmin))
-               MyPgXact->xmin = TransactionXmin = xmin;
-
         LWLockRelease(ProcArrayLock);
  
-       /*
-        * Update globalxmin to include actual process xids.  This is a slightly
-        * different way of computing it than GetOldestXmin uses, but should give
-        * the same result.
-        */
-       if (TransactionIdPrecedes(xmin, globalxmin))
-               globalxmin = xmin;
-
         /* Update global variables too */
         RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age;
         if (!TransactionIdIsNormal(RecentGlobalXmin))
@@ -1741,15 +970,11 @@ GetSnapshotData(Snapshot snapshot)
                 NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin))
                 RecentGlobalXmin = replication_slot_catalog_xmin;
  
-       RecentXmin = xmin;
-
         snapshot->xmin = xmin;
         snapshot->xmax = xmax;
-       snapshot->xcnt = count;
-       snapshot->subxcnt = subcount;
-       snapshot->suboverflowed = suboverflowed;
-
+       snapshot->snapshotcsn = snapshotcsn;
         snapshot->curcid = GetCurrentCommandId(false);
+       snapshot->takenDuringRecovery = takenDuringRecovery;
  
         /*
          * This is a new snapshot, so set both refcounts are zero, and mark it as
@@ -1804,8 +1029,10 @@ ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid)
         if (!TransactionIdIsNormal(sourcexid))
                 return false;
  
-       /* Get lock so source xact can't end while we're doing this */
-       LWLockAcquire(ProcArrayLock, LW_SHARED);
+       /*
+        * Get exclusive lock so source xact can't end while we're doing this.
+        */
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
  
         for (index = 0; index < arrayP->numProcs; index++)
         {
@@ -1875,10 +1102,12 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc)
         Assert(TransactionIdIsNormal(xmin));
         Assert(proc != NULL);
  
-       /* Get lock so source xact can't end while we're doing this */
-       LWLockAcquire(ProcArrayLock, LW_SHARED);
-
-       pgxact = &allPgXact[proc->pgprocno];
+       /*
+        * Get exclusive lock so source xact can't end while we're doing this.
+        */
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+       pgxact = &allPgXact[proc->pgprocno];
  
         /*
          * Be certain that the referenced PGPROC has an advertised xmin which is
@@ -1903,29 +1132,24 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc)
  /*
   * GetRunningTransactionData -- returns information about running transactions.
   *
- * Similar to GetSnapshotData but returns more information. We include
- * all PGXACTs with an assigned TransactionId, even VACUUM processes.
+ * Returns the oldest running TransactionId among all backends, even VACUUM
+ * processes.
+ *
+ * We acquire XidGenLock, but the caller is responsible for releasing it.
+ * Acquiring XidGenLock ensures that no new XID can be assigned until
+ * the caller has WAL-logged this snapshot, and releases the lock.
+ * FIXME: this also used to hold ProcArrayLock, to prevent any transactions
+ * from committing until the caller has WAL-logged. I don't think we need
+ * that anymore, but verify.
   *
- * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for
- * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc
- * array until the caller has WAL-logged this snapshot, and releases the
- * lock. Acquiring ProcArrayLock ensures that no transactions commit until the
- * lock is released.
+ * Returns the current xmin and xmax, like GetSnapshotData does.
   *
   * The returned data structure is statically allocated; caller should not
   * modify it, and must not assume it is valid past the next call.
   *
- * This is never executed during recovery so there is no need to look at
- * KnownAssignedXids.
- *
   * We don't worry about updating other counters, we want to keep this as
   * simple as possible and leave GetSnapshotData() as the primary code for
   * that bookkeeping.
- *
- * Note that if any transaction has overflowed its cached subtransactions
- * then there is no real need include any subtransactions. That isn't a
- * common enough case to worry about optimising the size of the WAL record,
- * and we may wish to see that data for diagnostic purposes anyway.
   */
  RunningTransactions
  GetRunningTransactionData(void)
@@ -1935,43 +1159,11 @@ GetRunningTransactionData(void)
  
         ProcArrayStruct *arrayP = procArray;
         RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
-       TransactionId latestCompletedXid;
         TransactionId oldestRunningXid;
-       TransactionId *xids;
         int                     index;
-       int                     count;
-       int                     subcount;
-       bool            suboverflowed;
  
         Assert(!RecoveryInProgress());
  
-       /*
-        * Allocating space for maxProcs xids is usually overkill; numProcs would
-        * be sufficient.  But it seems better to do the malloc while not holding
-        * the lock, so we can't look at numProcs.  Likewise, we allocate much
-        * more subxip storage than is probably needed.
-        *
-        * Should only be allocated in bgwriter, since only ever executed during
-        * checkpoints.
-        */
-       if (CurrentRunningXacts->xids == NULL)
-       {
-               /*
-                * First call
-                */
-               CurrentRunningXacts->xids = (TransactionId *)
-                       malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
-               if (CurrentRunningXacts->xids == NULL)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_OUT_OF_MEMORY),
-                                        errmsg("out of memory")));
-       }
-
-       xids = CurrentRunningXacts->xids;
-
-       count = subcount = 0;
-       suboverflowed = false;
-
         /*
          * Ensure that no xids enter or leave the procarray while we obtain
          * snapshot.
@@ -1979,8 +1171,6 @@ GetRunningTransactionData(void)
         LWLockAcquire(ProcArrayLock, LW_SHARED);
         LWLockAcquire(XidGenLock, LW_SHARED);
  
-       latestCompletedXid = ShmemVariableCache->latestCompletedXid;
-
         oldestRunningXid = ShmemVariableCache->nextXid;
  
         /*
@@ -2002,47 +1192,8 @@ GetRunningTransactionData(void)
                 if (!TransactionIdIsValid(xid))
                         continue;
  
-               xids[count++] = xid;
-
                 if (TransactionIdPrecedes(xid, oldestRunningXid))
                         oldestRunningXid = xid;
-
-               if (pgxact->overflowed)
-                       suboverflowed = true;
-       }
-
-       /*
-        * Spin over procArray collecting all subxids, but only if there hasn't
-        * been a suboverflow.
-        */
-       if (!suboverflowed)
-       {
-               for (index = 0; index < arrayP->numProcs; index++)
-               {
-                       int                     pgprocno = arrayP->pgprocnos[index];
-                       volatile PGPROC *proc = &allProcs[pgprocno];
-                       volatile PGXACT *pgxact = &allPgXact[pgprocno];
-                       int                     nxids;
-
-                       /*
-                        * Save subtransaction XIDs. Other backends can't add or remove
-                        * entries while we're holding XidGenLock.
-                        */
-                       nxids = pgxact->nxids;
-                       if (nxids > 0)
-                       {
-                               memcpy(&xids[count], (void *) proc->subxids.xids,
-                                          nxids * sizeof(TransactionId));
-                               count += nxids;
-                               subcount += nxids;
-
-                               /*
-                                * Top-level XID of a transaction is always less than any of
-                                * its subxids, so we don't need to check if any of the
-                                * subxids are smaller than oldestRunningXid
-                                */
-                       }
-               }
         }
  
         /*
@@ -2054,18 +1205,14 @@ GetRunningTransactionData(void)
          * increases if slots do.
          */
  
-       CurrentRunningXacts->xcnt = count - subcount;
-       CurrentRunningXacts->subxcnt = subcount;
-       CurrentRunningXacts->subxid_overflow = suboverflowed;
         CurrentRunningXacts->nextXid = ShmemVariableCache->nextXid;
         CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
-       CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
  
         Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
         Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
-       Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
  
-       /* We don't release the locks here, the caller is responsible for that */
+       LWLockRelease(ProcArrayLock);
+       /* We don't release XidGenLock here, the caller is responsible for that */
  
         return CurrentRunningXacts;
  }
@@ -2073,17 +1220,18 @@ GetRunningTransactionData(void)
  /*
   * GetOldestActiveTransactionId()
   *
- * Similar to GetSnapshotData but returns just oldestActiveXid. We include
+ * Returns the oldest XID that's still running. We include
   * all PGXACTs with an assigned TransactionId, even VACUUM processes.
   * We look at all databases, though there is no need to include WALSender
   * since this has no effect on hot standby conflicts.
   *
- * This is never executed during recovery so there is no need to look at
- * KnownAssignedXids.
- *
   * We don't worry about updating other counters, we want to keep this as
   * simple as possible and leave GetSnapshotData() as the primary code for
   * that bookkeeping.
+ *
+ * XXX: We could just return ShmemVariableCache->oldestActiveXid. This
+ * uses a different method of computing the value though, so maybe this is
+ * useful as a cross-check?
   */
  TransactionId
  GetOldestActiveTransactionId(void)
@@ -2530,7 +1678,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
   *
   * All callers that are checking xmins always now supply a valid and useful
   * value for limitXmin. The limitXmin is always lower than the lowest
- * numbered KnownAssignedXid that is not already a FATAL error. This is
+ * numbered KnownAssignedXid (XXX) that is not already a FATAL error. This is
   * because we only care about cleanup records that are cleaning up tuple
   * versions from committed transactions. In that case they will only occur
   * at the point where the record is less than the lowest running xid. That
@@ -2952,170 +2100,9 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
         LWLockRelease(ProcArrayLock);
  }
  
-
-#define XidCacheRemove(i) \
-       do { \
-               MyProc->subxids.xids[i] = MyProc->subxids.xids[MyPgXact->nxids - 1]; \
-               MyPgXact->nxids--; \
-       } while (0)
-
-/*
- * XidCacheRemoveRunningXids
- *
- * Remove a bunch of TransactionIds from the list of known-running
- * subtransactions for my backend.  Both the specified xid and those in
- * the xids[] array (of length nxids) are removed from the subxids cache.
- * latestXid must be the latest XID among the group.
- */
-void
-XidCacheRemoveRunningXids(TransactionId xid,
-                                                 int nxids, const TransactionId *xids,
-                                                 TransactionId latestXid)
-{
-       int                     i,
-                               j;
-
-       Assert(TransactionIdIsValid(xid));
-
-       /*
-        * We must hold ProcArrayLock exclusively in order to remove transactions
-        * from the PGPROC array.  (See src/backend/access/transam/README.)  It's
-        * possible this could be relaxed since we know this routine is only used
-        * to abort subtransactions, but pending closer analysis we'd best be
-        * conservative.
-        */
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-       /*
-        * Under normal circumstances xid and xids[] will be in increasing order,
-        * as will be the entries in subxids.  Scan backwards to avoid O(N^2)
-        * behavior when removing a lot of xids.
-        */
-       for (i = nxids - 1; i >= 0; i--)
-       {
-               TransactionId anxid = xids[i];
-
-               for (j = MyPgXact->nxids - 1; j >= 0; j--)
-               {
-                       if (TransactionIdEquals(MyProc->subxids.xids[j], anxid))
-                       {
-                               XidCacheRemove(j);
-                               break;
-                       }
-               }
-
-               /*
-                * Ordinarily we should have found it, unless the cache has
-                * overflowed. However it's also possible for this routine to be
-                * invoked multiple times for the same subtransaction, in case of an
-                * error during AbortSubTransaction.  So instead of Assert, emit a
-                * debug warning.
-                */
-               if (j < 0 && !MyPgXact->overflowed)
-                       elog(WARNING, "did not find subXID %u in MyProc", anxid);
-       }
-
-       for (j = MyPgXact->nxids - 1; j >= 0; j--)
-       {
-               if (TransactionIdEquals(MyProc->subxids.xids[j], xid))
-               {
-                       XidCacheRemove(j);
-                       break;
-               }
-       }
-       /* Ordinarily we should have found it, unless the cache has overflowed */
-       if (j < 0 && !MyPgXact->overflowed)
-               elog(WARNING, "did not find subXID %u in MyProc", xid);
-
-       /* Also advance global latestCompletedXid while holding the lock */
-       if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
-                                                         latestXid))
-               ShmemVariableCache->latestCompletedXid = latestXid;
-
-       LWLockRelease(ProcArrayLock);
-}
-
-#ifdef XIDCACHE_DEBUG
-
-/*
- * Print stats about effectiveness of XID cache
- */
-static void
-DisplayXidCache(void)
-{
-       fprintf(stderr,
-                       "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
-                       xc_by_recent_xmin,
-                       xc_by_known_xact,
-                       xc_by_my_xact,
-                       xc_by_latest_xid,
-                       xc_by_main_xid,
-                       xc_by_child_xid,
-                       xc_by_known_assigned,
-                       xc_no_overflow,
-                       xc_slow_answer);
-}
-#endif   /* XIDCACHE_DEBUG */
-
-
-/* ----------------------------------------------
- *             KnownAssignedTransactions sub-module
- * ----------------------------------------------
- */
-
-/*
- * In Hot Standby mode, we maintain a list of transactions that are (or were)
- * running in the master at the current point in WAL.  These XIDs must be
- * treated as running by standby transactions, even though they are not in
- * the standby server's PGXACT array.
- *
- * We record all XIDs that we know have been assigned.  That includes all the
- * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
- * been assigned.  We can deduce the existence of unobserved XIDs because we
- * know XIDs are assigned in sequence, with no gaps.  The KnownAssignedXids
- * list expands as new XIDs are observed or inferred, and contracts when
- * transaction completion records arrive.
- *
- * During hot standby we do not fret too much about the distinction between
- * top-level XIDs and subtransaction XIDs. We store both together in the
- * KnownAssignedXids list.  In backends, this is copied into snapshots in
- * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
- * doesn't care about the distinction either.  Subtransaction XIDs are
- * effectively treated as top-level XIDs and in the typical case pg_subtrans
- * links are *not* maintained (which does not affect visibility).
- *
- * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
- * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every master transaction must
- * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
- * least every PGPROC_MAX_CACHED_SUBXIDS.  When we receive one of these
- * records, we mark the subXIDs as children of the top XID in pg_subtrans,
- * and then remove them from KnownAssignedXids.  This prevents overflow of
- * KnownAssignedXids and snapshots, at the cost that status checks for these
- * subXIDs will take a slower path through TransactionIdIsInProgress().
- * This means that KnownAssignedXids is not necessarily complete for subXIDs,
- * though it should be complete for top-level XIDs; this is the same situation
- * that holds with respect to the PGPROC entries in normal running.
- *
- * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
- * that, similarly to tracking overflow of a PGPROC's subxids array.  We do
- * that by remembering the lastOverflowedXID, ie the last thrown-away subXID.
- * As long as that is within the range of interesting XIDs, we have to assume
- * that subXIDs are missing from snapshots.  (Note that subXID overflow occurs
- * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
- * subXID arrives - that is not an error.)
- *
- * Should a backend on primary somehow disappear before it can write an abort
- * record, then we just leave those XIDs in KnownAssignedXids. They actually
- * aborted but we think they were running; the distinction is irrelevant
- * because either way any changes done by the transaction are not visible to
- * backends in the standby.  We prune KnownAssignedXids when
- * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
- * array due to such dead XIDs.
- */
-
  /*
   * RecordKnownAssignedTransactionIds
- *             Record the given XID in KnownAssignedXids, as well as any preceding
+ *             Record the given XID in KnownAssignedXids (FIXME: update comment, KnownAssignedXid is no more), as well as any preceding
   *             unobserved XIDs.
   *
   * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
@@ -3144,7 +2131,7 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
                 TransactionId next_expected_xid;
  
                 /*
-                * Extend subtrans like we do in GetNewTransactionId() during normal
+                * Extend csnlog like we do in GetNewTransactionId() during normal
                  * operation using individual extend steps. Note that we do not need
                  * to extend clog since its extensions are WAL logged.
                  *
@@ -3156,27 +2143,10 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
                 while (TransactionIdPrecedes(next_expected_xid, xid))
                 {
                         TransactionIdAdvance(next_expected_xid);
-                       ExtendSUBTRANS(next_expected_xid);
+                       ExtendCSNLOG(next_expected_xid);
                 }
                 Assert(next_expected_xid == xid);
  
-               /*
-                * If the KnownAssignedXids machinery isn't up yet, there's nothing
-                * more to do since we don't track assigned xids yet.
-                */
-               if (standbyState <= STANDBY_INITIALIZED)
-               {
-                       latestObservedXid = xid;
-                       return;
-               }
-
-               /*
-                * Add (latestObservedXid, xid] onto the KnownAssignedXids array.
-                */
-               next_expected_xid = latestObservedXid;
-               TransactionIdAdvance(next_expected_xid);
-               KnownAssignedXidsAdd(next_expected_xid, xid, false);
-
                 /*
                  * Now we can advance latestObservedXid
                  */
@@ -3190,726 +2160,3 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
                 LWLockRelease(XidGenLock);
         }
  }
-
-/*
- * ExpireTreeKnownAssignedTransactionIds
- *             Remove the given XIDs from KnownAssignedXids.
- *
- * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
- */
-void
-ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
-                                                          TransactionId *subxids, TransactionId max_xid)
-{
-       Assert(standbyState >= STANDBY_INITIALIZED);
-
-       /*
-        * Uses same locking as transaction commit
-        */
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-       KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
-
-       /* As in ProcArrayEndTransaction, advance latestCompletedXid */
-       if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
-                                                         max_xid))
-               ShmemVariableCache->latestCompletedXid = max_xid;
-
-       LWLockRelease(ProcArrayLock);
-}
-
-/*
- * ExpireAllKnownAssignedTransactionIds
- *             Remove all entries in KnownAssignedXids
- */
-void
-ExpireAllKnownAssignedTransactionIds(void)
-{
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-       KnownAssignedXidsRemovePreceding(InvalidTransactionId);
-       LWLockRelease(ProcArrayLock);
-}
-
-/*
- * ExpireOldKnownAssignedTransactionIds
- *             Remove KnownAssignedXids entries preceding the given XID
- */
-void
-ExpireOldKnownAssignedTransactionIds(TransactionId xid)
-{
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-       KnownAssignedXidsRemovePreceding(xid);
-       LWLockRelease(ProcArrayLock);
-}
-
-
-/*
- * Private module functions to manipulate KnownAssignedXids
- *
- * There are 5 main uses of the KnownAssignedXids data structure:
- *
- *     * backends taking snapshots - all valid XIDs need to be copied out
- *     * backends seeking to determine presence of a specific XID
- *     * startup process adding new known-assigned XIDs
- *     * startup process removing specific XIDs as transactions end
- *     * startup process pruning array when special WAL records arrive
- *
- * This data structure is known to be a hot spot during Hot Standby, so we
- * go to some lengths to make these operations as efficient and as concurrent
- * as possible.
- *
- * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
- * order, to be exact --- to allow binary search for specific XIDs.  Note:
- * in general TransactionIdPrecedes would not provide a total order, but
- * we know that the entries present at any instant should not extend across
- * a large enough fraction of XID space to wrap around (the master would
- * shut down for fear of XID wrap long before that happens).  So it's OK to
- * use TransactionIdPrecedes as a binary-search comparator.
- *
- * It's cheap to maintain the sortedness during insertions, since new known
- * XIDs are always reported in XID order; we just append them at the right.
- *
- * To keep individual deletions cheap, we need to allow gaps in the array.
- * This is implemented by marking array elements as valid or invalid using
- * the parallel boolean array KnownAssignedXidsValid[].  A deletion is done
- * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
- * XID entry itself.  This preserves the property that the XID entries are
- * sorted, so we can do binary searches easily.  Periodically we compress
- * out the unused entries; that's much cheaper than having to compress the
- * array immediately on every deletion.
- *
- * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
- * are those with indexes tail <= i < head; items outside this subscript range
- * have unspecified contents.  When head reaches the end of the array, we
- * force compression of unused entries rather than wrapping around, since
- * allowing wraparound would greatly complicate the search logic.  We maintain
- * an explicit tail pointer so that pruning of old XIDs can be done without
- * immediately moving the array contents.  In most cases only a small fraction
- * of the array contains valid entries at any instant.
- *
- * Although only the startup process can ever change the KnownAssignedXids
- * data structure, we still need interlocking so that standby backends will
- * not observe invalid intermediate states.  The convention is that backends
- * must hold shared ProcArrayLock to examine the array.  To remove XIDs from
- * the array, the startup process must hold ProcArrayLock exclusively, for
- * the usual transactional reasons (compare commit/abort of a transaction
- * during normal running).  Compressing unused entries out of the array
- * likewise requires exclusive lock.  To add XIDs to the array, we just insert
- * them into slots to the right of the head pointer and then advance the head
- * pointer.  This wouldn't require any lock at all, except that on machines
- * with weak memory ordering we need to be careful that other processors
- * see the array element changes before they see the head pointer change.
- * We handle this by using a spinlock to protect reads and writes of the
- * head/tail pointers.  (We could dispense with the spinlock if we were to
- * create suitable memory access barrier primitives and use those instead.)
- * The spinlock must be taken to read or write the head/tail pointers unless
- * the caller holds ProcArrayLock exclusively.
- *
- * Algorithmic analysis:
- *
- * If we have a maximum of M slots, with N XIDs currently spread across
- * S elements then we have N <= S <= M always.
- *
- *     * Adding a new XID is O(1) and needs little locking (unless compression
- *             must happen)
- *     * Compressing the array is O(S) and requires exclusive lock
- *     * Removing an XID is O(logS) and requires exclusive lock
- *     * Taking a snapshot is O(S) and requires shared lock
- *     * Checking for an XID is O(logS) and requires shared lock
- *
- * In comparison, using a hash table for KnownAssignedXids would mean that
- * taking snapshots would be O(M). If we can maintain S << M then the
- * sorted array technique will deliver significantly faster snapshots.
- * If we try to keep S too small then we will spend too much time compressing,
- * so there is an optimal point for any workload mix. We use a heuristic to
- * decide when to compress the array, though trimming also helps reduce
- * frequency of compressing. The heuristic requires us to track the number of
- * currently valid XIDs in the array.
- */
-
-
-/*
- * Compress KnownAssignedXids by shifting valid data down to the start of the
- * array, removing any gaps.
- *
- * A compression step is forced if "force" is true, otherwise we do it
- * only if a heuristic indicates it's a good time to do it.
- *
- * Caller must hold ProcArrayLock in exclusive mode.
- */
-static void
-KnownAssignedXidsCompress(bool force)
-{
-       /* use volatile pointer to prevent code rearrangement */
-       volatile ProcArrayStruct *pArray = procArray;
-       int                     head,
-                               tail;
-       int                     compress_index;
-       int                     i;
-
-       /* no spinlock required since we hold ProcArrayLock exclusively */
-       head = pArray->headKnownAssignedXids;
-       tail = pArray->tailKnownAssignedXids;
-
-       if (!force)
-       {
-               /*
-                * If we can choose how much to compress, use a heuristic to avoid
-                * compressing too often or not often enough.
-                *
-                * Heuristic is if we have a large enough current spread and less than
-                * 50% of the elements are currently in use, then compress. This
-                * should ensure we compress fairly infrequently. We could compress
-                * less often though the virtual array would spread out more and
-                * snapshots would become more expensive.
-                */
-               int                     nelements = head - tail;
-
-               if (nelements < 4 * PROCARRAY_MAXPROCS ||
-                       nelements < 2 * pArray->numKnownAssignedXids)
-                       return;
-       }
-
-       /*
-        * We compress the array by reading the valid values from tail to head,
-        * re-aligning data to 0th element.
-        */
-       compress_index = 0;
-       for (i = tail; i < head; i++)
-       {
-               if (KnownAssignedXidsValid[i])
-               {
-                       KnownAssignedXids[compress_index] = KnownAssignedXids[i];
-                       KnownAssignedXidsValid[compress_index] = true;
-                       compress_index++;
-               }
-       }
-
-       pArray->tailKnownAssignedXids = 0;
-       pArray->headKnownAssignedXids = compress_index;
-}
-
-/*
- * Add xids into KnownAssignedXids at the head of the array.
- *
- * xids from from_xid to to_xid, inclusive, are added to the array.
- *
- * If exclusive_lock is true then caller already holds ProcArrayLock in
- * exclusive mode, so we need no extra locking here.  Else caller holds no
- * lock, so we need to be sure we maintain sufficient interlocks against
- * concurrent readers.  (Only the startup process ever calls this, so no need
- * to worry about concurrent writers.)
- */
-static void
-KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
-                                        bool exclusive_lock)
-{
-       /* use volatile pointer to prevent code rearrangement */
-       volatile ProcArrayStruct *pArray = procArray;
-       TransactionId next_xid;
-       int                     head,
-                               tail;
-       int                     nxids;
-       int                     i;
-
-       Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));
-
-       /*
-        * Calculate how many array slots we'll need.  Normally this is cheap; in
-        * the unusual case where the XIDs cross the wrap point, we do it the hard
-        * way.
-        */
-       if (to_xid >= from_xid)
-               nxids = to_xid - from_xid + 1;
-       else
-       {
-               nxids = 1;
-               next_xid = from_xid;
-               while (TransactionIdPrecedes(next_xid, to_xid))
-               {
-                       nxids++;
-                       TransactionIdAdvance(next_xid);
-               }
-       }
-
-       /*
-        * Since only the startup process modifies the head/tail pointers, we
-        * don't need a lock to read them here.
-        */
-       head = pArray->headKnownAssignedXids;
-       tail = pArray->tailKnownAssignedXids;
-
-       Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
-       Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);
-
-       /*
-        * Verify that insertions occur in TransactionId sequence.  Note that even
-        * if the last existing element is marked invalid, it must still have a
-        * correctly sequenced XID value.
-        */
-       if (head > tail &&
-               TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
-       {
-               KnownAssignedXidsDisplay(LOG);
-               elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
-       }
-
-       /*
-        * If our xids won't fit in the remaining space, compress out free space
-        */
-       if (head + nxids > pArray->maxKnownAssignedXids)
-       {
-               /* must hold lock to compress */
-               if (!exclusive_lock)
-                       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-               KnownAssignedXidsCompress(true);
-
-               head = pArray->headKnownAssignedXids;
-               /* note: we no longer care about the tail pointer */
-
-               if (!exclusive_lock)
-                       LWLockRelease(ProcArrayLock);
-
-               /*
-                * If it still won't fit then we're out of memory
-                */
-               if (head + nxids > pArray->maxKnownAssignedXids)
-                       elog(ERROR, "too many KnownAssignedXids");
-       }
-
-       /* Now we can insert the xids into the space starting at head */
-       next_xid = from_xid;
-       for (i = 0; i < nxids; i++)
-       {
-               KnownAssignedXids[head] = next_xid;
-               KnownAssignedXidsValid[head] = true;
-               TransactionIdAdvance(next_xid);
-               head++;
-       }
-
-       /* Adjust count of number of valid entries */
-       pArray->numKnownAssignedXids += nxids;
-
-       /*
-        * Now update the head pointer.  We use a spinlock to protect this
-        * pointer, not because the update is likely to be non-atomic, but to
-        * ensure that other processors see the above array updates before they
-        * see the head pointer change.
-        *
-        * If we're holding ProcArrayLock exclusively, there's no need to take the
-        * spinlock.
-        */
-       if (exclusive_lock)
-               pArray->headKnownAssignedXids = head;
-       else
-       {
-               SpinLockAcquire(&pArray->known_assigned_xids_lck);
-               pArray->headKnownAssignedXids = head;
-               SpinLockRelease(&pArray->known_assigned_xids_lck);
-       }
-}
-
-/*
- * KnownAssignedXidsSearch
- *
- * Searches KnownAssignedXids for a specific xid and optionally removes it.
- * Returns true if it was found, false if not.
- *
- * Caller must hold ProcArrayLock in shared or exclusive mode.
- * Exclusive lock must be held for remove = true.
- */
-static bool
-KnownAssignedXidsSearch(TransactionId xid, bool remove)
-{
-       /* use volatile pointer to prevent code rearrangement */
-       volatile ProcArrayStruct *pArray = procArray;
-       int                     first,
-                               last;
-       int                     head;
-       int                     tail;
-       int                     result_index = -1;
-
-       if (remove)
-       {
-               /* we hold ProcArrayLock exclusively, so no need for spinlock */
-               tail = pArray->tailKnownAssignedXids;
-               head = pArray->headKnownAssignedXids;
-       }
-       else
-       {
-               /* take spinlock to ensure we see up-to-date array contents */
-               SpinLockAcquire(&pArray->known_assigned_xids_lck);
-               tail = pArray->tailKnownAssignedXids;
-               head = pArray->headKnownAssignedXids;
-               SpinLockRelease(&pArray->known_assigned_xids_lck);
-       }
-
-       /*
-        * Standard binary search.  Note we can ignore the KnownAssignedXidsValid
-        * array here, since even invalid entries will contain sorted XIDs.
-        */
-       first = tail;
-       last = head - 1;
-       while (first <= last)
-       {
-               int                     mid_index;
-               TransactionId mid_xid;
-
-               mid_index = (first + last) / 2;
-               mid_xid = KnownAssignedXids[mid_index];
-
-               if (xid == mid_xid)
-               {
-                       result_index = mid_index;
-                       break;
-               }
-               else if (TransactionIdPrecedes(xid, mid_xid))
-                       last = mid_index - 1;
-               else
-                       first = mid_index + 1;
-       }
-
-       if (result_index < 0)
-               return false;                   /* not in array */
-
-       if (!KnownAssignedXidsValid[result_index])
-               return false;                   /* in array, but invalid */
-
-       if (remove)
-       {
-               KnownAssignedXidsValid[result_index] = false;
-
-               pArray->numKnownAssignedXids--;
-               Assert(pArray->numKnownAssignedXids >= 0);
-
-               /*
-                * If we're removing the tail element then advance tail pointer over
-                * any invalid elements.  This will speed future searches.
-                */
-               if (result_index == tail)
-               {
-                       tail++;
-                       while (tail < head && !KnownAssignedXidsValid[tail])
-                               tail++;
-                       if (tail >= head)
-                       {
-                               /* Array is empty, so we can reset both pointers */
-                               pArray->headKnownAssignedXids = 0;
-                               pArray->tailKnownAssignedXids = 0;
-                       }
-                       else
-                       {
-                               pArray->tailKnownAssignedXids = tail;
-                       }
-               }
-       }
-
-       return true;
-}
-
-/*
- * Is the specified XID present in KnownAssignedXids[]?
- *
- * Caller must hold ProcArrayLock in shared or exclusive mode.
- */
-static bool
-KnownAssignedXidExists(TransactionId xid)
-{
-       Assert(TransactionIdIsValid(xid));
-
-       return KnownAssignedXidsSearch(xid, false);
-}
-
-/*
- * Remove the specified XID from KnownAssignedXids[].
- *
- * Caller must hold ProcArrayLock in exclusive mode.
- */
-static void
-KnownAssignedXidsRemove(TransactionId xid)
-{
-       Assert(TransactionIdIsValid(xid));
-
-       elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid);
-
-       /*
-        * Note: we cannot consider it an error to remove an XID that's not
-        * present.  We intentionally remove subxact IDs while processing
-        * XLOG_XACT_ASSIGNMENT, to avoid array overflow.  Then those XIDs will be
-        * removed again when the top-level xact commits or aborts.
-        *
-        * It might be possible to track such XIDs to distinguish this case from
-        * actual errors, but it would be complicated and probably not worth it.
-        * So, just ignore the search result.
-        */
-       (void) KnownAssignedXidsSearch(xid, true);
-}
-
-/*
- * KnownAssignedXidsRemoveTree
- *             Remove xid (if it's not InvalidTransactionId) and all the subxids.
- *
- * Caller must hold ProcArrayLock in exclusive mode.
- */
-static void
-KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
-                                                       TransactionId *subxids)
-{
-       int                     i;
-
-       if (TransactionIdIsValid(xid))
-               KnownAssignedXidsRemove(xid);
-
-       for (i = 0; i < nsubxids; i++)
-               KnownAssignedXidsRemove(subxids[i]);
-
-       /* Opportunistically compress the array */
-       KnownAssignedXidsCompress(false);
-}
-
-/*
- * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid
- * then clear the whole table.
- *
- * Caller must hold ProcArrayLock in exclusive mode.
- */
-static void
-KnownAssignedXidsRemovePreceding(TransactionId removeXid)
-{
-       /* use volatile pointer to prevent code rearrangement */
-       volatile ProcArrayStruct *pArray = procArray;
-       int                     count = 0;
-       int                     head,
-                               tail,
-                               i;
-
-       if (!TransactionIdIsValid(removeXid))
-       {
-               elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids");
-               pArray->numKnownAssignedXids = 0;
-               pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0;
-               return;
-       }
-
-       elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid);
-
-       /*
-        * Mark entries invalid starting at the tail.  Since array is sorted, we
-        * can stop as soon as we reach an entry >= removeXid.
-        */
-       tail = pArray->tailKnownAssignedXids;
-       head = pArray->headKnownAssignedXids;
-
-       for (i = tail; i < head; i++)
-       {
-               if (KnownAssignedXidsValid[i])
-               {
-                       TransactionId knownXid = KnownAssignedXids[i];
-
-                       if (TransactionIdFollowsOrEquals(knownXid, removeXid))
-                               break;
-
-                       if (!StandbyTransactionIdIsPrepared(knownXid))
-                       {
-                               KnownAssignedXidsValid[i] = false;
-                               count++;
-                       }
-               }
-       }
-
-       pArray->numKnownAssignedXids -= count;
-       Assert(pArray->numKnownAssignedXids >= 0);
-
-       /*
-        * Advance the tail pointer if we've marked the tail item invalid.
-        */
-       for (i = tail; i < head; i++)
-       {
-               if (KnownAssignedXidsValid[i])
-                       break;
-       }
-       if (i >= head)
-       {
-               /* Array is empty, so we can reset both pointers */
-               pArray->headKnownAssignedXids = 0;
-               pArray->tailKnownAssignedXids = 0;
-       }
-       else
-       {
-               pArray->tailKnownAssignedXids = i;
-       }
-
-       /* Opportunistically compress the array */
-       KnownAssignedXidsCompress(false);
-}
-
-/*
- * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
- * We filter out anything >= xmax.
- *
- * Returns the number of XIDs stored into xarray[].  Caller is responsible
- * that array is large enough.
- *
- * Caller must hold ProcArrayLock in (at least) shared mode.
- */
-static int
-KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
-{
-       TransactionId xtmp = InvalidTransactionId;
-
-       return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
-}
-
-/*
- * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus
- * we reduce *xmin to the lowest xid value seen if not already lower.
- *
- * Caller must hold ProcArrayLock in (at least) shared mode.
- */
-static int
-KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
-                                                          TransactionId xmax)
-{
-       int                     count = 0;
-       int                     head,
-                               tail;
-       int                     i;
-
-       /*
-        * Fetch head just once, since it may change while we loop. We can stop
-        * once we reach the initially seen head, since we are certain that an xid
-        * cannot enter and then leave the array while we hold ProcArrayLock.  We
-        * might miss newly-added xids, but they should be >= xmax so irrelevant
-        * anyway.
-        *
-        * Must take spinlock to ensure we see up-to-date array contents.
-        */
-       SpinLockAcquire(&procArray->known_assigned_xids_lck);
-       tail = procArray->tailKnownAssignedXids;
-       head = procArray->headKnownAssignedXids;
-       SpinLockRelease(&procArray->known_assigned_xids_lck);
-
-       for (i = tail; i < head; i++)
-       {
-               /* Skip any gaps in the array */
-               if (KnownAssignedXidsValid[i])
-               {
-                       TransactionId knownXid = KnownAssignedXids[i];
-
-                       /*
-                        * Update xmin if required.  Only the first XID need be checked,
-                        * since the array is sorted.
-                        */
-                       if (count == 0 &&
-                               TransactionIdPrecedes(knownXid, *xmin))
-                               *xmin = knownXid;
-
-                       /*
-                        * Filter out anything >= xmax, again relying on sorted property
-                        * of array.
-                        */
-                       if (TransactionIdIsValid(xmax) &&
-                               TransactionIdFollowsOrEquals(knownXid, xmax))
-                               break;
-
-                       /* Add knownXid into output array */
-                       xarray[count++] = knownXid;
-               }
-       }
-
-       return count;
-}
-
-/*
- * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId
- * if nothing there.
- */
-static TransactionId
-KnownAssignedXidsGetOldestXmin(void)
-{
-       int                     head,
-                               tail;
-       int                     i;
-
-       /*
-        * Fetch head just once, since it may change while we loop.
-        */
-       SpinLockAcquire(&procArray->known_assigned_xids_lck);
-       tail = procArray->tailKnownAssignedXids;
-       head = procArray->headKnownAssignedXids;
-       SpinLockRelease(&procArray->known_assigned_xids_lck);
-
-       for (i = tail; i < head; i++)
-       {
-               /* Skip any gaps in the array */
-               if (KnownAssignedXidsValid[i])
-                       return KnownAssignedXids[i];
-       }
-
-       return InvalidTransactionId;
-}
-
-/*
- * Display KnownAssignedXids to provide debug trail
- *
- * Currently this is only called within startup process, so we need no
- * special locking.
- *
- * Note this is pretty expensive, and much of the expense will be incurred
- * even if the elog message will get discarded.  It's not currently called
- * in any performance-critical places, however, so no need to be tenser.
- */
-static void
-KnownAssignedXidsDisplay(int trace_level)
-{
-       /* use volatile pointer to prevent code rearrangement */
-       volatile ProcArrayStruct *pArray = procArray;
-       StringInfoData buf;
-       int                     head,
-                               tail,
-                               i;
-       int                     nxids = 0;
-
-       tail = pArray->tailKnownAssignedXids;
-       head = pArray->headKnownAssignedXids;
-
-       initStringInfo(&buf);
-
-       for (i = tail; i < head; i++)
-       {
-               if (KnownAssignedXidsValid[i])
-               {
-                       nxids++;
-                       appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]);
-               }
-       }
-
-       elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
-                nxids,
-                pArray->numKnownAssignedXids,
-                pArray->tailKnownAssignedXids,
-                pArray->headKnownAssignedXids,
-                buf.data);
-
-       pfree(buf.data);
-}
-
-/*
- * KnownAssignedXidsReset
- *             Resets KnownAssignedXids to be empty
- */
-static void
-KnownAssignedXidsReset(void)
-{
-       /* use volatile pointer to prevent code rearrangement */
-       volatile ProcArrayStruct *pArray = procArray;
-
-       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-       pArray->numKnownAssignedXids = 0;
-       pArray->tailKnownAssignedXids = 0;
-       pArray->headKnownAssignedXids = 0;
-
-       LWLockRelease(ProcArrayLock);
-}
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c

index 1efe0201a795a315e158d47bf41786d9d17c85ad..78b94b0f30416b0bfbe6200bd17c19fbf0a9f4af 100644 (file)
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -65,7 +65,7 @@
  
  #include "postgres.h"
  
-#include "access/transam.h"
+#include "access/mvccvars.h"
  #include "miscadmin.h"
  #include "storage/lwlock.h"
  #include "storage/pg_shmem.h"
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c

index 547f1a88fe26e0ba780b69d8c03f221f562d2a45..53aa39b2b343fa04f0c8506505cf15849dad1b81 100644 (file)
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -100,9 +100,6 @@ InitRecoveryTransactionEnvironment(void)
  void
  ShutdownRecoveryTransactionEnvironment(void)
  {
-       /* Mark all tracked in-progress transactions as finished. */
-       ExpireAllKnownAssignedTransactionIds();
-
         /* Release all locks the tracked transactions were holding */
         StandbyReleaseAllLocks();
  
@@ -306,7 +303,7 @@ ResolveRecoveryConflictWithTablespace(Oid tsid)
          *
          * We don't wait for commit because drop tablespace is non-transactional.
          */
-       temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
+       temp_file_users = GetConflictingVirtualXIDs(InvalidCommitSeqNo,
                                                                                                 InvalidOid);
         ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
                                                                            PROCSIG_RECOVERY_CONFLICT_TABLESPACE);
@@ -607,8 +604,7 @@ StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
  
         /* Already processed? */
         if (!TransactionIdIsValid(xid) ||
-               TransactionIdDidCommit(xid) ||
-               TransactionIdDidAbort(xid))
+               TransactionIdGetStatus(xid) != XID_INPROGRESS)
                 return;
  
         elog(trace_recovery(DEBUG4),
@@ -723,7 +719,7 @@ StandbyReleaseAllLocks(void)
   *             as long as they're not prepared transactions.
   */
  void
-StandbyReleaseOldLocks(int nxids, TransactionId *xids)
+StandbyReleaseOldLocks(TransactionId oldestRunningXid)
  {
         ListCell   *cell,
                            *prev,
@@ -742,26 +738,8 @@ StandbyReleaseOldLocks(int nxids, TransactionId *xids)
  
                 if (StandbyTransactionIdIsPrepared(lock->xid))
                         remove = false;
-               else
-               {
-                       int                     i;
-                       bool            found = false;
-
-                       for (i = 0; i < nxids; i++)
-                       {
-                               if (lock->xid == xids[i])
-                               {
-                                       found = true;
-                                       break;
-                               }
-                       }
-
-                       /*
-                        * If its not a running transaction, remove it.
-                        */
-                       if (!found)
-                               remove = true;
-               }
+               else if (TransactionIdPrecedes(lock->xid, oldestRunningXid))
+                       remove = true;
  
                 if (remove)
                 {
@@ -816,13 +794,8 @@ standby_redo(XLogReaderState *record)
                 xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
                 RunningTransactionsData running;
  
-               running.xcnt = xlrec->xcnt;
-               running.subxcnt = xlrec->subxcnt;
-               running.subxid_overflow = xlrec->subxid_overflow;
                 running.nextXid = xlrec->nextXid;
-               running.latestCompletedXid = xlrec->latestCompletedXid;
                 running.oldestRunningXid = xlrec->oldestRunningXid;
-               running.xids = xlrec->xids;
  
                 ProcArrayApplyRecoveryInfo(&running);
         }
@@ -930,27 +903,8 @@ LogStandbySnapshot(void)
          */
         running = GetRunningTransactionData();
  
-       /*
-        * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
-        * For Hot Standby this can be done before inserting the WAL record
-        * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
-        * the clog. For logical decoding, though, the lock can't be released
-        * early because the clog might be "in the future" from the POV of the
-        * historic snapshot. This would allow for situations where we're waiting
-        * for the end of a transaction listed in the xl_running_xacts record
-        * which, according to the WAL, has committed before the xl_running_xacts
-        * record. Fortunately this routine isn't executed frequently, and it's
-        * only a shared lock.
-        */
-       if (wal_level < WAL_LEVEL_LOGICAL)
-               LWLockRelease(ProcArrayLock);
-
         recptr = LogCurrentRunningXacts(running);
  
-       /* Release lock if we kept it longer ... */
-       if (wal_level >= WAL_LEVEL_LOGICAL)
-               LWLockRelease(ProcArrayLock);
-
         /* GetRunningTransactionData() acquired XidGenLock, we must release it */
         LWLockRelease(XidGenLock);
  
@@ -971,40 +925,20 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
         xl_running_xacts xlrec;
         XLogRecPtr      recptr;
  
-       xlrec.xcnt = CurrRunningXacts->xcnt;
-       xlrec.subxcnt = CurrRunningXacts->subxcnt;
-       xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
         xlrec.nextXid = CurrRunningXacts->nextXid;
         xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
-       xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
  
         /* Header */
         XLogBeginInsert();
-       XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);
-
-       /* array of TransactionIds */
-       if (xlrec.xcnt > 0)
-               XLogRegisterData((char *) CurrRunningXacts->xids,
-                                          (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
+       XLogRegisterData((char *) (&xlrec), SizeOfXactRunningXacts);
  
         recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
  
-       if (CurrRunningXacts->subxid_overflow)
-               elog(trace_recovery(DEBUG2),
-                        "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
-                        CurrRunningXacts->xcnt,
-                        (uint32) (recptr >> 32), (uint32) recptr,
-                        CurrRunningXacts->oldestRunningXid,
-                        CurrRunningXacts->latestCompletedXid,
-                        CurrRunningXacts->nextXid);
-       else
-               elog(trace_recovery(DEBUG2),
-                        "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
-                        CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
-                        (uint32) (recptr >> 32), (uint32) recptr,
-                        CurrRunningXacts->oldestRunningXid,
-                        CurrRunningXacts->latestCompletedXid,
-                        CurrRunningXacts->nextXid);
+       elog(trace_recovery(DEBUG2),
+                "snapshot of running transaction ids (lsn %X/%X oldest xid %u next xid %u)",
+                (uint32) (recptr >> 32), (uint32) recptr,
+                CurrRunningXacts->oldestRunningXid,
+                CurrRunningXacts->nextXid);
  
         /*
          * Ensure running_xacts information is synced to disk not too far in the
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c

index eeedc38251ac24f3243d82d8258764678d4d0c90..6cbd6e301269c35a942ca2356b4b04eb4696686c 100644 (file)
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -588,8 +588,13 @@ XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid,
  
                 LockRelease(&tag, ShareLock, false);
  
-               if (!TransactionIdIsInProgress(xid))
+               /*
+                * Ok, this xid is not running anymore. But it might be a
+                * subtransaction whose parent is still running.
+                */
+               if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
                         break;
+
                 xid = SubTransGetParent(xid);
         }
  
@@ -620,8 +625,9 @@ ConditionalXactLockTableWait(TransactionId xid)
  
                 LockRelease(&tag, ShareLock, false);
  
-               if (!TransactionIdIsInProgress(xid))
+               if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
                         break;
+
                 xid = SubTransGetParent(xid);
         }
  
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt

index f8996cd21a552089d08978936cd7418f9232aee9..926e52888d15bd2c832a823a705887ffc0defe58 100644 (file)
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -16,7 +16,7 @@ WALWriteLock                                          8
  ControlFileLock                                                9
  CheckpointLock                                         10
  CLogControlLock                                                11
-SubtransControlLock                                    12
+# 12 is available; was formerly SubtransControlLock
  MultiXactGenLock                                       13
  MultiXactOffsetControlLock                     14
  MultiXactMemberControlLock                     15
@@ -47,3 +47,5 @@ CommitTsLock                                          39
  ReplicationOriginLock                          40
  MultiXactTruncationLock                                41
  OldSnapshotTimeMapLock                         42
+CSNLogControlLock                                      43
+CommitSeqNoLock                                                44
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c

index 7cdb35541bf5e620335470c6dd873086b76035e6..78232a5a77b355d76bbf50bd4659e9b444c3a79e 100644 (file)
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -183,7 +183,9 @@
  
  #include "postgres.h"
  
+#include "access/clog.h"
  #include "access/htup_details.h"
+#include "access/mvccvars.h"
  #include "access/slru.h"
  #include "access/subtrans.h"
  #include "access/transam.h"
@@ -3830,7 +3832,7 @@ static bool
  XidIsConcurrent(TransactionId xid)
  {
         Snapshot        snap;
-       uint32          i;
+       XLogRecPtr      csn;
  
         Assert(TransactionIdIsValid(xid));
         Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
@@ -3843,11 +3845,11 @@ XidIsConcurrent(TransactionId xid)
         if (TransactionIdFollowsOrEquals(xid, snap->xmax))
                 return true;
  
-       for (i = 0; i < snap->xcnt; i++)
-       {
-               if (xid == snap->xip[i])
-                       return true;
-       }
+       csn = TransactionIdGetCommitSeqNo(xid);
+       if (COMMITSEQNO_IS_INPROGRESS(csn))
+               return true;
+       if (COMMITSEQNO_IS_COMMITTED(csn))
+               return csn > snap->snapshotcsn;
  
         return false;
  }
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c

index 9a758bd91600b0839afadf3e3907e06a0902f8d5..9c999d5f1726c1f793285c3bfe2327ed48e70965 100644 (file)
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -366,7 +366,7 @@ InitProcess(void)
         MyProc->fpVXIDLock = false;
         MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
         MyPgXact->xid = InvalidTransactionId;
-       MyPgXact->xmin = InvalidTransactionId;
+       MyPgXact->snapshotcsn = InvalidCommitSeqNo;
         MyProc->pid = MyProcPid;
         /* backendId, databaseId and roleId will be filled in later */
         MyProc->backendId = InvalidBackendId;
@@ -540,7 +540,7 @@ InitAuxiliaryProcess(void)
         MyProc->fpVXIDLock = false;
         MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
         MyPgXact->xid = InvalidTransactionId;
-       MyPgXact->xmin = InvalidTransactionId;
+       MyPgXact->snapshotcsn = InvalidCommitSeqNo;
         MyProc->backendId = InvalidBackendId;
         MyProc->databaseId = InvalidOid;
         MyProc->roleId = InvalidOid;
@@ -770,7 +770,7 @@ static void
  RemoveProcFromArray(int code, Datum arg)
  {
         Assert(MyProc != NULL);
-       ProcArrayRemove(MyProc, InvalidTransactionId);
+       ProcArrayRemove(MyProc);
  }
  
  /*
diff --git a/src/backend/utils/adt/txid.c b/src/backend/utils/adt/txid.c

index c2069a9923b9df22b361dfbd50c1ea980e0fe458..ef4ae4592e13c9c1c9f80310dac99eae5b2b7341 100644 (file)
--- a/src/backend/utils/adt/txid.c
+++ b/src/backend/utils/adt/txid.c
@@ -51,6 +51,8 @@ typedef uint64 txid;
  
  /*
   * Snapshot containing 8byte txids.
+ *
+ * FIXME: this could be a fixed-length datatype now.
   */
  typedef struct
  {
@@ -61,17 +63,16 @@ typedef struct
          */
         int32           __varsz;
  
-       uint32          nxip;                   /* number of txids in xip array */
-       txid            xmin;
         txid            xmax;
-       /* in-progress txids, xmin <= xip[i] < xmax: */
-       txid            xip[FLEXIBLE_ARRAY_MEMBER];
+       /*
+        * FIXME: this changes the on-disk format if someone created a column
+        * of type txid_snapshot. Dump+reload won't work either.
+        */
+       CommitSeqNo     snapshotcsn;
  } TxidSnapshot;
  
-#define TXID_SNAPSHOT_SIZE(nxip) \
-       (offsetof(TxidSnapshot, xip) + sizeof(txid) * (nxip))
-#define TXID_SNAPSHOT_MAX_NXIP \
-       ((MaxAllocSize - offsetof(TxidSnapshot, xip)) / sizeof(txid))
+#define TXID_SNAPSHOT_SIZE \
+       (offsetof(TxidSnapshot, snapshotcsn) + sizeof(CommitSeqNo))
  
  /*
   * Epoch values from xact.c
@@ -116,61 +117,13 @@ convert_xid(TransactionId xid, const TxidEpoch *state)
         return (epoch << 32) | xid;
  }
  
-/*
- * txid comparator for qsort/bsearch
- */
-static int
-cmp_txid(const void *aa, const void *bb)
-{
-       txid            a = *(const txid *) aa;
-       txid            b = *(const txid *) bb;
-
-       if (a < b)
-               return -1;
-       if (a > b)
-               return 1;
-       return 0;
-}
-
-/*
- * Sort a snapshot's txids, so we can use bsearch() later.  Also remove
- * any duplicates.
- *
- * For consistency of on-disk representation, we always sort even if bsearch
- * will not be used.
- */
-static void
-sort_snapshot(TxidSnapshot *snap)
-{
-       txid            last = 0;
-       int                     nxip,
-                               idx1,
-                               idx2;
-
-       if (snap->nxip > 1)
-       {
-               qsort(snap->xip, snap->nxip, sizeof(txid), cmp_txid);
-
-               /* remove duplicates */
-               nxip = snap->nxip;
-               idx1 = idx2 = 0;
-               while (idx1 < nxip)
-               {
-                       if (snap->xip[idx1] != last)
-                               last = snap->xip[idx2++] = snap->xip[idx1];
-                       else
-                               snap->nxip--;
-                       idx1++;
-               }
-       }
-}
-
  /*
   * check txid visibility.
   */
  static bool
  is_visible_txid(txid value, const TxidSnapshot *snap)
  {
+#ifdef BROKEN
         if (value < snap->xmin)
                 return true;
         else if (value >= snap->xmax)
@@ -196,50 +149,8 @@ is_visible_txid(txid value, const TxidSnapshot *snap)
                 }
                 return true;
         }
-}
-
-/*
- * helper functions to use StringInfo for TxidSnapshot creation.
- */
-
-static StringInfo
-buf_init(txid xmin, txid xmax)
-{
-       TxidSnapshot snap;
-       StringInfo      buf;
-
-       snap.xmin = xmin;
-       snap.xmax = xmax;
-       snap.nxip = 0;
-
-       buf = makeStringInfo();
-       appendBinaryStringInfo(buf, (char *) &snap, TXID_SNAPSHOT_SIZE(0));
-       return buf;
-}
-
-static void
-buf_add_txid(StringInfo buf, txid xid)
-{
-       TxidSnapshot *snap = (TxidSnapshot *) buf->data;
-
-       /* do this before possible realloc */
-       snap->nxip++;
-
-       appendBinaryStringInfo(buf, (char *) &xid, sizeof(xid));
-}
-
-static TxidSnapshot *
-buf_finalize(StringInfo buf)
-{
-       TxidSnapshot *snap = (TxidSnapshot *) buf->data;
-
-       SET_VARSIZE(snap, buf->len);
-
-       /* buf is not needed anymore */
-       buf->data = NULL;
-       pfree(buf);
-
-       return snap;
+#endif
+       return false;
  }
  
  /*
@@ -284,54 +195,29 @@ str2txid(const char *s, const char **endp)
  static TxidSnapshot *
  parse_snapshot(const char *str)
  {
-       txid            xmin;
-       txid            xmax;
-       txid            last_val = 0,
-                               val;
         const char *str_start = str;
         const char *endp;
-       StringInfo      buf;
+       TxidSnapshot *snap;
+       uint32          csn_hi,
+                               csn_lo;
  
-       xmin = str2txid(str, &endp);
-       if (*endp != ':')
-               goto bad_format;
-       str = endp + 1;
+       snap = palloc0(TXID_SNAPSHOT_SIZE);
+       SET_VARSIZE(snap, TXID_SNAPSHOT_SIZE);
  
-       xmax = str2txid(str, &endp);
+       snap->xmax = str2txid(str, &endp);
         if (*endp != ':')
                 goto bad_format;
         str = endp + 1;
  
         /* it should look sane */
-       if (xmin == 0 || xmax == 0 || xmin > xmax)
+       if (snap->xmax == 0)
                 goto bad_format;
  
-       /* allocate buffer */
-       buf = buf_init(xmin, xmax);
-
-       /* loop over values */
-       while (*str != '\0')
-       {
-               /* read next value */
-               val = str2txid(str, &endp);
-               str = endp;
-
-               /* require the input to be in order */
-               if (val < xmin || val >= xmax || val < last_val)
-                       goto bad_format;
-
-               /* skip duplicates */
-               if (val != last_val)
-                       buf_add_txid(buf, val);
-               last_val = val;
-
-               if (*str == ',')
-                       str++;
-               else if (*str != '\0')
-                       goto bad_format;
-       }
+       if (sscanf(str, "%X/%X", &csn_hi, &csn_lo) != 2)
+               goto bad_format;
+       snap->snapshotcsn = ((uint64) csn_hi) << 32 | csn_lo;
  
-       return buf_finalize(buf);
+       return snap;
  
  bad_format:
         ereport(ERROR,
@@ -387,8 +273,6 @@ Datum
  txid_current_snapshot(PG_FUNCTION_ARGS)
  {
         TxidSnapshot *snap;
-       uint32          nxip,
-                               i;
         TxidEpoch       state;
         Snapshot        cur;
  
@@ -398,35 +282,13 @@ txid_current_snapshot(PG_FUNCTION_ARGS)
  
         load_xid_epoch(&state);
  
-       /*
-        * Compile-time limits on the procarray (MAX_BACKENDS processes plus
-        * MAX_BACKENDS prepared transactions) guarantee nxip won't be too large.
-        */
-       StaticAssertStmt(MAX_BACKENDS * 2 <= TXID_SNAPSHOT_MAX_NXIP,
-                                        "possible overflow in txid_current_snapshot()");
-
         /* allocate */
-       nxip = cur->xcnt;
-       snap = palloc(TXID_SNAPSHOT_SIZE(nxip));
+       snap = palloc(TXID_SNAPSHOT_SIZE);
+       SET_VARSIZE(snap, TXID_SNAPSHOT_SIZE);
  
         /* fill */
-       snap->xmin = convert_xid(cur->xmin, &state);
         snap->xmax = convert_xid(cur->xmax, &state);
-       snap->nxip = nxip;
-       for (i = 0; i < nxip; i++)
-               snap->xip[i] = convert_xid(cur->xip[i], &state);
-
-       /*
-        * We want them guaranteed to be in ascending order.  This also removes
-        * any duplicate xids.  Normally, an XID can only be assigned to one
-        * backend, but when preparing a transaction for two-phase commit, there
-        * is a transient state when both the original backend and the dummy
-        * PGPROC entry reserved for the prepared transaction hold the same XID.
-        */
-       sort_snapshot(snap);
-
-       /* set size after sorting, because it may have removed duplicate xips */
-       SET_VARSIZE(snap, TXID_SNAPSHOT_SIZE(snap->nxip));
+       snap->snapshotcsn = cur->snapshotcsn;
  
         PG_RETURN_POINTER(snap);
  }
@@ -457,19 +319,12 @@ txid_snapshot_out(PG_FUNCTION_ARGS)
  {
         TxidSnapshot *snap = (TxidSnapshot *) PG_GETARG_VARLENA_P(0);
         StringInfoData str;
-       uint32          i;
  
         initStringInfo(&str);
  
-       appendStringInfo(&str, TXID_FMT ":", snap->xmin);
         appendStringInfo(&str, TXID_FMT ":", snap->xmax);
-
-       for (i = 0; i < snap->nxip; i++)
-       {
-               if (i > 0)
-                       appendStringInfoChar(&str, ',');
-               appendStringInfo(&str, TXID_FMT, snap->xip[i]);
-       }
+       appendStringInfo(&str, "%X/%X", (uint32) (snap->snapshotcsn >> 32),
+                                        (uint32) snap->snapshotcsn);
  
         PG_RETURN_CSTRING(str.data);
  }
@@ -484,6 +339,7 @@ txid_snapshot_out(PG_FUNCTION_ARGS)
  Datum
  txid_snapshot_recv(PG_FUNCTION_ARGS)
  {
+#ifdef BROKEN
         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
         TxidSnapshot *snap;
         txid            last = 0;
@@ -492,11 +348,6 @@ txid_snapshot_recv(PG_FUNCTION_ARGS)
         txid            xmin,
                                 xmax;
  
-       /* load and validate nxip */
-       nxip = pq_getmsgint(buf, 4);
-       if (nxip < 0 || nxip > TXID_SNAPSHOT_MAX_NXIP)
-               goto bad_format;
-
         xmin = pq_getmsgint64(buf);
         xmax = pq_getmsgint64(buf);
         if (xmin == 0 || xmax == 0 || xmin > xmax || xmax > MAX_TXID)
@@ -529,6 +380,7 @@ txid_snapshot_recv(PG_FUNCTION_ARGS)
         PG_RETURN_POINTER(snap);
  
  bad_format:
+#endif
         ereport(ERROR,
                         (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
                          errmsg("invalid external txid_snapshot data")));
@@ -547,14 +399,13 @@ txid_snapshot_send(PG_FUNCTION_ARGS)
  {
         TxidSnapshot *snap = (TxidSnapshot *) PG_GETARG_VARLENA_P(0);
         StringInfoData buf;
-       uint32          i;
  
         pq_begintypsend(&buf);
-       pq_sendint(&buf, snap->nxip, 4);
+#ifdef BROKEN
         pq_sendint64(&buf, snap->xmin);
         pq_sendint64(&buf, snap->xmax);
-       for (i = 0; i < snap->nxip; i++)
-               pq_sendint64(&buf, snap->xip[i]);
+#endif
+       pq_sendint64(&buf, snap->snapshotcsn);
         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
  }
  
@@ -575,14 +426,18 @@ txid_visible_in_snapshot(PG_FUNCTION_ARGS)
  /*
   * txid_snapshot_xmin(txid_snapshot) returns int8
   *
- *             return snapshot's xmin
+ *             return snapshot's xmin
   */
  Datum
  txid_snapshot_xmin(PG_FUNCTION_ARGS)
  {
+       /* FIXME: we don't store xmin in the TxidSnapshot anymore. Maybe we still should? */
+#ifdef BROKEN
         TxidSnapshot *snap = (TxidSnapshot *) PG_GETARG_VARLENA_P(0);
  
         PG_RETURN_INT64(snap->xmin);
+#endif
+       PG_RETURN_INT64(0);
  }
  
  /*
@@ -597,43 +452,3 @@ txid_snapshot_xmax(PG_FUNCTION_ARGS)
  
         PG_RETURN_INT64(snap->xmax);
  }
-
-/*
- * txid_snapshot_xip(txid_snapshot) returns setof int8
- *
- *             return in-progress TXIDs in snapshot.
- */
-Datum
-txid_snapshot_xip(PG_FUNCTION_ARGS)
-{
-       FuncCallContext *fctx;
-       TxidSnapshot *snap;
-       txid            value;
-
-       /* on first call initialize snap_state and get copy of snapshot */
-       if (SRF_IS_FIRSTCALL())
-       {
-               TxidSnapshot *arg = (TxidSnapshot *) PG_GETARG_VARLENA_P(0);
-
-               fctx = SRF_FIRSTCALL_INIT();
-
-               /* make a copy of user snapshot */
-               snap = MemoryContextAlloc(fctx->multi_call_memory_ctx, VARSIZE(arg));
-               memcpy(snap, arg, VARSIZE(arg));
-
-               fctx->user_fctx = snap;
-       }
-
-       /* return values one-by-one */
-       fctx = SRF_PERCALL_SETUP();
-       snap = fctx->user_fctx;
-       if (fctx->call_cntr < snap->nxip)
-       {
-               value = snap->xip[fctx->call_cntr];
-               SRF_RETURN_NEXT(fctx, Int64GetDatum(value));
-       }
-       else
-       {
-               SRF_RETURN_DONE(fctx);
-       }
-}
diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d

index 976774e795e6883440710612f14fe0d8bb2f7ed0..97ea20fdcc2ff5841f87ef3aa5c6918402baf0f9 100644 (file)
--- a/src/backend/utils/probes.d
+++ b/src/backend/utils/probes.d
@@ -75,6 +75,8 @@ provider postgresql {
         probe checkpoint__done(int, int, int, int, int);
         probe clog__checkpoint__start(bool);
         probe clog__checkpoint__done(bool);
+       probe csnlog__checkpoint__start(bool);
+       probe csnlog__checkpoint__done(bool);
         probe subtrans__checkpoint__start(bool);
         probe subtrans__checkpoint__done(bool);
         probe multixact__checkpoint__start(bool);
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c

index 1ec9f70f0eeff3dfe47b4dea17faa32d271f5472..efe9a6b2e1fab37c2816a4737ca1243b344be503 100644 (file)
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -153,9 +153,9 @@ static Snapshot HistoricSnapshot = NULL;
  static bool CatalogSnapshotStale = true;
  
  /*
- * These are updated by GetSnapshotData.  We initialize them this way
- * for the convenience of TransactionIdIsInProgress: even in bootstrap
- * mode, we don't want it to say that BootstrapTransactionId is in progress.
+ * These are updated by GetSnapshotData.  We initialize them this way
+ * because, even in bootstrap mode, we don't want BootstrapTransactionId
+ * to be treated as in progress.
   *
   * RecentGlobalXmin and RecentGlobalDataXmin are initialized to
   * InvalidTransactionId, to ensure that no one tries to use a stale
@@ -163,7 +163,6 @@ static bool CatalogSnapshotStale = true;
   * before using it.
   */
  TransactionId TransactionXmin = FirstNormalTransactionId;
-TransactionId RecentXmin = FirstNormalTransactionId;
  TransactionId RecentGlobalXmin = InvalidTransactionId;
  TransactionId RecentGlobalDataXmin = InvalidTransactionId;
  
@@ -236,9 +235,7 @@ typedef struct SerializedSnapshotData
  {
         TransactionId xmin;
         TransactionId xmax;
-       uint32          xcnt;
-       int32           subxcnt;
-       bool            suboverflowed;
+       CommitSeqNo snapshotcsn;
         bool            takenDuringRecovery;
         CommandId       curcid;
         int64           whenTaken;
@@ -534,7 +531,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid,
          * Even though we are not going to use the snapshot it computes, we must
          * call GetSnapshotData, for two reasons: (1) to be sure that
          * CurrentSnapshotData's XID arrays have been allocated, and (2) to update
-        * RecentXmin and RecentGlobalXmin.  (We could alternatively include those
+        * RecentGlobalXmin.  (We could alternatively include those
          * two variables in exported snapshot files, but it seems better to have
          * snapshot importers compute reasonably up-to-date values for them.)
          */
@@ -543,17 +540,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid,
         /*
          * Now copy appropriate fields from the source snapshot.
          */
-       CurrentSnapshot->xmin = sourcesnap->xmin;
         CurrentSnapshot->xmax = sourcesnap->xmax;
-       CurrentSnapshot->xcnt = sourcesnap->xcnt;
-       Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount());
-       memcpy(CurrentSnapshot->xip, sourcesnap->xip,
-                  sourcesnap->xcnt * sizeof(TransactionId));
-       CurrentSnapshot->subxcnt = sourcesnap->subxcnt;
-       Assert(sourcesnap->subxcnt <= GetMaxSnapshotSubxidCount());
-       memcpy(CurrentSnapshot->subxip, sourcesnap->subxip,
-                  sourcesnap->subxcnt * sizeof(TransactionId));
-       CurrentSnapshot->suboverflowed = sourcesnap->suboverflowed;
         CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery;
         /* NB: curcid should NOT be copied, it's a local matter */
  
@@ -614,50 +601,17 @@ static Snapshot
  CopySnapshot(Snapshot snapshot)
  {
         Snapshot        newsnap;
-       Size            subxipoff;
-       Size            size;
  
         Assert(snapshot != InvalidSnapshot);
  
         /* We allocate any XID arrays needed in the same palloc block. */
-       size = subxipoff = sizeof(SnapshotData) +
-               snapshot->xcnt * sizeof(TransactionId);
-       if (snapshot->subxcnt > 0)
-               size += snapshot->subxcnt * sizeof(TransactionId);
-
-       newsnap = (Snapshot) MemoryContextAlloc(TopTransactionContext, size);
+       newsnap = (Snapshot) MemoryContextAlloc(TopTransactionContext, sizeof(SnapshotData));
         memcpy(newsnap, snapshot, sizeof(SnapshotData));
  
         newsnap->regd_count = 0;
         newsnap->active_count = 0;
         newsnap->copied = true;
  
-       /* setup XID array */
-       if (snapshot->xcnt > 0)
-       {
-               newsnap->xip = (TransactionId *) (newsnap + 1);
-               memcpy(newsnap->xip, snapshot->xip,
-                          snapshot->xcnt * sizeof(TransactionId));
-       }
-       else
-               newsnap->xip = NULL;
-
-       /*
-        * Setup subXID array. Don't bother to copy it if it had overflowed,
-        * though, because it's not used anywhere in that case. Except if it's a
-        * snapshot taken during recovery; all the top-level XIDs are in subxip as
-        * well in that case, so we mustn't lose them.
-        */
-       if (snapshot->subxcnt > 0 &&
-               (!snapshot->suboverflowed || snapshot->takenDuringRecovery))
-       {
-               newsnap->subxip = (TransactionId *) ((char *) newsnap + subxipoff);
-               memcpy(newsnap->subxip, snapshot->subxip,
-                          snapshot->subxcnt * sizeof(TransactionId));
-       }
-       else
-               newsnap->subxip = NULL;
-
         return newsnap;
  }
  
@@ -1098,12 +1052,8 @@ char *
  ExportSnapshot(Snapshot snapshot)
  {
         TransactionId topXid;
-       TransactionId *children;
-       int                     nchildren;
-       int                     addTopXid;
         StringInfoData buf;
         FILE       *f;
-       int                     i;
         MemoryContext oldcxt;
         char            path[MAXPGPATH];
         char            pathtmp[MAXPGPATH];
@@ -1137,13 +1087,6 @@ ExportSnapshot(Snapshot snapshot)
                                 (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
                                  errmsg("cannot export a snapshot from a subtransaction")));
  
-       /*
-        * We do however allow previous committed subtransactions to exist.
-        * Importers of the snapshot must see them as still running, so get their
-        * XIDs to add them to the snapshot.
-        */
-       nchildren = xactGetCommittedChildren(&children);
-
         /*
          * Copy the snapshot into TopTransactionContext, add it to the
          * exportedSnapshots list, and mark it pseudo-registered.  We do this to
@@ -1174,41 +1117,10 @@ ExportSnapshot(Snapshot snapshot)
         appendStringInfo(&buf, "xmin:%u\n", snapshot->xmin);
         appendStringInfo(&buf, "xmax:%u\n", snapshot->xmax);
  
-       /*
-        * We must include our own top transaction ID in the top-xid data, since
-        * by definition we will still be running when the importing transaction
-        * adopts the snapshot, but GetSnapshotData never includes our own XID in
-        * the snapshot.  (There must, therefore, be enough room to add it.)
-        *
-        * However, it could be that our topXid is after the xmax, in which case
-        * we shouldn't include it because xip[] members are expected to be before
-        * xmax.  (We need not make the same check for subxip[] members, see
-        * snapshot.h.)
-        */
-       addTopXid = TransactionIdPrecedes(topXid, snapshot->xmax) ? 1 : 0;
-       appendStringInfo(&buf, "xcnt:%d\n", snapshot->xcnt + addTopXid);
-       for (i = 0; i < snapshot->xcnt; i++)
-               appendStringInfo(&buf, "xip:%u\n", snapshot->xip[i]);
-       if (addTopXid)
-               appendStringInfo(&buf, "xip:%u\n", topXid);
-
-       /*
-        * Similarly, we add our subcommitted child XIDs to the subxid data. Here,
-        * we have to cope with possible overflow.
-        */
-       if (snapshot->suboverflowed ||
-               snapshot->subxcnt + nchildren > GetMaxSnapshotSubxidCount())
-               appendStringInfoString(&buf, "sof:1\n");
-       else
-       {
-               appendStringInfoString(&buf, "sof:0\n");
-               appendStringInfo(&buf, "sxcnt:%d\n", snapshot->subxcnt + nchildren);
-               for (i = 0; i < snapshot->subxcnt; i++)
-                       appendStringInfo(&buf, "sxp:%u\n", snapshot->subxip[i]);
-               for (i = 0; i < nchildren; i++)
-                       appendStringInfo(&buf, "sxp:%u\n", children[i]);
-       }
         appendStringInfo(&buf, "rec:%u\n", snapshot->takenDuringRecovery);
+       appendStringInfo(&buf, "snapshotcsn:%X/%X\n",
+                                        (uint32) (snapshot->snapshotcsn >> 32),
+                                        (uint32) snapshot->snapshotcsn);
  
         /*
          * Now write the text representation into a file.  We first write to a
@@ -1324,6 +1236,33 @@ parseXidFromText(const char *prefix, char **s, const char *filename)
         return val;
  }
  
+static CommitSeqNo
+parseCSNFromText(const char *prefix, char **s, const char *filename)
+{
+       char       *ptr = *s;           /* read cursor; *s is advanced on success */
+       int                     prefixlen = strlen(prefix);
+       uint32          hi,
+                               lo;
+
+       if (strncmp(ptr, prefix, prefixlen) != 0)       /* must start with prefix */
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                                errmsg("invalid snapshot data in file \"%s\"", filename)));
+       ptr += prefixlen;                       /* step over the prefix */
+       if (sscanf(ptr, "%X/%X", &hi, &lo) != 2)        /* two hex words: hi/lo */
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                                errmsg("invalid snapshot data in file \"%s\"", filename)));
+       ptr = strchr(ptr, '\n');        /* a terminating newline is required */
+       if (!ptr)
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                                errmsg("invalid snapshot data in file \"%s\"", filename)));
+       *s = ptr + 1;                           /* caller resumes just past the newline */
+
+       return (CommitSeqNo) (((uint64) hi) << 32 | (uint64) lo);       /* hi = upper 32 bits */
+}
+
  /*
   * ImportSnapshot
   *             Import a previously exported snapshot.  The argument should be a
@@ -1337,8 +1276,6 @@ ImportSnapshot(const char *idstr)
         FILE       *f;
         struct stat stat_buf;
         char       *filebuf;
-       int                     xcnt;
-       int                     i;
         TransactionId src_xid;
         Oid                     src_dbid;
         int                     src_isolevel;
@@ -1409,44 +1346,9 @@ ImportSnapshot(const char *idstr)
         src_isolevel = parseIntFromText("iso:", &filebuf, path);
         src_readonly = parseIntFromText("ro:", &filebuf, path);
  
-       snapshot.xmin = parseXidFromText("xmin:", &filebuf, path);
         snapshot.xmax = parseXidFromText("xmax:", &filebuf, path);
-
-       snapshot.xcnt = xcnt = parseIntFromText("xcnt:", &filebuf, path);
-
-       /* sanity-check the xid count before palloc */
-       if (xcnt < 0 || xcnt > GetMaxSnapshotXidCount())
-               ereport(ERROR,
-                               (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
-                                errmsg("invalid snapshot data in file \"%s\"", path)));
-
-       snapshot.xip = (TransactionId *) palloc(xcnt * sizeof(TransactionId));
-       for (i = 0; i < xcnt; i++)
-               snapshot.xip[i] = parseXidFromText("xip:", &filebuf, path);
-
-       snapshot.suboverflowed = parseIntFromText("sof:", &filebuf, path);
-
-       if (!snapshot.suboverflowed)
-       {
-               snapshot.subxcnt = xcnt = parseIntFromText("sxcnt:", &filebuf, path);
-
-               /* sanity-check the xid count before palloc */
-               if (xcnt < 0 || xcnt > GetMaxSnapshotSubxidCount())
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
-                                        errmsg("invalid snapshot data in file \"%s\"", path)));
-
-               snapshot.subxip = (TransactionId *) palloc(xcnt * sizeof(TransactionId));
-               for (i = 0; i < xcnt; i++)
-                       snapshot.subxip[i] = parseXidFromText("sxp:", &filebuf, path);
-       }
-       else
-       {
-               snapshot.subxcnt = 0;
-               snapshot.subxip = NULL;
-       }
-
         snapshot.takenDuringRecovery = parseIntFromText("rec:", &filebuf, path);
+       snapshot.snapshotcsn = parseCSNFromText("snapshotcsn:", &filebuf, path);
  
         /*
          * Do some additional sanity checking, just to protect ourselves.  We
@@ -1455,7 +1357,6 @@ ImportSnapshot(const char *idstr)
          */
         if (!TransactionIdIsNormal(src_xid) ||
                 !OidIsValid(src_dbid) ||
-               !TransactionIdIsNormal(snapshot.xmin) ||
                 !TransactionIdIsNormal(snapshot.xmax))
                 ereport(ERROR,
                                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
@@ -1481,10 +1382,10 @@ ImportSnapshot(const char *idstr)
  
         /*
          * We cannot import a snapshot that was taken in a different database,
-        * because vacuum calculates OldestXmin on a per-database basis; so the
-        * source transaction's xmin doesn't protect us from data loss.  This
+        * because vacuum calculates OldestSnapshot on a per-database basis; so the
+        * source transaction's snapshot doesn't protect us from data loss.  This
          * restriction could be removed if the source transaction were to mark its
-        * xmin as being globally applicable.  But that would require some
+        * snapshot as being globally applicable.  But that would require some
          * additional syntax, since that has to be known when the snapshot is
          * initially taken.  (See pgsql-hackers discussion of 2011-10-21.)
          */
@@ -1730,7 +1631,6 @@ TransactionIdLimitedForOldSnapshots(TransactionId recentXmin,
                 if (NormalTransactionIdFollows(xlimit, recentXmin))
                         return xlimit;
         }
-
         return recentXmin;
  }
  
@@ -1941,13 +1841,7 @@ EstimateSnapshotSpace(Snapshot snap)
         Assert(snap != InvalidSnapshot);
         Assert(snap->satisfies == HeapTupleSatisfiesMVCC);
  
-       /* We allocate any XID arrays needed in the same palloc block. */
-       size = add_size(sizeof(SerializedSnapshotData),
-                                       mul_size(snap->xcnt, sizeof(TransactionId)));
-       if (snap->subxcnt > 0 &&
-               (!snap->suboverflowed || snap->takenDuringRecovery))
-               size = add_size(size,
-                                               mul_size(snap->subxcnt, sizeof(TransactionId)));
+       size = sizeof(SerializedSnapshotData);
  
         return size;
  }
@@ -1962,48 +1856,17 @@ SerializeSnapshot(Snapshot snapshot, char *start_address)
  {
         SerializedSnapshotData *serialized_snapshot;
  
-       Assert(snapshot->subxcnt >= 0);
-
         serialized_snapshot = (SerializedSnapshotData *) start_address;
  
         /* Copy all required fields */
         serialized_snapshot->xmin = snapshot->xmin;
         serialized_snapshot->xmax = snapshot->xmax;
-       serialized_snapshot->xcnt = snapshot->xcnt;
-       serialized_snapshot->subxcnt = snapshot->subxcnt;
-       serialized_snapshot->suboverflowed = snapshot->suboverflowed;
         serialized_snapshot->takenDuringRecovery = snapshot->takenDuringRecovery;
         serialized_snapshot->curcid = snapshot->curcid;
         serialized_snapshot->whenTaken = snapshot->whenTaken;
         serialized_snapshot->lsn = snapshot->lsn;
  
-       /*
-        * Ignore the SubXID array if it has overflowed, unless the snapshot was
-        * taken during recovey - in that case, top-level XIDs are in subxip as
-        * well, and we mustn't lose them.
-        */
-       if (serialized_snapshot->suboverflowed && !snapshot->takenDuringRecovery)
-               serialized_snapshot->subxcnt = 0;
-
-       /* Copy XID array */
-       if (snapshot->xcnt > 0)
-               memcpy((TransactionId *) (serialized_snapshot + 1),
-                          snapshot->xip, snapshot->xcnt * sizeof(TransactionId));
-
-       /*
-        * Copy SubXID array. Don't bother to copy it if it had overflowed,
-        * though, because it's not used anywhere in that case. Except if it's a
-        * snapshot taken during recovery; all the top-level XIDs are in subxip as
-        * well in that case, so we mustn't lose them.
-        */
-       if (serialized_snapshot->subxcnt > 0)
-       {
-               Size            subxipoff = sizeof(SerializedSnapshotData) +
-               snapshot->xcnt * sizeof(TransactionId);
-
-               memcpy((TransactionId *) ((char *) serialized_snapshot + subxipoff),
-                          snapshot->subxip, snapshot->subxcnt * sizeof(TransactionId));
-       }
+       serialized_snapshot->snapshotcsn = snapshot->snapshotcsn;
  }
  
  /*
@@ -2019,49 +1882,23 @@ RestoreSnapshot(char *start_address)
         SerializedSnapshotData *serialized_snapshot;
         Size            size;
         Snapshot        snapshot;
-       TransactionId *serialized_xids;
  
         serialized_snapshot = (SerializedSnapshotData *) start_address;
-       serialized_xids = (TransactionId *)
-               (start_address + sizeof(SerializedSnapshotData));
  
         /* We allocate any XID arrays needed in the same palloc block. */
-       size = sizeof(SnapshotData)
-               + serialized_snapshot->xcnt * sizeof(TransactionId)
-               + serialized_snapshot->subxcnt * sizeof(TransactionId);
+       size = sizeof(SnapshotData);
  
         /* Copy all required fields */
         snapshot = (Snapshot) MemoryContextAlloc(TopTransactionContext, size);
         snapshot->satisfies = HeapTupleSatisfiesMVCC;
         snapshot->xmin = serialized_snapshot->xmin;
         snapshot->xmax = serialized_snapshot->xmax;
-       snapshot->xip = NULL;
-       snapshot->xcnt = serialized_snapshot->xcnt;
-       snapshot->subxip = NULL;
-       snapshot->subxcnt = serialized_snapshot->subxcnt;
-       snapshot->suboverflowed = serialized_snapshot->suboverflowed;
+       snapshot->snapshotcsn = serialized_snapshot->snapshotcsn;
         snapshot->takenDuringRecovery = serialized_snapshot->takenDuringRecovery;
         snapshot->curcid = serialized_snapshot->curcid;
         snapshot->whenTaken = serialized_snapshot->whenTaken;
         snapshot->lsn = serialized_snapshot->lsn;
  
-       /* Copy XIDs, if present. */
-       if (serialized_snapshot->xcnt > 0)
-       {
-               snapshot->xip = (TransactionId *) (snapshot + 1);
-               memcpy(snapshot->xip, serialized_xids,
-                          serialized_snapshot->xcnt * sizeof(TransactionId));
-       }
-
-       /* Copy SubXIDs, if present. */
-       if (serialized_snapshot->subxcnt > 0)
-       {
-               snapshot->subxip = ((TransactionId *) (snapshot + 1)) +
-                       serialized_snapshot->xcnt;
-               memcpy(snapshot->subxip, serialized_xids + serialized_snapshot->xcnt,
-                          serialized_snapshot->subxcnt * sizeof(TransactionId));
-       }
-
         /* Set the copied flag so that the caller will set refcounts correctly. */
         snapshot->regd_count = 0;
         snapshot->active_count = 0;
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c

index 1aff2e9f2d1b1fa7b8e14f5b06cdc70bac8001ad..5fe898ba0570c8e3e6712e7fa72ae8337721d60f 100644 (file)
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -10,28 +10,6 @@
   * the passed-in buffer.  The caller must hold not only a pin, but at least
   * shared buffer content lock on the buffer containing the tuple.
   *
- * NOTE: When using a non-MVCC snapshot, we must check
- * TransactionIdIsInProgress (which looks in the PGXACT array)
- * before TransactionIdDidCommit/TransactionIdDidAbort (which look in
- * pg_clog).  Otherwise we have a race condition: we might decide that a
- * just-committed transaction crashed, because none of the tests succeed.
- * xact.c is careful to record commit/abort in pg_clog before it unsets
- * MyPgXact->xid in the PGXACT array.  That fixes that problem, but it
- * also means there is a window where TransactionIdIsInProgress and
- * TransactionIdDidCommit will both return true.  If we check only
- * TransactionIdDidCommit, we could consider a tuple committed when a
- * later GetSnapshotData call will still think the originating transaction
- * is in progress, which leads to application-level inconsistency.  The
- * upshot is that we gotta check TransactionIdIsInProgress first in all
- * code paths, except for a few cases where we are looking at
- * subtransactions of our own main transaction and so there can't be any
- * race condition.
- *
- * When using an MVCC snapshot, we rely on XidInMVCCSnapshot rather than
- * TransactionIdIsInProgress, but the logic is otherwise the same: do not
- * check pg_clog until after deciding that the xact is no longer in progress.
- *
- *
   * Summary of visibility functions:
   *
   *      HeapTupleSatisfiesMVCC()
@@ -80,7 +58,7 @@ SnapshotData SnapshotSelfData = {HeapTupleSatisfiesSelf};
  SnapshotData SnapshotAnyData = {HeapTupleSatisfiesAny};
  
  /* local functions */
-static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot);
+static bool XidVisibleInSnapshot(TransactionId xid, Snapshot snapshot, bool known_committed, TransactionIdStatus *hintstatus);
  static bool IsMovedTupleVisible(HeapTuple htup, Buffer buffer);
  
  /*
@@ -121,7 +99,7 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer,
         if (TransactionIdIsValid(xid))
         {
                 /* NB: xid must be known committed here! */
-               XLogRecPtr      commitLSN = TransactionIdGetCommitLSN(xid);
+               XLogRecPtr              commitLSN = TransactionIdGetCommitLSN(xid);
  
                 if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN) &&
                         BufferGetLSNAtomic(buffer) < commitLSN)
@@ -177,6 +155,8 @@ bool
  HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer)
  {
         HeapTupleHeader tuple = htup->t_data;
+       bool            visible;
+       TransactionIdStatus     hintstatus;
  
         Assert(ItemPointerIsValid(&htup->t_self));
         Assert(htup->t_tableOid != InvalidOid);
@@ -189,7 +169,8 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer)
                 /* Used by pre-9.0 binary upgrades */
                 if (tuple->t_infomask & HEAP_MOVED)
                         return IsMovedTupleVisible(htup, buffer);
-               else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple)))
+
+               if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple)))
                 {
                         if (tuple->t_infomask & HEAP_XMAX_INVALID)      /* xid invalid */
                                 return true;
@@ -223,17 +204,18 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer)
  
                         return false;
                 }
-               else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple)))
-                       return false;
-               else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
-                       SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
-                                               HeapTupleHeaderGetRawXmin(tuple));
                 else
                 {
-                       /* it must have aborted or crashed */
-                       SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
-                                               InvalidTransactionId);
-                       return false;
+                       visible = XidVisibleInSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot, false, &hintstatus);
+
+                       if (hintstatus == XID_COMMITTED)
+                               SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+                                                       HeapTupleHeaderGetRawXmin(tuple));
+                       if (hintstatus == XID_ABORTED)
+                               SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+                                                       InvalidTransactionId);
+                       if (!visible)
+                               return false;
                 }
         }
  
@@ -263,12 +245,13 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer)
  
                 if (TransactionIdIsCurrentTransactionId(xmax))
                         return false;
-               if (TransactionIdIsInProgress(xmax))
+
+               visible = XidVisibleInSnapshot(xmax, snapshot, false, &hintstatus);
+               if (!visible)
+               {
+                       /* still in progress, or aborted/crashed */
                         return true;
-               if (TransactionIdDidCommit(xmax))
-                       return false;
-               /* it must have aborted or crashed */
-               return true;
+               }
         }
  
         if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
@@ -278,16 +261,15 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer)
                 return false;
         }
  
-       if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
-               return true;
-
-       if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
+       visible = XidVisibleInSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot, false, &hintstatus);
+       if (hintstatus == XID_ABORTED)
         {
                 /* it must have aborted or crashed */
                 SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
                                         InvalidTransactionId);
-               return true;
         }
+       if (!visible)
+               return true;
  
         /* xmax transaction committed */
  
@@ -390,6 +372,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid,
                                                  Buffer buffer)
  {
         HeapTupleHeader tuple = htup->t_data;
+       TransactionIdStatus     xidstatus;
  
         Assert(ItemPointerIsValid(&htup->t_self));
         Assert(htup->t_tableOid != InvalidOid);
@@ -442,9 +425,11 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid,
                                  * left in this Xmax; otherwise, report the tuple as
                                  * locked/updated.
                                  */
-                               if (!TransactionIdIsInProgress(xmax))
+                               xidstatus = TransactionIdGetStatus(xmax);
+                               if (xidstatus != XID_INPROGRESS)
                                         return HeapTupleMayBeUpdated;
-                               return HeapTupleBeingUpdated;
+                               else
+                                       return HeapTupleBeingUpdated;
                         }
  
                         if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
@@ -488,17 +473,21 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid,
                         else
                                 return HeapTupleInvisible;              /* updated before scan started */
                 }
-               else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple)))
-                       return HeapTupleInvisible;
-               else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
-                       SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
-                                               HeapTupleHeaderGetRawXmin(tuple));
                 else
                 {
-                       /* it must have aborted or crashed */
-                       SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
-                                               InvalidTransactionId);
-                       return HeapTupleInvisible;
+                       xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmin(tuple));
+                       if (xidstatus == XID_COMMITTED)
+                       {
+                               SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+                                                       HeapTupleHeaderGetXmin(tuple));
+                       }
+                       else
+                       {
+                               if (xidstatus == XID_ABORTED)
+                                       SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+                                                               InvalidTransactionId);
+                               return HeapTupleInvisible;
+                       }
                 }
         }
  
@@ -548,17 +537,21 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid,
                                 return HeapTupleInvisible;              /* updated before scan started */
                 }
  
-               if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false))
-                       return HeapTupleBeingUpdated;
-
-               if (TransactionIdDidCommit(xmax))
-                       return HeapTupleUpdated;
+               xidstatus = TransactionIdGetStatus(xmax);
+               switch (xidstatus)
+               {
+                       case XID_INPROGRESS:
+                               return HeapTupleBeingUpdated;
+                       case XID_COMMITTED:
+                               return HeapTupleUpdated;
+                       case XID_ABORTED:
+                               break;
+               }
  
                 /*
                  * By here, the update in the Xmax is either aborted or crashed, but
                  * what about the other members?
                  */
-
                 if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false))
                 {
                         /*
@@ -586,15 +579,18 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid,
                         return HeapTupleInvisible;      /* updated before scan started */
         }
  
-       if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
-               return HeapTupleBeingUpdated;
-
-       if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
+       xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmax(tuple));
+       switch (xidstatus)
         {
-               /* it must have aborted or crashed */
-               SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
-                                       InvalidTransactionId);
-               return HeapTupleMayBeUpdated;
+               case XID_INPROGRESS:
+                       return HeapTupleBeingUpdated;
+               case XID_ABORTED:
+                       /* it must have aborted or crashed */
+                       SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+                                               InvalidTransactionId);
+                       return HeapTupleMayBeUpdated;
+               case XID_COMMITTED:
+                       break;
         }
  
         /* xmax transaction committed */
@@ -639,6 +635,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot,
                                                 Buffer buffer)
  {
         HeapTupleHeader tuple = htup->t_data;
+       TransactionIdStatus xidstatus;
  
         Assert(ItemPointerIsValid(&htup->t_self));
         Assert(htup->t_tableOid != InvalidOid);
@@ -689,35 +686,39 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot,
  
                         return false;
                 }
-               else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple)))
+               else
                 {
-                       /*
-                        * Return the speculative token to caller.  Caller can worry about
-                        * xmax, since it requires a conclusively locked row version, and
-                        * a concurrent update to this tuple is a conflict of its
-                        * purposes.
-                        */
-                       if (HeapTupleHeaderIsSpeculative(tuple))
+                       xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmin(tuple));
+                       switch (xidstatus)
                         {
-                               snapshot->speculativeToken =
-                                       HeapTupleHeaderGetSpeculativeToken(tuple);
-
-                               Assert(snapshot->speculativeToken != 0);
+                               case XID_INPROGRESS:
+                                       /*
+                                        * Return the speculative token to caller.  Caller can worry about
+                                        * xmax, since it requires a conclusively locked row version, and
+                                        * a concurrent update to this tuple is a conflict of its
+                                        * purposes.
+                                        */
+                                       if (HeapTupleHeaderIsSpeculative(tuple))
+                                       {
+                                               snapshot->speculativeToken =
+                                                       HeapTupleHeaderGetSpeculativeToken(tuple);
+
+                                               Assert(snapshot->speculativeToken != 0);
+                                       }
+
+                                       snapshot->xmin = HeapTupleHeaderGetRawXmin(tuple);
+                                       /* XXX shouldn't we fall through to look at xmax? */
+                                       return true;            /* in insertion by other */
+                               case XID_COMMITTED:
+                                       SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+                                                               HeapTupleHeaderGetRawXmin(tuple));
+                                       break;
+                               case XID_ABORTED:
+                                       /* it must have aborted or crashed */
+                                       SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+                                                               InvalidTransactionId);
+                               return false;
                         }
-
-                       snapshot->xmin = HeapTupleHeaderGetRawXmin(tuple);
-                       /* XXX shouldn't we fall through to look at xmax? */
-                       return true;            /* in insertion by other */
-               }
-               else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
-                       SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
-                                               HeapTupleHeaderGetRawXmin(tuple));
-               else
-               {
-                       /* it must have aborted or crashed */
-                       SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
-                                               InvalidTransactionId);
-                       return false;
                 }
         }
  
@@ -747,15 +748,19 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot,
  
                 if (TransactionIdIsCurrentTransactionId(xmax))
                         return false;
-               if (TransactionIdIsInProgress(xmax))
+
+               xidstatus = TransactionIdGetStatus(xmax);
+               switch (xidstatus)
                 {
-                       snapshot->xmax = xmax;
-                       return true;
+                       case XID_INPROGRESS:
+                               snapshot->xmax = xmax;
+                               return true;
+                       case XID_COMMITTED:
+                               return false;
+                       case XID_ABORTED:
+                               /* it must have aborted or crashed */
+                               return true;
                 }
-               if (TransactionIdDidCommit(xmax))
-                       return false;
-               /* it must have aborted or crashed */
-               return true;
         }
  
         if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple)))
@@ -765,19 +770,20 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot,
                 return false;
         }
  
-       if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
+       xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmax(tuple));
+       switch (xidstatus)
         {
-               if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
-                       snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple);
-               return true;
-       }
-
-       if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
-       {
-               /* it must have aborted or crashed */
-               SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
-                                       InvalidTransactionId);
-               return true;
+               case XID_INPROGRESS:
+                       if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+                               snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple);
+                       return true;
+               case XID_ABORTED:
+                       /* it must have aborted or crashed */
+                       SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+                                               InvalidTransactionId);
+                       return true;
+               case XID_COMMITTED:
+                       break;
         }
  
         /* xmax transaction committed */
@@ -806,28 +812,14 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot,
   *             transactions shown as in-progress by the snapshot
   *             transactions started after the snapshot was taken
   *             changes made by the current command
- *
- * Notice that here, we will not update the tuple status hint bits if the
- * inserting/deleting transaction is still running according to our snapshot,
- * even if in reality it's committed or aborted by now.  This is intentional.
- * Checking the true transaction state would require access to high-traffic
- * shared data structures, creating contention we'd rather do without, and it
- * would not change the result of our visibility check anyway.  The hint bits
- * will be updated by the first visitor that has a snapshot new enough to see
- * the inserting/deleting transaction as done.  In the meantime, the cost of
- * leaving the hint bits unset is basically that each HeapTupleSatisfiesMVCC
- * call will need to run TransactionIdIsCurrentTransactionId in addition to
- * XidInMVCCSnapshot (but it would have to do the latter anyway).  In the old
- * coding where we tried to set the hint bits as soon as possible, we instead
- * did TransactionIdIsInProgress in each call --- to no avail, as long as the
- * inserting/deleting transaction was still running --- which was more cycles
- * and more contention on the PGXACT array.
   */
  bool
  HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
                                            Buffer buffer)
  {
         HeapTupleHeader tuple = htup->t_data;
+       bool            visible;
+       TransactionIdStatus     hintstatus;
  
         Assert(ItemPointerIsValid(&htup->t_self));
         Assert(htup->t_tableOid != InvalidOid);
@@ -883,25 +875,40 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
                         else
                                 return false;   /* deleted before scan started */
                 }
-               else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot))
-                       return false;
-               else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
-                       SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
-                                               HeapTupleHeaderGetRawXmin(tuple));
                 else
                 {
-                       /* it must have aborted or crashed */
-                       SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
-                                               InvalidTransactionId);
-                       return false;
+                       visible = XidVisibleInSnapshot(HeapTupleHeaderGetXmin(tuple),
+                                                                                  snapshot, false, &hintstatus);
+                       if (hintstatus == XID_COMMITTED)
+                               SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+                                                       HeapTupleHeaderGetRawXmin(tuple));
+                       if (hintstatus == XID_ABORTED)
+                               SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+                                                       InvalidTransactionId);
+                       if (!visible)
+                               return false;
                 }
         }
         else
         {
                 /* xmin is committed, but maybe not according to our snapshot */
-               if (!HeapTupleHeaderXminFrozen(tuple) &&
-                       XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot))
-                       return false;           /* treat as still in progress */
+               if (!HeapTupleHeaderXminFrozen(tuple))
+               {
+                       visible = XidVisibleInSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot,
+                                                                                  true, &hintstatus);
+                       if (hintstatus == XID_COMMITTED)
+                       {
+                               SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+                                                       HeapTupleHeaderGetRawXmin(tuple));
+                       }
+                       if (hintstatus == XID_ABORTED)
+                       {
+                               SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+                                                       InvalidTransactionId);
+                       }
+                       if (!visible)
+                               return false;           /* treat as still in progress */
+               }
         }
  
         /* by here, the inserting transaction has committed */
@@ -931,12 +938,15 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
                         else
                                 return false;   /* deleted before scan started */
                 }
-               if (XidInMVCCSnapshot(xmax, snapshot))
-                       return true;
-               if (TransactionIdDidCommit(xmax))
+
+               visible = XidVisibleInSnapshot(xmax, snapshot, false, &hintstatus);
+               if (visible)
                         return false;           /* updating transaction committed */
-               /* it must have aborted or crashed */
-               return true;
+               else
+               {
+                       /* in progress per our snapshot, or aborted/crashed */
+                       return true;
+               }
         }
  
         if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
@@ -949,25 +959,28 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
                                 return false;   /* deleted before scan started */
                 }
  
-               if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot))
-                       return true;
-
-               if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
+               visible = XidVisibleInSnapshot(HeapTupleHeaderGetRawXmax(tuple),
+                                                                          snapshot, false, &hintstatus);
+               if (hintstatus == XID_COMMITTED)
+               {
+                       /* xmax transaction committed */
+                       SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+                                               HeapTupleHeaderGetRawXmax(tuple));
+               }
+               if (hintstatus == XID_ABORTED)
                 {
                         /* it must have aborted or crashed */
                         SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
                                                 InvalidTransactionId);
                         return true;
                 }
-
-               /* xmax transaction committed */
-               SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
-                                       HeapTupleHeaderGetRawXmax(tuple));
         }
         else
         {
                 /* xmax is committed, but maybe not according to our snapshot */
-               if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot))
+               visible = XidVisibleInSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot,
+                                                                          false, &hintstatus);
+               if (!visible)
                         return true;            /* treat as still in progress */
         }
  
@@ -984,16 +997,22 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot,
   *     we mainly want to know is if a tuple is potentially visible to *any*
   *     running transaction.  If so, it can't be removed yet by VACUUM.
   *
- * OldestXmin is a cutoff XID (obtained from GetOldestXmin()).  Tuples
- * deleted by XIDs >= OldestXmin are deemed "recently dead"; they might
- * still be visible to some open transaction, so we can't remove them,
- * even if we see that the deleting transaction has committed.
+ * OldestXmin is a cutoff XID.  (XXX: this comment was written for a cutoff
+ * snapshot from GetOldestSnapshot(), but the signature still takes an XID.)
+ * Tuples deleted by XIDs >= OldestXmin are deemed "recently dead"; they
+ * might still be visible to some open transaction, so we can't remove them,
+ * even if we see that the deleting transaction has committed.
+ *
+ * Note: predicate.c calls this with a current snapshot, rather than one obtained
+ * from GetOldestSnapshot(). So even if this function determines that a tuple
+ * is not visible to anyone anymore, we can't "kill" the tuple right here.
   */
  HTSV_Result
  HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin,
                                                  Buffer buffer)
  {
         HeapTupleHeader tuple = htup->t_data;
+       TransactionIdStatus     xidstatus;
  
         Assert(ItemPointerIsValid(&htup->t_self));
         Assert(htup->t_tableOid != InvalidOid);
@@ -1032,7 +1051,10 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin,
                         /* deleting subtransaction must have aborted */
                         return HEAPTUPLE_INSERT_IN_PROGRESS;
                 }
-               else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple)))
+
+               xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmin(tuple));
+
+               if (xidstatus == XID_INPROGRESS)
                 {
                         /*
                          * It'd be possible to discern between INSERT/DELETE in progress
@@ -1044,7 +1066,7 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin,
                          */
                         return HEAPTUPLE_INSERT_IN_PROGRESS;
                 }
-               else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple)))
+               else if (xidstatus == XID_COMMITTED)
                         SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
                                                 HeapTupleHeaderGetRawXmin(tuple));
                 else
@@ -1095,7 +1117,8 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin,
                         }
                         else
                         {
-                               if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
+                               xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmax(tuple));
+                               if (xidstatus == XID_INPROGRESS)
                                         return HEAPTUPLE_LIVE;
                                 SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
                                                         InvalidTransactionId);
@@ -1125,13 +1148,17 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin,
                         /* not LOCKED_ONLY, so it has to have an xmax */
                         Assert(TransactionIdIsValid(xmax));
  
-                       if (TransactionIdIsInProgress(xmax))
-                               return HEAPTUPLE_DELETE_IN_PROGRESS;
-                       else if (TransactionIdDidCommit(xmax))
-                               /* there are still lockers around -- can't return DEAD here */
-                               return HEAPTUPLE_RECENTLY_DEAD;
-                       /* updating transaction aborted */
-                       return HEAPTUPLE_LIVE;
+                       switch(TransactionIdGetStatus(xmax))
+                       {
+                               case XID_INPROGRESS:
+                                       return HEAPTUPLE_DELETE_IN_PROGRESS;
+                               case XID_COMMITTED:
+                                       /* there are still lockers around -- can't return DEAD here */
+                                       return HEAPTUPLE_RECENTLY_DEAD;
+                               case XID_ABORTED:
+                                       /* updating transaction aborted */
+                                       return HEAPTUPLE_LIVE;
+                       }
                 }
  
                 Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED));
@@ -1141,8 +1168,12 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin,
                 /* not LOCKED_ONLY, so it has to have an xmax */
                 Assert(TransactionIdIsValid(xmax));
  
-               /* multi is not running -- updating xact cannot be */
-               Assert(!TransactionIdIsInProgress(xmax));
+               /*
+                * multi is not running -- updating xact cannot be (this assertion
+                * won't catch a running subtransaction)
+                */
+               Assert(!TransactionIdIsActive(xmax));
+
                 if (TransactionIdDidCommit(xmax))
                 {
                         if (!TransactionIdPrecedes(xmax, OldestXmin))
@@ -1161,9 +1192,11 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin,
  
         if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
         {
-               if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple)))
+               xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmax(tuple));
+
+               if (xidstatus == XID_INPROGRESS)
                         return HEAPTUPLE_DELETE_IN_PROGRESS;
-               else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple)))
+               else if (xidstatus == XID_COMMITTED)
                         SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
                                                 HeapTupleHeaderGetRawXmax(tuple));
                 else
@@ -1253,125 +1286,60 @@ HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin)
  }
  
  /*
- * XidInMVCCSnapshot
- *             Is the given XID still-in-progress according to the snapshot?
+ * XidVisibleInSnapshot
+ *             Is the given XID visible according to the snapshot?
+ *
+ * If 'known_committed' is true, xid is known to be committed already, even
+ * though it might not be visible to the snapshot. Passing 'true' can save
+ * some cycles.
   *
- * Note: GetSnapshotData never stores either top xid or subxids of our own
- * backend into a snapshot, so these xids will not be reported as "running"
- * by this function.  This is OK for current uses, because we always check
- * TransactionIdIsCurrentTransactionId first, except for known-committed
- * XIDs which could not be ours anyway.
+ * On return, *hintstatus says whether the xid is known to have committed or
+ * aborted, whether or not it is visible to us; else it is XID_INPROGRESS.
   */
  static bool
-XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
+XidVisibleInSnapshot(TransactionId xid, Snapshot snapshot,
+                                        bool known_committed, TransactionIdStatus *hintstatus)
  {
-       uint32          i;
+       CommitSeqNo csn;
+
+       elog(DEBUG1, "XidVisibleInSnapshot %u, %u:%u",
+                xid, snapshot->xmin, snapshot->xmax);
+
+       *hintstatus = XID_INPROGRESS;
  
         /*
          * Make a quick range check to eliminate most XIDs without looking at the
-        * xip arrays.  Note that this is OK even if we convert a subxact XID to
-        * its parent below, because a subxact with XID < xmin has surely also got
-        * a parent with XID < xmin, while one with XID >= xmax must belong to a
-        * parent that was not yet committed at the time of this snapshot.
+        * CSN log.
          */
-
-       /* Any xid < xmin is not in-progress */
-       if (TransactionIdPrecedes(xid, snapshot->xmin))
-               return false;
-       /* Any xid >= xmax is in-progress */
-       if (TransactionIdFollowsOrEquals(xid, snapshot->xmax))
+       if (known_committed && TransactionIdPrecedes(xid, snapshot->xmin))
+       {
+               *hintstatus = XID_COMMITTED;
                 return true;
+       }
  
         /*
-        * Snapshot information is stored slightly differently in snapshots taken
-        * during recovery.
+        * Any xid >= xmax is in-progress (or aborted, but we don't distinguish
+        * that here).
          */
-       if (!snapshot->takenDuringRecovery)
-       {
-               /*
-                * If the snapshot contains full subxact data, the fastest way to
-                * check things is just to compare the given XID against both subxact
-                * XIDs and top-level XIDs.  If the snapshot overflowed, we have to
-                * use pg_subtrans to convert a subxact XID to its parent XID, but
-                * then we need only look at top-level XIDs not subxacts.
-                */
-               if (!snapshot->suboverflowed)
-               {
-                       /* we have full data, so search subxip */
-                       int32           j;
+       if (TransactionIdFollowsOrEquals(xid, snapshot->xmax))
+               return false;
  
-                       for (j = 0; j < snapshot->subxcnt; j++)
-                       {
-                               if (TransactionIdEquals(xid, snapshot->subxip[j]))
-                                       return true;
-                       }
+       csn = TransactionIdGetCommitSeqNo(xid);
  
-                       /* not there, fall through to search xip[] */
-               }
+       if (COMMITSEQNO_IS_COMMITTED(csn))
+       {
+               *hintstatus = XID_COMMITTED;
+               if (csn < snapshot->snapshotcsn)
+                       return true;
                 else
-               {
-                       /*
-                        * Snapshot overflowed, so convert xid to top-level.  This is safe
-                        * because we eliminated too-old XIDs above.
-                        */
-                       xid = SubTransGetTopmostTransaction(xid);
-
-                       /*
-                        * If xid was indeed a subxact, we might now have an xid < xmin,
-                        * so recheck to avoid an array scan.  No point in rechecking
-                        * xmax.
-                        */
-                       if (TransactionIdPrecedes(xid, snapshot->xmin))
-                               return false;
-               }
-
-               for (i = 0; i < snapshot->xcnt; i++)
-               {
-                       if (TransactionIdEquals(xid, snapshot->xip[i]))
-                               return true;
-               }
+                       return false;
         }
         else
         {
-               int32           j;
-
-               /*
-                * In recovery we store all xids in the subxact array because it is by
-                * far the bigger array, and we mostly don't know which xids are
-                * top-level and which are subxacts. The xip array is empty.
-                *
-                * We start by searching subtrans, if we overflowed.
-                */
-               if (snapshot->suboverflowed)
-               {
-                       /*
-                        * Snapshot overflowed, so convert xid to top-level.  This is safe
-                        * because we eliminated too-old XIDs above.
-                        */
-                       xid = SubTransGetTopmostTransaction(xid);
-
-                       /*
-                        * If xid was indeed a subxact, we might now have an xid < xmin,
-                        * so recheck to avoid an array scan.  No point in rechecking
-                        * xmax.
-                        */
-                       if (TransactionIdPrecedes(xid, snapshot->xmin))
-                               return false;
-               }
-
-               /*
-                * We now have either a top-level xid higher than xmin or an
-                * indeterminate xid. We don't know whether it's top level or subxact
-                * but it doesn't matter. If it's present, the xid is visible.
-                */
-               for (j = 0; j < snapshot->subxcnt; j++)
-               {
-                       if (TransactionIdEquals(xid, snapshot->subxip[j]))
-                               return true;
-               }
+               if (csn == COMMITSEQNO_ABORTED)
+                       *hintstatus = XID_ABORTED;
+               return false;
         }
-
-       return false;
  }
  
  /*
@@ -1387,6 +1355,7 @@ bool
  HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
  {
         TransactionId xmax;
+       TransactionIdStatus     xidstatus;
  
         /* if there's no valid Xmax, then there's obviously no update either */
         if (tuple->t_infomask & HEAP_XMAX_INVALID)
@@ -1414,9 +1383,11 @@ HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
  
         if (TransactionIdIsCurrentTransactionId(xmax))
                 return false;
-       if (TransactionIdIsInProgress(xmax))
+
+       xidstatus = TransactionIdGetStatus(xmax);
+       if (xidstatus == XID_INPROGRESS)
                 return false;
-       if (TransactionIdDidCommit(xmax))
+       if (xidstatus == XID_COMMITTED)
                 return false;
  
         /*
@@ -1457,6 +1428,7 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
         HeapTupleHeader tuple = htup->t_data;
         TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
         TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple);
+       TransactionIdStatus hintstatus;
  
         Assert(ItemPointerIsValid(&htup->t_self));
         Assert(htup->t_tableOid != InvalidOid);
@@ -1468,7 +1440,7 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
                 return false;
         }
         /* check if it's one of our txids, toplevel is also in there */
-       else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt))
+       else if (TransactionIdInArray(xmin, snapshot->this_xip, snapshot->this_xcnt))
         {
                 bool            resolved;
                 CommandId       cmin = HeapTupleHeaderGetRawCommandId(tuple);
@@ -1479,7 +1451,8 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
                  * cmin/cmax was stored in a combocid. So we need to lookup the actual
                  * values externally.
                  */
-               resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot,
+               resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(),
+                                                                                                snapshot,
                                                                                                  htup, buffer,
                                                                                                  &cmin, &cmax);
  
@@ -1492,34 +1465,11 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
                         return false;           /* inserted after scan started */
                 /* fall through */
         }
-       /* committed before our xmin horizon. Do a normal visibility check. */
-       else if (TransactionIdPrecedes(xmin, snapshot->xmin))
-       {
-               Assert(!(HeapTupleHeaderXminCommitted(tuple) &&
-                                !TransactionIdDidCommit(xmin)));
-
-               /* check for hint bit first, consult clog afterwards */
-               if (!HeapTupleHeaderXminCommitted(tuple) &&
-                       !TransactionIdDidCommit(xmin))
-                       return false;
-               /* fall through */
-       }
-       /* beyond our xmax horizon, i.e. invisible */
-       else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax))
-       {
-               return false;
-       }
-       /* check if it's a committed transaction in [xmin, xmax) */
-       else if (TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt))
-       {
-               /* fall through */
-       }
-
         /*
-        * none of the above, i.e. between [xmin, xmax) but hasn't committed. I.e.
-        * invisible.
+        * it's not "this" transaction. Do a normal visibility check using the
+        * snapshot.
          */
-       else
+       else if (!XidVisibleInSnapshot(xmin, snapshot, false, &hintstatus))
         {
                 return false;
         }
@@ -1543,14 +1493,15 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
         }
  
         /* check if it's one of our txids, toplevel is also in there */
-       if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt))
+       if (TransactionIdInArray(xmax, snapshot->this_xip, snapshot->this_xcnt))
         {
                 bool            resolved;
                 CommandId       cmin;
                 CommandId       cmax = HeapTupleHeaderGetRawCommandId(tuple);
  
                 /* Lookup actual cmin/cmax values */
-               resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot,
+               resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(),
+                                                                                                snapshot,
                                                                                                  htup, buffer,
                                                                                                  &cmin, &cmax);
  
@@ -1564,26 +1515,12 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
                 else
                         return false;           /* deleted before scan started */
         }
-       /* below xmin horizon, normal transaction state is valid */
-       else if (TransactionIdPrecedes(xmax, snapshot->xmin))
-       {
-               Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED &&
-                                !TransactionIdDidCommit(xmax)));
-
-               /* check hint bit first */
-               if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
-                       return false;
-
-               /* check clog */
-               return !TransactionIdDidCommit(xmax);
-       }
-       /* above xmax horizon, we cannot possibly see the deleting transaction */
-       else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax))
-               return true;
-       /* xmax is between [xmin, xmax), check known committed array */
-       else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt))
+       /*
+        * it's not "this" transaction. Do a normal visibility check using the
+        * snapshot.
+        */
+       if (XidVisibleInSnapshot(xmax, snapshot, false, &hintstatus))
                 return false;
-       /* xmax is between [xmin, xmax), but known not to have committed yet */
         else
                 return true;
  }
@@ -1601,20 +1538,20 @@ IsMovedTupleVisible(HeapTuple htup, Buffer buffer)
  {
         HeapTupleHeader tuple = htup->t_data;
         TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
+       TransactionIdStatus xidstatus;
  
         /*
          * Check that the xvac is not a live transaction. This should never
          * happen, because HEAP_MOVED flags are not set by current code.
          */
-       if (TransactionIdIsCurrentTransactionId(xvac) ||
-               TransactionIdIsInProgress(xvac))
-       {
+       if (TransactionIdIsCurrentTransactionId(xvac))
                 elog(ERROR, "HEAP_MOVED tuple with in-progress xvac: %u", xvac);
-       }
+
+       xidstatus = TransactionIdGetStatus(xvac);
  
         if (tuple->t_infomask & HEAP_MOVED_OFF)
         {
-               if (TransactionIdDidCommit(xvac))
+               if (xidstatus == XID_COMMITTED)
                 {
                         SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
                                                 InvalidTransactionId);
@@ -1630,7 +1567,7 @@ IsMovedTupleVisible(HeapTuple htup, Buffer buffer)
         /* Used by pre-9.0 binary upgrades */
         else if (tuple->t_infomask & HEAP_MOVED_IN)
         {
-               if (TransactionIdDidCommit(xvac))
+               if (xidstatus == XID_COMMITTED)
                 {
                         SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
                                                 InvalidTransactionId);
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c

index aad6ba5639f12095354299b63181e714dc255a9b..d78499e9f28023e842e7b1208bb65497a481771a 100644 (file)
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -202,12 +202,12 @@ static const char *const subdirs[] = {
         "global",
         "pg_xlog/archive_status",
         "pg_clog",
+       "pg_csnlog",
         "pg_commit_ts",
         "pg_dynshmem",
         "pg_notify",
         "pg_serial",
         "pg_snapshots",
-       "pg_subtrans",
         "pg_twophase",
         "pg_multixact",
         "pg_multixact/members",
diff --git a/src/include/access/clog.h b/src/include/access/clog.h

index 06c069ae3ae59ee1ba652febc16d948cc0df2b04..5637912ab58bb567b5357c3b6c7205e44a3a136c 100644 (file)
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -17,21 +17,24 @@
  /*
   * Possible transaction statuses --- note that all-zeroes is the initial
   * state.
- *
- * A "subcommitted" transaction is a committed subtransaction whose parent
- * hasn't committed or aborted yet.
   */
-typedef int XidStatus;
+typedef int CLogXidStatus;
+
+#define CLOG_XID_STATUS_IN_PROGRESS            0x00
+#define CLOG_XID_STATUS_COMMITTED              0x01
+#define CLOG_XID_STATUS_ABORTED                        0x02
  
-#define TRANSACTION_STATUS_IN_PROGRESS         0x00
-#define TRANSACTION_STATUS_COMMITTED           0x01
-#define TRANSACTION_STATUS_ABORTED                     0x02
-#define TRANSACTION_STATUS_SUB_COMMITTED       0x03
+/*
+ * A "subcommitted" transaction is a committed subtransaction whose parent
+ * hasn't committed or aborted yet. We don't create these anymore, but accept
+ * them in existing clog, if we've been pg_upgraded from an older version.
+ */
+#define CLOG_XID_STATUS_SUB_COMMITTED  0x03
  
  
-extern void TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
-                                  TransactionId *subxids, XidStatus status, XLogRecPtr lsn);
-extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn);
+extern void CLogSetTreeStatus(TransactionId xid, int nsubxids,
+                                 TransactionId *subxids, CLogXidStatus status, XLogRecPtr lsn);
+extern CLogXidStatus CLogGetStatus(TransactionId xid, XLogRecPtr *lsn);
  
  extern Size CLOGShmemBuffers(void);
  extern Size CLOGShmemSize(void);
diff --git a/src/include/access/csnlog.h b/src/include/access/csnlog.h

new file mode 100644 (file)

index 0000000..dfe4f6b
--- /dev/null
+++ b/src/include/access/csnlog.h
@@ -0,0 +1,31 @@
+/*
+ * csnlog.h
+ *
+ * Commit-Sequence-Number log.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/csnlog.h
+ */
+#ifndef CSNLOG_H
+#define CSNLOG_H
+
+#include "access/xlog.h"
+
+extern void CSNLogSetCommitSeqNo(TransactionId xid, int nsubxids,
+                                        TransactionId *subxids, CommitSeqNo csn);
+extern CommitSeqNo CSNLogGetCommitSeqNo(TransactionId xid);
+
+extern Size CSNLOGShmemBuffers(void);
+extern Size CSNLOGShmemSize(void);
+extern void CSNLOGShmemInit(void);
+extern void BootStrapCSNLOG(void);
+extern void StartupCSNLOG(TransactionId oldestActiveXID);
+extern void TrimCSNLOG(void);
+extern void ShutdownCSNLOG(void);
+extern void CheckPointCSNLOG(void);
+extern void ExtendCSNLOG(TransactionId newestXact);
+extern void TruncateCSNLOG(TransactionId oldestXact);
+
+#endif   /* CSNLOG_H */
diff --git a/src/include/access/mvccvars.h b/src/include/access/mvccvars.h

new file mode 100644 (file)

index 0000000..bfb0800
--- /dev/null
+++ b/src/include/access/mvccvars.h
@@ -0,0 +1,88 @@
+/*-------------------------------------------------------------------------
+ *
+ * mvccvars.h
+ *       Shared memory variables for XID assignment and snapshots
+ *
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/mvccvars.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MVCCVARS_H
+#define MVCCVARS_H
+
+#include "port/atomics.h"
+
+/*
+ * VariableCache is a data structure in shared memory that is used to track
+ * OID and XID assignment state.  For largely historical reasons, there is
+ * just one struct with different fields that are protected by different
+ * LWLocks.
+ *
+ * Note: xidWrapLimit and oldestXidDB are not "active" values, but are
+ * used just to generate useful messages when xidWarnLimit or xidStopLimit
+ * are exceeded.
+ */
+typedef struct VariableCacheData
+{
+       /*
+        * These fields are protected by OidGenLock.
+        */
+       Oid                     nextOid;                /* next OID to assign */
+       uint32          oidCount;               /* OIDs available before must do XLOG work */
+
+       /*
+        * These fields are protected by XidGenLock.
+        */
+       TransactionId nextXid;          /* next XID to assign */
+
+       TransactionId oldestXid;        /* cluster-wide minimum datfrozenxid */
+       TransactionId xidVacLimit;      /* start forcing autovacuums here */
+       TransactionId xidWarnLimit; /* start complaining here */
+       TransactionId xidStopLimit; /* refuse to advance nextXid beyond here */
+       TransactionId xidWrapLimit; /* where the world ends */
+       Oid                     oldestXidDB;    /* database with minimum datfrozenxid */
+
+
+       /*
+        * Fields related to MVCC snapshots.
+        *
+        * lastCommitSeqNo is the CSN assigned to last committed transaction.
+        * It is protected by CommitSeqNoLock.
+        *
+        * latestCompletedXid is the highest XID that has committed. Anything
+        * > this is seen as still in-progress by everyone. Use atomic ops to
+        * update.
+        *
+        * oldestActiveXid is the XID of the oldest transaction that's still
+        * in-progress. (Or rather, the oldest XID among all still in-progress
+        * transactions; it's not necessarily the one that started first).
+        * Must hold ProcArrayLock in shared mode, and use atomic ops, to update.
+        *
+        * globalXmin is the oldest XMIN among all still in-progress transactions.
+        * Anything older than this is visible to everyone, and can be
+        * frozen/vacuumed. This does not include lazy VACUUM transactions. Must
+        * hold ProcArrayLock in shared mode, and use atomic ops to update.
+        */
+       pg_atomic_uint64 nextCommitSeqNo;
+       pg_atomic_uint32 latestCompletedXid;
+       pg_atomic_uint32 oldestActiveXid;
+       pg_atomic_uint32 globalXmin;
+
+       /*
+        * These fields are protected by CommitTsLock
+        */
+       TransactionId oldestCommitTsXid;
+       TransactionId newestCommitTsXid;
+
+} VariableCacheData;
+
+typedef VariableCacheData *VariableCache;
+
+/* in transam/varsup.c */
+extern PGDLLIMPORT VariableCache ShmemVariableCache;
+
+#endif   /* MVCCVARS_H */
diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h

index f39c6d388fb8a843d87d5dd8a2659d42629b40eb..578f38de8d6763ca4147d2b6588ac3c3877b2881 100644 (file)
--- a/src/include/access/subtrans.h
+++ b/src/include/access/subtrans.h
@@ -11,20 +11,9 @@
  #ifndef SUBTRANS_H
  #define SUBTRANS_H
  
-/* Number of SLRU buffers to use for subtrans */
-#define NUM_SUBTRANS_BUFFERS   32
-
+/* these are in csnlog.c now */
  extern void SubTransSetParent(TransactionId xid, TransactionId parent, bool overwriteOK);
  extern TransactionId SubTransGetParent(TransactionId xid);
  extern TransactionId SubTransGetTopmostTransaction(TransactionId xid);
  
-extern Size SUBTRANSShmemSize(void);
-extern void SUBTRANSShmemInit(void);
-extern void BootStrapSUBTRANS(void);
-extern void StartupSUBTRANS(TransactionId oldestActiveXID);
-extern void ShutdownSUBTRANS(void);
-extern void CheckPointSUBTRANS(void);
-extern void ExtendSUBTRANS(TransactionId newestXact);
-extern void TruncateSUBTRANS(TransactionId oldestXact);
-
  #endif   /* SUBTRANS_H */
diff --git a/src/include/access/transam.h b/src/include/access/transam.h

index 969eff93795d27333472a7ea99e384ab23a42715..66dcf311afb41549cad02d550c763f41c64944d2 100644 (file)
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -58,6 +58,20 @@
                 (dest)--; \
         } while ((dest) < FirstNormalTransactionId)
  
+static inline TransactionId
+TransactionIdNext(TransactionId xid)
+{
+       TransactionIdAdvance(xid);
+       return xid;
+}
+
+static inline TransactionId
+TransactionIdPrev(TransactionId xid)
+{
+       TransactionIdRetreat(xid);
+       return xid;
+}
+
  /* compare two XIDs already known to be normal; this is a macro for speed */
  #define NormalTransactionIdPrecedes(id1, id2) \
         (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \
@@ -93,51 +107,6 @@
  #define FirstBootstrapObjectId 10000
  #define FirstNormalObjectId            16384
  
-/*
- * VariableCache is a data structure in shared memory that is used to track
- * OID and XID assignment state.  For largely historical reasons, there is
- * just one struct with different fields that are protected by different
- * LWLocks.
- *
- * Note: xidWrapLimit and oldestXidDB are not "active" values, but are
- * used just to generate useful messages when xidWarnLimit or xidStopLimit
- * are exceeded.
- */
-typedef struct VariableCacheData
-{
-       /*
-        * These fields are protected by OidGenLock.
-        */
-       Oid                     nextOid;                /* next OID to assign */
-       uint32          oidCount;               /* OIDs available before must do XLOG work */
-
-       /*
-        * These fields are protected by XidGenLock.
-        */
-       TransactionId nextXid;          /* next XID to assign */
-
-       TransactionId oldestXid;        /* cluster-wide minimum datfrozenxid */
-       TransactionId xidVacLimit;      /* start forcing autovacuums here */
-       TransactionId xidWarnLimit; /* start complaining here */
-       TransactionId xidStopLimit; /* refuse to advance nextXid beyond here */
-       TransactionId xidWrapLimit; /* where the world ends */
-       Oid                     oldestXidDB;    /* database with minimum datfrozenxid */
-
-       /*
-        * These fields are protected by CommitTsLock
-        */
-       TransactionId oldestCommitTsXid;
-       TransactionId newestCommitTsXid;
-
-       /*
-        * These fields are protected by ProcArrayLock.
-        */
-       TransactionId latestCompletedXid;       /* newest XID that has committed or
-                                                                                * aborted */
-} VariableCacheData;
-
-typedef VariableCacheData *VariableCache;
-
  
  /* ----------------
   *             extern declarations
@@ -147,15 +116,39 @@ typedef VariableCacheData *VariableCache;
  /* in transam/xact.c */
  extern bool TransactionStartedDuringRecovery(void);
  
-/* in transam/varsup.c */
-extern PGDLLIMPORT VariableCache ShmemVariableCache;
-
  /*
   * prototypes for functions in transam/transam.c
   */
  extern bool TransactionIdDidCommit(TransactionId transactionId);
  extern bool TransactionIdDidAbort(TransactionId transactionId);
-extern bool TransactionIdIsKnownCompleted(TransactionId transactionId);
+
+
+#define COMMITSEQNO_INPROGRESS UINT64CONST(0x0)
+#define COMMITSEQNO_ABORTED            UINT64CONST(0x1)
+#define COMMITSEQNO_FROZEN             UINT64CONST(0x2)
+#define COMMITSEQNO_COMMITTING UINT64CONST(0x3)
+#define COMMITSEQNO_FIRST_NORMAL UINT64CONST(0x4)
+
+#define COMMITSEQNO_IS_INPROGRESS(csn) ((csn) == COMMITSEQNO_INPROGRESS)
+#define COMMITSEQNO_IS_ABORTED(csn) ((csn) == COMMITSEQNO_ABORTED)
+#define COMMITSEQNO_IS_FROZEN(csn) ((csn) == COMMITSEQNO_FROZEN)
+#define COMMITSEQNO_IS_NORMAL(csn) ((csn) >= COMMITSEQNO_FIRST_NORMAL)
+#define COMMITSEQNO_IS_COMMITTING(csn) ((csn) == COMMITSEQNO_COMMITTING)
+#define COMMITSEQNO_IS_COMMITTED(csn) ((csn) >= COMMITSEQNO_FROZEN && !COMMITSEQNO_IS_SUBTRANS(csn))
+
+#define CSN_SUBTRANS_BIT               (UINT64CONST(1) << 63)  /* top bit of a 64-bit CSN; tested by COMMITSEQNO_IS_SUBTRANS. Shift a 64-bit 1: "1<<63" inside UINT64CONST() is only 64-bit by accident of the macro's expansion */
+
+#define COMMITSEQNO_IS_SUBTRANS(csn) ((csn) & CSN_SUBTRANS_BIT)
+
+typedef enum                                   /* result type of TransactionIdGetStatus() */
+{
+       XID_COMMITTED,                          /* NOTE(review): presumably "xid committed" -- confirm in transam.c */
+       XID_ABORTED,                            /* NOTE(review): presumably "xid aborted" -- confirm in transam.c */
+       XID_INPROGRESS                          /* NOTE(review): presumably "neither committed nor aborted yet" -- confirm */
+} TransactionIdStatus;
+
+extern CommitSeqNo TransactionIdGetCommitSeqNo(TransactionId xid);
+extern TransactionIdStatus TransactionIdGetStatus(TransactionId transactionId);
  extern void TransactionIdAbort(TransactionId transactionId);
  extern void TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids);
  extern void TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, XLogRecPtr lsn);
diff --git a/src/include/access/xact.h b/src/include/access/xact.h

index 503ae1b82d7bdc887eb56511fafee351cd6f5521..44684de3301bdf116ea8badd9e6ac4e22fd0a94c 100644 (file)
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -117,7 +117,7 @@ typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid,
  #define XLOG_XACT_ABORT                                0x20
  #define XLOG_XACT_COMMIT_PREPARED      0x30
  #define XLOG_XACT_ABORT_PREPARED       0x40
-#define XLOG_XACT_ASSIGNMENT           0x50
+/* free opcode 0x50 */
  /* free opcode 0x60 */
  /* free opcode 0x70 */
  
@@ -316,7 +316,6 @@ extern TransactionId GetCurrentTransactionId(void);
  extern TransactionId GetCurrentTransactionIdIfAny(void);
  extern TransactionId GetStableLatestTransactionId(void);
  extern SubTransactionId GetCurrentSubTransactionId(void);
-extern void MarkCurrentTransactionIdLoggedIfAny(void);
  extern bool SubTransactionIsActive(SubTransactionId subxid);
  extern CommandId GetCurrentCommandId(bool used);
  extern TimestampTz GetCurrentTransactionStartTimestamp(void);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h

index 14b7f7f459a98b8d57def2e96147a266728ab09b..989261d8c83c6024cb1ac8621f89c1e543ec64d6 100644 (file)
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -52,11 +52,6 @@ extern bool InRecovery;
   * we haven't yet processed a RUNNING_XACTS or shutdown-checkpoint WAL record
   * to initialize our master-transaction tracking system.
   *
- * When the transaction tracking is initialized, we enter the SNAPSHOT_PENDING
- * state. The tracked information might still be incomplete, so we can't allow
- * connections yet, but redo functions must update the in-memory state when
- * appropriate.
- *
   * In SNAPSHOT_READY mode, we have full knowledge of transactions that are
   * (or were) running in the master at the current WAL location. Snapshots
   * can be taken, and read-only queries can be run.
@@ -65,13 +60,12 @@ typedef enum
  {
         STANDBY_DISABLED,
         STANDBY_INITIALIZED,
-       STANDBY_SNAPSHOT_PENDING,
         STANDBY_SNAPSHOT_READY
  } HotStandbyState;
  
  extern HotStandbyState standbyState;
  
-#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING)
+#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_READY)
  
  /*
   * Recovery target type.
diff --git a/src/include/c.h b/src/include/c.h

index 4ab3f8027a56362f5c2ce3c4408f6e271edb40c1..fef304da4cb92b891dc1e210a78648141d044dcf 100644 (file)
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -409,6 +409,13 @@ typedef uint32 CommandId;
  #define FirstCommandId ((CommandId) 0)
  #define InvalidCommandId       (~(CommandId)0)
  
+/*
+ * CommitSeqNo is currently an LSN, but we use a separate datatype for clarity.
+ */
+typedef uint64 CommitSeqNo;
+
+#define InvalidCommitSeqNo             ((CommitSeqNo) 0)
+
  /*
   * Array indexing support
   */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h

index 6fed7a0d19897940292e66971c53695369b1f68e..94d691c2a3efe7142fb2b19e41c1a0e73e85636f 100644 (file)
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4910,8 +4910,6 @@ DATA(insert OID = 2945 (  txid_snapshot_xmin              PGNSP PGUID 12 1  0 0 0 f f f f t
  DESCR("get xmin of snapshot");
  DATA(insert OID = 2946 (  txid_snapshot_xmax           PGNSP PGUID 12 1  0 0 0 f f f f t f i s 1 0 20 "2970" _null_ _null_ _null_ _null_ _null_ txid_snapshot_xmax _null_ _null_ _null_ ));
  DESCR("get xmax of snapshot");
-DATA(insert OID = 2947 (  txid_snapshot_xip                    PGNSP PGUID 12 1 50 0 0 f f f f t t i s 1 0 20 "2970" _null_ _null_ _null_ _null_ _null_ txid_snapshot_xip _null_ _null_ _null_ ));
-DESCR("get set of in-progress txids in snapshot");
  DATA(insert OID = 2948 (  txid_visible_in_snapshot     PGNSP PGUID 12 1  0 0 0 f f f f t f i s 2 0 16 "20 2970" _null_ _null_ _null_ _null_ _null_ txid_visible_in_snapshot _null_ _null_ _null_ ));
  DESCR("is txid visible in snapshot?");
  
diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h

index df229a895cbbb6d718512d92a100313b1ad18dda..253ed2b5442d892c22e3ca1b98835e8fdcbbbafb 100644 (file)
--- a/src/include/replication/snapbuild.h
+++ b/src/include/replication/snapbuild.h
@@ -22,16 +22,6 @@ typedef enum
          */
         SNAPBUILD_START,
  
-       /*
-        * We have collected enough information to decode tuples in transactions
-        * that started after this.
-        *
-        * Once we reached this we start to collect changes. We cannot apply them
-        * yet because the might be based on transactions that were still running
-        * when we reached them yet.
-        */
-       SNAPBUILD_FULL_SNAPSHOT,
-
         /*
          * Found a point after hitting built_full_snapshot where all transactions
          * that were running at that point finished. Till we reach that we hold
@@ -51,10 +41,8 @@ struct ReorderBuffer;
  struct xl_heap_new_cid;
  struct xl_running_xacts;
  
-extern void CheckPointSnapBuild(void);
-
  extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *cache,
-                                               TransactionId xmin_horizon, XLogRecPtr start_lsn);
+                                               XLogRecPtr start_lsn);
  extern void FreeSnapshotBuilder(SnapBuild *cache);
  
  extern void SnapBuildSnapDecRefcount(Snapshot snap);
@@ -80,6 +68,7 @@ extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
                                            XLogRecPtr lsn, struct xl_heap_new_cid *cid);
  extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn,
                                                          struct xl_running_xacts *running);
-extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn);
+extern void SnapBuildProcessInitialSnapshot(SnapBuild *builder, XLogRecPtr lsn,
+                                                               TransactionId xmin, TransactionId xmax);
  
  #endif   /* SNAPBUILD_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h

index 959f5f1e4d24e80d78650b8dc5eecd78622e82a6..2d581db43fba7d05ffd641e883dc84a3e8489264 100644 (file)
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -219,8 +219,8 @@ typedef enum BuiltinTrancheIds
  {
         LWTRANCHE_MAIN,
         LWTRANCHE_CLOG_BUFFERS,
+       LWTRANCHE_CSNLOG_BUFFERS,
         LWTRANCHE_COMMITTS_BUFFERS,
-       LWTRANCHE_SUBTRANS_BUFFERS,
         LWTRANCHE_MXACTOFFSET_BUFFERS,
         LWTRANCHE_MXACTMEMBER_BUFFERS,
         LWTRANCHE_ASYNC_BUFFERS,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h

index f576f052dfe6b71737e84e663f89e0302b7b126b..6ac35cdc4ceba16d3bab2a85ded8bc6e4d68eb21 100644 (file)
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -21,24 +21,6 @@
  #include "storage/pg_sema.h"
  #include "storage/proclist_types.h"
  
-/*
- * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
- * for non-aborted subtransactions of its current top transaction.  These
- * have to be treated as running XIDs by other backends.
- *
- * We also keep track of whether the cache overflowed (ie, the transaction has
- * generated at least one subtransaction that didn't fit in the cache).
- * If none of the caches have overflowed, we can assume that an XID that's not
- * listed anywhere in the PGPROC array is not a running transaction.  Else we
- * have to look at pg_subtrans.
- */
-#define PGPROC_MAX_CACHED_SUBXIDS 64   /* XXX guessed-at value */
-
-struct XidCache
-{
-       TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS];
-};
-
  /* Flags for PGXACT->vacuumFlags */
  #define                PROC_IS_AUTOVACUUM      0x01    /* is it an autovac worker? */
  #define                PROC_IN_VACUUM          0x02    /* currently running lazy vacuum */
@@ -140,8 +122,6 @@ struct PGPROC
          */
         SHM_QUEUE       myProcLocks[NUM_LOCK_PARTITIONS];
  
-       struct XidCache subxids;        /* cache for subtransaction XIDs */
-
         /* Support for group XID clearing. */
         /* true, if member of ProcArray group waiting for XID clear */
         bool            procArrayGroupMember;
@@ -188,6 +168,9 @@ extern PGDLLIMPORT struct PGXACT *MyPgXact;
   * considerably on systems with many CPU cores, by reducing the number of
   * cache lines needing to be fetched.  Thus, think very carefully before adding
   * anything else here.
+ *
+ * XXX: GetSnapshotData no longer does that, so perhaps we should put these
+ * back to PGPROC for simplicity's sake.
   */
  typedef struct PGXACT
  {
@@ -197,15 +180,17 @@ typedef struct PGXACT
  
         TransactionId xmin;                     /* minimal running XID as it was when we were
                                                                  * starting our xact, excluding LAZY VACUUM:
-                                                                * vacuum must not remove tuples deleted by
                                                                  * xid >= xmin ! */
  
+       CommitSeqNo     snapshotcsn;    /* oldest snapshot in use in this backend:
+                                                                * vacuum must not remove tuples deleted by
+                                                                * xacts with commit seqno > snapshotcsn !
+                                                                * XXX: currently unused, vacuum uses just xmin, still.
+                                                                */
+
         uint8           vacuumFlags;    /* vacuum-related flags, see above */
-       bool            overflowed;
         bool            delayChkpt;             /* true if this proc delays checkpoint start;
                                                                  * previously called InCommit */
-
-       uint8           nxids;
  } PGXACT;
  
  /*
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h

index dd37c0cb07086fc916fb731ca6ecc150b7d290de..d57a2ba9eeaa02514cc67c7b1656987e3afc5adf 100644 (file)
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -23,25 +23,17 @@
  extern Size ProcArrayShmemSize(void);
  extern void CreateSharedProcArray(void);
  extern void ProcArrayAdd(PGPROC *proc);
-extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
+extern void ProcArrayRemove(PGPROC *proc);
  
-extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
+extern void ProcArrayEndTransaction(PGPROC *proc);
  extern void ProcArrayClearTransaction(PGPROC *proc);
  
-extern void ProcArrayInitRecovery(TransactionId initializedUptoXID);
+extern void ProcArrayInitRecovery(TransactionId oldestActiveXID, TransactionId initializedUptoXID);
  extern void ProcArrayApplyRecoveryInfo(RunningTransactions running);
  extern void ProcArrayApplyXidAssignment(TransactionId topxid,
                                                         int nsubxids, TransactionId *subxids);
  
  extern void RecordKnownAssignedTransactionIds(TransactionId xid);
-extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid,
-                                                                         int nsubxids, TransactionId *subxids,
-                                                                         TransactionId max_xid);
-extern void ExpireAllKnownAssignedTransactionIds(void);
-extern void ExpireOldKnownAssignedTransactionIds(TransactionId xid);
-
-extern int     GetMaxSnapshotXidCount(void);
-extern int     GetMaxSnapshotSubxidCount(void);
  
  extern Snapshot GetSnapshotData(Snapshot snapshot);
  
@@ -51,7 +43,6 @@ extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc);
  
  extern RunningTransactions GetRunningTransactionData(void);
  
-extern bool TransactionIdIsInProgress(TransactionId xid);
  extern bool TransactionIdIsActive(TransactionId xid);
  extern TransactionId GetOldestXmin(Relation rel, bool ignoreVacuum);
  extern TransactionId GetOldestActiveTransactionId(void);
@@ -65,9 +56,8 @@ extern PGPROC *BackendPidGetProcWithLock(int pid);
  extern int     BackendXidGetPid(TransactionId xid);
  extern bool IsBackendPid(int pid);
  
-extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
-                                         bool excludeXmin0, bool allDbs, int excludeVacuum,
-                                         int *nvxids);
+extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
+                                         bool allDbs, int excludeVacuum, int *nvxids);
  extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid);
  extern pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode);
  
@@ -78,10 +68,6 @@ extern int   CountUserBackends(Oid roleid);
  extern bool CountOtherDBBackends(Oid databaseId,
                                          int *nbackends, int *nprepared);
  
-extern void XidCacheRemoveRunningXids(TransactionId xid,
-                                                 int nxids, const TransactionId *xids,
-                                                 TransactionId latestXid);
-
  extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
                                                         TransactionId catalog_xmin, bool already_locked);
  
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h

index dcebf72f85fef84fe616d44083716b70e96ae54a..a94865959b8bce123190870ac6d31d7fd58b11a8 100644 (file)
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -50,10 +50,7 @@ extern void StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid
  extern void StandbyReleaseLockTree(TransactionId xid,
                                            int nsubxids, TransactionId *subxids);
  extern void StandbyReleaseAllLocks(void);
-extern void StandbyReleaseOldLocks(int nxids, TransactionId *xids);
-
-#define MinSizeOfXactRunningXacts offsetof(xl_running_xacts, xids)
-
+extern void StandbyReleaseOldLocks(TransactionId oldestRunningXid);
  
  /*
   * Declarations for GetRunningTransactionData(). Similar to Snapshots, but
@@ -69,14 +66,8 @@ extern void StandbyReleaseOldLocks(int nxids, TransactionId *xids);
  
  typedef struct RunningTransactionsData
  {
-       int                     xcnt;                   /* # of xact ids in xids[] */
-       int                     subxcnt;                /* # of subxact ids in xids[] */
-       bool            subxid_overflow;        /* snapshot overflowed, subxids missing */
         TransactionId nextXid;          /* copy of ShmemVariableCache->nextXid */
         TransactionId oldestRunningXid;         /* *not* oldestXmin */
-       TransactionId latestCompletedXid;       /* so we can set xmax */
-
-       TransactionId *xids;            /* array of (sub)xids still running */
  } RunningTransactionsData;
  
  typedef RunningTransactionsData *RunningTransactions;
diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h

index ea22d77e07afe389d25f0ce848c2ac2bbf790cf3..b18fc098eedfcf1662fe8e2188439648756d4695 100644 (file)
--- a/src/include/storage/standbydefs.h
+++ b/src/include/storage/standbydefs.h
@@ -46,16 +46,13 @@ typedef struct xl_standby_locks
   */
  typedef struct xl_running_xacts
  {
-       int                     xcnt;                   /* # of xact ids in xids[] */
-       int                     subxcnt;                /* # of subxact ids in xids[] */
-       bool            subxid_overflow;        /* snapshot overflowed, subxids missing */
         TransactionId nextXid;          /* copy of ShmemVariableCache->nextXid */
         TransactionId oldestRunningXid;         /* *not* oldestXmin */
         TransactionId latestCompletedXid;       /* so we can set xmax */
-
-       TransactionId xids[FLEXIBLE_ARRAY_MEMBER];
  } xl_running_xacts;
  
+#define SizeOfXactRunningXacts (offsetof(xl_running_xacts, latestCompletedXid) + sizeof(TransactionId))
+
  /*
   * Invalidations for standby, currently only when transactions without an
   * assigned xid commit.
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h

index 9e3827249e2144e48ce7e0fa24477fae84867037..637e38fcf4aa5593504431628ce5f5efa50a767a 100644 (file)
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -57,7 +57,6 @@ extern int64 GetOldSnapshotThresholdTimestamp(void);
  extern bool FirstSnapshotSet;
  
  extern TransactionId TransactionXmin;
-extern TransactionId RecentXmin;
  extern TransactionId RecentGlobalXmin;
  extern TransactionId RecentGlobalDataXmin;
  
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h

index 998e2e593d06ea07eb6b6459332e70d6d7dd3820..fc4d0d35dcf2f186a130bf5f8031aa7d0de45f96 100644 (file)
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -57,37 +57,18 @@ typedef struct SnapshotData
          * just zeroes in special snapshots.  (But xmin and xmax are used
          * specially by HeapTupleSatisfiesDirty.)
          *
-        * An MVCC snapshot can never see the effects of XIDs >= xmax. It can see
-        * the effects of all older XIDs except those listed in the snapshot. xmin
-        * is stored as an optimization to avoid needing to search the XID arrays
-        * for most tuples.
+        * An MVCC snapshot can see the effects of those XIDs that committed
+        * at or before snapshotcsn. xmin and xmax are stored as an optimization,
+        * to avoid checking the commit sequence number for most tuples.
          */
         TransactionId xmin;                     /* all XID < xmin are visible to me */
         TransactionId xmax;                     /* all XID >= xmax are invisible to me */
  
         /*
-        * For normal MVCC snapshot this contains the all xact IDs that are in
-        * progress, unless the snapshot was taken during recovery in which case
-        * it's empty. For historic MVCC snapshots, the meaning is inverted, i.e.
-        * it contains *committed* transactions between xmin and xmax.
-        *
-        * note: all ids in xip[] satisfy xmin <= xip[i] < xmax
-        */
-       TransactionId *xip;
-       uint32          xcnt;                   /* # of xact ids in xip[] */
-
-       /*
-        * For non-historic MVCC snapshots, this contains subxact IDs that are in
-        * progress (and other transactions that are in progress if taken during
-        * recovery). For historic snapshot it contains *all* xids assigned to the
-        * replayed transaction, including the toplevel xid.
-        *
-        * note: all ids in subxip[] are >= xmin, but we don't bother filtering
-        * out any that are >= xmax
+        * This snapshot can see the effects of all transactions with CSN <=
+        * snapshotcsn.
          */
-       TransactionId *subxip;
-       int32           subxcnt;                /* # of xact ids in subxip[] */
-       bool            suboverflowed;  /* has the subxip array overflowed? */
+       CommitSeqNo     snapshotcsn;
  
         bool            takenDuringRecovery;    /* recovery-shaped snapshot? */
         bool            copied;                 /* false if it's a static snapshot */
@@ -100,6 +81,14 @@ typedef struct SnapshotData
          */
         uint32          speculativeToken;
  
+       /*
+        * this_xip contains *all* xids assigned to the replayed transaction,
+        * including the toplevel xid. Used only in a historic MVCC snapshot,
+        * used in logical decoding.
+        */
+       TransactionId *this_xip;
+       uint32          this_xcnt;                      /* # of xact ids in this_xip[] */
+
         /*
          * Book-keeping information, used by the snapshot manager
          */
diff --git a/src/test/regress/expected/txid.out b/src/test/regress/expected/txid.out

index ddd217eb1024f6ddb595155838ff698d36e4618f..6c0b1edf11a6d43aa3a8233f01cb01825c440193 100644 (file)
--- a/src/test/regress/expected/txid.out
+++ b/src/test/regress/expected/txid.out
@@ -1,205 +1,45 @@
  -- txid_snapshot data type and related functions
  -- i/o
-select '12:13:'::txid_snapshot;
+select '12:0/ABCDABCD'::txid_snapshot;
   txid_snapshot 
  ---------------
- 12:13:
-(1 row)
-
-select '12:18:14,16'::txid_snapshot;
- txid_snapshot 
----------------
- 12:18:14,16
-(1 row)
-
-select '12:16:14,14'::txid_snapshot;
- txid_snapshot 
----------------
- 12:16:14
+ 12:0/ABCDABCD
  (1 row)
  
  -- errors
-select '31:12:'::txid_snapshot;
-ERROR:  invalid input syntax for type txid_snapshot: "31:12:"
-LINE 1: select '31:12:'::txid_snapshot;
-               ^
-select '0:1:'::txid_snapshot;
-ERROR:  invalid input syntax for type txid_snapshot: "0:1:"
-LINE 1: select '0:1:'::txid_snapshot;
-               ^
-select '12:13:0'::txid_snapshot;
-ERROR:  invalid input syntax for type txid_snapshot: "12:13:0"
-LINE 1: select '12:13:0'::txid_snapshot;
-               ^
-select '12:16:14,13'::txid_snapshot;
-ERROR:  invalid input syntax for type txid_snapshot: "12:16:14,13"
-LINE 1: select '12:16:14,13'::txid_snapshot;
-               ^
+select '0:0/ABCDABCD'::txid_snapshot;
+ERROR:  invalid input for txid_snapshot: "0:0/ABCDABCD"
+LINE 1: select '0:0/ABCDABCD'::txid_snapshot;
  create temp table snapshot_test (
         nr      integer,
         snap    txid_snapshot
  );
-insert into snapshot_test values (1, '12:13:');
-insert into snapshot_test values (2, '12:20:13,15,18');
-insert into snapshot_test values (3, '100001:100009:100005,100007,100008');
-insert into snapshot_test values (4, '100:150:101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131');
-select snap from snapshot_test order by nr;
-                                                                snap                                                                 
--------------------------------------------------------------------------------------------------------------------------------------
- 12:13:
- 12:20:13,15,18
- 100001:100009:100005,100007,100008
- 100:150:101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131
-(4 rows)
+insert into snapshot_test values (1, '12:0/ABCDABCD');
+     snap      
+---------------
+ 12:0/ABCDABCD
+(1 row)
  
-select  txid_snapshot_xmin(snap),
-       txid_snapshot_xmax(snap),
-       txid_snapshot_xip(snap)
+select txid_snapshot_xmax(snap)
  from snapshot_test order by nr;
- txid_snapshot_xmin | txid_snapshot_xmax | txid_snapshot_xip 
---------------------+--------------------+-------------------
-                 12 |                 20 |                13
-                 12 |                 20 |                15
-                 12 |                 20 |                18
-             100001 |             100009 |            100005
-             100001 |             100009 |            100007
-             100001 |             100009 |            100008
-                100 |                150 |               101
-                100 |                150 |               102
-                100 |                150 |               103
-                100 |                150 |               104
-                100 |                150 |               105
-                100 |                150 |               106
-                100 |                150 |               107
-                100 |                150 |               108
-                100 |                150 |               109
-                100 |                150 |               110
-                100 |                150 |               111
-                100 |                150 |               112
-                100 |                150 |               113
-                100 |                150 |               114
-                100 |                150 |               115
-                100 |                150 |               116
-                100 |                150 |               117
-                100 |                150 |               118
-                100 |                150 |               119
-                100 |                150 |               120
-                100 |                150 |               121
-                100 |                150 |               122
-                100 |                150 |               123
-                100 |                150 |               124
-                100 |                150 |               125
-                100 |                150 |               126
-                100 |                150 |               127
-                100 |                150 |               128
-                100 |                150 |               129
-                100 |                150 |               130
-                100 |                150 |               131
-(37 rows)
+ txid_snapshot_xmax 
+--------------------
+                 12
+(1 row)
  
+/*
  select id, txid_visible_in_snapshot(id, snap)
  from snapshot_test, generate_series(11, 21) id
  where nr = 2;
- id | txid_visible_in_snapshot 
-----+--------------------------
- 11 | t
- 12 | t
- 13 | f
- 14 | t
- 15 | f
- 16 | t
- 17 | t
- 18 | f
- 19 | t
- 20 | f
- 21 | f
-(11 rows)
  
  -- test bsearch
  select id, txid_visible_in_snapshot(id, snap)
  from snapshot_test, generate_series(90, 160) id
  where nr = 4;
- id  | txid_visible_in_snapshot 
------+--------------------------
-  90 | t
-  91 | t
-  92 | t
-  93 | t
-  94 | t
-  95 | t
-  96 | t
-  97 | t
-  98 | t
-  99 | t
- 100 | t
- 101 | f
- 102 | f
- 103 | f
- 104 | f
- 105 | f
- 106 | f
- 107 | f
- 108 | f
- 109 | f
- 110 | f
- 111 | f
- 112 | f
- 113 | f
- 114 | f
- 115 | f
- 116 | f
- 117 | f
- 118 | f
- 119 | f
- 120 | f
- 121 | f
- 122 | f
- 123 | f
- 124 | f
- 125 | f
- 126 | f
- 127 | f
- 128 | f
- 129 | f
- 130 | f
- 131 | f
- 132 | t
- 133 | t
- 134 | t
- 135 | t
- 136 | t
- 137 | t
- 138 | t
- 139 | t
- 140 | t
- 141 | t
- 142 | t
- 143 | t
- 144 | t
- 145 | t
- 146 | t
- 147 | t
- 148 | t
- 149 | t
- 150 | f
- 151 | f
- 152 | f
- 153 | f
- 154 | f
- 155 | f
- 156 | f
- 157 | f
- 158 | f
- 159 | f
- 160 | f
-(71 rows)
  
  -- test current values also
  select txid_current() >= txid_snapshot_xmin(txid_current_snapshot());
- ?column? 
-----------
- t
-(1 row)
+*/
  
  -- we can't assume current is always less than xmax, however
  select txid_visible_in_snapshot(txid_current(), txid_current_snapshot());
@@ -208,33 +48,12 @@ select txid_visible_in_snapshot(txid_current(), txid_current_snapshot());
   f
  (1 row)
  
+/*
  -- test 64bitness
  select txid_snapshot '1000100010001000:1000100010001100:1000100010001012,1000100010001013';
-                            txid_snapshot                            
----------------------------------------------------------------------
- 1000100010001000:1000100010001100:1000100010001012,1000100010001013
-(1 row)
-
  select txid_visible_in_snapshot('1000100010001012', '1000100010001000:1000100010001100:1000100010001012,1000100010001013');
- txid_visible_in_snapshot 
---------------------------
- f
-(1 row)
-
  select txid_visible_in_snapshot('1000100010001015', '1000100010001000:1000100010001100:1000100010001012,1000100010001013');
- txid_visible_in_snapshot 
---------------------------
- t
-(1 row)
-
  -- test 64bit overflow
  SELECT txid_snapshot '1:9223372036854775807:3';
-      txid_snapshot      
--------------------------
- 1:9223372036854775807:3
-(1 row)
-
  SELECT txid_snapshot '1:9223372036854775808:3';
-ERROR:  invalid input syntax for type txid_snapshot: "1:9223372036854775808:3"
-LINE 1: SELECT txid_snapshot '1:9223372036854775808:3';
-                             ^
+*/
diff --git a/src/test/regress/sql/txid.sql b/src/test/regress/sql/txid.sql

index b6650b922e6d4b0df87fde088be491600b386e2f..b3809b0cfa10cb28e30d0b627fc7eeb8eacece45 100644 (file)
--- a/src/test/regress/sql/txid.sql
+++ b/src/test/regress/sql/txid.sql
@@ -1,32 +1,22 @@
  -- txid_snapshot data type and related functions
  
  -- i/o
-select '12:13:'::txid_snapshot;
-select '12:18:14,16'::txid_snapshot;
-select '12:16:14,14'::txid_snapshot;
+select '12:0/ABCDABCD'::txid_snapshot;
  
  -- errors
-select '31:12:'::txid_snapshot;
-select '0:1:'::txid_snapshot;
-select '12:13:0'::txid_snapshot;
-select '12:16:14,13'::txid_snapshot;
+select '0:0/ABCDABCD'::txid_snapshot;
  
  create temp table snapshot_test (
         nr      integer,
         snap    txid_snapshot
  );
  
-insert into snapshot_test values (1, '12:13:');
-insert into snapshot_test values (2, '12:20:13,15,18');
-insert into snapshot_test values (3, '100001:100009:100005,100007,100008');
-insert into snapshot_test values (4, '100:150:101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131');
+insert into snapshot_test values (1, '12:0/ABCDABCD');
  select snap from snapshot_test order by nr;
  
-select  txid_snapshot_xmin(snap),
-       txid_snapshot_xmax(snap),
-       txid_snapshot_xip(snap)
+select  txid_snapshot_xmax(snap)
  from snapshot_test order by nr;
-
+/*
  select id, txid_visible_in_snapshot(id, snap)
  from snapshot_test, generate_series(11, 21) id
  where nr = 2;
@@ -35,7 +25,7 @@ where nr = 2;
  select id, txid_visible_in_snapshot(id, snap)
  from snapshot_test, generate_series(90, 160) id
  where nr = 4;
-
+*/
  -- test current values also
  select txid_current() >= txid_snapshot_xmin(txid_current_snapshot());
  
@@ -43,6 +33,7 @@ select txid_current() >= txid_snapshot_xmin(txid_current_snapshot());
  
  select txid_visible_in_snapshot(txid_current(), txid_current_snapshot());
  
+/*
  -- test 64bitness
  
  select txid_snapshot '1000100010001000:1000100010001100:1000100010001012,1000100010001013';
@@ -52,3 +43,4 @@ select txid_visible_in_snapshot('1000100010001015', '1000100010001000:1000100010
  -- test 64bit overflow
  SELECT txid_snapshot '1:9223372036854775807:3';
  SELECT txid_snapshot '1:9223372036854775808:3';
+*/
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Mon, 22 Aug 2016 11:00:57 +0000 (14:00 +0300)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Mon, 22 Aug 2016 18:21:58 +0000 (21:21 +0300)
doc/src/sgml/func.sgml		patch | blob | blame | history
src/backend/access/heap/heapam.c		patch | blob | blame | history
src/backend/access/nbtree/README		patch | blob | blame | history
src/backend/access/rmgrdesc/standbydesc.c		patch | blob | blame | history
src/backend/access/rmgrdesc/xactdesc.c		patch | blob | blame | history
src/backend/access/transam/Makefile		patch | blob | blame | history
src/backend/access/transam/README		patch | blob | blame | history
src/backend/access/transam/clog.c		patch | blob | blame | history
src/backend/access/transam/commit_ts.c		patch | blob | blame | history
src/backend/access/transam/csnlog.c	[new file with mode: 0644]	patch | blob
src/backend/access/transam/multixact.c		patch | blob | blame | history
src/backend/access/transam/subtrans.c	[deleted file]	patch | blob | blame | history
src/backend/access/transam/transam.c		patch | blob | blame | history
src/backend/access/transam/twophase.c		patch | blob | blame | history
src/backend/access/transam/varsup.c		patch | blob | blame | history
src/backend/access/transam/xact.c		patch | blob | blame | history
src/backend/access/transam/xlog.c		patch | blob | blame | history
src/backend/catalog/heap.c		patch | blob | blame | history
src/backend/commands/async.c		patch | blob | blame | history
src/backend/commands/matview.c		patch | blob | blame | history
src/backend/commands/tablecmds.c		patch | blob | blame | history
src/backend/replication/logical/decode.c		patch | blob | blame | history
src/backend/replication/logical/logical.c		patch | blob | blame | history
src/backend/replication/logical/reorderbuffer.c		patch | blob | blame | history
src/backend/replication/logical/snapbuild.c		patch | blob | blame | history
src/backend/storage/ipc/ipci.c		patch | blob | blame | history
src/backend/storage/ipc/procarray.c		patch | blob | blame | history
src/backend/storage/ipc/shmem.c		patch | blob | blame | history
src/backend/storage/ipc/standby.c		patch | blob | blame | history
src/backend/storage/lmgr/lmgr.c		patch | blob | blame | history
src/backend/storage/lmgr/lwlocknames.txt		patch | blob | blame | history
src/backend/storage/lmgr/predicate.c		patch | blob | blame | history
src/backend/storage/lmgr/proc.c		patch | blob | blame | history
src/backend/utils/adt/txid.c		patch | blob | blame | history
src/backend/utils/probes.d		patch | blob | blame | history
src/backend/utils/time/snapmgr.c		patch | blob | blame | history
src/backend/utils/time/tqual.c		patch | blob | blame | history
src/bin/initdb/initdb.c		patch | blob | blame | history
src/include/access/clog.h		patch | blob | blame | history
src/include/access/csnlog.h	[new file with mode: 0644]	patch | blob
src/include/access/mvccvars.h	[new file with mode: 0644]	patch | blob
src/include/access/subtrans.h		patch | blob | blame | history
src/include/access/transam.h		patch | blob | blame | history
src/include/access/xact.h		patch | blob | blame | history
src/include/access/xlog.h		patch | blob | blame | history
src/include/c.h		patch | blob | blame | history
src/include/catalog/pg_proc.h		patch | blob | blame | history
src/include/replication/snapbuild.h		patch | blob | blame | history
src/include/storage/lwlock.h		patch | blob | blame | history
src/include/storage/proc.h		patch | blob | blame | history
src/include/storage/procarray.h		patch | blob | blame | history
src/include/storage/standby.h		patch | blob | blame | history
src/include/storage/standbydefs.h		patch | blob | blame | history
src/include/utils/snapmgr.h		patch | blob | blame | history
src/include/utils/snapshot.h		patch | blob | blame | history
src/test/regress/expected/txid.out		patch | blob | blame | history
src/test/regress/sql/txid.sql		patch | blob | blame | history