From 6aad888295d1a6bebe53f2e2c2850bf3f8dc0d58 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@enterprisedb.com>
Date: Wed, 28 Jan 2009 10:04:35 +0200
Subject: [PATCH] Comment changes. Inline GetRedoLocationForCheckpoint into
 CreateCheckPoint again.

---
 src/backend/access/transam/xlog.c   | 133 ++++++++++------------------
 src/backend/postmaster/bgwriter.c   |  30 ++++---
 src/backend/postmaster/postmaster.c |  27 +++---
 3 files changed, 80 insertions(+), 110 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index b7a1e3504b..226e96953d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -119,7 +119,15 @@ CheckpointStatsData CheckpointStats;
  */
 TimeLineID	ThisTimeLineID = 0;
 
-/* Are we doing recovery from XLOG? */
+/*
+ * Are we doing recovery from XLOG?
+ *
+ * This is only ever true in the startup process, when it's replaying WAL.
+ * It's used in functions that need to act differently when called from a
+ * redo function (e.g. skip WAL logging).  To check whether the system is in
+ * recovery regardless of what process you're running in, use
+ * IsRecoveryProcessingMode().
+ */
 bool		InRecovery = false;
 
 /* Are we recovering using offline XLOG archives? */
@@ -251,30 +259,10 @@ static XLogRecPtr RedoRecPtr;
  * ControlFileLock: must be held to read/update control file or create
  * new log file.
  *
- * CheckpointLock: must be held to do a checkpoint or restartpoint, ensuring
- * we get just one of those at any time. In 8.4+ recovery, both startup and
- * bgwriter processes may take restartpoints, so this locking must be strict 
- * to ensure there are no mistakes.
+ * CheckpointLock: must be held to do a checkpoint (ensures only one
+ * checkpointer at a time; currently, with all checkpoints done by the
+ * bgwriter, this is just pro forma).
  *
- * In 8.4 we progress through a number of states at startup. Initially, the
- * postmaster is in PM_STARTUP state and spawns the Startup process. We then
- * progress until the database is in a consistent state, then if we are in
- * InArchiveRecovery we go into PM_RECOVERY state. The bgwriter then starts
- * up and takes over responsibility for performing restartpoints. We then
- * progress until the end of recovery when we enter PM_RUN state upon
- * termination of the Startup process. In summary:
- * 
- * PM_STARTUP state:	Startup process performs restartpoints
- * PM_RECOVERY state:	bgwriter process performs restartpoints
- * PM_RUN state: 		bgwriter process performs checkpoints
- *
- * These transitions are fairly delicate, with many things that need to
- * happen at the same time in order to change state successfully throughout
- * the system. Changing PM_STARTUP to PM_RECOVERY only occurs when we can
- * prove the databases are in a consistent state. Changing from PM_RECOVERY
- * to PM_RUN happens whenever recovery ends, which could be forced upon us
- * externally or it can occur becasue of damage or termination of the WAL
- * sequence.
  *----------
  */
 
@@ -344,15 +332,7 @@ typedef struct XLogCtlData
 
 	/*
 	 * SharedRecoveryProcessingMode indicates if we're still in crash or
-	 * archive recovery. You should use IsRecoveryProcessingMode() instead
-	 * of peeking at this variable directly, because it uses a cached value
-	 * after we exit recovery.
-	 *
-	 * We also retain a local state variable InRecovery. InRecovery=true
-	 * means the code is being executed by Startup process and therefore
-	 * always during Recovery Processing Mode. This allows us to identify
-	 * code executed *during* Recovery Processing Mode but not necessarily
-	 * by Startup process itself.
+	 * archive recovery. It's checked by IsRecoveryProcessingMode().
 	 */
 	bool		SharedRecoveryProcessingMode;
 
@@ -455,7 +435,6 @@ static void exitArchiveRecovery(TimeLineID endTLI,
 static void exitRecovery(void);
 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
-static XLogRecPtr GetRedoLocationForCheckpoint(void);
 
 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 				XLogRecPtr *lsn, BkpBlock *bkpb);
@@ -1789,6 +1768,7 @@ XLogFlush(XLogRecPtr record)
 	XLogRecPtr	WriteRqstPtr;
 	XLogwrtRqst WriteRqst;
 
+	/* Disabled while the system is in recovery */
 	if (IsRecoveryProcessingMode())
 		return;
 
@@ -5939,6 +5919,7 @@ CreateCheckPoint(int flags)
 	XLogRecPtr	recptr;
 	XLogCtlInsert *Insert = &XLogCtl->Insert;
 	XLogRecData rdata;
+	uint32		freespace;
 	uint32		_logId;
 	uint32		_logSeg;
 	TransactionId *inCommitXids;
@@ -5946,8 +5927,9 @@ CreateCheckPoint(int flags)
 
 	/*
 	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
-	 * That shouldn't be happening, but checkpoints are an important aspect
-	 * of our resilience, so we take no chances.
+	 * (This is just pro forma, since in the present system structure there is
+	 * only one process that is allowed to issue checkpoints at any given
+	 * time.)
 	 */
 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
 
@@ -6036,7 +6018,34 @@ CreateCheckPoint(int flags)
 	 * the buffer flush work.  Those XLOG records are logically after the
 	 * checkpoint, even though physically before it.  Got that?
 	 */
-	checkPoint.redo = GetRedoLocationForCheckpoint();
+	freespace = INSERT_FREESPACE(Insert);
+	if (freespace < SizeOfXLogRecord)
+	{
+		(void) AdvanceXLInsertBuffer(false);
+		/* OK to ignore update return flag, since we will do flush anyway */
+		freespace = INSERT_FREESPACE(Insert);
+	}
+	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
+
+	/*
+	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
+	 * must be done while holding the insert lock AND the info_lck.
+	 *
+	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
+	 * pointing past where it really needs to point.  This is okay; the only
+	 * consequence is that XLogInsert might back up whole buffers that it
+	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
+	 * XLogInserts that happen while we are dumping buffers must assume that
+	 * their buffer changes are not included in the checkpoint.
+	 */
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile XLogCtlData *xlogctl = XLogCtl;
+
+		SpinLockAcquire(&xlogctl->info_lck);
+		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
+		SpinLockRelease(&xlogctl->info_lck);
+	}
 
 	/*
 	 * Now we can release WAL insert lock, allowing other xacts to proceed
@@ -6161,18 +6170,15 @@ CreateCheckPoint(int flags)
 	 * that this is executed by bgwriter after the death of Startup process.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-
 	if (shutdown)
 		ControlFile->state = DB_SHUTDOWNED;
 	else
 		ControlFile->state = DB_IN_PRODUCTION;
-
 	ControlFile->prevCheckPoint = ControlFile->checkPoint;
 	ControlFile->checkPoint = ProcLastRecPtr;
 	ControlFile->checkPointCopy = checkPoint;
 	ControlFile->time = (pg_time_t) time(NULL);
 	UpdateControlFile();
-
 	LWLockRelease(ControlFileLock);
 
 	/* Update shared-memory copy of checkpoint XID/epoch */
@@ -6235,51 +6241,6 @@ CreateCheckPoint(int flags)
 	LWLockRelease(CheckpointLock);
 }
 
-/* 
- * GetRedoLocationForCheckpoint()
- *
- * When !IsRecoveryProcessingMode() this must be called while holding 
- * WALInsertLock.
- */
-static XLogRecPtr
-GetRedoLocationForCheckpoint()
-{
-	XLogCtlInsert  *Insert = &XLogCtl->Insert;
-	uint32			freespace;
-	XLogRecPtr		redo;
-
-	freespace = INSERT_FREESPACE(Insert);
-	if (freespace < SizeOfXLogRecord)
-	{
-		(void) AdvanceXLInsertBuffer(false);
-		/* OK to ignore update return flag, since we will do flush anyway */
-		freespace = INSERT_FREESPACE(Insert);
-	}
-	INSERT_RECPTR(redo, Insert, Insert->curridx);
-
-	/*
-	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
-	 * must be done while holding the insert lock AND the info_lck.
-	 *
-	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
-	 * pointing past where it really needs to point.  This is okay; the only
-	 * consequence is that XLogInsert might back up whole buffers that it
-	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
-	 * XLogInserts that happen while we are dumping buffers must assume that
-	 * their buffer changes are not included in the checkpoint.
-	 */
-	{
-		/* use volatile pointer to prevent code rearrangement */
-		volatile XLogCtlData *xlogctl = XLogCtl;
-
-		SpinLockAcquire(&xlogctl->info_lck);
-		RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
-		SpinLockRelease(&xlogctl->info_lck);
-	}
-
-	return redo;
-}
-
 /*
  * Flush all data in shared memory to disk, and fsync
  *
@@ -6551,7 +6512,7 @@ exitRecovery(void)
 }
 
 /*
- * XLOG resource manager's routines.
+ * XLOG resource manager's routines
  *
  * Definitions of message info are in include/catalog/pg_control.h,
  * though not all messages relate to control file processing.
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 8cbb7a3744..f9ab290823 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -362,10 +362,19 @@ BackgroundWriterMain(void)
 	BgWriterRecoveryMode = IsRecoveryProcessingMode();
 
 	if (BgWriterRecoveryMode)
-		elog(DEBUG1, "bgwriter starting during recovery, pid = %u", 
-			BgWriterShmem->bgwriter_pid);
+		elog(DEBUG1, "bgwriter starting during recovery");
+	else
+		InitXLOGAccess();
 
-	/* If someone requested a checkpoint before we started up, process that */
+	/*
+	 * If someone requested a checkpoint before we started up, process that.
+	 *
+	 * This check exists primarily for crash recovery: after the startup
+	 * process is finished with WAL replay, it will request a checkpoint, but
+	 * the background writer might not have started yet. This check will
+	 * actually not notice a checkpoint that's been requested without any
+	 * flags, but it's good enough for the startup checkpoint.
+	 */
 	SpinLockAcquire(&bgs->ckpt_lck);
 	if (bgs->ckpt_flags)
 		checkpoint_requested = true;
@@ -417,6 +426,13 @@ BackgroundWriterMain(void)
 			/* Normal exit from the bgwriter is here */
 			proc_exit(0);		/* done */
 		}
+ 		if (BgWriterRecoveryMode && !IsRecoveryProcessingMode())
+  		{
+			elog(DEBUG1, "bgwriter changing from recovery to normal mode");
+ 
+			InitXLOGAccess();
+			BgWriterRecoveryMode = false;
+		}
 
 		/*
 		 * Force a checkpoint if too much time has elapsed since the last one.
@@ -434,14 +450,6 @@ BackgroundWriterMain(void)
 			flags |= CHECKPOINT_CAUSE_TIME;
 		}
 
- 		if (BgWriterRecoveryMode && !IsRecoveryProcessingMode())
-  		{
-			elog(DEBUG2, "bgwriter changing from recovery to normal mode");
- 
-			InitXLOGAccess();
-			BgWriterRecoveryMode = false;
-		}
-
 		/*
 		 * Do a checkpoint if requested, otherwise do one cycle of
 		 * dirty-buffer writing.
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 0c0e5e4507..221c9b2aac 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -228,17 +228,18 @@ static bool FatalError = false; /* T if recovering from backend crash */
 
 /*
  * We use a simple state machine to control startup, shutdown, and
- * crash recovery (which is rather like shutdown followed by startup).
+ * recovery.
  *
- * Recovery is split into two phases: crash recovery and archive recovery.
- * The startup process begins with crash recovery, replaying WAL until
- * a self-consistent database state is reached. At that point, it signals
- * postmaster, and we switch to archive recovery phase. The background
- * writer is launched, and we can start accepting connections to perform
- * read-only queries, while the startup process continues applying WAL.
- * When the startup process exits, we switch to PM_RUN state. The startup
- * process can also skip the archive recovery altogether, as it will during
- * normal startup when there's no recovery to be done, for example.
+ * Recovery is split into two phases: crash recovery and consistent (archive)
+ * recovery.  The startup process begins with crash recovery, replaying WAL
+ * until a self-consistent database state is reached. At that point, it
+ * signals postmaster, and we switch to consistent recovery phase. The
+ * background writer is launched, while the startup process continues
+ * applying WAL.  We could start accepting connections to perform read-only
+ * queries at this point, if we had the infrastructure to do that. When the
+ * startup process exits, we switch to PM_RUN state. The startup process can
+ * also skip the consistent recovery altogether, as it will during normal
+ * startup when there's no recovery to be done, for example.
  *
  * Normal child backends can only be launched when we are in PM_RUN state.
  * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
@@ -264,7 +265,7 @@ typedef enum
 {
 	PM_INIT,					/* postmaster starting */
 	PM_STARTUP,					/* waiting for startup subprocess */
-	PM_RECOVERY,				/* archive recovery mode */
+	PM_RECOVERY,				/* consistent recovery mode */
 	PM_RUN,						/* normal "database is alive" state */
 	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
 	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
@@ -2169,8 +2170,8 @@ reaper(SIGNAL_ARGS)
 
 			/*
 			 * Crank up the background writer, if we didn't do that already
-			 * when we entered archive recovery phase.  It doesn't matter if
-			 * this fails, we'll just try again later.
+			 * when we entered consistent recovery phase.  It doesn't matter
+			 * if this fails, we'll just try again later.
 			 */
 			if (BgWriterPID == 0)
 				BgWriterPID = StartBackgroundWriter();
-- 
2.39.5