From 5bf2280f7ef62315b4edaecd81ec64b96f1464fb Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@enterprisedb.com>
Date: Thu, 5 Feb 2009 13:21:20 +0200
Subject: [PATCH] Bring back startup checkpoints, plus some other small changes

---
 src/backend/access/transam/xlog.c | 200 ++++++++----------------------
 src/backend/postmaster/bgwriter.c |  13 --
 src/include/access/xlog.h         |  11 +-
 src/include/catalog/pg_control.h  |   2 +-
 4 files changed, 55 insertions(+), 171 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 961bcf3c43..0f647de5aa 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -422,7 +422,7 @@ static XLogRecPtr EndRecPtr;	/* end+1 of last record read. Also in shared mem */
 static XLogRecord *nextRecord = NULL;
 static TimeLineID lastPageTLI = 0;
 static XLogRecPtr minRecoveryPoint; /* local copy of ControlFile->minRecoveryPoint */
-static bool		  updateMinRecoveryPoint = true;
+static bool	updateMinRecoveryPoint = true;
 
 static bool InRedo = false;
 
@@ -440,7 +440,6 @@ static void XLogArchiveCleanup(const char *xlog);
 static void readRecoveryCommandFile(void);
 static void exitArchiveRecovery(TimeLineID endTLI,
 					uint32 endLogId, uint32 endLogSeg);
-static void exitRecovery(void);
 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 
@@ -526,10 +525,9 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	bool		updrqst;
 	bool		doPageWrites;
 	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
-	bool		isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END);
 
 	/* cross-check on whether we should be here or not */
-	if (IsRecoveryProcessingMode() && !isRecoveryEnd)
+	if (IsRecoveryProcessingMode())
 		elog(FATAL, "cannot make new WAL entries during recovery");
 
 	/* info's high bits are reserved for use by me */
@@ -1826,7 +1824,10 @@ XLogFlush(XLogRecPtr record)
 	XLogRecPtr	WriteRqstPtr;
 	XLogwrtRqst WriteRqst;
 
-	/* During REDO, we don't try to flush the WAL, but update minRecoveryPoint instead */
+	/*
+	 * During REDO, we don't try to flush the WAL, but update minRecoveryPoint
+	 * instead.
+	 */
 	if (IsRecoveryProcessingMode())
 	{
 		UpdateMinRecoveryPoint(record);
@@ -1930,7 +1931,7 @@ XLogFlush(XLogRecPtr record)
 	 * and so we will not force a restart for a bad LSN on a data page.
 	 */
 	if (XLByteLT(LogwrtResult.Flush, record))
-		elog(ERROR,
+		elog(InRecovery ? WARNING : ERROR,
 		"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
 			 record.xlogid, record.xrecoff,
 			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
@@ -2508,6 +2509,7 @@ XLogFileRead(uint32 log, uint32 seg, int emode)
 			snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
 					 xlogfname);
 			set_ps_display(activitymsg, false);
+
 			return fd;
 		}
 		if (errno != ENOENT)	/* unexpected failure? */
@@ -2788,7 +2790,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	 */
 	if (shutdown_requested && InRedo)
 	{
-		/* XXX: We should update minRecoveryPoint to the exact value here */
+		/* XXX: Is EndRecPtr always the right value? */
 		UpdateMinRecoveryPoint(EndRecPtr);
 		proc_exit(0);
 	}
@@ -4835,13 +4837,15 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
 	unlink(recoveryPath);		/* ignore any error */
 
 	/*
-	 * As of 8.4 we no longer rename the recovery.conf file out of the
-	 * way until after we have performed a full checkpoint. This ensures
-	 * that any crash between now and the end of the checkpoint does not
-	 * attempt to restart from a WAL file that is no longer available to us.
-	 * As soon as we remove recovery.conf we lose our recovery_command and
-	 * cannot reaccess WAL files from the archive.
+	 * Rename the config file out of the way, so that we don't accidentally
+	 * re-enter archive recovery mode in a subsequent crash.
 	 */
+	unlink(RECOVERY_COMMAND_DONE);
+	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not rename file \"%s\" to \"%s\": %m",
+						RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
 
 	ereport(LOG,
 			(errmsg("archive recovery complete")));
@@ -4977,7 +4981,6 @@ StartupXLOG(void)
 	bool		wasShutdown;
 	bool		reachedStopPoint = false;
 	bool		reachedMinRecoveryPoint = false;
-	bool		performedRecovery = false;
 	bool		haveBackupLabel = false;
 	XLogRecPtr	RecPtr,
 				LastRec,
@@ -5331,24 +5334,14 @@ StartupXLOG(void)
 				{
 					/*
 					 * We were requested to exit without finishing recovery.
-					 *
-					 * XXX: We should update minRecoveryPoint to the exact
-					 * value here.
 					 */
-					UpdateMinRecoveryPoint(EndRecPtr);
+					UpdateMinRecoveryPoint(ReadRecPtr);
 					proc_exit(0);
 				}
 
 				/*
 				 * Have we reached our safe starting point? If so, we can
-				 * signal postmaster to enter consistent recovery mode.
-				 * XXX
-				 * There are two points in the log we must pass. The first is
-				 * the minRecoveryPoint, which is the LSN at the time the
-				 * base backup was taken that we are about to rollfoward from.
-				 * If recovery has ever crashed or was stopped there is 
-				 * another point also: minSafeStartPoint, which is the
-				 * latest LSN that recovery could have reached prior to crash.
+				 * tell postmaster that the database is consistent now.
 				 */
 				if (!reachedMinRecoveryPoint && 
 					 XLByteLE(minRecoveryPoint, EndRecPtr))
@@ -5437,7 +5430,7 @@ StartupXLOG(void)
 	 * Complain if we did not roll forward far enough to render the backup
 	 * dump consistent.
 	 */
-	if (InRecovery && !reachedMinRecoveryPoint)
+	if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
 	{
 		if (reachedStopPoint)	/* stopped because of stop request */
 			ereport(FATAL,
@@ -5539,6 +5532,12 @@ StartupXLOG(void)
 	/* Pre-scan prepared transactions to find out the range of XIDs present */
 	oldestActiveXID = PrescanPreparedTransactions();
 
+	/*
+	 * Allow this process to write WAL, but not other backends yet. That is
+	 * done after writing the shutdown checkpoint and finishing recovery.
+	 */
+	LocalRecoveryProcessingMode = false;
+
 	if (InRecovery)
 	{
 		int			rmid;
@@ -5559,14 +5558,30 @@ StartupXLOG(void)
 		XLogCheckInvalidPages();
 
 		/*
-		 * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
-		 * a shutdown checkpoint here, but we ask bgwriter to do that now.
+		 * Perform a checkpoint to flush all our recovery activity to disk.
+		 *
+		 * Note that we write a shutdown checkpoint rather than an on-line
+		 * one. This is not particularly critical, but since we may be
+		 * assigning a new TLI, using a shutdown checkpoint allows us to have
+		 * the rule that TLI only changes in shutdown checkpoints, which
+		 * allows some extra error checking in xlog_redo.
 		 */
-		exitRecovery();
-
-		performedRecovery = true;
+		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
 	}
 
+	/*
+	 * Preallocate additional log files, if wanted.
+	 */
+	PreallocXlogFiles(EndOfLog);
+
+	InRecovery = false;
+
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	ControlFile->state = DB_IN_PRODUCTION;
+	ControlFile->time = (pg_time_t) time(NULL);
+	UpdateControlFile();
+	LWLockRelease(ControlFileLock);
+
 	/* start the archive_timeout timer running */
 	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
 
@@ -5605,38 +5620,9 @@ StartupXLOG(void)
 	}
 
 	/*
-	 * If we had to replay any WAL records, request a checkpoint. This isn't
-	 * strictly necessary: if we crash now, the recovery will simply restart
-	 * from the same point as this time (or from the last restartpoint). The
-	 * control file is left in DB_IN_*_RECOVERY state; the first checkpoint
-	 * will change that to DB_IN_PRODUCTION.
+	 * All done. Allow others to write WAL.
 	 */
-	if (performedRecovery)
-	{
-		/*
-		 * Okay, we can come up now. Allow others to write WAL.
-		 */
-		XLogCtl->SharedRecoveryProcessingMode = false;
-
-		RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE |
-						  CHECKPOINT_STARTUP);
-	}
-	else
-	{
-		/*
-		 * No recovery, so let's just get on with it. 
-		 */
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-		ControlFile->state = DB_IN_PRODUCTION;
-		ControlFile->time = (pg_time_t) time(NULL);
-		UpdateControlFile();
-		LWLockRelease(ControlFileLock);
-
-		/*
-		 * Okay, we're officially UP.
-		 */
-		XLogCtl->SharedRecoveryProcessingMode = false;
-	}
+	XLogCtl->SharedRecoveryProcessingMode = false;
 }
 
 /*
@@ -5946,7 +5932,6 @@ LogCheckpointStart(int flags, bool restartpoint)
 
 	elog(LOG, msg,
 		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
-		 (flags & CHECKPOINT_STARTUP) ? " startup" : "",
 		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
 		 (flags & CHECKPOINT_FORCE) ? " force" : "",
 		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
@@ -6030,7 +6015,6 @@ CreateCheckPoint(int flags)
 	uint32		_logSeg;
 	TransactionId *inCommitXids;
 	int			nInCommit;
-	bool		leavingArchiveRecovery;
 
 	/* shouldn't happen */
 	if (IsRecoveryProcessingMode())
@@ -6044,13 +6028,6 @@ CreateCheckPoint(int flags)
 	 */
 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
 
-	/*
-	 * Find out if this is the first checkpoint after archive recovery.
-	 */
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	leavingArchiveRecovery = (ControlFile->state == DB_IN_ARCHIVE_RECOVERY);
-	LWLockRelease(ControlFileLock);
-
 	/*
 	 * Prepare to accumulate statistics.
 	 *
@@ -6284,10 +6261,6 @@ CreateCheckPoint(int flags)
 	 * if this is the first checkpoint after recovery.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	if (shutdown)
-		ControlFile->state = DB_SHUTDOWNED;
-	else
-		ControlFile->state = DB_IN_PRODUCTION;
 	ControlFile->prevCheckPoint = ControlFile->checkPoint;
 	ControlFile->checkPoint = ProcLastRecPtr;
 	ControlFile->checkPointCopy = checkPoint;
@@ -6295,21 +6268,6 @@ CreateCheckPoint(int flags)
 	UpdateControlFile();
 	LWLockRelease(ControlFileLock);
 
-	if (leavingArchiveRecovery)
-	{
-		/*
-		 * Rename the config file out of the way, so that we don't accidentally
-		 * re-enter archive recovery mode in a subsequent crash. Prior to
-		 * 8.4 this step was performed at end of exitArchiveRecovery().
-		 */
-		unlink(RECOVERY_COMMAND_DONE);
-		if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not rename file \"%s\" to \"%s\": %m",
-							RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
-	}
-
 	/* Update shared-memory copy of checkpoint XID/epoch */
 	{
 		/* use volatile pointer to prevent code rearrangement */
@@ -6588,39 +6546,6 @@ RequestXLogSwitch(void)
 	return RecPtr;
 }
 
-/*
- * exitRecovery()
- *
- * Exit recovery state and write a XLOG_RECOVERY_END record. This is the
- * only record type that can record a change of timelineID. We assume
- * caller has already set ThisTimeLineID, if appropriate.
- */
-static void
-exitRecovery(void)
-{
-	XLogRecData rdata;
-
-	rdata.buffer = InvalidBuffer;
-	rdata.data = (char *) (&ThisTimeLineID);
-	rdata.len = sizeof(TimeLineID);
-	rdata.next = NULL;
-
-	/*
-	 * This is the only type of WAL message that can be inserted during
-	 * recovery. This ensures that we don't allow others to get access
-	 * until after we have changed state.
-	 */
-	(void) XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata);
-
-	/*
-	 * We don't XLogFlush() here otherwise we'll end up zeroing the WAL
-	 * file ourselves. So just let bgwriter's forthcoming checkpoint do
-	 * that for us.
-	 */
-
-	InRecovery = false;
-}
-
 /*
  * XLOG resource manager's routines
  *
@@ -6669,33 +6594,6 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 
 		RecoveryRestartPoint(&checkPoint);
 	}
-	else if (info == XLOG_RECOVERY_END)
-	{
-		TimeLineID	tli;
-
-		memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
-
-		/*
-		 * TLI may change when recovery ends, but it shouldn't decrease.
-		 *
-		 * This is the only WAL record that can tell us to change timelineID
-		 * while we process WAL records. 
-		 *
-		 * We can *choose* to stop recovery at any point, generating a
-		 * new timelineID which is recorded using this record type.
-		 */
-		if (tli != ThisTimeLineID)
-		{
-			if (tli < ThisTimeLineID ||
-				!list_member_int(expectedTLIs,
-								 (int) tli))
-				ereport(PANIC,
-						(errmsg("unexpected timeline ID %u (after %u) at recovery end record",
-								tli, ThisTimeLineID)));
-			/* Following WAL records should be run with new TLI */
-			ThisTimeLineID = tli;
-		}
-	}
 	else if (info == XLOG_CHECKPOINT_ONLINE)
 	{
 		CheckPoint	checkPoint;
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 4c8c54c587..d38e0c6452 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -889,7 +889,6 @@ BgWriterShmemInit(void)
  *
  * flags is a bitwise OR of the following:
  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
- *	CHECKPOINT_IS_STARTUP: checkpoint is for database startup.
  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *		ignoring checkpoint_completion_target parameter.
  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
@@ -956,18 +955,6 @@ RequestCheckpoint(int flags)
 	{
 		if (BgWriterShmem->bgwriter_pid == 0)
 		{
-			/*
-			 * The only difference between a startup checkpoint and a normal
-			 * online checkpoint is that it's quite normal for the bgwriter
-			 * to not be up yet when the startup checkpoint is requested.
-			 * (it might be, though). That's ok, background writer will
-			 * perform the checkpoint as soon as it starts up.
-			 */
-			if (flags & CHECKPOINT_STARTUP)
-			{
-				Assert(!(flags & CHECKPOINT_WAIT));
-				break;
-			}
 			if (ntries >= 20)		/* max wait 2.0 sec */
 			{
 				elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index b97a6afbf0..2a9ed7078e 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -168,14 +168,13 @@ extern bool XLOG_DEBUG;
 
 /* These directly affect the behavior of CreateCheckPoint and subsidiaries */
 #define CHECKPOINT_IS_SHUTDOWN	0x0001	/* Checkpoint is for shutdown */
-#define CHECKPOINT_IS_STARTUP	0x0002	/* Startup checkpoint */
-#define CHECKPOINT_IMMEDIATE	0x0003	/* Do it without delays */
-#define CHECKPOINT_FORCE		0x0008	/* Force even if no activity */
+#define CHECKPOINT_IMMEDIATE	0x0002	/* Do it without delays */
+#define CHECKPOINT_FORCE		0x0004	/* Force even if no activity */
 /* These are important to RequestCheckpoint */
-#define CHECKPOINT_WAIT			0x0010	/* Wait for completion */
+#define CHECKPOINT_WAIT			0x0008	/* Wait for completion */
 /* These indicate the cause of a checkpoint request */
-#define CHECKPOINT_CAUSE_XLOG	0x0020	/* XLOG consumption */
-#define CHECKPOINT_CAUSE_TIME	0x0040	/* Elapsed time */
+#define CHECKPOINT_CAUSE_XLOG	0x0010	/* XLOG consumption */
+#define CHECKPOINT_CAUSE_TIME	0x0020	/* Elapsed time */
 
 /* Checkpoint statistics */
 typedef struct CheckpointStatsData
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 275fc1dddf..400f32c749 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -46,7 +46,7 @@ typedef struct CheckPoint
 #define XLOG_NOOP						0x20
 #define XLOG_NEXTOID					0x30
 #define XLOG_SWITCH						0x40
-#define XLOG_RECOVERY_END			0x50
+
 
 /* System status indicator */
 typedef enum DBState
-- 
2.39.5