From aece05fbe4baf5c38d7c6d4eb8c159883be717d7 Mon Sep 17 00:00:00 2001
From: Tatsuo Ishii
Date: Tue, 11 May 2021 19:51:19 +0900
Subject: [PATCH] Fix race condition between detach_false_primary and
 follow_primary command.

It was reported that if detach_false_primary and the follow_primary
command run concurrently, many problems occur:
https://www.pgpool.net/pipermail/pgpool-general/2021-April/007583.html

A typical problem is that no primary node is found at the end. I
confirmed that this can be easily reproduced:
https://www.pgpool.net/pipermail/pgpool-hackers/2021-May/003893.html

In this commit new functions pool_acquire_follow_primary_lock(bool block)
and pool_release_follow_primary_lock(void) are introduced. They are
responsible for acquiring and releasing the lock. There are 3 places
where those functions are used:

1) find_primary_node

This function is called upon startup and failover in the main pgpool
process to find the new primary node.

2) fork_follow_child

This function is called in the follow_primary_command subprocess forked
off by the pgpool main process to execute the follow_primary_command
script. The lock must be held until all follow_primary_command
executions are completed.

3) streaming replication check

Before starting verify_backend_node_status, which is the workhorse of
detach_false_primary, the lock must be acquired. If that fails, the
streaming replication check cycle is simply skipped.

The commit also deals with the case where watchdog is enabled:
https://www.pgpool.net/pipermail/pgpool-hackers/2021-May/003894.html

Multiple pgpool nodes could perform detach_false_primary concurrently,
and this was the cause of the problem. To fix this, detach_false_primary
is now performed only on the leader node. Also, if the quorum is absent,
detach_false_primary is not performed.
---
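Note: the locking protocol itself is small enough to show in isolation.
Below is a minimal, self-contained sketch of the same acquire/release
logic using plain POSIX primitives (a process-shared semaphore plus a
flag in anonymous shared memory) in place of pgpool's
pool_semaphore_lock()/pool_semaphore_unlock(), Req_info, and signal-mask
handling. The type lock_state_t and the function names here are
illustrative stand-ins, not part of the patch.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>
#include <semaphore.h>
#include <sys/mman.h>

/* Shared state; in pgpool this is Req_info plus FOLLOW_PRIMARY_SEM. */
typedef struct
{
	sem_t		mutex;					/* protects follow_primary_count */
	int			follow_primary_count;	/* non-zero while the lock is held */
} lock_state_t;

static lock_state_t *state;

/*
 * Mirror of pool_acquire_follow_primary_lock(): with block = true, poll
 * once a second until the lock is free; with block = false, give up
 * immediately so the caller can skip its check cycle.
 */
static bool
acquire_follow_primary_lock(bool block)
{
	for (;;)
	{
		sem_wait(&state->mutex);
		if (state->follow_primary_count <= 0)
			break;				/* free: take it below, mutex still held */
		sem_post(&state->mutex);
		if (!block)
			return false;		/* held by someone else */
		sleep(1);
	}
	state->follow_primary_count = 1;
	sem_post(&state->mutex);
	return true;
}

/* Mirror of pool_release_follow_primary_lock(). */
static void
release_follow_primary_lock(void)
{
	sem_wait(&state->mutex);
	state->follow_primary_count = 0;
	sem_post(&state->mutex);
}

int
main(void)
{
	/* Anonymous shared mapping so that forked children share the lock. */
	state = mmap(NULL, sizeof(*state), PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	sem_init(&state->mutex, 1, 1);	/* pshared = 1: usable across fork() */

	if (acquire_follow_primary_lock(true))
	{
		puts("lock held: follow_primary / detach_false_primary may run");
		release_follow_primary_lock();
	}
	return 0;
}

(Linux; compile with -pthread.) The real functions additionally block
signals via POOL_SETMASK2(&BlockSig, ...) while the semaphore is held, so
a signal handler cannot interrupt the critical section, and they poll
with sleep(1) in the blocking case; contention arises only around
failover, so the one-second poll is cheap.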
 src/include/pool.h                            |  10 +-
 src/main/pgpool_main.c                        |  79 ++++++++++++
 src/streaming_replication/pool_worker_child.c | 120 +++++++++++-----
 .../tests/018.detach_primary/test.sh          |  61 +++++++++++
 4 files changed, 237 insertions(+), 33 deletions(-)

diff --git a/src/include/pool.h b/src/include/pool.h
index 23599eb3c..9dd247786 100644
--- a/src/include/pool.h
+++ b/src/include/pool.h
@@ -379,7 +379,7 @@ typedef enum
 
 #define Min(x, y)		((x) < (y) ? (x) : (y))
 
-#define MAX_NUM_SEMAPHORES		7
+#define MAX_NUM_SEMAPHORES		8
 #define CONN_COUNTER_SEM 0
 #define REQUEST_INFO_SEM 1
 #define SHM_CACHE_SEM	2
@@ -387,6 +387,7 @@ typedef enum
 #define PCP_REQUEST_SEM	4
 #define ACCEPT_FD_SEM	5
 #define SI_CRITICAL_REGION_SEM	6
+#define FOLLOW_PRIMARY_SEM	7
 
 #define MAX_REQUEST_QUEUE_SIZE	10
 #define MAX_SEC_WAIT_FOR_CLUSTER_TRANSATION 10	/* time in seconds to keep
@@ -448,6 +449,9 @@ typedef struct
 	int			conn_counter;
 	bool		switching;		/* it true, failover or failback is in
 								 * progress */
+	/* true if follow primary command or detach_false_primary is in
+	 * execution */
+	bool		follow_primary_count;
 } POOL_REQUEST_INFO;
 
 /* description of row. corresponding to RowDescription message */
@@ -626,8 +630,8 @@ extern POOL_NODE_STATUS * verify_backend_node_status(POOL_CONNECTION_POOL_SLOT *
 extern POOL_NODE_STATUS * pool_get_node_status(void);
 extern void pool_set_backend_status_changed_time(int backend_id);
 extern int	get_next_main_node(void);
-
-
+extern bool pool_acquire_follow_primary_lock(bool block);
+extern void pool_release_follow_primary_lock(void);
 
 /* strlcpy.c */
 #ifndef HAVE_STRLCPY
diff --git a/src/main/pgpool_main.c b/src/main/pgpool_main.c
index 58947ae86..3d600af22 100644
--- a/src/main/pgpool_main.c
+++ b/src/main/pgpool_main.c
@@ -3102,6 +3102,7 @@ verify_backend_node_status(POOL_CONNECTION_POOL_SLOT * *slots)
 			}
 		}
 	}
+
 	}
 
 	return pool_node_status;
@@ -3184,7 +3185,9 @@ find_primary_node(void)
 	pfree(password);
 
 	/* Verify backend status */
+	pool_acquire_follow_primary_lock(true);
 	status = verify_backend_node_status(slots);
+	pool_release_follow_primary_lock();
 
 	for (i = 0; i < NUM_BACKENDS; i++)
 	{
@@ -3285,6 +3288,7 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
 	{
 		on_exit_reset();
 		SetProcessGlobalVaraibles(PT_FOLLOWCHILD);
+		pool_acquire_follow_primary_lock(true);
 		ereport(LOG,
 				(errmsg("start triggering follow command.")));
 		for (i = 0; i < pool_config->backend_desc->num_backends; i++)
@@ -3296,6 +3300,7 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
 			trigger_failover_command(i, pool_config->follow_primary_command,
 									 old_main_node, new_primary, old_primary);
 		}
+		pool_release_follow_primary_lock();
 		exit(0);
 	}
 	else if (pid == -1)
@@ -4220,3 +4225,77 @@ pool_set_backend_status_changed_time(int backend_id)
 	tval = time(NULL);
 	BACKEND_INFO(backend_id).status_changed_time = tval;
 }
+
+/*
+ * Acquire the lock on follow primary command execution.  The follow
+ * primary command and detach_false_primary must acquire this lock before
+ * running because they conflict with each other.  If the argument "block"
+ * is true, this function does not return until it succeeds in acquiring
+ * the lock.  Returns true if the lock was acquired.
+ */
+bool
+pool_acquire_follow_primary_lock(bool block)
+{
+	pool_sigset_t oldmask;
+	volatile int follow_primary_count;
+
+	for (;;)
+	{
+		POOL_SETMASK2(&BlockSig, &oldmask);
+		pool_semaphore_lock(FOLLOW_PRIMARY_SEM);
+		follow_primary_count = Req_info->follow_primary_count;
+
+		if (follow_primary_count <= 0)
+		{
+			/* the lock is not held by anyone */
+			ereport(DEBUG1,
+					(errmsg("pool_acquire_follow_primary_lock: lock was not held by anyone")));
+			break;
+		}
+		else if (follow_primary_count > 0 && !block)
+		{
+			pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+			POOL_SETMASK(&oldmask);
+			/* return and inform the caller that the lock was held by someone */
+			ereport(DEBUG1,
+					(errmsg("pool_acquire_follow_primary_lock: lock was held by someone %d", follow_primary_count)));
+			return false;
+		}
+
+		pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+		POOL_SETMASK(&oldmask);
+		ereport(DEBUG1,
+				(errmsg("pool_acquire_follow_primary_lock: lock was held by someone %d sleeping...", follow_primary_count)));
+		sleep(1);
+	}
+
+	/* acquire the lock */
+	Req_info->follow_primary_count = 1;
+	pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+	POOL_SETMASK(&oldmask);
+
+	ereport(DEBUG1,
+			(errmsg("pool_acquire_follow_primary_lock: succeeded in acquiring lock")));
+
+	return true;
+}
+
+/*
+ * Release the lock on follow primary command execution.
+ */
+void
+pool_release_follow_primary_lock(void)
+{
+	pool_sigset_t oldmask;
+
+	POOL_SETMASK2(&BlockSig, &oldmask);
+	pool_semaphore_lock(FOLLOW_PRIMARY_SEM);
+	Req_info->follow_primary_count = 0;
+	pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+	POOL_SETMASK(&oldmask);
+
+	ereport(DEBUG1,
+			(errmsg("pool_release_follow_primary_lock called")));
+}
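Both call sites in pgpool_main.c above use the blocking form of the lock
and bracket the conflicting work with it. Condensed into a runnable
outline (the pgpool routines are replaced by stubs; only the bracketing
pattern is taken from the patch):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the real pgpool routines (illustrative only). */
static bool acquire_lock(bool block) { (void) block; return true; }
static void release_lock(void) { }
static void verify_backend_node_status_stub(void) { puts("verify node status"); }
static void run_follow_primary_stub(int node) { printf("follow_primary on node %d\n", node); }

/*
 * find_primary_node(): node role verification runs entirely under the
 * lock, so a concurrent follow_primary_command cannot change node roles
 * in the middle of the check.
 */
static void
find_primary_node_outline(void)
{
	acquire_lock(true);			/* block until the lock is free */
	verify_backend_node_status_stub();
	release_lock();
}

/*
 * fork_follow_child(), child branch: the lock is held until
 * follow_primary_command has been run for every backend.
 */
static void
follow_child_outline(int num_backends)
{
	int			i;

	acquire_lock(true);
	for (i = 0; i < num_backends; i++)
		run_follow_primary_stub(i);
	release_lock();
}

int
main(void)
{
	find_primary_node_outline();
	follow_child_outline(3);
	return 0;
}

The important design point is the scope: fork_follow_child() holds the
lock across the whole loop over backends, not per backend, so a streaming
replication check can never observe a half-completed
follow_primary_command run.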
diff --git a/src/streaming_replication/pool_worker_child.c b/src/streaming_replication/pool_worker_child.c
index 5c2e9719e..8d3dcdc02 100644
--- a/src/streaming_replication/pool_worker_child.c
+++ b/src/streaming_replication/pool_worker_child.c
@@ -66,6 +66,9 @@
 #include "auth/md5.h"
 #include "auth/pool_hba.h"
 
+#include "watchdog/wd_internal_commands.h"
+#include "watchdog/watchdog.h"
+
 static POOL_CONNECTION_POOL_SLOT * slots[MAX_NUM_BACKENDS];
 static volatile sig_atomic_t reload_config_request = 0;
 static volatile sig_atomic_t restart_request = 0;
@@ -78,6 +81,7 @@ static unsigned long long int text_to_lsn(char *text);
 static RETSIGTYPE my_signal_handler(int sig);
 static RETSIGTYPE reload_config_handler(int sig);
 static void reload_config(void);
+static void sr_check_will_die(int code, Datum arg);
 
 #define CHECK_REQUEST \
 	do { \
@@ -96,6 +100,8 @@ static void reload_config(void);
 #define PG10_SERVER_VERSION	100000	/* PostgreSQL 10 server version num */
 #define PG91_SERVER_VERSION	90100	/* PostgreSQL 9.1 server version num */
 
+static volatile bool follow_primary_lock_acquired;
+
 /*
  * worker child main loop
  */
@@ -112,6 +118,11 @@ do_worker_child(void)
 	init_ps_display("", "", "", "");
 	set_ps_display("worker process", false);
 
+	/*
+	 * Install the callback for preparation of exit.
+	 */
+	on_system_exit(sr_check_will_die, (Datum) NULL);
+
 	/* set up signal handlers */
 	signal(SIGALRM, SIG_DFL);
 	signal(SIGTERM, my_signal_handler);
@@ -154,6 +165,7 @@ do_worker_child(void)
 	{
 		MemoryContextSwitchTo(WorkerMemoryContext);
 		MemoryContextResetAndDeleteChildren(WorkerMemoryContext);
+		WD_STATES	wd_status;
 
 		CHECK_REQUEST;
 
@@ -163,51 +175,91 @@ do_worker_child(void)
 		}
 
 		/*
-		 * If streaming replication mode, do time lag checking
+		 * Get watchdog status if watchdog is enabled.
 		 */
+		if (pool_config->use_watchdog)
+		{
+			wd_status = wd_internal_get_watchdog_local_node_state();
+			ereport(DEBUG1,
+					(errmsg("watchdog status: %d", wd_status)));
+		}
 
-		if (pool_config->sr_check_period > 0 && STREAM)
+		/*
+		 * If streaming replication mode, do time lag checking.  Also skip
+		 * if failover/failback is ongoing.
+		 */
+		if (pool_config->sr_check_period > 0 && STREAM &&
+			Req_info->switching == false)
 		{
-			establish_persistent_connection();
-			PG_TRY();
-			{
-				POOL_NODE_STATUS *node_status;
-				int			i;
+			/*
+			 * Acquire the follow primary lock.  If we fail to acquire the
+			 * lock, skip this check cycle and try again later.
+			 */
+			follow_primary_lock_acquired = false;
 
-				/* Do replication time lag checking */
-				check_replication_time_lag();
+			if (pool_acquire_follow_primary_lock(false) == true)
+			{
+				follow_primary_lock_acquired = true;
 
-				/* Check node status */
-				node_status = verify_backend_node_status(slots);
-				for (i = 0; i < NUM_BACKENDS; i++)
+				establish_persistent_connection();
+				PG_TRY();
 				{
-					ereport(DEBUG1,
-							(errmsg("node status[%d]: %d", i, node_status[i])));
+					POOL_NODE_STATUS *node_status;
+					int			i;
+
+					/* Do replication time lag checking */
+					check_replication_time_lag();
 
-					if (node_status[i] == POOL_NODE_STATUS_INVALID)
+					/* Check node status */
+					node_status = verify_backend_node_status(slots);
+
+					for (i = 0; i < NUM_BACKENDS; i++)
 					{
-						int			n;
+						ereport(DEBUG1,
+								(errmsg("node status[%d]: %d", i, node_status[i])));
 
-						ereport(LOG,
-								(errmsg("pgpool_worker_child: invalid node found %d", i)));
-						if (pool_config->detach_false_primary)
+						if (node_status[i] == POOL_NODE_STATUS_INVALID)
 						{
-							n = i;
-							degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER | REQ_DETAIL_CONFIRMED);
+							int			n;
+
+							ereport(LOG,
+									(errmsg("pgpool_worker_child: invalid node found %d", i)));
+							/*
+							 * If detach_false_primary is enabled, send a
+							 * degenerate request to detach the invalid
+							 * node.  If watchdog is enabled, this should
+							 * happen only on the leader watchdog node and
+							 * only when the quorum exists.  Other nodes
+							 * will be informed by the leader node later
+							 * on.
+							 */
+							if ((pool_config->detach_false_primary && !pool_config->use_watchdog) ||
+								(pool_config->detach_false_primary && pool_config->use_watchdog &&
+								 wd_internal_get_watchdog_quorum_state() >= 0 && wd_status == WD_COORDINATOR))
+							{
+								n = i;
+								degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER | REQ_DETAIL_CONFIRMED);
+							}
 						}
 					}
 				}
-			}
-			PG_CATCH();
-			{
+				PG_CATCH();
+				{
+					discard_persistent_connection();
+					pool_release_follow_primary_lock();
+					follow_primary_lock_acquired = false;
+					sleep(pool_config->sr_check_period);
+					PG_RE_THROW();
+				}
+				PG_END_TRY();
+
+				/* Discard persistent connections */
 				discard_persistent_connection();
-				sleep(pool_config->sr_check_period);
-				PG_RE_THROW();
+				if (follow_primary_lock_acquired)
+				{
+					pool_release_follow_primary_lock();
+					follow_primary_lock_acquired = false;
+				}
 			}
-			PG_END_TRY();
-
-			/* Discard persistent connections */
-			discard_persistent_connection();
 		}
 		sleep(pool_config->sr_check_period);
 	}
@@ -599,3 +651,11 @@ get_query_result(POOL_CONNECTION_POOL_SLOT * *slots, int backend_id, char *query
 	sts = 0;
 	return sts;
 }
+
+static void
+sr_check_will_die(int code, Datum arg)
+{
+	if (follow_primary_lock_acquired)
+		pool_release_follow_primary_lock();
+}
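In the watchdog case the worker child gates degenerate_backend_set() on
both leadership and quorum. The predicate can be read in isolation; here
is a small sketch. The conventions (WD_COORDINATOR for the leader, a
quorum state >= 0 meaning the quorum exists) follow the patch, but the
function name and enum below are stand-ins, not pgpool's:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for pgpool's WD_STATES; only the value tested here matters. */
typedef enum
{
	SKETCH_WD_STANDBY,
	SKETCH_WD_COORDINATOR
} sketch_wd_state;

/*
 * May this node send a degenerate request for a false primary?
 * Without watchdog: yes, whenever detach_false_primary is on.
 * With watchdog: only on the leader (coordinator) node, and only while
 * the quorum exists (quorum_state >= 0 in the patch).
 */
static bool
may_detach_false_primary(bool detach_false_primary, bool use_watchdog,
						 int quorum_state, sketch_wd_state wd_state)
{
	if (!detach_false_primary)
		return false;
	if (!use_watchdog)
		return true;
	return quorum_state >= 0 && wd_state == SKETCH_WD_COORDINATOR;
}

int
main(void)
{
	/* only pgpool0 up: quorum absent -> the false primary is left alone */
	printf("quorum absent : %d\n",
		   may_detach_false_primary(true, true, -1, SKETCH_WD_COORDINATOR));
	/* all three watchdog nodes up: the leader detaches the false primary */
	printf("quorum present: %d\n",
		   may_detach_false_primary(true, true, 0, SKETCH_WD_COORDINATOR));
	return 0;
}

This is exactly the behavior the extended regression test below checks:
with only pgpool0 up the quorum is absent and the false primary must stay
attached; once pgpool1 and pgpool2 start, the quorum exists and the
leader must detach it.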
diff --git a/src/test/regression/tests/018.detach_primary/test.sh b/src/test/regression/tests/018.detach_primary/test.sh
index 8d54d09e8..d3006324c 100755
--- a/src/test/regression/tests/018.detach_primary/test.sh
+++ b/src/test/regression/tests/018.detach_primary/test.sh
@@ -54,4 +54,65 @@ fi
 
 ./shutdownall
 
+#
+# test with watchdog enabled
+#
+
+# wipe out everything
+cd ..
+rm -fr $TESTDIR
+mkdir $TESTDIR
+cd $TESTDIR
+
+# create a 3-node pgpool cluster with 3 backends
+$WATCHDOG_SETUP -wn 3 -n 3
+
+# enable detach_false_primary
+for i in 0 1 2
+do
+    echo "detach_false_primary = on" >> pgpool$i/etc/pgpool.conf
+done
+
+# start only pgpool0 and the backends so that the quorum is absent
+cd pgpool0
+source ./bashrc.ports
+./startall
+cd ..
+export PGPORT=$PGPOOL_PORT
+wait_for_pgpool_startup
+
+# promote the third node (node 2) to create a false primary
+$PG_CTL -D pgpool0/data2 promote
+sleep 10
+wait_for_pgpool_startup
+
+$PSQL -c "show pool_nodes" postgres|grep down
+if [ $? = 0 ];then
+    echo "node is down even though the quorum is absent"
+    ./shutdownall
+    exit 1
+fi
+
+# start pgpool1 and pgpool2 so that the quorum exists
+echo "testing the case when the quorum exists"
+cd pgpool1
+./startall
+cd ..
+cd pgpool2
+./startall
+cd ..
+sleep 10
+pcp_watchdog_info -v -w -p $PCP_PORT
+
+$PSQL -c "show pool_nodes" postgres
+
+$PSQL -c "show pool_nodes" postgres|grep down
+if [ $? != 0 ];then
+    echo "node is not down even though the quorum exists"
+    ./shutdownall
+    exit 1
+fi
+
+./shutdownall
+
 exit 0
-- 
2.39.5