From 59fdb1b8d598e61c62053ad70f3e8e4140b453e7 Mon Sep 17 00:00:00 2001
From: Tatsuo Ishii <ishii@sraoss.co.jp>
Date: Tue, 8 Jun 2021 19:06:13 +0900
Subject: [PATCH] Fix pcp_detach_node leaves down node.

Detaching the primary node using pcp_detach_node leaves a standby node
down after the follow primary command has been executed.

This can be reproduced reliably by the following steps:

$ pgpool_setup -n 4
$ ./startall
$ pcp_detach_node -p 11001 0

This happens because pcp_recovery_node is rejected by the pcp child process:

2021-06-05 07:22:17: follow_child pid 6593: LOG:  execute command: /home/t-ishii/work/Pgpool-II/current/x/etc/follow_primary.sh 3 /tmp 11005 /home/t-ishii/work/Pgpool-II/current/x/data3 1 0 /tmp 0 11002 /home/t-ishii/work/Pgpool-II/current/x/data0
2021-06-05 07:22:17: pcp_main pid 6848: LOG:  forked new pcp worker, pid=7027 socket=6
2021-06-05 07:22:17: pcp_child pid 7027: ERROR:  failed to process PCP request at the moment
2021-06-05 07:22:17: pcp_child pid 7027: DETAIL:  failback is in progress

It complains that a failback request is still in progress. The reason
the failback has not completed is that find_primary_node_repeatedly()
is trying to acquire the follow primary lock. However, the follow
primary command has already acquired the lock and is waiting for the
completion of the failback request. Thus this is a kind of deadlock.

How to solve this?

The purpose of the follow primary lock is to prevent the follow primary
command and the detach false primary operation (triggered by the
streaming replication check) from running concurrently. We cannot throw
it away. However, it is not always necessary for
find_primary_node_repeatedly() to acquire the lock. If it does not try
to acquire the lock, failover/failback will not be blocked and will
finish soon, thus the Req_info->switching flag will be promptly set to
false.

When a primary node is detached, the failover command is called and a
new primary is selected. At this point find_primary_node_repeatedly()
definitely needs to run to find the new primary. However, once the
follow primary command starts, the primary will not be changed. So my
idea is that find_primary_node_repeatedly() checks whether the follow
primary command is running or not. If it is running, it just returns
the current primary. Otherwise it acquires the lock.

For this purpose, a new shared memory variable,
Req_info->follow_primary_ongoing, is introduced. The flag is set/unset
by the follow primary process.

New regression test 075.detach_primary_left_down_node is added.

Discussion: https://www.pgpool.net/pipermail/pgpool-hackers/2021-June/003916.html
---
 src/include/pool.h                            |  1 +
 src/main/pgpool_main.c                        | 22 +++++++++
 .../075.detach_primary_left_down_node/test.sh | 49 +++++++++++++++++++
 3 files changed, 72 insertions(+)
 create mode 100755 src/test/regression/tests/075.detach_primary_left_down_node/test.sh

diff --git a/src/include/pool.h b/src/include/pool.h
index 4a3f67cee..2950d2904 100644
--- a/src/include/pool.h
+++ b/src/include/pool.h
@@ -452,6 +452,7 @@ typedef struct
 	/* greater than 0 if follow primary command or detach_false_primary in
 	 * execution */
 	bool		follow_primary_count;
+	bool		follow_primary_ongoing;	/* true if follow primary command is ongoing */
 }			POOL_REQUEST_INFO;
 
 /* description of row. corresponding to RowDescription message */
diff --git a/src/main/pgpool_main.c b/src/main/pgpool_main.c
index 35c6bc0d2..8f65ea857 100644
--- a/src/main/pgpool_main.c
+++ b/src/main/pgpool_main.c
@@ -2040,8 +2040,11 @@ failover(void)
 		}
 		need_to_restart_pcp = true;
 	}
+
+	pool_semaphore_lock(REQUEST_INFO_SEM);
 	switching = 0;
 	Req_info->switching = false;
+	pool_semaphore_unlock(REQUEST_INFO_SEM);
 
 	/*
 	 * kick wakeup_handler in pcp_child to notice that failover/failback done
@@ -2824,9 +2827,12 @@ trigger_failover_command(int node, const char *command_line,
 
 	if (strlen(exec_cmd->data) != 0)
 	{
+		pool_sigset_t oldmask;
 		ereport(LOG,
 				(errmsg("execute command: %s", exec_cmd->data)));
+		POOL_SETMASK2(&UnBlockSig, &oldmask);
 		r = system(exec_cmd->data);
+		POOL_SETMASK(&oldmask);
 	}
 
 	free_string(exec_cmd);
@@ -3246,6 +3252,20 @@ find_primary_node_repeatedly(void)
 		return -1;
 	}
 
+	/*
+	 * If follow primary command is ongoing, skip primary node check.  Just
+	 * return current primary node to avoid deadlock between pgpool main
+	 * failover() and follow primary process.
+	 */
+	if (Req_info->follow_primary_ongoing)
+	{
+		ereport(LOG,
+				(errmsg("find_primary_node_repeatedly: follow primary is ongoing. return current primary: %d",
+						Req_info->primary_node_id)));
+
+		return Req_info->primary_node_id;
+	}
+
 	/*
 	 * If all of the backends are down, there's no point to keep on searching
 	 * primary node.
@@ -3296,6 +3316,7 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
 		on_exit_reset();
 		SetProcessGlobalVariables(PT_FOLLOWCHILD);
 		pool_acquire_follow_primary_lock(true);
+		Req_info->follow_primary_ongoing = true;
 		ereport(LOG,
 				(errmsg("start triggering follow command.")));
 		for (i = 0; i < pool_config->backend_desc->num_backends; i++)
@@ -3307,6 +3328,7 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
 				trigger_failover_command(i, pool_config->follow_primary_command,
 										 old_main_node, new_primary, old_primary);
 		}
+		Req_info->follow_primary_ongoing = false;
 		pool_release_follow_primary_lock();
 		exit(0);
 	}
diff --git a/src/test/regression/tests/075.detach_primary_left_down_node/test.sh b/src/test/regression/tests/075.detach_primary_left_down_node/test.sh
new file mode 100755
index 000000000..b6a51e61a
--- /dev/null
+++ b/src/test/regression/tests/075.detach_primary_left_down_node/test.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------------
+# test script for a case: detach primary node left down node.
+#
+# reported: https://www.pgpool.net/pipermail/pgpool-hackers/2021-June/003916.html
+
+source $TESTLIBS
+WHOAMI=`whoami`
+TESTDIR=testdir
+
+rm -fr $TESTDIR
+mkdir $TESTDIR
+cd $TESTDIR
+
+# create test environment
+echo -n "creating test environment..."
+$PGPOOL_SETUP -n 4 || exit 1
+echo "done."
+
+source ./bashrc.ports
+export PGPORT=$PGPOOL_PORT
+
+./startall
+wait_for_pgpool_startup
+
+# detach node 0
+$PGPOOL_INSTALL_DIR/bin/pcp_detach_node -w -p $PCP_PORT 0
+wait_for_pgpool_startup
+
+# check to see if all nodes are up
+echo -n "starting to check follow primary results: "
+date
+cnt=60
+while [ $cnt -gt 0 ]
+do
+    $PGBIN/psql -c "show pool_nodes" test|grep down
+    if [ $? != 0 ];then
+	echo "test succeeded"
+	./shutdownall
+	exit 0
+    fi
+    cnt=`expr $cnt - 1`
+    echo "cnt: $cnt"
+    sleep 1
+done
+echo "test failed".
+./shutdownall
+
+exit 1
-- 
2.39.5