Fix pcp_detach_node leaving a down node.

author Tatsuo Ishii <ishii@sraoss.co.jp>

Tue, 8 Jun 2021 10:06:13 +0000 (19:06 +0900)

committer Tatsuo Ishii <ishii@sraoss.co.jp>

Tue, 8 Jun 2021 10:21:36 +0000 (19:21 +0900)
author Tatsuo Ishii <ishii@sraoss.co.jp>
Tue, 8 Jun 2021 10:06:13 +0000 (19:06 +0900)
committer Tatsuo Ishii <ishii@sraoss.co.jp>
Tue, 8 Jun 2021 10:21:36 +0000 (19:21 +0900)
diff --git a/src/include/pool.h b/src/include/pool.h

index 13be313a55bd535393ced7dc3f4a84712c28178f..8fb7e7b0423497a4a7bbd2184f614f31f3640704 100644 (file)
--- a/src/include/pool.h
+++ b/src/include/pool.h
@@ -452,6 +452,7 @@ typedef struct
         /* false if follow primary command or detach_false_primary in
          * execution */
         bool            follow_primary_count;
+       bool            follow_primary_ongoing; /* true if follow primary command is ongoing */
  }                      POOL_REQUEST_INFO;
  
  /* description of row. corresponding to RowDescription message */
diff --git a/src/main/pgpool_main.c b/src/main/pgpool_main.c

index f221d1f14581efa4974c5acacf80f795eca62838..8a5c11c29e023cb965b117b399a0afb4932a429d 100644 (file)
--- a/src/main/pgpool_main.c
+++ b/src/main/pgpool_main.c
@@ -2040,8 +2040,11 @@ failover(void)
                 }
                 need_to_restart_pcp = true;
         }
+
+       pool_semaphore_lock(REQUEST_INFO_SEM);
         switching = 0;
         Req_info->switching = false;
+       pool_semaphore_unlock(REQUEST_INFO_SEM);
  
         /*
          * kick wakeup_handler in pcp_child to notice that failover/failback done
@@ -2824,9 +2827,12 @@ trigger_failover_command(int node, const char *command_line,
  
         if (strlen(exec_cmd->data) != 0)
         {
+               pool_sigset_t oldmask;
                 ereport(LOG,
                                 (errmsg("execute command: %s", exec_cmd->data)));
+               POOL_SETMASK2(&UnBlockSig, &oldmask);
                 r = system(exec_cmd->data);
+               POOL_SETMASK(&oldmask);
         }
  
         free_string(exec_cmd);
@@ -3246,6 +3252,20 @@ find_primary_node_repeatedly(void)
                 return -1;
         }
  
+       /*
+        * If follow primary command is ongoing, skip primary node check.  Just
+        * return current primary node to avoid deadlock between pgpool main
+        * failover() and follow primary process.
+        */
+       if (Req_info->follow_primary_ongoing)
+       {
+               ereport(LOG,
+                               (errmsg("find_primary_node_repeatedly: follow primary is ongoing. return current primary: %d",
+                                               Req_info->primary_node_id)));
+
+               return Req_info->primary_node_id;
+       }
+
         /*
          * If all of the backends are down, there's no point to keep on searching
          * primary node.
@@ -3296,6 +3316,7 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
                 on_exit_reset();
                 SetProcessGlobalVaraibles(PT_FOLLOWCHILD);
                 pool_acquire_follow_primary_lock(true);
+               Req_info->follow_primary_ongoing = true;
                 ereport(LOG,
                                 (errmsg("start triggering follow command.")));
                 for (i = 0; i < pool_config->backend_desc->num_backends; i++)
@@ -3307,6 +3328,7 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
                                 trigger_failover_command(i, pool_config->follow_primary_command,
                                                                                  old_main_node, new_primary, old_primary);
                 }
+               Req_info->follow_primary_ongoing = false;
                 pool_release_follow_primary_lock();
                 exit(0);
         }
diff --git a/src/test/regression/tests/075.detach_primary_left_down_node/test.sh b/src/test/regression/tests/075.detach_primary_left_down_node/test.sh

new file mode 100755 (executable)

index 0000000..b6a51e6
--- /dev/null
+++ b/src/test/regression/tests/075.detach_primary_left_down_node/test.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------------
+# test script for the case where detaching the primary node leaves a down node.
+#
+# reported: https://www.pgpool.net/pipermail/pgpool-hackers/2021-June/003916.html
+
+source $TESTLIBS
+WHOAMI=`whoami`
+TESTDIR=testdir
+
+rm -fr $TESTDIR
+mkdir $TESTDIR
+cd $TESTDIR
+
+# create test environment
+echo -n "creating test environment..."
+$PGPOOL_SETUP -n 4 || exit 1
+echo "done."
+
+source ./bashrc.ports
+export PGPORT=$PGPOOL_PORT
+
+./startall
+wait_for_pgpool_startup
+
+# detach node 0
+$PGPOOL_INSTALL_DIR/bin/pcp_detach_node -w -p $PCP_PORT 0
+wait_for_pgpool_startup
+
+# check to see if all nodes are up
+echo -n "starting to check follow primary results: "
+date
+cnt=60
+while [ $cnt -gt 0 ]
+do
+    $PGBIN/psql -c "show pool_nodes" test|grep down
+    if [ $? != 0 ];then
+       echo "test succeeded"
+       ./shutdownall
+       exit 0
+    fi
+    cnt=`expr $cnt - 1`
+    echo "cnt: $cnt"
+    sleep 1
+done
+echo "test failed".
+./shutdownall
+
+exit 1
author	Tatsuo Ishii <ishii@sraoss.co.jp>
	Tue, 8 Jun 2021 10:06:13 +0000 (19:06 +0900)
committer	Tatsuo Ishii <ishii@sraoss.co.jp>
	Tue, 8 Jun 2021 10:21:36 +0000 (19:21 +0900)
src/include/pool.h		patch \| blob \| blame \| history
src/main/pgpool_main.c		patch \| blob \| blame \| history
src/test/regression/tests/075.detach_primary_left_down_node/test.sh	[new file with mode: 0755]	patch \| blob