From aece05fbe4baf5c38d7c6d4eb8c159883be717d7 Mon Sep 17 00:00:00 2001
From: Tatsuo Ishii
Date: Tue, 11 May 2021 19:51:19 +0900
Subject: [PATCH] Fix race condition between detach_false_primary and
 follow_primary command.

It was reported that if detach_false_primary and the follow_primary
command run concurrently, many problems occur:
https://www.pgpool.net/pipermail/pgpool-general/2021-April/007583.html

A typical problem is that no primary node is found at the end. I
confirmed that this can be easily reproduced:
https://www.pgpool.net/pipermail/pgpool-hackers/2021-May/003893.html

In this commit new functions pool_acquire_follow_primary_lock(bool block)
and pool_release_follow_primary_lock(void) are introduced. They are
responsible for acquiring and releasing the lock. There are 3 places
where those functions are used:

1) find_primary_node

This function is called upon startup and failover in the main pgpool
process to find the new primary node.

2) fork_follow_child

This function is called in the follow_primary_command subprocess forked
off by the pgpool main process to execute the follow_primary_command
script. The lock must be held until all follow_primary_command
executions are completed.

3) streaming replication check

Before starting verify_backend_node_status, which is the workhorse of
detach_false_primary, the lock must be acquired. If that fails, the
streaming replication check cycle is simply skipped.

The commit also deals with the case where watchdog is enabled:
https://www.pgpool.net/pipermail/pgpool-hackers/2021-May/003894.html

Multiple pgpool nodes could perform detach_false_primary concurrently,
and this was the cause of the problem. To fix this, detach_false_primary
is now performed only on the leader node. Also, if the quorum is absent,
detach_false_primary is not performed.
---
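Note: the locking protocol itself is small enough to show in isolation.
Below is a minimal, self-contained sketch of the same acquire/release
logic using plain POSIX primitives (a process-shared semaphore plus a
flag in anonymous shared memory) in place of pgpool's
pool_semaphore_lock()/pool_semaphore_unlock(), Req_info, and signal-mask
handling. The type lock_state_t and the function names here are
illustrative stand-ins, not part of the patch.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>
#include <semaphore.h>
#include <sys/mman.h>

/* Shared state; in pgpool this is Req_info plus FOLLOW_PRIMARY_SEM. */
typedef struct
{
	sem_t		mutex;					/* protects follow_primary_count */
	int			follow_primary_count;	/* non-zero while the lock is held */
} lock_state_t;

static lock_state_t *state;

/*
 * Mirror of pool_acquire_follow_primary_lock(): with block = true, poll
 * once a second until the lock is free; with block = false, give up
 * immediately so the caller can skip its check cycle.
 */
static bool
acquire_follow_primary_lock(bool block)
{
	for (;;)
	{
		sem_wait(&state->mutex);
		if (state->follow_primary_count <= 0)
			break;				/* free: take it below, mutex still held */
		sem_post(&state->mutex);
		if (!block)
			return false;		/* held by someone else */
		sleep(1);
	}
	state->follow_primary_count = 1;
	sem_post(&state->mutex);
	return true;
}

/* Mirror of pool_release_follow_primary_lock(). */
static void
release_follow_primary_lock(void)
{
	sem_wait(&state->mutex);
	state->follow_primary_count = 0;
	sem_post(&state->mutex);
}

int
main(void)
{
	/* Anonymous shared mapping so that forked children share the lock. */
	state = mmap(NULL, sizeof(*state), PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	sem_init(&state->mutex, 1, 1);	/* pshared = 1: usable across fork() */

	if (acquire_follow_primary_lock(true))
	{
		puts("lock held: follow_primary / detach_false_primary may run");
		release_follow_primary_lock();
	}
	return 0;
}

(Linux; compile with -pthread.) The real functions additionally block
signals via POOL_SETMASK2(&BlockSig, ...) while the semaphore is held, so
a signal handler cannot interrupt the critical section, and they poll
with sleep(1) in the blocking case; contention arises only around
failover, so the one-second poll is cheap.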
 src/include/pool.h                            |  10 +-
 src/main/pgpool_main.c                        |  79 ++++++++++++
 src/streaming_replication/pool_worker_child.c | 120 +++++++++++-----
 .../tests/018.detach_primary/test.sh          |  61 +++++++++++
 4 files changed, 237 insertions(+), 33 deletions(-)

diff --git a/src/include/pool.h b/src/include/pool.h
index 23599eb3c..9dd247786 100644
--- a/src/include/pool.h
+++ b/src/include/pool.h
@@ -379,7 +379,7 @@ typedef enum
 
 #define Min(x, y)		((x) < (y) ? (x) : (y))
 
-#define MAX_NUM_SEMAPHORES		7
+#define MAX_NUM_SEMAPHORES		8
 #define CONN_COUNTER_SEM 0
 #define REQUEST_INFO_SEM 1
 #define SHM_CACHE_SEM	2
@@ -387,6 +387,7 @@ typedef enum
 #define PCP_REQUEST_SEM	4
 #define ACCEPT_FD_SEM	5
 #define SI_CRITICAL_REGION_SEM	6
+#define FOLLOW_PRIMARY_SEM	7
 
 #define MAX_REQUEST_QUEUE_SIZE	10
 #define MAX_SEC_WAIT_FOR_CLUSTER_TRANSATION 10	/* time in seconds to keep
@@ -448,6 +449,9 @@ typedef struct
 	int			conn_counter;
 	bool		switching;		/* it true, failover or failback is in
 								 * progress */
+	/* true if follow primary command or detach_false_primary is in
+	 * execution */
+	bool		follow_primary_count;
 } POOL_REQUEST_INFO;
 
 /* description of row. corresponding to RowDescription message */
@@ -626,8 +630,8 @@ extern POOL_NODE_STATUS * verify_backend_node_status(POOL_CONNECTION_POOL_SLOT *
 extern POOL_NODE_STATUS * pool_get_node_status(void);
 extern void pool_set_backend_status_changed_time(int backend_id);
 extern int	get_next_main_node(void);
-
-
+extern bool pool_acquire_follow_primary_lock(bool block);
+extern void pool_release_follow_primary_lock(void);
 
 /* strlcpy.c */
 #ifndef HAVE_STRLCPY
diff --git a/src/main/pgpool_main.c b/src/main/pgpool_main.c
index 58947ae86..3d600af22 100644
--- a/src/main/pgpool_main.c
+++ b/src/main/pgpool_main.c
@@ -3102,6 +3102,7 @@ verify_backend_node_status(POOL_CONNECTION_POOL_SLOT * *slots)
 			}
 		}
 	}
+
 	}
 
 	return pool_node_status;
@@ -3184,7 +3185,9 @@ find_primary_node(void)
 	pfree(password);
 
 	/* Verify backend status */
+	pool_acquire_follow_primary_lock(true);
 	status = verify_backend_node_status(slots);
+	pool_release_follow_primary_lock();
 
 	for (i = 0; i < NUM_BACKENDS; i++)
 	{
@@ -3285,6 +3288,7 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
 	{
 		on_exit_reset();
 		SetProcessGlobalVaraibles(PT_FOLLOWCHILD);
+		pool_acquire_follow_primary_lock(true);
 		ereport(LOG,
 				(errmsg("start triggering follow command.")));
 		for (i = 0; i < pool_config->backend_desc->num_backends; i++)
@@ -3296,6 +3300,7 @@ fork_follow_child(int old_main_node, int new_primary, int old_primary)
 			trigger_failover_command(i, pool_config->follow_primary_command,
 									 old_main_node, new_primary, old_primary);
 		}
+		pool_release_follow_primary_lock();
 		exit(0);
 	}
 	else if (pid == -1)
@@ -4220,3 +4225,77 @@ pool_set_backend_status_changed_time(int backend_id)
 	tval = time(NULL);
 	BACKEND_INFO(backend_id).status_changed_time = tval;
 }
+
+/*
+ * Acquire the lock on follow primary command execution.  The follow
+ * primary command and detach_false_primary must acquire this lock before
+ * running because they conflict with each other.  If the argument "block"
+ * is true, this function does not return until it succeeds in acquiring
+ * the lock.  Returns true if the lock was acquired.
+ */
+bool
+pool_acquire_follow_primary_lock(bool block)
+{
+	pool_sigset_t oldmask;
+	volatile int follow_primary_count;
+
+	for (;;)
+	{
+		POOL_SETMASK2(&BlockSig, &oldmask);
+		pool_semaphore_lock(FOLLOW_PRIMARY_SEM);
+		follow_primary_count = Req_info->follow_primary_count;
+
+		if (follow_primary_count <= 0)
+		{
+			/* the lock is not held by anyone */
+			ereport(DEBUG1,
+					(errmsg("pool_acquire_follow_primary_lock: lock was not held by anyone")));
+			break;
+		}
+		else if (follow_primary_count > 0 && !block)
+		{
+			pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+			POOL_SETMASK(&oldmask);
+			/* return and inform the caller that the lock was held by someone */
+			ereport(DEBUG1,
+					(errmsg("pool_acquire_follow_primary_lock: lock was held by someone %d", follow_primary_count)));
+			return false;
+		}
+
+		pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+		POOL_SETMASK(&oldmask);
+		ereport(DEBUG1,
+				(errmsg("pool_acquire_follow_primary_lock: lock was held by someone %d sleeping...", follow_primary_count)));
+		sleep(1);
+	}
+
+	/* acquire the lock */
+	Req_info->follow_primary_count = 1;
+	pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+	POOL_SETMASK(&oldmask);
+
+	ereport(DEBUG1,
+			(errmsg("pool_acquire_follow_primary_lock: succeeded in acquiring lock")));
+
+	return true;
+}
+
+/*
+ * Release the lock on follow primary command execution.
+ */
+void
+pool_release_follow_primary_lock(void)
+{
+	pool_sigset_t oldmask;
+
+	POOL_SETMASK2(&BlockSig, &oldmask);
+	pool_semaphore_lock(FOLLOW_PRIMARY_SEM);
+	Req_info->follow_primary_count = 0;
+	pool_semaphore_unlock(FOLLOW_PRIMARY_SEM);
+	POOL_SETMASK(&oldmask);
+
+	ereport(DEBUG1,
+			(errmsg("pool_release_follow_primary_lock called")));
+}
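Both call sites in pgpool_main.c above use the blocking form of the lock
and bracket the conflicting work with it. Condensed into a runnable
outline (the pgpool routines are replaced by stubs; only the bracketing
pattern is taken from the patch):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the real pgpool routines (illustrative only). */
static bool acquire_lock(bool block) { (void) block; return true; }
static void release_lock(void) { }
static void verify_backend_node_status_stub(void) { puts("verify node status"); }
static void run_follow_primary_stub(int node) { printf("follow_primary on node %d\n", node); }

/*
 * find_primary_node(): node role verification runs entirely under the
 * lock, so a concurrent follow_primary_command cannot change node roles
 * in the middle of the check.
 */
static void
find_primary_node_outline(void)
{
	acquire_lock(true);			/* block until the lock is free */
	verify_backend_node_status_stub();
	release_lock();
}

/*
 * fork_follow_child(), child branch: the lock is held until
 * follow_primary_command has been run for every backend.
 */
static void
follow_child_outline(int num_backends)
{
	int			i;

	acquire_lock(true);
	for (i = 0; i < num_backends; i++)
		run_follow_primary_stub(i);
	release_lock();
}

int
main(void)
{
	find_primary_node_outline();
	follow_child_outline(3);
	return 0;
}

The important design point is the scope: fork_follow_child() holds the
lock across the whole loop over backends, not per backend, so a streaming
replication check can never observe a half-completed
follow_primary_command run.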
diff --git a/src/streaming_replication/pool_worker_child.c b/src/streaming_replication/pool_worker_child.c
index 5c2e9719e..8d3dcdc02 100644
--- a/src/streaming_replication/pool_worker_child.c
+++ b/src/streaming_replication/pool_worker_child.c
@@ -66,6 +66,9 @@
 #include "auth/md5.h"
 #include "auth/pool_hba.h"
 
+#include "watchdog/wd_internal_commands.h"
+#include "watchdog/watchdog.h"
+
 static POOL_CONNECTION_POOL_SLOT * slots[MAX_NUM_BACKENDS];
 static volatile sig_atomic_t reload_config_request = 0;
 static volatile sig_atomic_t restart_request = 0;
@@ -78,6 +81,7 @@ static unsigned long long int text_to_lsn(char *text);
 static RETSIGTYPE my_signal_handler(int sig);
 static RETSIGTYPE reload_config_handler(int sig);
 static void reload_config(void);
+static void sr_check_will_die(int code, Datum arg);
 
 #define CHECK_REQUEST \
 	do { \
@@ -96,6 +100,8 @@ static void reload_config(void);
 #define PG10_SERVER_VERSION	100000	/* PostgreSQL 10 server version num */
 #define PG91_SERVER_VERSION	90100	/* PostgreSQL 9.1 server version num */
 
+static volatile bool follow_primary_lock_acquired;
+
 /*
  * worker child main loop
  */
@@ -112,6 +118,11 @@ do_worker_child(void)
 	init_ps_display("", "", "", "");
 	set_ps_display("worker process", false);
 
+	/*
+	 * Install the callback for preparation of exit.
+	 */
+	on_system_exit(sr_check_will_die, (Datum) NULL);
+
 	/* set up signal handlers */
 	signal(SIGALRM, SIG_DFL);
 	signal(SIGTERM, my_signal_handler);
@@ -154,6 +165,7 @@ do_worker_child(void)
 	{
 		MemoryContextSwitchTo(WorkerMemoryContext);
 		MemoryContextResetAndDeleteChildren(WorkerMemoryContext);
+		WD_STATES	wd_status;
 
 		CHECK_REQUEST;
 
@@ -163,51 +175,91 @@ do_worker_child(void)
 		}
 
 		/*
-		 * If streaming replication mode, do time lag checking
+		 * Get watchdog status if watchdog is enabled.
 		 */
+		if (pool_config->use_watchdog)
+		{
+			wd_status = wd_internal_get_watchdog_local_node_state();
+			ereport(DEBUG1,
+					(errmsg("watchdog status: %d", wd_status)));
+		}
 
-		if (pool_config->sr_check_period > 0 && STREAM)
+		/*
+		 * If streaming replication mode, do time lag checking.  Also skip
+		 * if failover/failback is ongoing.
+		 */
+		if (pool_config->sr_check_period > 0 && STREAM &&
+			Req_info->switching == false)
 		{
-			establish_persistent_connection();
-			PG_TRY();
-			{
-				POOL_NODE_STATUS *node_status;
-				int			i;
+			/*
+			 * Acquire the follow primary lock.  If we fail to acquire the
+			 * lock, skip this check cycle and try again later.
+			 */
+			follow_primary_lock_acquired = false;
 
-				/* Do replication time lag checking */
-				check_replication_time_lag();
+			if (pool_acquire_follow_primary_lock(false) == true)
+			{
+				follow_primary_lock_acquired = true;
 
-				/* Check node status */
-				node_status = verify_backend_node_status(slots);
-				for (i = 0; i < NUM_BACKENDS; i++)
+				establish_persistent_connection();
+				PG_TRY();
 				{
-					ereport(DEBUG1,
-							(errmsg("node status[%d]: %d", i, node_status[i])));
+					POOL_NODE_STATUS *node_status;
+					int			i;
+
+					/* Do replication time lag checking */
+					check_replication_time_lag();
 
-					if (node_status[i] == POOL_NODE_STATUS_INVALID)
+					/* Check node status */
+					node_status = verify_backend_node_status(slots);
+
+					for (i = 0; i < NUM_BACKENDS; i++)
 					{
-						int			n;
+						ereport(DEBUG1,
+								(errmsg("node status[%d]: %d", i, node_status[i])));
 
-						ereport(LOG,
-								(errmsg("pgpool_worker_child: invalid node found %d", i)));
-						if (pool_config->detach_false_primary)
+						if (node_status[i] == POOL_NODE_STATUS_INVALID)
 						{
-							n = i;
-							degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER | REQ_DETAIL_CONFIRMED);
+							int			n;
+
+							ereport(LOG,
+									(errmsg("pgpool_worker_child: invalid node found %d", i)));
+							/*
+							 * If detach_false_primary is enabled, send a
+							 * degenerate request to detach the invalid
+							 * node.  If watchdog is enabled, this should
+							 * happen only on the leader watchdog node and
+							 * only when the quorum exists.  Other nodes
+							 * will be informed by the leader node later
+							 * on.
+							 */
+							if ((pool_config->detach_false_primary && !pool_config->use_watchdog) ||
+								(pool_config->detach_false_primary && pool_config->use_watchdog &&
+								 wd_internal_get_watchdog_quorum_state() >= 0 && wd_status == WD_COORDINATOR))
+							{
+								n = i;
+								degenerate_backend_set(&n, 1, REQ_DETAIL_SWITCHOVER | REQ_DETAIL_CONFIRMED);
+							}
 						}
 					}
 				}
-			}
-			PG_CATCH();
-			{
+				PG_CATCH();
+				{
+					discard_persistent_connection();
+					pool_release_follow_primary_lock();
+					follow_primary_lock_acquired = false;
+					sleep(pool_config->sr_check_period);
+					PG_RE_THROW();
+				}
+				PG_END_TRY();
+
+				/* Discard persistent connections */
 				discard_persistent_connection();
-				sleep(pool_config->sr_check_period);
-				PG_RE_THROW();
+				if (follow_primary_lock_acquired)
+				{
+					pool_release_follow_primary_lock();
+					follow_primary_lock_acquired = false;
+				}
 			}
-			PG_END_TRY();
-
-			/* Discard persistent connections */
-			discard_persistent_connection();
 		}
 		sleep(pool_config->sr_check_period);
 	}
@@ -599,3 +651,11 @@ get_query_result(POOL_CONNECTION_POOL_SLOT * *slots, int backend_id, char *query
 	sts = 0;
 	return sts;
 }
+
+static void
+sr_check_will_die(int code, Datum arg)
+{
+	if (follow_primary_lock_acquired)
+		pool_release_follow_primary_lock();
+}
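In the watchdog case the worker child gates degenerate_backend_set() on
both leadership and quorum. The predicate can be read in isolation; here
is a small sketch. The conventions (WD_COORDINATOR for the leader, a
quorum state >= 0 meaning the quorum exists) follow the patch, but the
function name and enum below are stand-ins, not pgpool's:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for pgpool's WD_STATES; only the value tested here matters. */
typedef enum
{
	SKETCH_WD_STANDBY,
	SKETCH_WD_COORDINATOR
} sketch_wd_state;

/*
 * May this node send a degenerate request for a false primary?
 * Without watchdog: yes, whenever detach_false_primary is on.
 * With watchdog: only on the leader (coordinator) node, and only while
 * the quorum exists (quorum_state >= 0 in the patch).
 */
static bool
may_detach_false_primary(bool detach_false_primary, bool use_watchdog,
						 int quorum_state, sketch_wd_state wd_state)
{
	if (!detach_false_primary)
		return false;
	if (!use_watchdog)
		return true;
	return quorum_state >= 0 && wd_state == SKETCH_WD_COORDINATOR;
}

int
main(void)
{
	/* only pgpool0 up: quorum absent -> the false primary is left alone */
	printf("quorum absent : %d\n",
		   may_detach_false_primary(true, true, -1, SKETCH_WD_COORDINATOR));
	/* all three watchdog nodes up: the leader detaches the false primary */
	printf("quorum present: %d\n",
		   may_detach_false_primary(true, true, 0, SKETCH_WD_COORDINATOR));
	return 0;
}

This is exactly the behavior the extended regression test below checks:
with only pgpool0 up the quorum is absent and the false primary must stay
attached; once pgpool1 and pgpool2 start, the quorum exists and the
leader must detach it.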
diff --git a/src/test/regression/tests/018.detach_primary/test.sh b/src/test/regression/tests/018.detach_primary/test.sh
index 8d54d09e8..d3006324c 100755
--- a/src/test/regression/tests/018.detach_primary/test.sh
+++ b/src/test/regression/tests/018.detach_primary/test.sh
@@ -54,4 +54,65 @@ fi
 
 ./shutdownall
 
+#
+# test with watchdog enabled
+#
+
+# wipe out everything
+cd ..
+rm -fr $TESTDIR
+mkdir $TESTDIR
+cd $TESTDIR
+
+# create a 3-node pgpool cluster with 3 backends
+$WATCHDOG_SETUP -wn 3 -n 3
+
+# enable detach_false_primary
+for i in 0 1 2
+do
+    echo "detach_false_primary = on" >> pgpool$i/etc/pgpool.conf
+done
+
+# start only pgpool0 and the backends so that the quorum is absent
+cd pgpool0
+source ./bashrc.ports
+./startall
+cd ..
+export PGPORT=$PGPOOL_PORT
+wait_for_pgpool_startup
+
+# promote the third node (node 2) to create a false primary
+$PG_CTL -D pgpool0/data2 promote
+sleep 10
+wait_for_pgpool_startup
+
+$PSQL -c "show pool_nodes" postgres|grep down
+if [ $? = 0 ];then
+    echo "node is down even though the quorum is absent"
+    ./shutdownall
+    exit 1
+fi
+
+# start pgpool1 and pgpool2 so that the quorum exists
+echo "testing the case when the quorum exists"
+cd pgpool1
+./startall
+cd ..
+cd pgpool2
+./startall
+cd ..
+sleep 10
+pcp_watchdog_info -v -w -p $PCP_PORT
+
+$PSQL -c "show pool_nodes" postgres
+
+$PSQL -c "show pool_nodes" postgres|grep down
+if [ $? != 0 ];then
+    echo "node is not down even though the quorum exists"
+    ./shutdownall
+    exit 1
+fi
+
+./shutdownall
+
 exit 0
-- 
2.39.5