From 832274d0c16743a84c89855dfbd82bb168e5b232 Mon Sep 17 00:00:00 2001 From: Bo Peng Date: Mon, 5 Apr 2021 23:31:18 +0900 Subject: [PATCH] Improve sample scripts. - Empty pg_replslot directory of the standby node after running pg_rewind, because pg_replslot directory may be copied from the primary node in old PostgreSQL versions. - While creating/dropping replication slot, access remote database using psql instead of uing ssh. --- src/sample/scripts/failover.sh.sample | 21 ++++++------ src/sample/scripts/follow_primary.sh.sample | 33 +++++++++++++------ src/sample/scripts/pgpool_remote_start.sample | 4 +-- src/sample/scripts/recovery_1st_stage.sample | 20 +++++++---- 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/src/sample/scripts/failover.sh.sample b/src/sample/scripts/failover.sh.sample index 5be9dd9d7..cc20cfc77 100755 --- a/src/sample/scripts/failover.sh.sample +++ b/src/sample/scripts/failover.sh.sample @@ -34,7 +34,8 @@ OLD_PRIMARY_NODE_PORT="${12}" PGHOME=/usr/pgsql-13 -echo failover.sh: start: failed_node_id=$FAILED_NODE_ID old_primary_node_id=$OLD_PRIMARY_NODE_ID failed_host=$FAILED_NODE_HOST new_main_host=$NEW_MAIN_NODE_HOST +echo failover.sh: start: failed_node_id=$FAILED_NODE_ID failed_host=$FAILED_NODE_HOST \ + old_primary_node_id=$OLD_PRIMARY_NODE_ID new_main_node_id=$NEW_MAIN_NODE_ID new_main_host=$NEW_MAIN_NODE_HOST ## If there's no main node anymore, skip failover. if [ $NEW_MAIN_NODE_ID -lt 0 ]; then @@ -53,30 +54,28 @@ fi ## If Standby node is down, skip failover. if [ $FAILED_NODE_ID -ne $OLD_PRIMARY_NODE_ID ]; then - ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null postgres@$OLD_PRIMARY_NODE_HOST -i ~/.ssh/id_rsa_pgpool " - ${PGHOME}/bin/psql -p $OLD_PRIMARY_NODE_PORT -c \"SELECT pg_drop_replication_slot('${FAILED_NODE_HOST}')\" - " + # If Standby node is down, drop replication slot. + ${PGHOME}/bin/psql -h ${OLD_PRIMARY_NODE_HOST} -p ${OLD_PRIMARY_NODE_PORT} \ + -c "SELECT pg_drop_replication_slot('${FAILED_NODE_HOST}');" >/dev/null 2>&1 if [ $? -ne 0 ]; then - echo failover.sh: Standby node is down. Skipping failover. - echo failover.sh: end: drop replication slot "${FAILED_NODE_HOST}" failed - exit 1 + echo ERROR: failover.sh: drop replication slot \"${FAILED_NODE_HOST}\" failed. You may need to drop replication slot manually. fi - echo failover.sh: end: Standby node is down. Skipping failover. + echo failover.sh: end: standby node is down. Skipping failover. exit 0 fi ## Promote Standby node. -echo failover.sh: Primary node is down, promote standby node ${NEW_MAIN_NODE_HOST}. +echo failover.sh: primary node is down, promote new_main_node_id=$NEW_MAIN_NODE_ID on ${NEW_MAIN_NODE_HOST}. ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ postgres@${NEW_MAIN_NODE_HOST} -i ~/.ssh/id_rsa_pgpool ${PGHOME}/bin/pg_ctl -D ${NEW_MAIN_NODE_PGDATA} -w promote if [ $? -ne 0 ]; then - echo failover.sh: end: failover failed + echo ERROR: failover.sh: end: failover failed exit 1 fi -echo failover.sh: end: new_main_node_id=$NEW_MAIN_NODE_ID is promoted to a primary +echo failover.sh: end: new_main_node_id=$NEW_MAIN_NODE_ID on ${NEW_MAIN_NODE_HOST} is promoted to a primary exit 0 diff --git a/src/sample/scripts/follow_primary.sh.sample b/src/sample/scripts/follow_primary.sh.sample index f4bea216e..5db5c6340 100755 --- a/src/sample/scripts/follow_primary.sh.sample +++ b/src/sample/scripts/follow_primary.sh.sample @@ -67,9 +67,12 @@ if [ $? -eq 0 ]; then echo follow_primary.sh: pg_rewind for node ${NODE_ID} # Create replication slot "${NODE_HOST}" - ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null postgres@${NEW_PRIMARY_NODE_HOST} -i ~/.ssh/id_rsa_pgpool " - ${PGHOME}/bin/psql -p ${NEW_PRIMARY_NODE_PORT} -c \"SELECT pg_create_physical_replication_slot('${NODE_HOST}');\" - " + ${PGHOME}/bin/psql -h ${NEW_PRIMARY_NODE_HOST} -p ${NEW_PRIMARY_NODE_PORT} \ + -c "SELECT pg_create_physical_replication_slot('${NODE_HOST}');" >/dev/null 2>&1 + + if [ $? -ne 0 ]; then + echo follow_primary.sh: create replication slot \"${NODE_HOST}\" failed. You may need to create replication slot manually. + fi ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null postgres@${NODE_HOST} -i ~/.ssh/id_rsa_pgpool " @@ -79,6 +82,8 @@ if [ $? -eq 0 ]; then ${PGHOME}/bin/pg_rewind -D ${NODE_PGDATA} --source-server=\"user=postgres host=${NEW_PRIMARY_NODE_HOST} port=${NEW_PRIMARY_NODE_PORT}\" + rm -rf ${NODE_PGDATA}/pg_replslot/* + cat > ${RECOVERYCONF} << EOT primary_conninfo = 'host=${NEW_PRIMARY_NODE_HOST} port=${NEW_PRIMARY_NODE_PORT} user=${REPLUSER} application_name=${NODE_HOST} passfile=''/var/lib/pgsql/.pgpass''' recovery_target_timeline = 'latest' @@ -128,10 +133,14 @@ EOT " if [ $? -ne 0 ]; then + # drop replication slot - ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null postgres@${NEW_PRIMARY_NODE_HOST} -i ~/.ssh/id_rsa_pgpool " - ${PGHOME}/bin/psql -p ${NEW_PRIMARY_NODE_PORT} -c \"SELECT pg_drop_replication_slot('${NODE_HOST}')\" - " + ${PGHOME}/bin/psql -h ${NEW_PRIMARY_NODE_HOST} -p ${NEW_PRIMARY_NODE_PORT} \ + -c "SELECT pg_drop_replication_slot('${NODE_HOST}');" >/dev/null 2>&1 + + if [ $? -ne 0 ]; then + echo ERROR: follow_primary.sh: drop replication slot \"${NODE_HOST}\" failed. You may need to drop replication slot manually. + fi echo follow_primary.sh: end: pg_basebackup failed exit 1 @@ -150,17 +159,21 @@ EOT ${PGPOOL_PATH}/pcp_attach_node -w -h localhost -U $PCP_USER -p ${PCP_PORT} -n ${NODE_ID} if [ $? -ne 0 ]; then - echo follow_primary.sh: end: pcp_attach_node failed + echo ERROR: follow_primary.sh: end: pcp_attach_node failed exit 1 fi # If start Standby failed, drop replication slot "${NODE_HOST}" else - ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null postgres@${NEW_PRIMARY_NODE_HOST} -i ~/.ssh/id_rsa_pgpool \ - ${PGHOME}/bin/psql -p ${NEW_PRIMARY_NODE_PORT} -c "SELECT pg_drop_replication_slot('${NODE_HOST}')" + ${PGHOME}/bin/psql -h ${NEW_PRIMARY_NODE_HOST} -p ${NEW_PRIMARY_NODE_PORT} \ + -c "SELECT pg_drop_replication_slot('${NODE_HOST}');" >/dev/null 2>&1 + + if [ $? -ne 0 ]; then + echo ERROR: follow_primary.sh: drop replication slot \"${NODE_HOST}\" failed. You may need to drop replication slot manually. + fi - echo follow_primary.sh: end: follow primary command failed + echo ERROR: follow_primary.sh: end: follow primary command failed exit 1 fi diff --git a/src/sample/scripts/pgpool_remote_start.sample b/src/sample/scripts/pgpool_remote_start.sample index edc5bc320..0bd92d8a7 100755 --- a/src/sample/scripts/pgpool_remote_start.sample +++ b/src/sample/scripts/pgpool_remote_start.sample @@ -14,7 +14,7 @@ echo pgpool_remote_start: start: remote start Standby node $DEST_NODE_HOST ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null postgres@${DEST_NODE_HOST} -i ~/.ssh/id_rsa_pgpool ls /tmp > /dev/null if [ $? -ne 0 ]; then - echo pgpool_remote_start: passwrodless SSH to postgres@${DEST_NODE_HOST} failed. Please setup passwrodless SSH. + echo ERROR: pgpool_remote_start: passwrodless SSH to postgres@${DEST_NODE_HOST} failed. Please setup passwrodless SSH. exit 1 fi @@ -24,7 +24,7 @@ ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null postgres@$DES " if [ $? -ne 0 ]; then - echo pgpool_remote_start: $DEST_NODE_HOST PostgreSQL start failed. + echo ERROR: pgpool_remote_start: $DEST_NODE_HOST PostgreSQL start failed. exit 1 fi diff --git a/src/sample/scripts/recovery_1st_stage.sample b/src/sample/scripts/recovery_1st_stage.sample index 8b874279f..2aaca2151 100755 --- a/src/sample/scripts/recovery_1st_stage.sample +++ b/src/sample/scripts/recovery_1st_stage.sample @@ -34,9 +34,12 @@ else fi ## Create replication slot "${DEST_NODE_HOST}" -${PGHOME}/bin/psql -p ${PRIMARY_NODE_PORT} << EOQ -SELECT pg_create_physical_replication_slot('${DEST_NODE_HOST}'); -EOQ +${PGHOME}/bin/psql -h ${PRIMARY_NODE_HOST} -p ${PRIMARY_NODE_PORT} \ + -c "SELECT pg_create_physical_replication_slot('${DEST_NODE_HOST}');" >/dev/null 2>&1 + +if [ $? -ne 0 ]; then + echo ERROR: recovery_1st_stage: create replication slot \"${DEST_NODE_HOST}\" failed. You may need to create replication slot manually. +fi ## Execute pg_basebackup to recovery Standby node ssh -T -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null postgres@$DEST_NODE_HOST -i ~/.ssh/id_rsa_pgpool " @@ -71,11 +74,14 @@ EOT if [ $? -ne 0 ]; then - ${PGHOME}/bin/psql -p ${PRIMARY_NODE_PORT} << EOQ -SELECT pg_drop_replication_slot('${DEST_NODE_HOST}'); -EOQ + ${PGHOME}/bin/psql -h ${PRIMARY_NODE_HOST} -p ${PRIMARY_NODE_PORT} \ + -c "SELECT pg_drop_replication_slot('${DEST_NODE_HOST}');" >/dev/null 2>&1 + + if [ $? -ne 0 ]; then + echo ERROR: recovery_1st_stage: drop replication slot \"${DEST_NODE_HOST}\" failed. You may need to drop replication slot manually. + fi - echo recovery_1st_stage: end: pg_basebackup failed. online recovery failed + echo ERROR: recovery_1st_stage: end: pg_basebackup failed. online recovery failed exit 1 fi -- 2.39.5