Refactor failover().
author    Tatsuo Ishii <ishii@sraoss.co.jp>
          Tue, 8 Feb 2022 05:26:29 +0000 (14:26 +0900)
committer Tatsuo Ishii <ishii@sraoss.co.jp>
          Tue, 8 Feb 2022 05:38:12 +0000 (14:38 +0900)
failover() was too large and hard to maintain. By refactoring it, the
size is reduced from 798 lines to 215 lines.  It is now split into the
following subroutines; failover() just calls them in turn (a sketch of the
resulting control flow follows the list).

static int handle_failback_request(FAILOVER_CONTEXT *failover_context, int node_id);
static int handle_failover_request(FAILOVER_CONTEXT *failover_context, int node_id);
static void kill_failover_children(FAILOVER_CONTEXT *failover_context, int node_id);
static void exec_failover_command(FAILOVER_CONTEXT *failover_context, int new_main_node_id, int promote_node_id);
static int determine_new_primary_node(FAILOVER_CONTEXT *failover_context, int node_id);
static int exec_follow_primary_command(FAILOVER_CONTEXT *failover_context, int node_id, int new_primary_node_id);
static void save_node_info(FAILOVER_CONTEXT *failover_context, int new_primary_node_id, int new_main_node_id);
static void exec_child_restart(FAILOVER_CONTEXT *failover_context, int node_id);
static void exec_notice_pcp_child(FAILOVER_CONTEXT *failover_context);
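
For orientation, a sketch of the resulting control flow, abridged from the
new failover() body in the diff below. The names are the real ones from the
patch; request dequeueing, the CLOSE_IDLE_REQUEST/PROMOTE_NODE_REQUEST
branches, the switching flags, error paths and logging are all omitted:

    static void
    failover(void)
    {
        FAILOVER_CONTEXT failover_context;
        int              node_id, new_main_node, new_primary, promote_node = 0, i;

        /* initialize failover context */
        memset(&failover_context, 0, sizeof(failover_context));
        failover_context.search_primary = true;

        for (;;)
        {
            /* ... dequeue one request into failover_context and set node_id;
             * break out when the request queue is empty ... */

            if (failover_context.reqkind == NODE_UP_REQUEST)
            {
                /* failback request */
                if (handle_failback_request(&failover_context, node_id) < 0)
                    continue;
            }
            else            /* NODE_DOWN_REQUEST or NODE_QUARANTINE_REQUEST */
            {
                if (handle_failover_request(&failover_context, node_id) < 0)
                    continue;
            }

            new_main_node = get_next_main_node();

            kill_failover_children(&failover_context, node_id);
            exec_failover_command(&failover_context, new_main_node, promote_node);
            new_primary = determine_new_primary_node(&failover_context, node_id);

            i = exec_follow_primary_command(&failover_context, node_id, new_primary);
            if (i >= 0)
                new_main_node = i;  /* follow primary command may change the main node */

            save_node_info(&failover_context, new_primary, new_main_node);
            exec_child_restart(&failover_context, node_id);
        }

        /* queue drained: clear the switching flags (omitted), then kick pcp child */
        exec_notice_pcp_child(&failover_context);
    }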

src/main/pgpool_main.c

index 13049e1f9b4520e709ab0cdcea6ed29a8a2faef2..4dc58a518607af93ea89134eaa55596704b5b92e 100644
@@ -5,7 +5,7 @@
  * pgpool: a language independent connection pool server for PostgreSQL
  * written by Tatsuo Ishii
  *
- * Copyright (c) 2003-2021     PgPool Global Development Group
+ * Copyright (c) 2003-2022     PgPool Global Development Group
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
@@ -115,6 +115,30 @@ typedef struct User1SignalSlot
 
 #define PGPOOLMAXLITSENQUEUELENGTH 10000
 
+/*
+ * Context data while executing failover()
+ */
+typedef struct
+{
+       bool            all_backend_down;       /* true if all backends are down */
+       bool            search_primary;         /* true if we need to search primary node */
+       bool            need_to_restart_children;       /* true if we need to restart child processes */
+       bool            need_to_restart_pcp;    /* true if we need to restart the pcp process */
+       bool            partial_restart;        /* true if partial restart is needed */
+       bool            sync_required;          /* true if watchdog synchronization is necessary */
+
+       POOL_REQUEST_KIND reqkind;
+       int                     node_id_set[MAX_NUM_BACKENDS];
+       int                     node_count;
+       unsigned char request_details;
+
+       /*
+        * An array to hold information on down nodes. Each array member
+        * corresponds to a node id.  If nodes[i] is 1, node i is down.
+        */
+       int                     nodes[MAX_NUM_BACKENDS];
+} FAILOVER_CONTEXT;
+
 static void signal_user1_to_parent_with_reason(User1SignalReason reason);
 
 static void FileUnlink(int code, Datum path);
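
The FAILOVER_CONTEXT struct added above gathers what were previously about a
dozen locals of failover(); each subroutine receives a pointer to it. A
minimal, hypothetical illustration (not part of the patch) of the nodes[]
convention documented in the struct comment:

    FAILOVER_CONTEXT ctx;
    int              i;

    memset(&ctx, 0, sizeof(ctx));           /* no node is marked down yet */
    ctx.search_primary = true;              /* default, as failover() sets it */

    ctx.nodes[2] = 1;                       /* record that node id 2 went down */

    for (i = 0; i < MAX_NUM_BACKENDS; i++)
    {
        if (ctx.nodes[i])
        {
            /* node i went down in this request; exec_failover_command()
             * scans the array like this to run failover_command per node */
        }
    }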
@@ -153,6 +177,19 @@ static void update_backend_quarantine_status(void);
 static int     get_server_version(POOL_CONNECTION_POOL_SLOT * *slots, int node_id);
 static void get_info_from_conninfo(char *conninfo, char *host, int hostlen, char *port, int portlen);
 
+/*
+ * Subroutines of failover()
+ */
+static int handle_failback_request(FAILOVER_CONTEXT *failover_context, int node_id);
+static int handle_failover_request(FAILOVER_CONTEXT *failover_context, int node_id);
+static void kill_failover_children(FAILOVER_CONTEXT *failover_context, int node_id);
+static void exec_failover_command(FAILOVER_CONTEXT *failover_context, int new_main_node_id, int promote_node_id);
+static int determine_new_primary_node(FAILOVER_CONTEXT *failover_context, int node_id);
+static int exec_follow_primary_command(FAILOVER_CONTEXT *failover_context, int node_id, int new_primary_node_id);
+static void save_node_info(FAILOVER_CONTEXT *failover_context, int new_primary_node_id, int new_main_node_id);
+static void exec_child_restart(FAILOVER_CONTEXT *failover_context, int node_id);
+static void exec_notice_pcp_child(FAILOVER_CONTEXT *failover_context);
+
 static struct sockaddr_un un_addr;     /* unix domain socket path */
 static struct sockaddr_un pcp_un_addr; /* unix domain socket path for PCP */
 ProcessInfo *process_info = NULL;      /* Per child info table on shmem */
@@ -1357,32 +1394,22 @@ check_all_backend_down(void)
 }
 
 /*
- * backend connection error, failover/failback request, if possible
- * failover() must be called under protecting signals.
+ * The workhorse of failover processing. Directly called by the pgpool main
+ * process, or called from sigusr1_interrupt_processor() after a SIGUSR1
+ * signal is received.
  */
 static void
 failover(void)
 {
-       int                     i,
-                               j,
-                               k;
+       FAILOVER_CONTEXT failover_context;
        int                     node_id;
        int                     new_main_node;
        int                     new_primary = -1;
-       int                     nodes[MAX_NUM_BACKENDS];
-       bool            need_to_restart_children = true;
-       bool            partial_restart = false;
-       int                     status;
-       int                     sts;
-       bool            need_to_restart_pcp = false;
-       bool            all_backend_down = true;
-       bool            sync_required = false;
+       int                     i;
 
        ereport(DEBUG1,
                        (errmsg("failover handler called")));
 
-       memset(nodes, 0, sizeof(int) * MAX_NUM_BACKENDS);
-
        /*
         * this could happen in a child process if a signal has been sent before
         * resetting signal handler
@@ -1418,16 +1445,20 @@ failover(void)
                return;
        }
 
+       /* initialize failover context */
+       memset(&failover_context, 0, sizeof(failover_context));
+       failover_context.search_primary = true;
+
        Req_info->switching = true;
        switching = 1;
+
+       /*
+        * This loop processes each failover/failback request until the queue is
+        * empty.
+        */
        for (;;)
        {
-               POOL_REQUEST_KIND reqkind;
                int                     queue_index;
-               int                     node_id_set[MAX_NUM_BACKENDS];
-               int                     node_count;
-               unsigned char request_details;
-               bool            search_primary = true;
                int                     promote_node = 0;
 
                pool_semaphore_lock(REQUEST_INFO_SEM);
@@ -1444,17 +1475,17 @@ failover(void)
                /* make a local copy of request */
                Req_info->request_queue_head++;
                queue_index = Req_info->request_queue_head % MAX_REQUEST_QUEUE_SIZE;
-               memcpy(node_id_set, Req_info->request[queue_index].node_id, (sizeof(int) * Req_info->request[queue_index].count));
-               reqkind = Req_info->request[queue_index].kind;
-               request_details = Req_info->request[queue_index].request_details;
-               node_count = Req_info->request[queue_index].count;
+               memcpy(failover_context.node_id_set, Req_info->request[queue_index].node_id, (sizeof(int) * Req_info->request[queue_index].count));
+               failover_context.reqkind = Req_info->request[queue_index].kind;
+               failover_context.request_details = Req_info->request[queue_index].request_details;
+               failover_context.node_count = Req_info->request[queue_index].count;
                pool_semaphore_unlock(REQUEST_INFO_SEM);
 
                ereport(DEBUG1,
                                (errmsg("failover handler"),
-                                errdetail("kind: %d flags: %x node_count: %d index:%d", reqkind, request_details, node_count, queue_index)));
+                                errdetail("kind: %d flags: %x node_count: %d index:%d", failover_context.reqkind, failover_context.request_details, failover_context.node_count, queue_index)));
 
-               if (reqkind == CLOSE_IDLE_REQUEST)
+               if (failover_context.reqkind == CLOSE_IDLE_REQUEST)
                {
                        kill_all_children(SIGUSR1);
                        continue;
@@ -1476,115 +1507,24 @@ failover(void)
                 * should be replaced by the requested node. The requested
                 * node should be REAL_PRIMARY_NODE_ID.
                 */
-               if (request_details & REQ_DETAIL_PROMOTE)
+               if (failover_context.request_details & REQ_DETAIL_PROMOTE)
                {
-                       promote_node = node_id_set[0];
-                       for (i = 0; i < node_count; i++)
+                       promote_node = failover_context.node_id_set[0];
+                       for (i = 0; i < failover_context.node_count; i++)
                        {
-                               node_id_set[i] = REAL_PRIMARY_NODE_ID;
+                               failover_context.node_id_set[i] = REAL_PRIMARY_NODE_ID;
                        }
                }
 
-               node_id = node_id_set[0];
+               node_id = failover_context.node_id_set[0];
 
                /* failback request? */
-               if (reqkind == NODE_UP_REQUEST)
+               if (failover_context.reqkind == NODE_UP_REQUEST)
                {
-                       if (node_id < 0 || node_id >= MAX_NUM_BACKENDS ||
-                               (reqkind == NODE_UP_REQUEST && !(RAW_MODE &&
-                                                                                                BACKEND_INFO(node_id).backend_status == CON_DOWN) && VALID_BACKEND(node_id)) ||
-                               (reqkind == NODE_DOWN_REQUEST && !VALID_BACKEND(node_id)))
-                       {
-                               if (node_id < 0 || node_id >= MAX_NUM_BACKENDS)
-                                       ereport(LOG,
-                                                       (errmsg("invalid failback request, node id: %d is invalid. node id must be between [0 and %d]", node_id, MAX_NUM_BACKENDS)));
-                               else
-                                       ereport(LOG,
-                                                       (errmsg("invalid failback request, status: [%d] of node id : %d is invalid for failback", BACKEND_INFO(node_id).backend_status, node_id)));
-
+                       if (handle_failback_request(&failover_context, node_id) < 0)
                                continue;
-                       }
-
-                       ereport(LOG,
-                                       (errmsg("starting fail back. reconnect host %s(%d)",
-                                                       BACKEND_INFO(node_id).backend_hostname,
-                                                       BACKEND_INFO(node_id).backend_port)));
-
-                       /* Check to see if all backends are down */
-                       all_backend_down = check_all_backend_down();
-
-                       BACKEND_INFO(node_id).backend_status = CON_CONNECT_WAIT;        /* unset down status */
-                       pool_set_backend_status_changed_time(node_id);
-
-                       if ((request_details & REQ_DETAIL_UPDATE))
-                       {
-                               /* remove the quarantine flag */
-                               BACKEND_INFO(node_id).quarantine = false;
-
-                               /*
-                                * do not search for primary node when handling the quarantine
-                                * nodes
-                                */
-                               search_primary = false;
-
-                               /*
-                                * recalculate the main node id after setting the backend
-                                * status of quarantined node, this will bring us to the old
-                                * main_node_id that was before the quarantine state
-                                */
-                               Req_info->main_node_id = get_next_main_node();
-                               if (Req_info->primary_node_id == -1 &&
-                                       BACKEND_INFO(node_id).role == ROLE_PRIMARY)
-                               {
-                                       /*
-                                        * if the failback request is for the quarantined node and
-                                        * that node had a primary role before it was quarantined,
-                                        * restore the primary node status for that node. this is
-                                        * important for the failover script to get the proper
-                                        * value of old primary
-                                        */
-                                       ereport(LOG,
-                                                       (errmsg("failover: failing back the quarantine node that was primary before it was quarantined"),
-                                                        errdetail("all children needs a restart")));
-                                       Req_info->primary_node_id = node_id;
-
-                                       /*
-                                        * since we changed the primary node so restart of all
-                                        * children is required
-                                        */
-                                       need_to_restart_children = true;
-                                       partial_restart = false;
-                               }
-                               else if (all_backend_down == false)
-                               {
-                                       ereport(LOG,
-                                                       (errmsg("Do not restart children because we are failing back node id %d host: %s port: %d and we are in streaming replication mode and not all backends were down", node_id,
-                                                                       BACKEND_INFO(node_id).backend_hostname,
-                                                                       BACKEND_INFO(node_id).backend_port)));
-                                       need_to_restart_children = false;
-                                       partial_restart = false;
-                               }
-                               else
-                               {
-                                       need_to_restart_children = true;
-                                       partial_restart = false;
-                               }
-                       }
-                       else
-                       {
-                               /*
-                                * The request is a proper failbak request and not because of
-                                * the update status of quarantined node
-                                */
-                               (void) write_status_file();
-
-                               trigger_failover_command(node_id, pool_config->failback_command,
-                                                                                MAIN_NODE_ID, get_next_main_node(), PRIMARY_NODE_ID);
-                       }
-
-                       sync_required = true;
                }
-               else if (reqkind == PROMOTE_NODE_REQUEST)
+               else if (failover_context.reqkind == PROMOTE_NODE_REQUEST)
                {
                        if (node_id != -1 && VALID_BACKEND(node_id))
                        {
@@ -1603,63 +1543,14 @@ failover(void)
                else                                    /* NODE_DOWN_REQUEST &&
                                                                 * NODE_QUARANTINE_REQUEST */
                {
-                       int                     cnt = 0;
-
-                       for (i = 0; i < node_count; i++)
-                       {
-                               if (node_id_set[i] != -1 && (BACKEND_INFO(node_id_set[i]).quarantine == true ||
-                                                                                        ((RAW_MODE && VALID_BACKEND_RAW(node_id_set[i])) ||
-                                                                                         VALID_BACKEND(node_id_set[i]))))
-                               {
-                                       ereport(LOG,
-                                                       (errmsg("starting %s. shutdown host %s(%d)",
-                                                                       (reqkind == NODE_QUARANTINE_REQUEST) ? "quarantine" : "degeneration",
-                                                                       BACKEND_INFO(node_id_set[i]).backend_hostname,
-                                                                       BACKEND_INFO(node_id_set[i]).backend_port)));
-
-                                       BACKEND_INFO(node_id_set[i]).backend_status = CON_DOWN; /* set down status */
-                                       pool_set_backend_status_changed_time(node_id_set[i]);
-                                       if (reqkind == NODE_QUARANTINE_REQUEST)
-                                       {
-                                               BACKEND_INFO(node_id_set[i]).quarantine = true;
-                                       }
-                                       else
-                                       {
-                                               /*
-                                                * if the degeneration request is for the quarantined
-                                                * node and that node had a primary role before it was
-                                                * quarantined, Restore the primary node status for
-                                                * that node before degenerating it. This is important
-                                                * for the failover script to get the proper value of
-                                                * old primary
-                                                */
-                                               if (Req_info->primary_node_id == -1 &&
-                                                       BACKEND_INFO(node_id_set[i]).quarantine == true &&
-                                                       BACKEND_INFO(node_id_set[i]).role == ROLE_PRIMARY)
-                                               {
-                                                       ereport(DEBUG2,
-                                                                       (errmsg("failover: degenerating the node that was primary node before it was quarantined")));
-                                                       Req_info->primary_node_id = node_id_set[i];
-                                                       search_primary = false;
-                                               }
-                                               BACKEND_INFO(node_id_set[i]).quarantine = false;
-                                               (void) write_status_file();
-                                       }
-
-                                       /* save down node */
-                                       nodes[node_id_set[i]] = 1;
-                                       cnt++;
-                               }
-                       }
-
-                       if (cnt == 0)
-                       {
-                               ereport(LOG,
-                                               (errmsg("failover: no backends are degenerated")));
+
+                       if (handle_failover_request(&failover_context, node_id) < 0)
                                continue;
-                       }
                }
 
+               /*
+                * Determine the new main node.
+                */
                new_main_node = get_next_main_node();
 
                if (new_main_node < 0)
@@ -1670,566 +1561,131 @@ failover(void)
 
                ereport(DEBUG1,
                                (errmsg("failover/failback request details: STREAM: %d reqkind: %d detail: %x node_id: %d",
-                                               STREAM, reqkind, request_details & REQ_DETAIL_SWITCHOVER,
+                                               STREAM, failover_context.reqkind, failover_context.request_details & REQ_DETAIL_SWITCHOVER,
                                                node_id)));
 
                /*
-                * On 2011/5/2 Tatsuo Ishii says: if mode is streaming replication and
-                * request is NODE_UP_REQUEST (failback case) we don't need to restart
-                * all children. Existing session will not use newly attached node,
-                * but load balanced node is not changed until this session ends, so
-                * it's harmless anyway.
+                * Kill child processes to prepare for failover/failback.
                 */
+               kill_failover_children(&failover_context, node_id);
 
                /*
-                * On 2015/9/21 Tatsuo Ishii says: this judgment is not sufficient if
-                * all backends were down. Child process has local status in which all
-                * backends are down. In this case even if new connection arrives from
-                * frontend, the child will not accept it because the local status
-                * shows all backends are down. For this purpose we refer to
-                * "all_backend_down" variable, which was set before updating backend
-                * status.
-                *
-                * See bug 248 for more details.
+                * Execute failover_command if needed. We do not execute the failover
+                * command when the request is a quarantine request. If the request is
+                * to promote a specified node, the failover command is executed as
+                * well.
                 */
+               exec_failover_command(&failover_context, new_main_node, promote_node);
 
                /*
-                * We also need to think about a case when the former primary node did
-                * not exist.  In the case we need to restart all children as
-                * well. For example when previous primary node id is 0 and then it
-                * went down, restarted, re-attached without promotion. Then existing
-                * child process loses connection slot to node 0 and keeps on using it
-                * when node 0 comes back. This could result in segfault later on in
-                * the child process because there's no connection to node id 0.
-                *
-                * Actually we need to think about when ALWAYS_PRIMARY flag is set
-                * *but* DISALLOW_TO_FAILOVER flag is not set case. In the case after
-                * primary failover Req_info->primary_node_id is set, but connection
-                * to the primary node does not exist. So we should do full restart if
-                * requested node id is the former primary node.
-                *
-                * See bug 672 for more details.
+                * Determine the new primary node id. This may call find_primary_node_repeatedly().
                 */
-               if (STREAM && reqkind == NODE_UP_REQUEST && all_backend_down == false &&
-                       Req_info->primary_node_id >= 0 && Req_info->primary_node_id != node_id)
-               {
-                       /*
-                        * The decision to restart/no-restart children for update status
-                        * request has already been made
-                        */
-                       if (!(request_details & REQ_DETAIL_UPDATE))
-                       {
-                               ereport(LOG,
-                                               (errmsg("Do not restart children because we are failing back node id %d host: %s port: %d and we are in streaming replication mode and not all backends were down", node_id,
-                                                               BACKEND_INFO(node_id).backend_hostname,
-                                                               BACKEND_INFO(node_id).backend_port)));
+               new_primary = determine_new_primary_node(&failover_context, node_id);
+
+               /*
+                * If follow_primary_command is provided and in streaming
+                * replication mode, we start degenerating all backends as they are
+                * not replicated anymore.
+                */
+               i = exec_follow_primary_command(&failover_context, node_id, new_primary);
 
-                               need_to_restart_children = false;
-                               partial_restart = false;
-                       }
-               }
+               /* if the follow primary command was executed, the main node may have changed */
+               if (i >= 0)
+                       new_main_node = i;
 
                /*
-                * If the mode is streaming replication and the request is
-                * NODE_DOWN_REQUEST and it's actually a switch over request, we don't
-                * need to restart all children, except the node is primary.
+                * Now the new primary node and the new main node are established.
+                * Save them into shared memory and update the status changed time.
                 */
-               else if (STREAM && (reqkind == NODE_DOWN_REQUEST || reqkind == NODE_QUARANTINE_REQUEST) &&
-                                request_details & REQ_DETAIL_SWITCHOVER && node_id != PRIMARY_NODE_ID)
-               {
-                       ereport(LOG,
-                                       (errmsg("Do not restart children because we are switching over node id %d host: %s port: %d and we are in streaming replication mode", node_id,
-                                                       BACKEND_INFO(node_id).backend_hostname,
-                                                       BACKEND_INFO(node_id).backend_port)));
+               save_node_info(&failover_context, new_primary, new_main_node);
 
-                       need_to_restart_children = true;
-                       partial_restart = true;
+               /* Kill children and restart them if needed */
+               exec_child_restart(&failover_context, node_id);
+       }
 
-                       for (i = 0; i < pool_config->num_init_children; i++)
-                       {
-                               bool            restart = false;
+       /*
+        * We are almost done.
+        * Unlock flags.
+        */
+       pool_semaphore_lock(REQUEST_INFO_SEM);
+       switching = 0;
+       Req_info->switching = false;
+       pool_semaphore_unlock(REQUEST_INFO_SEM);
 
-                               for (j = 0; j < pool_config->max_pool; j++)
-                               {
-                                       for (k = 0; k < NUM_BACKENDS; k++)
-                                       {
-                                               ConnectionInfo *con = pool_coninfo(i, j, k);
+       /*
+        * Kick wakeup_handler in pcp_child to notify that failover/failback is done.
+        */
+       exec_notice_pcp_child(&failover_context);
+}
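
exec_notice_pcp_child() is declared above but its body is outside this
excerpt. A plausible sketch, reconstructed from the tail of the old
failover() that is removed further down in this hunk (the kill(pcp_pid,
SIGUSR2), waitpid() and pcp_fork_a_child() sequence), assuming the refactor
moved that code essentially verbatim; the exit-status logging is trimmed:

    static void
    exec_notice_pcp_child(FAILOVER_CONTEXT *failover_context)
    {
        int     status;
        int     sts;

        /* kick wakeup_handler in pcp_child to tell it failover/failback is done */
        kill(pcp_pid, SIGUSR2);

        if (failover_context->need_to_restart_pcp)
        {
            sleep(1);

            /* send restart request to pcp child and reap the old process */
            kill(pcp_pid, SIGUSR1);
            for (;;)
            {
                sts = waitpid(pcp_pid, &status, 0);
                if (sts != -1)
                    break;
                if (errno == EINTR)
                    continue;
                ereport(WARNING,
                        (errmsg("failover: waitpid failed"),
                         errdetail("%m")));
            }

            pcp_pid = pcp_fork_a_child(pcp_unix_fd, pcp_inet_fd, pcp_conf_file);
        }
    }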
 
-                                               if (con->connected && con->load_balancing_node == node_id)
-                                               {
-                                                       ereport(LOG,
-                                                                       (errmsg("child pid %d needs to restart because pool %d uses backend %d",
-                                                                                       process_info[i].pid, j, node_id)));
-                                                       restart = true;
-                                                       break;
-                                               }
-                                       }
-                               }
+#ifdef NOT_USED
+/*
+ * health check timer handler
+ */
+static RETSIGTYPE health_check_timer_handler(int sig)
+{
+       int                     save_errno = errno;
 
-                               if (restart)
-                               {
-                                       pid_t           pid = process_info[i].pid;
+       POOL_SETMASK(&BlockSig);
+       health_check_timer_expired = 1;
+       POOL_SETMASK(&UnBlockSig);
+       errno = save_errno;
+}
 
-                                       if (pid)
-                                       {
-                                               kill(pid, SIGQUIT);
-                                               ereport(DEBUG1,
-                                                               (errmsg("failover handler"),
-                                                                errdetail("kill process with PID:%d", pid)));
-                                       }
-                               }
-                       }
-               }
-               else
-               {
-                       ereport(LOG,
-                                       (errmsg("Restart all children")));
+/*
+ * do_health_check() performs the health check on all backend nodes.
+ * The inout parameter health_check_node_id is the starting backend
+ * node number for health check and when the function returns or
+ * exits with an error health_check_node_id contains the value
+ * of last backend node number on which health check was performed.
+ *
+ * Function returns false if all backend nodes are down and true if all
+ * backend nodes are in healthy state
+ */
+static bool
+do_health_check(bool use_template_db, volatile int *health_check_node_id)
+{
+       POOL_CONNECTION_POOL_SLOT *slot;
+       BackendInfo *bkinfo;
+       static char *dbname;
+       int                     i;
+       bool            all_nodes_healthy = false;
+       char       *password = get_pgpool_config_user_password(pool_config->health_check_user,
+                                                                                                                  pool_config->health_check_password);
 
-                       /* kill all children */
-                       for (i = 0; i < pool_config->num_init_children; i++)
-                       {
-                               pid_t           pid = process_info[i].pid;
+       /* Do not execute health check during recovery */
+       if (*InRecovery)
+               return false;
 
-                               if (pid)
-                               {
-                                       kill(pid, SIGQUIT);
-                                       ereport(DEBUG1,
-                                                       (errmsg("failover handler"),
-                                                        errdetail("kill process with PID:%d", pid)));
-                               }
-                       }
+       if (!strcmp(pool_config->health_check_database, ""))
+               dbname = use_template_db ? "template1" : "postgres";
+       else
+               dbname = pool_config->health_check_database;
 
-                       need_to_restart_children = true;
-                       partial_restart = false;
-               }
+       ereport(DEBUG1,
+                       (errmsg("doing health check against database:%s user:%s",
+                                       dbname, pool_config->health_check_user)));
+
+       /*
+        * Start checking the backend nodes starting from the previously failed
+        * node.
+        */
+       for (i = *health_check_node_id; i < pool_config->backend_desc->num_backends; i++)
+       {
+               *health_check_node_id = i;
 
                /*
-                * Exec failover_command if needed. We do not execute failover when
-                * request is quarantine type.
+                * Make sure that the health check timer has not expired. Before
+                * health_check() is called, health_check_timer_expired is set to 0.
+                * However, it is possible that the timer expires while the DB nodes
+                * are being processed.
                 */
-               if (reqkind == NODE_DOWN_REQUEST)
+               if (health_check_timer_expired)
                {
-                       for (i = 0; i < pool_config->backend_desc->num_backends; i++)
-                       {
-                               if (nodes[i])
-                               {
-                                       /* If this is prmoting specified node, new_main_node
-                                        * should be replaced by the requested node. The requested
-                                        * node should be REAL_PRIMARY_NODE_ID.
-                                        */
-                                       if (request_details & REQ_DETAIL_PROMOTE)
-                                       {
-                                               trigger_failover_command(i, pool_config->failover_command,
-                                                                                                MAIN_NODE_ID, promote_node, REAL_PRIMARY_NODE_ID);
-                                       }
-                                       else
-                                       {
-                                               trigger_failover_command(i, pool_config->failover_command,
-                                                                                                MAIN_NODE_ID, new_main_node, REAL_PRIMARY_NODE_ID);
-                                       }
-                                       sync_required = true;
-                               }
-                       }
+                       ereport(ERROR,
+                                       (errmsg("health check timer has been already expired before attempting to connect backend node %d", i)));
                }
 
-               if (reqkind == PROMOTE_NODE_REQUEST && VALID_BACKEND(node_id))
-               {
-                       new_primary = node_id;
-               }
-               else if (reqkind == NODE_QUARANTINE_REQUEST)
-               {
-                       /*
-                        * if the quarantine node was the primary node set the newprimary
-                        * to -1 (invalid)
-                        */
-                       if (Req_info->primary_node_id == node_id)
-                       {
-                               /*
-                                * set the role of the node, This will help us restore the
-                                * primary node id when the node will come out from quarantine
-                                * state
-                                */
-                               BACKEND_INFO(node_id).role = ROLE_PRIMARY;
-                               new_primary = -1;
-                       }
-                       else if (SL_MODE)
-                       {
-                               new_primary = Req_info->primary_node_id;
-                       }
-               }
-
-               /*
-                * If the down node was a standby node in streaming replication mode,
-                * we can avoid calling find_primary_node_repeatedly() and recognize
-                * the former primary as the new primary node, which will reduce the
-                * time to process standby down.
-                * This does not apply to the case when no primary node existed
-                * (Req_info->primary_node_id < 0). In this case
-                * find_primary_node_repeatedly() should be called.
-                */
-               else if (SL_MODE &&
-                                reqkind == NODE_DOWN_REQUEST)
-               {
-                       if (Req_info->primary_node_id >= 0 && Req_info->primary_node_id != node_id)
-                       {
-                               new_primary = Req_info->primary_node_id;
-                       }
-                       else
-                       {
-                               if (Req_info->primary_node_id >= 0)
-                                       BACKEND_INFO(Req_info->primary_node_id).role = ROLE_STANDBY;
-                               new_primary = find_primary_node_repeatedly();
-                       }
-               }
-               else if (search_primary == false)
-               {
-                       ereport(DEBUG1,
-                                       (errmsg("failover was called on quarantined node. No need to search for primary node")));
-                       new_primary = Req_info->primary_node_id;
-               }
-               else
-               {
-                       new_primary = find_primary_node_repeatedly();
-               }
-
-               /*
-                * If follow_primary_command is provided and in streaming
-                * replication mode, we start degenerating all backends as they are
-                * not replicated anymore.
-                */
-               int                     follow_cnt = 0;
-
-               if (STREAM)
-               {
-                       if (*pool_config->follow_primary_command != '\0' ||
-                               reqkind == PROMOTE_NODE_REQUEST)
-                       {
-                               /*
-                                * follow primary command is executed in following cases:
-                                * - failover against the current primary
-                                * - no primary exists and new primary is created by failover
-                                * - promote node request
-                                */
-                               if (((reqkind == NODE_DOWN_REQUEST) &&
-                                        Req_info->primary_node_id >= 0 &&
-                                        (nodes[Req_info->primary_node_id])) ||
-                                       (reqkind == NODE_DOWN_REQUEST && Req_info->primary_node_id < 0 && new_primary >= 0) ||
-                                       (node_id >= 0 && (reqkind == PROMOTE_NODE_REQUEST) &&
-                                        (VALID_BACKEND(node_id))))
-                               {
-
-                                       for (i = 0; i < pool_config->backend_desc->num_backends; i++)
-                                       {
-                                               /* do not degenerate the new primary */
-                                               if ((new_primary >= 0) && (i != new_primary))
-                                               {
-                                                       BackendInfo *bkinfo;
-
-                                                       bkinfo = pool_get_node_info(i);
-                                                       ereport(LOG,
-                                                                       (errmsg("starting follow degeneration. shutdown host %s(%d)",
-                                                                                       bkinfo->backend_hostname,
-                                                                                       bkinfo->backend_port)));
-                                                       bkinfo->backend_status = CON_DOWN;      /* set down status */
-                                                       pool_set_backend_status_changed_time(i);
-                                                       (void) write_status_file();
-
-                                                       follow_cnt++;
-                                               }
-                                       }
-
-                                       if (follow_cnt == 0)
-                                       {
-                                               ereport(LOG,
-                                                               (errmsg("failover: no follow backends are degenerated")));
-                                       }
-                                       else
-                                       {
-                                               /* update new primary node */
-                                               new_main_node = get_next_main_node();
-                                               ereport(LOG,
-                                                               (errmsg("failover: %d follow backends have been degenerated", follow_cnt)));
-                                       }
-                               }
-                       }
-               }
-
-               if ((follow_cnt > 0) && (*pool_config->follow_primary_command != '\0'))
-               {
-                       follow_pid = fork_follow_child(Req_info->primary_node_id, new_primary,
-                                                                                  Req_info->primary_node_id);
-               }
-
-               /* Save primary node id */
-               if (Req_info->primary_node_id != new_primary)
-               {
-                       if (Req_info->primary_node_id >= 0)
-                       {
-                               pool_set_backend_status_changed_time(Req_info->primary_node_id);
-                       }
-                       if (new_primary >= 0)
-                       {
-                               BACKEND_INFO(new_primary).role = ROLE_PRIMARY;
-                               pool_set_backend_status_changed_time(new_primary);
-                       }
-               }
-               Req_info->primary_node_id = new_primary;
-               ereport(LOG,
-                               (errmsg("failover: set new primary node: %d", Req_info->primary_node_id)));
-
-               if (new_main_node >= 0)
-               {
-                       Req_info->main_node_id = new_main_node;
-                       sync_required = true;
-                       ereport(LOG,
-                                       (errmsg("failover: set new main node: %d", Req_info->main_node_id)));
-               }
-
-
-               /* Kill children and restart them if needed */
-               if (need_to_restart_children)
-               {
-                       for (i = 0; i < pool_config->num_init_children; i++)
-                       {
-                               /*
-                                * Try to kill pgpool child because previous kill signal may
-                                * not be received by pgpool child. This could happen if
-                                * multiple PostgreSQL are going down (or even starting
-                                * pgpool, without starting PostgreSQL can trigger this).
-                                * Child calls degenerate_backend() and it tries to acquire
-                                * semaphore to write a failover request. In this case the
-                                * signal mask is set as well, thus signals are never
-                                * received.
-                                */
-
-                               bool            restart = false;
-
-                               if (partial_restart)
-                               {
-                                       for (j = 0; j < pool_config->max_pool; j++)
-                                       {
-                                               for (k = 0; k < NUM_BACKENDS; k++)
-                                               {
-                                                       ConnectionInfo *con = pool_coninfo(i, j, k);
-
-                                                       if (con->connected && con->load_balancing_node == node_id)
-                                                       {
-
-                                                               ereport(LOG,
-                                                                               (errmsg("child pid %d needs to restart because pool %d uses backend %d",
-                                                                                               process_info[i].pid, j, node_id)));
-                                                               restart = true;
-                                                               break;
-                                                       }
-                                               }
-                                       }
-                               }
-                               else
-                                       restart = true;
-
-                               if (restart)
-                               {
-                                       if (process_info[i].pid)
-                                       {
-                                               kill(process_info[i].pid, SIGQUIT);
-
-                                               process_info[i].pid = fork_a_child(fds, i);
-                                               process_info[i].start_time = time(NULL);
-                                               process_info[i].client_connection_count = 0;
-                                               process_info[i].status = WAIT_FOR_CONNECT;
-                                               process_info[i].connected = 0;
-                                               process_info[i].wait_for_connect = 0;
-                                       }
-                               }
-                               else
-                                       process_info[i].need_to_restart = 1;
-                       }
-               }
-
-               else
-               {
-                       /*
-                        * Set restart request to each child. Children will exit(1)
-                        * whenever they are convenient.
-                        */
-                       for (i = 0; i < pool_config->num_init_children; i++)
-                       {
-                               process_info[i].need_to_restart = 1;
-                       }
-               }
-
-               /*
-                * Send restart request to worker child.
-                */
-               kill(worker_pid, SIGUSR1);
-
-               if (sync_required)
-                       wd_failover_end();
-
-               if (reqkind == NODE_UP_REQUEST)
-               {
-                       ereport(LOG,
-                                       (errmsg("failback done. reconnect host %s(%d)",
-                                                       BACKEND_INFO(node_id).backend_hostname,
-                                                       BACKEND_INFO(node_id).backend_port)));
-
-                       /* Fork health check process if needed */
-                       for (i = 0; i < NUM_BACKENDS; i++)
-                       {
-                               if (health_check_pids[i] == 0)
-                               {
-                                       ereport(LOG,
-                                                       (errmsg("start health check process for host %s(%d)",
-                                                                       BACKEND_INFO(i).backend_hostname,
-                                                                       BACKEND_INFO(i).backend_port)));
-
-                                       health_check_pids[i] = worker_fork_a_child(PT_HEALTH_CHECK, do_health_check_child, &i);
-                               }
-                       }
-               }
-               else if (reqkind == PROMOTE_NODE_REQUEST)
-               {
-                       ereport(LOG,
-                                       (errmsg("promotion done. promoted host %s(%d)",
-                                                       BACKEND_INFO(node_id).backend_hostname,
-                                                       BACKEND_INFO(node_id).backend_port)));
-               }
-               else
-               {
-                       /*
-                        * Temporary black magic. Without this regression 055 does not
-                        * finish
-                        */
-                       fprintf(stderr, "%s done. shutdown host %s(%d)",
-                                       (reqkind == NODE_DOWN_REQUEST) ? "failover" : "quarantine",
-                                       BACKEND_INFO(node_id).backend_hostname,
-                                       BACKEND_INFO(node_id).backend_port);
-
-                       ereport(LOG,
-                                       (errmsg("%s done. shutdown host %s(%d)",
-                                                       (reqkind == NODE_DOWN_REQUEST) ? "failover" : "quarantine",
-                                                       BACKEND_INFO(node_id).backend_hostname,
-                                                       BACKEND_INFO(node_id).backend_port)));
-               }
-               need_to_restart_pcp = true;
-       }
-
-       pool_semaphore_lock(REQUEST_INFO_SEM);
-       switching = 0;
-       Req_info->switching = false;
-       pool_semaphore_unlock(REQUEST_INFO_SEM);
-
-       /*
-        * kick wakeup_handler in pcp_child to notice that failover/failback done
-        */
-       kill(pcp_pid, SIGUSR2);
-
-       if (need_to_restart_pcp)
-       {
-               sleep(1);
-
-               /*
-                * Send restart request to pcp child.
-                */
-               kill(pcp_pid, SIGUSR1);
-               for (;;)
-               {
-                       sts = waitpid(pcp_pid, &status, 0);
-                       if (sts != -1)
-                               break;
-
-                       if (errno == EINTR)
-                               continue;
-                       else
-                       {
-                               ereport(WARNING,
-                                               (errmsg("failover: waitpid failed"),
-                                                errdetail("%m")));
-                               continue;
-                       }
-               }
-               if (WIFSIGNALED(status))
-                       ereport(LOG,
-                                       (errmsg("PCP child %d exits with status %d by signal %d in failover()", pcp_pid, status, WTERMSIG(status))));
-               else
-                       ereport(LOG,
-                                       (errmsg("PCP child %d exits with status %d in failover()", pcp_pid, status)));
-
-               pcp_pid = pcp_fork_a_child(pcp_unix_fd, pcp_inet_fd, pcp_conf_file);
-               ereport(LOG,
-                               (errmsg("fork a new PCP child pid %d in failover()", pcp_pid)));
-       }
-}
-
-#ifdef NOT_USED
-/*
- * health check timer handler
- */
-static RETSIGTYPE health_check_timer_handler(int sig)
-{
-       int                     save_errno = errno;
-
-       POOL_SETMASK(&BlockSig);
-       health_check_timer_expired = 1;
-       POOL_SETMASK(&UnBlockSig);
-       errno = save_errno;
-}
-
-/*
- * do_health_check() performs the health check on all backend nodes.
- * The inout parameter health_check_node_id is the starting backend
- * node number for health check and when the function returns or
- * exits with an error health_check_node_id contains the value
- * of last backend node number on which health check was performed.
- *
- * Function returns false if all backend nodes are down and true if all
- * backend nodes are in healthy state
- */
-static bool
-do_health_check(bool use_template_db, volatile int *health_check_node_id)
-{
-       POOL_CONNECTION_POOL_SLOT *slot;
-       BackendInfo *bkinfo;
-       static char *dbname;
-       int                     i;
-       bool            all_nodes_healthy = false;
-       char       *password = get_pgpool_config_user_password(pool_config->health_check_user,
-                                                                                                                  pool_config->health_check_password);
-
-       /* Do not execute health check during recovery */
-       if (*InRecovery)
-               return false;
-
-       if (!strcmp(pool_config->health_check_database, ""))
-               dbname = use_template_db ? "template1" : "postgres";
-       else
-               dbname = pool_config->health_check_database;
-
-       ereport(DEBUG1,
-                       (errmsg("doing health check against database:%s user:%s",
-                                       dbname, pool_config->health_check_user)));
-
-       /*
-        * Start checking the backed nodes starting from the previously failed
-        * node
-        */
-       for (i = *health_check_node_id; i < pool_config->backend_desc->num_backends; i++)
-       {
-               *health_check_node_id = i;
-
-               /*
-                * Make sure that health check timer has not been expired. Before
-                * called health_check(), health_check_timer_expired is set to 0.
-                * However it is possible that while processing DB nodes health check
-                * timer expired.
-                */
-               if (health_check_timer_expired)
-               {
-                       ereport(ERROR,
-                                       (errmsg("health check timer has been already expired before attempting to connect backend node %d", i)));
-               }
-
-               bkinfo = pool_get_node_info(i);
+               bkinfo = pool_get_node_info(i);
 
                ereport(DEBUG1,
                                (errmsg("Backend DB node %d status is %d", i, bkinfo->backend_status)));
@@ -4553,3 +4009,721 @@ pool_release_follow_primary_lock(bool remote_request)
                        (errmsg("pool_release_follow_primary_lock called")));
 
 }
+
+/*
+ * -------------------------------------------------------------------------
+ * Subroutines for failover() begin
+ * -------------------------------------------------------------------------
+ */
+
+/*
+ * Handle failback request. Called from failover().
+ */
+static int
+handle_failback_request(FAILOVER_CONTEXT *failover_context, int node_id)
+{
+       if (node_id < 0 || node_id >= MAX_NUM_BACKENDS ||
+               (failover_context->reqkind == NODE_UP_REQUEST && !(RAW_MODE &&
+                                                                                BACKEND_INFO(node_id).backend_status == CON_DOWN) && VALID_BACKEND(node_id)) ||
+               (failover_context->reqkind == NODE_DOWN_REQUEST && !VALID_BACKEND(node_id)))
+       {
+               if (node_id < 0 || node_id >= MAX_NUM_BACKENDS)
+                       ereport(LOG,
+                                       (errmsg("invalid failback request, node id: %d is invalid. node id must be between [0 and %d]", node_id, MAX_NUM_BACKENDS)));
+               else
+                       ereport(LOG,
+                                       (errmsg("invalid failback request, status: [%d] of node id : %d is invalid for failback", BACKEND_INFO(node_id).backend_status, node_id)));
+
+               return -1;
+       }
+
+       ereport(LOG,
+                       (errmsg("starting fail back. reconnect host %s(%d)",
+                                       BACKEND_INFO(node_id).backend_hostname,
+                                       BACKEND_INFO(node_id).backend_port)));
+
+       /* Check to see if all backends are down */
+       failover_context->all_backend_down = check_all_backend_down();
+
+       BACKEND_INFO(node_id).backend_status = CON_CONNECT_WAIT;        /* unset down status */
+       pool_set_backend_status_changed_time(node_id);
+
+       if ((failover_context->request_details & REQ_DETAIL_UPDATE))
+       {
+               /* remove the quarantine flag */
+               BACKEND_INFO(node_id).quarantine = false;
+
+               /*
+                * do not search for primary node when handling the quarantine
+                * nodes
+                */
+               failover_context->search_primary = false;
+
+               /*
+                * recalculate the main node id after setting the backend
+                * status of quarantined node, this will bring us to the old
+                * main_node_id that was before the quarantine state
+                */
+               Req_info->main_node_id = get_next_main_node();
+               if (Req_info->primary_node_id == -1 &&
+                       BACKEND_INFO(node_id).role == ROLE_PRIMARY)
+               {
+                       /*
+                        * If the failback request is for a quarantined node that
+                        * had the primary role before it was quarantined, restore
+                        * the primary status of that node. This is important for
+                        * the failover script to get the proper value of the old
+                        * primary.
+                        */
+                       ereport(LOG,
+                                       (errmsg("failover: failing back the quarantine node that was primary before it was quarantined"),
+                                        errdetail("all children need a restart")));
+                       Req_info->primary_node_id = node_id;
+
+                       /*
+                        * Since we changed the primary node, a restart of all
+                        * children is required.
+                        */
+                       failover_context->need_to_restart_children = true;
+                       failover_context->partial_restart = false;
+               }
+               else if (failover_context->all_backend_down == false)
+               {
+                       ereport(LOG,
+                                       (errmsg("Do not restart children because we are failing back node id %d host: %s port: %d and we are in streaming replication mode and not all backends were down", node_id,
+                                                       BACKEND_INFO(node_id).backend_hostname,
+                                                       BACKEND_INFO(node_id).backend_port)));
+                       failover_context->need_to_restart_children = false;
+                       failover_context->partial_restart = false;
+               }
+               else
+               {
+                       failover_context->need_to_restart_children = true;
+                       failover_context->partial_restart = false;
+               }
+       }
+       else
+       {
+               /*
+                * The request is a proper failback request, not a status update
+                * of a quarantined node.
+                */
+               (void) write_status_file();
+
+               trigger_failover_command(node_id, pool_config->failback_command,
+                                                                MAIN_NODE_ID, get_next_main_node(), PRIMARY_NODE_ID);
+       }
+
+       failover_context->sync_required = true;
+
+       return 0;
+}
+
+/*
+ * Handle failover request. Called from failover().
+ * Return -1 if no backends were degenerated.
+ */
+static int
+handle_failover_request(FAILOVER_CONTEXT *failover_context, int node_id)
+{
+       int             cnt = 0;        /* number of down node ids */
+       int             i;
+
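+       /*
+        * Walk through the requested node ids and mark each eligible node as
+        * down (or quarantined).
+        */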
+       for (i = 0; i < failover_context->node_count; i++)
+       {
+               if (failover_context->node_id_set[i] != -1 && (BACKEND_INFO(failover_context->node_id_set[i]).quarantine == true ||
+                                                                                                         ((RAW_MODE && VALID_BACKEND_RAW(failover_context->node_id_set[i])) ||
+                                                                                                          VALID_BACKEND(failover_context->node_id_set[i]))))
+               {
+                       ereport(LOG,
+                                       (errmsg("starting %s. shutdown host %s(%d)",
+                                                       (failover_context->reqkind == NODE_QUARANTINE_REQUEST) ? "quarantine" : "degeneration",
+                                                       BACKEND_INFO(failover_context->node_id_set[i]).backend_hostname,
+                                                       BACKEND_INFO(failover_context->node_id_set[i]).backend_port)));
+
+                       BACKEND_INFO(failover_context->node_id_set[i]).backend_status = CON_DOWN; /* set down status */
+                       pool_set_backend_status_changed_time(failover_context->node_id_set[i]);
+                       if (failover_context->reqkind == NODE_QUARANTINE_REQUEST)
+                       {
+                               BACKEND_INFO(failover_context->node_id_set[i]).quarantine = true;
+                       }
+                       else
+                       {
+                               /*
+                                * If the degeneration request is for a quarantined
+                                * node that had the primary role before it was
+                                * quarantined, restore the primary status of that
+                                * node before degenerating it. This is important
+                                * for the failover script to get the proper value
+                                * of the old primary.
+                                */
+                               if (Req_info->primary_node_id == -1 &&
+                                       BACKEND_INFO(failover_context->node_id_set[i]).quarantine == true &&
+                                       BACKEND_INFO(failover_context->node_id_set[i]).role == ROLE_PRIMARY)
+                               {
+                                       ereport(DEBUG2,
+                                                       (errmsg("failover: degenerating the node that was primary node before it was quarantined")));
+                                       Req_info->primary_node_id = failover_context->node_id_set[i];
+                                       failover_context->search_primary = false;
+                               }
+                               BACKEND_INFO(failover_context->node_id_set[i]).quarantine = false;
+                               (void) write_status_file();
+                       }
+
+                       /* save down node */
+                       failover_context->nodes[failover_context->node_id_set[i]] = 1;
+                       cnt++;
+               }
+       }
+
+       if (cnt == 0)
+       {
+               ereport(LOG,
+                               (errmsg("failover: no backends are degenerated")));
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Kill child processes to prepare for failover/failback.
+ */
+static void
+kill_failover_children(FAILOVER_CONTEXT *failover_context, int node_id)
+{
+       int             i, j, k;
+       /*
+        * On 2011/5/2 Tatsuo Ishii says: if the mode is streaming replication
+        * and the request is NODE_UP_REQUEST (failback case), we don't need to
+        * restart all children. Existing sessions will not use the newly
+        * attached node, and since the load balancing node is not changed
+        * until a session ends, this is harmless anyway.
+        */
+
+       /*
+        * On 2015/9/21 Tatsuo Ishii says: this judgment is not sufficient if
+        * all backends were down. Each child process has a local status in
+        * which all backends are down, so even if a new connection arrives
+        * from a frontend, the child will not accept it because the local
+        * status shows all backends as down. For this reason we refer to the
+        * "all_backend_down" variable, which was set before the backend
+        * status was updated.
+        *
+        * See bug 248 for more details.
+        */
+
+       /*
+        * We also need to consider the case where the former primary node no
+        * longer exists; in that case we need to restart all children as
+        * well. For example, suppose the previous primary node id is 0 and
+        * the node goes down, restarts and is re-attached without promotion.
+        * An existing child process then loses its connection slot to node 0
+        * but keeps on using it when node 0 comes back. This could result in
+        * a segfault later on in the child process because there's no
+        * connection to node id 0.
+        *
+        * We also need to consider the case where the ALWAYS_PRIMARY flag is
+        * set *but* the DISALLOW_TO_FAILOVER flag is not. In that case
+        * Req_info->primary_node_id is set after a primary failover, but no
+        * connection to the primary node exists. So we should do a full
+        * restart if the requested node id is the former primary node.
+        *
+        * See bug 672 for more details.
+        */
+       if (STREAM && failover_context->reqkind == NODE_UP_REQUEST && failover_context->all_backend_down == false &&
+               Req_info->primary_node_id >= 0 && Req_info->primary_node_id != node_id)
+       {
+               /*
+                * The decision to restart/no-restart children for update status
+                * request has already been made
+                */
+               if (!(failover_context->request_details & REQ_DETAIL_UPDATE))
+               {
+                       ereport(LOG,
+                                       (errmsg("Do not restart children because we are failing back node id %d host: %s port: %d and we are in streaming replication mode and not all backends were down", node_id,
+                                                       BACKEND_INFO(node_id).backend_hostname,
+                                                       BACKEND_INFO(node_id).backend_port)));
+
+                       failover_context->need_to_restart_children = false;
+                       failover_context->partial_restart = false;
+               }
+       }
+
+       /*
+        * If the mode is streaming replication, the request is
+        * NODE_DOWN_REQUEST and it's actually a switchover request, we don't
+        * need to restart all children, unless the node is the primary.
+        */
+       else if (STREAM && (failover_context->reqkind == NODE_DOWN_REQUEST || failover_context->reqkind == NODE_QUARANTINE_REQUEST) &&
+                        failover_context->request_details & REQ_DETAIL_SWITCHOVER && node_id != PRIMARY_NODE_ID)
+       {
+               ereport(LOG,
+                               (errmsg("Do not restart children because we are switching over node id %d host: %s port: %d and we are in streaming replication mode", node_id,
+                                               BACKEND_INFO(node_id).backend_hostname,
+                                               BACKEND_INFO(node_id).backend_port)));
+
+               failover_context->need_to_restart_children = true;
+               failover_context->partial_restart = true;
+
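+               /*
+                * Check every pool slot of every child; a child needs a restart
+                * only if one of its pooled connections uses the node being
+                * switched over.
+                */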
+               for (i = 0; i < pool_config->num_init_children; i++)
+               {
+                       bool            restart = false;
+
+                       for (j = 0; j < pool_config->max_pool; j++)
+                       {
+                               for (k = 0; k < NUM_BACKENDS; k++)
+                               {
+                                       ConnectionInfo *con = pool_coninfo(i, j, k);
+
+                                       if (con->connected && con->load_balancing_node == node_id)
+                                       {
+                                               ereport(LOG,
+                                                               (errmsg("child pid %d needs to restart because pool %d uses backend %d",
+                                                                               process_info[i].pid, j, node_id)));
+                                               restart = true;
+                                               break;
+                                       }
+                               }
+                       }
+
+                       if (restart)
+                       {
+                               pid_t           pid = process_info[i].pid;
+
+                               if (pid)
+                               {
+                                       kill(pid, SIGQUIT);
+                                       ereport(DEBUG1,
+                                                       (errmsg("failover handler"),
+                                                        errdetail("kill process with PID:%d", pid)));
+                               }
+                       }
+               }
+       }
+       else
+       {
+               ereport(LOG,
+                               (errmsg("Restart all children")));
+
+               /* kill all children */
+               for (i = 0; i < pool_config->num_init_children; i++)
+               {
+                       pid_t           pid = process_info[i].pid;
+
+                       if (pid)
+                       {
+                               kill(pid, SIGQUIT);
+                               ereport(DEBUG1,
+                                               (errmsg("failover handler"),
+                                                errdetail("kill process with PID:%d", pid)));
+                       }
+               }
+
+               failover_context->need_to_restart_children = true;
+               failover_context->partial_restart = false;
+       }
+}
+
+/*
+ * Execute failover_command if needed. We do not execute the failover command
+ * when the request is of quarantine type. If the request is to promote a
+ * specified node, the failover command is executed with that node as the
+ * promotion target.
+ */
+static void
+exec_failover_command(FAILOVER_CONTEXT *failover_context, int new_main_node_id, int promote_node_id)
+{
+       int             i;
+
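+       /*
+        * Run the failover command once for every node marked down by this
+        * request.
+        */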
+       if (failover_context->reqkind == NODE_DOWN_REQUEST)
+       {
+               for (i = 0; i < pool_config->backend_desc->num_backends; i++)
+               {
+                       if (failover_context->nodes[i])
+                       {
+                               /* If this is a request to promote the specified
+                                * node, the new main node passed to the command is
+                                * replaced by the requested node. The old primary
+                                * is passed as REAL_PRIMARY_NODE_ID.
+                                */
+                               if (failover_context->request_details & REQ_DETAIL_PROMOTE)
+                               {
+                                       trigger_failover_command(i, pool_config->failover_command,
+                                                                                        MAIN_NODE_ID, promote_node_id, REAL_PRIMARY_NODE_ID);
+                               }
+                               else
+                               {
+                                       trigger_failover_command(i, pool_config->failover_command,
+                                                                                        MAIN_NODE_ID, new_main_node_id, REAL_PRIMARY_NODE_ID);
+                               }
+                               failover_context->sync_required = true;
+                       }
+               }
+       }
+}
+
+/*
+ * Determine new primary node id. Possibly call find_primary_node_repeatedly().
+ */
+static int
+determine_new_primary_node(FAILOVER_CONTEXT *failover_context, int node_id)
+{
+       int             new_primary;
+
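+       /*
+        * Decide the new primary depending on the request kind and on whether
+        * the down node was the primary.
+        */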
+       if (failover_context->reqkind == PROMOTE_NODE_REQUEST && VALID_BACKEND(node_id))
+       {
+               new_primary = node_id;
+       }
+       else if (failover_context->reqkind == NODE_QUARANTINE_REQUEST)
+       {
+               /*
+                * If the quarantined node was the primary node, set the new
+                * primary to -1 (invalid).
+                */
+               if (Req_info->primary_node_id == node_id)
+               {
+                       /*
+                        * Remember the role of the node; this will help us restore
+                        * the primary node id when the node comes out of the
+                        * quarantine state.
+                        */
+                       BACKEND_INFO(node_id).role = ROLE_PRIMARY;
+                       new_primary = -1;
+               }
+               else
+               {
+                       new_primary = Req_info->primary_node_id;
+               }
+       }
+
+       /*
+        * If the down node was a standby node in streaming replication mode,
+        * we can avoid calling find_primary_node_repeatedly() and recognize
+        * the former primary as the new primary node, which reduces the time
+        * needed to process a standby-down event.
+        * This does not apply when no primary node existed
+        * (Req_info->primary_node_id < 0); in that case
+        * find_primary_node_repeatedly() should be called.
+        */
+       else if (SL_MODE &&
+                        failover_context->reqkind == NODE_DOWN_REQUEST)
+       {
+               if (Req_info->primary_node_id >= 0 && Req_info->primary_node_id != node_id)
+               {
+                       new_primary = Req_info->primary_node_id;
+               }
+               else
+               {
+                       if (Req_info->primary_node_id >= 0)
+                               BACKEND_INFO(Req_info->primary_node_id).role = ROLE_STANDBY;
+                       new_primary = find_primary_node_repeatedly();
+               }
+       }
+       else if (failover_context->search_primary == false)
+       {
+               ereport(DEBUG1,
+                               (errmsg("failover was called on quarantined node. No need to search for primary node")));
+               new_primary = Req_info->primary_node_id;
+       }
+       else
+       {
+               new_primary = find_primary_node_repeatedly();
+       }
+
+       return new_primary;
+}
+
+/*
+ * Execute the follow primary command if necessary.
+ * Return the new main node id if it needs to be changed;
+ * otherwise -1 is returned.
+ */
+static int
+exec_follow_primary_command(FAILOVER_CONTEXT *failover_context, int node_id, int new_primary_node_id)
+{
+       int             follow_cnt = 0;
+       int             new_main_node_id = -1;
+       int             i;
+
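+       /* The follow primary command is only used in streaming replication mode. */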
+       if (!STREAM)
+               return -1;
+
+       if (*pool_config->follow_primary_command != '\0' ||
+               failover_context->reqkind == PROMOTE_NODE_REQUEST)
+       {
+               /*
+                * The follow primary command is executed in the following cases:
+                * - failover against the current primary
+                * - no primary exists and a new primary is created by failover
+                * - promote node request
+                */
+               if (((failover_context->reqkind == NODE_DOWN_REQUEST) &&
+                        Req_info->primary_node_id >= 0 &&
+                        (failover_context->nodes[Req_info->primary_node_id])) ||
+                       (failover_context->reqkind == NODE_DOWN_REQUEST && Req_info->primary_node_id < 0 && new_primary_node_id >= 0) ||
+                       (node_id >= 0 && (failover_context->reqkind == PROMOTE_NODE_REQUEST) &&
+                        (VALID_BACKEND(node_id))))
+               {
+
+                       for (i = 0; i < pool_config->backend_desc->num_backends; i++)
+                       {
+                               /* do not degenerate the new primary */
+                               if ((new_primary_node_id >= 0) && (i != new_primary_node_id))
+                               {
+                                       BackendInfo *bkinfo;
+
+                                       bkinfo = pool_get_node_info(i);
+                                       ereport(LOG,
+                                                       (errmsg("starting follow degeneration. shutdown host %s(%d)",
+                                                                       bkinfo->backend_hostname,
+                                                                       bkinfo->backend_port)));
+                                       bkinfo->backend_status = CON_DOWN;      /* set down status */
+                                       pool_set_backend_status_changed_time(i);
+                                       (void) write_status_file();
+
+                                       follow_cnt++;
+                               }
+                       }
+
+                       if (follow_cnt == 0)
+                       {
+                               ereport(LOG,
+                                               (errmsg("failover: no follow backends are degenerated")));
+                       }
+                       else
+                       {
+                               /* update new primary node */
+                               new_main_node_id = get_next_main_node();
+                               ereport(LOG,
+                                               (errmsg("failover: %d follow backends have been degenerated", follow_cnt)));
+                       }
+               }
+       }
+
+       if ((follow_cnt > 0) && (*pool_config->follow_primary_command != '\0'))
+       {
+               /* exec follow child */
+               follow_pid = fork_follow_child(Req_info->primary_node_id, new_primary_node_id,
+                                                                          Req_info->primary_node_id);
+       }
+
+       return new_main_node_id;
+}
+
+/*
+ * Now the new primary node and new main node are established.
+ * Save them into shared memory and update the status changed time.
+ */
+static void
+save_node_info(FAILOVER_CONTEXT *failover_context, int new_primary_node_id, int new_main_node_id)
+{
+       /* Save primary node id */
+       if (Req_info->primary_node_id != new_primary_node_id)
+       {
+               if (Req_info->primary_node_id >= 0)
+               {
+                       pool_set_backend_status_changed_time(Req_info->primary_node_id);
+               }
+               if (new_primary_node_id >= 0)
+               {
+                       BACKEND_INFO(new_primary_node_id).role = ROLE_PRIMARY;
+                       pool_set_backend_status_changed_time(new_primary_node_id);
+               }
+       }
+       Req_info->primary_node_id = new_primary_node_id;
+       ereport(LOG,
+                       (errmsg("failover: set new primary node: %d", Req_info->primary_node_id)));
+
+       if (new_main_node_id >= 0)
+       {
+               Req_info->main_node_id = new_main_node_id;
+               failover_context->sync_required = true;
+               ereport(LOG,
+                               (errmsg("failover: set new main node: %d", Req_info->main_node_id)));
+       }
+}
+
+/*
+ * Restart child processes if needed.
+ */
+static void
+exec_child_restart(FAILOVER_CONTEXT *failover_context, int node_id)
+{
+       int             i, j, k;
+
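+       /*
+        * The restart strategy was decided earlier: either kill children (all
+        * of them, or only those using the failed node) and fork replacements
+        * now, or just mark every child for a lazy restart.
+        */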
+       if (failover_context->need_to_restart_children)
+       {
+               for (i = 0; i < pool_config->num_init_children; i++)
+               {
+                       /*
+                        * Try to kill the pgpool child again because the previous
+                        * kill signal may not have been received by it. This could
+                        * happen if multiple PostgreSQL servers are going down
+                        * (even starting pgpool without starting PostgreSQL can
+                        * trigger this): the child calls degenerate_backend() and
+                        * tries to acquire the semaphore to write a failover
+                        * request. The signal mask is set at that point, so
+                        * signals are never received.
+                        */
+
+                       bool            restart = false;
+
+                       if (failover_context->partial_restart)
+                       {
+                               for (j = 0; j < pool_config->max_pool; j++)
+                               {
+                                       for (k = 0; k < NUM_BACKENDS; k++)
+                                       {
+                                               ConnectionInfo *con = pool_coninfo(i, j, k);
+
+                                               if (con->connected && con->load_balancing_node == node_id)
+                                               {
+
+                                                       ereport(LOG,
+                                                                       (errmsg("child pid %d needs to restart because pool %d uses backend %d",
+                                                                                       process_info[i].pid, j, node_id)));
+                                                       restart = true;
+                                                       break;
+                                               }
+                                       }
+                               }
+                       }
+                       else
+                               restart = true;
+
+                       if (restart)
+                       {
+                               if (process_info[i].pid)
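+                               /*
+                                * Kill the child and immediately fork a replacement,
+                                * resetting its per-process bookkeeping.
+                                */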
+                               {
+                                       kill(process_info[i].pid, SIGQUIT);
+
+                                       process_info[i].pid = fork_a_child(fds, i);
+                                       process_info[i].start_time = time(NULL);
+                                       process_info[i].client_connection_count = 0;
+                                       process_info[i].status = WAIT_FOR_CONNECT;
+                                       process_info[i].connected = 0;
+                                       process_info[i].wait_for_connect = 0;
+                               }
+                       }
+                       else
+                               process_info[i].need_to_restart = 1;
+               }
+       }
+
+       else
+       {
+               /*
+                * Set the restart request for each child. Children will exit(1)
+                * whenever it is convenient for them.
+                */
+               for (i = 0; i < pool_config->num_init_children; i++)
+               {
+                       process_info[i].need_to_restart = 1;
+               }
+       }
+
+       /*
+        * Send restart request to worker child.
+        */
+       kill(worker_pid, SIGUSR1);
+
+       if (failover_context->sync_required)
+               wd_failover_end();
+
+       if (failover_context->reqkind == NODE_UP_REQUEST)
+       {
+               ereport(LOG,
+                               (errmsg("failback done. reconnect host %s(%d)",
+                                               BACKEND_INFO(node_id).backend_hostname,
+                                               BACKEND_INFO(node_id).backend_port)));
+
+               /* Fork health check process if needed */
+               for (i = 0; i < NUM_BACKENDS; i++)
+               {
+                       if (health_check_pids[i] == 0)
+                       {
+                               ereport(LOG,
+                                               (errmsg("start health check process for host %s(%d)",
+                                                               BACKEND_INFO(i).backend_hostname,
+                                                               BACKEND_INFO(i).backend_port)));
+
+                               health_check_pids[i] = worker_fork_a_child(PT_HEALTH_CHECK, do_health_check_child, &i);
+                       }
+               }
+       }
+       else if (failover_context->reqkind == PROMOTE_NODE_REQUEST)
+       {
+               ereport(LOG,
+                               (errmsg("promotion done. promoted host %s(%d)",
+                                               BACKEND_INFO(node_id).backend_hostname,
+                                               BACKEND_INFO(node_id).backend_port)));
+       }
+       else
+       {
+               /*
+                * Temporary black magic. Without this, regression test 055
+                * does not finish.
+                */
+               fprintf(stderr, "%s done. shutdown host %s(%d)",
+                               (failover_context->reqkind == NODE_DOWN_REQUEST) ? "failover" : "quarantine",
+                               BACKEND_INFO(node_id).backend_hostname,
+                               BACKEND_INFO(node_id).backend_port);
+
+               ereport(LOG,
+                               (errmsg("%s done. shutdown host %s(%d)",
+                                               (failover_context->reqkind == NODE_DOWN_REQUEST) ? "failover" : "quarantine",
+                                               BACKEND_INFO(node_id).backend_hostname,
+                                               BACKEND_INFO(node_id).backend_port)));
+       }
+       failover_context->need_to_restart_pcp = true;
+}
+
+/*
+ * Kick wakeup_handler in pcp_child to notify it that failover/failback is done.
+ */
+static void
+exec_notice_pcp_child(FAILOVER_CONTEXT *failover_context)
+{
+       int                     status;
+       int                     sts;
+
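+       /* SIGUSR2 triggers wakeup_handler in the PCP child. */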
+       kill(pcp_pid, SIGUSR2);
+
+       if (failover_context->need_to_restart_pcp)
+       {
+               sleep(1);
+
+               /*
+                * Send restart request to pcp child.
+                */
+               kill(pcp_pid, SIGUSR1);
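+
+               /*
+                * Wait for the PCP child to exit, retrying while waitpid() is
+                * interrupted or fails.
+                */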
+               for (;;)
+               {
+                       sts = waitpid(pcp_pid, &status, 0);
+                       if (sts != -1)
+                               break;
+
+                       if (errno == EINTR)
+                               continue;
+                       else
+                       {
+                               ereport(WARNING,
+                                               (errmsg("failover: waitpid failed"),
+                                                errdetail("%m")));
+                               continue;
+                       }
+               }
+               if (WIFSIGNALED(status))
+                       ereport(LOG,
+                                       (errmsg("PCP child %d exits with status %d by signal %d in failover()", pcp_pid, status, WTERMSIG(status))));
+               else
+                       ereport(LOG,
+                                       (errmsg("PCP child %d exits with status %d in failover()", pcp_pid, status)));
+
+               pcp_pid = pcp_fork_a_child(pcp_unix_fd, pcp_inet_fd, pcp_conf_file);
+               ereport(LOG,
+                               (errmsg("fork a new PCP child pid %d in failover()", pcp_pid)));
+       }
+}
+
+/*
+ * -------------------------------------------------------------------------
+ * Subroutines for failover() end
+ * -------------------------------------------------------------------------
+ */