Overhaul health check debug facility.
authorTatsuo Ishii <ishii@sraoss.co.jp>
Tue, 6 Aug 2019 02:27:30 +0000 (11:27 +0900)
committerTatsuo Ishii <ishii@sraoss.co.jp>
Tue, 6 Aug 2019 02:27:30 +0000 (11:27 +0900)
check_backend_down_request() in health_check.c is intended to simulate
the situation where communication failure between health check and
PostgreSQL backend node by creating a file containing lines:

1 down

where the first numeric is the node id starting from 0, tab, and
"down". When health check process finds the file, let health check
fails on node 1.

After health check brings the node into down status,
check_backend_down_request() change "down" to "already_down" to
prevent repeating node failure.

However, questions is, this is necessary at all. I think
check_backend_down_request() should keep on reporting the down status
and it should be called inside establish_persistent_connection() to
prevent repeating node failure because it could be better simulated
the failing situation in this way. For example, currently the health
check retry is not simulated but the new way can do it.

Moreover, in current watchdog implementation, to bring a node into
quarantine state requires *two" times of node communication error
detection. Since check_backend_down_request() only allows to raise
node down even *once" (after the down state is changed to already_down
state), it's impossible to test the watchdog quarantine using
check_backend_down_request(). I changed check_backend_down_request()
so that it continues to raise "down" event as long as the down request
file exists.

This commit enhances check_backend_down_request() as described above.

1) caller of check_backend_down_request() is
   establish_persistent_connection(), rather than
   do_health_check_child().

2) check_backend_down_request() does not change "down" to
   "already_down" anymore. This means that the second argument of
   check_backend_down_request() is not useful anymore. Probably I
   should remove the argument later on.

src/main/health_check.c

index 75fd63821f6eece6aa2b8e10807366e3611f32c3..fe4ff30d4f905c48a18a6edd12ea921abb31c916 100644 (file)
@@ -183,11 +183,7 @@ do_health_check_child(int *node_id)
 
                        result = establish_persistent_connection(*node_id);
 
-#ifdef HEALTHCHECK_DEBUG
-                       if (check_backend_down_request(*node_id, false) || (result && slot == NULL))
-#else
                        if (result && slot == NULL)
-#endif
                        {
                                if (POOL_DISALLOW_TO_FAILOVER(BACKEND_INFO(*node_id).flag))
                                {
@@ -216,15 +212,12 @@ do_health_check_child(int *node_id)
                                        }
                                }
                        }
-                       else if (bkinfo->backend_status == CON_DOWN && bkinfo->quarantine == true)
+                       else if (slot && bkinfo->backend_status == CON_DOWN && bkinfo->quarantine == true)
                        {
                                /* The node has become reachable again. Reset
                                 * the quarantine state
                                 */
-#ifdef HEALTHCHECK_DEBUG
-                               if (check_backend_down_request(*node_id, true) == false)
-#endif
-                                       send_failback_request(*node_id, false, REQ_DETAIL_UPDATE | REQ_DETAIL_WATCHDOG);
+                               send_failback_request(*node_id, false, REQ_DETAIL_UPDATE | REQ_DETAIL_WATCHDOG);
                        }
 
                        /* Discard persistent connections */
@@ -316,6 +309,13 @@ establish_persistent_connection(int node)
                                CLEAR_ALARM;
                        }
 
+#ifdef HEALTHCHECK_DEBUG
+                       if (slot && check_backend_down_request(node, false) == true)
+                       {
+                               discard_persistent_connection(node);
+                       }
+#endif
+
                        if (slot)
                        {
                                if (retry_cnt != pool_config->health_check_params[node].health_check_max_retries)
@@ -342,6 +342,7 @@ establish_persistent_connection(int node)
 
                if (password)
                        pfree(password);
+
                if (check_failback && !Req_info->switching && slot)
                {
                                ereport(LOG,
@@ -544,6 +545,7 @@ check_backend_down_request(int node, bool done_requests)
        if (!found)
                return false;
 
+#ifdef NOT_USED
        fd = fopen(backend_down_request_file, "w");
        if (!fd)
        {
@@ -564,6 +566,7 @@ check_backend_down_request(int node, bool done_requests)
                return false;
        }
        fclose(fd);
+#endif
 
        return true;
 }