From 383cb4e802d971c4c23eeb1c29b8d9d8858fc1c2 Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Fri, 18 Jul 2025 22:32:49 +0300 Subject: [PATCH 1/2] Added backup-related metrics and dashboard --- ...Dashboard_1_Node_performance_overview.json | 10 +- ...Dashboard_2_Aggregated_query_analysis.json | 10 +- .../Dashboard_3_Single_query_analysis.json | 10 +- .../Dashboard_4_Wait_Sampling_Dashboard.json | 14 +- .../dashboards/Dashboard_5_Backup_stats.json | 391 ++++++++++++++++++ config/pgwatch-prometheus/metrics.yml | 78 +++- 6 files changed, 489 insertions(+), 24 deletions(-) create mode 100644 config/grafana/dashboards/Dashboard_5_Backup_stats.json diff --git a/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json b/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json index 183ea61..dd1b1d6 100644 --- a/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json +++ b/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 4, + "id": 2, "links": [], "panels": [ { @@ -4026,8 +4026,8 @@ }, { "current": { - "text": "target_database", - "value": "target_database" + "text": "postgres", + "value": "postgres" }, "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", @@ -4050,7 +4050,7 @@ }, "timepicker": {}, "timezone": "utc", - "title": "1. Postgres node performance overview (high-level)", + "title": "1. 
Single node performance overview (high-level)", "uid": "f90500a0-a12e-4081-a2f0-07ed96f27915", - "version": 5 + "version": 2 } \ No newline at end of file diff --git a/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json b/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json index abe650f..ed89800 100644 --- a/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json +++ b/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 1, + "id": 3, "links": [], "panels": [ { @@ -2277,8 +2277,8 @@ }, { "current": { - "text": "target_database", - "value": "target_database" + "text": "postgres", + "value": "postgres" }, "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", @@ -2301,7 +2301,7 @@ }, "timepicker": {}, "timezone": "utc", - "title": "2. Postgres aggregated query performance analysis", + "title": "2. 
Query performance analysis (top-N)", "uid": "3ceb2e98-639d-48df-8e1f-7686d2052170", - "version": 5 + "version": 2 } \ No newline at end of file diff --git a/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json b/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json index a0a5dae..be090ad 100644 --- a/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json +++ b/config/grafana/dashboards/Dashboard_3_Single_query_analysis.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 8, + "id": 4, "links": [], "panels": [ { @@ -2173,8 +2173,8 @@ }, { "current": { - "text": "target_database", - "value": "target_database" + "text": "postgres", + "value": "postgres" }, "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", @@ -2197,7 +2197,7 @@ }, "timepicker": {}, "timezone": "utc", - "title": "3. Postgres single query performance analysis", + "title": "3. 
Single queryid analysis", "uid": "db52944d-b025-4e18-b70b-89c0af3e7e41", - "version": 26 + "version": 2 } \ No newline at end of file diff --git a/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json b/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json index 41369d3..295b2ba 100644 --- a/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json +++ b/config/grafana/dashboards/Dashboard_4_Wait_Sampling_Dashboard.json @@ -18,7 +18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 11, + "id": 1, "links": [], "panels": [ { @@ -688,8 +688,10 @@ "allowCustomValue": false, "current": { "text": [ + "CPU*" ], "value": [ + "CPU*" ] }, "definition": "label_values(pgwatch_wait_events_total,wait_event_type)", @@ -711,8 +713,10 @@ "allowCustomValue": false, "current": { "text": [ + "CPU*" ], "value": [ + "CPU*" ] }, "definition": "label_values(pgwatch_wait_events_total{wait_event_type=~\"$wait_event_type\"},wait_event)", @@ -767,8 +771,8 @@ }, { "current": { - "text": "target_database", - "value": "target_database" + "text": "postgres", + "value": "postgres" }, "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", "label": "DB name", @@ -791,7 +795,7 @@ }, "timepicker": {}, "timezone": "browser", - "title": "4. Wait sampling dashboard", + "title": "4. 
Wait event analysis (Active Session History)", "uid": "a222b233-acef-4bac-a451-1591023e4d4f", - "version": 13 + "version": 2 } \ No newline at end of file diff --git a/config/grafana/dashboards/Dashboard_5_Backup_stats.json b/config/grafana/dashboards/Dashboard_5_Backup_stats.json new file mode 100644 index 0000000..61e1b60 --- /dev/null +++ b/config/grafana/dashboards/Dashboard_5_Backup_stats.json @@ -0,0 +1,391 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 5, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + 
"options": "B" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": "30s", + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "irate(pgwatch_archive_lag_archived_count[$__rate_interval])", + "hide": false, + "instant": false, + "interval": "60", + "legendFormat": "Archived count", + "range": true, + "refId": "B" + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(pgwatch_archive_lag_failed_count[$__rate_interval])", + "interval": "60", + "legendFormat": "Failed count", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "pgwatch_archive_lag_archived_count", + "hide": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "C" + } + ], + "title": "Archive success and errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + 
"spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "editorMode": "code", + "expr": "irate(pgwatch_archive_lag_archived_count[$__rate_interval])/(irate(pgwatch_archive_lag_archived_count[$__rate_interval])+irate(pgwatch_archive_lag_failed_count[$__rate_interval]))", + "interval": "60", + "legendFormat": "Percentage of succesful operations", + "range": true, + "refId": "A" + } + ], + "title": "WAL archive success rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "pgwatch_archive_lag_current_lsn_numeric - pgwatch_archive_lag_archived_wal_finish_lsn_numeric", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "10", + "legendFormat": "Archive lag", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Archive lag", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "5. Backups and DR", + "uid": "cb67ac04-67fa-4c70-968c-5bc5ece03aa7", + "version": 19 +} \ No newline at end of file diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 0fc45f9..0712c04 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -1956,6 +1956,75 @@ metrics: order by grp, index_size_bytes desc; gauges: - '*' + + archive_lag: + description: > + This metric measures the lag in WAL archive processing. + It provides insights into the time taken to archive WAL logs and the current status of the archive process. + This metric helps administrators monitor the WAL archive process and identify any performance issues. 
+ sqls: + 11: | + -- postgresql wal archiving lag monitor + with wal_info as ( + select + last_archived_wal, + last_archived_time, + substr(last_archived_wal, 9, 8) as log_id_hex, + ('x' || substr(last_archived_wal, 17, 8))::bit(32)::bigint as segment_dec, + archived_count, + failed_count, + -- wal_segment_size is already in bytes! + (select setting::bigint + from pg_settings + where name = 'wal_segment_size') as wal_segment_size_bytes + from pg_stat_archiver + where last_archived_wal is not null + ) + select + pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')::bigint as current_lsn_numeric, + pg_wal_lsn_diff((log_id_hex || '/' || lpad(to_hex((segment_dec + 1) * wal_segment_size_bytes), 8, '0'))::pg_lsn, '0/0')::bigint as archived_wal_finish_lsn_numeric, + pg_size_pretty( + pg_wal_lsn_diff( + pg_current_wal_lsn(), + (log_id_hex || '/' || lpad(to_hex(segment_dec * wal_segment_size_bytes), 8, '0'))::pg_lsn + ) + ) as lag_human, + + -- dynamic calculation based on actual segment size + round( + pg_wal_lsn_diff( + pg_current_wal_lsn(), + (log_id_hex || '/' || lpad(to_hex(segment_dec * wal_segment_size_bytes), 8, '0'))::pg_lsn + )::numeric / wal_segment_size_bytes, + 2 + ) as wal_files_behind, + + pg_size_pretty(wal_segment_size_bytes) as wal_segment_size, + extract(epoch from (now() - last_archived_time))::int as seconds_since_archive, + + -- alert thresholds based on segment size + case + when pg_wal_lsn_diff( + pg_current_wal_lsn(), + (log_id_hex || '/' || lpad(to_hex(segment_dec * wal_segment_size_bytes), 8, '0'))::pg_lsn + ) > 6 * wal_segment_size_bytes then 'critical' + when pg_wal_lsn_diff( + pg_current_wal_lsn(), + (log_id_hex || '/' || lpad(to_hex(segment_dec * wal_segment_size_bytes), 8, '0'))::pg_lsn + ) > 3 * wal_segment_size_bytes then 'warning' + else 'ok' + end as tag_status, + + archived_count, + failed_count + from wal_info; + gauges: + - 'seconds_since_archive' + - 'archived_count' + - 'failed_count' + - 'wal_files_behind' + - 'current_lsn_numeric' + - 
'archived_wal_finish_lsn_numeric' presets: full: @@ -2006,7 +2075,8 @@ presets: pg_gin_index: 5 pg_table_bloat: 86400 pg_btree_bloat: 86400 - pg_invalid_indexes: 86400 - redundant_indexes: 86400 - unused_indexes: 86400 - rarely_used_indexes: 86400 \ No newline at end of file + pg_invalid_indexes: 60 + redundant_indexes: 60 + unused_indexes: 60 + rarely_used_indexes: 60 + archive_lag: 60 \ No newline at end of file -- GitLab From e9ffd8b041acb7c2946c8f257909f3a8b4b95690 Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Mon, 21 Jul 2025 16:16:23 +0300 Subject: [PATCH 2/2] Removed lag_human from metrics.yml --- config/pgwatch-prometheus/metrics.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 0712c04..fe223a7 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -1966,7 +1966,7 @@ metrics: 11: | -- postgresql wal archiving lag monitor with wal_info as ( - select + select last_archived_wal, last_archived_time, substr(last_archived_wal, 9, 8) as log_id_hex, @@ -1983,14 +1983,7 @@ metrics: select pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')::bigint as current_lsn_numeric, pg_wal_lsn_diff((log_id_hex || '/' || lpad(to_hex((segment_dec + 1) * wal_segment_size_bytes), 8, '0'))::pg_lsn, '0/0')::bigint as archived_wal_finish_lsn_numeric, - pg_size_pretty( - pg_wal_lsn_diff( - pg_current_wal_lsn(), - (log_id_hex || '/' || lpad(to_hex(segment_dec * wal_segment_size_bytes), 8, '0'))::pg_lsn - ) - ) as lag_human, - -- dynamic calculation based on actual segment size round( pg_wal_lsn_diff( pg_current_wal_lsn(), -- GitLab