From 775dfab60f4e6df4e6f5e1a1e2ef71e240230adb Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Tue, 26 Aug 2025 17:59:45 +0300 Subject: [PATCH 1/3] Remove others aggregation from pg_stat_statements metric in prometheus - Removed UNION ALL clause that aggregated queries beyond top 500 - This allows all individual queries to be tracked without artificial grouping - Improves granularity of pg_stat_statements monitoring data --- config/pgwatch-prometheus/metrics.yml | 48 --------------------------- 1 file changed, 48 deletions(-) diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 406e236..5d0ffae 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -901,30 +901,6 @@ metrics: temp_bytes_read::int8 as temp_bytes_read, temp_bytes_written::int8 as temp_bytes_written from ranked_statements - where rn <= 500 - union all - select - 'tail_dummy_user' as tag_user, - current_database() as tag_datname, - -1 as tag_queryid, - sum(calls)::int8 as calls, - sum(plans_total)::int8 as plans_total, - sum(exec_time_total)::int8 as exec_time_total, - sum(plan_time_total)::int8 as plan_time_total, - sum(rows)::int8 as rows, - sum(shared_bytes_hit_total)::int8 as shared_bytes_hit_total, - sum(shared_bytes_read_total)::int8 as shared_bytes_read_total, - sum(shared_bytes_dirtied_total)::int8 as shared_bytes_dirtied_total, - sum(shared_bytes_written_total)::int8 as shared_bytes_written_total, - sum(block_read_total)::int8 as block_read_total, - sum(block_write_total)::int8 as block_write_total, - sum(wal_records)::int8 as wal_records, - sum(wal_fpi)::int8 as wal_fpi, - sum(wal_bytes)::int8 as wal_bytes, - sum(temp_bytes_read)::int8 as temp_bytes_read, - sum(temp_bytes_written)::int8 as temp_bytes_written - from ranked_statements - where rn > 500 17: | WITH ranked_statements as ( select @@ -973,30 +949,6 @@ metrics: temp_bytes_read::int8 as 
temp_bytes_read, temp_bytes_written::int8 as temp_bytes_written from ranked_statements - where rn <= 500 - union all - select - 'tail_dummy_user' as tag_user, - current_database() as tag_datname, - -1 as tag_queryid, - sum(calls)::int8 as calls, - sum(plans_total)::int8 as plans_total, - sum(exec_time_total)::int8 as exec_time_total, - sum(plan_time_total)::int8 as plan_time_total, - sum(rows)::int8 as rows, - sum(shared_bytes_hit_total)::int8 as shared_bytes_hit_total, - sum(shared_bytes_read_total)::int8 as shared_bytes_read_total, - sum(shared_bytes_dirtied_total)::int8 as shared_bytes_dirtied_total, - sum(shared_bytes_written_total)::int8 as shared_bytes_written_total, - sum(block_read_total)::int8 as block_read_total, - sum(block_write_total)::int8 as block_write_total, - sum(wal_records)::int8 as wal_records, - sum(wal_fpi)::int8 as wal_fpi, - sum(wal_bytes)::int8 as wal_bytes, - sum(temp_bytes_read)::int8 as temp_bytes_read, - sum(temp_bytes_written)::int8 as temp_bytes_written - from ranked_statements - where rn > 500 gauges: - calls - plans_total -- GitLab From 6a9a33d9f1e24a311876536cc53164970324ada7 Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Thu, 28 Aug 2025 16:19:58 +0300 Subject: [PATCH 2/3] Added bgwriter and vacuum graphs --- ...Dashboard_1_Node_performance_overview.json | 331 +++++++++++++++++- .../Dashboard_7_Autovacuum_and_bloat.json | 317 ++++++++++++++--- ...ble_Stats => Dashboard_9_Table_Stats.json} | 0 3 files changed, 579 insertions(+), 69 deletions(-) rename config/grafana/dashboards/{Dashboard_9_Table_Stats => Dashboard_9_Table_Stats.json} (100%) diff --git a/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json b/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json index 600fb55..b5d9ad8 100644 --- a/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json +++ b/config/grafana/dashboards/Dashboard_1_Node_performance_overview.json @@ -18,7 
+18,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 2, + "id": 8, "links": [], "panels": [ { @@ -1157,7 +1157,38 @@ }, "unit": "percent" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, @@ -1185,6 +1216,20 @@ }, "pluginVersion": "12.0.2", "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "sum(irate(pgwatch_db_stats_xact_rollback{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))/(sum(irate(pgwatch_db_stats_xact_commit{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))+sum(irate(pgwatch_db_stats_xact_rollback{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))) * 100", + "hide": false, + "instant": false, + "interval": "20", + "legendFormat": "Rollbacks", + "range": true, + "refId": "B" + }, { "editorMode": "code", "expr": "sum(irate(pgwatch_db_stats_xact_commit{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))/(sum(irate(pgwatch_db_stats_xact_commit{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))+sum(irate(pgwatch_db_stats_xact_rollback{cluster='$cluster_name', node_name='$node_name', datname=\"$db_name\"}[$__rate_interval]))) * 100", @@ -1194,7 +1239,7 @@ "refId": "A" } ], - "title": "Commit ratio", + "title": "Commit vs Rollback ratio", "type": "timeseries" }, { @@ -3791,7 +3836,269 @@ "refId": "A" } ], - "title": "Age of the oldest multi-transaction ID", + "title": "Age of the oldest 
multi-transaction ID that has not been frozen", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 149 + }, + "id": 41, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(pgwatch_bgwriter_buffers_checkpoint{datname=\"$db_name\"}[$__rate_interval]) * on(datname) pgwatch_settings_numeric_value{datname=\"$db_name\", setting_name=\"block_size\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "20", + "legendFormat": "Size of buffers cleaned by the checkpointer", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": 
"irate(pgwatch_bgwriter_buffers_clean[$__rate_interval]) * on(datname) pgwatch_settings_numeric_value{datname=\"$db_name\", setting_name=\"block_size\"}", + "hide": false, + "instant": false, + "interval": "20", + "legendFormat": "Size of buffers cleaned by the background writer", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "editorMode": "code", + "expr": "irate(pgwatch_bgwriter_buffers_backend[$__rate_interval]) * on(datname) pgwatch_settings_numeric_value{datname=\"$db_name\", setting_name=\"block_size\"}", + "hide": false, + "instant": false, + "interval": "20", + "legendFormat": "Size of buffers cleaned directly by a backend.", + "range": true, + "refId": "C" + } + ], + "title": "bgwriter and checkpointer", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 7, + "axisSoftMin": 1, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [ + { + "options": { + "1": { + "index": 0, + "text": "Initial" + }, + "2": { + "index": 1, + "text": "Scanning heap" + }, + "3": { + "index": 2, + "text": "Vacuuming indexes" + }, + "4": { + "index": 3, + "text": "Vacuuming heap" + }, + "5": { + "index": 4, + "text": "Cleaning up indexes" + }, + "6": { + "index": 5, 
+ "text": "Truncating heap" + }, + "7": { + "index": 6, + "text": "Final cleanup" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 24, + "x": 0, + "y": 159 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "expr": "(\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"7\"}) * 0 + 7 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"6\"}) * 0 + 6 or\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"5\"}) * 0 + 5 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"4\"}) * 0 + 4 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"3\"}) * 0 + 3 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"2\"}) * 0 + 2 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"1\"}) * 0 + 1\n)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "", + "legendFormat": "{{schema_name}}.{{table_name}} - {{vacuum_mode}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Vacuum timeline", "type": "timeseries" }, { @@ -3803,7 +4110,7 @@ "h": 3, "w": 24, "x": 0, - "y": 149 + "y": 170 }, "id": 40, "options": { @@ -3827,8 +4134,8 @@ "list": [ { "current": { - "text": "local", - "value": "local" + "text": "default", + 
"value": "default" }, "definition": "label_values(pgwatch_settings_configured,cluster)", "label": "Cluster name", @@ -3845,8 +4152,8 @@ }, { "current": { - "text": "node-01", - "value": "node-01" + "text": "prod-db", + "value": "prod-db" }, "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", "label": "Node name", @@ -3882,12 +4189,12 @@ ] }, "time": { - "from": "now-30m", + "from": "now-2d", "to": "now" }, "timepicker": {}, "timezone": "utc", - "title": "01. Single node performance overview (high-level)", + "title": "01. Single node performance overview (high-level)", "uid": "f90500a0-a12e-4081-a2f0-07ed96f27915", - "version": 2 + "version": 7 } \ No newline at end of file diff --git a/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json b/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json index 0f9e858..21b981e 100644 --- a/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json +++ b/config/grafana/dashboards/Dashboard_7_Autovacuum_and_bloat.json @@ -1,65 +1,268 @@ { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 6, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "P7A0D6631BB10B34F" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 7, + "axisSoftMin": 1, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 0, + "gradientMode": 
"none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" + "decimals": 0, + "mappings": [ + { + "options": { + "1": { + "index": 0, + "text": "Initial" + }, + "2": { + "index": 1, + "text": "Scanning heap" + }, + "3": { + "index": 2, + "text": "Vacuuming indexes" + }, + "4": { + "index": 3, + "text": "Vacuuming heap" + }, + "5": { + "index": 4, + "text": "Cleaning up indexes" + }, + "6": { + "index": 5, + "text": "Truncating heap" + }, + "7": { + "index": 6, + "text": "Final cleanup" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "expr": "(\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"7\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 7 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"6\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 6 or\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"5\", 
schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 5 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"4\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 4 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"3\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 3 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"2\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 2 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"1\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 1\n)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "", + "legendFormat": "{{schema_name}}.{{table_name}} - {{vacuum_mode}}", + "range": true, + "refId": "A", + "useBackend": false } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": 9, - "links": [], - "panels": [ + ], + "title": "Vacuum timeline", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [ { - "fieldConfig": { - "defaults": {}, - "overrides": [] + "current": { + "text": "default", + "value": "default" }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 + "definition": "label_values(pgwatch_settings_configured,cluster)", + "label": "Cluster name", + "name": "cluster_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_settings_configured,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "id": 1, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "# Coming soon...", - "mode": "markdown" + "refresh": 1, + "regex": "", + "type": "query" + }, + { + 
"current": { + "text": "prod-db", + "value": "prod-db" + }, + "definition": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "label": "Node name", + "name": "node_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_settings_configured{cluster=\"$cluster_name\"},node_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": { + "text": "postgres", + "value": "postgres" + }, + "definition": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "label": "DB name", + "name": "db_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_pg_database_wraparound_age_datfrozenxid{cluster=\"$cluster_name\", node_name=\"$node_name\", datname!=\"template1\"},datname)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".+", + "current": { + "text": "public", + "value": "public" + }, + "definition": "label_values(pgwatch_table_stats_tx_freeze_age,schema)", + "includeAll": true, + "name": "schema_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_table_stats_tx_freeze_age,schema)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".+", + "current": { + "text": "All", + "value": [ + "$__all" + ] + }, + "definition": "label_values(pgwatch_table_stats_tx_freeze_age{schema=~\"$schema_name\"},table_name)", + "includeAll": true, + "multi": true, + "name": "table_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(pgwatch_table_stats_tx_freeze_age{schema=~\"$schema_name\"},table_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "pluginVersion": "12.0.2", - "title": 
"", - "type": "text" + "refresh": 1, + "regex": "", + "type": "query" } - ], - "preload": false, - "schemaVersion": 41, - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "07. Autovacuum and bloat -- \"Metrics are collected (part of health check); dashboard – TODO\"", - "uid": "caffad19-4605-41fe-87f7-484ab67200e8", - "version": 1 - } \ No newline at end of file + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "07. Autovacuum and bloat -- \"Metrics are collected (part of health check); dashboard – TODO\"", + "uid": "caffad19-4605-41fe-87f7-484ab67200e8", + "version": 5 +} \ No newline at end of file diff --git a/config/grafana/dashboards/Dashboard_9_Table_Stats b/config/grafana/dashboards/Dashboard_9_Table_Stats.json similarity index 100% rename from config/grafana/dashboards/Dashboard_9_Table_Stats rename to config/grafana/dashboards/Dashboard_9_Table_Stats.json -- GitLab From 842f289d989f296caceb34ea77c7fb0f7286cc6a Mon Sep 17 00:00:00 2001 From: "dementii.priadko" <45518657+DEMNERD@users.noreply.github.com> Date: Tue, 2 Sep 2025 16:53:23 +0300 Subject: [PATCH 3/3] docs: add PostgresAI monitoring reference documentation Add comprehensive monitoring reference guide that documents: - PostgresAI monitoring architecture and components - Detailed dashboard descriptions and key metrics - Complete graph inventory across all 9 dashboards - Updated to follow PostgresAI documentation standards: * Sentence-style capitalization throughout * Consistent terminology (Postgres vs PostgreSQL) * Professional formatting and structure --- MONITORING_REFERENCE.md | 206 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 MONITORING_REFERENCE.md diff --git a/MONITORING_REFERENCE.md b/MONITORING_REFERENCE.md new file mode 100644 index 0000000..65148da --- /dev/null +++ 
b/MONITORING_REFERENCE.md @@ -0,0 +1,206 @@ +# PostgresAI monitoring reference documentation + +## Overview + +PostgresAI monitoring is a comprehensive Postgres database monitoring solution built on pgwatch, Grafana, and Prometheus. This system provides real-time insights into Postgres database performance, health, and operations through a set of specialized dashboards. + +## Architecture + +The monitoring stack consists of: +- **pgwatch**: Postgres monitoring agent that collects metrics +- **Grafana**: Visualization and dashboard platform +- **Flask Backend**: Additional API services for enhanced functionality +- **Prometheus and Postgres**: Storage for metrics and query texts + +## Dashboard Reference + +### Dashboard 1: Node Performance Overview +**Purpose**: High-level overview of Postgres database performance and health + +**Key Metrics**: +- **Active session history**: Database wait events by type (CPU, locks, I/O) +- **Sessions**: Connection states (Active, Idle, Idle-in-transaction, Waiting) +- **Transactions**: Commit vs rollback ratios and rates +- **Query performance**: Calls, execution time, and latency metrics +- **Buffer cache**: Hit ratios and I/O patterns +- **WAL activity**: Write-ahead log generation and archiving + +### Dashboard 2: Aggregated Query Analysis +**Purpose**: Identify top-performing and problematic queries across the database + +**Key Metrics**: +- **Detailed table view**: Table of stats for each query from pg_stat_statements +- **Top queries by calls**: Most frequently executed queries +- **Top queries by execution time**: Queries consuming most total time +- **Top queries by latency**: Slowest individual query executions +- **I/O analysis**: Queries with highest disk read/write activity +- **Buffer usage**: Queries with best/worst cache efficiency +- **Temp file usage**: Queries spilling to disk for sorting/hashing +- **WAL generation**: Queries generating most write-ahead log data + + +### Dashboard 3: Single Query Analysis 
+**Purpose**: Deep-dive analysis of individual queries by query ID + +**Key Metrics**: +- **Execution Timeline**: Calls and execution time over time +- **Wait Events**: Specific wait types for this query +- **Resource Usage**: Buffer hits, disk I/O, WAL generation +- **Performance Metrics**: Latency, rows returned, temp file usage +- **Per-Call Analysis**: Average metrics per query execution + + +### Dashboard 4: Wait sampling dashboard +**Purpose**: Detailed analysis of database wait events and blocking + +**Key Metrics**: +- **Active session history**: All wait events including background processes +- **Active session history by event type**: Detailed categorization by event type +- **Active session history by event type and event**: Wait events correlated with specific queries + +### Dashboard 5: Backup stats +**Purpose**: Monitor backup and recovery processes + +**Key Metrics**: +- **Archive success and errors**: Rate of successful WAL archives versus failed archive attempts +- **Archive lag**: Amount of WAL data in bytes that has been generated but not yet archived +- **WAL archive success rate**: Percentage of successful WAL archive operations + +### Dashboard 7: Autovacuum and bloat +**Purpose**: Monitor Postgres maintenance processes and table health + +**Key Metrics**: +- **Vacuum Timeline**: Autovacuum progress through different phases + + +### Dashboard 8: Index health +**Purpose**: Monitor index performance and maintenance needs + +**Key Metrics**: +- **Index Bloat** +- **Index Size** + + +### Dashboard 9: Table stats +**Purpose**: Monitor table-level operations and data patterns + +**Key Metrics**: +- **CRUD operations**: Insert, update, delete rates by table + + +## Complete Graph Inventory + +### Dashboard 1: Node Performance Overview (36 graphs) +1. **Active session history** - Shows database wait events by type (CPU, locks, I/O) to identify performance bottlenecks +2. 
**Host stats** - Displays system-level metrics like CPU, memory, and disk usage +3. **Postgres stats** - Core Postgres instance metrics and version information +4. **Sessions** - Connection states (Active, Idle, Idle-in-transaction, Waiting) with max_connections limit +5. **Non-idle sessions** - Active database connections excluding idle ones for workload monitoring +6. **Calls (pg_stat_statements)** - Total SQL statement executions per second across all queries +7. **Transactions** - Transaction commit vs rollback rates and overall transaction activity +9. **Commit vs rollback ratio** - Ratio of successful vs failed transactions indicating application health +10. **Statements total time (pg_stat_statements)** - Total execution time per second for all SQL statements +11. **Statements time per call (pg_stat_statements) aka latency** - Average execution time per query call (key latency metric) +12. **Total rows (pg_stat_statements)** - Total rows returned per second across all queries +13. **Rows per call (pg_stat_statements)** - Average rows returned per query execution +14. **blk_read_time vs blk_write_time (s/s) (pg_stat_statements)** - Time spent reading/writing disk blocks per second +15. **blk_read_time vs blk_write_time per call (pg_stat_statements)** - Average disk I/O time per query execution +16. **shared_blks_hit (bytes) (pg_stat_statements)** - Data read from shared buffer cache (good performance indicator) +17. **shared_blks_hit (bytes) per call (pg_stat_statements)** - Average cache hits per query execution +18. **shared_blks_read (bytes) (pg_stat_statements)** - Data read from disk (cache misses - expensive operations) +19. **shared_blks_read (bytes) per call (pg_stat_statements)** - Average disk reads per query execution +20. **shared_blks_written (bytes) (pg_stat_statements)** - Data written from buffers to disk per second +21. **shared_blks_written (bytes) per call (pg_stat_statements)** - Average buffer writes per query execution +22. 
**shared_blks_dirtied (bytes) (pg_stat_statements)** - Buffer blocks modified (dirtied) per second +23. **shared_blks_dirtied (bytes) per call (pg_stat_statements)** - Average buffer modifications per query +24. **shared_blks_read_ratio (pg_stat_statements)** - Cache miss ratio (< 10-20% indicates good cache efficiency) +25. **WAL bytes (pg_current_wal_lsn)** - Write-ahead log generation rate (affects replication and recovery) +26. **WAL bytes per call (pg_current_wal_lsn)** - Average WAL generation per query execution +27. **WAL fpi (pg_stat_statements)** - WAL full page images generated per second +28. **WAL fpi per call (pg_current_wal_lsn)** - Average full page images per query execution +29. **temp_bytes_read vs temp_bytes_written (pg_stat_statements)** - Temporary file I/O operations +30. **temp_bytes_read vs temp_bytes_written per call (pg_stat_statements)** - Average temp file usage per query +31. **Locks by mode** - Active locks by type (AccessShareLock, RowExclusiveLock, etc.) +32. **Longest non-idle transaction age, > 1 min** - Age of oldest active transaction (>1min threshold) +33. **Age of the oldest transaction ID that has not been frozen** - Transaction ID age (watch for wraparound issues) +34. **Age of the oldest multi-transaction ID that has not been frozen** - Multi-transaction ID age monitoring +35. **bgwriter and checkpointer** - Background writer vs checkpointer activity comparison +36. **Vacuum timeline** - VACUUM operation progress through different phases + +### Dashboard 2: Aggregated Query Analysis (25 graphs) +1. **Detailed table view (pg_stat_statements)** - Tabular view of query performance metrics with sorting and filtering +2. **Top $top_n queries analysis (pg_stat_statements)** - Overview of most significant queries by multiple metrics +3. **Top $top_n statements by calls (pg_stat_statements)** - Most frequently executed queries (call frequency) +4. 
**Top $top_n statements by execution time (pg_stat_statements)** - Queries consuming most total execution time +5. **Top $top_n statements by execution time per call (pg_stat_statements)** - Slowest individual query executions +6. **Top $top_n statements by planning time (pg_stat_statements)** - Queries with highest total query planning time +7. **Top $top_n statements by planning time per call (pg_stat_statements)** - Queries with slowest planning per execution +8. **Top $top_n statements by rows (pg_stat_statements)** - Queries returning most total rows +9. **Top $top_n statements by rows per call (pg_stat_statements)** - Queries with highest average rows per execution +10. **Top $top_n statements by shared_blks_hit (in bytes) (pg_stat_statements)** - Queries with best cache efficiency (most hits) +11. **Top $top_n statements by shared_blks_hit (in bytes) per call (pg_stat_statements)** - Best average cache hits per query +12. **Top $top_n statements by shared_blks_read (in bytes) (pg_stat_statements)** - Queries causing most disk reads (worst cache performance) +13. **Top $top_n statements by shared_blks_read (in bytes) per call (pg_stat_statements)** - Highest average disk reads per query +14. **Top $top_n statements by shared_blks_written (in bytes) (pg_stat_statements)** - Queries writing most data to buffers +15. **Top $top_n statements by shared_blks_written (in bytes) per call (pg_stat_statements)** - Highest average buffer writes per query +16. **Top $top_n statements by shared_blks_dirtied (in bytes) per call (pg_stat_statements)** - Queries modifying most buffer data +17. **Top $top_n statements by WAL bytes (pg_stat_statements)** - Queries generating most write-ahead log data +18. **Top $top_n statements by WAL bytes per call (pg_stat_statements)** - Highest average WAL generation per query +19. **Top $top_n statements by WAL fpi (pg_stat_statements)** - Queries generating most WAL full page images +20. 
**Top $top_n statements by WAL fpi per call (pg_stat_statements)** - Highest average FPI generation per query +21. **Top $top_n statements by temp bytes read (pg_stat_statements)** - Queries reading most from temporary files +22. **Top $top_n statements by temp bytes read per call (pg_stat_statements)** - Highest average temp file reads per query +23. **Top $top_n statements by temp bytes written (pg_stat_statements)** - Queries writing most to temporary files +24. **Top $top_n statements by temp bytes written per call (pg_stat_statements)** - Highest average temp file writes per query +25. **Query Analysis panels (multiple instances)** - Drill-down analysis panels for individual queries + +### Dashboard 3: Single Query Analysis (17 graphs) +1. **Active session history** - Wait events specifically for the selected query ID +2. **Calls (pg_stat_statements)** - Execution frequency of the specific query over time +3. **Execution time (pg_stat_statements)** - Total execution time for the specific query per second +4. **Execution time per call (pg_stat_statements)** - Average execution time per call for the specific query +5. **Rows (pg_stat_statements)** - Total rows returned by the specific query per second +6. **Rows per call (pg_stat_statements)** - Average rows returned per execution of the specific query +7. **shared_blks_hit (in bytes) (pg_stat_statements)** - Cache efficiency for the specific query (bytes from memory) +8. **shared_blks_hit (in bytes) per call (pg_stat_statements)** - Average cache hits per execution of the specific query +9. **WAL bytes (pg_stat_statements)** - WAL generation rate for the specific query +10. **WAL bytes per call (pg_stat_statements)** - Average WAL generation per execution of the specific query +11. **WAL fpi (in bytes) (pg_stat_statements)** - Full page images generated by the specific query +12. **WAL fpi per call (pg_stat_statements)** - Average FPI generation per execution of the specific query +13. 
**Temp bytes read (pg_stat_statements)** - Temporary file reads for the specific query +14. **Temp bytes read per call (pg_stat_statements)** - Average temp file reads per execution of the specific query +15. **Temp bytes written (pg_stat_statements)** - Temporary file writes for the specific query +16. **Temp bytes written per call (pg_stat_statements)** - Average temp file writes per execution of the specific query +17. **Query Analysis panels (multiple instances)** - Detailed analysis panels for the selected query + +### Dashboard 4: Wait sampling dashboard (4 graphs) +1. **Active session history** - Comprehensive view of all database wait events including background processes +2. **Active session history by event type** - Wait events grouped by category (CPU, I/O, locks, etc.) +3. **Active session history by event type and event** - Detailed breakdown with specific event names and query IDs +4. **Query Analysis** - Drill-down analysis for queries associated with wait events + +### Dashboard 5: Backup stats (3 graphs) +1. **Archive success and errors** - Rate of successful vs failed WAL archive operations +2. **WAL archive success rate** - Percentage of successful archive operations (should be 100%) +3. **Archive lag** - Amount of WAL data waiting to be archived (data loss window) + +### Dashboard 7: Autovacuum and bloat (1 graph) +1. **Vacuum timeline** - Progress of VACUUM operations through phases (scanning, vacuuming, cleaning, etc.) + +### Dashboard 8: Index health (6 graphs) +1. **Detailed index view** - Tabular view of all indexes with bloat, size, and usage statistics +2. **Top $top_n index analysis** - Overview of most problematic indexes by various metrics +3. **Top $top_n indexes by estimated bloat %** - Indexes with highest percentage of wasted space +4. **Top $top_n indexes by estimated bloat size** - Indexes with largest absolute amount of wasted space +5. **Top $top_n indexes by size** - Largest indexes by total size (memory and disk impact) +6. 
**Query Analysis panels (multiple instances)** - Detailed analysis for index-related queries + +### Dashboard 9: Table stats (7 graphs) +1. **Tuple operations** - Total tuple write operations (insert, update, delete, hot update) across all tables +2. **Tuple operations (%)** - Percentage breakdown of different operation types +3. **Number of inserted tuples by table** - Insert rates for individual tables over time +4. **Number of updated tuples by table** - Update rates for individual tables (watch for bloat impact) +5. **Number of hot updated tuples by table** - HOT (heap-only tuple) update rates by table (efficient updates that avoid index updates) +6. **Number of deleted tuples by table** - Delete rates by table (deleted rows become dead tuples that vacuum must clean up) +7. **Table details panels (multiple instances)** - Detailed statistics and metrics for individual tables + -- GitLab