diff --git a/documentation/metrics.md b/documentation/metrics.md
index 15ef4a1..d507a8e 100644
--- a/documentation/metrics.md
+++ b/documentation/metrics.md
@@ -3691,7 +3691,8 @@ Default config:
### Replication
Default config:
- lag_more_than_in_sec = 300
+ lag_more_than_in_sec = 300\
+ critical_bytes_held_by_non_active_slot = 1073741824 bytes
### Items
@@ -3763,6 +3764,37 @@ Default config:
*Non-active Replication Slots* calculates as count of slots with `false` active status.
+- **Bytes Held By Non-active Replication Slots**
+
+ Zabbix item:
+
+
+ Name |
+ PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME} |
+
+
+ Key |
+ pgsql.replication.non_active_slots_held_bytes[{#NON_ACTIVE_SLOT_NAME}] |
+
+
+ Type |
+ Numeric (float) |
+
+
+ Units |
+ Bytes |
+
+
+ Delta |
+ As Is |
+
+
+ Supported Version |
+ 10+ |
+
+
+
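+ *Bytes Held By Non-active Replication Slots* is calculated as the amount of WAL, in bytes, between the current WAL position (the last replayed position on a replica) and the `restart_lsn` of each replication slot with `false` active status.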
- **Streaming Replication Lag**
@@ -3861,12 +3893,40 @@ Default config:
+- **PostgreSQL Replication: Non Active Slots Discovery**
+
+ Items:
+
+
+ Name |
+ PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME} |
+
+
+ Key |
+ pgsql.replication.non_active_slots_held_bytes[{#NON_ACTIVE_SLOT_NAME}] |
+
+
+ Type |
+ Numeric (float) |
+
+
+ Units |
+ Bytes |
+
+
+ Delta |
+ As Is |
+
+
+
### Triggers
- **PostgreSQL Instance: server mode has been changed on {HOSTNAME} to {ITEM.LASTVALUE}**
- **PostgreSQL number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})**
-
+ Disabled by default
+- **PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})**
+ Triggers if *PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}* exceeds `critical_bytes_held_by_non_active_slot`.
- **PostgreSQL streaming lag too high on {HOSTNAME} (value={ITEM.LASTVALUE})**
Triggers if *PostgreSQL Replication: Streaming Replication Lag* exceeds `lag_more_than_in_sec`.
diff --git a/mamonsu/lib/default_config.py b/mamonsu/lib/default_config.py
index c7f2d98..12791a1 100644
--- a/mamonsu/lib/default_config.py
+++ b/mamonsu/lib/default_config.py
@@ -35,6 +35,8 @@ def default_host():
host = os.environ.get('PGHOST') or 'auto'
if platform.FREEBSD:
host = os.environ.get('PGHOST') or 'auto'
+ if platform.DARWIN:
+ host = os.environ.get('PGHOST') or 'auto'
return host
@staticmethod
diff --git a/mamonsu/lib/platform.py b/mamonsu/lib/platform.py
index 5ea5faa..279200d 100644
--- a/mamonsu/lib/platform.py
+++ b/mamonsu/lib/platform.py
@@ -3,5 +3,6 @@
LINUX = (sys.platform == 'linux' or sys.platform == 'linux2')
WINDOWS = (sys.platform == 'win32' or sys.platform == 'win64')
FREEBSD = ('freebsd' in sys.platform)
+DARWIN = sys.platform == 'darwin'
UNIX = LINUX or FREEBSD
INTEGER_TYPES = int,
diff --git a/mamonsu/plugins/pgsql/driver/pool.py b/mamonsu/plugins/pgsql/driver/pool.py
index 6576f92..a8433d9 100644
--- a/mamonsu/plugins/pgsql/driver/pool.py
+++ b/mamonsu/plugins/pgsql/driver/pool.py
@@ -86,7 +86,7 @@ class Pool(object):
"""
SELECT application_name,
{0}
- coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_lsn))::int, 0) AS total_lag
+ coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_{2}))::int, 0) AS total_lag
FROM pg_stat_replication;
""",
"""
@@ -95,6 +95,30 @@ class Pool(object):
total_lag
FROM mamonsu.count_{1}_lag_lsn();
"""
+ ),
+ "wal_held_bytes_master": (
+ """
+ /* bigint rather than int because a stale slot can pin more than 2 GiB of WAL, which overflows the int cast */
+ SELECT slot_name,
+ coalesce((pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes
+ FROM pg_replication_slots
+ WHERE active = 'false';
+ """,
+ """
+ SELECT slot_name,
+ wal_held_bytes
+ FROM mamonsu.bytes_held_by_inactive_slot_on_master();
+ """
+ ),
+ "wal_held_bytes_replica": (
+ """
+ /* bigint rather than int because a stale slot can pin more than 2 GiB of WAL, which overflows the int cast */
+ SELECT slot_name,
+ coalesce((pg_wal_lsn_diff(pg_last_wal_replay_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes
+ FROM pg_replication_slots
+ WHERE active = 'false';
+ """,
+ """
+ SELECT slot_name,
+ wal_held_bytes
+ FROM mamonsu.bytes_held_by_inactive_slot_on_replica();
+ """
)
}
diff --git a/mamonsu/plugins/pgsql/replication.py b/mamonsu/plugins/pgsql/replication.py
index 8a51889..7ed701c 100644
--- a/mamonsu/plugins/pgsql/replication.py
+++ b/mamonsu/plugins/pgsql/replication.py
@@ -13,7 +13,8 @@ class Replication(Plugin):
AgentPluginType = "pg"
# key: (macro, value)
plugin_macros = {
- "critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)]
+ "critical_lag_seconds": [("macro", "{$CRITICAL_LAG_SECONDS}"), ("value", 60 * 5)],
+ "critical_bytes_held_by_none_active_slot": [("macro", "{$CRITICAL_BYTES_HELD_BY_NON_ACTIVE_SLOT}"), ("value", 1024 * 1024 * 1024)]
}
# get time of replication lag
@@ -30,8 +31,15 @@ class Replication(Plugin):
WHERE active = 'false';
"""
+ query_bytes_held_by_non_active_slot = """
+ SELECT slot_name, coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::bigint, 0) AS wal_size_bytes
+ FROM pg_replication_slots
+ WHERE active = 'false';
+ """
+
# for discovery rule for name of each replica
key_lsn_replication_discovery = "pgsql.replication.discovery{0}"
+ key_replication_non_active_slots_discovery = "pgsql.replication.non_active_slots_discovery{0}"
key_total_lag = "pgsql.replication.total_lag{0}"
# for PG 10 and higher
key_flush = "pgsql.replication.flush_lag{0}"
@@ -42,6 +50,7 @@ class Replication(Plugin):
key_replication = "pgsql.replication_lag{0}"
key_non_active_slots = "pgsql.replication.non_active_slots{0}"
+ key_non_active_slots_held_bytes = "pgsql.replication.non_active_slots_held_bytes{0}"
def run(self, zbx):
@@ -79,6 +88,14 @@ def run(self, zbx):
zbx.send("pgsql.replication.replay_lag[{0}]".format(info[0]), float(info[5]))
zbx.send("pgsql.replication.discovery[]", zbx.json({"data": lags}))
del lags
+ bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_master", args=[])
+ if bytes_held_by_non_active_slot:
+ discovery = []
+ for info in bytes_held_by_non_active_slot:
+ discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]})
+ zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1]))
+ zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery}))
+ del discovery
elif Pooler.is_superuser() or Pooler.is_bootstraped():
result_lags = Pooler.run_sql_type("wal_lag_lsn", args=[" ", "xlog", "location"])
if result_lags:
@@ -90,7 +107,15 @@ def run(self, zbx):
del lags
else:
self.disable_and_exit_if_not_superuser()
-
+ else:
+ bytes_held_by_non_active_slot = Pooler.run_sql_type("wal_held_bytes_replica", args=[])
+ if bytes_held_by_non_active_slot:
+ discovery = []
+ for info in bytes_held_by_non_active_slot:
+ discovery.append({"{#NON_ACTIVE_SLOT_NAME}": info[0]})
+ zbx.send("pgsql.replication.non_active_slots_held_bytes[{0}]".format(info[0]), int(info[1]))
+ zbx.send("pgsql.replication.non_active_slots_discovery[]", zbx.json({"data": discovery}))
+ del discovery
non_active_slots = Pooler.query(self.query_non_active_slots)
zbx.send(self.key_non_active_slots.format("[]"), int(non_active_slots[0][0]))
@@ -132,7 +157,8 @@ def triggers(self, template, dashboard=False):
}) + template.trigger({
"name": "PostgreSQL Replication: number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})",
"expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots) + ".last()}>" + str(
- NUMBER_NON_ACTIVE_SLOTS)
+ NUMBER_NON_ACTIVE_SLOTS),
+ "status": 1
})
return triggers
@@ -198,7 +224,42 @@ def discovery_rules(self, template, dashboard=False):
]
}
]
- return template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs)
+ active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs)
+
+ rule = {
+ "name": "PostgreSQL Replication: Non Active Slots Discovery",
+ "key": self.key_replication_non_active_slots_discovery.format("[{0}]".format(self.Macros[self.Type]))
+ }
+ if Plugin.old_zabbix:
+ conditions = []
+ rule["filter"] = "{#NON_ACTIVE_SLOT_NAME}:.*"
+ else:
+ conditions = [{
+ "condition": [
+ {"macro": "{#NON_ACTIVE_SLOT_NAME}",
+ "value": ".*",
+ "operator": 8,
+ "formulaid": "A"}
+ ]
+ }]
+ items = [
+ {"key": self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},"),
+ "name": "PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}",
+ "value_type": Plugin.VALUE_TYPE.numeric_float,
+ "delay": self.plugin_config("interval"),
+ "drawtype": 2}
+ ]
+ graphs = []
+ triggers = [
+ {
+ "name": "PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})",
+ "expression": "{#TEMPLATE:" + self.right_type(self.key_non_active_slots_held_bytes, var_discovery="{#NON_ACTIVE_SLOT_NAME},") + ".last()}>" +
+ self.plugin_macros["critical_bytes_held_by_none_active_slot"][0][1]
+ }
+ ]
+ non_active_slots_discovery_rule = template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs, triggers=triggers)
+
+ return active_slots_discovery_rule + non_active_slots_discovery_rule
def keys_and_queries(self, template_zabbix):
result = []
diff --git a/mamonsu/tools/bootstrap/sql.py b/mamonsu/tools/bootstrap/sql.py
index f37be0f..bf99442 100644
--- a/mamonsu/tools/bootstrap/sql.py
+++ b/mamonsu/tools/bootstrap/sql.py
@@ -236,6 +236,23 @@
coalesce((pg_{7}_diff(pg_current_{7}(), replay_{9}))::bigint, 0) AS total_lag
FROM pg_stat_replication
$$ LANGUAGE SQL SECURITY DEFINER;
+
+DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_master();
+CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_master()
+RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$
+/* the current-position function uses the same templated name as the diff function so both always match */
+SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_current_{7}(), restart_lsn))::bigint, 0) AS wal_held_bytes
+FROM pg_replication_slots
+WHERE active = 'false'
+$$ LANGUAGE SQL SECURITY DEFINER;
+
+DROP FUNCTION IF EXISTS mamonsu.bytes_held_by_inactive_slot_on_replica();
+CREATE OR REPLACE FUNCTION mamonsu.bytes_held_by_inactive_slot_on_replica()
+RETURNS TABLE(slot_name TEXT, wal_held_bytes BIGINT) AS $$
+SELECT slot_name::TEXT, coalesce((pg_{7}_diff(pg_last_wal_replay_lsn(), restart_lsn))::bigint, 0) AS wal_held_bytes
+FROM pg_replication_slots
+WHERE active = 'false'
+$$ LANGUAGE SQL SECURITY DEFINER;
+
"""
CreatePgBuffercacheFunctionsSQL = """