Skip to content

Commit dcb6224

Browse files
feat: added metric that shows bytes held by non-active replication slot
and corresponding trigger
1 parent 973a30f commit dcb6224

File tree

12 files changed

+178
-7
lines changed

12 files changed

+178
-7
lines changed

documentation/metrics.md

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3691,7 +3691,8 @@ Default config:
36913691
### Replication
36923692

36933693
Default config:
3694-
        lag_more_than_in_sec = 300
3694+
        lag_more_than_in_sec = 300\
3695+
        critical_bytes_held_by_non_active_slot = 1073741824 bytes
36953696

36963697
### Items
36973698

@@ -3763,6 +3764,37 @@ Default config:
37633764

37643765
*Non-active Replication Slots* is calculated as the count of slots with `false` active status.
37653766

3767+
- **Bytes Held By Non-active Replication Slots**
3768+
3769+
Zabbix item:
3770+
<table>
3771+
<tr>
3772+
<th>Name</th>
3773+
<td>PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}</td>
3774+
</tr>
3775+
<tr>
3776+
<th>Key</th>
3777+
<td>pgsql.replication.non_active_slots_held_bytes[]</td>
3778+
</tr>
3779+
<tr>
3780+
<th>Type</th>
3781+
<td>Numeric (float)</td>
3782+
</tr>
3783+
<tr>
3784+
<th>Units</th>
3785+
<td>Bytes</td>
3786+
</tr>
3787+
<tr>
3788+
<th>Delta</th>
3789+
<td>As Is</td>
3790+
</tr>
3791+
<tr>
3792+
<th>Supported Version</th>
3793+
<td>10+</td>
3794+
</tr>
3795+
</table>
3796+
3797+
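*Bytes Held By Non-active Replication Slots* is calculated as the amount of WAL, in bytes, retained by a slot with `false` active status: the difference between the current WAL position (the last replayed position on a replica) and the slot's `restart_lsn`.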
37663798

37673799
- **Streaming Replication Lag**
37683800

@@ -3861,12 +3893,40 @@ Default config:
38613893
</tr>
38623894
</table>
38633895

3896+
- **PostgreSQL Replication: Non-active Slots Discovery**
3897+
3898+
Items:
3899+
<table>
3900+
<tr>
3901+
<th>Name</th>
3902+
<td>PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}</td>
3903+
</tr>
3904+
<tr>
3905+
<th>Key</th>
3906+
<td>pgsql.replication.non_active_slots_held_bytes[]</td>
3907+
</tr>
3908+
<tr>
3909+
<th>Type</th>
3910+
<td>Numeric (float)</td>
3911+
</tr>
3912+
<tr>
3913+
<th>Units</th>
3914+
<td>Bytes</td>
3915+
</tr>
3916+
<tr>
3917+
<th>Delta</th>
3918+
<td>As Is</td>
3919+
</tr>
3920+
</table>
3921+
38643922
### Triggers
38653923

38663924
- **PostgreSQL Instance: server mode has been changed on {HOSTNAME} to {ITEM.LASTVALUE}**
38673925

38683926
- **PostgreSQL number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})**
3869-
3927+
Disabled by default
3928+
- **PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})**
3929+
Triggers if *PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}* exceeds `critical_bytes_held_by_non_active_slot`.
38703930
- **PostgreSQL streaming lag too high on {HOSTNAME} (value={ITEM.LASTVALUE})**
38713931
Triggers if *PostgreSQL Replication: Streaming Replication Lag* exceeds `lag_more_than_in_sec`.
38723932

github-actions-tests/sources/metrics-linux-12.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ pgsql.relation.size[]
6464
pgsql.relation.size[mamonsu_test_db.mamonsu.config]
6565
pgsql.relation.size[postgres.pg_catalog.pg_class]
6666
pgsql.replication.non_active_slots[]
67+
pgsql.replication.non_active_slots_held_bytes[]
6768
pgsql.replication_lag[sec]
6869
pgsql.replication_lag[sec]
6970
pgsql.stat[dirty_bytes]

github-actions-tests/sources/metrics-linux-13.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ pgsql.relation.size[]
6565
pgsql.relation.size[mamonsu_test_db.mamonsu.config]
6666
pgsql.relation.size[postgres.pg_catalog.pg_class]
6767
pgsql.replication.non_active_slots[]
68+
pgsql.replication.non_active_slots_held_bytes[]
6869
pgsql.replication_lag[sec]
6970
pgsql.replication_lag[sec]
7071
pgsql.stat[dirty_bytes]

github-actions-tests/sources/metrics-linux-14.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ pgsql.relation.size[]
6565
pgsql.relation.size[mamonsu_test_db.mamonsu.config]
6666
pgsql.relation.size[postgres.pg_catalog.pg_class]
6767
pgsql.replication.non_active_slots[]
68+
pgsql.replication.non_active_slots_held_bytes[]
6869
pgsql.replication_lag[sec]
6970
pgsql.replication_lag[sec]
7071
pgsql.stat[dirty_bytes]

github-actions-tests/sources/metrics-linux-16.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ pgsql.relation.size[]
6060
pgsql.relation.size[mamonsu_test_db.mamonsu.config]
6161
pgsql.relation.size[postgres.pg_catalog.pg_class]
6262
pgsql.replication.non_active_slots[]
63+
pgsql.replication.non_active_slots_held_bytes[]
6364
pgsql.replication_lag[sec]
6465
pgsql.replication_lag[sec]
6566
pgsql.stat[dirty_bytes]

github-actions-tests/sources/metrics-linux-17.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ pgsql.relation.size[]
5757
pgsql.relation.size[mamonsu_test_db.mamonsu.config]
5858
pgsql.relation.size[postgres.pg_catalog.pg_class]
5959
pgsql.replication.non_active_slots[]
60+
pgsql.replication.non_active_slots_held_bytes[]
6061
pgsql.replication_lag[sec]
6162
pgsql.replication_lag[sec]
6263
pgsql.temp[bytes]

github-actions-tests/sources/metrics-linux-full-list.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ pgsql.relation.size[]
6565
pgsql.relation.size[mamonsu_test_db.mamonsu.config]
6666
pgsql.relation.size[postgres.pg_catalog.pg_class]
6767
pgsql.replication.non_active_slots[]
68+
pgsql.replication.non_active_slots_held_bytes[]
6869
pgsql.replication_lag[sec]
6970
pgsql.replication_lag[sec]
7071
pgsql.stat[dirty_bytes]

mamonsu/lib/default_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ def default_host():
3535
host = os.environ.get('PGHOST') or 'auto'
3636
if platform.FREEBSD:
3737
host = os.environ.get('PGHOST') or 'auto'
38+
if platform.DARWIN:
39+
host = os.environ.get('PGHOST') or 'auto'
3840
return host
3941

4042
@staticmethod

mamonsu/lib/platform.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,6 @@
33
LINUX = (sys.platform == 'linux' or sys.platform == 'linux2')
44
WINDOWS = (sys.platform == 'win32' or sys.platform == 'win64')
55
FREEBSD = ('freebsd' in sys.platform)
6+
DARWIN = sys.platform == 'darwin'
67
UNIX = LINUX or FREEBSD
78
INTEGER_TYPES = int,

mamonsu/plugins/pgsql/driver/pool.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class Pool(object):
8686
"""
8787
SELECT application_name,
8888
{0}
89-
coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_lsn))::int, 0) AS total_lag
89+
coalesce((pg_{1}_{2}_diff(pg_current_{1}_{2}(), replay_{2}))::int, 0) AS total_lag
9090
FROM pg_stat_replication;
9191
""",
9292
"""
@@ -95,6 +95,30 @@ class Pool(object):
9595
total_lag
9696
FROM mamonsu.count_{1}_lag_lsn();
9797
"""
98+
),
99+
"wal_held_bytes_master": (
100+
"""
101+
SELECT slot_name,
102+
coalesce((pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn))::int, 0) AS wal_held_bytes
103+
FROM pg_replication_slots;
104+
""",
105+
"""
106+
SELECT slot_name,
107+
wal_held_bytes
108+
FROM mamonsu.bytes_held_by_inactive_slot_on_master();
109+
"""
110+
),
111+
"wal_held_bytes_replica": (
112+
"""
113+
SELECT slot_name,
114+
coalesce((pg_wal_lsn_diff(pg_last_wal_replay_lsn(), restart_lsn))::int, 0) AS wal_held_bytes
115+
FROM pg_replication_slots;
116+
""",
117+
"""
118+
SELECT slot_name,
119+
wal_held_bytes
120+
FROM mamonsu.bytes_held_by_inactive_slot_on_replica();
121+
"""
98122
)
99123
}
100124

0 commit comments

Comments
 (0)