1212 - name: ob-rule
1313 rules:
1414 - alert: cluster_active_session
15- expr: sum(ob_session_active_num ) by (ob_cluster_name, obzone, svr_ip) > 10000
15+ expr: sum(ob_active_session_num ) by (ob_cluster_name, obzone, svr_ip) > 10000
1616 for: 1m
1717 labels:
1818 instance_type: obcluster
4444 annotations:
4545 description: 'Cluster {{ $labels.ob_cluster_name }} has {{ $value }} index fail tables.'
4646 summary: 'Found index fail table in cluster {{ $labels.ob_cluster_name }}.'
47- - alert: frozen_version_check
48- expr: max(ob_zone_stat{name="frozen_version"}) by (ob_cluster_name) - min(ob_zone_stat{name="last_merged_version"}) by (ob_cluster_name) > 1
49- for: 1m
50- labels:
51- instance_type: obcluster
52- rule_name: frozen_version_check
53- rule_type: builtin
54- severity: warning
55- annotations:
56- description: 'Cluster {{ $labels.ob_cluster_name }} has {{ $value }} delta versions between merged and frozen data.'
57- summary: 'Frozen version is too much larger than merged version.'
58- - alert: cluster_merge_error
59- expr: max(ob_zone_stat{name="is_merge_error"}) by (ob_cluster_name) > 0
60- for: 1m
61- labels:
62- instance_type: obcluster
63- rule_name: cluster_merge_error
64- rule_type: builtin
65- severity: warning
66- annotations:
67- description: 'Cluster {{ $labels.ob_cluster_name }} merge error.'
68- summary: 'Cluster {{ $labels.ob_cluster_name }} merge error.'
69- - alert: cluster_merge_timeout
70- expr: max(ob_zone_stat{name="is_merge_timeout"}) by (ob_cluster_name) > 0
71- for: 1m
72- labels:
73- instance_type: obcluster
74- rule_name: cluster_merge_timeout
75- rule_type: builtin
76- severity: warning
77- annotations:
78- description: 'Cluster {{ $labels.ob_cluster_name }} merge timeout.'
79- summary: 'Cluster {{ $labels.ob_cluster_name }} merge timeout.'
80- - alert: cluster_no_frozen
81- expr: (max(ob_zone_current_timestamp{name="frozen_time"}) by (ob_cluster_name) - max(ob_zone_stat{name="frozen_time"}) by (ob_cluster_name)) / 1000000 > 90000
82- for: 1m
83- labels:
84- instance_type: obcluster
85- rule_name: cluster_no_frozen
86- rule_type: builtin
87- severity: warning
88- annotations:
89- description: 'Cluster {{ $labels.ob_cluster_name }} has not frozen for {{ $value }} seconds.'
90- summary: 'Cluster {{ $labels.ob_cluster_name }} has not frozen for a long time.'
91- - alert: cluster_no_merge
92- expr: (max(ob_zone_current_timestamp{name="merge_start_time"}) by (ob_cluster_name) - max(ob_zone_stat{name="merge_start_time"}) by (ob_cluster_name)) / 1000000 > 90000
93- for: 1m
94- labels:
95- instance_type: obcluster
96- rule_name: cluster_no_merge
97- rule_type: builtin
98- severity: warning
99- annotations:
100- description: 'Cluster {{ $labels.ob_cluster_name }} has not merge for {{ $value }} seconds.'
101- summary: 'Cluster {{ $labels.ob_cluster_name }} has not merge for a long time.'
10247 - alert: tenant_active_session
103- expr: sum(ob_session_active_num ) by (ob_cluster_name, obzone, svr_ip, tenant_name) > 2000
48+ expr: sum(ob_active_session_num ) by (ob_cluster_name, obzone, svr_ip, tenant_name) > 2000
10449 for: 1m
10550 labels:
10651 instance_type: obtenant
@@ -176,28 +121,17 @@ data:
176121 annotations:
177122 description: 'Tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} has not frozen for {{ $value }} seconds.'
178123 summary: 'Tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} has not frozen for a long time.'
179- - alert: tenant_partition_leader_absent
180- expr: max(partition_leader_absent_count) by (ob_cluster_name, tenant_name) > 100
181- for: 3m
182- labels:
183- instance_type: obtenant
184- rule_name: tenant_partition_leader_absent
185- rule_type: builtin
186- severity: warning
187- annotations:
188- description: 'Tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} found {{ $value }} partition leader absent.'
189- summary: 'Tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} found partition leader absent.'
190- - alert: tenant_partition_replica_absent
191- expr: max(partition_replica_absent_count) by (ob_cluster_name, tenant_name) > 100
124+ - alert: tenant500_mem_hold_percent
125+ expr: 100 * sum(ob_tenant500_memory_hold_bytes) by (@GBLABELS) / sum(ob_server_resource_memory_bytes) by (@GBLABELS) > 95
192126 for: 3m
193127 labels:
194128 instance_type: obtenant
195- rule_name: tenant_partition_replica_absent
129+ rule_name: tenant500_mem_hold_percent
196130 rule_type: builtin
197131 severity: warning
198132 annotations:
199- description: 'Tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} found {{ $value }} partition replica absent .'
200- summary: 'Tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} found partition replica absent .'
133+ description: 'Tenant 500 of obcluster {{ $labels.ob_cluster_name }} memory hold percent too high, {{ $value }}.'
134+ summary: 'Tenant 500 of obcluster {{ $labels.ob_cluster_name }} memory hold percent too high .'
201135 - alert: tenant_task_timeout
202136 expr: max(ob_tenant_task_max_duration_seconds) by (ob_cluster_name, tenant_name) > 10800
203137 for: 1m
@@ -209,28 +143,6 @@ data:
209143 annotations:
210144 description: 'Tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} found task not finished for {{ $value }} seconds.'
211145 summary: 'Tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} found task not finished for a long time.'
212- - alert: standby_tenant_sync_delay
213- expr: max(standby_tenant_delay_seconds) by (ob_cluster_name, tenant_name) > 600
214- for: 1m
215- labels:
216- instance_type: obtenant
217- rule_name: standby_tenant_sync_delay
218- rule_type: builtin
219- severity: caution
220- annotations:
221- description: 'Standby tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} sync delay {{ $value }} seconds.'
222- summary: 'Standby tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} sync delay too long.'
223- - alert: standby_tenant_sync_error
224- expr: max(standby_tenant_restore_status_code) by (ob_cluster_name, tenant_name) == 2
225- for: 1m
226- labels:
227- instance_type: obtenant
228- rule_name: standby_tenant_sync_error
229- rule_type: builtin
230- severity: warning
231- annotations:
232- description: 'Standby tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} sync error.'
233- summary: 'Standby tenant {{ $labels.tenant_name }} of obcluster {{ $labels.ob_cluster_name }} sync error.'
234146 - alert: tenant_memstore_percent
235147 expr: 100 * sum(ob_sysstat{stat_id="130001"}) by (ob_cluster_name, obzone, svr_ip, tenant_name) / sum(ob_sysstat{stat_id="130004"}) by (ob_cluster_name, obzone, svr_ip, tenant_name) > 95
236148 for: 2m
0 commit comments