Changes from 2 commits
15 changes: 14 additions & 1 deletion jsonnet/custom.libsonnet
@@ -61,7 +61,7 @@
},
{
alert: 'etcdHighCommitDurations',
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5',
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.08',
'for': '10m',
labels: {
severity: 'warning',
@@ -71,6 +71,19 @@
summary: 'etcd cluster 99th percentile commit durations are too high.',
},
},
{
alert: 'etcdHighFsyncDurations',
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.05',
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster 99th percentile fsync durations are too high.',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md'
},
},
{
alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: |||
2 changes: 1 addition & 1 deletion jsonnet/jsonnetfile.lock.json
@@ -8,7 +8,7 @@
"subdir": "contrib/mixin"
}
},
"version": "e4d6a05f8f1ec972384e24a83c420f707a6644f2",
"version": "9f1883643231ae949be4019f7f9a6ec07d2c34d1",
"sum": "XmXkOCriQIZmXwlIIFhqlJMa0e6qGWdxZD+ZDYaN0Po="
},
{
2 changes: 1 addition & 1 deletion jsonnet/main.jsonnet
@@ -6,7 +6,7 @@ local promRules = if std.objectHasAll(etcdMixin, 'prometheusRules') then etcdMix

// Exclude rules that are either OpenShift specific or do not work for OpenShift.
// List should be ordered!
local excludedAlerts = ['etcdDatabaseQuotaLowSpace', 'etcdGRPCRequestsSlow', 'etcdHighCommitDurations', 'etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers', 'etcdMembersDown'];
local excludedAlerts = ['etcdDatabaseQuotaLowSpace', 'etcdGRPCRequestsSlow', 'etcdHighCommitDurations', 'etcdHighFsyncDurations', 'etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers', 'etcdMembersDown'];
local excludeRules = std.map(
function(group) group {
rules: std.filter(
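The body of the filter is truncated in this hunk. As a rough sketch only (the actual upstream helper may be written differently), the exclusion works by dropping any upstream rule whose alert name appears in excludedAlerts; if a sorted-set lookup such as std.setMember is used, that would also explain why the list has to stay ordered:

local excludeRules = std.map(
  function(group) group {
    rules: std.filter(
      // Keep a rule unless it is an alert listed in excludedAlerts.
      // std.setMember does a binary search over a sorted array, hence the
      // "List should be ordered!" comment above. Sketch only; not the exact
      // upstream implementation.
      function(rule)
        !(std.objectHas(rule, 'alert') && std.setMember(rule.alert, excludedAlerts)),
      group.rules,
    ),
  },
  promRules.groups,
);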
33 changes: 10 additions & 23 deletions manifests/0000_90_etcd-operator_03_prometheusrule.yaml
@@ -39,28 +39,6 @@ spec:
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
Contributor:
IIUC we're removing the warning severity for etcdHighFsyncDurations. Do we have another rule which can notify platform admins before the critical alert fires?

Contributor (Author):
It's a little tough to say: we don't actually know when a cluster falls over, only that the upstream recommendations are optimistic and we have thousands of clusters running much higher. I'm estimating how much noise we're willing to cause by lowering these back down to sensible levels while staying within the 5% alerting rate. I could trim some off the recommendations here and call that a warning threshold, but then we might be over our 5% fleet rate.

Contributor:
While we don't provide stability guarantees for alerting rules, I presume some cluster admins will be puzzled by the removal of the warning severity. As stated in https://github.com/openshift/enhancements/blob/master/enhancements/monitoring/alerting-consistency.md#warning-alerts, warning alerts don't require immediate action, but they help identify potential issues. We could use a higher 'for' value to avoid the alerting rule triggering too often.

cc @typeid
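For illustration, a warning-level companion rule along those lines could keep a warning threshold but hold it for longer before firing. This is a sketch only; the 0.05s threshold and the 30m window are placeholders, not values adopted by this PR:

{
  alert: 'etcdHighFsyncDurations',
  // Hypothetical warning-severity variant reflecting the review suggestion:
  // keep a warning threshold and give it a longer 'for' so short I/O spikes
  // do not trigger it. Threshold and window are illustrative only.
  expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.05',
  'for': '30m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
    summary: 'etcd cluster 99th percentile fsync durations are too high.',
  },
},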

Contributor (Author):
OK, how about I use the limits currently in the PR for warning, and add a critical level a little higher?

- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
@@ -128,10 +106,19 @@ spec:
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile commit durations are too high.
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.08
for: 10m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
summary: etcd cluster 99th percentile fsync durations are too high.
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.05
for: 10m
labels:
severity: critical
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'