Changes from 2 commits
15 changes: 14 additions & 1 deletion jsonnet/custom.libsonnet
@@ -61,7 +61,7 @@
},
{
alert: 'etcdHighCommitDurations',
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5',
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.08',
'for': '10m',
labels: {
severity: 'warning',
@@ -71,6 +71,19 @@
summary: 'etcd cluster 99th percentile commit durations are too high.',
},
},
{
alert: 'etcdHighFsyncDurations',
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.05',
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster 99th percentile fsync durations are too high.',
runbook_url: 'https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md'
},
},
{
alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: |||
2 changes: 1 addition & 1 deletion jsonnet/jsonnetfile.lock.json
@@ -8,7 +8,7 @@
"subdir": "contrib/mixin"
}
},
"version": "e4d6a05f8f1ec972384e24a83c420f707a6644f2",
"version": "9f1883643231ae949be4019f7f9a6ec07d2c34d1",
"sum": "XmXkOCriQIZmXwlIIFhqlJMa0e6qGWdxZD+ZDYaN0Po="
},
{
2 changes: 1 addition & 1 deletion jsonnet/main.jsonnet
@@ -6,7 +6,7 @@ local promRules = if std.objectHasAll(etcdMixin, 'prometheusRules') then etcdMix

// Exclude rules that are either OpenShift specific or do not work for OpenShift.
// List should be ordered!
local excludedAlerts = ['etcdDatabaseQuotaLowSpace', 'etcdGRPCRequestsSlow', 'etcdHighCommitDurations', 'etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers', 'etcdMembersDown'];
local excludedAlerts = ['etcdDatabaseQuotaLowSpace', 'etcdGRPCRequestsSlow', 'etcdHighCommitDurations', 'etcdHighFsyncDurations', 'etcdHighNumberOfFailedGRPCRequests', 'etcdHighNumberOfLeaderChanges', 'etcdInsufficientMembers', 'etcdMembersDown'];
local excludeRules = std.map(
function(group) group {
rules: std.filter(
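The body of the filter is truncated in this hunk. As a rough sketch only (the actual upstream helper may be written differently), the exclusion works by dropping any upstream rule whose alert name appears in excludedAlerts; if a sorted-set lookup such as std.setMember is used, that would also explain why the list has to stay ordered:

local excludeRules = std.map(
  function(group) group {
    rules: std.filter(
      // Keep a rule unless it is an alert listed in excludedAlerts.
      // std.setMember does a binary search over a sorted array, hence the
      // "List should be ordered!" comment above. Sketch only; not the exact
      // upstream implementation.
      function(rule)
        !(std.objectHas(rule, 'alert') && std.setMember(rule.alert, excludedAlerts)),
      group.rules,
    ),
  },
  promRules.groups,
);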
33 changes: 10 additions & 23 deletions manifests/0000_90_etcd-operator_03_prometheusrule.yaml
@@ -39,28 +39,6 @@ spec:
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
Contributor:
IIUC we're removing the warning severity for etcdHighFsyncDurations. Do we have another rule which can notify platform admins before the critical alert fires?

Contributor (Author):
It's a little tough to say: we don't actually know when a cluster falls over, only that the upstream recommendations are optimistic and we have thousands of clusters running much higher. I'm estimating how much noise we're willing to cause by lowering these back down to sensible levels while staying within the 5% alerting rate. I could trim some off the recommendations here and call that a warning threshold, but then we might be over our 5% fleet rate.

Contributor:
While we don't provide stability guarantees for alerting rules, I presume some cluster admins will be puzzled by the removal of the warning severity. As stated in https://github.com/openshift/enhancements/blob/master/enhancements/monitoring/alerting-consistency.md#warning-alerts, warning alerts don't require immediate action, but they help identify potential issues. We could use a higher 'for' value to avoid the alerting rule triggering too often.

cc @typeid
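For illustration, a warning-level companion rule along those lines could keep a warning threshold but hold it for longer before firing. This is a sketch only; the 0.05s threshold and the 30m window are placeholders, not values adopted by this PR:

{
  alert: 'etcdHighFsyncDurations',
  // Hypothetical warning-severity variant reflecting the review suggestion:
  // keep a warning threshold and give it a longer 'for' so short I/O spikes
  // do not trigger it. Threshold and window are illustrative only.
  expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.05',
  'for': '30m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
    summary: 'etcd cluster 99th percentile fsync durations are too high.',
  },
},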

Contributor (Author):
OK, how about I use the limits currently in the PR for warning, and add a critical level a little higher?

- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
@@ -128,10 +106,19 @@ spec:
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile commit durations are too high.
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.08
for: 10m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md
summary: etcd cluster 99th percentile fsync durations are too high.
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.05
for: 10m
labels:
severity: critical
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'