Commit 96f3cd1

Skip CO condition tests on SNO
This pull skips all CO condition tests on SNO.

- `Available=False` and `Degraded=True` are not checked at all, regardless of whether the test case runs in an upgrade test suite or not. Previously these cases were handled as exceptions, so the job would merely turn flaky instead of failing; the now-redundant exceptions are removed.
- All checks on the `Progressing` condition are likewise skipped on a SNO cluster.

The logging for the case where determining the control plane topology fails was carried over as-is, because I am not sure on which types of clusters such an error can show up.
1 parent: e4729c9

3 files changed: 46 additions & 54 deletions
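
The commit message points out that the old exception path made jobs flaky rather than failed. For context, here is a minimal, self-contained sketch of that distinction; the struct shapes below are stand-ins invented for this illustration, not the real `junitapi` types, and they rely on origin's convention of reporting a flake as a failed JUnit case plus a passed case of the same name:

```go
package main

import "fmt"

// skipMessage and junitTestCase loosely mirror the real junitapi types;
// their exact fields here are assumptions made for this sketch.
type skipMessage struct{ Message string }

type junitTestCase struct {
	Name        string
	FailureText string       // non-empty: the case failed
	Skip        *skipMessage // non-nil: the case was skipped
}

// asFlake models the old exception path: the failure is still recorded,
// but a passing duplicate with the same name makes CI treat it as a flake.
func asFlake(name, failure string) []junitTestCase {
	return []junitTestCase{
		{Name: name, FailureText: failure},
		{Name: name}, // duplicate pass => flaky, not failing
	}
}

// asSkipped models the new SNO path: a single, explicitly skipped case.
func asSkipped(name, reason string) []junitTestCase {
	return []junitTestCase{{Name: name, Skip: &skipMessage{Message: reason}}}
}

func main() {
	fmt.Printf("%+v\n", asFlake("co conditions", "Available=False"))
	fmt.Printf("%+v\n", asSkipped("co conditions", "Test skipped on a single-node cluster"))
}
```

An explicit skip states why nothing was checked, instead of papering over a known failure with a matching pass.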


pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go

Lines changed: 10 additions & 3 deletions
```diff
@@ -14,6 +14,7 @@ import (
 
 	configv1 "github.com/openshift/api/config/v1"
 	"k8s.io/client-go/rest"
+	e2e "k8s.io/kubernetes/test/e2e/framework"
 )
 
 type legacyMonitorTests struct {
@@ -90,15 +91,21 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
 	junits = append(junits, testOperatorOSUpdateStartedEventRecorded(finalIntervals, w.adminRESTConfig)...)
 
 	isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{})
+	topology, err := getControlPlaneTopology(w.adminRESTConfig)
+	if err != nil {
+		e2e.Logf("failed to get control plane topology: %v", err)
+	}
+	singleNode := topology == configv1.SingleReplicaTopologyMode
+
 	if isUpgrade {
-		junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
+		junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig, topology)...)
 		level, err := getUpgradeLevel(w.adminRESTConfig)
 		if err != nil || level == unknownUpgradeLevel {
 			return nil, fmt.Errorf("failed to determine upgrade level: %w", err)
 		}
-		junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel)...)
+		junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, singleNode)...)
 	} else {
-		junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
+		junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig, singleNode)...)
 	}
 
 	return junits, nil
```
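
`getControlPlaneTopology` is only called here; its body is unchanged and therefore not part of the diff. For orientation, such a helper is typically implemented against the cluster-scoped `Infrastructure` resource. A plausible sketch under that assumption, not the repository's actual implementation:

```go
package sketch

import (
	"context"
	"fmt"

	configv1 "github.com/openshift/api/config/v1"
	configclient "github.com/openshift/client-go/config/clientset/versioned"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/rest"
)

// getControlPlaneTopology reads the cluster-scoped Infrastructure object
// named "cluster" and returns its reported control plane topology.
// On SNO this is configv1.SingleReplicaTopologyMode.
func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode, error) {
	client, err := configclient.NewForConfig(clientConfig)
	if err != nil {
		return "", fmt.Errorf("failed to create config client: %w", err)
	}
	infra, err := client.ConfigV1().Infrastructures().Get(context.TODO(), "cluster", metav1.GetOptions{})
	if err != nil {
		return "", fmt.Errorf("failed to get infrastructures/cluster: %w", err)
	}
	return infra.Status.ControlPlaneTopology, nil
}
```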

pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go

Lines changed: 20 additions & 49 deletions
```diff
@@ -46,13 +46,7 @@ func checkAuthenticationAvailableExceptions(condition *configv1.ClusterOperatorS
 	return false
 }
 
-func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
-	topology, err := getControlPlaneTopology(clientConfig)
-	if err != nil {
-		logrus.Warnf("Error checking for ControlPlaneTopology configuration (unable to make topology exceptions): %v", err)
-	}
-	isSingleNode := topology == configv1.SingleReplicaTopologyMode
-
+func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, singleNode bool) []*junitapi.JUnitTestCase {
 	except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, _ monitorapi.Interval, clientConfig *rest.Config) string {
 		if condition.Status == configv1.ConditionTrue {
 			if condition.Type == configv1.OperatorAvailable {
@@ -64,30 +58,6 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien
 			}
 		}
 
-		if isSingleNode {
-			switch operator {
-			case "dns":
-				if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
-					strings.Contains(condition.Message, `DNS "default" is unavailable.`) {
-					return "dns operator is allowed to have Available=False due to serial taint tests on single node"
-				}
-				if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue &&
-					strings.Contains(condition.Message, `DNS default is degraded`) {
-					return "dns operator is allowed to have Degraded=True due to serial taint tests on single node"
-				}
-			case "openshift-apiserver":
-				if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
-					strings.Contains(condition.Message, `connect: connection refused`) {
-					return "openshift apiserver operator is allowed to have Available=False due kube-apiserver force rollout test on single node"
-				}
-			case "csi-snapshot-controller":
-				if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
-					strings.Contains(condition.Message, `Waiting for Deployment`) {
-					return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on single node"
-				}
-			}
-		}
-
 		// For the non-upgrade case, if any operator has Available=False, fail the test.
 		if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
 			if operator == "authentication" {
@@ -156,7 +126,7 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien
 		return "We are not worried about other operator condition blips for stable-system tests yet."
 	}
 
-	return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false)
+	return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false, singleNode)
 }
 
 func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode, error) {
@@ -268,14 +238,9 @@ func hasUpgradeFailedEvent(eventList monitorapi.Intervals) bool {
 	return false
 }
 
-func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
+func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, topology configv1.TopologyMode) []*junitapi.JUnitTestCase {
 	upgradeWindows := getUpgradeWindows(events)
-	topology, err := getControlPlaneTopology(clientConfig)
-	if err != nil {
-		logrus.Warnf("Error checking for ControlPlaneTopology configuration on upgrade (unable to make topology exceptions): %v", err)
-	}
 
-	isSingleNode := topology == configv1.SingleReplicaTopologyMode
 	isTwoNode := topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode
 	upgradeFailed := hasUpgradeFailedEvent(events)
 
@@ -284,10 +249,6 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
 		if upgradeFailed {
 			return "upgrade failed, not recording unexpected operator transitions as failure"
 		}
-		// SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade
-		if isSingleNode {
-			return "single node is allowed to be unavailable/degraded during upgrades"
-		}
 
 		if condition.Status == configv1.ConditionTrue {
 			if condition.Type == configv1.OperatorAvailable {
@@ -461,9 +422,6 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
 			}
 		case "kube-apiserver":
 			if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue {
-				if isSingleNode && condition.Reason == "NodeInstaller_InstallerPodFailed" {
-					return "https://issues.redhat.com/browse/OCPBUGS-38678"
-				}
 				return "https://issues.redhat.com/browse/OCPBUGS-38661"
 			}
 		case "kube-controller-manager":
@@ -486,7 +444,7 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
 		return ""
 	}
 
-	return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true)
+	return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true, topology == configv1.SingleReplicaTopologyMode)
 }
 
 func isVSphere(config *rest.Config) (bool, error) {
@@ -520,7 +478,7 @@ func checkReplicas(namespace string, operator string, clientConfig *rest.Config)
 	return 0, fmt.Errorf("Error fetching replicas")
 }
 
-func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade bool) []*junitapi.JUnitTestCase {
+func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade, singleNode bool) []*junitapi.JUnitTestCase {
 	ret := []*junitapi.JUnitTestCase{}
 
 	var start, stop time.Time
@@ -548,7 +506,16 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
 			})
 			continue
 		}
-
+		if singleNode {
+			// SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name: testName,
+				SkipMessage: &junitapi.SkipMessage{
+					Message: "Test skipped on a single-node cluster",
+				},
+			})
+			continue
+		}
 		excepted := []string{}
 		fatal := []string{}
 
@@ -633,7 +600,7 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
 	return ret
 }
 
-func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool) []*junitapi.JUnitTestCase {
+func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade, singleNode bool) []*junitapi.JUnitTestCase {
 	var ret []*junitapi.JUnitTestCase
 	upgradeWindows := getUpgradeWindows(events)
 	multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1
@@ -729,6 +696,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
 			mcTestCase.SkipMessage = &junitapi.SkipMessage{
 				Message: "Test skipped in a patch-level upgrade test",
 			}
+		} else if singleNode {
+			mcTestCase.SkipMessage = &junitapi.SkipMessage{
+				Message: "Test skipped on a single-node cluster",
+			}
 		} else if t, ok := coProgressingStart[operatorName]; !ok || t.IsZero() {
 			output := fmt.Sprintf("clusteroperator/%s was never Progressing=True during the upgrade window from %s to %s", operatorName, start.Format(time.RFC3339), stop.Format(time.RFC3339))
 			exception = except(operatorName, "")
```
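
The `SkipMessage` cases above still land in the JUnit report, so CI surfaces the tests as deliberately skipped rather than silently absent. Assuming the conventional JUnit XML schema, where a skipped testcase carries a `<skipped>` child element (how origin's `junitapi` actually serializes `SkipMessage` is not shown in this diff), a skipped case renders roughly like the output of this sketch:

```go
package main

import (
	"encoding/xml"
	"fmt"
)

// Stand-in shapes for the conventional JUnit XML schema; these are
// assumptions for this sketch, not origin's junitapi serialization code.
type skipped struct {
	XMLName xml.Name `xml:"skipped"`
	Message string   `xml:"message,attr"`
}

type testCase struct {
	XMLName xml.Name `xml:"testcase"`
	Name    string   `xml:"name,attr"`
	Skipped *skipped
}

func main() {
	// Hypothetical test name; the real names come from testName in operators.go.
	out, err := xml.MarshalIndent(testCase{
		Name:    "clusteroperator conditions (example)",
		Skipped: &skipped{Message: "Test skipped on a single-node cluster"},
	}, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
```

Running it prints a `<testcase>` element with a `<skipped message="...">` child, which JUnit consumers count as skipped, not passed or failed.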

test/extended/machines/scale.go

Lines changed: 16 additions & 2 deletions
```diff
@@ -11,7 +11,7 @@ import (
 	o "github.com/onsi/gomega"
 	configv1 "github.com/openshift/api/config/v1"
 	configclient "github.com/openshift/client-go/config/clientset/versioned"
-	bmhelper "github.com/openshift/origin/test/extended/baremetal"
+	configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
 	"github.com/stretchr/objx"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -23,6 +23,9 @@ import (
 	"k8s.io/client-go/scale"
 	e2e "k8s.io/kubernetes/test/e2e/framework"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+
+	bmhelper "github.com/openshift/origin/test/extended/baremetal"
+	exutil "github.com/openshift/origin/test/extended/util"
 )
 
 const (
@@ -266,7 +269,18 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cl
 				violations = append(violations, operator)
 			}
 		}
-		o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations)
+
+		cfg, err := e2e.LoadConfig()
+		o.Expect(err).NotTo(o.HaveOccurred())
+		configV1Client, err := configv1client.NewForConfig(cfg)
+		o.Expect(err).NotTo(o.HaveOccurred())
+		topo, err := exutil.GetControlPlaneTopologyFromConfigClient(configV1Client)
+		if err != nil {
+			e2e.Logf("failed to get control plane topology: %v", err)
+		}
+		if *topo != configv1.SingleReplicaTopologyMode {
+			o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations)
+		}
 	})
 
 	// The 30m timeout is essentially required by the baremetal platform environment,
```
