Skip to content

adding new pod-count test to the observability suite #3084

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 21 additions & 4 deletions CATALOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Depending on the workload type, not all tests are required to pass to satisfy be

## Test cases summary

### Total test cases: 119
### Total test cases: 120

### Total suites: 10

Expand All @@ -18,7 +18,7 @@ Depending on the workload type, not all tests are required to pass to satisfy be
|lifecycle|18|[lifecycle](#lifecycle)|
|manageability|2|[manageability](#manageability)|
|networking|12|[networking](#networking)|
|observability|5|[observability](#observability)|
|observability|6|[observability](#observability)|
|operator|12|[operator](#operator)|
|performance|6|[performance](#performance)|
|platform-alteration|14|[platform-alteration](#platform-alteration)|
Expand All @@ -36,11 +36,11 @@ Depending on the workload type, not all tests are required to pass to satisfy be
|---|---|---|
|8|1|

### Non-Telco specific tests only: 70
### Non-Telco specific tests only: 71

|Mandatory|Optional|
|---|---|---|
|43|27|
|43|28|

### Telco specific tests only: 27

Expand Down Expand Up @@ -1203,6 +1203,23 @@ Test Cases are the specifications used to perform a meaningful test. Test cases
|Non-Telco|Mandatory|
|Telco|Mandatory|

#### observability-pod-count

|Property|Description|
|---|---|
|Unique ID|observability-pod-count|
|Description|Checks that all pods running at the beginning of the tests, continue to run throughout the test|
|Suggested Remediation|Ensure all expected pods are running|
|Best Practice Reference|https://redhat-best-practices-for-k8s.github.io/guide/#observability-pod-count|
|Exception Process|No exceptions|
|Impact Statement|Inconsistency of running pods can cause instability of the application.|
|Tags|common,observability|
|**Scenario**|**Optional/Mandatory**|
|Extended|Optional|
|Far-Edge|Optional|
|Non-Telco|Optional|
|Telco|Optional|

#### observability-pod-disruption-budget

|Property|Description|
Expand Down
1 change: 1 addition & 0 deletions cmd/certsuite/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ func TestCertsuiteInfoCmd(t *testing.T) {
| observability-termination-policy |
| observability-pod-disruption-budget |
| observability-compatibility-with-next-ocp-release |
| observability-pod-count |
------------------------------------------------------------
`
assert.Equal(t, expectedOutput, string(out))
Expand Down
1 change: 1 addition & 0 deletions expected_results.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ testCases:
- observability-pod-disruption-budget
- observability-compatibility-with-next-ocp-release
- observability-termination-policy
- observability-pod-count
- operator-crd-versioning
- operator-crd-openapi-schema
- operator-install-source
Expand Down
12 changes: 6 additions & 6 deletions pkg/autodiscover/autodiscover.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ var data = DiscoveredTestData{}
const labelRegex = `(\S*)\s*:\s*(\S*)`
const labelRegexMatches = 3

func createLabels(labelStrings []string) (labelObjects []labelObject) {
func CreateLabels(labelStrings []string) (labelObjects []labelObject) {
for _, label := range labelStrings {
r := regexp.MustCompile(labelRegex)

Expand Down Expand Up @@ -159,8 +159,8 @@ func DoAutoDiscover(config *configuration.TestConfiguration) DiscoveredTestData
log.Fatal("Failed to retrieve storageClasses - err: %v", err)
}

podsUnderTestLabelsObjects := createLabels(config.PodsUnderTestLabels)
operatorsUnderTestLabelsObjects := createLabels(config.OperatorsUnderTestLabels)
podsUnderTestLabelsObjects := CreateLabels(config.PodsUnderTestLabels)
operatorsUnderTestLabelsObjects := CreateLabels(config.OperatorsUnderTestLabels)

log.Debug("Pods under test labels: %+v", podsUnderTestLabelsObjects)
log.Debug("Operators under test labels: %+v", operatorsUnderTestLabelsObjects)
Expand All @@ -181,11 +181,11 @@ func DoAutoDiscover(config *configuration.TestConfiguration) DiscoveredTestData
data.AllPackageManifests = getAllPackageManifests(oc.OlmPkgClient.PackageManifests(""))

data.Namespaces = namespacesListToStringList(config.TargetNameSpaces)
data.Pods, data.AllPods = findPodsByLabels(oc.K8sClient.CoreV1(), podsUnderTestLabelsObjects, data.Namespaces)
data.Pods, data.AllPods = FindPodsByLabels(oc.K8sClient.CoreV1(), podsUnderTestLabelsObjects, data.Namespaces)
data.AbnormalEvents = findAbnormalEvents(oc.K8sClient.CoreV1(), data.Namespaces)
probeLabels := []labelObject{{LabelKey: probeHelperPodsLabelName, LabelValue: probeHelperPodsLabelValue}}
probeNS := []string{config.ProbeDaemonSetNamespace}
data.ProbePods, _ = findPodsByLabels(oc.K8sClient.CoreV1(), probeLabels, probeNS)
data.ProbePods, _ = FindPodsByLabels(oc.K8sClient.CoreV1(), probeLabels, probeNS)
data.ResourceQuotaItems, err = getResourceQuotas(oc.K8sClient.CoreV1())
if err != nil {
log.Fatal("Cannot get resource quotas, err: %v", err)
Expand Down Expand Up @@ -223,7 +223,7 @@ func DoAutoDiscover(config *configuration.TestConfiguration) DiscoveredTestData
}

// Best effort mode autodiscovery for operand (running-only) pods.
pods, _ := findPodsByLabels(oc.K8sClient.CoreV1(), nil, data.Namespaces)
pods, _ := FindPodsByLabels(oc.K8sClient.CoreV1(), nil, data.Namespaces)
if err != nil {
log.Fatal("Failed to get running pods, err: %v", err)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/autodiscover/autodiscover_pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func findPodsMatchingAtLeastOneLabel(oc corev1client.CoreV1Interface, labels []l
return allPods
}

func findPodsByLabels(oc corev1client.CoreV1Interface, labels []labelObject, namespaces []string) (runningPods, allPods []corev1.Pod) {
func FindPodsByLabels(oc corev1client.CoreV1Interface, labels []labelObject, namespaces []string) (runningPods, allPods []corev1.Pod) {
runningPods = []corev1.Pod{}
allPods = []corev1.Pod{}
// Iterate through namespaces
Expand Down
2 changes: 1 addition & 1 deletion pkg/autodiscover/autodiscover_pods_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ func TestFindPodsUnderTest(t *testing.T) {
testRuntimeObjects = append(testRuntimeObjects, generatePod(tc.testPodName, tc.testPodNamespace, tc.queryLabel))
oc := clientsholder.GetTestClientsHolder(testRuntimeObjects)

podResult, _ := findPodsByLabels(oc.K8sClient.CoreV1(), testLabel, testNamespaces)
podResult, _ := FindPodsByLabels(oc.K8sClient.CoreV1(), testLabel, testNamespaces)
assert.Equal(t, tc.expectedResults, podResult)
}
}
4 changes: 2 additions & 2 deletions pkg/autodiscover/autodiscover_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ func TestCreateLabels(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if gotLabelObjects := createLabels(tt.args.labelStrings); !reflect.DeepEqual(gotLabelObjects, tt.wantLabelObjects) {
t.Errorf("createLabels() = %v, want %v", gotLabelObjects, tt.wantLabelObjects)
if gotLabelObjects := CreateLabels(tt.args.labelStrings); !reflect.DeepEqual(gotLabelObjects, tt.wantLabelObjects) {
t.Errorf("CreateLabels() = %v, want %v", gotLabelObjects, tt.wantLabelObjects)
}
})
}
Expand Down
17 changes: 17 additions & 0 deletions tests/identifiers/identifiers.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ var (
TestNamespaceResourceQuotaIdentifier claim.Identifier
TestPodDisruptionBudgetIdentifier claim.Identifier
TestAPICompatibilityWithNextOCPReleaseIdentifier claim.Identifier
TestPodCountIdentifier claim.Identifier
TestPodTolerationBypassIdentifier claim.Identifier
TestPersistentVolumeReclaimPolicyIdentifier claim.Identifier
TestContainersImageTag claim.Identifier
Expand Down Expand Up @@ -1677,6 +1678,22 @@ that Node's kernel may not have the same hacks.'`,
},
TagCommon)

TestPodCountIdentifier = AddCatalogEntry(
"pod-count",
common.ObservabilityTestKey,
`Checks that all pods running at the beginning of the tests, continue to run throughout the test`,
"Ensure all expected pods are running",
NoExceptions,
"https://redhat-best-practices-for-k8s.github.io/guide/#observability-pod-count",
true,
map[string]string{
FarEdge: Optional,
Telco: Optional,
NonTelco: Optional,
Extended: Optional,
},
TagCommon)

TestPodTolerationBypassIdentifier = AddCatalogEntry(
"pod-toleration-bypass",
common.LifecycleTestKey,
Expand Down
2 changes: 2 additions & 0 deletions tests/identifiers/impact.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ const (
TestCrdsStatusSubresourceIdentifierImpact = `Missing status subresources prevent proper monitoring and automation based on custom resource states.`
TestPodDisruptionBudgetIdentifierImpact = `Improper disruption budgets can prevent necessary maintenance operations or allow too many pods to be disrupted simultaneously.`
TestAPICompatibilityWithNextOCPReleaseIdentifierImpact = `Deprecated API usage can cause applications to break during OpenShift upgrades, requiring emergency fixes.`
TestPodCountIdentifierImpact = `Inconsistency of running pods can cause instability of the application.`

// Manageability Test Suite Impact Statements
TestContainersImageTagImpact = `Missing image tags make it difficult to track versions, perform rollbacks, and maintain deployment consistency.`
Expand Down Expand Up @@ -277,6 +278,7 @@ var ImpactMap = map[string]string{
"observability-crd-status": TestCrdsStatusSubresourceIdentifierImpact,
"observability-pod-disruption-budget": TestPodDisruptionBudgetIdentifierImpact,
"observability-compatibility-with-next-ocp-release": TestAPICompatibilityWithNextOCPReleaseIdentifierImpact,
"observability-pod-count": TestPodCountIdentifierImpact,

// Manageability Test Suite
"manageability-containers-image-tag": TestContainersImageTagImpact,
Expand Down
82 changes: 78 additions & 4 deletions tests/observability/suite.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ import (
"strings"

"github.com/Masterminds/semver/v3"
"github.com/redhat-best-practices-for-k8s/certsuite/tests/common"
"github.com/redhat-best-practices-for-k8s/certsuite/tests/identifiers"
pdbv1 "github.com/redhat-best-practices-for-k8s/certsuite/tests/observability/pdb"

apiserv1 "github.com/openshift/api/apiserver/v1"
"github.com/redhat-best-practices-for-k8s/certsuite/internal/clientsholder"
"github.com/redhat-best-practices-for-k8s/certsuite/internal/log"
"github.com/redhat-best-practices-for-k8s/certsuite/pkg/autodiscover"
"github.com/redhat-best-practices-for-k8s/certsuite/pkg/checksdb"
"github.com/redhat-best-practices-for-k8s/certsuite/pkg/provider"
"github.com/redhat-best-practices-for-k8s/certsuite/pkg/testhelper"
"github.com/redhat-best-practices-for-k8s/certsuite/tests/common"
"github.com/redhat-best-practices-for-k8s/certsuite/tests/identifiers"
pdbv1 "github.com/redhat-best-practices-for-k8s/certsuite/tests/observability/pdb"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
Expand Down Expand Up @@ -88,6 +88,13 @@ func LoadChecks() {
testAPICompatibilityWithNextOCPRelease(c, &env)
return nil
}))

checksGroup.Add(checksdb.NewCheck(identifiers.GetTestIDAndLabels(identifiers.TestPodCountIdentifier)).
WithSkipCheckFn(testhelper.GetNoPodsUnderTestSkipFn(&env)).
WithCheckFn(func(c *checksdb.Check) error {
testComparePodCount(c, &env)
return nil
}))
}

// containerHasLoggingOutput helper function to get the last line of logging output from
Expand Down Expand Up @@ -423,3 +430,70 @@ func testAPICompatibilityWithNextOCPRelease(check *checksdb.Check, env *provider
// Add test results
check.SetResult(compliantObjects, nonCompliantObjects)
}

// Function to compare the number of running pods to those loaded during autodiscover at the start of test execution.
func testComparePodCount(check *checksdb.Check, env *provider.TestEnvironment) {
oc := clientsholder.GetClientsHolder()

originalPods := env.Pods
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of comparing directly pods, why not compare the status of the deployment and stateful set instead ( ready replicas for instance). So in this test, you could use the <statefulset/deployment name>-replica-/ to add a stable reference to pods in your results list. Otherwise, we would have a lot of false positives if the pod recreation test is triggered as a different uuid is appended to the pod after they are deleted and recreated.
For the orphan pods (the pods with owner references that are not a statefulset or replicaset), if any, we could compare them as already described here. See owner reference example: testPodsOwnerReference

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wrote the test the way it was described in the Jira; if we want something other than pods before and pods after, then is it really a pod comparison test?

I'm happy to write whatever, but it's odd that this question came up, so is what we are trying to accomplish even needed, if we are unsure of what we really want?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wrote the test the way it was described in the Jira; if we want something other than pods before and pods after, then is it really a pod comparison test?

The jira describes counting the number of ready and non ready pods before and after running the suite. See below:
Add a new test case that checks and collects the number of ready and non ready pods before and after running the certsuite to have an idea of any changes that have occurred while running the suite. This is a quick gauge to check the stability of the workload while running the suite.

I like the idea of adding more details in terms of which pods became not ready (or ready) after running the suite, but in my understanding this would work only if the pod names are stable after being re-created. Most pod names that belong to deployments/statefulsets will be different after pod re-creation because of the randomized identifier added at the end. For instance, in the self-node-remediation-ds-g26dh pod name, "-g26dh" will change when this pod is re-created.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get what you are saying, but if a pod gets a new uuid, doesn't that mean that it's not stable, and should be reported?

This is a quick gauge to check the stability of the workload while running the suite.

It was already stated that test will have false positives/negatives, this is why it's not mandatory anywhere (which IMO is odd, since other certifications don't work this way, but that's another story).

Copy link
Member

@edcdavid edcdavid Jul 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get what you are saying, but if a pod gets a new uuid, doesn't that mean that it's not stable, and should be reported?

yep it could indicative of instability, especially if the pod keeps crashing with CrashLoopBackOff but not when simply being terminated and recreated. We know that we have a lifecycle-pod-recreation deleting pods and re-creating them so we expect these uuid to change. Instead, the goal of this test is to catch any degradation of the application, by checking after the test if pods that used to be ready are no longer ready, or pods that used to be not ready are now ready (whether uuid changed or not).

It was already stated that test will have false positives/negatives, this is why it's not mandatory anywhere

Even if not mandatory, in my opinion, we should keep the false positives/negatives to a minimum. I feel that if just comparing the name for most pods it would not match because most pods have an owner and add this uuid.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How many tests in the test suite manipulate pods? Can tests be ordered / have priority?

Copy link
Member

@edcdavid edcdavid Jul 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests that manipulate pods are identified by this function: GetNotIntrusiveSkipFn. This allows us to have a switch to skip them. 4 tests: lifecycle-crd-scaling, lifecycle-deployment-scaling, lifecycle-statefulset-scaling, lifecycle-pod-recreation.
The tests are ordered in the order they are added in the code, we have not implemented changing the order or priority.
Maybe this test could be re-run after each intrusive tests?


currentPods, _ := autodiscover.FindPodsByLabels(oc.K8sClient.CoreV1(), autodiscover.CreateLabels(env.Config.PodsUnderTestLabels), env.Namespaces)

var compliantObjects []*testhelper.ReportObject
var nonCompliantObjects []*testhelper.ReportObject

// Compare pod counts
originalPodCount := len(originalPods)
currentPodCount := len(currentPods)

if originalPodCount == currentPodCount {
check.LogInfo("Pod count is consistent")
compliantObjects = append(compliantObjects,
testhelper.NewReportObject("Pod count is consistent", "PodCount", true).AddField("OriginalCount", fmt.Sprintf("%d", originalPodCount)).AddField("CurrentCount", fmt.Sprintf("%d", currentPodCount)))
} else {
check.LogError("Pod count mismatch: original=%d, current=%d", originalPodCount, currentPodCount)
nonCompliantObjects = append(nonCompliantObjects,
testhelper.NewReportObject("Pod count mismatch", "PodCount", false).AddField("OriginalCount", fmt.Sprintf("%d", originalPodCount)).AddField("CurrentCount", fmt.Sprintf("%d", currentPodCount)))
}

// Create maps for detailed comparison
originalPodsMap := make(map[string]struct{})
for _, pod := range originalPods {
key := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)
originalPodsMap[key] = struct{}{}
}

currentPodsMap := make(map[string]struct{})
for i := range currentPods {
pod := currentPods[i]
key := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)
currentPodsMap[key] = struct{}{}
}

// Check for missing pods (in original but not in current)
for _, originalPod := range originalPods {
podKey := fmt.Sprintf("%s/%s", originalPod.Namespace, originalPod.Name)
if _, exists := currentPodsMap[podKey]; !exists {
check.LogError("Pod %q is missing from current state", originalPod.String())
nonCompliantObjects = append(nonCompliantObjects,
testhelper.NewReportObject("Pod is missing from current state", testhelper.PodType, false).AddField(testhelper.PodName, originalPod.Name).AddField(testhelper.Namespace, originalPod.Namespace))
} else {
check.LogInfo("Pod %q is present in current state", originalPod.String())
compliantObjects = append(compliantObjects,
testhelper.NewReportObject("Pod is present in current state", testhelper.PodType, true).AddField(testhelper.PodName, originalPod.Name).AddField(testhelper.Namespace, originalPod.Namespace))
}
}

// Check for extra pods (in current but not in original)
for i := range currentPods {
currentPod := currentPods[i]
podKey := fmt.Sprintf("%s/%s", currentPod.Namespace, currentPod.Name)
if _, exists := originalPodsMap[podKey]; !exists {
check.LogError("Extra pod %s/%s found in current state", currentPod.Namespace, currentPod.Name)
nonCompliantObjects = append(nonCompliantObjects,
testhelper.NewReportObject("Extra pod found in current state", testhelper.PodType, false).AddField(testhelper.PodName, currentPod.Name).AddField(testhelper.Namespace, currentPod.Namespace))
}
}

check.SetResult(compliantObjects, nonCompliantObjects)
}
Loading