test: simplify and stabilize ballon device-related tests

pb8o · zulinx86 · zulinx86 · commit 32a5a251a456 · 2023-08-11T10:23:15.000+01:00
- Specify memory size arg in MiB in make_guest_dirty_memory()
- Add retries of "cat /tmp/fillmem.txt"
- Lower threshold for a stable RSS

Signed-off-by: Pablo Barbáchano &lt;pablob@amazon.com&gt;
Co-authored-by: Takahiro Itazuri &lt;itazur@amazon.com&gt;
diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
@@ -11,40 +11,39 @@
 from framework.builder import MicrovmBuilder, SnapshotBuilder, SnapshotType
 from framework.utils import get_free_mem_ssh, run_cmd
 
-MB_TO_PAGES = 256
 STATS_POLLING_INTERVAL_S = 1
 
 
 @retry(delay=0.5, tries=10)
-def get_stable_rss_mem_by_pid(pid, percentage_delta=0.5):
+def get_stable_rss_mem_by_pid(pid, percentage_delta=1):
     """
     Get the RSS memory that a guest uses, given the pid of the guest.
 
     Wait till the fluctuations in RSS drop below percentage_delta. If timeout
     is reached before the fluctuations drop, raise an exception.
     """
 
+    # All values are reported as KiB
+
     def get_rss_from_pmap():
         _, output, _ = run_cmd("pmap -X {}".format(pid))
         return int(output.split("\n")[-2].split()[1], 10)
 
     first_rss = get_rss_from_pmap()
     time.sleep(1)
     second_rss = get_rss_from_pmap()
-
-    delta = (abs(first_rss - second_rss) / float(first_rss)) * 100
-    assert delta < percentage_delta
-
+    print(f"RSS readings: {first_rss}, {second_rss}")
+    abs_diff = abs(first_rss - second_rss)
+    abs_delta = 100 * abs_diff / first_rss
+    assert abs_delta < percentage_delta or abs_diff < 2**10
     return second_rss
 
 
-def make_guest_dirty_memory(ssh_connection, should_oom=False, amount=8192):
+def make_guest_dirty_memory(ssh_connection, should_oom=False, amount_mib=32):
     """Tell the guest, over ssh, to dirty `amount` pages of memory."""
     logger = logging.getLogger("make_guest_dirty_memory")
 
-    amount_in_mbytes = amount / MB_TO_PAGES
-
-    cmd = f"/sbin/fillmem {amount_in_mbytes}"
+    cmd = f"/sbin/fillmem {amount_mib}"
     exit_code, stdout, stderr = ssh_connection.execute_command(cmd)
     # add something to the logs for troubleshooting
     if exit_code != 0:
@@ -53,18 +52,23 @@ def make_guest_dirty_memory(ssh_connection, should_oom=False, amount=8192):
         logger.error("stderr: %s", stderr)
 
     cmd = "cat /tmp/fillmem_output.txt"
-    _, stdout, _ = ssh_connection.execute_command(cmd)
+    tries = 3
+    while tries > 0:
+        # it may take a bit of time to dirty the memory and the OOM to kick-in
+        time.sleep(0.5)
+        _, stdout, _ = ssh_connection.execute_command(cmd)
+        if stdout != "":
+            break
+        tries -= 1
+
     if should_oom:
-        assert (
-            "OOM Killer stopped the program with "
-            "signal 9, exit code 0" in stdout
-        )
+        assert "OOM Killer stopped the program with signal 9, exit code 0" in stdout
     else:
         assert exit_code == 0, stderr
         assert "Memory filling was successful" in stdout, stdout
 
 
-def _test_rss_memory_lower(test_microvm):
+def _test_rss_memory_lower(test_microvm, stable_delta=1):
     """Check inflating the balloon makes guest use less rss memory."""
     # Get the firecracker pid, and open an ssh connection.
     firecracker_pid = test_microvm.jailer_clone_pid
@@ -75,20 +79,22 @@ def _test_rss_memory_lower(test_microvm):
     assert test_microvm.api_session.is_status_no_content(response.status_code)
 
     # Get initial rss consumption.
-    init_rss = get_stable_rss_mem_by_pid(firecracker_pid)
+    init_rss = get_stable_rss_mem_by_pid(firecracker_pid, percentage_delta=stable_delta)
 
     # Get the balloon back to 0.
     response = test_microvm.balloon.patch(amount_mib=0)
     assert test_microvm.api_session.is_status_no_content(response.status_code)
     # This call will internally wait for rss to become stable.
-    _ = get_stable_rss_mem_by_pid(firecracker_pid)
+    _ = get_stable_rss_mem_by_pid(firecracker_pid, percentage_delta=stable_delta)
 
     # Dirty memory, then inflate balloon and get ballooned rss consumption.
     make_guest_dirty_memory(ssh_connection)
 
     response = test_microvm.balloon.patch(amount_mib=200)
     assert test_microvm.api_session.is_status_no_content(response.status_code)
-    balloon_rss = get_stable_rss_mem_by_pid(firecracker_pid)
+    balloon_rss = get_stable_rss_mem_by_pid(
+        firecracker_pid, percentage_delta=stable_delta
+    )
 
     # Check that the ballooning reclaimed the memory.
     assert balloon_rss - init_rss <= 15000
@@ -236,7 +242,7 @@ def test_reinflate_balloon(test_microvm_with_api):
     _ = get_stable_rss_mem_by_pid(firecracker_pid)
 
     # Get the guest to dirty memory.
-    make_guest_dirty_memory(test_microvm.ssh)
+    make_guest_dirty_memory(test_microvm.ssh, amount_mib=32)
     first_reading = get_stable_rss_mem_by_pid(firecracker_pid)
 
     # Now inflate the balloon.
@@ -251,7 +257,7 @@ def test_reinflate_balloon(test_microvm_with_api):
     _ = get_stable_rss_mem_by_pid(firecracker_pid)
 
     # Now have the guest dirty memory again.
-    make_guest_dirty_memory(test_microvm.ssh)
+    make_guest_dirty_memory(test_microvm.ssh, amount_mib=32)
     third_reading = get_stable_rss_mem_by_pid(firecracker_pid)
 
     # Now inflate the balloon again.
@@ -335,7 +341,7 @@ def test_stats(test_microvm_with_api):
     initial_stats = test_microvm.balloon.get_stats().json()
 
     # Dirty 10MB of pages.
-    make_guest_dirty_memory(test_microvm.ssh, amount=10 * MB_TO_PAGES)
+    make_guest_dirty_memory(test_microvm.ssh, amount_mib=10)
     time.sleep(1)
     # This call will internally wait for rss to become stable.
     _ = get_stable_rss_mem_by_pid(firecracker_pid)
@@ -395,7 +401,7 @@ def test_stats_update(test_microvm_with_api):
     firecracker_pid = test_microvm.jailer_clone_pid
 
     # Dirty 30MB of pages.
-    make_guest_dirty_memory(test_microvm.ssh, amount=30 * MB_TO_PAGES)
+    make_guest_dirty_memory(test_microvm.ssh, amount_mib=30)
 
     # This call will internally wait for rss to become stable.
     _ = get_stable_rss_mem_by_pid(firecracker_pid)
@@ -453,7 +459,7 @@ def test_balloon_snapshot(bin_cloner_path, microvm_factory, guest_kernel, rootfs
     vm.start()
 
     # Dirty 60MB of pages.
-    make_guest_dirty_memory(vm.ssh, amount=60 * MB_TO_PAGES)
+    make_guest_dirty_memory(vm.ssh, amount_mib=60)
     time.sleep(1)
 
     # Get the firecracker pid, and open an ssh connection.
@@ -502,7 +508,7 @@ def test_balloon_snapshot(bin_cloner_path, microvm_factory, guest_kernel, rootfs
     third_reading = get_stable_rss_mem_by_pid(firecracker_pid)
 
     # Dirty 60MB of pages.
-    make_guest_dirty_memory(microvm.ssh, amount=60 * MB_TO_PAGES)
+    make_guest_dirty_memory(microvm.ssh, amount_mib=60)
 
     # Check memory usage.
     fourth_reading = get_stable_rss_mem_by_pid(firecracker_pid)
@@ -583,7 +589,7 @@ def test_memory_scrub(microvm_factory, guest_kernel, rootfs):
     microvm.start()
 
     # Dirty 60MB of pages.
-    make_guest_dirty_memory(microvm.ssh, amount=60 * MB_TO_PAGES)
+    make_guest_dirty_memory(microvm.ssh, amount_mib=60)
 
     # Now inflate the balloon with 60MB of pages.
     response = microvm.balloon.patch(amount_mib=60)