From 235493696f5980556e9d9afa3ae8dc0b6f524d00 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 13 Mar 2024 17:18:52 +0000 Subject: [PATCH 1/6] fix(vmm): only use memfd if no vhost-user-blk devices configured Page faults are more expensive for shared memory mapping (like memfd). For this reason, we only back guest memory with a memfd if a vhost-user-blk device is configured in the VM, otherwise we fall back to anonymous private memory. This is recovering performance demonstrated before commit 027a9929d62c77fa580eaac4792a9acc1bd3ae1c had been merged. The vhost-user-blk branch in VM creation is not currently covered by Rust integration tests, because we are looking for a converging solution, so guest memory is always created in the same way, and implementing required integration test infrastructure code (spawning a backend process) would not be useful long term. Signed-off-by: Nikita Kalyazin --- src/vmm/src/builder.rs | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 41b2b79ae36..b6b3fad97e8 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -237,12 +237,39 @@ pub fn build_microvm_for_boot( .ok_or(MissingKernelConfig)?; let track_dirty_pages = vm_resources.track_dirty_pages(); - let guest_memory = GuestMemoryMmap::memfd_backed( - vm_resources.vm_config.mem_size_mib, - track_dirty_pages, - vm_resources.vm_config.huge_pages, - ) - .map_err(StartMicrovmError::GuestMemory)?; + + let vhost_user_device_used = vm_resources + .block + .devices + .iter() + .any(|b| b.lock().expect("Poisoned lock").is_vhost_user()); + + // Page faults are more expensive for shared memory mapping, including memfd. + // For this reason, we only back guest memory with a memfd + // if a vhost-user-blk device is configured in the VM, otherwise we fall back to + // anonymous private memory.
+ // + // The vhost-user-blk branch is not currently covered by integration tests in Rust, + // because that would require running a backend process. If in the future we converge to + // a single way of backing guest memory for vhost-user and non-vhost-user cases, + // that would not be worth the effort. + let guest_memory = if vhost_user_device_used { + GuestMemoryMmap::memfd_backed( + vm_resources.vm_config.mem_size_mib, + track_dirty_pages, + vm_resources.vm_config.huge_pages, + ) + .map_err(StartMicrovmError::GuestMemory)? + } else { + let regions = crate::arch::arch_memory_regions(vm_resources.vm_config.mem_size_mib << 20); + GuestMemoryMmap::from_raw_regions( + ®ions, + track_dirty_pages, + vm_resources.vm_config.huge_pages, + ) + .map_err(StartMicrovmError::GuestMemory)? + }; + let entry_addr = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; // Clone the command-line so that a failed boot doesn't pollute the original. From 1ace020bbe055acc3eee28b62816dd27b3d5590f Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Thu, 14 Mar 2024 10:28:56 +0000 Subject: [PATCH 2/6] test(api): adjust error in negative memory test This is because guest memory is backed by anon private mapping in a regular case. 
Signed-off-by: Nikita Kalyazin --- tests/integration_tests/functional/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 4c3c8c6998d..971507cca79 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -392,7 +392,7 @@ def test_api_machine_config(uvm_plain): test_microvm.api.machine_config.patch(mem_size_mib=bad_size) fail_msg = re.escape( - "Invalid Memory Configuration: Cannot resize memfd file: Custom { kind: InvalidInput, error: TryFromIntError(()) }" + "Invalid Memory Configuration: Cannot create mmap region: Out of memory (os error 12)" ) with pytest.raises(RuntimeError, match=fail_msg): test_microvm.start() From 187ddd802c7b006f17b56f43a8dda8fe0775054f Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Thu, 14 Mar 2024 10:39:50 +0000 Subject: [PATCH 3/6] test(hugepages): adjust for anon private memory Huge pages tests search for memory mapping name, which is now anon private instead of memfd. Signed-off-by: Nikita Kalyazin --- tests/integration_tests/performance/test_huge_pages.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 4a88190ff07..eba44ac872f 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -18,11 +18,13 @@ def check_hugetlbfs_in_use(pid: int, allocation_name: str): `allocation_name` should be the name of the smaps entry for which we want to verify that huge pages are used. 
For memfd-backed guest memory, this would be "memfd:guest_mem" (the `guest_mem` part originating from the name - we give the memfd in memory.rs), for anonymous memory this would be "/anon_hugepage" + we give the memfd in memory.rs), for anonymous memory this would be "/anon_hugepage". + Note: in our testing, we do not currently configure vhost-user-blk devices, so we only exercise + the "/anon_hugepage" case. """ # Format of a sample smaps entry: - # 7fc2bc400000-7fc2cc400000 rw-s 00000000 00:10 25488401 /memfd:guest_mem (deleted) + # 7fc2bc400000-7fc2cc400000 rw-s 00000000 00:10 25488401 /anon_hugepage # Size: 262144 kB # KernelPageSize: 2048 kB # MMUPageSize: 2048 kB @@ -70,7 +72,7 @@ def test_hugetlbfs_boot(uvm_plain): check_hugetlbfs_in_use( uvm_plain.firecracker_pid, - "memfd:guest_mem", + "/anon_hugepage", ) @@ -97,7 +99,7 @@ def test_hugetlbfs_snapshot( rc, _, _ = vm.ssh.run("true") assert not rc - check_hugetlbfs_in_use(vm.firecracker_pid, "memfd:guest_mem") + check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") snapshot = vm.snapshot_full() From 47022305f9c034448518b729253e33073924386e Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Thu, 14 Mar 2024 11:21:27 +0000 Subject: [PATCH 4/6] test(jailer): adjust file size limit test for anon mapping This brings the former negative file size limit test from before 60737eb93169016d2a58dac2b7cbd8f14786121e , which is valid for anon private guest memory backing. 
Signed-off-by: Nikita Kalyazin --- tests/integration_tests/security/test_jail.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/integration_tests/security/test_jail.py b/tests/integration_tests/security/test_jail.py index ff4fd6ff3dd..f718c550eb7 100644 --- a/tests/integration_tests/security/test_jail.py +++ b/tests/integration_tests/security/test_jail.py @@ -489,21 +489,23 @@ def test_positive_file_size_limit(uvm_plain): def test_negative_file_size_limit(uvm_plain): """ - Test creating vm fails when memory size exceeds `fsize` limit. - This is caused by the fact that we back guest memory by memfd. + Test creating snapshot file fails when size exceeds `fsize` limit. """ - - vm_mem_size = 128 - jail_limit = (vm_mem_size - 1) << 20 - test_microvm = uvm_plain - test_microvm.jailer.resource_limits = [f"fsize={jail_limit}"] + # limit to 1MB, to account for logs and metrics + test_microvm.jailer.resource_limits = [f"fsize={2**20}"] test_microvm.spawn() - test_microvm.basic_config(mem_size_mib=vm_mem_size) + test_microvm.basic_config() + test_microvm.start() - # Attempt to start a vm. + test_microvm.pause() + + # Attempt to create a snapshot. try: - test_microvm.start() + test_microvm.api.snapshot_create.put( + mem_file_path="/vm.mem", + snapshot_path="/vm.vmstate", + ) except ( http_client.RemoteDisconnected, urllib3.exceptions.ProtocolError, From 586de7948a4c4060a5009294d4db642f2fed6197 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Thu, 14 Mar 2024 11:04:18 +0000 Subject: [PATCH 5/6] doc(vhost-user-blk): add performance considerations Mention memfd page fault overhead and host pagecache usage for virtio block devices compared to vhost-user. 
Signed-off-by: Nikita Kalyazin --- docs/api_requests/block-vhost-user.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/api_requests/block-vhost-user.md b/docs/api_requests/block-vhost-user.md index a6b0dd51125..8325992caf4 100644 --- a/docs/api_requests/block-vhost-user.md +++ b/docs/api_requests/block-vhost-user.md @@ -74,6 +74,28 @@ be coming from the fact that the custom logic is implemented in the same process that handles Virtio queues, which reduces the number of required context switches. +## Disadvantages + +In order for the backend to be able to process virtio requests, guest memory +needs to be shared by the frontend to the backend. This means, a shared memory +mapping is required to back guest memory. When a vhost-user device is +configured, Firecracker uses `memfd_create` instead of creating an anonymous +private mapping to achieve that. It was observed that page faults to a shared +memory mapping take significantly longer (up to 24% in our testing), because +Linux memory subsystem has to use atomic memory operations to update page +status, which is an expensive operation under specific conditions. We advise +users to profile performance on their workloads when considering to use +vhost-user devices. + +## Other considerations + +Compared to virtio block device where Firecracker interacts with a drive file on +the host, vhost-user block device is handled by the backend directly. Some +workloads may benefit from caching and readahead that the host pagecache offers +for the backing file. This benefit is not available in vhost-user block case. +Users may need to implement internal caching within the backend if they find it +appropriate. 
+ ## Backends There are a number of open source implementations of a vhost-user backend From 58c7c56eb0319699b7061042197f5dacec4e1771 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Thu, 14 Mar 2024 11:48:51 +0000 Subject: [PATCH 6/6] doc(changelog): mention usage of anon private for non-vhost-user vms Adds a changelog entry that VMs that do not use vhost-user-blk devices are backed by anon private memory mapping. Signed-off-by: Nikita Kalyazin --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 45a9fe6fd28..e973916cbbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,10 @@ and this project adheres to information about page size to the payload Firecracker sends to the UFFD handler. Each memory region object now contains a `page_size_kib` field. See also the [hugepages documentation](docs/hugepages.md). +- [#4498](https://github.com/firecracker-microvm/firecracker/pull/4498): Only + use memfd to back guest memory if a vhost-user-blk device is configured, + otherwise use anonymous private memory. This is because serving page faults of + shared memory used by memfd is slower and may impact workloads. ### Deprecated