From ab2fe8b2d7bda74023f302a7670cf323d329cbe5 Mon Sep 17 00:00:00 2001 From: Albin Kerouanton Date: Wed, 17 Dec 2025 15:47:16 +0100 Subject: [PATCH] net: add a new flag NET_FLAG_INCLUDE_VNET_HEADER This flag should be used to indicate to libkrun that the downstream network backend wants to receive and transmit the virtio-net header along with Ethernet frames. Network backends using this flag can then forward unmodified headers to another VM or build a sensible virtio_net_hdr (e.g. with GSO fields correctly set) such that the receiving VM handles GSO'd frames properly. Signed-off-by: Albin Kerouanton --- include/libkrun.h | 1 + src/devices/src/virtio/net/device.rs | 4 +++ src/devices/src/virtio/net/mod.rs | 7 +++- src/devices/src/virtio/net/tap.rs | 37 +++++++++++++++++---- src/devices/src/virtio/net/unixgram.rs | 42 +++++++++++++++++++----- src/devices/src/virtio/net/unixstream.rs | 35 +++++++++++++------- src/devices/src/virtio/net/worker.rs | 30 +++++++++-------- src/libkrun/src/lib.rs | 24 +++++++++----- src/vmm/src/vmm_config/net.rs | 12 +++++-- 9 files changed, 140 insertions(+), 52 deletions(-) diff --git a/include/libkrun.h b/include/libkrun.h index 462a7166c..5383fdbd3 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -326,6 +326,7 @@ int32_t krun_add_virtiofs2(uint32_t ctx_id, /* Send the VFKIT magic after establishing the connection, as required by gvproxy in vfkit mode. */ #define NET_FLAG_VFKIT 1 << 0 +#define NET_FLAG_INCLUDE_VNET_HEADER 1 << 1 /* Taken from uapi/linux/virtio_net.h */ #define NET_FEATURE_CSUM 1 << 0 diff --git a/src/devices/src/virtio/net/device.rs b/src/devices/src/virtio/net/device.rs index fc0e27c4f..e6dcf429b 100644 --- a/src/devices/src/virtio/net/device.rs +++ b/src/devices/src/virtio/net/device.rs @@ -54,6 +54,7 @@ struct VirtioNetConfig { mac: [u8; 6], status: u16, max_virtqueue_pairs: u16, + include_vnet_header: bool, } // Safe because it only has data and has no implicit padding. 
@@ -91,6 +92,7 @@ impl Net { cfg_backend: VirtioNetBackend, mac: [u8; 6], features: u32, + include_vnet_header: bool, ) -> Result { let avail_features = features as u64 | (1 << VIRTIO_NET_F_MAC) @@ -108,6 +110,7 @@ impl Net { mac, status: 0, max_virtqueue_pairs: 0, + include_vnet_header, }; Ok(Net { @@ -207,6 +210,7 @@ impl VirtioDevice for Net { interrupt.clone(), mem.clone(), self.acked_features, + self.config.include_vnet_header, self.cfg_backend.clone(), ) { Ok(worker) => { diff --git a/src/devices/src/virtio/net/mod.rs b/src/devices/src/virtio/net/mod.rs index 367b6013d..8f3a8667a 100644 --- a/src/devices/src/virtio/net/mod.rs +++ b/src/devices/src/virtio/net/mod.rs @@ -4,7 +4,12 @@ use std::{io, mem, result}; use virtio_bindings::virtio_net::virtio_net_hdr_v1; -pub const MAX_BUFFER_SIZE: usize = 65562; +/// Each frame forwarded to a unixstream backend is prepended by a 4 byte "header". +/// It is interpreted as a big-endian u32 integer and is the length of the following ethernet frame. +/// In order to avoid unnecessary allocations and copies, the TX buffer is allocated with extra +/// space to accommodate this header. 
+const FRAME_HEADER_LEN: usize = 4; +pub const MAX_BUFFER_SIZE: usize = 65562 + FRAME_HEADER_LEN; pub const QUEUE_SIZE: u16 = 1024; pub const NUM_QUEUES: usize = 2; pub const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE; NUM_QUEUES]; diff --git a/src/devices/src/virtio/net/tap.rs b/src/devices/src/virtio/net/tap.rs index 1c8bde34e..f43166080 100644 --- a/src/devices/src/virtio/net/tap.rs +++ b/src/devices/src/virtio/net/tap.rs @@ -14,6 +14,7 @@ use virtio_bindings::virtio_net::{ }; use super::backend::{ConnectError, NetBackend, ReadError, WriteError}; +use super::{write_virtio_net_hdr, FRAME_HEADER_LEN}; ioctl_write_ptr!(tunsetiff, b'T', 202, c_int); ioctl_write_int!(tunsetoffload, b'T', 208); @@ -21,11 +22,16 @@ ioctl_write_ptr!(tunsetvnethdrsz, b'T', 216, c_int); pub struct Tap { fd: OwnedFd, + include_vnet_header: bool, } impl Tap { /// Create an endpoint using the file descriptor of a tap device - pub fn new(tap_name: String, vnet_features: u64) -> Result { + pub fn new( + tap_name: String, + vnet_features: u64, + include_vnet_header: bool, + ) -> Result { let fd = match open("/dev/net/tun", OFlag::O_RDWR, Mode::empty()) { Ok(fd) => fd, Err(err) => return Err(ConnectError::OpenNetTun(err)), @@ -41,7 +47,10 @@ impl Tap { ); } - req.ifr_ifru.ifru_flags = IFF_TAP as i16 | IFF_NO_PI as i16 | IFF_VNET_HDR as i16; + req.ifr_ifru.ifru_flags = IFF_TAP as i16 | IFF_NO_PI as i16; + if include_vnet_header { + req.ifr_ifru.ifru_flags |= IFF_VNET_HDR as i16; + } let mut offload_flags: u64 = 0; if (vnet_features & (1 << VIRTIO_NET_F_GUEST_CSUM)) != 0 { @@ -84,7 +93,10 @@ impl Tap { Err(e) => error!("couldn't obtain fd flags id={fd:?}, err={e}"), }; - Ok(Self { fd }) + Ok(Self { + fd, + include_vnet_header, + }) } } @@ -92,7 +104,13 @@ impl NetBackend for Tap { /// Try to read a frame from the tap devie. If no bytes are available reports /// ReadError::NothingRead. 
fn read_frame(&mut self, buf: &mut [u8]) -> Result { - let frame_length = match read(&self.fd, buf) { + let buf_offset = if !self.include_vnet_header { + write_virtio_net_hdr(buf) + } else { + 0 + }; + + let frame_length = match read(&self.fd, &mut buf[buf_offset..]) { Ok(f) => f, #[allow(unreachable_patterns)] Err(nix::Error::EAGAIN | nix::Error::EWOULDBLOCK) => { @@ -103,12 +121,17 @@ impl NetBackend for Tap { } }; debug!("Read eth frame from tap: {frame_length} bytes"); - Ok(frame_length) + Ok(buf_offset + frame_length) } /// Try to write a frame to the tap device. - fn write_frame(&mut self, _hdr_len: usize, buf: &mut [u8]) -> Result<(), WriteError> { - let ret = write(&self.fd, buf).map_err(WriteError::Internal)?; + fn write_frame(&mut self, hdr_len: usize, buf: &mut [u8]) -> Result<(), WriteError> { + let buf_offset = if !self.include_vnet_header { + hdr_len + } else { + FRAME_HEADER_LEN + }; + let ret = write(&self.fd, &buf[buf_offset..]).map_err(WriteError::Internal)?; debug!("Written frame size={}, written={}", buf.len(), ret); Ok(()) } diff --git a/src/devices/src/virtio/net/unixgram.rs b/src/devices/src/virtio/net/unixgram.rs index 04e230066..7b3749684 100644 --- a/src/devices/src/virtio/net/unixgram.rs +++ b/src/devices/src/virtio/net/unixgram.rs @@ -8,17 +8,18 @@ use std::os::fd::{AsRawFd, OwnedFd, RawFd}; use std::path::PathBuf; use super::backend::{ConnectError, NetBackend, ReadError, WriteError}; -use super::write_virtio_net_hdr; +use super::{write_virtio_net_hdr, FRAME_HEADER_LEN}; const VFKIT_MAGIC: [u8; 4] = *b"VFKT"; pub struct Unixgram { fd: OwnedFd, + include_vnet_header: bool, } impl Unixgram { /// Create the backend with a pre-established connection to the userspace network proxy. - pub fn new(fd: OwnedFd) -> Self { + pub fn new(fd: OwnedFd, include_vnet_header: bool) -> Self { // Ensure the socket is in non-blocking mode. 
match fcntl(&fd, FcntlArg::F_GETFL) { Ok(flags) => match OFlag::from_bits(flags) { @@ -47,11 +48,18 @@ impl Unixgram { }; } - Self { fd } + Self { + fd, + include_vnet_header, + } } /// Create the backend opening a connection to the userspace network proxy. - pub fn open(path: PathBuf, send_vfkit_magic: bool) -> Result { + pub fn open( + path: PathBuf, + send_vfkit_magic: bool, + include_vnet_header: bool, + ) -> Result { // We cannot create a non-blocking socket on macOS here. This is done later in new(). let fd = socket( AddressFamily::Unix, @@ -90,15 +98,24 @@ impl Unixgram { getsockopt(&fd, sockopt::RcvBuf) ); - Ok(Self::new(fd)) + Ok(Self::new(fd, include_vnet_header)) } } impl NetBackend for Unixgram { /// Try to read a frame the proxy. If no bytes are available reports ReadError::NothingRead fn read_frame(&mut self, buf: &mut [u8]) -> Result { - let hdr_len = write_virtio_net_hdr(buf); - let frame_length = match recv(self.fd.as_raw_fd(), &mut buf[hdr_len..], MsgFlags::empty()) { + let buf_offset = if !self.include_vnet_header { + write_virtio_net_hdr(buf) + } else { + 0 + }; + + let frame_length = match recv( + self.fd.as_raw_fd(), + &mut buf[buf_offset..], + MsgFlags::empty(), + ) { Ok(f) => f, #[allow(unreachable_patterns)] Err(nix::Error::EAGAIN | nix::Error::EWOULDBLOCK) => { @@ -109,12 +126,19 @@ impl NetBackend for Unixgram { } }; debug!("Read eth frame from proxy: {frame_length} bytes"); - Ok(hdr_len + frame_length) + Ok(buf_offset + frame_length) } /// Try to write a frame to the proxy. fn write_frame(&mut self, hdr_len: usize, buf: &mut [u8]) -> Result<(), WriteError> { - let ret = send(self.fd.as_raw_fd(), &buf[hdr_len..], MsgFlags::empty()) + let buf_offset = if !self.include_vnet_header { + hdr_len + } else { + // Unixgram backends don't include the frame length header. 
+ FRAME_HEADER_LEN + }; + + let ret = send(self.fd.as_raw_fd(), &buf[buf_offset..], MsgFlags::empty()) .map_err(WriteError::Internal)?; debug!( "Written frame size={}, written={}", diff --git a/src/devices/src/virtio/net/unixstream.rs b/src/devices/src/virtio/net/unixstream.rs index 023be6b28..7c31eabde 100644 --- a/src/devices/src/virtio/net/unixstream.rs +++ b/src/devices/src/virtio/net/unixstream.rs @@ -10,11 +10,7 @@ use std::{ use crate::virtio::net::backend::ConnectError; use super::backend::{NetBackend, ReadError, WriteError}; -use super::write_virtio_net_hdr; - -/// Each frame the network proxy is prepended by a 4 byte "header". -/// It is interpreted as a big-endian u32 integer and is the length of the following ethernet frame. -const FRAME_HEADER_LEN: usize = 4; +use super::{write_virtio_net_hdr, FRAME_HEADER_LEN}; pub struct Unixstream { fd: OwnedFd, @@ -22,11 +18,12 @@ pub struct Unixstream { expecting_frame_length: u32, // 0 if last write is fully complete, otherwise the length that was written last_partial_write_length: usize, + include_vnet_header: bool, } impl Unixstream { /// Create the backend with a pre-established connection to the userspace network proxy. - pub fn new(fd: OwnedFd) -> Self { + pub fn new(fd: OwnedFd, include_vnet_header: bool) -> Self { if let Err(e) = setsockopt(&fd, sockopt::SndBuf, &(16 * 1024 * 1024)) { log::warn!("Failed to increase SO_SNDBUF (performance may be decreased): {e}"); } @@ -41,11 +38,12 @@ impl Unixstream { fd, expecting_frame_length: 0, last_partial_write_length: 0, + include_vnet_header, } } /// Create the backend opening a connection to the userspace network proxy. 
- pub fn open(path: PathBuf) -> Result { + pub fn open(path: PathBuf, include_vnet_header: bool) -> Result { let fd = socket( AddressFamily::Unix, SockType::Stream, @@ -70,6 +68,7 @@ impl Unixstream { fd, expecting_frame_length: 0, last_partial_write_length: 0, + include_vnet_header, }) } @@ -159,13 +158,17 @@ impl NetBackend for Unixstream { }; } - let hdr_len = write_virtio_net_hdr(buf); - let buf = &mut buf[hdr_len..]; + let buf_offset = if !self.include_vnet_header { + write_virtio_net_hdr(buf) + } else { + 0 + }; + let buf = &mut buf[buf_offset..]; let frame_length = self.expecting_frame_length as usize; self.read_loop(&mut buf[..frame_length], false)?; self.expecting_frame_length = 0; log::trace!("Read eth frame from network proxy: {frame_length} bytes"); - Ok(hdr_len + frame_length) + Ok(buf_offset + frame_length) } /// Try to write a frame to the proxy. @@ -188,10 +191,18 @@ impl NetBackend for Unixstream { assert!(buf.len() > hdr_len); let frame_length = buf.len() - hdr_len; - buf[hdr_len - FRAME_HEADER_LEN..hdr_len] + // If the vnet header is not included, overwrite it with the frame length, otherwise + // write the frame length before the vnet header. 
+ let buf_offset = if !self.include_vnet_header { + hdr_len - FRAME_HEADER_LEN + } else { + 0 + }; + let frame_length = frame_length + (hdr_len - FRAME_HEADER_LEN - buf_offset); + buf[buf_offset..buf_offset + FRAME_HEADER_LEN] .copy_from_slice(&(frame_length as u32).to_be_bytes()); - self.write_loop(&buf[hdr_len - FRAME_HEADER_LEN..])?; + self.write_loop(&buf[buf_offset..])?; Ok(()) } diff --git a/src/devices/src/virtio/net/worker.rs b/src/devices/src/virtio/net/worker.rs index 9ab122fe5..63a8f90cc 100644 --- a/src/devices/src/virtio/net/worker.rs +++ b/src/devices/src/virtio/net/worker.rs @@ -3,7 +3,7 @@ use crate::virtio::net::backend::ConnectError; use crate::virtio::net::tap::Tap; use crate::virtio::net::unixgram::Unixgram; use crate::virtio::net::unixstream::Unixstream; -use crate::virtio::net::{MAX_BUFFER_SIZE, QUEUE_SIZE, RX_INDEX, TX_INDEX}; +use crate::virtio::net::{FRAME_HEADER_LEN, MAX_BUFFER_SIZE, QUEUE_SIZE, RX_INDEX, TX_INDEX}; use crate::virtio::{InterruptTransport, Queue}; use super::backend::{NetBackend, ReadError, WriteError}; @@ -30,7 +30,7 @@ pub struct NetWorker { rx_has_deferred_frame: bool, tx_iovec: Vec<(GuestAddress, usize)>, - tx_frame_buf: [u8; MAX_BUFFER_SIZE], + tx_frame_buf: [u8; MAX_BUFFER_SIZE + FRAME_HEADER_LEN], tx_frame_len: usize, } @@ -42,6 +42,7 @@ impl NetWorker { interrupt: InterruptTransport, mem: GuestMemoryMmap, _vnet_features: u64, + include_vnet_header: bool, cfg_backend: VirtioNetBackend, ) -> Result { let backend = match cfg_backend { @@ -49,23 +50,26 @@ impl NetWorker { // SAFETY: we need to trust that the library user has configured // the backend with a healthy file descriptor. let owned_fd = unsafe { OwnedFd::from_raw_fd(fd) }; - Box::new(Unixstream::new(owned_fd)) as Box + Box::new(Unixstream::new(owned_fd, include_vnet_header)) + as Box } VirtioNetBackend::UnixstreamPath(path) => { - Box::new(Unixstream::open(path)?) as Box + Box::new(Unixstream::open(path, include_vnet_header)?) 
as Box } VirtioNetBackend::UnixgramFd(fd) => { // SAFETY: we need to trust that the library user has configured // the backend with a healthy file descriptor. let owned_fd = unsafe { OwnedFd::from_raw_fd(fd) }; - Box::new(Unixgram::new(owned_fd)) as Box + Box::new(Unixgram::new(owned_fd, include_vnet_header)) as Box } VirtioNetBackend::UnixgramPath(path, vfkit_magic) => { - Box::new(Unixgram::open(path, vfkit_magic)?) as Box + Box::new(Unixgram::open(path, vfkit_magic, include_vnet_header)?) + as Box } #[cfg(target_os = "linux")] VirtioNetBackend::Tap(tap_name) => { - Box::new(Tap::new(tap_name, _vnet_features)?) as Box + Box::new(Tap::new(tap_name, _vnet_features, include_vnet_header)?) + as Box } }; @@ -81,7 +85,7 @@ impl NetWorker { rx_frame_buf_len: 0, rx_has_deferred_frame: false, - tx_frame_buf: [0u8; MAX_BUFFER_SIZE], + tx_frame_buf: [0u8; MAX_BUFFER_SIZE + FRAME_HEADER_LEN], tx_frame_len: 0, tx_iovec: Vec::with_capacity(QUEUE_SIZE as usize), }) @@ -306,7 +310,7 @@ impl NetWorker { } // Copy buffer from across multiple descriptors. - let mut read_count = 0; + let mut read_count = FRAME_HEADER_LEN; for (desc_addr, desc_len) in self.tx_iovec.drain(..) 
{ let limit = cmp::min(read_count + desc_len, self.tx_frame_buf.len()); @@ -326,10 +330,10 @@ impl NetWorker { } self.tx_frame_len = read_count; - match self - .backend - .write_frame(vnet_hdr_len(), &mut self.tx_frame_buf[..read_count]) - { + match self.backend.write_frame( + vnet_hdr_len() + FRAME_HEADER_LEN, + &mut self.tx_frame_buf[..read_count], + ) { Ok(()) => { self.tx_frame_len = 0; tx_queue diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 3862de2b4..d63783e50 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -885,6 +885,9 @@ pub unsafe extern "C" fn krun_set_data_disk(ctx_id: u32, c_disk_path: *const c_c #[cfg(feature = "net")] const NET_FLAG_VFKIT: u32 = 1 << 0; +#[cfg(feature = "net")] +const NET_FLAG_INCLUDE_VNET_HEADER: u32 = 1 << 1; + /* Taken from uapi/linux/virtio_net.h */ #[cfg(feature = "net")] const NET_FEATURE_CSUM: u32 = 1 << 0; @@ -962,19 +965,21 @@ pub unsafe extern "C" fn krun_add_net_unixstream( Err(_) => return -libc::EINVAL, }; - /* The unixstream backend doesn't support any flags */ - if flags != 0 { + if (features & !NET_ALL_FEATURES) != 0 { return -libc::EINVAL; } - if (features & !NET_ALL_FEATURES) != 0 { + // Unixstream backends don't support NET_FLAG_VFKIT. 
+ if (flags & !NET_FLAG_INCLUDE_VNET_HEADER) != 0 { return -libc::EINVAL; } + let include_vnet_header: bool = flags & NET_FLAG_INCLUDE_VNET_HEADER != 0; + match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); - create_virtio_net(cfg, backend, mac, features); + create_virtio_net(cfg, backend, mac, features, include_vnet_header); } Entry::Vacant(_) => return -libc::ENOENT, } @@ -1017,10 +1022,11 @@ pub unsafe extern "C" fn krun_add_net_unixgram( return -libc::EINVAL; } - if (flags & !NET_FLAG_VFKIT) != 0 { + if (flags & !(NET_FLAG_VFKIT | NET_FLAG_INCLUDE_VNET_HEADER)) != 0 { return -libc::EINVAL; } let send_vfkit_magic: bool = flags & NET_FLAG_VFKIT != 0; + let include_vnet_header: bool = flags & NET_FLAG_INCLUDE_VNET_HEADER != 0; let backend = if let Some(path) = path { VirtioNetBackend::UnixgramPath(path, send_vfkit_magic) @@ -1031,7 +1037,7 @@ pub unsafe extern "C" fn krun_add_net_unixgram( match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); - create_virtio_net(cfg, backend, mac, features); + create_virtio_net(cfg, backend, mac, features, include_vnet_header); } Entry::Vacant(_) => return -libc::ENOENT, } @@ -1080,7 +1086,7 @@ pub unsafe extern "C" fn krun_add_net_tap( match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); - create_virtio_net(cfg, VirtioNetBackend::Tap(tap_name), mac, features); + create_virtio_net(cfg, VirtioNetBackend::Tap(tap_name), mac, features, true); } Entry::Vacant(_) => return -libc::ENOENT, } @@ -1922,12 +1928,14 @@ fn create_virtio_net( backend: VirtioNetBackend, mac: [u8; 6], features: u32, + include_vnet_header: bool, ) { let network_interface_config = NetworkInterfaceConfig { iface_id: format!("eth{}", ctx_cfg.net_index), backend, mac, features, + include_vnet_header, }; ctx_cfg.net_index += 1; ctx_cfg @@ -2579,7 +2587,7 @@ pub extern "C" fn krun_start_enter(ctx_id: 
u32) -> i32 { let mac = ctx_cfg .legacy_mac .unwrap_or([0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]); - create_virtio_net(&mut ctx_cfg, backend, mac, NET_COMPAT_FEATURES); + create_virtio_net(&mut ctx_cfg, backend, mac, NET_COMPAT_FEATURES, false); } } diff --git a/src/vmm/src/vmm_config/net.rs b/src/vmm/src/vmm_config/net.rs index 444692d8f..b55b8f5c7 100644 --- a/src/vmm/src/vmm_config/net.rs +++ b/src/vmm/src/vmm_config/net.rs @@ -18,6 +18,8 @@ pub struct NetworkInterfaceConfig { pub mac: [u8; 6], /// virtio-net features for the network interface. pub features: u32, + /// Whether vnet headers should be sent to and received from the network backend. + pub include_vnet_header: bool, } /// Errors associated with `NetworkInterfaceConfig`. @@ -65,7 +67,13 @@ impl NetBuilder { /// Creates a Net device from a NetworkInterfaceConfig. pub fn create_net(cfg: NetworkInterfaceConfig) -> Result { // Create and return the Net device - Net::new(cfg.iface_id, cfg.backend, cfg.mac, cfg.features) - .map_err(NetworkInterfaceError::CreateNetworkDevice) + Net::new( + cfg.iface_id, + cfg.backend, + cfg.mac, + cfg.features, + cfg.include_vnet_header, + ) + .map_err(NetworkInterfaceError::CreateNetworkDevice) } }