From 0548566d8ded1bf59e589517df99f3ed98948e04 Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Sun, 10 May 2026 14:34:55 -0700 Subject: [PATCH 01/11] add support for virtio block driver --- scripts/qemu_runner.py | 3 +- src/drivers/block/mod.rs | 64 ++++++++++++ src/drivers/block/virtio.rs | 203 ++++++++++++++++++++++++++++++++++++ src/drivers/init.rs | 5 + src/drivers/mod.rs | 7 +- 5 files changed, 280 insertions(+), 2 deletions(-) create mode 100644 src/drivers/block/mod.rs create mode 100644 src/drivers/block/virtio.rs diff --git a/scripts/qemu_runner.py b/scripts/qemu_runner.py index e519f125..65283bfb 100755 --- a/scripts/qemu_runner.py +++ b/scripts/qemu_runner.py @@ -43,10 +43,11 @@ "-s": None, "-kernel": bin_executable_location, "-append": f"{append_args} --rootfs=ext4fs --automount=/dev,devfs --automount=/tmp,tmpfs --automount=/proc,procfs --automount=/sys,sysfs", + "-drive": "file=ubuntu-noble-arm64.img,format=raw,if=none,readonly=on,cache=none,id=x0", } # Arguments that can appear multiple times (e.g. 
-device) -extra_args = ["-device", "virtio-rng-device"] +extra_args = ["-device", "virtio-rng-device", "-device", "virtio-blk-device,drive=x0"] if args.debug: default_args["-S"] = None diff --git a/src/drivers/block/mod.rs b/src/drivers/block/mod.rs new file mode 100644 index 00000000..beeb52df --- /dev/null +++ b/src/drivers/block/mod.rs @@ -0,0 +1,64 @@ +use alloc::{boxed::Box, sync::Arc, vec::Vec}; +use async_trait::async_trait; +use libkernel::{error::Result, fs::BlockDevice}; +use log::info; + +use crate::sync::SpinLock; + +pub mod virtio; + +struct RegisteredBlockDevice { + name: &'static str, + device: Arc, +} + +struct SharedBlockDevice { + inner: Arc, +} + +#[async_trait] +impl BlockDevice for SharedBlockDevice { + async fn read(&self, block_id: u64, buf: &mut [u8]) -> Result<()> { + self.inner.read(block_id, buf).await + } + + async fn write(&self, block_id: u64, buf: &[u8]) -> Result<()> { + self.inner.write(block_id, buf).await + } + + fn block_size(&self) -> usize { + self.inner.block_size() + } + + async fn sync(&self) -> Result<()> { + self.inner.sync().await + } +} + +static BLOCK_DEVICES: SpinLock> = SpinLock::new(Vec::new()); + +pub fn register_block_device(name: &'static str, device: Arc) -> usize { + let mut devices = BLOCK_DEVICES.lock_save_irq(); + let index = devices.len(); + + devices.push(RegisteredBlockDevice { name, device }); + info!("registered block device {name} as index {index}"); + + index +} + +pub fn get_block_device(index: usize) -> Option<(&'static str, Box)> { + let devices = BLOCK_DEVICES.lock_save_irq(); + let device = devices.get(index)?; + + Some(( + device.name, + Box::new(SharedBlockDevice { + inner: device.device.clone(), + }), + )) +} + +pub fn first_block_device() -> Option<(&'static str, Box)> { + get_block_device(0) +} diff --git a/src/drivers/block/virtio.rs b/src/drivers/block/virtio.rs new file mode 100644 index 00000000..d82a6ea4 --- /dev/null +++ b/src/drivers/block/virtio.rs @@ -0,0 +1,203 @@ +use 
crate::drivers::virtio_hal::VirtioHal; +use crate::sync::SpinLock; +use crate::{ + arch::ArchImpl, + drivers::{ + Driver, DriverManager, + init::PlatformBus, + probe::{DeviceDescriptor, DeviceMatchType}, + }, + kernel_driver, +}; +use alloc::{boxed::Box, sync::Arc}; +use async_trait::async_trait; +use core::ptr::NonNull; +use libkernel::memory::proc_vm::address_space::{KernAddressSpace, VirtualMemory}; +use libkernel::{ + error::{IoError, KernelError, ProbeError, Result}, + fs::BlockDevice, + memory::{ + address::{PA, VA}, + region::PhysMemoryRegion, + }, +}; +use log::info; +use virtio_drivers::{ + Error as VirtioError, + device::blk::{SECTOR_SIZE, VirtIOBlk}, + transport::{ + DeviceType, Transport, + mmio::{MmioTransport, VirtIOHeader}, + }, +}; + +pub struct VirtioBlkDriver { + fdt_name: Option<&'static str>, + blk: SpinLock>, + capacity_sectors: u64, + readonly: bool, +} + +impl VirtioBlkDriver { + pub fn new(fdt_name: Option<&'static str>, transport: T) -> Result { + let blk = VirtIOBlk::::new(transport) + .map_err(|_| KernelError::Other("virtio-blk init failed"))?; + + let capacity_sectors = blk.capacity(); + let readonly = blk.readonly(); + + info!( + "virtio-blk capacity={} sectors ({} bytes), readonly={readonly}", + capacity_sectors, + capacity_sectors * SECTOR_SIZE as u64, + ); + + Ok(Self { + fdt_name, + blk: SpinLock::new(blk), + capacity_sectors, + readonly, + }) + } + + fn validate_io(&self, block_id: u64, len: usize) -> Result { + if len == 0 { + return usize::try_from(block_id).map_err(|_| KernelError::RangeError); + } + + if !len.is_multiple_of(SECTOR_SIZE) { + return Err(KernelError::InvalidValue); + } + + let sectors = (len / SECTOR_SIZE) as u64; + let end = block_id.checked_add(sectors).ok_or(IoError::OutOfBounds)?; + if end > self.capacity_sectors { + return Err(IoError::OutOfBounds.into()); + } + + usize::try_from(block_id).map_err(|_| KernelError::RangeError) + } +} + +impl Driver for VirtioBlkDriver { + fn name(&self) -> &'static str { + 
self.fdt_name.unwrap_or("virtio-blk") + } + + fn as_block_device(self: Arc) -> Option> { + Some(self) + } +} + +#[async_trait] +impl BlockDevice for VirtioBlkDriver { + async fn read(&self, block_id: u64, buf: &mut [u8]) -> Result<()> { + if buf.is_empty() { + return Ok(()); + } + + let block_id = self.validate_io(block_id, buf.len())?; + let mut blk = self.blk.lock_save_irq(); + + blk.read_blocks(block_id, buf).map_err(map_virtio_error) + } + + async fn write(&self, block_id: u64, buf: &[u8]) -> Result<()> { + if buf.is_empty() { + return Ok(()); + } + + if self.readonly { + return Err(KernelError::NotPermitted); + } + + let block_id = self.validate_io(block_id, buf.len())?; + let mut blk = self.blk.lock_save_irq(); + + blk.write_blocks(block_id, buf).map_err(map_virtio_error) + } + + fn block_size(&self) -> usize { + SECTOR_SIZE + } + + async fn sync(&self) -> Result<()> { + let mut blk = self.blk.lock_save_irq(); + blk.flush().map_err(map_virtio_error) + } +} + +fn map_virtio_error(error: VirtioError) -> KernelError { + match error { + VirtioError::QueueFull => KernelError::BufferFull, + VirtioError::NotReady => KernelError::Other("virtio-blk device not ready"), + VirtioError::WrongToken => KernelError::Other("virtio-blk queue token mismatch"), + VirtioError::AlreadyUsed => KernelError::InUse, + VirtioError::InvalidParam => KernelError::InvalidValue, + VirtioError::DmaError => KernelError::NoMemory, + VirtioError::IoError => KernelError::Other("virtio-blk I/O error"), + VirtioError::Unsupported => KernelError::NotSupported, + VirtioError::ConfigSpaceTooSmall => KernelError::Other("virtio-blk config space too small"), + VirtioError::ConfigSpaceMissing => KernelError::Other("virtio-blk config space missing"), + VirtioError::SocketDeviceError(_) => KernelError::Other("virtio-blk transport error"), + } +} + +fn virtio_blk_probe(_dm: &mut DriverManager, d: DeviceDescriptor) -> Result> { + match d { + DeviceDescriptor::Fdt(fdt_node, _flags) => { + let region = 
fdt_node + .reg() + .ok_or(ProbeError::NoReg)? + .next() + .ok_or(ProbeError::NoReg)?; + + let size = region.size.ok_or(ProbeError::NoRegSize)?; + + let mapped: VA = + ArchImpl::kern_address_space() + .lock_save_irq() + .map_mmio(PhysMemoryRegion::new( + PA::from_value(region.address as usize), + size, + ))?; + + let header = NonNull::new(mapped.value() as *mut VirtIOHeader) + .ok_or(KernelError::InvalidValue)?; + + let transport = unsafe { + match MmioTransport::new(header, size) { + Ok(t) => t, + Err(_) => return Err(KernelError::Probe(ProbeError::NoMatch)), + } + }; + + if !matches!(transport.device_type(), DeviceType::Block) { + return Err(KernelError::Probe(ProbeError::NoMatch)); + } + + info!("virtio-blk found at {mapped:?} (node {})", fdt_node.name); + + Ok(Arc::new(VirtioBlkDriver::new( + Some(fdt_node.name), + transport, + )?)) + } + } +} + +pub fn virtio_blk_init(bus: &mut PlatformBus, _dm: &mut DriverManager) -> Result<()> { + bus.register_platform_driver( + DeviceMatchType::FdtCompatible("virtio,mmio"), + Box::new(virtio_blk_probe), + ); + + bus.register_platform_driver( + DeviceMatchType::FdtCompatible("virtio-mmio"), + Box::new(virtio_blk_probe), + ); + + Ok(()) +} + +kernel_driver!(virtio_blk_init); diff --git a/src/drivers/init.rs b/src/drivers/init.rs index efd82d6a..11d87563 100644 --- a/src/drivers/init.rs +++ b/src/drivers/init.rs @@ -1,5 +1,6 @@ use super::{ Driver, DriverManager, + block::register_block_device, probe::{DeviceDescriptor, DeviceMatchType, ProbeFn}, }; use crate::{drivers::DM, sync::SpinLock}; @@ -58,6 +59,10 @@ impl PlatformBus { for probe_fn in probe_fns { match (probe_fn)(dm, descr.clone()) { Ok(driver) => { + if let Some(block_device) = driver.clone().as_block_device() { + register_block_device(driver.name(), block_device); + } + dm.insert_driver(driver.clone()); return Ok(Some(driver)); } diff --git a/src/drivers/mod.rs b/src/drivers/mod.rs index a3a05792..9570f77a 100644 --- a/src/drivers/mod.rs +++ b/src/drivers/mod.rs @@ 
-10,7 +10,7 @@ use alloc::{ }; use libkernel::{ error::{KernelError, Result}, - fs::OpenFlags, + fs::{BlockDevice, OpenFlags}, }; use probe::DeviceDescriptor; @@ -20,6 +20,7 @@ use crate::{ sync::SpinLock, }; +pub mod block; pub mod chrdev; pub mod display; pub mod fdt_prober; @@ -54,6 +55,10 @@ pub trait Driver: Send + Sync + Any { fn as_filesystem_driver(self: Arc) -> Option> { None } + + fn as_block_device(self: Arc) -> Option> { + None + } } pub trait OpenableDevice: Send + Sync { From 84d95166e6f86f32fb12c54fc40e5dd7ab731040 Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Mon, 11 May 2026 15:26:36 -0700 Subject: [PATCH 02/11] expose disk for syscalls --- libkernel/src/driver.rs | 10 +++ src/drivers/block/mod.rs | 72 ++++++++++++++++++-- src/drivers/fs/dev.rs | 63 +++++++++++++++--- src/fs/blk.rs | 127 ++++++++++++++++++++++++++++++++++++ src/fs/mod.rs | 14 +++- src/fs/syscalls/at/stat.rs | 14 +++- src/fs/syscalls/at/statx.rs | 15 +++++ src/fs/syscalls/mount.rs | 99 ++++++++++++++++++++++++---- 8 files changed, 381 insertions(+), 33 deletions(-) create mode 100644 src/fs/blk.rs diff --git a/libkernel/src/driver.rs b/libkernel/src/driver.rs index 04cb0176..51a2367d 100644 --- a/libkernel/src/driver.rs +++ b/libkernel/src/driver.rs @@ -8,3 +8,13 @@ pub struct CharDevDescriptor { /// The minor device number (identifies the device instance). pub minor: u64, } + +impl CharDevDescriptor { + /// Encodes this device descriptor into a Linux-style `dev_t` value. 
+ pub const fn dev_t(self) -> u64 { + (self.minor & 0xff) + | ((self.major & 0xfff) << 8) + | ((self.minor & !0xff) << 12) + | ((self.major & !0xfff) << 32) + } +} diff --git a/src/drivers/block/mod.rs b/src/drivers/block/mod.rs index beeb52df..063c6e79 100644 --- a/src/drivers/block/mod.rs +++ b/src/drivers/block/mod.rs @@ -1,14 +1,21 @@ -use alloc::{boxed::Box, sync::Arc, vec::Vec}; +use alloc::{boxed::Box, format, string::String, sync::Arc, vec::Vec}; use async_trait::async_trait; -use libkernel::{error::Result, fs::BlockDevice}; +use libkernel::{ + driver::CharDevDescriptor, + error::Result, + fs::{BlockDevice, attr::FilePermissions}, +}; use log::info; -use crate::sync::SpinLock; +use crate::{drivers::fs::dev::devfs, sync::SpinLock}; pub mod virtio; +pub const BLOCK_DEVICE_MAJOR: u64 = 254; + struct RegisteredBlockDevice { name: &'static str, + descriptor: CharDevDescriptor, device: Arc, } @@ -37,16 +44,54 @@ impl BlockDevice for SharedBlockDevice { static BLOCK_DEVICES: SpinLock> = SpinLock::new(Vec::new()); +fn block_device_devfs_name(index: usize) -> String { + let mut suffix = String::new(); + let mut value = index; + + loop { + let c = (b'a' + (value % 26) as u8) as char; + suffix.insert(0, c); + + if value < 26 { + break; + } + + value = value / 26 - 1; + } + + format!("vd{suffix}") +} + pub fn register_block_device(name: &'static str, device: Arc) -> usize { let mut devices = BLOCK_DEVICES.lock_save_irq(); let index = devices.len(); + let descriptor = CharDevDescriptor { + major: BLOCK_DEVICE_MAJOR, + minor: index as u64, + }; + let devfs_name = block_device_devfs_name(index); + let block_size = device.block_size() as u32; - devices.push(RegisteredBlockDevice { name, device }); - info!("registered block device {name} as index {index}"); + devfs() + .mknod_block( + devfs_name.clone(), + descriptor, + FilePermissions::from_bits_retain(0o660), + block_size, + ) + .expect("newly-allocated block device name should be unique"); + + 
devices.push(RegisteredBlockDevice { + name, + descriptor, + device, + }); + info!("registered block device {name} as index {index} at /dev/{devfs_name} ({descriptor:?})"); index } +#[expect(unused)] pub fn get_block_device(index: usize) -> Option<(&'static str, Box)> { let devices = BLOCK_DEVICES.lock_save_irq(); let device = devices.get(index)?; @@ -59,6 +104,23 @@ pub fn get_block_device(index: usize) -> Option<(&'static str, Box Option<(&'static str, Box)> { + let devices = BLOCK_DEVICES.lock_save_irq(); + let device = devices + .iter() + .find(|device| device.descriptor == descriptor)?; + + Some(( + device.name, + Box::new(SharedBlockDevice { + inner: device.device.clone(), + }), + )) +} + +#[expect(unused)] pub fn first_block_device() -> Option<(&'static str, Box)> { get_block_device(0) } diff --git a/src/drivers/fs/dev.rs b/src/drivers/fs/dev.rs index 7f2d0c01..5f0f2b46 100644 --- a/src/drivers/fs/dev.rs +++ b/src/drivers/fs/dev.rs @@ -52,11 +52,13 @@ impl DevFs { }) } - pub fn mknod( + fn insert_device( &self, name: String, - device_id: CharDevDescriptor, + file_type: FileType, + kind: InodeKind, permissions: FilePermissions, + block_size: u32, ) -> Result<()> { let InodeKind::Directory(ref children) = self.root.kind else { // This should be impossible as the root is always a directory. @@ -77,17 +79,48 @@ impl DevFs { id, attr: SpinLock::new(FileAttr { id, - file_type: FileType::CharDevice(device_id), + file_type, permissions, + block_size, ..FileAttr::default() }), - // This is the crucial part: we store the device handle. 
- kind: InodeKind::CharDevice { device_id }, + kind, }); - children.insert(name.to_string(), new_inode); + children.insert(name, new_inode); Ok(()) } + + pub fn mknod( + &self, + name: String, + device_id: CharDevDescriptor, + permissions: FilePermissions, + ) -> Result<()> { + self.insert_device( + name, + FileType::CharDevice(device_id), + InodeKind::CharDevice { device_id }, + permissions, + 0, + ) + } + + pub fn mknod_block( + &self, + name: String, + device_id: CharDevDescriptor, + permissions: FilePermissions, + block_size: u32, + ) -> Result<()> { + self.insert_device( + name, + FileType::BlockDevice(device_id), + InodeKind::BlockDevice { device_id }, + permissions, + block_size, + ) + } } #[async_trait] @@ -112,6 +145,8 @@ enum InodeKind { Directory(SpinLock>>), /// A character device, which stores its major/minor handle (`dev_t`). CharDevice { device_id: CharDevDescriptor }, + /// A block device, which stores its major/minor handle (`dev_t`). + BlockDevice { device_id: CharDevDescriptor }, } struct DevDirStreamer { @@ -159,14 +194,20 @@ impl Inode for DevFsINode { .map(|inode| inode.clone() as Arc) .ok_or_else(|| FsError::NotFound.into()) } - InodeKind::CharDevice { .. } => Err(FsError::NotADirectory.into()), + InodeKind::CharDevice { .. } | InodeKind::BlockDevice { .. } => { + Err(FsError::NotADirectory.into()) + } } } async fn getattr(&self) -> Result { let mut attr = self.attr.lock_save_irq().clone(); - if let InodeKind::CharDevice { device_id } = self.kind { - attr.file_type = FileType::CharDevice(device_id); + match self.kind { + InodeKind::CharDevice { device_id } => attr.file_type = FileType::CharDevice(device_id), + InodeKind::BlockDevice { device_id } => { + attr.file_type = FileType::BlockDevice(device_id); + } + InodeKind::Directory(..) => {} } Ok(attr) } @@ -180,7 +221,9 @@ impl Inode for DevFsINode { idx: start_offset as usize, })) } - InodeKind::CharDevice { .. } => Err(FsError::NotADirectory.into()), + InodeKind::CharDevice { .. 
} | InodeKind::BlockDevice { .. } => { + Err(FsError::NotADirectory.into()) + } } } diff --git a/src/fs/blk.rs b/src/fs/blk.rs new file mode 100644 index 00000000..48c7d771 --- /dev/null +++ b/src/fs/blk.rs @@ -0,0 +1,127 @@ +use super::{fops::FileOps, open_file::FileCtx}; +use crate::memory::uaccess::{copy_from_user_slice, copy_to_user_slice}; +use alloc::{boxed::Box, vec}; +use async_trait::async_trait; +use core::{cmp::min, future::Future, pin::Pin}; +use libkernel::{ + error::{KernelError, Result}, + fs::{BlockDevice, SeekFrom}, + memory::address::UA, +}; + +pub struct BlockFile { + device: Box, +} + +impl BlockFile { + pub fn new(device: Box) -> Self { + Self { device } + } +} + +#[async_trait] +impl FileOps for BlockFile { + async fn readat( + &mut self, + mut user_buf: UA, + mut count: usize, + mut offset: u64, + ) -> Result { + let block_size = self.device.block_size(); + if block_size == 0 { + return Err(KernelError::InvalidValue); + } + + let mut total_bytes_read = 0; + let mut block_buf = vec![0; block_size]; + + while count > 0 { + let block_id = offset / block_size as u64; + let block_offset = (offset % block_size as u64) as usize; + let chunk_size = min(count, block_size - block_offset); + + self.device.read(block_id, &mut block_buf).await?; + copy_to_user_slice( + &block_buf[block_offset..block_offset + chunk_size], + user_buf, + ) + .await?; + + offset += chunk_size as u64; + count -= chunk_size; + total_bytes_read += chunk_size; + user_buf = user_buf.add_bytes(chunk_size); + } + + Ok(total_bytes_read) + } + + async fn writeat( + &mut self, + mut user_buf: UA, + mut count: usize, + mut offset: u64, + ) -> Result { + let block_size = self.device.block_size(); + if block_size == 0 { + return Err(KernelError::InvalidValue); + } + + let mut total_bytes_written = 0; + let mut block_buf = vec![0; block_size]; + + while count > 0 { + let block_id = offset / block_size as u64; + let block_offset = (offset % block_size as u64) as usize; + let chunk_size = 
min(count, block_size - block_offset); + + if block_offset != 0 || chunk_size != block_size { + self.device.read(block_id, &mut block_buf).await?; + } + + copy_from_user_slice( + user_buf, + &mut block_buf[block_offset..block_offset + chunk_size], + ) + .await?; + self.device.write(block_id, &block_buf).await?; + + offset += chunk_size as u64; + count -= chunk_size; + total_bytes_written += chunk_size; + user_buf = user_buf.add_bytes(chunk_size); + } + + Ok(total_bytes_written) + } + + fn poll_read_ready(&self) -> Pin> + 'static + Send>> { + Box::pin(async { Ok(()) }) + } + + fn poll_write_ready(&self) -> Pin> + 'static + Send>> { + Box::pin(async { Ok(()) }) + } + + async fn seek(&mut self, ctx: &mut FileCtx, pos: SeekFrom) -> Result { + fn saturating_add_signed(value: u64, delta: i64) -> u64 { + if delta >= 0 { + value.saturating_add(delta as u64) + } else { + value.saturating_sub((-delta) as u64) + } + } + + match pos { + SeekFrom::Start(offset) => ctx.pos = offset, + SeekFrom::Current(delta) => ctx.pos = saturating_add_signed(ctx.pos, delta), + SeekFrom::End(_) => return Err(KernelError::NotSupported), + } + + Ok(ctx.pos) + } + + async fn flush(&self, _ctx: &FileCtx) -> Result<()> { + self.device.sync().await + } +} diff --git a/src/fs/mod.rs b/src/fs/mod.rs index 0db5a581..ceec0b42 100644 --- a/src/fs/mod.rs +++ b/src/fs/mod.rs @@ -1,11 +1,12 @@ use crate::clock::realtime::date; use crate::{ - drivers::{DM, Driver}, + drivers::{DM, Driver, block::get_block_device_by_descriptor}, process::Task, sync::SpinLock, }; use alloc::{borrow::ToOwned, boxed::Box, collections::btree_map::BTreeMap, sync::Arc, vec::Vec}; use async_trait::async_trait; +use blk::BlockFile; use core::any::Any; use core::sync::atomic::{AtomicU64, Ordering}; use dir::DirFile; @@ -20,6 +21,7 @@ use libkernel::{ use open_file::OpenFile; use reg::RegFile; +pub mod blk; pub mod dir; pub mod fops; pub mod open_file; @@ -427,7 +429,15 @@ impl VFS { Ok(Arc::new(open_file)) } FileType::Symlink => 
unimplemented!(), // this is implemented at resolve_path_internal - FileType::BlockDevice(_) => todo!(), + FileType::BlockDevice(block_dev_descriptor) => { + let (_, block_device) = get_block_device_by_descriptor(block_dev_descriptor) + .ok_or(FsError::NoDevice)?; + + let mut open_file = OpenFile::new(Box::new(BlockFile::new(block_device)), flags); + open_file.update(target_inode, path.to_owned()); + + Ok(Arc::new(open_file)) + } FileType::CharDevice(char_dev_descriptor) => { let char_driver = DM .lock_save_irq() diff --git a/src/fs/syscalls/at/stat.rs b/src/fs/syscalls/at/stat.rs index dcc3fd7c..e2621675 100644 --- a/src/fs/syscalls/at/stat.rs +++ b/src/fs/syscalls/at/stat.rs @@ -13,6 +13,14 @@ use libkernel::{ use super::AtFlags; +fn special_file_rdev(attr: &FileAttr) -> u64 { + match attr.file_type { + libkernel::fs::FileType::BlockDevice(device) + | libkernel::fs::FileType::CharDevice(device) => device.dev_t(), + _ => 0, + } +} + #[repr(C)] #[derive(Debug, Clone, Copy)] pub struct Stat { @@ -42,6 +50,8 @@ unsafe impl UserCopyable for Stat {} impl From for Stat { fn from(value: FileAttr) -> Self { + let st_rdev = special_file_rdev(&value); + Self { st_dev: value.id.fs_id(), st_ino: value.id.inode_id(), @@ -49,12 +59,12 @@ impl From for Stat { st_nlink: value.nlinks, st_uid: value.uid.into(), st_gid: value.gid.into(), - st_rdev: 0, + st_rdev, __pad1: 0, st_size: value.size as _, st_blksize: value.block_size as _, __pad2: 0, - st_blocks: 0, + st_blocks: value.blocks as _, st_atime: value.atime.as_secs() as _, st_atime_nsec: value.atime.subsec_nanos() as _, st_mtime: value.mtime.as_secs() as _, diff --git a/src/fs/syscalls/at/statx.rs b/src/fs/syscalls/at/statx.rs index b2e7acff..c9b0c8ee 100644 --- a/src/fs/syscalls/at/statx.rs +++ b/src/fs/syscalls/at/statx.rs @@ -115,6 +115,16 @@ impl From for StatXTimestamp { unsafe impl UserCopyable for StatX {} +fn special_file_device( + attr: &libkernel::fs::attr::FileAttr, +) -> Option { + match attr.file_type { + 
libkernel::fs::FileType::BlockDevice(device) + | libkernel::fs::FileType::CharDevice(device) => Some(device), + _ => None, + } +} + pub async fn sys_statx( ctx: &ProcessCtx, dirfd: Fd, @@ -205,6 +215,11 @@ pub async fn sys_statx( stat_x.stx_mnt_id = attr.id.fs_id(); } + if let Some(device) = special_file_device(&attr) { + stat_x.stx_rdev_major = device.major as u32; + stat_x.stx_rdev_minor = device.minor as u32; + } + stat_x.stx_attributes_mask = StatXAttr::STATX_ATTR_MOUNT_ROOT.bits(); if VFS.is_mount_root(attr.id) { stat_x.stx_attributes |= StatXAttr::STATX_ATTR_MOUNT_ROOT.bits(); diff --git a/src/fs/syscalls/mount.rs b/src/fs/syscalls/mount.rs index f4672ab0..f404b0ef 100644 --- a/src/fs/syscalls/mount.rs +++ b/src/fs/syscalls/mount.rs @@ -1,10 +1,13 @@ +use alloc::boxed::Box; + +use crate::drivers::block::get_block_device_by_descriptor; use crate::fs::VFS; use crate::memory::uaccess::cstr::UserCStr; use crate::sched::syscall_ctx::ProcessCtx; use bitflags::bitflags; use core::ffi::c_char; -use libkernel::error::{KernelError, Result}; -use libkernel::fs::path::Path; +use libkernel::error::{FsError, KernelError, Result}; +use libkernel::fs::{BlockDevice, FileType, path::Path}; use libkernel::memory::address::{TUA, UA}; bitflags! { @@ -45,6 +48,61 @@ bitflags! 
{ } } +fn mount_filesystem_type(name: &str) -> &str { + match name { + "proc" | "procfs" => "procfs", + "devtmpfs" | "devfs" => "devfs", + "cgroup2" | "cgroupfs" => "cgroupfs", + "sysfs" => "sysfs", + "tmpfs" => "tmpfs", + "ext4" | "ext4fs" => "ext4fs", + "fat" | "fat32" | "fat32fs" | "msdos" | "vfat" => "fat32fs", + other => other, + } +} + +fn fallback_mount_filesystem_type(source: &str) -> Option<&str> { + match source { + "proc" | "procfs" => Some("procfs"), + "devtmpfs" | "devfs" => Some("devfs"), + "cgroup2" | "cgroupfs" => Some("cgroupfs"), + "sysfs" => Some("sysfs"), + "tmpfs" => Some("tmpfs"), + _ => None, + } +} + +fn source_is_non_device(source: &str, fs_type: &str) -> bool { + matches!(source, "" | "none") + || matches!( + (source, fs_type), + ("proc", "procfs") + | ("procfs", "procfs") + | ("devtmpfs", "devfs") + | ("devfs", "devfs") + | ("cgroup2", "cgroupfs") + | ("cgroupfs", "cgroupfs") + | ("sysfs", "sysfs") + | ("tmpfs", "tmpfs") + ) +} + +async fn resolve_mount_block_device( + ctx: &ProcessCtx, + source: &str, +) -> Result> { + let task = ctx.shared().clone(); + let cwd = task.cwd.lock_save_irq().0.clone(); + let inode = VFS.resolve_path(Path::new(source), cwd, &task).await?; + + match inode.getattr().await?.file_type { + FileType::BlockDevice(device_id) => get_block_device_by_descriptor(device_id) + .map(|(_, device)| device) + .ok_or(FsError::NoDevice.into()), + _ => Err(FsError::NoDevice.into()), + } +} + pub async fn sys_mount( ctx: &ProcessCtx, dev_name: TUA, @@ -58,6 +116,7 @@ pub async fn sys_mount( // TODO: Handle later return Ok(0); } + let mut buf = [0u8; 1024]; let dev_name = if dev_name.is_null() { None @@ -68,29 +127,41 @@ pub async fn sys_mount( .await?, ) }; + let mut buf = [0u8; 1024]; let dir_name = UserCStr::from_ptr(dir_name) .copy_from_user(&mut buf) .await?; - let mount_point = VFS - .resolve_path(Path::new(dir_name), VFS.root_inode(), ctx.shared()) - .await?; + let mut buf = [0u8; 1024]; - let fs_type = if type_.is_null() { 
+ let mount_type = if type_.is_null() { None } else { Some(UserCStr::from_ptr(type_).copy_from_user(&mut buf).await?) }; - let fs_name = fs_type.or(dev_name).ok_or(KernelError::NotSupported)?; - let fs_name = match fs_name { - "proc" => "procfs", - "devtmpfs" => "devfs", - "sysfs" => "sysfs", - "cgroup2" => "cgroupfs", - s => s, + let task = ctx.shared().clone(); + let cwd = task.cwd.lock_save_irq().0.clone(); + let mount_point = VFS.resolve_path(Path::new(dir_name), cwd, &task).await?; + + let fs_type = if let Some(mount_type) = mount_type { + mount_filesystem_type(mount_type) + } else if let Some(source) = dev_name { + fallback_mount_filesystem_type(source).ok_or(KernelError::NotSupported)? + } else { + return Err(KernelError::NotSupported); + }; + + let blkdev = if let Some(source) = dev_name { + if source_is_non_device(source, fs_type) { + None + } else { + Some(resolve_mount_block_device(ctx, source).await?) + } + } else { + None }; - VFS.mount(mount_point, fs_name, None).await?; + VFS.mount(mount_point, fs_type, blkdev).await?; Ok(0) } From e3522185b6b933205a2b7e5a6b1e3ce44a46b2a0 Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Mon, 11 May 2026 16:08:27 -0700 Subject: [PATCH 03/11] bounce share DMA support for disks --- src/drivers/virtio_hal.rs | 145 ++++++++++++++++++++++++++++++++++---- 1 file changed, 131 insertions(+), 14 deletions(-) diff --git a/src/drivers/virtio_hal.rs b/src/drivers/virtio_hal.rs index cbb59ae9..eeafecae 100644 --- a/src/drivers/virtio_hal.rs +++ b/src/drivers/virtio_hal.rs @@ -1,12 +1,21 @@ -use crate::arch::{Arch, ArchImpl}; +use crate::arch::ArchImpl; use crate::memory::PageOffsetTranslator; +use crate::sync::SpinLock; +use alloc::vec::Vec; use core::ptr::NonNull; use libkernel::memory::PAGE_SIZE; -use libkernel::memory::address::{PA, TPA}; +use libkernel::memory::address::{PA, TPA, VA}; +use libkernel::memory::proc_vm::address_space::VirtualMemory; use libkernel::memory::region::PhysMemoryRegion; -use log::trace; use 
virtio_drivers::{BufferDirection, Hal, PhysAddr}; +struct BouncedShare { + paddr: PhysAddr, + pages: usize, +} + +static BOUNCED_SHARES: SpinLock> = SpinLock::new(Vec::new()); + pub(super) struct VirtioHal; impl VirtioHal { @@ -16,6 +25,87 @@ impl VirtioHal { let rounded = pages.next_power_of_two(); rounded.ilog2() as u8 } + + fn translated_phys_addr(vaddr: VA) -> Option { + ArchImpl::kern_address_space() + .lock_save_irq() + .translate(vaddr) + .map(|pa| pa.value() as PhysAddr) + } + + fn translate_buffer(vaddr: VA, len: usize) -> Option { + debug_assert!(len > 0); + + let first_page_va = vaddr.page_aligned(); + let last_byte_va = vaddr.add_bytes(len - 1); + let last_page_va = last_byte_va.page_aligned(); + + let first_page_pa = Self::translated_phys_addr(first_page_va)?; + let mut page_va = first_page_va; + let mut expected_page_pa = first_page_pa; + + loop { + let page_pa = Self::translated_phys_addr(page_va)?; + if page_pa != expected_page_pa { + return None; + } + + if page_va == last_page_va { + break; + } + + page_va = page_va.add_pages(1); + expected_page_pa += PAGE_SIZE as PhysAddr; + } + + Some(first_page_pa + vaddr.page_offset() as PhysAddr) + } + + fn bounce_copy_in(paddr: PhysAddr, src: &[u8]) { + let bounce = PA::from_value(paddr as usize) + .cast::() + .to_va::() + .as_ptr_mut(); + + unsafe { + core::ptr::copy_nonoverlapping(src.as_ptr(), bounce, src.len()); + } + } + + fn bounce_copy_out(paddr: PhysAddr, dst: &mut [u8]) { + let bounce = PA::from_value(paddr as usize) + .cast::() + .to_va::() + .as_ptr(); + + unsafe { + core::ptr::copy_nonoverlapping(bounce, dst.as_mut_ptr(), dst.len()); + } + } + + fn share_via_bounce(buffer: &[u8], direction: BufferDirection) -> PhysAddr { + let pages = buffer.len().div_ceil(PAGE_SIZE); + let (paddr, _vaddr) = ::dma_alloc(pages, direction); + + if matches!( + direction, + BufferDirection::DriverToDevice | BufferDirection::Both + ) { + Self::bounce_copy_in(paddr, buffer); + } + + BOUNCED_SHARES + .lock_save_irq() + 
.push(BouncedShare { paddr, pages }); + + // trace!( + // "virtio share: bounced {:p} len={} direction={direction:?} to paddr={paddr:#x}", + // buffer.as_ptr(), + // buffer.len(), + // ); + + paddr + } } unsafe impl Hal for VirtioHal { @@ -40,12 +130,12 @@ unsafe impl Hal for VirtioHal { core::ptr::write_bytes(vaddr.as_ptr(), 0, pages * PAGE_SIZE); } - trace!("alloc DMA: paddr={paddr:#x}, pages={pages}, order={order}"); + // trace!("alloc DMA: paddr={paddr:#x}, pages={pages}, order={order}"); (paddr, vaddr) } unsafe fn dma_dealloc(paddr: PhysAddr, _vaddr: NonNull, pages: usize) -> i32 { - trace!("dealloc DMA: paddr={paddr:#x}, pages={pages}"); + // trace!("dealloc DMA: paddr={paddr:#x}, pages={pages}"); let order = Self::pages_to_order(pages); let region = PhysMemoryRegion::new( @@ -72,18 +162,45 @@ unsafe impl Hal for VirtioHal { NonNull::new(vaddr).unwrap() } - unsafe fn share(buffer: NonNull<[u8]>, _direction: BufferDirection) -> PhysAddr { - // We're assuming that all ram is DMA-coherent in QEMU. - // We don't need to adjust page table mapping properties to disable caching and the like to make this work. - let vaddr = buffer.as_ptr() as *mut u8 as usize; + unsafe fn share(buffer: NonNull<[u8]>, direction: BufferDirection) -> PhysAddr { + // We're assuming that all RAM is DMA-coherent in QEMU, so once we have + // a valid DMA-visible physical address we don't need extra cache + // maintenance here. + let buffer = unsafe { buffer.as_ref() }; + assert!(!buffer.is_empty(), "virtio share: empty buffer"); + let vaddr = VA::from_value(buffer.as_ptr() as usize); - // Buffer must be in the direct map for this fast translation. 
- if vaddr < ArchImpl::PAGE_OFFSET { - panic!("virtio share: buffer VA is not in direct map: {vaddr:#x}"); + if let Some(paddr) = Self::translate_buffer(vaddr, buffer.len()) { + return paddr; } - (vaddr - ArchImpl::PAGE_OFFSET) as PhysAddr + Self::share_via_bounce(buffer, direction) } - unsafe fn unshare(_paddr: PhysAddr, _buffer: NonNull<[u8]>, _direction: BufferDirection) {} + unsafe fn unshare(paddr: PhysAddr, mut buffer: NonNull<[u8]>, direction: BufferDirection) { + let mut bounced = BOUNCED_SHARES.lock_save_irq(); + let Some(index) = bounced.iter().position(|share| share.paddr == paddr) else { + return; + }; + let pages = bounced.swap_remove(index).pages; + drop(bounced); + + let buffer = unsafe { buffer.as_mut() }; + if matches!( + direction, + BufferDirection::DeviceToDriver | BufferDirection::Both + ) { + Self::bounce_copy_out(paddr, buffer); + } + + let vaddr = PA::from_value(paddr as usize) + .cast::() + .to_va::() + .as_ptr_mut(); + let vaddr = NonNull::new(vaddr).expect("virtio bounce buffer VA should never be null"); + + unsafe { + ::dma_dealloc(paddr, vaddr, pages); + } + } } From 72e3b084c35e270be257c24ed2432673c9b13ebf Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Mon, 11 May 2026 16:08:34 -0700 Subject: [PATCH 04/11] fix parameters --- scripts/qemu_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/qemu_runner.py b/scripts/qemu_runner.py index 65283bfb..b990ca62 100755 --- a/scripts/qemu_runner.py +++ b/scripts/qemu_runner.py @@ -41,9 +41,10 @@ "-rtc": "base=utc,clock=host", "-nographic": None, "-s": None, + "-snapshot": None, "-kernel": bin_executable_location, "-append": f"{append_args} --rootfs=ext4fs --automount=/dev,devfs --automount=/tmp,tmpfs --automount=/proc,procfs --automount=/sys,sysfs", - "-drive": "file=ubuntu-noble-arm64.img,format=raw,if=none,readonly=on,cache=none,id=x0", + "-drive": "file=ubuntu-noble-arm64.img,format=raw,if=none,cache=none,id=x0", } # Arguments that can appear multiple times 
(e.g. -device) From 950539f8a36247387d1404d9b2a6119645d8ce96 Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Mon, 11 May 2026 16:19:16 -0700 Subject: [PATCH 05/11] implement pivot_root --- etc/syscalls_linux_aarch64.md | 2 +- src/arch/arm64/exceptions/syscall.rs | 2 + src/fs/mod.rs | 54 +++++++++++ src/fs/syscalls/mod.rs | 1 + src/fs/syscalls/pivot_root.rs | 138 +++++++++++++++++++++++++++ 5 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 src/fs/syscalls/pivot_root.rs diff --git a/etc/syscalls_linux_aarch64.md b/etc/syscalls_linux_aarch64.md index 8696d05e..2c89a674 100644 --- a/etc/syscalls_linux_aarch64.md +++ b/etc/syscalls_linux_aarch64.md @@ -42,7 +42,7 @@ | 0x26 (38) | renameat | (int olddfd, const char *oldname, int newdfd, const char *newname) | __arm64_sys_renameat | true | | 0x27 (39) | umount | (char *name, int flags) | __arm64_sys_umount | false | | 0x28 (40) | mount | (char *dev_name, char *dir_name, char *type, unsigned long flags, void *data) | __arm64_sys_mount | partial | -| 0x29 (41) | pivot_root | (const char *new_root, const char *put_old) | __arm64_sys_pivot_root | false | +| 0x29 (41) | pivot_root | (const char *new_root, const char *put_old) | __arm64_sys_pivot_root | partial | | 0x2b (43) | statfs | (const char *pathname, struct statfs *buf) | __arm64_sys_statfs | partial | | 0x2c (44) | fstatfs | (unsigned int fd, struct statfs *buf) | __arm64_sys_fstatfs | partial | | 0x2d (45) | truncate | (const char *path, long length) | __arm64_sys_truncate | true | diff --git a/src/arch/arm64/exceptions/syscall.rs b/src/arch/arm64/exceptions/syscall.rs index 6ecef8c9..67faf4dd 100644 --- a/src/arch/arm64/exceptions/syscall.rs +++ b/src/arch/arm64/exceptions/syscall.rs @@ -36,6 +36,7 @@ use crate::{ iov::{sys_preadv, sys_preadv2, sys_pwritev, sys_pwritev2, sys_readv, sys_writev}, listxattr::{sys_flistxattr, sys_listxattr, sys_llistxattr}, mount::sys_mount, + pivot_root::sys_pivot_root, removexattr::{sys_fremovexattr, 
sys_lremovexattr, sys_removexattr}, rw::{sys_pread64, sys_pwrite64, sys_read, sys_write}, seek::sys_lseek, @@ -295,6 +296,7 @@ pub async fn handle_syscall(mut ctx: ProcessCtx) { ) .await } + 0x29 => sys_pivot_root(&ctx, TUA::from_value(arg1 as _), TUA::from_value(arg2 as _)).await, 0x2b => sys_statfs(&ctx, TUA::from_value(arg1 as _), TUA::from_value(arg2 as _)).await, 0x2c => sys_fstatfs(&ctx, arg1.into(), TUA::from_value(arg2 as _)).await, 0x2d => sys_truncate(&ctx, TUA::from_value(arg1 as _), arg2 as _).await, diff --git a/src/fs/mod.rs b/src/fs/mod.rs index ceec0b42..d1583520 100644 --- a/src/fs/mod.rs +++ b/src/fs/mod.rs @@ -632,6 +632,60 @@ impl VFS { .values() .any(|mount| mount.root_inode.id() == id) } + + pub fn is_mount_point(&self, id: InodeId) -> bool { + self.state.lock_save_irq().mounts.contains_key(&id) + } + + pub fn mount_point_for_root(&self, id: InodeId) -> Option { + self.state + .lock_save_irq() + .mounts + .iter() + .find_map(|(mount_point, mount)| (mount.root_inode.id() == id).then_some(*mount_point)) + } + + pub fn pivot_root( + &self, + new_root_mount_point: InodeId, + put_old_mount_point: InodeId, + ) -> Result<(Arc, Arc)> { + let old_root_inode = self + .root_inode + .lock_save_irq() + .as_ref() + .cloned() + .ok_or(FsError::NotFound)?; + + let (old_root_root, new_root_root) = { + let mut state = self.state.lock_save_irq(); + + if state.mounts.contains_key(&put_old_mount_point) { + return Err(KernelError::InUse); + } + + let old_root_mount = state + .mounts + .remove(&old_root_inode.id()) + .ok_or(FsError::NotFound)?; + let new_root_mount = state + .mounts + .remove(&new_root_mount_point) + .ok_or(FsError::InvalidInput)?; + + let old_root_root = old_root_mount.root_inode.clone(); + let new_root_root = new_root_mount.root_inode.clone(); + + state.mounts.insert(new_root_root.id(), new_root_mount); + state.mounts.insert(put_old_mount_point, old_root_mount); + + (old_root_root, new_root_root) + }; + + *self.root_inode.lock_save_irq() = 
Some(new_root_root.clone()); + + Ok((old_root_root, new_root_root)) + } } pub static VFS: VFS = VFS::new(); diff --git a/src/fs/syscalls/mod.rs b/src/fs/syscalls/mod.rs index 1cc4c3c9..586fac6b 100644 --- a/src/fs/syscalls/mod.rs +++ b/src/fs/syscalls/mod.rs @@ -10,6 +10,7 @@ pub mod iov; pub mod listxattr; pub mod mount; pub mod open; +pub mod pivot_root; pub mod removexattr; pub mod rw; pub mod seek; diff --git a/src/fs/syscalls/pivot_root.rs b/src/fs/syscalls/pivot_root.rs new file mode 100644 index 00000000..70694344 --- /dev/null +++ b/src/fs/syscalls/pivot_root.rs @@ -0,0 +1,138 @@ +use crate::{ + fs::VFS, memory::uaccess::cstr::UserCStr, process::TASK_LIST, sched::syscall_ctx::ProcessCtx, +}; +use alloc::sync::Arc; +use core::ffi::c_char; +use libkernel::{ + error::{FsError, KernelError, Result}, + fs::{FileType, Inode, InodeId, path::Path}, + memory::address::TUA, + proc::caps::CapabilitiesFlags, +}; + +async fn resolve_attachment_inode( + ctx: &ProcessCtx, + path: &Path, +) -> Result<(Arc, InodeId)> { + let task = ctx.shared().clone(); + let cwd = task.cwd.lock_save_irq().0.clone(); + let resolved = VFS.resolve_path(path, cwd.clone(), &task).await?; + + let attachment = if let Some(name) = path.file_name() { + let parent = if let Some(parent_path) = path.parent() { + VFS.resolve_path(parent_path, cwd, &task).await? + } else if path.is_absolute() { + task.root.lock_save_irq().0.clone() + } else { + cwd + }; + + parent.lookup(name).await?.id() + } else if let Some(mount_point) = VFS.mount_point_for_root(resolved.id()) { + // This supports mount-root paths like "." when they already resolve to + // the root of a mounted filesystem. 
+ mount_point + } else { + resolved.id() + }; + + Ok((resolved, attachment)) +} + +async fn path_is_descendant_or_same( + mut path: Arc, + ancestor: Arc, +) -> Result { + loop { + if path.id() == ancestor.id() { + return Ok(true); + } + + let parent = path.lookup("..").await?; + if parent.id() == path.id() { + return Ok(false); + } + + path = parent; + } +} + +pub async fn sys_pivot_root( + ctx: &ProcessCtx, + new_root: TUA, + put_old: TUA, +) -> Result { + let task = ctx.shared().clone(); + task.creds + .lock_save_irq() + .caps() + .check_capable(CapabilitiesFlags::CAP_SYS_ADMIN)?; + + let old_root = task.root.lock_save_irq().0.clone(); + if !VFS.is_mount_root(old_root.id()) { + return Err(KernelError::InvalidValue); + } + + let mut buf = [0u8; 1024]; + let new_root = Path::new( + UserCStr::from_ptr(new_root) + .copy_from_user(&mut buf) + .await?, + ); + + let mut buf = [0u8; 1024]; + let put_old = Path::new(UserCStr::from_ptr(put_old).copy_from_user(&mut buf).await?); + + let (new_root_inode, new_root_attachment) = resolve_attachment_inode(ctx, new_root).await?; + let (put_old_inode, put_old_attachment) = resolve_attachment_inode(ctx, put_old).await?; + + if new_root_inode.getattr().await?.file_type != FileType::Directory + || put_old_inode.getattr().await?.file_type != FileType::Directory + { + return Err(FsError::NotADirectory.into()); + } + + if new_root_inode.id().fs_id() == old_root.id().fs_id() + || put_old_inode.id().fs_id() == old_root.id().fs_id() + { + return Err(KernelError::InUse); + } + + if !VFS.is_mount_root(new_root_inode.id()) { + return Err(KernelError::InvalidValue); + } + + if !path_is_descendant_or_same(put_old_inode.clone(), new_root_inode.clone()).await? 
{ + return Err(KernelError::InvalidValue); + } + + if VFS.is_mount_point(put_old_attachment) { + return Err(KernelError::InUse); + } + + let (old_root_inode, new_root_inode) = + VFS.pivot_root(new_root_attachment, put_old_attachment)?; + + let tasks: alloc::vec::Vec<_> = TASK_LIST + .lock_save_irq() + .values() + .filter_map(|work| work.upgrade()) + .collect(); + + for work in tasks { + let task = work.task.t_shared.clone(); + + let mut root = task.root.lock_save_irq(); + if root.0.id() == old_root_inode.id() { + *root = (new_root_inode.clone(), "/".into()); + } + drop(root); + + let mut cwd = task.cwd.lock_save_irq(); + if cwd.0.id() == old_root_inode.id() { + *cwd = (new_root_inode.clone(), "/".into()); + } + } + + Ok(0) +} From ff924a40df2cf054efe23e9d9be34e94a91cd5b1 Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Mon, 11 May 2026 16:22:33 -0700 Subject: [PATCH 06/11] minor typo fix --- src/fs/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fs/mod.rs b/src/fs/mod.rs index d1583520..06de85e5 100644 --- a/src/fs/mod.rs +++ b/src/fs/mod.rs @@ -193,7 +193,7 @@ impl VFS { } #[expect(unused)] - pub async fn unmount(&self, mount_point: Arc) -> Result<()> { + pub async fn umount(&self, mount_point: Arc) -> Result<()> { let mount_point_id = mount_point.id(); // Lock the state and remove the mount. 
From b02f71105a09f05df470111bc17671f04e91331f Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Mon, 11 May 2026 16:28:33 -0700 Subject: [PATCH 07/11] implement unmount --- etc/syscalls_linux_aarch64.md | 2 +- src/arch/arm64/exceptions/syscall.rs | 2 + src/fs/mod.rs | 106 ++++++++++++++++++++++++--- src/fs/syscalls/mod.rs | 1 + src/fs/syscalls/umount.rs | 62 ++++++++++++++++ src/process/fd_table.rs | 11 +++ usertest/src/fs.rs | 42 +++++++++++ 7 files changed, 214 insertions(+), 12 deletions(-) create mode 100644 src/fs/syscalls/umount.rs diff --git a/etc/syscalls_linux_aarch64.md b/etc/syscalls_linux_aarch64.md index 2c89a674..1c1a0a03 100644 --- a/etc/syscalls_linux_aarch64.md +++ b/etc/syscalls_linux_aarch64.md @@ -40,7 +40,7 @@ | 0x24 (36) | symlinkat | (const char *oldname, int newdfd, const char *newname) | __arm64_sys_symlinkat | true | | 0x25 (37) | linkat | (int olddfd, const char *oldname, int newdfd, const char *newname, int flags) | __arm64_sys_linkat | true | | 0x26 (38) | renameat | (int olddfd, const char *oldname, int newdfd, const char *newname) | __arm64_sys_renameat | true | -| 0x27 (39) | umount | (char *name, int flags) | __arm64_sys_umount | false | +| 0x27 (39) | umount | (char *name, int flags) | __arm64_sys_umount | partial | | 0x28 (40) | mount | (char *dev_name, char *dir_name, char *type, unsigned long flags, void *data) | __arm64_sys_mount | partial | | 0x29 (41) | pivot_root | (const char *new_root, const char *put_old) | __arm64_sys_pivot_root | partial | | 0x2b (43) | statfs | (const char *pathname, struct statfs *buf) | __arm64_sys_statfs | partial | diff --git a/src/arch/arm64/exceptions/syscall.rs b/src/arch/arm64/exceptions/syscall.rs index 67faf4dd..b485e404 100644 --- a/src/arch/arm64/exceptions/syscall.rs +++ b/src/arch/arm64/exceptions/syscall.rs @@ -46,6 +46,7 @@ use crate::{ statfs::{sys_fstatfs, sys_statfs}, sync::{sys_fdatasync, sys_fsync, sys_sync, sys_syncfs}, trunc::{sys_ftruncate, sys_truncate}, + umount::sys_umount2, },
}, kernel::{ @@ -285,6 +286,7 @@ pub async fn handle_syscall(mut ctx: ProcessCtx) { ) .await } + 0x27 => sys_umount2(&ctx, TUA::from_value(arg1 as _), arg2 as _).await, 0x28 => { sys_mount( &ctx, diff --git a/src/fs/mod.rs b/src/fs/mod.rs index 06de85e5..47311d47 100644 --- a/src/fs/mod.rs +++ b/src/fs/mod.rs @@ -1,7 +1,7 @@ use crate::clock::realtime::date; use crate::{ drivers::{DM, Driver, block::get_block_device_by_descriptor}, - process::Task, + process::{TASK_LIST, Task}, sync::SpinLock, }; use alloc::{borrow::ToOwned, boxed::Box, collections::btree_map::BTreeMap, sync::Arc, vec::Vec}; @@ -45,6 +45,7 @@ impl Inode for DummyInode { } /// Represents a mounted filesystem. +#[derive(Clone)] struct Mount { fs: Arc, root_inode: Arc, @@ -88,10 +89,40 @@ impl VfsState { } /// Removes a mount point by its inode ID. - fn remove_mount(&mut self, mount_point_id: &InodeId) -> Option<()> { + fn remove_mount(&mut self, mount_point_id: &InodeId) -> Option { let mount = self.mounts.remove(mount_point_id)?; self.filesystems.remove(&mount.fs.id())?; - Some(()) + Some(mount) + } + + /// Collects the mount identified by `mount_point_id` and every nested mount + /// reachable beneath it. 
+ fn collect_mount_subtree(&self, mount_point_id: InodeId) -> Option> { + let mut pending = Vec::new(); + let mut subtree = Vec::new(); + + pending.push(mount_point_id); + + while let Some(current_mount_point_id) = pending.pop() { + let mount = self.mounts.get(&current_mount_point_id)?.clone(); + let current_fs_id = mount.fs.id(); + + subtree.push((current_mount_point_id, mount)); + + for child_mount_point_id in self.mounts.keys().copied() { + if child_mount_point_id != current_mount_point_id + && child_mount_point_id.fs_id() == current_fs_id + && !subtree + .iter() + .any(|(seen, _)| *seen == child_mount_point_id) + && !pending.contains(&child_mount_point_id) + { + pending.push(child_mount_point_id); + } + } + } + + Some(subtree) } /// Checks if an inode is a mount point and returns the root inode of the @@ -192,15 +223,68 @@ impl VFS { Ok(()) } - #[expect(unused)] - pub async fn umount(&self, mount_point: Arc) -> Result<()> { - let mount_point_id = mount_point.id(); + pub async fn umount(&self, mount_point: Arc, detach: bool) -> Result<()> { + let mount_point_id = self + .mount_point_for_root(mount_point.id()) + .unwrap_or(mount_point.id()); - // Lock the state and remove the mount. - self.state - .lock_save_irq() - .remove_mount(&mount_point_id) - .ok_or(FsError::NotFound)?; + if mount_point_id + == self + .root_inode + .lock_save_irq() + .as_ref() + .ok_or(FsError::NotFound)? + .id() + { + return Err(KernelError::InUse); + } + + let subtree = { + let state = self.state.lock_save_irq(); + state + .collect_mount_subtree(mount_point_id) + .ok_or(KernelError::InvalidValue)?
+ }; + + if !detach && subtree.len() > 1 { + return Err(KernelError::InUse); + } + + let target_fs_id = subtree + .first() + .map(|(_, mount)| mount.fs.id()) + .ok_or(KernelError::InvalidValue)?; + + if !detach { + let tasks: Vec<_> = TASK_LIST + .lock_save_irq() + .values() + .filter_map(|work| work.upgrade()) + .collect(); + + for work in tasks { + let task = work.task.t_shared.clone(); + + if task.root.lock_save_irq().0.id().fs_id() == target_fs_id + || task.cwd.lock_save_irq().0.id().fs_id() == target_fs_id + || task.fd_table.lock_save_irq().any_inode_on_fs(target_fs_id) + { + return Err(KernelError::InUse); + } + } + } + + let filesystems: Vec<_> = subtree.iter().map(|(_, mount)| mount.fs.clone()).collect(); + for fs in filesystems { + fs.sync().await?; + } + + let mut state = self.state.lock_save_irq(); + for (mount_point_id, _) in subtree { + state + .remove_mount(&mount_point_id) + .ok_or(KernelError::InvalidValue)?; + } Ok(()) } diff --git a/src/fs/syscalls/mod.rs b/src/fs/syscalls/mod.rs index 586fac6b..85e66c69 100644 --- a/src/fs/syscalls/mod.rs +++ b/src/fs/syscalls/mod.rs @@ -20,3 +20,4 @@ pub mod stat; pub mod statfs; pub mod sync; pub mod trunc; +pub mod umount; diff --git a/src/fs/syscalls/umount.rs b/src/fs/syscalls/umount.rs new file mode 100644 index 00000000..e3263dc8 --- /dev/null +++ b/src/fs/syscalls/umount.rs @@ -0,0 +1,62 @@ +use crate::{fs::VFS, memory::uaccess::cstr::UserCStr, sched::syscall_ctx::ProcessCtx}; +use bitflags::bitflags; +use core::ffi::c_char; +use libkernel::{ + error::{FsError, KernelError, Result}, + fs::path::Path, + memory::address::TUA, + proc::caps::CapabilitiesFlags, +}; + +bitflags! 
{ + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + struct UmountFlags: u32 { + const MNT_FORCE = 0x1; + const MNT_DETACH = 0x2; + const MNT_EXPIRE = 0x4; + const UMOUNT_NOFOLLOW = 0x8; + } +} + +pub async fn sys_umount2(ctx: &ProcessCtx, target: TUA, flags: i64) -> Result { + let flags = u32::try_from(flags) + .ok() + .and_then(UmountFlags::from_bits) + .ok_or(KernelError::InvalidValue)?; + + if flags.contains(UmountFlags::MNT_EXPIRE) + && flags.intersects(UmountFlags::MNT_FORCE | UmountFlags::MNT_DETACH) + { + return Err(KernelError::InvalidValue); + } + + if flags.contains(UmountFlags::MNT_EXPIRE) { + // TODO: Implement two-phase expiry semantics. + return Err(KernelError::TryAgain); + } + + let task = ctx.shared().clone(); + task.creds + .lock_save_irq() + .caps() + .check_capable(CapabilitiesFlags::CAP_SYS_ADMIN)?; + + let mut buf = [0u8; 1024]; + let target = UserCStr::from_ptr(target).copy_from_user(&mut buf).await?; + if target.is_empty() { + return Err(FsError::NotFound.into()); + } + + let cwd = task.cwd.lock_save_irq().0.clone(); + let target_path = Path::new(target); + let target = if flags.contains(UmountFlags::UMOUNT_NOFOLLOW) { + VFS.resolve_path_nofollow(target_path, cwd, &task).await? + } else { + VFS.resolve_path(target_path, cwd, &task).await? + }; + + VFS.umount(target, flags.contains(UmountFlags::MNT_DETACH)) + .await?; + + Ok(0) +} diff --git a/src/process/fd_table.rs b/src/process/fd_table.rs index 294d2c4b..b98c7b58 100644 --- a/src/process/fd_table.rs +++ b/src/process/fd_table.rs @@ -201,6 +201,17 @@ impl FileDescriptorTable { } } + /// Returns `true` if any open file descriptor refers to an inode on the + /// given filesystem. + pub fn any_inode_on_fs(&self, fs_id: u64) -> bool { + self.entries.iter().flatten().any(|entry| { + entry + .file + .inode() + .is_some_and(|inode| inode.id().fs_id() == fs_id) + }) + } + /// Number of file descriptors in use. 
pub fn len(&self) -> usize { self.entries.iter().filter(|e| e.is_some()).count() diff --git a/usertest/src/fs.rs b/usertest/src/fs.rs index a3f62543..cf350f51 100644 --- a/usertest/src/fs.rs +++ b/usertest/src/fs.rs @@ -109,6 +109,48 @@ fn test_chroot() { register_test!(test_chroot); +fn test_mount_umount() { + use std::path::Path; + + let pid = unsafe { libc::getpid() }; + let mount_point = format!("/tmp/mount_umount_test_{pid}"); + let test_file = format!("{mount_point}/hello.txt"); + let c_mount_point = CString::new(mount_point.clone()).unwrap(); + let c_source = CString::new("tmpfs").unwrap(); + let c_type = CString::new("tmpfs").unwrap(); + + fs::create_dir(&mount_point).expect("Failed to create mount point"); + + unsafe { + if libc::mount( + c_source.as_ptr(), + c_mount_point.as_ptr(), + c_type.as_ptr(), + 0, + std::ptr::null(), + ) != 0 + { + panic!("mount failed: {}", std::io::Error::last_os_error()); + } + } + + fs::write(&test_file, b"hello").expect("Failed to write file on mounted fs"); + assert!(Path::new(&test_file).exists()); + + unsafe { + if libc::umount(c_mount_point.as_ptr()) != 0 { + panic!("umount failed: {}", std::io::Error::last_os_error()); + } + } + + assert!(!Path::new(&test_file).exists()); + assert_eq!(fs::read_dir(&mount_point).unwrap().count(), 0); + + fs::remove_dir(&mount_point).expect("Failed to remove mount point"); +} + +register_test!(test_mount_umount); + fn test_chmod() { let dir_path = "/tmp/chmod_test"; let c_dir_path = CString::new(dir_path).unwrap(); From ba0af460bcdb30d3b13fe54a552707d3dd10da3b Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Mon, 11 May 2026 16:36:15 -0700 Subject: [PATCH 08/11] stub getgroups --- src/arch/arm64/exceptions/syscall.rs | 7 ++++--- src/process/creds.rs | 9 +++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/arch/arm64/exceptions/syscall.rs b/src/arch/arm64/exceptions/syscall.rs index b485e404..f1bd6a26 100644 --- a/src/arch/arm64/exceptions/syscall.rs +++ 
b/src/arch/arm64/exceptions/syscall.rs @@ -73,9 +73,9 @@ use crate::{ caps::{sys_capget, sys_capset}, clone::sys_clone, creds::{ - sys_getegid, sys_geteuid, sys_getgid, sys_getresgid, sys_getresuid, sys_getsid, - sys_gettid, sys_getuid, sys_setfsgid, sys_setfsuid, sys_setgid, sys_setregid, - sys_setresgid, sys_setresuid, sys_setreuid, sys_setsid, sys_setuid, + sys_getegid, sys_geteuid, sys_getgid, sys_getgroups, sys_getresgid, sys_getresuid, + sys_getsid, sys_gettid, sys_getuid, sys_setfsgid, sys_setfsuid, sys_setgid, + sys_setregid, sys_setresgid, sys_setresuid, sys_setreuid, sys_setsid, sys_setuid, }, epoll::{sys_epoll_create1, sys_epoll_ctl, sys_epoll_pwait}, exec::sys_execve, @@ -620,6 +620,7 @@ pub async fn handle_syscall(mut ctx: ProcessCtx) { 0x9b => sys_getpgid(&ctx, arg1 as _), 0x9c => sys_getsid(&ctx).await, 0x9d => sys_setsid(&ctx).await, + 0x9e => sys_getgroups(&ctx, arg1 as _, TUA::from_value(arg2 as _)).map_err(|e| match e {}), 0xa0 => sys_uname(TUA::from_value(arg1 as _)).await, 0xa1 => sys_sethostname(&ctx, TUA::from_value(arg1 as _), arg2 as _).await, 0xa3 => Err(KernelError::InvalidValue), diff --git a/src/process/creds.rs b/src/process/creds.rs index 5e635672..2fa360bf 100644 --- a/src/process/creds.rs +++ b/src/process/creds.rs @@ -328,6 +328,15 @@ pub fn sys_setfsgid(ctx: &ProcessCtx, _new_id: usize) -> core::result::Result, +) -> core::result::Result { + // Supplementary groups are not implemented yet. 
+ Ok(0) +} + pub fn sys_gettid(ctx: &ProcessCtx) -> core::result::Result { let tid: u32 = ctx.shared().tid.0; From 2b0921ac7383a48db5709f92a8f61549dd43d02f Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Mon, 11 May 2026 16:36:32 -0700 Subject: [PATCH 09/11] add executable information to panic message --- src/main.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/main.rs b/src/main.rs index b41c707d..794a7a0e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,6 +9,7 @@ #![reexport_test_harness_main = "test_main"] #![test_runner(crate::testing::test_runner)] +use crate::sched::current_work; use alloc::{ boxed::Box, string::{String, ToString}, @@ -70,6 +71,15 @@ fn on_panic(info: &PanicInfo) -> ! { location.column(), panic_msg ); + let work = current_work(); + error!( + "Executable: {:?}", + work.process + .executable + .lock_save_irq() + .as_ref() + .map(|p| p.as_str()) + ); } else { error!("Kernel panicked at unknown location: {panic_msg}"); } From b1f253e57aa5e627d398473048ae4ccaecbb4615 Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Mon, 11 May 2026 16:42:54 -0700 Subject: [PATCH 10/11] make disk optional --- scripts/qemu_runner.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/qemu_runner.py b/scripts/qemu_runner.py index b990ca62..3edb8074 100755 --- a/scripts/qemu_runner.py +++ b/scripts/qemu_runner.py @@ -14,6 +14,7 @@ parser.add_argument("--memory", default="2G") parser.add_argument("--debug", action="store_true", help="Enable QEMU debugging") parser.add_argument("--display", action="store_true", help="Add a display device to the VM") +parser.add_argument("--disk", action="store_true", help="Add a disk device to the VM") @@ -44,11 +45,10 @@ "-snapshot": None, "-kernel": bin_executable_location, "-append": f"{append_args} --rootfs=ext4fs --automount=/dev,devfs --automount=/tmp,tmpfs --automount=/proc,procfs --automount=/sys,sysfs", - "-drive": 
"file=ubuntu-noble-arm64.img,format=raw,if=none,cache=none,id=x0", } # Arguments that can appear multiple times (e.g. -device) -extra_args = ["-device", "virtio-rng-device", "-device", "virtio-blk-device,drive=x0"] +extra_args = ["-device", "virtio-rng-device"] if args.debug: default_args["-S"] = None @@ -61,6 +61,10 @@ default_args["-serial"] = "stdio" extra_args += ["-device", "virtio-gpu-device"] +if args.disk: + default_args["-drive"] = "file=rootfs.img,format=raw,if=none,cache=none,id=x0" + extra_args += ["-device", "virtio-blk-device,drive=x0"] + qemu_command = ["qemu-system-aarch64"] for key, value in default_args.items(): From f890857f80bda14ab304bafb82c7a05d4ed7c656 Mon Sep 17 00:00:00 2001 From: Ashwin Naren Date: Wed, 13 May 2026 14:20:40 -0700 Subject: [PATCH 11/11] address review --- etc/syscalls_linux_aarch64.md | 2 +- libkernel/src/arch/arm64/memory/pg_walk.rs | 77 ++++++++++++++++++++-- src/arch/arm64/memory/address_space.rs | 17 +++-- src/arch/arm64/memory/mmu.rs | 17 ++--- src/drivers/virtio_hal.rs | 56 +++++++--------- src/fs/syscalls/pivot_root.rs | 28 +------- 6 files changed, 111 insertions(+), 86 deletions(-) diff --git a/etc/syscalls_linux_aarch64.md b/etc/syscalls_linux_aarch64.md index 1c1a0a03..48866ed4 100644 --- a/etc/syscalls_linux_aarch64.md +++ b/etc/syscalls_linux_aarch64.md @@ -158,7 +158,7 @@ | 0x9b (155) | getpgid | (pid_t pid) | __arm64_sys_getpgid | true | | 0x9c (156) | getsid | (pid_t pid) | __arm64_sys_getsid | true | | 0x9d (157) | setsid | () | __arm64_sys_setsid | true | -| 0x9e (158) | getgroups | (int gidsetsize, gid_t *grouplist) | __arm64_sys_getgroups | false | +| 0x9e (158) | getgroups | (int gidsetsize, gid_t *grouplist) | __arm64_sys_getgroups | stub | | 0x9f (159) | setgroups | (int gidsetsize, gid_t *grouplist) | __arm64_sys_setgroups | false | | 0xa0 (160) | newuname | (struct new_utsname *name) | __arm64_sys_newuname | true | | 0xa1 (161) | sethostname | (char *name, int len) | __arm64_sys_sethostname | true | 
diff --git a/libkernel/src/arch/arm64/memory/pg_walk.rs b/libkernel/src/arch/arm64/memory/pg_walk.rs index 59b48d22..62f59a47 100644 --- a/libkernel/src/arch/arm64/memory/pg_walk.rs +++ b/libkernel/src/arch/arm64/memory/pg_walk.rs @@ -2,18 +2,20 @@ use super::{ pg_descriptors::L3Descriptor, - pg_tables::{L0Table, L3Table}, + pg_tables::{L0Table, L1Table, L3Table}, }; use crate::{ error::{MapError, Result}, memory::{ PAGE_SIZE, - address::{TPA, VA}, + address::{PA, TPA, VA}, paging::{ - NullTlbInvalidator, PageTableEntry, PageTableMapper, PgTable, PgTableArray, - walk::{RecursiveWalker, WalkContext}, + NullTlbInvalidator, PaMapper, PageTableEntry, PageTableMapper, PgTable, PgTableArray, + TableMapper, + permissions::PtePermissions, + walk::{RecursiveWalker, Translator, WalkContext}, }, - region::VirtMemoryRegion, + region::{PhysMemoryRegion, VirtMemoryRegion}, }, }; @@ -111,6 +113,71 @@ pub fn get_pte( Ok(descriptor) } +impl Translator for L0Table { + fn translate( + table_pa: TPA>, + va: VA, + ctx: &mut WalkContext, + ) -> Result> { + let desc = unsafe { + ctx.mapper + .with_page_table(table_pa, |pgtable| Self::from_ptr(pgtable).get_desc(va))? + }; + + match desc.next_table_address() { + Some(next_pa) => L1Table::translate(next_pa, va, ctx), + None if desc.is_valid() => Err(MapError::InvalidDescriptor.into()), + None => Ok(None), + } + } +} + +impl Translator for L3Table { + fn translate( + table_pa: TPA>, + va: VA, + ctx: &mut WalkContext, + ) -> Result> { + let desc = unsafe { + ctx.mapper + .with_page_table(table_pa, |pgtable| Self::from_ptr(pgtable).get_desc(va))? + }; + + match desc.mapped_address() { + Some(pa) => Ok(Some(( + pa, + 1 << Self::Descriptor::MAP_SHIFT, + desc.permissions().unwrap(), + ))), + None if desc.is_valid() => Err(MapError::InvalidDescriptor.into()), + None => Ok(None), + } + } +} + +/// Translates the VA into a physical region plus an offset and permissions. 
+pub fn translate( + l0_table: TPA>, + va: VA, + mapper: &mut PM, +) -> Result> { + let mut walk_ctx = WalkContext { + mapper, + // Safe to not invalidate the TLB, as we are not modifying any PTEs. + invalidator: &NullTlbInvalidator {}, + }; + + if let Some((pa, blk_sz, perms)) = L0Table::translate(l0_table, va, &mut walk_ctx)? { + debug_assert!(blk_sz.is_power_of_two()); + + let offset = va.value() & (blk_sz - 1); + + Ok(Some((PhysMemoryRegion::new(pa, blk_sz), offset, perms))) + } else { + Ok(None) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/arch/arm64/memory/address_space.rs b/src/arch/arm64/memory/address_space.rs index 74848bb5..70be5ce1 100644 --- a/src/arch/arm64/memory/address_space.rs +++ b/src/arch/arm64/memory/address_space.rs @@ -14,7 +14,7 @@ use libkernel::{ pg_descriptors::{L3Descriptor, MemoryType}, pg_tables::{L0Table, MapAttributes, MappingContext, map_range}, pg_tear_down::tear_down_address_space, - pg_walk::{get_pte, walk_and_modify_region}, + pg_walk::{translate as translate_va, walk_and_modify_region}, }, error::{KernelError, MapError, Result}, memory::{ @@ -137,16 +137,15 @@ impl UserAddressSpace for Arm64ProcessAddressSpace { } fn translate(&self, va: VA) -> Option { - let pte = get_pte( - self.l0_table, - va.page_aligned(), - &mut PageOffsetPgTableMapper {}, - ) - .unwrap()?; + let (region, offset, perms) = + translate_va(self.l0_table, va, &mut PageOffsetPgTableMapper {}) + .ok() + .flatten()?; + let pa = region.start_address().add_bytes(offset); Some(PageInfo { - pfn: pte.mapped_address()?.to_pfn(), - perms: pte.permissions()?, + pfn: pa.to_pfn(), + perms, }) } diff --git a/src/arch/arm64/memory/mmu.rs b/src/arch/arm64/memory/mmu.rs index 9b841139..c9a5a44f 100644 --- a/src/arch/arm64/memory/mmu.rs +++ b/src/arch/arm64/memory/mmu.rs @@ -4,12 +4,12 @@ use libkernel::{ arch::arm64::memory::{ pg_descriptors::MemoryType, pg_tables::{L0Table, MapAttributes, MappingContext, map_range}, - pg_walk::get_pte, + 
pg_walk::translate as translate_va, }, error::Result, memory::{ address::{PA, TPA, VA}, - paging::{PaMapper, PgTableArray, permissions::PtePermissions}, + paging::{PgTableArray, permissions::PtePermissions}, proc_vm::address_space::KernAddressSpace, region::{PhysMemoryRegion, VirtMemoryRegion}, }, @@ -39,16 +39,11 @@ impl Arm64KernelAddressSpace { map_range(self.kernel_l0, map_attrs, &mut ctx) } - pub fn translate(&self, va: VA) -> Option { - let pg_offset = va.page_offset(); - - let pte = get_pte(self.kernel_l0, va, &mut PageOffsetPgTableMapper {}) + pub fn translate(&self, va: VA) -> Option<(PhysMemoryRegion, usize)> { + translate_va(self.kernel_l0, va, &mut PageOffsetPgTableMapper {}) .ok() - .flatten()?; - - let pa = pte.mapped_address()?; - - Some(pa.add_bytes(pg_offset)) + .flatten() + .map(|(region, offset, _)| (region, offset)) } pub fn table_pa(&self) -> PA { diff --git a/src/drivers/virtio_hal.rs b/src/drivers/virtio_hal.rs index eeafecae..510a7d06 100644 --- a/src/drivers/virtio_hal.rs +++ b/src/drivers/virtio_hal.rs @@ -26,39 +26,36 @@ impl VirtioHal { rounded.ilog2() as u8 } - fn translated_phys_addr(vaddr: VA) -> Option { - ArchImpl::kern_address_space() - .lock_save_irq() - .translate(vaddr) - .map(|pa| pa.value() as PhysAddr) - } - fn translate_buffer(vaddr: VA, len: usize) -> Option { debug_assert!(len > 0); - let first_page_va = vaddr.page_aligned(); - let last_byte_va = vaddr.add_bytes(len - 1); - let last_page_va = last_byte_va.page_aligned(); - - let first_page_pa = Self::translated_phys_addr(first_page_va)?; - let mut page_va = first_page_va; - let mut expected_page_pa = first_page_pa; - - loop { - let page_pa = Self::translated_phys_addr(page_va)?; - if page_pa != expected_page_pa { - return None; + let addr_space = ArchImpl::kern_address_space().lock_save_irq(); + let mut next_va = vaddr; + let mut remaining = len; + let mut start_pa = None; + let mut expected_next_pa = None; + + while remaining > 0 { + let (phys_region, offset) = 
addr_space.translate(next_va)?; + let translated_pa = phys_region.start_address().add_bytes(offset).value() as PhysAddr; + + if let Some(expected_pa) = expected_next_pa { + if translated_pa != expected_pa { + return None; + } + } else { + start_pa = Some(translated_pa); } - if page_va == last_page_va { - break; - } + let mapped_len = phys_region.size() - offset; + let covered_len = mapped_len.min(remaining); - page_va = page_va.add_pages(1); - expected_page_pa += PAGE_SIZE as PhysAddr; + next_va = next_va.add_bytes(covered_len); + remaining -= covered_len; + expected_next_pa = Some(translated_pa + covered_len as PhysAddr); } - Some(first_page_pa + vaddr.page_offset() as PhysAddr) + start_pa } fn bounce_copy_in(paddr: PhysAddr, src: &[u8]) { @@ -98,12 +95,6 @@ impl VirtioHal { .lock_save_irq() .push(BouncedShare { paddr, pages }); - // trace!( - // "virtio share: bounced {:p} len={} direction={direction:?} to paddr={paddr:#x}", - // buffer.as_ptr(), - // buffer.len(), - // ); - paddr } } @@ -130,13 +121,10 @@ unsafe impl Hal for VirtioHal { core::ptr::write_bytes(vaddr.as_ptr(), 0, pages * PAGE_SIZE); } - // trace!("alloc DMA: paddr={paddr:#x}, pages={pages}, order={order}"); (paddr, vaddr) } unsafe fn dma_dealloc(paddr: PhysAddr, _vaddr: NonNull, pages: usize) -> i32 { - // trace!("dealloc DMA: paddr={paddr:#x}, pages={pages}"); - let order = Self::pages_to_order(pages); let region = PhysMemoryRegion::new( PA::from_value(paddr as usize), diff --git a/src/fs/syscalls/pivot_root.rs b/src/fs/syscalls/pivot_root.rs index 70694344..72b15b70 100644 --- a/src/fs/syscalls/pivot_root.rs +++ b/src/fs/syscalls/pivot_root.rs @@ -1,6 +1,4 @@ -use crate::{ - fs::VFS, memory::uaccess::cstr::UserCStr, process::TASK_LIST, sched::syscall_ctx::ProcessCtx, -}; +use crate::{fs::VFS, memory::uaccess::cstr::UserCStr, sched::syscall_ctx::ProcessCtx}; use alloc::sync::Arc; use core::ffi::c_char; use libkernel::{ @@ -110,29 +108,7 @@ pub async fn sys_pivot_root( return 
Err(KernelError::InUse); } - let (old_root_inode, new_root_inode) = - VFS.pivot_root(new_root_attachment, put_old_attachment)?; - - let tasks: alloc::vec::Vec<_> = TASK_LIST - .lock_save_irq() - .values() - .filter_map(|work| work.upgrade()) - .collect(); - - for work in tasks { - let task = work.task.t_shared.clone(); - - let mut root = task.root.lock_save_irq(); - if root.0.id() == old_root_inode.id() { - *root = (new_root_inode.clone(), "/".into()); - } - drop(root); - - let mut cwd = task.cwd.lock_save_irq(); - if cwd.0.id() == old_root_inode.id() { - *cwd = (new_root_inode.clone(), "/".into()); - } - } + let _ = VFS.pivot_root(new_root_attachment, put_old_attachment)?; Ok(0) }