From ba2d3238e61907576080edc97bcde9df291ddee8 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 16 Mar 2026 13:33:18 -0700 Subject: [PATCH 1/2] support NVMe Deallocate --- .../src/lib/stats/virtual_disk.rs | 3 +- lib/propolis/src/block/crucible.rs | 2 +- lib/propolis/src/block/file.rs | 12 +- lib/propolis/src/block/in_memory.rs | 2 +- lib/propolis/src/block/mem_async.rs | 2 +- lib/propolis/src/block/minder.rs | 6 +- lib/propolis/src/block/mod.rs | 26 +++-- lib/propolis/src/hw/nvme/bits.rs | 35 ++++++ lib/propolis/src/hw/nvme/cmds.rs | 107 +++++++++++++++++- lib/propolis/src/hw/nvme/mod.rs | 2 + lib/propolis/src/hw/nvme/requests.rs | 38 ++++++- lib/propolis/src/hw/virtio/block.rs | 4 +- 12 files changed, 212 insertions(+), 27 deletions(-) diff --git a/bin/propolis-server/src/lib/stats/virtual_disk.rs b/bin/propolis-server/src/lib/stats/virtual_disk.rs index 4a39cfde0..3049ef403 100644 --- a/bin/propolis-server/src/lib/stats/virtual_disk.rs +++ b/bin/propolis-server/src/lib/stats/virtual_disk.rs @@ -77,9 +77,10 @@ impl VirtualDiskStats { self.on_write_completion(result, len, duration) } Operation::Flush => self.on_flush_completion(result, duration), - Operation::Discard(..) => { + Operation::Discard => { // Discard is not wired up in backends we care about for now, so // it can safely be ignored. + // XXX no longer true } } } diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index 5ba04a14b..1ddc68171 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -145,7 +145,7 @@ impl WorkerState { let _ = block.flush(None).await?; } } - block::Operation::Discard(..) => { + block::Operation::Discard => { // Crucible does not support discard operations for now return Err(Error::Unsupported); } diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs index 0d95aa43f..5fac93ba0 100644 --- a/lib/propolis/src/block/file.rs +++ b/lib/propolis/src/block/file.rs @@ -113,12 +113,16 @@ impl SharedState { self.fp.sync_data().map_err(|_| "io error")?; } } - block::Operation::Discard(off, len) => { + block::Operation::Discard => { if let Some(mech) = self.discard_mech { - dkioc::do_discard(&self.fp, mech, off as u64, len as u64) + for &(off, len) in &req.ranges { + dkioc::do_discard( + &self.fp, mech, off as u64, len as u64, + ) .map_err(|_| { - "io error while attempting to free block(s)" - })?; + "io error while attempting to free block(s)" + })?; + } } else { unreachable!("handled above in processing_loop()"); } diff --git a/lib/propolis/src/block/in_memory.rs b/lib/propolis/src/block/in_memory.rs index 949412a45..9d45c1fa1 100644 --- a/lib/propolis/src/block/in_memory.rs +++ b/lib/propolis/src/block/in_memory.rs @@ -86,7 +86,7 @@ impl SharedState { block::Operation::Flush => { // nothing to do } - block::Operation::Discard(..) => { + block::Operation::Discard => { unreachable!("handled in processing_loop()"); } } diff --git a/lib/propolis/src/block/mem_async.rs b/lib/propolis/src/block/mem_async.rs index d834bd388..d376189dc 100644 --- a/lib/propolis/src/block/mem_async.rs +++ b/lib/propolis/src/block/mem_async.rs @@ -89,7 +89,7 @@ impl SharedState { block::Operation::Flush => { // nothing to do } - block::Operation::Discard(..) => { + block::Operation::Discard => { unreachable!("handled in processing_loop()") } } diff --git a/lib/propolis/src/block/minder.rs b/lib/propolis/src/block/minder.rs index c9c62f576..e9ff8fa54 100644 --- a/lib/propolis/src/block/minder.rs +++ b/lib/propolis/src/block/minder.rs @@ -283,9 +283,9 @@ impl QueueMinder { Operation::Flush => { probes::block_begin_flush!(|| { (devqid, id) }); } - Operation::Discard(off, len) => { + Operation::Discard => { probes::block_begin_discard!(|| { - (devqid, id, off as u64, len as u64) + (devqid, id, req.ranges.len() as u64) }); } } @@ -355,7 +355,7 @@ impl QueueMinder { (devqid, id, rescode, ns_processed, ns_queued) }); } - Operation::Discard(..) => { + Operation::Discard => { probes::block_complete_discard!(|| { (devqid, id, rescode, ns_processed, ns_queued) }); diff --git a/lib/propolis/src/block/mod.rs b/lib/propolis/src/block/mod.rs index 674f2a985..380acfb34 100644 --- a/lib/propolis/src/block/mod.rs +++ b/lib/propolis/src/block/mod.rs @@ -50,7 +50,7 @@ mod probes { fn block_begin_read(devq_id: u64, req_id: u64, offset: u64, len: u64) {} fn block_begin_write(devq_id: u64, req_id: u64, offset: u64, len: u64) {} fn block_begin_flush(devq_id: u64, req_id: u64) {} - fn block_begin_discard(devq_id: u64, req_id: u64, offset: u64, len: u64) {} + fn block_begin_discard(devq_id: u64, req_id: u64, nr: u64) {} fn block_complete_read( devq_id: u64, @@ -106,8 +106,8 @@ pub enum Operation { Write(ByteOffset, ByteLen), /// Flush buffer(s) Flush, - /// Discard/UNMAP/deallocate region - Discard(ByteOffset, ByteLen), + /// Discard/UNMAP/deallocate some ranges, which are specified in Request::ranges + Discard, } impl Operation { pub const fn is_read(&self) -> bool { @@ -120,7 +120,7 @@ impl Operation { matches!(self, Operation::Flush) } pub const fn is_discard(&self) -> bool { - matches!(self, Operation::Discard(..)) + matches!(self, Operation::Discard) } } @@ -203,6 +203,10 @@ pub struct Request { /// A list of regions of guest memory to read/write into as part of the I/O /// request pub regions: Vec, + + /// A list of byte ranges to discard as part of the I/O request. This is only + /// relevant for discard operations, and is expected to be empty otherwise. + pub ranges: Vec<(ByteOffset, ByteLen)>, } impl Request { pub fn new_read( @@ -210,7 +214,7 @@ impl Request { len: ByteLen, regions: Vec, ) -> Self { - Self { op: Operation::Read(off, len), regions } + Self { op: Operation::Read(off, len), regions, ranges: Vec::new() } } pub fn new_write( @@ -218,17 +222,17 @@ impl Request { len: ByteLen, regions: Vec, ) -> Self { - Self { op: Operation::Write(off, len), regions } + Self { op: Operation::Write(off, len), regions, ranges: Vec::new() } } pub fn new_flush() -> Self { let op = Operation::Flush; - Self { op, regions: Vec::new() } + Self { op, regions: Vec::new(), ranges: Vec::new() } } - pub fn new_discard(off: ByteOffset, len: ByteLen) -> Self { - let op = Operation::Discard(off, len); - Self { op, regions: Vec::new() } + pub fn new_discard(ranges: Vec<(ByteOffset, ByteLen)>) -> Self { + let op = Operation::Discard; + Self { op, regions: Vec::new(), ranges } } pub fn mappings<'a>(&self, mem: &'a MemCtx) -> Option>> { @@ -239,7 +243,7 @@ impl Request { Operation::Write(..) => { self.regions.iter().map(|r| mem.readable_region(r)).collect() } - Operation::Flush | Operation::Discard(..) => None, + Operation::Flush | Operation::Discard => None, } } } diff --git a/lib/propolis/src/hw/nvme/bits.rs b/lib/propolis/src/hw/nvme/bits.rs index c227ae1f4..dab6d3757 100644 --- a/lib/propolis/src/hw/nvme/bits.rs +++ b/lib/propolis/src/hw/nvme/bits.rs @@ -4,6 +4,7 @@ #![allow(dead_code)] +use crate::block::{ByteLen, ByteOffset}; use bitstruct::bitstruct; use zerocopy::FromBytes; @@ -176,6 +177,38 @@ impl CompletionQueueEntry { } } +/// A Dataset Management Range Definition as represented in memory. +/// +/// See NVMe 1.0e Section 6.6 Figure 114: Dataset Management – Range Definition +#[derive(Debug, Default, Copy, Clone, FromBytes)] +#[repr(C, packed(1))] +pub struct DatasetManagementRangeDefinition { + /// The context attributes specified for each range provides information about how the range + /// is intended to be used by host software. The use of this information is optional and the + /// controller is not required to perform any specific action. + pub context_attributes: u32, + + pub number_logical_blocks: u32, + + pub starting_lba: u64, +} +impl DatasetManagementRangeDefinition { + pub fn new( + context_attributes: u32, + number_logical_blocks: u32, + starting_lba: u64, + ) -> Self { + Self { context_attributes, number_logical_blocks, starting_lba } + } + + pub fn offset_len(&self, lba_data_size: u64) -> (ByteOffset, ByteLen) { + ( + (self.starting_lba * lba_data_size) as ByteOffset, + (self.number_logical_blocks as u64 * lba_data_size) as ByteLen, + ) + } +} + // Register bits bitstruct! { @@ -539,6 +572,8 @@ pub const NVM_OPC_FLUSH: u8 = 0x00; pub const NVM_OPC_WRITE: u8 = 0x01; /// Read Command Opcode pub const NVM_OPC_READ: u8 = 0x02; +/// Dataset Mangement Command Opcode +pub const NVM_OPC_DATASET_MANAGEMENT: u8 = 0x09; // Generic Command Status values // See NVMe 1.0e Section 4.5.1.2.1, Figure 17 Status Code - Generic Command Status Values diff --git a/lib/propolis/src/hw/nvme/cmds.rs b/lib/propolis/src/hw/nvme/cmds.rs index b11234aea..5187d1ff5 100644 --- a/lib/propolis/src/hw/nvme/cmds.rs +++ b/lib/propolis/src/hw/nvme/cmds.rs @@ -2,7 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::bits::{self, StatusCodeType, SubmissionQueueEntry}; +use super::bits::{ + self, DatasetManagementRangeDefinition, StatusCodeType, + SubmissionQueueEntry, +}; use super::queue::{QueueCreateErr, QueueId}; use crate::block; use crate::common::*; @@ -678,6 +681,8 @@ pub enum NvmCmd { Write(WriteCmd), /// Read data and metadata Read(ReadCmd), + /// Dataset Management Command + DatasetManagement(DatasetManagementCmd), /// An unknown NVM command Unknown(GuestData), } @@ -709,6 +714,17 @@ impl NvmCmd { prp1: raw.prp1, prp2: raw.prp2, }), + bits::NVM_OPC_DATASET_MANAGEMENT => { + NvmCmd::DatasetManagement(DatasetManagementCmd { + prp1: raw.prp1, + prp2: raw.prp2, + // Convert from 0's based value + nr: (raw.cdw10 & 0xFF) as u16 + 1, + ad: raw.cdw11 & (1 << 2) != 0, + idw: raw.cdw11 & (1 << 1) != 0, + idr: raw.cdw11 & (1 << 0) != 0, + }) + } _ => NvmCmd::Unknown(raw), }; Ok(cmd) @@ -779,6 +795,95 @@ impl ReadCmd { } } +/// Dataset Management Command Parameters +#[derive(Debug)] +#[allow(dead_code)] +pub struct DatasetManagementCmd { + /// PRP Entry 1 (PRP1) + /// + /// Indicates a data buffer that contains the LBA range information. + prp1: u64, + + /// PRP Entry 2 (PRP2) + /// + /// This field contains the second PRP entry that specifies the location where data should be + /// transferred from (if there is a physical discontinuity). + prp2: u64, + + /// Number of Ranges (NR) + /// + /// Indicates the number of 16 byte range sets that are specified in the command. This is a + /// 0’s based value. + pub nr: u16, + + /// Attribute – Deallocate (AD) + /// + /// If set to ‘1’ then the NVM subsystem may deallocate all provided ranges. If a read occurs + /// to a deallocated range, the NVM Express subsystem shall return all zeros, all ones, or + /// the last data written to the associated LBA. + /// + /// Note: The operation of the Deallocate function is similar to the ATA DATA SET MANAGEMENT + /// with Trim feature described in ACS-2 and SCSI UNMAP command described in SBC-3. + ad: bool, + + /// Attribute – Integral Dataset for Write (IDW) + /// + /// If set to ‘1’ then the dataset should be optimized for write access as an integral unit. + /// The host expects to perform operations on all ranges provided as an integral unit for + /// writes, indicating that if a portion of the dataset is written it is expected that all of + /// the ranges in the dataset are going to be written. + idw: bool, + + /// Attribute – Integral Dataset for Read (IDR) + /// + /// If set to ‘1’ then the dataset should be optimized for read access as an integral unit. + /// The host expects to perform operations on all ranges provided as an integral unit for + /// reads, indicating that if a portion of the dataset is read it is expected that all of the + /// ranges in the dataset are going to be read. + idr: bool, +} + +impl DatasetManagementCmd { + /// Returns an Iterator that yields [`GuestRegion`]'s which contain the array of LBA ranges. + pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> { + PrpIter::new( + u64::from(self.nr) + * size_of::() as u64, + self.prp1, + self.prp2, + mem, + ) + } + + /// Returns an Iterator that yields the LBA ranges specified in this command. Note that if + /// some of the ranges couldn't be read from guest memory, this will yield fewer than + /// `Self.nr` ranges. + pub fn ranges<'a>( + &self, + mem: &'a MemCtx, + ) -> impl Iterator + 'a { + self.data(mem).flat_map(|region| { + let mut ranges = Vec::new(); + if let Some(mapping) = mem.readable_region(®ion) { + ranges.resize_with( + mapping.len() + / size_of::(), + Default::default, + ); + if mapping.read_many(&mut ranges).is_err() { + ranges.clear(); + } + }; + + ranges.into_iter() + }) + } + + pub fn is_deallocate(&self) -> bool { + self.ad + } +} + /// Indicates the possible states of a [`PrpIter`]. #[derive(Clone, Copy, Eq, PartialEq, Debug)] enum PrpNext { diff --git a/lib/propolis/src/hw/nvme/mod.rs b/lib/propolis/src/hw/nvme/mod.rs index 9a3bf657d..7f8f639b1 100644 --- a/lib/propolis/src/hw/nvme/mod.rs +++ b/lib/propolis/src/hw/nvme/mod.rs @@ -846,6 +846,8 @@ impl PciNvme { // Supporting multiple namespaces complicates I/O dispatching, // so for now we limit the device to a single namespace. nn: 1, + // bit 2 indicates support for the Dataset Management command + oncs: (1 << 2), // bit 0 indicates volatile write cache is present vwc: 1, // bit 8 indicates Doorbell Buffer support diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs index 4821b47f0..f4ec661ee 100644 --- a/lib/propolis/src/hw/nvme/requests.rs +++ b/lib/propolis/src/hw/nvme/requests.rs @@ -38,6 +38,9 @@ mod probes { } fn nvme_write_complete(devsq_id: u64, cid: u16, res: u8) {} + fn nvme_discard_enqueue(devsq_id: u64, idx: u16, cid: u16, nr: u16) {} + fn nvme_discard_complete(devsq_id: u64, cid: u16, res: u8) {} + fn nvme_flush_enqueue(devsq_id: u64, idx: u16, cid: u16) {} fn nvme_flush_complete(devsq_id: u64, cid: u16, res: u8) {} @@ -142,6 +145,37 @@ impl block::DeviceQueue for NvmeBlockQueue { Request::new_read(off as usize, size as usize, bufs); return Some((req, permit, None)); } + Ok(NvmCmd::DatasetManagement(cmd)) => { + // We only support the "deallocate" (discard/trim) operation of Dataset + // Management. XXX should we return success? + if !cmd.is_deallocate() { + permit.complete( + Completion::generic_err(bits::STS_INVAL_FIELD) + .dnr(), + ); + continue; + } + probes::nvme_discard_enqueue!(|| ( + sq.devq_id(), + idx, + cid, + cmd.nr, + )); + let ranges: Vec<_> = cmd + .ranges(&mem) + .map(|r| r.offset_len(params.lba_data_size)) + .collect(); + if ranges.len() != cmd.nr as usize { + // If we couldn't read all the ranges, fail the command + permit.complete( + Completion::generic_err(bits::STS_DATA_XFER_ERR) + .dnr(), + ); + continue; + } + let req = Request::new_discard(ranges); + return Some((req, permit, None)); + } Ok(NvmCmd::Flush) => { probes::nvme_flush_enqueue!(|| (sq.devq_id(), idx, cid)); let req = Request::new_flush(); @@ -179,8 +213,8 @@ impl block::DeviceQueue for NvmeBlockQueue { Operation::Flush => { probes::nvme_flush_complete!(|| (devsq_id, cid, resnum)); } - Operation::Discard(..) => { - unreachable!("discard not supported in NVMe for now"); + Operation::Discard => { + probes::nvme_discard_complete!(|| (devsq_id, cid, resnum)); } } diff --git a/lib/propolis/src/hw/virtio/block.rs b/lib/propolis/src/hw/virtio/block.rs index 84c662019..75659e46d 100644 --- a/lib/propolis/src/hw/virtio/block.rs +++ b/lib/propolis/src/hw/virtio/block.rs @@ -198,7 +198,7 @@ impl block::DeviceQueue for BlockVq { rid, off as u64, sz as u64, )); Ok(( - block::Request::new_discard(off, sz), + block::Request::new_discard(vec![(off, sz)]), CompletionToken { rid, chain }, None, )) @@ -245,7 +245,7 @@ impl block::DeviceQueue for BlockVq { block::Operation::Flush => { probes::vioblk_flush_complete!(|| (rid, resnum)); } - block::Operation::Discard(..) => { + block::Operation::Discard => { probes::vioblk_discard_complete!(|| (rid, resnum)); } } From eed15c633713eda959aeedbb32e447b405484605 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 13 Apr 2026 12:06:09 -0700 Subject: [PATCH 2/2] ixi code review --- Cargo.lock | 1 + .../src/lib/stats/virtual_disk.rs | 5 +- lib/propolis/Cargo.toml | 1 + lib/propolis/src/block/crucible.rs | 6 +- lib/propolis/src/hw/nvme/cmds.rs | 59 +++++++++++-------- lib/propolis/src/hw/nvme/requests.rs | 23 +++++--- lib/propolis/src/vmm/mem.rs | 22 +++++++ 7 files changed, 79 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c6320d360..482e96ece 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6434,6 +6434,7 @@ dependencies = [ "futures", "iddqd", "ispf", + "itertools 0.13.0", "lazy_static", "libc", "libloading 0.7.4", diff --git a/bin/propolis-server/src/lib/stats/virtual_disk.rs b/bin/propolis-server/src/lib/stats/virtual_disk.rs index 3049ef403..8ee0a4c85 100644 --- a/bin/propolis-server/src/lib/stats/virtual_disk.rs +++ b/bin/propolis-server/src/lib/stats/virtual_disk.rs @@ -78,9 +78,8 @@ impl VirtualDiskStats { } Operation::Flush => self.on_flush_completion(result, duration), Operation::Discard => { - // Discard is not wired up in backends we care about for now, so - // it can safely be ignored. - // XXX no longer true + // Discard is only partially wired up in backends we care about, and it's not + // clear what stats (if any) to report yet. } } } diff --git a/lib/propolis/Cargo.toml b/lib/propolis/Cargo.toml index d092121f6..464a293db 100644 --- a/lib/propolis/Cargo.toml +++ b/lib/propolis/Cargo.toml @@ -44,6 +44,7 @@ async-trait.workspace = true iddqd.workspace = true nix.workspace = true vm-attest.workspace = true +itertools.workspace = true # falcon libloading = { workspace = true, optional = true } diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index 1ddc68171..bdcf6ab60 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -146,8 +146,10 @@ impl WorkerState { } } block::Operation::Discard => { - // Crucible does not support discard operations for now - return Err(Error::Unsupported); + // Crucible does not support discard operations for now, so we implement this as + // a no-op (which technically is a valid implementation of discard, just one that + // doesn't actually free any space). + return Ok(()); } } Ok(()) diff --git a/lib/propolis/src/hw/nvme/cmds.rs b/lib/propolis/src/hw/nvme/cmds.rs index 5187d1ff5..0754811c4 100644 --- a/lib/propolis/src/hw/nvme/cmds.rs +++ b/lib/propolis/src/hw/nvme/cmds.rs @@ -31,6 +31,10 @@ pub enum ParseErr { /// An invalid value was specified in the FUSE bits of `CDW0`. #[error("reserved FUSE value specified")] ReservedFuse, + + /// A reserved field was set to a non-zero value. + #[error("reserved field value specified")] + Reserved, } /// A parsed Admin Command @@ -715,14 +719,19 @@ impl NvmCmd { prp2: raw.prp2, }), bits::NVM_OPC_DATASET_MANAGEMENT => { + if (raw.cdw11 & !0b111) != 0 { + // Only the lowest 3 bits of CDW11 are used for Dataset + // Management, so reject if any other bits are set. + return Err(ParseErr::Reserved); + } NvmCmd::DatasetManagement(DatasetManagementCmd { prp1: raw.prp1, prp2: raw.prp2, // Convert from 0's based value nr: (raw.cdw10 & 0xFF) as u16 + 1, ad: raw.cdw11 & (1 << 2) != 0, - idw: raw.cdw11 & (1 << 1) != 0, - idr: raw.cdw11 & (1 << 0) != 0, + _idw: raw.cdw11 & (1 << 1) != 0, + _idr: raw.cdw11 & (1 << 0) != 0, }) } _ => NvmCmd::Unknown(raw), @@ -797,7 +806,6 @@ impl ReadCmd { /// Dataset Management Command Parameters #[derive(Debug)] -#[allow(dead_code)] pub struct DatasetManagementCmd { /// PRP Entry 1 (PRP1) /// @@ -806,14 +814,13 @@ pub struct DatasetManagementCmd { /// PRP Entry 2 (PRP2) /// - /// This field contains the second PRP entry that specifies the location where data should be - /// transferred from (if there is a physical discontinuity). + /// Indicates a second data buffer that contains LBA range information. It may not be a PRP + /// List. prp2: u64, /// Number of Ranges (NR) /// - /// Indicates the number of 16 byte range sets that are specified in the command. This is a - /// 0’s based value. + /// Indicates the number of 16 byte range sets that are specified in the command. pub nr: u16, /// Attribute – Deallocate (AD) @@ -832,7 +839,9 @@ pub struct DatasetManagementCmd { /// The host expects to perform operations on all ranges provided as an integral unit for /// writes, indicating that if a portion of the dataset is written it is expected that all of /// the ranges in the dataset are going to be written. - idw: bool, + /// + /// Note: this field is advisory, and we ignore it. + _idw: bool, /// Attribute – Integral Dataset for Read (IDR) /// @@ -840,7 +849,9 @@ pub struct DatasetManagementCmd { /// The host expects to perform operations on all ranges provided as an integral unit for /// reads, indicating that if a portion of the dataset is read it is expected that all of the /// ranges in the dataset are going to be read. - idr: bool, + /// + /// Note: this field is advisory, and we ignore it. + _idr: bool, } impl DatasetManagementCmd { @@ -855,27 +866,23 @@ impl DatasetManagementCmd { ) } - /// Returns an Iterator that yields the LBA ranges specified in this command. Note that if - /// some of the ranges couldn't be read from guest memory, this will yield fewer than - /// `Self.nr` ranges. + /// Returns an Iterator that yields the LBA ranges specified in this command. If any of the + /// ranges cannot be read from guest memory, yields an error for that range instead. pub fn ranges<'a>( &self, mem: &'a MemCtx, - ) -> impl Iterator + 'a { + ) -> impl Iterator< + Item = Result, + > + 'a { self.data(mem).flat_map(|region| { - let mut ranges = Vec::new(); - if let Some(mapping) = mem.readable_region(®ion) { - ranges.resize_with( - mapping.len() - / size_of::(), - Default::default, - ); - if mapping.read_many(&mut ranges).is_err() { - ranges.clear(); - } - }; - - ranges.into_iter() + if let Some(Ok(defs)) = mem + .readable_region(®ion) + .map(|mapping| mapping.read_many_owned()) + { + defs.into_iter().map(Ok).collect::>().into_iter() + } else { + vec![Err("Failed to read LBA range")].into_iter() + } }) } diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs index f4ec661ee..e00158525 100644 --- a/lib/propolis/src/hw/nvme/requests.rs +++ b/lib/propolis/src/hw/nvme/requests.rs @@ -2,12 +2,14 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use itertools::Itertools; use std::sync::Arc; use std::time::Instant; use super::{cmds::NvmCmd, queue::Permit, PciNvme}; use crate::accessors::MemAccessor; use crate::block::{self, Operation, Request}; +use crate::hw::nvme::cmds::ParseErr; use crate::hw::nvme::{bits, cmds::Completion, queue::SubQueue}; #[usdt::provider(provider = "propolis")] @@ -147,7 +149,7 @@ impl block::DeviceQueue for NvmeBlockQueue { } Ok(NvmCmd::DatasetManagement(cmd)) => { // We only support the "deallocate" (discard/trim) operation of Dataset - // Management. XXX should we return success? + // Management. if !cmd.is_deallocate() { permit.complete( Completion::generic_err(bits::STS_INVAL_FIELD) @@ -161,18 +163,18 @@ impl block::DeviceQueue for NvmeBlockQueue { cid, cmd.nr, )); - let ranges: Vec<_> = cmd + let Ok(ranges): Result, _> = cmd .ranges(&mem) - .map(|r| r.offset_len(params.lba_data_size)) - .collect(); - if ranges.len() != cmd.nr as usize { - // If we couldn't read all the ranges, fail the command + .map_ok(|r| r.offset_len(params.lba_data_size)) + .try_collect() + else { + // If we couldn't read the ranges, fail the command permit.complete( Completion::generic_err(bits::STS_DATA_XFER_ERR) .dnr(), ); continue; - } + }; let req = Request::new_discard(ranges); return Some((req, permit, None)); } @@ -181,6 +183,13 @@ impl block::DeviceQueue for NvmeBlockQueue { let req = Request::new_flush(); return Some((req, permit, None)); } + Err(ParseErr::ReservedFuse) | Err(ParseErr::Reserved) => { + // For commands that fail parsing due to reserved fields being set, + // complete with an invalid field error + let comp = + Completion::generic_err(bits::STS_INVAL_FIELD).dnr(); + permit.complete(comp); + } Ok(NvmCmd::Unknown(_)) | Err(_) => { // For any other unrecognized or malformed command, // just immediately complete it with an error diff --git a/lib/propolis/src/vmm/mem.rs b/lib/propolis/src/vmm/mem.rs index fbce4b3da..3a7a2482a 100644 --- a/lib/propolis/src/vmm/mem.rs +++ b/lib/propolis/src/vmm/mem.rs @@ -557,6 +557,28 @@ impl SubMapping<'_> { Ok(unsafe { typed.read_unaligned() }) } + /// Read the entire mapping as an array of `T` objects. + /// The size of the mapping must be aligned to `size_of::()`. + pub fn read_many_owned(&self) -> Result> { + self.check_read_access()?; + if !self.len.is_multiple_of(size_of::()) { + return Err(Error::new( + ErrorKind::InvalidInput, + "Mapping size not aligned to value type", + )); + } + let count = self.len / size_of::(); + let mut vec = Vec::with_capacity(count); + + self.read_many(&mut vec.spare_capacity_mut()[..count])?; + // Safety: read_many() was successful and just initialized the first `count` elements of + // the vector. + unsafe { + vec.set_len(count); + } + Ok(vec) + } + /// Read `values` from the mapping. pub fn read_many( &self,