diff --git a/Cargo.lock b/Cargo.lock index c6320d360..482e96ece 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6434,6 +6434,7 @@ dependencies = [ "futures", "iddqd", "ispf", + "itertools 0.13.0", "lazy_static", "libc", "libloading 0.7.4", diff --git a/bin/propolis-server/src/lib/stats/virtual_disk.rs b/bin/propolis-server/src/lib/stats/virtual_disk.rs index 4a39cfde0..8ee0a4c85 100644 --- a/bin/propolis-server/src/lib/stats/virtual_disk.rs +++ b/bin/propolis-server/src/lib/stats/virtual_disk.rs @@ -77,9 +77,9 @@ impl VirtualDiskStats { self.on_write_completion(result, len, duration) } Operation::Flush => self.on_flush_completion(result, duration), - Operation::Discard(..) => { - // Discard is not wired up in backends we care about for now, so - // it can safely be ignored. + Operation::Discard => { + // Discard is only partially wired up in backends we care about, and it's not + // clear what stats (if any) to report yet. } } } diff --git a/lib/propolis/Cargo.toml b/lib/propolis/Cargo.toml index d092121f6..464a293db 100644 --- a/lib/propolis/Cargo.toml +++ b/lib/propolis/Cargo.toml @@ -44,6 +44,7 @@ async-trait.workspace = true iddqd.workspace = true nix.workspace = true vm-attest.workspace = true +itertools.workspace = true # falcon libloading = { workspace = true, optional = true } diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index 5ba04a14b..bdcf6ab60 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -145,9 +145,11 @@ impl WorkerState { let _ = block.flush(None).await?; } } - block::Operation::Discard(..) => { - // Crucible does not support discard operations for now - return Err(Error::Unsupported); + block::Operation::Discard => { + // Crucible does not support discard operations for now, so we implement this as + // a no-op (which technically is a valid implementation of discard, just one that + // doesn't actually free any space). 
+ return Ok(()); } } Ok(()) diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs index 0d95aa43f..5fac93ba0 100644 --- a/lib/propolis/src/block/file.rs +++ b/lib/propolis/src/block/file.rs @@ -113,12 +113,16 @@ impl SharedState { self.fp.sync_data().map_err(|_| "io error")?; } } - block::Operation::Discard(off, len) => { + block::Operation::Discard => { if let Some(mech) = self.discard_mech { - dkioc::do_discard(&self.fp, mech, off as u64, len as u64) + for &(off, len) in &req.ranges { + dkioc::do_discard( + &self.fp, mech, off as u64, len as u64, + ) .map_err(|_| { - "io error while attempting to free block(s)" - })?; + "io error while attempting to free block(s)" + })?; + } } else { unreachable!("handled above in processing_loop()"); } diff --git a/lib/propolis/src/block/in_memory.rs b/lib/propolis/src/block/in_memory.rs index 949412a45..9d45c1fa1 100644 --- a/lib/propolis/src/block/in_memory.rs +++ b/lib/propolis/src/block/in_memory.rs @@ -86,7 +86,7 @@ impl SharedState { block::Operation::Flush => { // nothing to do } - block::Operation::Discard(..) => { + block::Operation::Discard => { unreachable!("handled in processing_loop()"); } } diff --git a/lib/propolis/src/block/mem_async.rs b/lib/propolis/src/block/mem_async.rs index d834bd388..d376189dc 100644 --- a/lib/propolis/src/block/mem_async.rs +++ b/lib/propolis/src/block/mem_async.rs @@ -89,7 +89,7 @@ impl SharedState { block::Operation::Flush => { // nothing to do } - block::Operation::Discard(..) 
=> { + block::Operation::Discard => { unreachable!("handled in processing_loop()") } } diff --git a/lib/propolis/src/block/minder.rs b/lib/propolis/src/block/minder.rs index c9c62f576..e9ff8fa54 100644 --- a/lib/propolis/src/block/minder.rs +++ b/lib/propolis/src/block/minder.rs @@ -283,9 +283,9 @@ impl QueueMinder { Operation::Flush => { probes::block_begin_flush!(|| { (devqid, id) }); } - Operation::Discard(off, len) => { + Operation::Discard => { probes::block_begin_discard!(|| { - (devqid, id, off as u64, len as u64) + (devqid, id, req.ranges.len() as u64) }); } } @@ -355,7 +355,7 @@ impl QueueMinder { (devqid, id, rescode, ns_processed, ns_queued) }); } - Operation::Discard(..) => { + Operation::Discard => { probes::block_complete_discard!(|| { (devqid, id, rescode, ns_processed, ns_queued) }); diff --git a/lib/propolis/src/block/mod.rs b/lib/propolis/src/block/mod.rs index 674f2a985..380acfb34 100644 --- a/lib/propolis/src/block/mod.rs +++ b/lib/propolis/src/block/mod.rs @@ -50,7 +50,7 @@ mod probes { fn block_begin_read(devq_id: u64, req_id: u64, offset: u64, len: u64) {} fn block_begin_write(devq_id: u64, req_id: u64, offset: u64, len: u64) {} fn block_begin_flush(devq_id: u64, req_id: u64) {} - fn block_begin_discard(devq_id: u64, req_id: u64, offset: u64, len: u64) {} + fn block_begin_discard(devq_id: u64, req_id: u64, nr: u64) {} fn block_complete_read( devq_id: u64, @@ -106,8 +106,8 @@ pub enum Operation { Write(ByteOffset, ByteLen), /// Flush buffer(s) Flush, - /// Discard/UNMAP/deallocate region - Discard(ByteOffset, ByteLen), + /// Discard/UNMAP/deallocate some ranges, which are specified in Request::ranges + Discard, } impl Operation { pub const fn is_read(&self) -> bool { @@ -120,7 +120,7 @@ impl Operation { matches!(self, Operation::Flush) } pub const fn is_discard(&self) -> bool { - matches!(self, Operation::Discard(..)) + matches!(self, Operation::Discard) } } @@ -203,6 +203,10 @@ pub struct Request { /// A list of regions of guest memory to 
read/write into as part of the I/O /// request pub regions: Vec, + + /// A list of byte ranges to discard as part of the I/O request. This is only + /// relevant for discard operations, and is expected to be empty otherwise. + pub ranges: Vec<(ByteOffset, ByteLen)>, } impl Request { pub fn new_read( @@ -210,7 +214,7 @@ impl Request { len: ByteLen, regions: Vec, ) -> Self { - Self { op: Operation::Read(off, len), regions } + Self { op: Operation::Read(off, len), regions, ranges: Vec::new() } } pub fn new_write( @@ -218,17 +222,17 @@ impl Request { len: ByteLen, regions: Vec, ) -> Self { - Self { op: Operation::Write(off, len), regions } + Self { op: Operation::Write(off, len), regions, ranges: Vec::new() } } pub fn new_flush() -> Self { let op = Operation::Flush; - Self { op, regions: Vec::new() } + Self { op, regions: Vec::new(), ranges: Vec::new() } } - pub fn new_discard(off: ByteOffset, len: ByteLen) -> Self { - let op = Operation::Discard(off, len); - Self { op, regions: Vec::new() } + pub fn new_discard(ranges: Vec<(ByteOffset, ByteLen)>) -> Self { + let op = Operation::Discard; + Self { op, regions: Vec::new(), ranges } } pub fn mappings<'a>(&self, mem: &'a MemCtx) -> Option>> { @@ -239,7 +243,7 @@ impl Request { Operation::Write(..) => { self.regions.iter().map(|r| mem.readable_region(r)).collect() } - Operation::Flush | Operation::Discard(..) => None, + Operation::Flush | Operation::Discard => None, } } } diff --git a/lib/propolis/src/hw/nvme/bits.rs b/lib/propolis/src/hw/nvme/bits.rs index c227ae1f4..dab6d3757 100644 --- a/lib/propolis/src/hw/nvme/bits.rs +++ b/lib/propolis/src/hw/nvme/bits.rs @@ -4,6 +4,7 @@ #![allow(dead_code)] +use crate::block::{ByteLen, ByteOffset}; use bitstruct::bitstruct; use zerocopy::FromBytes; @@ -176,6 +177,38 @@ impl CompletionQueueEntry { } } +/// A Dataset Management Range Definition as represented in memory. 
+/// +/// See NVMe 1.0e Section 6.6 Figure 114: Dataset Management – Range Definition +#[derive(Debug, Default, Copy, Clone, FromBytes)] +#[repr(C, packed(1))] +pub struct DatasetManagementRangeDefinition { +    /// The context attributes specified for each range provide information about how the range +    /// is intended to be used by host software. The use of this information is optional and the +    /// controller is not required to perform any specific action. +    pub context_attributes: u32, + +    pub number_logical_blocks: u32, + +    pub starting_lba: u64, +} +impl DatasetManagementRangeDefinition { +    pub fn new( +        context_attributes: u32, +        number_logical_blocks: u32, +        starting_lba: u64, +    ) -> Self { +        Self { context_attributes, number_logical_blocks, starting_lba } +    } + +    pub fn offset_len(&self, lba_data_size: u64) -> (ByteOffset, ByteLen) { +        ( +            (self.starting_lba * lba_data_size) as ByteOffset, +            (self.number_logical_blocks as u64 * lba_data_size) as ByteLen, +        ) +    } +} + // Register bits bitstruct! { @@ -539,6 +572,8 @@ pub const NVM_OPC_FLUSH: u8 = 0x00; pub const NVM_OPC_WRITE: u8 = 0x01; /// Read Command Opcode pub const NVM_OPC_READ: u8 = 0x02; +/// Dataset Management Command Opcode +pub const NVM_OPC_DATASET_MANAGEMENT: u8 = 0x09; // Generic Command Status values // See NVMe 1.0e Section 4.5.1.2.1, Figure 17 Status Code - Generic Command Status Values diff --git a/lib/propolis/src/hw/nvme/cmds.rs b/lib/propolis/src/hw/nvme/cmds.rs index b11234aea..0754811c4 100644 --- a/lib/propolis/src/hw/nvme/cmds.rs +++ b/lib/propolis/src/hw/nvme/cmds.rs @@ -2,7 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/.
-use super::bits::{self, StatusCodeType, SubmissionQueueEntry}; +use super::bits::{ + self, DatasetManagementRangeDefinition, StatusCodeType, + SubmissionQueueEntry, +}; use super::queue::{QueueCreateErr, QueueId}; use crate::block; use crate::common::*; @@ -28,6 +31,10 @@ pub enum ParseErr { /// An invalid value was specified in the FUSE bits of `CDW0`. #[error("reserved FUSE value specified")] ReservedFuse, + + /// A reserved field was set to a non-zero value. + #[error("reserved field value specified")] + Reserved, } /// A parsed Admin Command @@ -678,6 +685,8 @@ pub enum NvmCmd { Write(WriteCmd), /// Read data and metadata Read(ReadCmd), + /// Dataset Management Command + DatasetManagement(DatasetManagementCmd), /// An unknown NVM command Unknown(GuestData), } @@ -709,6 +718,22 @@ impl NvmCmd { prp1: raw.prp1, prp2: raw.prp2, }), + bits::NVM_OPC_DATASET_MANAGEMENT => { + if (raw.cdw11 & !0b111) != 0 { + // Only the lowest 3 bits of CDW11 are used for Dataset + // Management, so reject if any other bits are set. + return Err(ParseErr::Reserved); + } + NvmCmd::DatasetManagement(DatasetManagementCmd { + prp1: raw.prp1, + prp2: raw.prp2, + // Convert from 0's based value + nr: (raw.cdw10 & 0xFF) as u16 + 1, + ad: raw.cdw11 & (1 << 2) != 0, + _idw: raw.cdw11 & (1 << 1) != 0, + _idr: raw.cdw11 & (1 << 0) != 0, + }) + } _ => NvmCmd::Unknown(raw), }; Ok(cmd) @@ -779,6 +804,93 @@ impl ReadCmd { } } +/// Dataset Management Command Parameters +#[derive(Debug)] +pub struct DatasetManagementCmd { + /// PRP Entry 1 (PRP1) + /// + /// Indicates a data buffer that contains the LBA range information. + prp1: u64, + + /// PRP Entry 2 (PRP2) + /// + /// Indicates a second data buffer that contains LBA range information. It may not be a PRP + /// List. + prp2: u64, + + /// Number of Ranges (NR) + /// + /// Indicates the number of 16 byte range sets that are specified in the command. 
pub nr: u16, + +    /// Attribute – Deallocate (AD) +    /// +    /// If set to ‘1’ then the NVM subsystem may deallocate all provided ranges. If a read occurs +    /// to a deallocated range, the NVM Express subsystem shall return all zeros, all ones, or +    /// the last data written to the associated LBA. +    /// +    /// Note: The operation of the Deallocate function is similar to the ATA DATA SET MANAGEMENT +    /// with Trim feature described in ACS-2 and SCSI UNMAP command described in SBC-3. +    ad: bool, + +    /// Attribute – Integral Dataset for Write (IDW) +    /// +    /// If set to ‘1’ then the dataset should be optimized for write access as an integral unit. +    /// The host expects to perform operations on all ranges provided as an integral unit for +    /// writes, indicating that if a portion of the dataset is written it is expected that all of +    /// the ranges in the dataset are going to be written. +    /// +    /// Note: this field is advisory, and we ignore it. +    _idw: bool, + +    /// Attribute – Integral Dataset for Read (IDR) +    /// +    /// If set to ‘1’ then the dataset should be optimized for read access as an integral unit. +    /// The host expects to perform operations on all ranges provided as an integral unit for +    /// reads, indicating that if a portion of the dataset is read it is expected that all of the +    /// ranges in the dataset are going to be read. +    /// +    /// Note: this field is advisory, and we ignore it. +    _idr: bool, +} + +impl DatasetManagementCmd { +    /// Returns an Iterator that yields [`GuestRegion`]s which contain the array of LBA ranges. +    pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> { +        PrpIter::new( +            u64::from(self.nr) +                * size_of::<DatasetManagementRangeDefinition>() as u64, +            self.prp1, +            self.prp2, +            mem, +        ) +    } + +    /// Returns an Iterator that yields the LBA ranges specified in this command. If any of the +    /// ranges cannot be read from guest memory, yields an error for that range instead.
+    pub fn ranges<'a>( +        &self, +        mem: &'a MemCtx, +    ) -> impl Iterator< +        Item = Result<DatasetManagementRangeDefinition, &'static str>, +    > + 'a { +        self.data(mem).flat_map(|region| { +            if let Some(Ok(defs)) = mem +                .readable_region(&region) +                .map(|mapping| mapping.read_many_owned()) +            { +                defs.into_iter().map(Ok).collect::<Vec<_>>().into_iter() +            } else { +                vec![Err("Failed to read LBA range")].into_iter() +            } +        }) +    } + +    pub fn is_deallocate(&self) -> bool { +        self.ad +    } +} + /// Indicates the possible states of a [`PrpIter`]. #[derive(Clone, Copy, Eq, PartialEq, Debug)] enum PrpNext { diff --git a/lib/propolis/src/hw/nvme/mod.rs b/lib/propolis/src/hw/nvme/mod.rs index 9a3bf657d..7f8f639b1 100644 --- a/lib/propolis/src/hw/nvme/mod.rs +++ b/lib/propolis/src/hw/nvme/mod.rs @@ -846,6 +846,8 @@ impl PciNvme { // Supporting multiple namespaces complicates I/O dispatching, // so for now we limit the device to a single namespace. nn: 1, + // bit 2 indicates support for the Dataset Management command + oncs: (1 << 2), // bit 0 indicates volatile write cache is present vwc: 1, // bit 8 indicates Doorbell Buffer support diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs index 4821b47f0..e00158525 100644 --- a/lib/propolis/src/hw/nvme/requests.rs +++ b/lib/propolis/src/hw/nvme/requests.rs @@ -2,12 +2,14 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/.
+use itertools::Itertools; use std::sync::Arc; use std::time::Instant; use super::{cmds::NvmCmd, queue::Permit, PciNvme}; use crate::accessors::MemAccessor; use crate::block::{self, Operation, Request}; +use crate::hw::nvme::cmds::ParseErr; use crate::hw::nvme::{bits, cmds::Completion, queue::SubQueue}; #[usdt::provider(provider = "propolis")] @@ -38,6 +40,9 @@ mod probes { } fn nvme_write_complete(devsq_id: u64, cid: u16, res: u8) {} + fn nvme_discard_enqueue(devsq_id: u64, idx: u16, cid: u16, nr: u16) {} + fn nvme_discard_complete(devsq_id: u64, cid: u16, res: u8) {} + fn nvme_flush_enqueue(devsq_id: u64, idx: u16, cid: u16) {} fn nvme_flush_complete(devsq_id: u64, cid: u16, res: u8) {} @@ -142,11 +147,49 @@ impl block::DeviceQueue for NvmeBlockQueue { Request::new_read(off as usize, size as usize, bufs); return Some((req, permit, None)); } + Ok(NvmCmd::DatasetManagement(cmd)) => { + // We only support the "deallocate" (discard/trim) operation of Dataset + // Management. + if !cmd.is_deallocate() { + permit.complete( + Completion::generic_err(bits::STS_INVAL_FIELD) + .dnr(), + ); + continue; + } + probes::nvme_discard_enqueue!(|| ( + sq.devq_id(), + idx, + cid, + cmd.nr, + )); + let Ok(ranges): Result<Vec<_>, _> = cmd + .ranges(&mem) + .map_ok(|r| r.offset_len(params.lba_data_size)) + .try_collect() + else { + // If we couldn't read the ranges, fail the command + permit.complete( + Completion::generic_err(bits::STS_DATA_XFER_ERR) + .dnr(), + ); + continue; + }; + let req = Request::new_discard(ranges); + return Some((req, permit, None)); + } Ok(NvmCmd::Flush) => { probes::nvme_flush_enqueue!(|| (sq.devq_id(), idx, cid)); let req = Request::new_flush(); return Some((req, permit, None)); } + Err(ParseErr::ReservedFuse) | Err(ParseErr::Reserved) => { + // For commands that fail parsing due to reserved fields being set, + // complete with an invalid field error + let comp = + Completion::generic_err(bits::STS_INVAL_FIELD).dnr(); + permit.complete(comp); }
Ok(NvmCmd::Unknown(_)) | Err(_) => { // For any other unrecognized or malformed command, // just immediately complete it with an error @@ -179,8 +222,8 @@ impl block::DeviceQueue for NvmeBlockQueue { Operation::Flush => { probes::nvme_flush_complete!(|| (devsq_id, cid, resnum)); } - Operation::Discard(..) => { - unreachable!("discard not supported in NVMe for now"); + Operation::Discard => { + probes::nvme_discard_complete!(|| (devsq_id, cid, resnum)); } } diff --git a/lib/propolis/src/hw/virtio/block.rs b/lib/propolis/src/hw/virtio/block.rs index 84c662019..75659e46d 100644 --- a/lib/propolis/src/hw/virtio/block.rs +++ b/lib/propolis/src/hw/virtio/block.rs @@ -198,7 +198,7 @@ impl block::DeviceQueue for BlockVq { rid, off as u64, sz as u64, )); Ok(( - block::Request::new_discard(off, sz), + block::Request::new_discard(vec![(off, sz)]), CompletionToken { rid, chain }, None, )) @@ -245,7 +245,7 @@ impl block::DeviceQueue for BlockVq { block::Operation::Flush => { probes::vioblk_flush_complete!(|| (rid, resnum)); } - block::Operation::Discard(..) => { + block::Operation::Discard => { probes::vioblk_discard_complete!(|| (rid, resnum)); } } diff --git a/lib/propolis/src/vmm/mem.rs b/lib/propolis/src/vmm/mem.rs index fbce4b3da..3a7a2482a 100644 --- a/lib/propolis/src/vmm/mem.rs +++ b/lib/propolis/src/vmm/mem.rs @@ -557,6 +557,28 @@ impl SubMapping<'_> { Ok(unsafe { typed.read_unaligned() }) } + /// Read the entire mapping as an array of `T` objects. + /// The size of the mapping must be aligned to `size_of::()`. 
+    pub fn read_many_owned<T: FromBytes>(&self) -> Result<Vec<T>> { +        self.check_read_access()?; +        if !self.len.is_multiple_of(size_of::<T>()) { +            return Err(Error::new( +                ErrorKind::InvalidInput, +                "Mapping size not aligned to value type", +            )); +        } +        let count = self.len / size_of::<T>(); +        let mut vec = Vec::with_capacity(count); + +        self.read_many(&mut vec.spare_capacity_mut()[..count])?; +        // Safety: read_many() was successful and just initialized the first `count` elements of +        // the vector. +        unsafe { +            vec.set_len(count); +        } +        Ok(vec) +    } +    /// Read `values` from the mapping. pub fn read_many( &self,