Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions bin/propolis-server/src/lib/stats/virtual_disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ impl VirtualDiskStats {
self.on_write_completion(result, len, duration)
}
Operation::Flush => self.on_flush_completion(result, duration),
Operation::Discard(..) => {
// Discard is not wired up in backends we care about for now, so
// it can safely be ignored.
Operation::Discard => {
// Discard is only partially wired up in backends we care about, and it's not
// clear what stats (if any) to report yet.
}
}
}
Expand Down
1 change: 1 addition & 0 deletions lib/propolis/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ async-trait.workspace = true
iddqd.workspace = true
nix.workspace = true
vm-attest.workspace = true
itertools.workspace = true

# falcon
libloading = { workspace = true, optional = true }
Expand Down
8 changes: 5 additions & 3 deletions lib/propolis/src/block/crucible.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,11 @@ impl WorkerState {
let _ = block.flush(None).await?;
}
}
block::Operation::Discard(..) => {
// Crucible does not support discard operations for now
return Err(Error::Unsupported);
block::Operation::Discard => {
// Crucible does not support discard operations for now, so we implement this as
// a no-op (which technically is a valid implementation of discard, just one that
// doesn't actually free any space).
return Ok(());
}
}
Ok(())
Expand Down
12 changes: 8 additions & 4 deletions lib/propolis/src/block/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,16 @@ impl SharedState {
self.fp.sync_data().map_err(|_| "io error")?;
}
}
block::Operation::Discard(off, len) => {
block::Operation::Discard => {
if let Some(mech) = self.discard_mech {
dkioc::do_discard(&self.fp, mech, off as u64, len as u64)
for &(off, len) in &req.ranges {
dkioc::do_discard(
&self.fp, mech, off as u64, len as u64,
)
.map_err(|_| {
"io error while attempting to free block(s)"
})?;
"io error while attempting to free block(s)"
})?;
}
} else {
unreachable!("handled above in processing_loop()");
}
Expand Down
2 changes: 1 addition & 1 deletion lib/propolis/src/block/in_memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ impl SharedState {
block::Operation::Flush => {
// nothing to do
}
block::Operation::Discard(..) => {
block::Operation::Discard => {
unreachable!("handled in processing_loop()");
}
}
Expand Down
2 changes: 1 addition & 1 deletion lib/propolis/src/block/mem_async.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ impl SharedState {
block::Operation::Flush => {
// nothing to do
}
block::Operation::Discard(..) => {
block::Operation::Discard => {
unreachable!("handled in processing_loop()")
}
}
Expand Down
6 changes: 3 additions & 3 deletions lib/propolis/src/block/minder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,9 +283,9 @@ impl QueueMinder {
Operation::Flush => {
probes::block_begin_flush!(|| { (devqid, id) });
}
Operation::Discard(off, len) => {
Operation::Discard => {
probes::block_begin_discard!(|| {
(devqid, id, off as u64, len as u64)
(devqid, id, req.ranges.len() as u64)
});
}
}
Expand Down Expand Up @@ -355,7 +355,7 @@ impl QueueMinder {
(devqid, id, rescode, ns_processed, ns_queued)
});
}
Operation::Discard(..) => {
Operation::Discard => {
probes::block_complete_discard!(|| {
(devqid, id, rescode, ns_processed, ns_queued)
});
Expand Down
26 changes: 15 additions & 11 deletions lib/propolis/src/block/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ mod probes {
fn block_begin_read(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
fn block_begin_write(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
fn block_begin_flush(devq_id: u64, req_id: u64) {}
fn block_begin_discard(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
fn block_begin_discard(devq_id: u64, req_id: u64, nr: u64) {}

fn block_complete_read(
devq_id: u64,
Expand Down Expand Up @@ -106,8 +106,8 @@ pub enum Operation {
Write(ByteOffset, ByteLen),
/// Flush buffer(s)
Flush,
/// Discard/UNMAP/deallocate region
Discard(ByteOffset, ByteLen),
/// Discard/UNMAP/deallocate some ranges, which are specified in Request::ranges
Discard,
}
impl Operation {
pub const fn is_read(&self) -> bool {
Expand All @@ -120,7 +120,7 @@ impl Operation {
matches!(self, Operation::Flush)
}
pub const fn is_discard(&self) -> bool {
matches!(self, Operation::Discard(..))
matches!(self, Operation::Discard)
}
}

Expand Down Expand Up @@ -203,32 +203,36 @@ pub struct Request {
/// A list of regions of guest memory to read/write into as part of the I/O
/// request
pub regions: Vec<GuestRegion>,

/// A list of byte ranges to discard as part of the I/O request. This is only
/// relevant for discard operations, and is expected to be empty otherwise.
pub ranges: Vec<(ByteOffset, ByteLen)>,
}
impl Request {
pub fn new_read(
off: ByteOffset,
len: ByteLen,
regions: Vec<GuestRegion>,
) -> Self {
Self { op: Operation::Read(off, len), regions }
Self { op: Operation::Read(off, len), regions, ranges: Vec::new() }
}

pub fn new_write(
off: ByteOffset,
len: ByteLen,
regions: Vec<GuestRegion>,
) -> Self {
Self { op: Operation::Write(off, len), regions }
Self { op: Operation::Write(off, len), regions, ranges: Vec::new() }
}

pub fn new_flush() -> Self {
let op = Operation::Flush;
Self { op, regions: Vec::new() }
Self { op, regions: Vec::new(), ranges: Vec::new() }
}

pub fn new_discard(off: ByteOffset, len: ByteLen) -> Self {
let op = Operation::Discard(off, len);
Self { op, regions: Vec::new() }
pub fn new_discard(ranges: Vec<(ByteOffset, ByteLen)>) -> Self {
let op = Operation::Discard;
Self { op, regions: Vec::new(), ranges }
}

pub fn mappings<'a>(&self, mem: &'a MemCtx) -> Option<Vec<SubMapping<'a>>> {
Expand All @@ -239,7 +243,7 @@ impl Request {
Operation::Write(..) => {
self.regions.iter().map(|r| mem.readable_region(r)).collect()
}
Operation::Flush | Operation::Discard(..) => None,
Operation::Flush | Operation::Discard => None,
}
}
}
Expand Down
35 changes: 35 additions & 0 deletions lib/propolis/src/hw/nvme/bits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#![allow(dead_code)]

use crate::block::{ByteLen, ByteOffset};
use bitstruct::bitstruct;
use zerocopy::FromBytes;

Expand Down Expand Up @@ -176,6 +177,38 @@ impl CompletionQueueEntry {
}
}

/// A Dataset Management Range Definition as represented in memory.
///
/// Each definition describes one LBA range that a Dataset Management command
/// applies to. The layout is `repr(C, packed(1))`, so the fields appear in
/// declaration order with no padding: 4 + 4 + 8 = 16 bytes per definition.
///
/// See NVMe 1.0e Section 6.6 Figure 114: Dataset Management – Range Definition
#[derive(Debug, Default, Copy, Clone, FromBytes)]
#[repr(C, packed(1))]
pub struct DatasetManagementRangeDefinition {
/// The context attributes specified for each range provide information about how the range
/// is intended to be used by host software. The use of this information is optional and the
/// controller is not required to perform any specific action.
pub context_attributes: u32,

/// Length of the range, in logical blocks. `offset_len` scales this by the
/// logical block size to obtain the byte length.
pub number_logical_blocks: u32,

/// Logical block address at which the range starts. `offset_len` scales
/// this by the logical block size to obtain the byte offset.
pub starting_lba: u64,
}
impl DatasetManagementRangeDefinition {
    /// Builds a range definition from its three raw fields.
    pub fn new(
        context_attributes: u32,
        number_logical_blocks: u32,
        starting_lba: u64,
    ) -> Self {
        Self { context_attributes, number_logical_blocks, starting_lba }
    }

    /// Converts this LBA-based range into a `(byte offset, byte length)` pair.
    ///
    /// `lba_data_size` is the size in bytes of one logical block.
    ///
    /// Both fields come straight from guest memory, so the multiplications
    /// saturate instead of using a plain `*`. A bogus `starting_lba` would
    /// otherwise panic when overflow checks are enabled, or wrap around to an
    /// unrelated (and possibly valid) offset when they are not. A saturated
    /// result is far beyond any real device size, so it is left for the
    /// consumer of the range to reject.
    pub fn offset_len(&self, lba_data_size: u64) -> (ByteOffset, ByteLen) {
        // Copy the fields out of the packed struct before operating on them.
        let starting_lba = self.starting_lba;
        let block_count = u64::from(self.number_logical_blocks);

        let byte_offset = starting_lba.saturating_mul(lba_data_size);
        let byte_len = block_count.saturating_mul(lba_data_size);

        (byte_offset as ByteOffset, byte_len as ByteLen)
    }
}

// Register bits

bitstruct! {
Expand Down Expand Up @@ -539,6 +572,8 @@ pub const NVM_OPC_FLUSH: u8 = 0x00;
pub const NVM_OPC_WRITE: u8 = 0x01;
/// Read Command Opcode
pub const NVM_OPC_READ: u8 = 0x02;
/// Dataset Management Command Opcode
pub const NVM_OPC_DATASET_MANAGEMENT: u8 = 0x09;

// Generic Command Status values
// See NVMe 1.0e Section 4.5.1.2.1, Figure 17 Status Code - Generic Command Status Values
Expand Down
114 changes: 113 additions & 1 deletion lib/propolis/src/hw/nvme/cmds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::bits::{self, StatusCodeType, SubmissionQueueEntry};
use super::bits::{
self, DatasetManagementRangeDefinition, StatusCodeType,
SubmissionQueueEntry,
};
use super::queue::{QueueCreateErr, QueueId};
use crate::block;
use crate::common::*;
Expand All @@ -28,6 +31,10 @@ pub enum ParseErr {
/// An invalid value was specified in the FUSE bits of `CDW0`.
#[error("reserved FUSE value specified")]
ReservedFuse,

/// A reserved field was set to a non-zero value.
#[error("reserved field value specified")]
Reserved,
}

/// A parsed Admin Command
Expand Down Expand Up @@ -678,6 +685,8 @@ pub enum NvmCmd {
Write(WriteCmd),
/// Read data and metadata
Read(ReadCmd),
/// Dataset Management Command
DatasetManagement(DatasetManagementCmd),
/// An unknown NVM command
Unknown(GuestData<SubmissionQueueEntry>),
}
Expand Down Expand Up @@ -709,6 +718,22 @@ impl NvmCmd {
prp1: raw.prp1,
prp2: raw.prp2,
}),
bits::NVM_OPC_DATASET_MANAGEMENT => {
if (raw.cdw11 & !0b111) != 0 {
// Only the lowest 3 bits of CDW11 are used for Dataset
// Management, so reject if any other bits are set.
return Err(ParseErr::Reserved);
}
NvmCmd::DatasetManagement(DatasetManagementCmd {
prp1: raw.prp1,
prp2: raw.prp2,
// Convert from 0's based value
nr: (raw.cdw10 & 0xFF) as u16 + 1,
ad: raw.cdw11 & (1 << 2) != 0,
_idw: raw.cdw11 & (1 << 1) != 0,
_idr: raw.cdw11 & (1 << 0) != 0,
})
}
_ => NvmCmd::Unknown(raw),
};
Ok(cmd)
Expand Down Expand Up @@ -779,6 +804,93 @@ impl ReadCmd {
}
}

/// Dataset Management Command Parameters
///
/// Holds the fields decoded from a submission queue entry whose opcode is
/// `NVM_OPC_DATASET_MANAGEMENT`: the two PRP entries, the range count taken
/// from CDW10 and the attribute bits taken from CDW11.
#[derive(Debug)]
pub struct DatasetManagementCmd {
/// PRP Entry 1 (PRP1)
///
/// Indicates a data buffer that contains the LBA range information.
prp1: u64,

/// PRP Entry 2 (PRP2)
///
/// Indicates a second data buffer that contains LBA range information. It may not be a PRP
/// List.
prp2: u64,

/// Number of Ranges (NR)
///
/// Indicates the number of 16 byte range sets that are specified in the command.
///
/// This holds the actual count: the parser takes the low 8 bits of CDW10 (a 0's based value)
/// and adds one, so the value is in `1..=256`.
pub nr: u16,

/// Attribute – Deallocate (AD)
///
/// If set to ‘1’ then the NVM subsystem may deallocate all provided ranges. If a read occurs
/// to a deallocated range, the NVM Express subsystem shall return all zeros, all ones, or
/// the last data written to the associated LBA.
///
/// Note: The operation of the Deallocate function is similar to the ATA DATA SET MANAGEMENT
/// with Trim feature described in ACS-2 and SCSI UNMAP command described in SBC-3.
///
/// Decoded from bit 2 of CDW11.
ad: bool,

/// Attribute – Integral Dataset for Write (IDW)
///
/// If set to ‘1’ then the dataset should be optimized for write access as an integral unit.
/// The host expects to perform operations on all ranges provided as an integral unit for
/// writes, indicating that if a portion of the dataset is written it is expected that all of
/// the ranges in the dataset are going to be written.
///
/// Decoded from bit 1 of CDW11.
///
/// Note: this field is advisory, and we ignore it.
_idw: bool,

/// Attribute – Integral Dataset for Read (IDR)
///
/// If set to ‘1’ then the dataset should be optimized for read access as an integral unit.
/// The host expects to perform operations on all ranges provided as an integral unit for
/// reads, indicating that if a portion of the dataset is read it is expected that all of the
/// ranges in the dataset are going to be read.
///
/// Decoded from bit 0 of CDW11.
///
/// Note: this field is advisory, and we ignore it.
_idr: bool,
}

impl DatasetManagementCmd {
    /// Walks the PRP entries of this command and yields the [`GuestRegion`]s
    /// that together hold the array of LBA range definitions.
    ///
    /// The total size walked is `nr` definitions of
    /// `size_of::<DatasetManagementRangeDefinition>()` bytes each.
    pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> {
        let def_size = size_of::<DatasetManagementRangeDefinition>() as u64;
        let total_size = u64::from(self.nr) * def_size;
        PrpIter::new(total_size, self.prp1, self.prp2, mem)
    }

    /// Returns an Iterator over the LBA range definitions of this command.
    ///
    /// Every region produced by [`Self::data`] is mapped readable and decoded
    /// as a run of range definitions, each of which is yielded as `Ok`. If a
    /// region cannot be mapped or decoded, a single `Err` is yielded in place
    /// of that region's definitions and iteration carries on with the next
    /// region.
    pub fn ranges<'a>(
        &self,
        mem: &'a MemCtx,
    ) -> impl Iterator<
        Item = Result<DatasetManagementRangeDefinition, &'static str>,
    > + 'a {
        self.data(mem).flat_map(|region| {
            let decoded = mem
                .readable_region(&region)
                .map(|mapping| mapping.read_many_owned());
            match decoded {
                Some(Ok(defs)) => {
                    defs.into_iter().map(Ok).collect::<Vec<_>>().into_iter()
                }
                _ => vec![Err("Failed to read LBA range")].into_iter(),
            }
        })
    }

    /// Reports whether the Deallocate (AD) attribute was set on this command.
    pub fn is_deallocate(&self) -> bool {
        self.ad
    }
}

/// Indicates the possible states of a [`PrpIter`].
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
enum PrpNext {
Expand Down
2 changes: 2 additions & 0 deletions lib/propolis/src/hw/nvme/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,8 @@ impl PciNvme {
// Supporting multiple namespaces complicates I/O dispatching,
// so for now we limit the device to a single namespace.
nn: 1,
// bit 2 indicates support for the Dataset Management command
oncs: (1 << 2),
// bit 0 indicates volatile write cache is present
vwc: 1,
// bit 8 indicates Doorbell Buffer support
Expand Down
Loading
Loading