Skip to content

Commit 224f78f

Browse files
committed
support NVMe Deallocate
1 parent bc489dd commit 224f78f

12 files changed

Lines changed: 212 additions & 27 deletions

File tree

bin/propolis-server/src/lib/stats/virtual_disk.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,10 @@ impl VirtualDiskStats {
7777
self.on_write_completion(result, len, duration)
7878
}
7979
Operation::Flush => self.on_flush_completion(result, duration),
80-
Operation::Discard(..) => {
80+
Operation::Discard => {
8181
// Discard is not wired up in backends we care about for now, so
8282
// it can safely be ignored.
83+
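// XXX: the note above is no longer true now that NVMe Deallocate is supported; discard completions should be accounted for here.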
8384
}
8485
}
8586
}

lib/propolis/src/block/crucible.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ impl WorkerState {
145145
let _ = block.flush(None).await?;
146146
}
147147
}
148-
block::Operation::Discard(..) => {
148+
block::Operation::Discard => {
149149
// Crucible does not support discard operations for now
150150
return Err(Error::Unsupported);
151151
}

lib/propolis/src/block/file.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,16 @@ impl SharedState {
113113
self.fp.sync_data().map_err(|_| "io error")?;
114114
}
115115
}
116-
block::Operation::Discard(off, len) => {
116+
block::Operation::Discard => {
117117
if let Some(mech) = self.discard_mech {
118-
dkioc::do_discard(&self.fp, mech, off as u64, len as u64)
118+
for &(off, len) in &req.ranges {
119+
dkioc::do_discard(
120+
&self.fp, mech, off as u64, len as u64,
121+
)
119122
.map_err(|_| {
120-
"io error while attempting to free block(s)"
121-
})?;
123+
"io error while attempting to free block(s)"
124+
})?;
125+
}
122126
} else {
123127
unreachable!("handled above in processing_loop()");
124128
}

lib/propolis/src/block/in_memory.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ impl SharedState {
8686
block::Operation::Flush => {
8787
// nothing to do
8888
}
89-
block::Operation::Discard(..) => {
89+
block::Operation::Discard => {
9090
unreachable!("handled in processing_loop()");
9191
}
9292
}

lib/propolis/src/block/mem_async.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ impl SharedState {
8989
block::Operation::Flush => {
9090
// nothing to do
9191
}
92-
block::Operation::Discard(..) => {
92+
block::Operation::Discard => {
9393
unreachable!("handled in processing_loop()")
9494
}
9595
}

lib/propolis/src/block/minder.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -283,9 +283,9 @@ impl QueueMinder {
283283
Operation::Flush => {
284284
probes::block_begin_flush!(|| { (devqid, id) });
285285
}
286-
Operation::Discard(off, len) => {
286+
Operation::Discard => {
287287
probes::block_begin_discard!(|| {
288-
(devqid, id, off as u64, len as u64)
288+
(devqid, id, req.ranges.len() as u64)
289289
});
290290
}
291291
}
@@ -355,7 +355,7 @@ impl QueueMinder {
355355
(devqid, id, rescode, ns_processed, ns_queued)
356356
});
357357
}
358-
Operation::Discard(..) => {
358+
Operation::Discard => {
359359
probes::block_complete_discard!(|| {
360360
(devqid, id, rescode, ns_processed, ns_queued)
361361
});

lib/propolis/src/block/mod.rs

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ mod probes {
5050
fn block_begin_read(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
5151
fn block_begin_write(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
5252
fn block_begin_flush(devq_id: u64, req_id: u64) {}
53-
fn block_begin_discard(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
53+
fn block_begin_discard(devq_id: u64, req_id: u64, nr: u64) {}
5454

5555
fn block_complete_read(
5656
devq_id: u64,
@@ -106,8 +106,8 @@ pub enum Operation {
106106
Write(ByteOffset, ByteLen),
107107
/// Flush buffer(s)
108108
Flush,
109-
/// Discard/UNMAP/deallocate region
110-
Discard(ByteOffset, ByteLen),
109+
/// Discard/UNMAP/deallocate the byte ranges listed in `Request::ranges`
110+
Discard,
111111
}
112112
impl Operation {
113113
pub const fn is_read(&self) -> bool {
@@ -120,7 +120,7 @@ impl Operation {
120120
matches!(self, Operation::Flush)
121121
}
122122
pub const fn is_discard(&self) -> bool {
123-
matches!(self, Operation::Discard(..))
123+
matches!(self, Operation::Discard)
124124
}
125125
}
126126

@@ -203,32 +203,36 @@ pub struct Request {
203203
/// A list of regions of guest memory to read/write into as part of the I/O
204204
/// request
205205
pub regions: Vec<GuestRegion>,
206+
207+
/// A list of byte ranges to discard as part of the I/O request. This is only
208+
/// relevant for discard operations, and is expected to be empty otherwise.
209+
pub ranges: Vec<(ByteOffset, ByteLen)>,
206210
}
207211
impl Request {
208212
pub fn new_read(
209213
off: ByteOffset,
210214
len: ByteLen,
211215
regions: Vec<GuestRegion>,
212216
) -> Self {
213-
Self { op: Operation::Read(off, len), regions }
217+
Self { op: Operation::Read(off, len), regions, ranges: Vec::new() }
214218
}
215219

216220
pub fn new_write(
217221
off: ByteOffset,
218222
len: ByteLen,
219223
regions: Vec<GuestRegion>,
220224
) -> Self {
221-
Self { op: Operation::Write(off, len), regions }
225+
Self { op: Operation::Write(off, len), regions, ranges: Vec::new() }
222226
}
223227

224228
pub fn new_flush() -> Self {
225229
let op = Operation::Flush;
226-
Self { op, regions: Vec::new() }
230+
Self { op, regions: Vec::new(), ranges: Vec::new() }
227231
}
228232

229-
pub fn new_discard(off: ByteOffset, len: ByteLen) -> Self {
230-
let op = Operation::Discard(off, len);
231-
Self { op, regions: Vec::new() }
233+
pub fn new_discard(ranges: Vec<(ByteOffset, ByteLen)>) -> Self {
234+
let op = Operation::Discard;
235+
Self { op, regions: Vec::new(), ranges }
232236
}
233237

234238
pub fn mappings<'a>(&self, mem: &'a MemCtx) -> Option<Vec<SubMapping<'a>>> {
@@ -239,7 +243,7 @@ impl Request {
239243
Operation::Write(..) => {
240244
self.regions.iter().map(|r| mem.readable_region(r)).collect()
241245
}
242-
Operation::Flush | Operation::Discard(..) => None,
246+
Operation::Flush | Operation::Discard => None,
243247
}
244248
}
245249
}

lib/propolis/src/hw/nvme/bits.rs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#![allow(dead_code)]
66

7+
use crate::block::{ByteLen, ByteOffset};
78
use bitstruct::bitstruct;
89
use zerocopy::FromBytes;
910

@@ -176,6 +177,38 @@ impl CompletionQueueEntry {
176177
}
177178
}
178179

180+
/// A Dataset Management Range Definition as represented in memory.
181+
///
182+
/// See NVMe 1.0e Section 6.6 Figure 114: Dataset Management – Range Definition
183+
#[derive(Debug, Default, Copy, Clone, FromBytes)]
184+
#[repr(C, packed(1))]
185+
pub struct DatasetManagementRangeDefinition {
186+
/// The context attributes specified for each range provides information about how the range
187+
/// is intended to be used by host software. The use of this information is optional and the
188+
/// controller is not required to perform any specific action.
189+
pub context_attributes: u32,
190+
191+
pub number_logical_blocks: u32,
192+
193+
pub starting_lba: u64,
194+
}
195+
impl DatasetManagementRangeDefinition {
196+
pub fn new(
197+
context_attributes: u32,
198+
number_logical_blocks: u32,
199+
starting_lba: u64,
200+
) -> Self {
201+
Self { context_attributes, number_logical_blocks, starting_lba }
202+
}
203+
204+
pub fn offset_len(&self, lba_data_size: u64) -> (ByteOffset, ByteLen) {
205+
(
206+
(self.starting_lba * lba_data_size) as ByteOffset,
207+
(self.number_logical_blocks as u64 * lba_data_size) as ByteLen,
208+
)
209+
}
210+
}
211+
179212
// Register bits
180213

181214
bitstruct! {
@@ -539,6 +572,8 @@ pub const NVM_OPC_FLUSH: u8 = 0x00;
539572
pub const NVM_OPC_WRITE: u8 = 0x01;
540573
/// Read Command Opcode
541574
pub const NVM_OPC_READ: u8 = 0x02;
575+
/// Dataset Management Command Opcode
576+
pub const NVM_OPC_DATASET_MANAGEMENT: u8 = 0x09;
542577

543578
// Generic Command Status values
544579
// See NVMe 1.0e Section 4.5.1.2.1, Figure 17 Status Code - Generic Command Status Values

lib/propolis/src/hw/nvme/cmds.rs

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
// License, v. 2.0. If a copy of the MPL was not distributed with this
33
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
44

5-
use super::bits::{self, StatusCodeType, SubmissionQueueEntry};
5+
use super::bits::{
6+
self, DatasetManagementRangeDefinition, StatusCodeType,
7+
SubmissionQueueEntry,
8+
};
69
use super::queue::{QueueCreateErr, QueueId};
710
use crate::block;
811
use crate::common::*;
@@ -678,6 +681,8 @@ pub enum NvmCmd {
678681
Write(WriteCmd),
679682
/// Read data and metadata
680683
Read(ReadCmd),
684+
/// Dataset Management Command
685+
DatasetManagement(DatasetManagementCmd),
681686
/// An unknown NVM command
682687
Unknown(GuestData<SubmissionQueueEntry>),
683688
}
@@ -709,6 +714,17 @@ impl NvmCmd {
709714
prp1: raw.prp1,
710715
prp2: raw.prp2,
711716
}),
717+
bits::NVM_OPC_DATASET_MANAGEMENT => {
718+
NvmCmd::DatasetManagement(DatasetManagementCmd {
719+
prp1: raw.prp1,
720+
prp2: raw.prp2,
721+
// Convert from 0's based value
722+
nr: (raw.cdw10 & 0xFF) as u16 + 1,
723+
ad: raw.cdw11 & (1 << 2) != 0,
724+
idw: raw.cdw11 & (1 << 1) != 0,
725+
idr: raw.cdw11 & (1 << 0) != 0,
726+
})
727+
}
712728
_ => NvmCmd::Unknown(raw),
713729
};
714730
Ok(cmd)
@@ -779,6 +795,95 @@ impl ReadCmd {
779795
}
780796
}
781797

798+
/// Dataset Management Command Parameters
799+
#[derive(Debug)]
800+
#[allow(dead_code)]
801+
pub struct DatasetManagementCmd {
802+
/// PRP Entry 1 (PRP1)
803+
///
804+
/// Indicates a data buffer that contains the LBA range information.
805+
prp1: u64,
806+
807+
/// PRP Entry 2 (PRP2)
808+
///
809+
/// This field contains the second PRP entry that specifies the location where data should be
810+
/// transferred from (if there is a physical discontinuity).
811+
prp2: u64,
812+
813+
/// Number of Ranges (NR)
814+
///
815+
/// Indicates the number of 16 byte range sets that are specified in the command. This is a
816+
/// 0’s based value in the submitted command; this field holds the converted (1-based) count.
817+
pub nr: u16,
818+
819+
/// Attribute – Deallocate (AD)
820+
///
821+
/// If set to ‘1’ then the NVM subsystem may deallocate all provided ranges. If a read occurs
822+
/// to a deallocated range, the NVM Express subsystem shall return all zeros, all ones, or
823+
/// the last data written to the associated LBA.
824+
///
825+
/// Note: The operation of the Deallocate function is similar to the ATA DATA SET MANAGEMENT
826+
/// with Trim feature described in ACS-2 and SCSI UNMAP command described in SBC-3.
827+
ad: bool,
828+
829+
/// Attribute – Integral Dataset for Write (IDW)
830+
///
831+
/// If set to ‘1’ then the dataset should be optimized for write access as an integral unit.
832+
/// The host expects to perform operations on all ranges provided as an integral unit for
833+
/// writes, indicating that if a portion of the dataset is written it is expected that all of
834+
/// the ranges in the dataset are going to be written.
835+
idw: bool,
836+
837+
/// Attribute – Integral Dataset for Read (IDR)
838+
///
839+
/// If set to ‘1’ then the dataset should be optimized for read access as an integral unit.
840+
/// The host expects to perform operations on all ranges provided as an integral unit for
841+
/// reads, indicating that if a portion of the dataset is read it is expected that all of the
842+
/// ranges in the dataset are going to be read.
843+
idr: bool,
844+
}
845+
846+
impl DatasetManagementCmd {
847+
/// Returns an Iterator that yields [`GuestRegion`]'s which contain the array of LBA ranges.
848+
pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> {
849+
PrpIter::new(
850+
u64::from(self.nr)
851+
* size_of::<DatasetManagementRangeDefinition>() as u64,
852+
self.prp1,
853+
self.prp2,
854+
mem,
855+
)
856+
}
857+
858+
/// Returns an Iterator that yields the LBA ranges specified in this command. Note that if
859+
/// some of the ranges couldn't be read from guest memory, this will yield fewer than
860+
/// `self.nr` ranges.
861+
pub fn ranges<'a>(
862+
&self,
863+
mem: &'a MemCtx,
864+
) -> impl Iterator<Item = DatasetManagementRangeDefinition> + 'a {
865+
self.data(mem).flat_map(|region| {
866+
let mut ranges = Vec::new();
867+
if let Some(mapping) = mem.readable_region(&region) {
868+
ranges.resize_with(
869+
mapping.len()
870+
/ size_of::<DatasetManagementRangeDefinition>(),
871+
Default::default,
872+
);
873+
if mapping.read_many(&mut ranges).is_err() {
874+
ranges.clear();
875+
}
876+
};
877+
878+
ranges.into_iter()
879+
})
880+
}
881+
882+
pub fn is_deallocate(&self) -> bool {
883+
self.ad
884+
}
885+
}
886+
782887
/// Indicates the possible states of a [`PrpIter`].
783888
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
784889
enum PrpNext {

lib/propolis/src/hw/nvme/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,6 +846,8 @@ impl PciNvme {
846846
// Supporting multiple namespaces complicates I/O dispatching,
847847
// so for now we limit the device to a single namespace.
848848
nn: 1,
849+
// bit 2 indicates support for the Dataset Management command
850+
oncs: (1 << 2),
849851
// bit 0 indicates volatile write cache is present
850852
vwc: 1,
851853
// bit 8 indicates Doorbell Buffer support

0 commit comments

Comments
 (0)