Skip to content

Commit efc2566

Browse files
authored
nexus: provide stable PCI slot assignments for disks (#3408)
Add a `slot` column to the disk table and modify `instance_attach_disk` to set it when a disk is attached to an instance. Slots are chosen using the `NextItem` query generator that NICs use for their slot assignments. The bulk of this change is new code to combine the `NextItem` fragment with the other SQL statements that mark a disk as attached (changing its state and setting its `attach_instance_id`) in a manner that allows the resulting query fragment to be passed as the body of the SET clause of the UPDATE statement that `instance_attach_disk` passes to the `attach_resource` generic function. Slot assignments remain stable for as long as a disk is attached, but they aren't preserved if a disk is detached and reattached (the algorithm will choose the first available slot, even if the disk's previously assigned slot is available). Tests: - New integration test - On a dev cluster, created an instance with a single boot disk, then attached disks with alphabetically earlier and later names. Verified that the boot disk remained in slot 0 and that booting the instance succeeded (i.e. guest firmware was able to find the boot OS successfully). Also played around with detaching and reattaching disks and making sure via CRDB shell that their slot assignments were as expected. Fixes #3224.
1 parent fdaded2 commit efc2566

File tree

11 files changed

+378
-21
lines changed

11 files changed

+378
-21
lines changed

common/src/sql/dbinit.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -947,6 +947,7 @@ CREATE TABLE omicron.public.disk (
947947
*/
948948
attach_instance_id UUID,
949949
state_generation INT NOT NULL,
950+
slot INT2 CHECK (slot >= 0 AND slot < 8),
950951
time_state_updated TIMESTAMPTZ NOT NULL,
951952

952953
/* Disk configuration */

nexus/db-model/src/disk.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
44

55
use super::{BlockSize, ByteCount, DiskState, Generation};
6-
use crate::schema::disk;
6+
use crate::{schema::disk, unsigned::SqlU8};
77
use chrono::{DateTime, Utc};
88
use db_macros::Resource;
99
use nexus_types::external_api::params;
@@ -44,6 +44,16 @@ pub struct Disk {
4444
#[diesel(embed)]
4545
pub runtime_state: DiskRuntimeState,
4646

47+
/// The PCI slot (within the bank of slots reserved to disks) to which this
48+
/// disk should be attached if its attached instance is started, or None
49+
/// if there is no such assignment.
50+
///
51+
/// Slot assignments are managed entirely in Nexus and aren't modified by
52+
/// runtime state changes in the sled agent, so this field is part of the
53+
/// "main" disk struct and not the runtime state (even though the attachment
54+
/// state and slot assignment will often change together).
55+
pub slot: Option<SqlU8>,
56+
4757
/// size of the Disk
4858
#[diesel(column_name = size_bytes)]
4959
pub size: ByteCount,
@@ -95,6 +105,7 @@ impl Disk {
95105
project_id,
96106
volume_id,
97107
runtime_state: runtime_initial,
108+
slot: None,
98109
size: params.size.into(),
99110
block_size,
100111
create_snapshot_id,

nexus/db-model/src/schema.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ table! {
2121
attach_instance_id -> Nullable<Uuid>,
2222
state_generation -> Int8,
2323
time_state_updated -> Timestamptz,
24+
slot -> Nullable<Int2>,
2425
size_bytes -> Int8,
2526
block_size -> crate::BlockSizeEnum,
2627
origin_snapshot -> Nullable<Uuid>,

nexus/db-model/src/unsigned.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,57 @@ use diesel::sql_types;
1010
use serde::{Deserialize, Serialize};
1111
use std::convert::TryFrom;
1212

13+
/// Representation of a [`u8`] in the database. The database does not support
14+
/// unsigned types; this adapter converts from the database's INT2 (a 2-byte
15+
/// signed integer) to an actual u8.
16+
#[derive(
17+
Copy,
18+
Clone,
19+
Debug,
20+
AsExpression,
21+
Eq,
22+
Ord,
23+
PartialEq,
24+
PartialOrd,
25+
FromSqlRow,
26+
Serialize,
27+
Deserialize,
28+
)]
29+
#[diesel(sql_type = sql_types::Int2)]
30+
#[repr(transparent)]
31+
pub struct SqlU8(pub u8);
32+
33+
NewtypeFrom! { () pub struct SqlU8(u8); }
34+
NewtypeDeref! { () pub struct SqlU8(u8); }
35+
36+
impl SqlU8 {
37+
pub fn new(value: u8) -> Self {
38+
Self(value)
39+
}
40+
}
41+
42+
impl ToSql<sql_types::Int2, Pg> for SqlU8 {
43+
fn to_sql<'a>(
44+
&'a self,
45+
out: &mut serialize::Output<'a, '_, Pg>,
46+
) -> serialize::Result {
47+
<i16 as ToSql<sql_types::Int2, Pg>>::to_sql(
48+
&i16::from(self.0),
49+
&mut out.reborrow(),
50+
)
51+
}
52+
}
53+
54+
impl<DB> FromSql<sql_types::Int2, DB> for SqlU8
55+
where
56+
DB: Backend,
57+
i16: FromSql<sql_types::Int2, DB>,
58+
{
59+
fn from_sql(bytes: DB::RawValue<'_>) -> deserialize::Result<Self> {
60+
u8::try_from(i16::from_sql(bytes)?).map(SqlU8).map_err(|e| e.into())
61+
}
62+
}
63+
1364
/// Representation of a [`u16`] in the database.
1465
/// We need this because the database does not support unsigned types.
1566
/// This handles converting from the database's INT4 to the actual u16.

nexus/db-queries/src/db/datastore/disk.rs

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use crate::db::model::Instance;
2626
use crate::db::model::Name;
2727
use crate::db::model::Project;
2828
use crate::db::pagination::paginated;
29+
use crate::db::queries::disk::DiskSetClauseForAttach;
2930
use crate::db::update_and_check::UpdateAndCheck;
3031
use crate::db::update_and_check::UpdateStatus;
3132
use async_bb8_diesel::AsyncRunQueryDsl;
@@ -183,26 +184,22 @@ impl DataStore {
183184
db::model::InstanceState(api::external::InstanceState::Stopped),
184185
];
185186

186-
let attached_label =
187-
api::external::DiskState::Attached(authz_instance.id()).label();
187+
let attach_update = DiskSetClauseForAttach::new(authz_instance.id());
188188

189-
let (instance, disk) = Instance::attach_resource(
189+
let query = Instance::attach_resource(
190190
authz_instance.id(),
191191
authz_disk.id(),
192-
instance::table
193-
.into_boxed()
194-
.filter(instance::dsl::state.eq_any(ok_to_attach_instance_states)),
195-
disk::table
196-
.into_boxed()
197-
.filter(disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels)),
192+
instance::table.into_boxed().filter(
193+
instance::dsl::state.eq_any(ok_to_attach_instance_states),
194+
),
195+
disk::table.into_boxed().filter(
196+
disk::dsl::disk_state.eq_any(ok_to_attach_disk_state_labels),
197+
),
198198
max_disks,
199-
diesel::update(disk::dsl::disk)
200-
.set((
201-
disk::dsl::disk_state.eq(attached_label),
202-
disk::dsl::attach_instance_id.eq(authz_instance.id())
203-
))
204-
)
205-
.attach_and_get_result_async(self.pool_authorized(opctx).await?)
199+
diesel::update(disk::dsl::disk).set(attach_update),
200+
);
201+
202+
let (instance, disk) = query.attach_and_get_result_async(self.pool_authorized(opctx).await?)
206203
.await
207204
.or_else(|e| {
208205
match e {
@@ -330,7 +327,8 @@ impl DataStore {
330327
diesel::update(disk::dsl::disk)
331328
.set((
332329
disk::dsl::disk_state.eq(detached_label),
333-
disk::dsl::attach_instance_id.eq(Option::<Uuid>::None)
330+
disk::dsl::attach_instance_id.eq(Option::<Uuid>::None),
331+
disk::dsl::slot.eq(Option::<i16>::None)
334332
))
335333
)
336334
.detach_and_get_result_async(self.pool_authorized(opctx).await?)

nexus/db-queries/src/db/datastore/instance.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ impl DataStore {
285285
diesel::update(disk::dsl::disk).set((
286286
disk::dsl::disk_state.eq(detached_label),
287287
disk::dsl::attach_instance_id.eq(Option::<Uuid>::None),
288+
disk::dsl::slot.eq(Option::<i16>::None),
288289
)),
289290
)
290291
.detach_and_get_result_async(self.pool_authorized(opctx).await?)
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! Helper queries for working with disks.
6+
7+
use crate::db;
8+
use crate::db::queries::next_item::{DefaultShiftGenerator, NextItem};
9+
use diesel::{
10+
pg::Pg,
11+
query_builder::{AstPass, QueryFragment, QueryId},
12+
sql_types, Column, QueryResult,
13+
};
14+
use uuid::Uuid;
15+
16+
/// The maximum number of disks that can be attached to an instance.
17+
//
18+
// This is defined here for layering reasons: the main Nexus crate depends on
19+
// the db-queries crate, so the disk-per-instance limit lives here and Nexus
20+
// proper re-exports it.
21+
pub const MAX_DISKS_PER_INSTANCE: u32 = 8;
22+
23+
/// A wrapper for the query that selects a PCI slot for a newly-attached disk.
24+
///
25+
/// The general idea is to produce a query that left joins a single-column table
26+
/// containing the sequence [0, 1, 2, ..., MAX_DISKS] against the disk table,
27+
/// filtering for disks that are attached to the instance of interest and whose
28+
/// currently-assigned slots match the slots in the sequence. Filtering this
29+
/// query to just those rows where a slot has no disk attached yields the first
30+
/// available slot number. If no slots are available, the returned value is
31+
/// MAX_DISKS, which is not a valid slot number and which will be rejected when
32+
/// attempting to update the row (the "slot" column has a CHECK clause that
33+
/// enforces this at the database level).
34+
///
35+
/// See the `NextItem` documentation for more details.
36+
#[derive(Debug, Clone, Copy)]
37+
struct NextDiskSlot {
38+
inner: NextItem<
39+
db::schema::disk::table,
40+
i16,
41+
db::schema::disk::dsl::slot,
42+
Uuid,
43+
db::schema::disk::dsl::attach_instance_id,
44+
>,
45+
}
46+
47+
impl NextDiskSlot {
48+
fn new(instance_id: Uuid) -> Self {
49+
let generator = DefaultShiftGenerator {
50+
base: 0,
51+
max_shift: i64::try_from(MAX_DISKS_PER_INSTANCE).unwrap(),
52+
min_shift: 0,
53+
};
54+
Self { inner: NextItem::new_scoped(generator, instance_id) }
55+
}
56+
}
57+
58+
impl QueryId for NextDiskSlot {
59+
type QueryId = ();
60+
const HAS_STATIC_QUERY_ID: bool = false;
61+
}
62+
63+
impl QueryFragment<Pg> for NextDiskSlot {
64+
fn walk_ast<'a>(&'a self, mut out: AstPass<'_, 'a, Pg>) -> QueryResult<()> {
65+
self.inner.walk_ast(out.reborrow())?;
66+
Ok(())
67+
}
68+
}
69+
70+
/// Produces a query fragment containing the SET clause to use in the UPDATE
71+
/// statement when attaching a disk to an instance.
72+
///
73+
/// The expected form of the statement is
74+
///
75+
/// ```sql
76+
/// SET attach_instance_id = instance_id,
77+
/// disk_state = 'attached',
78+
/// slot = (SELECT 0 + shift AS slot FROM
79+
/// (SELECT generate_series(0, 8) AS shift
80+
/// UNION ALL
81+
/// SELECT generate_series(0, -1) AS shift)
82+
/// LEFT OUTER JOIN disk
83+
/// ON (attach_instance_id, slot, time_deleted IS NULL) =
84+
/// (instance_id, 0 + shift, TRUE)
85+
/// WHERE slot IS NULL
86+
/// LIMIT 1)
87+
/// ```
88+
///
89+
/// This fragment can be passed to an `attach_resource` operation by supplying
90+
/// it as the argument to a `set`, e.g.
91+
/// `diesel::update(disk::dsl::disk).set(DiskSetClauseForAttach::new(instance_id))`.
92+
#[derive(Debug, Clone)]
93+
pub struct DiskSetClauseForAttach {
94+
attach_instance_id: Uuid,
95+
next_slot: NextDiskSlot,
96+
}
97+
98+
impl DiskSetClauseForAttach {
99+
pub fn new(instance_id: Uuid) -> Self {
100+
Self {
101+
attach_instance_id: instance_id,
102+
next_slot: NextDiskSlot::new(instance_id),
103+
}
104+
}
105+
}
106+
107+
impl QueryId for DiskSetClauseForAttach {
108+
type QueryId = ();
109+
const HAS_STATIC_QUERY_ID: bool = false;
110+
}
111+
112+
impl QueryFragment<Pg> for DiskSetClauseForAttach {
113+
fn walk_ast<'a>(&'a self, mut out: AstPass<'_, 'a, Pg>) -> QueryResult<()> {
114+
let attached_label =
115+
omicron_common::api::external::DiskState::Attached(
116+
self.attach_instance_id,
117+
)
118+
.label();
119+
120+
// Ideally this code would not have to handcraft the entire SET
121+
// statement. It would be cleaner to write something like
122+
//
123+
// diesel::update(disk::dsl::disk).set(
124+
// (disk::dsl::disk::attach_instance_id.eq(attach_instance_id),
125+
// disk::dsl::disk::disk_state.eq(attached_label),
126+
// disk::dsl::disk::slot.eq(next_slot_query))
127+
//
128+
// where `next_slot_query` is a wrapper around the `NextItem` query
129+
// above that yields the slot-choosing SELECT statement. Diesel provides
130+
// a `single_value` adapter function that's supposed to do this: given
131+
// a query that returns a single column, it appends a LIMIT 1 to the
132+
// query and parenthesizes it so that it can be used as the argument to
133+
// a SET statement. The sticky bit is the trait bounds: SingleValueDsl
134+
// is implemented for SelectQuery + LimitDsl, and while it's easy enough
135+
// to impl SelectQuery for NextDiskSlot, implementing LimitDsl is harder
136+
// (and seems to be discouraged by the trait docs)--it is natively
137+
// implemented for diesel::Table, but here there's no table type, in
138+
// part because the left side of the join is synthesized from thin air
139+
// by the NextItem query fragment.
140+
//
141+
// Rather than spar extensively with Diesel over these details, just
142+
// implement the whole SET by hand.
143+
out.push_identifier(db::schema::disk::dsl::attach_instance_id::NAME)?;
144+
out.push_sql(" = ");
145+
out.push_bind_param::<sql_types::Uuid, Uuid>(&self.attach_instance_id)?;
146+
out.push_sql(", ");
147+
out.push_identifier(db::schema::disk::dsl::disk_state::NAME)?;
148+
out.push_sql(" = ");
149+
out.push_bind_param::<sql_types::Text, str>(attached_label)?;
150+
out.push_sql(", ");
151+
out.push_identifier(db::schema::disk::dsl::slot::NAME)?;
152+
out.push_sql(" = (");
153+
self.next_slot.walk_ast(out.reborrow())?;
154+
out.push_sql(")");
155+
156+
Ok(())
157+
}
158+
}
159+
160+
// Required to pass `DiskSetClauseForAttach` to `set`.
161+
impl diesel::query_builder::AsChangeset for DiskSetClauseForAttach {
162+
type Target = db::schema::disk::dsl::disk;
163+
type Changeset = Self;
164+
165+
fn as_changeset(self) -> Self::Changeset {
166+
self
167+
}
168+
}

nexus/db-queries/src/db/queries/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
//! Specialized queries for inserting database records, usually to maintain
66
//! complex invariants that are most accurately expressed in a single query.
77
8+
pub mod disk;
89
pub mod external_ip;
910
pub mod ip_pool;
1011
#[macro_use]

nexus/src/app/instance.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -625,12 +625,30 @@ impl super::Nexus {
625625
.await?;
626626

627627
let mut disk_reqs = vec![];
628-
for (i, disk) in disks.iter().enumerate() {
628+
for disk in &disks {
629+
// Disks that are attached to an instance should always have a slot
630+
// assignment, but if for some reason this one doesn't, return an
631+
// error instead of taking down the whole process.
632+
let slot = match disk.slot {
633+
Some(s) => s,
634+
None => {
635+
error!(self.log, "attached disk has no PCI slot assignment";
636+
"disk_id" => %disk.id(),
637+
"disk_name" => disk.name().to_string(),
638+
"instance" => ?disk.runtime_state.attach_instance_id);
639+
640+
return Err(Error::internal_error(&format!(
641+
"disk {} is attached but has no PCI slot assignment",
642+
disk.id()
643+
)));
644+
}
645+
};
646+
629647
let volume =
630648
self.db_datastore.volume_checkout(disk.volume_id).await?;
631649
disk_reqs.push(sled_agent_client::types::DiskRequest {
632650
name: disk.name().to_string(),
633-
slot: sled_agent_client::types::Slot(i as u8),
651+
slot: sled_agent_client::types::Slot(slot.0),
634652
read_only: false,
635653
device: "nvme".to_string(),
636654
volume_construction_request: serde_json::from_str(

nexus/src/app/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ pub mod sagas;
6666
// TODO: When referring to API types, we should try to include
6767
// the prefix unless it is unambiguous.
6868

69-
pub(crate) const MAX_DISKS_PER_INSTANCE: u32 = 8;
69+
pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE;
7070

7171
pub(crate) const MAX_NICS_PER_INSTANCE: usize = 8;
7272

0 commit comments

Comments
 (0)