Skip to content

[sled-agent] Integrate config-reconciler #8064

New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion clients/sled-agent-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ progenitor::generate_api!(
OmicronPhysicalDiskConfig = omicron_common::disk::OmicronPhysicalDiskConfig,
OmicronPhysicalDisksConfig = omicron_common::disk::OmicronPhysicalDisksConfig,
OmicronSledConfig = nexus_sled_agent_shared::inventory::OmicronSledConfig,
OmicronSledConfigResult = nexus_sled_agent_shared::inventory::OmicronSledConfigResult,
OmicronZoneConfig = nexus_sled_agent_shared::inventory::OmicronZoneConfig,
OmicronZoneDataset = nexus_sled_agent_shared::inventory::OmicronZoneDataset,
OmicronZoneImageSource = nexus_sled_agent_shared::inventory::OmicronZoneImageSource,
Expand Down
52 changes: 0 additions & 52 deletions dev-tools/omdb/src/bin/omdb/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,6 @@ enum SledAgentCommands {
#[clap(subcommand)]
Zones(ZoneCommands),

/// print information about zpools
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you expecting that inventory will supplant this info? Or are you planning on replacing this access to the sled agent later?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was expecting that inventory would supplant this. (I think maybe it already has, in practice? I definitely only look at inventory when I'm curious about zpools; I don't think I've ever used these omdb subcommands.)

#[clap(subcommand)]
Zpools(ZpoolCommands),

/// print information about datasets
#[clap(subcommand)]
Datasets(DatasetCommands),

/// print information about the local bootstore node
#[clap(subcommand)]
Bootstore(BootstoreCommands),
Expand Down Expand Up @@ -97,12 +89,6 @@ impl SledAgentArgs {
SledAgentCommands::Zones(ZoneCommands::List) => {
cmd_zones_list(&client).await
}
SledAgentCommands::Zpools(ZpoolCommands::List) => {
cmd_zpools_list(&client).await
}
SledAgentCommands::Datasets(DatasetCommands::List) => {
cmd_datasets_list(&client).await
}
SledAgentCommands::Bootstore(BootstoreCommands::Status) => {
cmd_bootstore_status(&client).await
}
Expand All @@ -129,44 +115,6 @@ async fn cmd_zones_list(
Ok(())
}

/// Handler for `omdb sled-agent zpools list`.
///
/// Requests the zpool list from the sled agent and prints it to stdout, one
/// `Debug`-formatted entry per line (or `<none>` when the list is empty).
///
/// # Errors
///
/// Returns the request error, annotated with "listing zpools", if the
/// sled-agent call fails.
async fn cmd_zpools_list(
    client: &sled_agent_client::Client,
) -> Result<(), anyhow::Error> {
    let zpools = client
        .zpools_get()
        .await
        .context("listing zpools")?
        .into_inner();

    println!("zpools:");
    if zpools.is_empty() {
        println!(" <none>");
    } else {
        for zpool in &zpools {
            println!(" {zpool:?}");
        }
    }

    Ok(())
}

/// Handler for `omdb sled-agent datasets list`.
///
/// Requests the dataset configuration from the sled agent, prints its
/// generation, and then prints each dataset on its own line using its `Debug`
/// representation (or `<none>` when there are no datasets).
///
/// # Errors
///
/// Returns the request error, annotated with "listing datasets", if the
/// sled-agent call fails.
async fn cmd_datasets_list(
    client: &sled_agent_client::Client,
) -> Result<(), anyhow::Error> {
    let config = client
        .datasets_get()
        .await
        .context("listing datasets")?
        .into_inner();

    println!("dataset configuration @ generation {}:", config.generation);

    let datasets = config.datasets;
    if datasets.is_empty() {
        println!(" <none>");
    } else {
        for dataset in &datasets {
            println!(" {dataset:?}");
        }
    }

    Ok(())
}

/// Runs `omdb sled-agent bootstore status`
async fn cmd_bootstore_status(
client: &sled_agent_client::Client,
Expand Down
1 change: 0 additions & 1 deletion dev-tools/omdb/tests/test_all_output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ async fn test_omdb_usage_errors() {
&["nexus", "sleds"],
&["sled-agent"],
&["sled-agent", "zones"],
&["sled-agent", "zpools"],
&["oximeter", "--help"],
&["oxql", "--help"],
// Misspelled argument
Expand Down
28 changes: 0 additions & 28 deletions dev-tools/omdb/tests/usage_errors.out
Original file line number Diff line number Diff line change
Expand Up @@ -909,8 +909,6 @@ Usage: omdb sled-agent [OPTIONS] <COMMAND>

Commands:
zones print information about zones
zpools print information about zpools
datasets print information about datasets
bootstore print information about the local bootstore node
help Print this message or the help of the given subcommand(s)

Expand Down Expand Up @@ -949,32 +947,6 @@ Connection Options:
--sled-agent-url <SLED_AGENT_URL> URL of the Sled internal API [env: OMDB_SLED_AGENT_URL=]
--dns-server <DNS_SERVER> [env: OMDB_DNS_SERVER=]

Safety Options:
-w, --destructive Allow potentially-destructive subcommands
=============================================
EXECUTING COMMAND: omdb ["sled-agent", "zpools"]
termination: Exited(2)
---------------------------------------------
stdout:
---------------------------------------------
stderr:
print information about zpools

Usage: omdb sled-agent zpools [OPTIONS] <COMMAND>

Commands:
list Print list of all zpools managed by the sled agent
help Print this message or the help of the given subcommand(s)

Options:
--log-level <LOG_LEVEL> log level filter [env: LOG_LEVEL=] [default: warn]
--color <COLOR> Color output [default: auto] [possible values: auto, always, never]
-h, --help Print help

Connection Options:
--sled-agent-url <SLED_AGENT_URL> URL of the Sled internal API [env: OMDB_SLED_AGENT_URL=]
--dns-server <DNS_SERVER> [env: OMDB_DNS_SERVER=]

Safety Options:
-w, --destructive Allow potentially-destructive subcommands
=============================================
Expand Down
59 changes: 45 additions & 14 deletions nexus-sled-agent-shared/src/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,7 @@ use omicron_common::{
external::{ByteCount, Generation},
internal::shared::{NetworkInterface, SourceNatConfig},
},
disk::{
DatasetConfig, DatasetManagementStatus, DiskManagementStatus,
DiskVariant, OmicronPhysicalDiskConfig,
},
disk::{DatasetConfig, DiskVariant, OmicronPhysicalDiskConfig},
zpool_name::ZpoolName,
};
use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid};
Expand Down Expand Up @@ -131,6 +128,50 @@ pub struct ConfigReconcilerInventory {
pub zones: BTreeMap<OmicronZoneUuid, ConfigReconcilerInventoryResult>,
}

impl ConfigReconcilerInventory {
    /// Iterate over the zones that the last reconciliation reported as
    /// running.
    ///
    /// A zone is yielded only if its entry in `zones` is
    /// [`ConfigReconcilerInventoryResult::Ok`] *and* its ID is present in
    /// `last_reconciled_config`; zones whose result is an error, or whose ID
    /// has no matching config, are skipped.
    pub fn running_omicron_zones(
        &self,
    ) -> impl Iterator<Item = &OmicronZoneConfig> {
        self.zones.iter().filter_map(|(zone_id, result)| match result {
            ConfigReconcilerInventoryResult::Ok => {
                self.last_reconciled_config.zones.get(zone_id)
            }
            ConfigReconcilerInventoryResult::Err { .. } => None,
        })
    }

    /// Build the reconciler result that sled-agent could have emitted had
    /// reconciliation of `config` fully succeeded: every disk, dataset, and
    /// zone in `config` is reported as `Ok`.
    ///
    /// This method should only be used by tests and dev tools; real code should
    /// look at the actual `last_reconciliation` value from the parent
    /// [`Inventory`].
    pub fn debug_assume_success(config: OmicronSledConfig) -> Self {
        Self {
            external_disks: config
                .disks
                .iter()
                .map(|disk| (disk.id, ConfigReconcilerInventoryResult::Ok))
                .collect(),
            datasets: config
                .datasets
                .iter()
                .map(|dataset| (dataset.id, ConfigReconcilerInventoryResult::Ok))
                .collect(),
            zones: config
                .zones
                .iter()
                .map(|zone| (zone.id, ConfigReconcilerInventoryResult::Ok))
                .collect(),
            // Must be initialized last: the fields above borrow `config`,
            // and this moves it.
            last_reconciled_config: config,
        }
    }
}

#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
#[serde(tag = "result", rename_all = "snake_case")]
pub enum ConfigReconcilerInventoryResult {
Expand Down Expand Up @@ -186,8 +227,6 @@ pub enum SledRole {
}

/// Describes the set of Reconfigurator-managed configuration elements of a sled
// TODO this struct should have a generation number; at the moment, each of
// the fields has a separate one internally.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)]
pub struct OmicronSledConfig {
pub generation: Generation,
Expand Down Expand Up @@ -222,14 +261,6 @@ impl Ledgerable for OmicronSledConfig {
}
}

/// Result of the currently-synchronous `omicron_config_put` endpoint.
// Plain `//` comments are used for the notes below so that the descriptions
// generated by the `JsonSchema` derive are left untouched.
//
// The `must_use` message names this type (it previously named
// `DatasetManagementResult`, which made the compiler warning misleading).
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[must_use = "this `OmicronSledConfigResult` may contain errors, which should be handled"]
pub struct OmicronSledConfigResult {
    // Per-disk management outcomes.
    pub disks: Vec<DiskManagementStatus>,
    // Per-dataset management outcomes.
    pub datasets: Vec<DatasetManagementStatus>,
}

/// Describes the set of Omicron-managed zones running on a sled
#[derive(
Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash,
Expand Down
27 changes: 3 additions & 24 deletions nexus/inventory/src/examples.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ use gateway_client::types::SpType;
use gateway_types::rot::RotSlot;
use nexus_sled_agent_shared::inventory::Baseboard;
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory;
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult;
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus;
use nexus_sled_agent_shared::inventory::Inventory;
use nexus_sled_agent_shared::inventory::InventoryDataset;
Expand Down Expand Up @@ -643,29 +642,9 @@ pub fn sled_agent(
ledgered_sled_config: Option<OmicronSledConfig>,
) -> Inventory {
// Assume the `ledgered_sled_config` was reconciled successfully.
let last_reconciliation = ledgered_sled_config.clone().map(|config| {
let external_disks = config
.disks
.iter()
.map(|d| (d.id, ConfigReconcilerInventoryResult::Ok))
.collect();
let datasets = config
.datasets
.iter()
.map(|d| (d.id, ConfigReconcilerInventoryResult::Ok))
.collect();
let zones = config
.zones
.iter()
.map(|z| (z.id, ConfigReconcilerInventoryResult::Ok))
.collect();
ConfigReconcilerInventory {
last_reconciled_config: config,
external_disks,
datasets,
zones,
}
});
let last_reconciliation = ledgered_sled_config
.clone()
.map(ConfigReconcilerInventory::debug_assume_success);

let reconciler_status = if last_reconciliation.is_some() {
ConfigReconcilerInventoryStatus::Idle {
Expand Down
79 changes: 9 additions & 70 deletions nexus/reconfigurator/execution/src/omicron_sled_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,13 @@ use anyhow::anyhow;
use futures::StreamExt;
use futures::stream;
use nexus_db_queries::context::OpContext;
use nexus_sled_agent_shared::inventory::OmicronSledConfigResult;
use nexus_types::deployment::BlueprintSledConfig;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::SledUuid;
use slog::Logger;
use slog::info;
use slog::warn;
use slog_error_chain::InlineErrorChain;
use std::collections::BTreeMap;
use update_engine::merge_anyhow_list;

/// Idempotently ensure that the specified Omicron sled configs are deployed to
/// the corresponding sleds
Expand Down Expand Up @@ -63,13 +61,14 @@ pub(crate) async fn deploy_sled_configs(
format!("Failed to put {config:#?} to sled {sled_id}")
});
match result {
Ok(_) => None,
Err(error) => {
warn!(log, "{error:#}");
warn!(
log, "failed to put sled config";
InlineErrorChain::new(error.as_ref()),
);
Some(error)
}
Ok(result) => {
parse_config_result(result.into_inner(), &log).err()
}
}
})
.collect()
Expand All @@ -78,69 +77,6 @@ pub(crate) async fn deploy_sled_configs(
if errors.is_empty() { Ok(()) } else { Err(errors) }
}

/// Convert the per-item statuses returned by a sled config PUT into a single
/// `anyhow::Result`.
///
/// Disks are checked first: if any disk status carries an error, the failures
/// are logged and returned as one merged error, and datasets are not
/// examined. Otherwise the same check is applied to datasets. If neither has
/// failures, a success message is logged and `Ok(())` is returned.
fn parse_config_result(
    result: OmicronSledConfigResult,
    log: &Logger,
) -> anyhow::Result<()> {
    // Split disk statuses into failures and successes.
    let (failed_disks, ok_disks): (Vec<_>, Vec<_>) =
        result.disks.into_iter().partition(|disk| disk.err.is_some());

    if !failed_disks.is_empty() {
        warn!(
            log,
            "Failed to deploy disks for sled agent";
            "successfully configured disks" => ok_disks.len(),
            "failed disk configurations" => failed_disks.len(),
        );
        for failure in &failed_disks {
            warn!(log, "{failure:?}");
        }
        let disk_errors = failed_disks.into_iter().map(|disk| {
            anyhow!(
                "failed to deploy disk {:?}: {:#}",
                disk.identity,
                // Every entry in `failed_disks` satisfied `err.is_some()`
                // in the partition above, so this unwrap cannot fail.
                disk.err.unwrap(),
            )
        });
        return Err(merge_anyhow_list(disk_errors));
    }

    // Same split for dataset statuses.
    let (failed_datasets, ok_datasets): (Vec<_>, Vec<_>) = result
        .datasets
        .into_iter()
        .partition(|dataset| dataset.err.is_some());

    if !failed_datasets.is_empty() {
        warn!(
            log,
            "Failed to deploy datasets for sled agent";
            "successfully configured datasets" => ok_datasets.len(),
            "failed dataset configurations" => failed_datasets.len(),
        );
        for failure in &failed_datasets {
            warn!(log, "{failure:?}");
        }
        let dataset_errors = failed_datasets.into_iter().map(|dataset| {
            anyhow!(
                "failed to deploy dataset {}: {:#}",
                dataset.dataset_name.full_name(),
                // Every entry in `failed_datasets` satisfied `err.is_some()`
                // in the partition above, so this unwrap cannot fail.
                dataset.err.unwrap(),
            )
        });
        return Err(merge_anyhow_list(dataset_errors));
    }

    info!(
        log,
        "Successfully deployed config to sled agent";
        "successfully configured disks" => ok_disks.len(),
        "successfully configured datasets" => ok_datasets.len(),
    );
    Ok(())
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -327,6 +263,9 @@ mod tests {

// Observe the latest configuration stored on the simulated sled agent,
// and verify that this output matches the input.
//
// TODO-cleanup Simulated sled-agent should report a unified
// `OmicronSledConfig`.
let observed_disks =
sim_sled_agent.omicron_physical_disks_list().unwrap();
let observed_datasets = sim_sled_agent.datasets_config_list().unwrap();
Expand Down
Loading
Loading