From 501f74f994db8472b0bc55cda335fbefc7bb996f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 25 Apr 2025 17:24:20 -0400 Subject: [PATCH 01/10] [sled-agent] Integrate config-reconciler --- clients/sled-agent-client/src/lib.rs | 1 - dev-tools/omdb/src/bin/omdb/sled_agent.rs | 52 - nexus-sled-agent-shared/src/inventory.rs | 15 +- .../execution/src/omicron_sled_config.rs | 79 +- openapi/sled-agent.json | 313 +-- sled-agent/Cargo.toml | 1 + sled-agent/api/src/lib.rs | 31 +- sled-agent/src/artifact_store.rs | 81 +- sled-agent/src/bootstrap/bootstore_setup.rs | 23 +- sled-agent/src/bootstrap/http_entrypoints.rs | 6 +- sled-agent/src/bootstrap/pre_server.rs | 15 +- sled-agent/src/bootstrap/rack_ops.rs | 12 +- sled-agent/src/bootstrap/rss_handle.rs | 6 +- sled-agent/src/bootstrap/server.rs | 47 +- sled-agent/src/hardware_monitor.rs | 55 +- sled-agent/src/http_entrypoints.rs | 58 +- sled-agent/src/instance.rs | 114 +- sled-agent/src/instance_manager.rs | 94 +- sled-agent/src/lib.rs | 1 - sled-agent/src/long_running_tasks.rs | 111 +- sled-agent/src/probe_manager.rs | 158 +- sled-agent/src/rack_setup/service.rs | 138 +- sled-agent/src/services.rs | 1949 +---------------- sled-agent/src/sim/http_entrypoints.rs | 29 +- sled-agent/src/sim/sled_agent.rs | 24 +- sled-agent/src/sled_agent.rs | 604 ++--- sled-agent/src/storage_monitor.rs | 116 - sled-agent/src/support_bundle/logs.rs | 20 +- sled-agent/src/support_bundle/storage.rs | 146 +- sled-agent/src/zone_bundle.rs | 57 +- 30 files changed, 782 insertions(+), 3574 deletions(-) delete mode 100644 sled-agent/src/storage_monitor.rs diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index b27f3048439..5eb8c3ecafe 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -67,7 +67,6 @@ progenitor::generate_api!( OmicronPhysicalDiskConfig = omicron_common::disk::OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig = omicron_common::disk::OmicronPhysicalDisksConfig, OmicronSledConfig = nexus_sled_agent_shared::inventory::OmicronSledConfig, - OmicronSledConfigResult = nexus_sled_agent_shared::inventory::OmicronSledConfigResult, OmicronZoneConfig = nexus_sled_agent_shared::inventory::OmicronZoneConfig, OmicronZoneDataset = nexus_sled_agent_shared::inventory::OmicronZoneDataset, OmicronZoneImageSource = nexus_sled_agent_shared::inventory::OmicronZoneImageSource, diff --git a/dev-tools/omdb/src/bin/omdb/sled_agent.rs b/dev-tools/omdb/src/bin/omdb/sled_agent.rs index d02203e052b..fbd6260bd8f 100644 --- a/dev-tools/omdb/src/bin/omdb/sled_agent.rs +++ b/dev-tools/omdb/src/bin/omdb/sled_agent.rs @@ -34,14 +34,6 @@ enum SledAgentCommands { #[clap(subcommand)] Zones(ZoneCommands), - /// print information about zpools - #[clap(subcommand)] - Zpools(ZpoolCommands), - - /// print information about datasets - #[clap(subcommand)] - Datasets(DatasetCommands), - /// print information about the local bootstore node #[clap(subcommand)] Bootstore(BootstoreCommands), @@ -97,12 +89,6 @@ impl SledAgentArgs { SledAgentCommands::Zones(ZoneCommands::List) => { cmd_zones_list(&client).await } - SledAgentCommands::Zpools(ZpoolCommands::List) => { - cmd_zpools_list(&client).await - } - SledAgentCommands::Datasets(DatasetCommands::List) => { - cmd_datasets_list(&client).await - } SledAgentCommands::Bootstore(BootstoreCommands::Status) => { cmd_bootstore_status(&client).await } @@ -129,44 +115,6 @@ async fn cmd_zones_list( Ok(()) } -/// Runs `omdb sled-agent zpools list` -async fn cmd_zpools_list( - 
client: &sled_agent_client::Client, -) -> Result<(), anyhow::Error> { - let response = client.zpools_get().await.context("listing zpools")?; - let zpools = response.into_inner(); - - println!("zpools:"); - if zpools.is_empty() { - println!(" "); - } - for zpool in &zpools { - println!(" {:?}", zpool); - } - - Ok(()) -} - -/// Runs `omdb sled-agent datasets list` -async fn cmd_datasets_list( - client: &sled_agent_client::Client, -) -> Result<(), anyhow::Error> { - let response = client.datasets_get().await.context("listing datasets")?; - let response = response.into_inner(); - - println!("dataset configuration @ generation {}:", response.generation); - let datasets = response.datasets; - - if datasets.is_empty() { - println!(" "); - } - for dataset in &datasets { - println!(" {:?}", dataset); - } - - Ok(()) -} - /// Runs `omdb sled-agent bootstore status` async fn cmd_bootstore_status( client: &sled_agent_client::Client, diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 4b1b525e0e6..8495c774c49 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -18,10 +18,7 @@ use omicron_common::{ external::{ByteCount, Generation}, internal::shared::{NetworkInterface, SourceNatConfig}, }, - disk::{ - DatasetConfig, DatasetManagementStatus, DiskManagementStatus, - DiskVariant, OmicronPhysicalDiskConfig, - }, + disk::{DatasetConfig, DiskVariant, OmicronPhysicalDiskConfig}, zpool_name::ZpoolName, }; use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid}; @@ -186,8 +183,6 @@ pub enum SledRole { } /// Describes the set of Reconfigurator-managed configuration elements of a sled -// TODO this struct should have a generation number; at the moment, each of -// the fields has a separete one internally. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)] pub struct OmicronSledConfig { pub generation: Generation, @@ -222,14 +217,6 @@ impl Ledgerable for OmicronSledConfig { } } -/// Result of the currently-synchronous `omicron_config_put` endpoint. 
-#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] -#[must_use = "this `DatasetManagementResult` may contain errors, which should be handled"] -pub struct OmicronSledConfigResult { - pub disks: Vec, - pub datasets: Vec, -} - /// Describes the set of Omicron-managed zones running on a sled #[derive( Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, diff --git a/nexus/reconfigurator/execution/src/omicron_sled_config.rs b/nexus/reconfigurator/execution/src/omicron_sled_config.rs index bd08e96a859..70ca8493a4a 100644 --- a/nexus/reconfigurator/execution/src/omicron_sled_config.rs +++ b/nexus/reconfigurator/execution/src/omicron_sled_config.rs @@ -10,15 +10,13 @@ use anyhow::anyhow; use futures::StreamExt; use futures::stream; use nexus_db_queries::context::OpContext; -use nexus_sled_agent_shared::inventory::OmicronSledConfigResult; use nexus_types::deployment::BlueprintSledConfig; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledUuid; -use slog::Logger; use slog::info; use slog::warn; +use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; -use update_engine::merge_anyhow_list; /// Idempotently ensure that the specified Omicron sled configs are deployed to /// the corresponding sleds @@ -63,13 +61,14 @@ pub(crate) async fn deploy_sled_configs( format!("Failed to put {config:#?} to sled {sled_id}") }); match result { + Ok(_) => None, Err(error) => { - warn!(log, "{error:#}"); + warn!( + log, "failed to put sled config"; + InlineErrorChain::new(error.as_ref()), + ); Some(error) } - Ok(result) => { - parse_config_result(result.into_inner(), &log).err() - } } }) .collect() @@ -78,69 +77,6 @@ pub(crate) async fn deploy_sled_configs( if errors.is_empty() { Ok(()) } else { Err(errors) } } -fn parse_config_result( - result: OmicronSledConfigResult, - log: &Logger, -) -> anyhow::Result<()> { - let (disk_errs, disk_successes): (Vec<_>, Vec<_>) = - result.disks.into_iter().partition(|status| status.err.is_some()); - - if !disk_errs.is_empty() { - warn!( - log, - "Failed to deploy disks for sled agent"; - "successfully configured disks" => disk_successes.len(), - "failed disk configurations" => disk_errs.len(), - ); - for err in &disk_errs { - warn!(log, "{err:?}"); - } - return Err(merge_anyhow_list(disk_errs.into_iter().map(|status| { - anyhow!( - "failed to deploy disk {:?}: {:#}", - status.identity, - // `disk_errs` was partitioned by `status.err.is_some()`, so - // this is safe to unwrap. - status.err.unwrap(), - ) - }))); - } - - let (dataset_errs, dataset_successes): (Vec<_>, Vec<_>) = - result.datasets.into_iter().partition(|status| status.err.is_some()); - - if !dataset_errs.is_empty() { - warn!( - log, - "Failed to deploy datasets for sled agent"; - "successfully configured datasets" => dataset_successes.len(), - "failed dataset configurations" => dataset_errs.len(), - ); - for err in &dataset_errs { - warn!(log, "{err:?}"); - } - return Err(merge_anyhow_list(dataset_errs.into_iter().map( - |status| { - anyhow!( - "failed to deploy dataset {}: {:#}", - status.dataset_name.full_name(), - // `dataset_errs` was partitioned by `status.err.is_some()`, - // so this is safe to unwrap. 
- status.err.unwrap(), - ) - }, - ))); - } - - info!( - log, - "Successfully deployed config to sled agent"; - "successfully configured disks" => disk_successes.len(), - "successfully configured datasets" => dataset_successes.len(), - ); - Ok(()) -} - #[cfg(test)] mod tests { use super::*; @@ -327,6 +263,9 @@ mod tests { // Observe the latest configuration stored on the simulated sled agent, // and verify that this output matches the input. + // + // TODO-cleanup Simulated sled-agent should report a unified + // `OmicronSledConfig`. let observed_disks = sim_sled_agent.omicron_physical_disks_list().unwrap(); let observed_datasets = sim_sled_agent.datasets_config_list().unwrap(); diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 4279de8605d..6f7c0b62dcd 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -334,30 +334,6 @@ } } }, - "/datasets": { - "get": { - "summary": "Lists the datasets that this sled is configured to use", - "operationId": "datasets_get", - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/DatasetsConfig" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/disks/{disk_id}": { "put": { "operationId": "disk_put", @@ -516,38 +492,8 @@ "required": true }, "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/OmicronSledConfigResult" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, - "/omicron-physical-disks": { - "get": { - "operationId": "omicron_physical_disks_get", - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" - } - } - } + "204": { + "description": "resource updated" }, "4XX": { "$ref": "#/components/responses/Error" @@ -2181,33 +2127,6 @@ } } } - }, - "/zpools": { - "get": { - "operationId": "zpools_get", - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "title": "Array_of_Zpool", - "type": "array", - "items": { - "$ref": "#/components/schemas/Zpool" - } - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } } }, "components": { @@ -3815,22 +3734,6 @@ "description": "The kind of dataset. 
See the `DatasetKind` enum in omicron-common for possible values.", "type": "string" }, - "DatasetManagementStatus": { - "description": "Identifies how a single dataset management operation may have succeeded or failed.", - "type": "object", - "properties": { - "dataset_name": { - "$ref": "#/components/schemas/DatasetName" - }, - "err": { - "nullable": true, - "type": "string" - } - }, - "required": [ - "dataset_name" - ] - }, "DatasetName": { "type": "object", "properties": { @@ -3846,29 +3749,6 @@ "pool_name" ] }, - "DatasetsConfig": { - "type": "object", - "properties": { - "datasets": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/DatasetConfig" - } - }, - "generation": { - "description": "generation number of this configuration\n\nThis generation number is owned by the control plane (i.e., RSS or Nexus, depending on whether RSS-to-Nexus handoff has happened). It should not be bumped within Sled Agent.\n\nSled Agent rejects attempts to set the configuration to a generation older than the one it's currently running.\n\nNote that \"Generation::new()\", AKA, the first generation number, is reserved for \"no datasets\". This is the default configuration for a sled before any requests have been made.", - "allOf": [ - { - "$ref": "#/components/schemas/Generation" - } - ] - } - }, - "required": [ - "datasets", - "generation" - ] - }, "DhcpConfig": { "description": "DHCP configuration for a port\n\nNot present here: Hostname (DHCPv4 option 12; used in DHCPv6 option 39); we use `InstanceRuntimeState::hostname` for this value.", "type": "object", @@ -3945,128 +3825,6 @@ "vendor" ] }, - "DiskManagementError": { - "oneOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "not_found" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "internal_disk_control_plane_request" - ] - }, - "value": { - "$ref": "#/components/schemas/TypedUuidForPhysicalDiskKind" - } - }, - "required": [ - "type", - "value" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "zpool_uuid_mismatch" - ] - }, - "value": { - "type": "object", - "properties": { - "expected": { - "$ref": "#/components/schemas/TypedUuidForZpoolKind" - }, - "observed": { - "$ref": "#/components/schemas/TypedUuidForZpoolKind" - } - }, - "required": [ - "expected", - "observed" - ] - } - }, - "required": [ - "type", - "value" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "key_manager" - ] - }, - "value": { - "type": "string" - } - }, - "required": [ - "type", - "value" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "other" - ] - }, - "value": { - "type": "string" - } - }, - "required": [ - "type", - "value" - ] - } - ] - }, - "DiskManagementStatus": { - "description": "Identifies how a single disk management operation may have succeeded or failed.", - "type": "object", - "properties": { - "err": { - "nullable": true, - "allOf": [ - { - "$ref": "#/components/schemas/DiskManagementError" - } - ] - }, - "identity": { - "$ref": "#/components/schemas/DiskIdentity" - } - }, - "required": [ - "identity" - ] - }, "DiskRuntimeState": { "description": "Runtime state of the Disk, which includes its attach state and some minimal metadata", "type": "object", @@ -4365,13 +4123,6 @@ } ] }, - "DiskType": { - "type": "string", - "enum": [ - "U2", - "M2" - ] - }, 
"DiskVariant": { "type": "string", "enum": [ @@ -5629,29 +5380,6 @@ "pool_id" ] }, - "OmicronPhysicalDisksConfig": { - "type": "object", - "properties": { - "disks": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OmicronPhysicalDiskConfig" - } - }, - "generation": { - "description": "generation number of this configuration\n\nThis generation number is owned by the control plane (i.e., RSS or Nexus, depending on whether RSS-to-Nexus handoff has happened). It should not be bumped within Sled Agent.\n\nSled Agent rejects attempts to set the configuration to a generation older than the one it's currently running.", - "allOf": [ - { - "$ref": "#/components/schemas/Generation" - } - ] - } - }, - "required": [ - "disks", - "generation" - ] - }, "OmicronSledConfig": { "description": "Describes the set of Reconfigurator-managed configuration elements of a sled", "type": "object", @@ -5684,28 +5412,6 @@ "zones" ] }, - "OmicronSledConfigResult": { - "description": "Result of the currently-synchronous `omicron_config_put` endpoint.", - "type": "object", - "properties": { - "datasets": { - "type": "array", - "items": { - "$ref": "#/components/schemas/DatasetManagementStatus" - } - }, - "disks": { - "type": "array", - "items": { - "$ref": "#/components/schemas/DiskManagementStatus" - } - } - }, - "required": [ - "datasets", - "disks" - ] - }, "OmicronZoneConfig": { "description": "Describes one Omicron-managed zone running on a sled", "type": "object", @@ -7762,21 +7468,6 @@ "version" ] }, - "Zpool": { - "type": "object", - "properties": { - "disk_type": { - "$ref": "#/components/schemas/DiskType" - }, - "id": { - "$ref": "#/components/schemas/TypedUuidForZpoolKind" - } - }, - "required": [ - "disk_type", - "id" - ] - }, "ZpoolName": { "title": "The name of a Zpool", "description": "Zpool names are of the format ox{i,p}_. 
They are either Internal or External, and should be unique", diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 1b3fb78698b..acacb4cc1d6 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -138,6 +138,7 @@ tokio-stream.workspace = true tokio-util.workspace = true illumos-utils = { workspace = true, features = ["testing"] } +sled-agent-config-reconciler = { workspace = true, features = ["testing"] } sled-storage = { workspace = true, features = ["testing"] } # diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 7bdba01e1bc..bd4242a347f 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -13,7 +13,7 @@ use dropshot::{ TypedBody, }; use nexus_sled_agent_shared::inventory::{ - Inventory, OmicronSledConfig, OmicronSledConfigResult, SledRole, + Inventory, OmicronSledConfig, SledRole, }; use omicron_common::{ api::external::Generation, @@ -24,7 +24,7 @@ use omicron_common::{ SledIdentifiers, SwitchPorts, VirtualNetworkInterfaceHost, }, }, - disk::{DatasetsConfig, DiskVariant, OmicronPhysicalDisksConfig}, + disk::DiskVariant, ledger::Ledgerable, }; use omicron_uuid_kinds::{ @@ -260,32 +260,7 @@ pub trait SledAgentApi { async fn omicron_config_put( rqctx: RequestContext, body: TypedBody, - ) -> Result, HttpError>; - - /// Lists the datasets that this sled is configured to use - #[endpoint { - method = GET, - path = "/datasets", - }] - async fn datasets_get( - rqctx: RequestContext, - ) -> Result, HttpError>; - - #[endpoint { - method = GET, - path = "/omicron-physical-disks", - }] - async fn omicron_physical_disks_get( - rqctx: RequestContext, - ) -> Result, HttpError>; - - #[endpoint { - method = GET, - path = "/zpools", - }] - async fn zpools_get( - rqctx: RequestContext, - ) -> Result>, HttpError>; + ) -> Result; #[endpoint { method = GET, diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index c2b6965a32a..e73461020d5 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -17,11 +17,11 @@ //! Operations that list or modify artifacts or the configuration are called by //! Nexus and handled by the Sled Agent API. -use std::collections::BTreeMap; use std::future::Future; use std::io::{ErrorKind, Write}; use std::net::SocketAddrV6; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use atomicwrites::{AtomicFile, OverwriteBehavior}; @@ -40,8 +40,8 @@ use sha2::{Digest, Sha256}; use sled_agent_api::{ ArtifactConfig, ArtifactListResponse, ArtifactPutResponse, }; -use sled_storage::dataset::M2_ARTIFACT_DATASET; -use sled_storage::manager::StorageHandle; +use sled_agent_config_reconciler::ConfigReconcilerHandle; +use sled_agent_config_reconciler::InternalDisksReceiver; use slog::{Logger, error, info}; use slog_error_chain::{InlineErrorChain, SlogInlineError}; use tokio::fs::File; @@ -49,8 +49,6 @@ use tokio::sync::{OwnedSemaphorePermit, mpsc, oneshot, watch}; use tokio::task::JoinSet; use tufaceous_artifact::ArtifactHash; -use crate::services::ServiceManager; - // These paths are defined under the artifact storage dataset. They // cannot conflict with any artifact paths because all artifact paths are // hexadecimal-encoded SHA-256 checksums. 
@@ -93,7 +91,7 @@ impl ArtifactStore { pub(crate) async fn new( log: &Logger, storage: T, - services: Option, + config_reconciler: Option>, ) -> ArtifactStore { let log = log.new(slog::o!("component" => "ArtifactStore")); @@ -133,7 +131,7 @@ impl ArtifactStore { tokio::task::spawn(ledger_manager( log.clone(), ledger_paths, - services, + config_reconciler, ledger_rx, config_tx, )); @@ -164,13 +162,15 @@ impl ArtifactStore { } } -impl ArtifactStore { +impl ArtifactStore { pub(crate) async fn start( - self, + self: Arc, sled_address: SocketAddrV6, dropshot_config: &ConfigDropshot, - ) -> Result>, StartError> - { + ) -> Result< + dropshot::HttpServer>>, + StartError, + > { let mut depot_address = sled_address; depot_address.set_port(REPO_DEPOT_PORT); @@ -259,7 +259,7 @@ impl ArtifactStore { /// Open an artifact file by hash from a storage handle. /// /// This is the same as [ArtifactStore::get], but can be called with only - /// a [StorageHandle]. + /// a storage implementation. pub(crate) async fn get_from_storage( storage: &T, log: &Logger, @@ -452,11 +452,11 @@ type LedgerManagerRequest = async fn ledger_manager( log: Logger, ledger_paths: Vec, - services: Option, + config_reconciler: Option>, mut rx: mpsc::Receiver, config_channel: watch::Sender>, ) { - let services = services.as_ref(); + let config_reconciler = config_reconciler.as_ref(); let handle_request = async |new_config: ArtifactConfig| { if ledger_paths.is_empty() { return Err(Error::NoUpdateDataset); @@ -466,21 +466,11 @@ async fn ledger_manager( { if new_config.generation > ledger.data().generation { // New config generation. First check that the configuration - // contains all artifacts that are presently in use. - let mut missing = BTreeMap::new(); - // Check artifacts from the current zone configuration. - if let Some(services) = services { - for zone in services.omicron_zones_list().await.zones { - if let Some(hash) = zone.image_source.artifact_hash() { - if !new_config.artifacts.contains(&hash) { - missing - .insert(hash, "current zone configuration"); - } - } - } - } - if !missing.is_empty() { - return Err(Error::InUseArtifactsMissing(missing)); + // is valid against the current ledgered sled config. + if let Some(config_reconciler) = config_reconciler { + config_reconciler + .validate_artifact_config(new_config.clone()) + .await??; } // Everything looks okay; update the ledger. @@ -614,14 +604,11 @@ pub trait DatasetsManager: Clone + Send + Sync + 'static { } } -impl DatasetsManager for StorageHandle { +impl DatasetsManager for InternalDisksReceiver { async fn artifact_storage_paths( &self, ) -> impl Iterator + '_ { - self.get_latest_disks() - .await - .all_m2_mountpoints(M2_ARTIFACT_DATASET) - .into_iter() + self.current().all_artifact_datasets().collect::>().into_iter() } } @@ -766,11 +753,11 @@ impl ArtifactWriter { } /// Implementation of the Repo Depot API backed by an -/// `ArtifactStore`. +/// `ArtifactStore`. 
enum RepoDepotImpl {} impl RepoDepotApi for RepoDepotImpl { - type Context = ArtifactStore; + type Context = Arc>; async fn artifact_get_by_sha256( rqctx: RequestContext, @@ -831,8 +818,15 @@ pub enum Error { #[error("Digest mismatch: expected {expected}, actual {actual}")] HashMismatch { expected: ArtifactHash, actual: ArtifactHash }, - #[error("Artifacts in use are not present in new config: {0:?}")] - InUseArtifactsMissing(BTreeMap), + #[error("New config is invalid per current sled config")] + InvalidPerSledConfig( + #[from] sled_agent_config_reconciler::LedgerArtifactConfigError, + ), + + #[error("Cannot validate incoming config against sled config")] + CannotValidateAgainstSledConfig( + #[from] sled_agent_config_reconciler::LedgerTaskError, + ), #[error("Blocking task failed")] Join(#[source] tokio::task::JoinError), @@ -863,7 +857,7 @@ impl From for HttpError { match err { // 4xx errors Error::HashMismatch { .. } - | Error::InUseArtifactsMissing { .. } + | Error::InvalidPerSledConfig { .. } | Error::NoConfig | Error::NotInConfig { .. } => { HttpError::for_bad_request(None, err.to_string()) @@ -894,9 +888,12 @@ impl From for HttpError { | Error::File { .. } | Error::Join(_) | Error::LedgerCommit(_) - | Error::LedgerChannel => HttpError::for_internal_error( - InlineErrorChain::new(&err).to_string(), - ), + | Error::LedgerChannel + | Error::CannotValidateAgainstSledConfig(_) => { + HttpError::for_internal_error( + InlineErrorChain::new(&err).to_string(), + ) + } } } } diff --git a/sled-agent/src/bootstrap/bootstore_setup.rs b/sled-agent/src/bootstrap/bootstore_setup.rs index 1b148290104..c9e3cefbd8e 100644 --- a/sled-agent/src/bootstrap/bootstore_setup.rs +++ b/sled-agent/src/bootstrap/bootstore_setup.rs @@ -15,7 +15,6 @@ use omicron_ddm_admin_client::Client as DdmAdminClient; use sled_hardware_types::Baseboard; use sled_hardware_types::underlay::BootstrapInterface; use sled_storage::dataset::CLUSTER_DATASET; -use sled_storage::resources::AllDisks; use slog::Logger; use std::collections::BTreeSet; use std::net::Ipv6Addr; @@ -26,7 +25,7 @@ const BOOTSTORE_FSM_STATE_FILE: &str = "bootstore-fsm-state.json"; const BOOTSTORE_NETWORK_CONFIG_FILE: &str = "bootstore-network-config.json"; pub fn new_bootstore_config( - all_disks: &AllDisks, + cluster_dataset_paths: &[Utf8PathBuf], baseboard: Baseboard, global_zone_bootstrap_ip: Ipv6Addr, ) -> Result { @@ -37,19 +36,20 @@ pub fn new_bootstore_config( learn_timeout: Duration::from_secs(5), rack_init_timeout: Duration::from_secs(300), rack_secret_request_timeout: Duration::from_secs(5), - fsm_state_ledger_paths: bootstore_fsm_state_paths(&all_disks)?, + fsm_state_ledger_paths: bootstore_fsm_state_paths( + cluster_dataset_paths, + )?, network_config_ledger_paths: bootstore_network_config_paths( - &all_disks, + cluster_dataset_paths, )?, }) } fn bootstore_fsm_state_paths( - all_disks: &AllDisks, + cluster_dataset_paths: &[Utf8PathBuf], ) -> Result, StartError> { - let paths: Vec<_> = all_disks - .all_m2_mountpoints(CLUSTER_DATASET) - .into_iter() + let paths: Vec<_> = cluster_dataset_paths + .iter() .map(|p| p.join(BOOTSTORE_FSM_STATE_FILE)) .collect(); @@ -60,11 +60,10 @@ fn bootstore_fsm_state_paths( } fn bootstore_network_config_paths( - all_disks: &AllDisks, + cluster_dataset_paths: &[Utf8PathBuf], ) -> Result, StartError> { - let paths: Vec<_> = all_disks - .all_m2_mountpoints(CLUSTER_DATASET) - .into_iter() + let paths: Vec<_> = cluster_dataset_paths + .iter() .map(|p| p.join(BOOTSTORE_NETWORK_CONFIG_FILE)) .collect(); diff --git 
a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index 44bd09387ff..12a986cf2fb 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -23,10 +23,10 @@ use dropshot::{ use omicron_common::api::external::Error; use omicron_uuid_kinds::RackInitUuid; use omicron_uuid_kinds::RackResetUuid; +use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::rack_init::RackInitializeRequest; use sled_agent_types::rack_ops::RackOperationStatus; use sled_hardware_types::Baseboard; -use sled_storage::manager::StorageHandle; use slog::Logger; use slog_error_chain::InlineErrorChain; use sprockets_tls::keys::SprocketsConfig; @@ -37,7 +37,7 @@ use tokio::sync::{mpsc, oneshot}; pub(crate) struct BootstrapServerContext { pub(crate) base_log: Logger, pub(crate) global_zone_bootstrap_ip: Ipv6Addr, - pub(crate) storage_manager: StorageHandle, + pub(crate) internal_disks_rx: InternalDisksReceiver, pub(crate) bootstore_node_handle: bootstore::NodeHandle, pub(crate) baseboard: Baseboard, pub(crate) rss_access: RssAccess, @@ -56,7 +56,7 @@ impl BootstrapServerContext { &self.base_log, self.sprockets.clone(), self.global_zone_bootstrap_ip, - &self.storage_manager, + &self.internal_disks_rx, &self.bootstore_node_handle, request, ) diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index d5b7553d54b..546a523e3e0 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -20,7 +20,6 @@ use crate::long_running_tasks::{ LongRunningTaskHandles, spawn_all_longrunning_tasks, }; use crate::services::ServiceManager; -use crate::services::TimeSyncConfig; use crate::sled_agent::SledAgent; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; @@ -134,23 +133,17 @@ impl BootstrapAgentStartup { let global_zone_bootstrap_ip = startup_networking.global_zone_bootstrap_ip; - let time_sync = if let Some(true) = config.skip_timesync { - TimeSyncConfig::Skip - } else { - TimeSyncConfig::Normal - }; - let service_manager = ServiceManager::new( &base_log, ddm_reconciler, startup_networking, sled_mode, - time_sync, config.sidecar_revision.clone(), config.switch_zone_maghemite_links.clone(), - long_running_task_handles.storage_manager.clone(), - long_running_task_handles.zone_bundler.clone(), - long_running_task_handles.zone_image_resolver.clone(), + long_running_task_handles + .config_reconciler + .internal_disks_rx() + .clone(), ); // Inform the hardware monitor that the service manager is ready diff --git a/sled-agent/src/bootstrap/rack_ops.rs b/sled-agent/src/bootstrap/rack_ops.rs index 2be59fd5880..db2e79d3ec9 100644 --- a/sled-agent/src/bootstrap/rack_ops.rs +++ b/sled-agent/src/bootstrap/rack_ops.rs @@ -9,9 +9,9 @@ use crate::rack_setup::service::SetupServiceError; use bootstore::schemes::v0 as bootstore; use omicron_uuid_kinds::RackInitUuid; use omicron_uuid_kinds::RackResetUuid; +use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::rack_init::RackInitializeRequest; use sled_agent_types::rack_ops::{RackOperationStatus, RssStep}; -use sled_storage::manager::StorageHandle; use slog::Logger; use sprockets_tls::keys::SprocketsConfig; use std::mem; @@ -146,7 +146,7 @@ impl RssAccess { parent_log: &Logger, sprockets: SprocketsConfig, global_zone_bootstrap_ip: Ipv6Addr, - storage_manager: &StorageHandle, + internal_disks_rx: &InternalDisksReceiver, bootstore_node_handle: &bootstore::NodeHandle, request: 
RackInitializeRequest, ) -> Result { @@ -182,7 +182,7 @@ impl RssAccess { *status = RssStatus::Initializing { id, completion, step_rx }; mem::drop(status); let parent_log = parent_log.clone(); - let storage_manager = storage_manager.clone(); + let internal_disks_rx = internal_disks_rx.clone(); let bootstore_node_handle = bootstore_node_handle.clone(); let status = Arc::clone(&self.status); tokio::spawn(async move { @@ -190,7 +190,7 @@ impl RssAccess { &parent_log, sprockets, global_zone_bootstrap_ip, - storage_manager, + internal_disks_rx, bootstore_node_handle, request, step_tx, @@ -328,7 +328,7 @@ async fn rack_initialize( parent_log: &Logger, sprockets: SprocketsConfig, global_zone_bootstrap_ip: Ipv6Addr, - storage_manager: StorageHandle, + internal_disks_rx: InternalDisksReceiver, bootstore_node_handle: bootstore::NodeHandle, request: RackInitializeRequest, step_tx: watch::Sender, @@ -338,7 +338,7 @@ async fn rack_initialize( sprockets, request, global_zone_bootstrap_ip, - storage_manager, + internal_disks_rx, bootstore_node_handle, step_tx, ) diff --git a/sled-agent/src/bootstrap/rss_handle.rs b/sled-agent/src/bootstrap/rss_handle.rs index efddfa2aa25..f3872b90feb 100644 --- a/sled-agent/src/bootstrap/rss_handle.rs +++ b/sled-agent/src/bootstrap/rss_handle.rs @@ -14,10 +14,10 @@ use futures::stream::FuturesUnordered; use omicron_common::backoff::BackoffError; use omicron_common::backoff::retry_notify; use omicron_common::backoff::retry_policy_local; +use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::rack_init::RackInitializeRequest; use sled_agent_types::rack_ops::RssStep; use sled_agent_types::sled::StartSledAgentRequest; -use sled_storage::manager::StorageHandle; use slog::Logger; use sprockets_tls::keys::SprocketsConfig; use std::net::Ipv6Addr; @@ -50,7 +50,7 @@ impl RssHandle { sprockets: SprocketsConfig, config: RackInitializeRequest, our_bootstrap_address: Ipv6Addr, - storage_manager: StorageHandle, + internal_disks_rx: InternalDisksReceiver, bootstore: bootstore::NodeHandle, step_tx: watch::Sender, ) -> Result<(), SetupServiceError> { @@ -59,7 +59,7 @@ impl RssHandle { let rss = RackSetupService::new( log.new(o!("component" => "RSS")), config, - storage_manager, + internal_disks_rx, tx, bootstore, step_tx, diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 288f35d2a33..9788bb943ad 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -39,11 +39,11 @@ use omicron_ddm_admin_client::DdmError; use omicron_ddm_admin_client::types::EnableStatsRequest; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::RackInitUuid; +use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::rack_init::RackInitializeRequest; use sled_agent_types::sled::StartSledAgentRequest; use sled_hardware::underlay; use sled_storage::dataset::CONFIG_DATASET; -use sled_storage::manager::StorageHandle; use slog::Logger; use std::io; use std::net::SocketAddr; @@ -183,9 +183,11 @@ impl Server { } = BootstrapAgentStartup::run(config).await?; // Do we have a StartSledAgentRequest stored in the ledger? 
- let paths = - sled_config_paths(&long_running_task_handles.storage_manager) - .await?; + let internal_disks_rx = long_running_task_handles + .config_reconciler + .internal_disks_rx() + .clone(); + let paths = sled_config_paths(&internal_disks_rx).await?; let maybe_ledger = Ledger::::new(&startup_log, paths).await; @@ -204,7 +206,7 @@ impl Server { let bootstrap_context = BootstrapServerContext { base_log: base_log.clone(), global_zone_bootstrap_ip, - storage_manager: long_running_task_handles.storage_manager.clone(), + internal_disks_rx, bootstore_node_handle: long_running_task_handles.bootstore.clone(), baseboard: long_running_task_handles.hardware_manager.baseboard(), rss_access, @@ -257,11 +259,6 @@ impl Server { .map_err(|_| ()) .expect("Failed to send to StorageMonitor"); - // For cold boot specifically, we now need to load the services - // we're responsible for, while continuing to handle hardware - // notifications. This cannot fail: we retry indefinitely until - // we're done loading services. - sled_agent.load_services().await; SledAgentState::ServerStarted(sled_agent_server) } else { SledAgentState::Bootstrapping(Some(sled_agent_started_tx)) @@ -386,9 +383,6 @@ async fn start_sled_agent( } } - // Inform the storage service that the key manager is available - long_running_task_handles.storage_manager.key_manager_ready().await; - // Inform our DDM reconciler of our underlay subnet and the information it // needs for maghemite to enable Oximeter stats. let ddm_reconciler = service_manager.ddm_reconciler(); @@ -413,8 +407,10 @@ async fn start_sled_agent( // Record this request so the sled agent can be automatically // initialized on the next boot. - let paths = - sled_config_paths(&long_running_task_handles.storage_manager).await?; + let paths = sled_config_paths( + long_running_task_handles.config_reconciler.internal_disks_rx(), + ) + .await?; let mut ledger = Ledger::new_with(&log, paths, request); ledger.commit().await?; @@ -468,12 +464,11 @@ impl From for SledAgentServerStartError { } async fn sled_config_paths( - storage: &StorageHandle, + internal_disks_rx: &InternalDisksReceiver, ) -> Result, MissingM2Paths> { - let resources = storage.get_latest_disks().await; - let paths: Vec<_> = resources - .all_m2_mountpoints(CONFIG_DATASET) - .into_iter() + let paths: Vec<_> = internal_disks_rx + .current() + .all_config_datasets() .map(|p| p.join(SLED_AGENT_REQUEST_FILE)) .collect(); @@ -620,15 +615,13 @@ impl Inner { } async fn uninstall_sled_local_config(&self) -> Result<(), BootstrapError> { - let config_dirs = self + let internal_disks = self .long_running_task_handles - .storage_manager - .get_latest_disks() - .await - .all_m2_mountpoints(CONFIG_DATASET) - .into_iter(); + .config_reconciler + .internal_disks_rx() + .current(); - for dir in config_dirs { + for dir in internal_disks.all_config_datasets() { for entry in dir.read_dir_utf8().map_err(|err| { BootstrapError::Io { message: format!("Deleting {dir}"), err } })? 
{ diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs index 9508a11bfba..0e8cd00463d 100644 --- a/sled-agent/src/hardware_monitor.rs +++ b/sled-agent/src/hardware_monitor.rs @@ -8,10 +8,10 @@ use crate::services::ServiceManager; use crate::sled_agent::SledAgent; +use sled_agent_config_reconciler::RawDisksSender; use sled_hardware::{HardwareManager, HardwareUpdate}; use sled_hardware_types::Baseboard; use sled_storage::disk::RawDisk; -use sled_storage::manager::StorageHandle; use slog::Logger; use tokio::sync::broadcast::error::RecvError; use tokio::sync::{broadcast, oneshot}; @@ -68,8 +68,8 @@ pub struct HardwareMonitor { // A reference to the hardware manager hardware_manager: HardwareManager, - // A handle to [`sled_hardware::manager::StorageManger`] - storage_manager: StorageHandle, + // A handle to send raw disk updates to the config-reconciler system. + raw_disks_tx: RawDisksSender, // A handle to the sled-agent // @@ -91,7 +91,7 @@ impl HardwareMonitor { pub fn new( log: &Logger, hardware_manager: &HardwareManager, - storage_manager: &StorageHandle, + raw_disks_tx: RawDisksSender, ) -> ( HardwareMonitor, oneshot::Sender, @@ -112,7 +112,7 @@ impl HardwareMonitor { service_manager_ready_rx, hardware_rx, hardware_manager: hardware_manager.clone(), - storage_manager: storage_manager.clone(), + raw_disks_tx, sled_agent: None, tofino_manager, }, @@ -177,36 +177,16 @@ impl HardwareMonitor { } } HardwareUpdate::DiskAdded(disk) => { - // We notify the storage manager of the hardware, but do not need to - // wait for the result to be fully processed. - // - // Here and below, we're "dropping a future" rather than - // awaiting it. That's intentional - the hardware monitor - // doesn't care when this work is finished, just when it's - // enqueued. - #[allow(clippy::let_underscore_future)] - let _ = self - .storage_manager - .detected_raw_disk(disk.into()) - .await; + self.raw_disks_tx + .add_or_update_raw_disk(disk.into(), &self.log); } HardwareUpdate::DiskRemoved(disk) => { - // We notify the storage manager of the hardware, but do not need to - // wait for the result to be fully processed. - #[allow(clippy::let_underscore_future)] - let _ = self - .storage_manager - .detected_raw_disk_removal(disk.into()) - .await; + self.raw_disks_tx + .remove_raw_disk(disk.identity(), &self.log); } HardwareUpdate::DiskUpdated(disk) => { - // We notify the storage manager of the hardware, but do not need to - // wait for the result to be fully processed. - #[allow(clippy::let_underscore_future)] - let _ = self - .storage_manager - .detected_raw_disk_update(disk.into()) - .await; + self.raw_disks_tx + .add_or_update_raw_disk(disk.into(), &self.log); } }, Err(broadcast::error::RecvError::Lagged(count)) => { @@ -280,14 +260,9 @@ impl HardwareMonitor { self.deactivate_switch().await; } - // We notify the storage manager of the hardware, but do not need to - // wait for the result to be fully processed. 
- #[allow(clippy::let_underscore_future)] - let _ = self - .storage_manager - .ensure_using_exactly_these_disks( - self.hardware_manager.disks().into_values().map(RawDisk::from), - ) - .await; + self.raw_disks_tx.set_raw_disks( + self.hardware_manager.disks().into_values().map(RawDisk::from), + &self.log, + ); } } diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 0056056eb33..f96802e9732 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -18,7 +18,7 @@ use dropshot::{ Query, RequestContext, StreamingBody, TypedBody, }; use nexus_sled_agent_shared::inventory::{ - Inventory, OmicronSledConfig, OmicronSledConfigResult, SledRole, + Inventory, OmicronSledConfig, SledRole, }; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::{DiskRuntimeState, SledVmmState}; @@ -26,9 +26,6 @@ use omicron_common::api::internal::shared::{ ExternalIpGatewayMap, ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, SwitchPorts, VirtualNetworkInterfaceHost, }; -use omicron_common::disk::{ - DatasetsConfig, DiskVariant, M2Slot, OmicronPhysicalDisksConfig, -}; use range_requests::RequestContextEx; use sled_agent_api::*; use sled_agent_types::boot_disk::{ @@ -400,13 +397,6 @@ impl SledAgentApi for SledAgentImpl { Ok(HttpResponseDeleted()) } - async fn datasets_get( - rqctx: RequestContext, - ) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.datasets_config_list().await?)) - } - async fn zone_bundle_cleanup( rqctx: RequestContext, ) -> Result>, HttpError> @@ -428,27 +418,11 @@ impl SledAgentApi for SledAgentImpl { async fn omicron_config_put( rqctx: RequestContext, body: TypedBody, - ) -> Result, HttpError> { + ) -> Result { let sa = rqctx.context(); let body_args = body.into_inner(); - sa.set_omicron_config(body_args) - .await - .map(HttpResponseOk) - .map_err(HttpError::from) - } - - async fn omicron_physical_disks_get( - rqctx: RequestContext, - ) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.omicron_physical_disks_list().await?)) - } - - async fn zpools_get( - rqctx: RequestContext, - ) -> Result>, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.zpools_get().await)) + sa.set_omicron_config(body_args).await??; + Ok(HttpResponseUpdatedNoContent()) } async fn sled_role_get( @@ -792,29 +766,7 @@ impl SledAgentApi for SledAgentImpl { let boot_disk = path_params.into_inner().boot_disk; // Find our corresponding disk. - let maybe_disk_path = - sa.storage().get_latest_disks().await.iter_managed().find_map( - |(_identity, disk)| { - // Synthetic disks panic if asked for their `slot()`, so filter - // them out first; additionally, filter out any non-M2 disks. - if disk.is_synthetic() || disk.variant() != DiskVariant::M2 - { - return None; - } - - // Convert this M2 disk's slot to an M2Slot, and skip any that - // don't match the requested boot_disk. 
- let Ok(slot) = M2Slot::try_from(disk.slot()) else { - return None; - }; - if slot != boot_disk { - return None; - } - - let raw_devs_path = true; - Some(disk.boot_image_devfs_path(raw_devs_path)) - }, - ); + let maybe_disk_path = sa.boot_image_raw_devfs_path(boot_disk); let disk_path = match maybe_disk_path { Some(Ok(path)) => path, diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 5481672b17d..573fc717811 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -36,10 +36,9 @@ use propolis_client::Client as PropolisClient; use propolis_client::instance_spec::{ComponentV0, SpecKey}; use rand::SeedableRng; use rand::prelude::IteratorRandom; +use sled_agent_config_reconciler::AvailableDatasetsReceiver; use sled_agent_types::instance::*; use sled_agent_types::zone_bundle::ZoneBundleCause; -use sled_storage::dataset::ZONE_DATASET; -use sled_storage::manager::StorageHandle; use slog::Logger; use std::net::IpAddr; use std::net::SocketAddr; @@ -530,8 +529,8 @@ struct InstanceRunner { // Connection to Nexus nexus_client: NexusClient, - // Storage resources - storage: StorageHandle, + // Available datasets for choosing zone roots + available_datasets_rx: AvailableDatasetsReceiver, // Used to create propolis zones zone_builder_factory: ZoneBuilderFactory, @@ -1556,10 +1555,10 @@ impl Instance { nexus_client, vnic_allocator, port_manager, - storage, zone_bundler, zone_builder_factory, metrics_queue, + available_datasets_rx, } = services; let mut dhcp_config = DhcpCfg { @@ -1645,7 +1644,7 @@ impl Instance { state: InstanceStates::new(vmm_runtime, migration_id), running_state: None, nexus_client, - storage, + available_datasets_rx, zone_builder_factory, zone_bundler, metrics_queue, @@ -1991,13 +1990,10 @@ impl InstanceRunner { // configured VNICs. 
let zname = propolis_zone_name(&self.propolis_id); let mut rng = rand::rngs::StdRng::from_entropy(); - let latest_disks = self - .storage - .get_latest_disks() - .await - .all_u2_mountpoints(ZONE_DATASET); - let root = latest_disks + let root = self + .available_datasets_rx + .all_mounted_zone_root_datasets() .into_iter() .choose(&mut rng) .ok_or_else(|| Error::U2NotFound)?; @@ -2275,11 +2271,16 @@ mod tests { use omicron_common::api::external::{Generation, Hostname}; use omicron_common::api::internal::nexus::VmmState; use omicron_common::api::internal::shared::{DhcpConfig, SledIdentifiers}; + use omicron_common::disk::DiskIdentity; + use omicron_uuid_kinds::ZpoolUuid; use propolis_client::types::{ InstanceMigrateStatusResponse, InstanceStateMonitorResponse, }; + use sled_agent_config_reconciler::{ + CurrentlyManagedZpoolsReceiver, InternalDisksReceiver, + }; use sled_agent_types::zone_bundle::CleanupContext; - use sled_storage::manager_test_harness::StorageManagerTestHarness; + use sled_storage::config::MountConfig; use std::net::SocketAddrV6; use std::net::{Ipv4Addr, Ipv6Addr, SocketAddrV4}; use std::str::FromStr; @@ -2409,25 +2410,11 @@ mod tests { .expect("single-stepping mock server failed unexpectedly"); } - async fn setup_storage_manager(log: &Logger) -> StorageManagerTestHarness { - let mut harness = StorageManagerTestHarness::new(log).await; - let raw_disks = - harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; - harness.handle().key_manager_ready().await; - let config = harness.make_config(1, &raw_disks); - let _ = harness - .handle() - .omicron_physical_disks_ensure(config.clone()) - .await - .expect("Ensuring disks should work after key manager is ready"); - harness - } - async fn instance_struct( log: &Logger, propolis_addr: SocketAddr, nexus_client: NexusClient, - storage_handle: StorageHandle, + available_datasets_rx: AvailableDatasetsReceiver, temp_dir: &str, ) -> (Instance, MetricsRx) { let id = InstanceUuid::new_v4(); @@ -2439,7 +2426,7 @@ mod tests { let (services, rx) = fake_instance_manager_services( log, - storage_handle, + available_datasets_rx, nexus_client, temp_dir, ) @@ -2523,7 +2510,7 @@ mod tests { async fn fake_instance_manager_services( log: &Logger, - storage_handle: StorageHandle, + available_datasets_rx: AvailableDatasetsReceiver, nexus_client: NexusClient, temp_dir: &str, ) -> (InstanceManagerServices, MetricsRx) { @@ -2540,7 +2527,19 @@ mod tests { let cleanup_context = CleanupContext::default(); let zone_bundler = ZoneBundler::new( log.new(o!("component" => "ZoneBundler")), - storage_handle.clone(), + InternalDisksReceiver::fake_static( + Arc::new(MountConfig::default()), + [( + DiskIdentity { + vendor: "test-vendor".to_string(), + model: "test-model".to_string(), + serial: "test-serial".to_string(), + }, + ZpoolName::new_external(ZpoolUuid::new_v4()), + )] + .into_iter(), + ), + available_datasets_rx.clone(), cleanup_context, ) .await; @@ -2550,7 +2549,7 @@ mod tests { nexus_client, vnic_allocator, port_manager, - storage: storage_handle, + available_datasets_rx, zone_bundler, zone_builder_factory: ZoneBuilderFactory::fake( Some(temp_dir), @@ -2565,7 +2564,6 @@ mod tests { /// interactions with other parts of the system (e.g. Nexus and metrics). 
#[allow(dead_code)] struct InstanceTestObjects { - storage_harness: StorageManagerTestHarness, nexus: FakeNexusParts, _temp_guard: Utf8TempDir, instance_manager: crate::instance_manager::InstanceManager, @@ -2574,12 +2572,13 @@ mod tests { impl InstanceTestObjects { async fn new(log: &slog::Logger) -> Self { - let storage_harness = setup_storage_manager(log).await; let nexus = FakeNexusParts::new(&log).await; let temp_guard = Utf8TempDir::new().unwrap(); let (services, metrics_rx) = fake_instance_manager_services( log, - storage_harness.handle().clone(), + AvailableDatasetsReceiver::fake_in_tempdir_for_tests( + ZpoolOrRamdisk::Ramdisk, + ), nexus.nexus_client.clone(), temp_guard.path().as_str(), ) @@ -2589,7 +2588,7 @@ mod tests { nexus_client, vnic_allocator, port_manager, - storage, + available_datasets_rx, zone_bundler, zone_builder_factory, metrics_queue, @@ -2603,7 +2602,10 @@ mod tests { nexus_client, vnic_allocator, port_manager, - storage, + CurrentlyManagedZpoolsReceiver::fake_static( + std::iter::empty(), + ), + available_datasets_rx, zone_bundler, zone_builder_factory, vmm_reservoir_manager, @@ -2612,17 +2614,12 @@ mod tests { .unwrap(); Self { - storage_harness, nexus, _temp_guard: temp_guard, instance_manager, metrics_rx, } } - - async fn cleanup(mut self) { - self.storage_harness.cleanup().await; - } } #[tokio::test] @@ -2642,9 +2639,6 @@ mod tests { _nexus_server, } = FakeNexusParts::new(&log).await; - let mut storage_harness = setup_storage_manager(&log).await; - let storage_handle = storage_harness.handle().clone(); - let temp_guard = Utf8TempDir::new().unwrap(); let (inst, mut metrics_rx) = timeout( @@ -2653,7 +2647,9 @@ mod tests { &log, propolis_addr, nexus_client, - storage_handle, + AvailableDatasetsReceiver::fake_in_tempdir_for_tests( + ZpoolOrRamdisk::Ramdisk, + ), temp_guard.path().as_str(), ), ) @@ -2708,7 +2704,6 @@ mod tests { .try_recv() .expect_err("The metrics request queue should have one message"); - storage_harness.cleanup().await; logctx.cleanup_successful(); } @@ -2727,9 +2722,6 @@ mod tests { _nexus_server, } = FakeNexusParts::new(&log).await; - let mut storage_harness = setup_storage_manager(&logctx.log).await; - let storage_handle = storage_harness.handle().clone(); - let temp_guard = Utf8TempDir::new().unwrap(); let (inst, _) = timeout( @@ -2739,7 +2731,9 @@ mod tests { // we want to test propolis not ever coming up SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), nexus_client, - storage_handle, + AvailableDatasetsReceiver::fake_in_tempdir_for_tests( + ZpoolOrRamdisk::Ramdisk, + ), temp_guard.path().as_str(), ), ) @@ -2775,7 +2769,6 @@ mod tests { ); } - storage_harness.cleanup().await; logctx.cleanup_successful(); } @@ -2874,7 +2867,6 @@ mod tests { .try_recv() .expect_err("The metrics request queue should have one message"); - test_objects.cleanup().await; logctx.cleanup_successful(); } @@ -3020,7 +3012,6 @@ mod tests { .expect("timed out waiting for VmmState::Stopped in FakeNexus") .expect("failed to receive FakeNexus' InstanceState"); - test_objects.cleanup().await; logctx.cleanup_successful(); } @@ -3067,7 +3058,7 @@ mod tests { nexus_client, vnic_allocator, port_manager, - storage, + available_datasets_rx, zone_bundler, zone_builder_factory, metrics_queue, @@ -3102,7 +3093,7 @@ mod tests { state: InstanceStates::new(vmm_runtime, migration_id), running_state: None, nexus_client, - storage, + available_datasets_rx, zone_builder_factory, zone_bundler, metrics_queue, @@ -3125,12 +3116,10 @@ mod tests { // them directly. 
_nexus_server: HttpServer, _dns_server: TransientServer, - storage_harness: StorageManagerTestHarness, } impl TestInstanceRunner { async fn new(log: &slog::Logger) -> Self { - let storage_harness = setup_storage_manager(&log).await; let FakeNexusParts { nexus_client, _nexus_server, @@ -3141,7 +3130,9 @@ mod tests { let temp_guard = Utf8TempDir::new().unwrap(); let (services, _metrics_rx) = fake_instance_manager_services( &log, - storage_harness.handle().clone(), + AvailableDatasetsReceiver::fake_in_tempdir_for_tests( + ZpoolOrRamdisk::Ramdisk, + ), nexus_client, temp_guard.path().as_str(), ) @@ -3183,7 +3174,6 @@ mod tests { remove_rx, _nexus_server, _dns_server, - storage_harness, } } } @@ -3204,7 +3194,6 @@ mod tests { mut remove_rx, _nexus_server, _dns_server, - mut storage_harness, } = TestInstanceRunner::new(&log).await; let (resp_tx, resp_rx) = oneshot::channel(); @@ -3258,7 +3247,6 @@ mod tests { drop(terminate_tx); let _ = runner_task.await; - storage_harness.cleanup().await; logctx.cleanup_successful(); } @@ -3278,7 +3266,6 @@ mod tests { mut remove_rx, _nexus_server, _dns_server, - mut storage_harness, } = TestInstanceRunner::new(&log).await; let (resp_tx, resp_rx) = oneshot::channel(); @@ -3301,7 +3288,6 @@ mod tests { }; assert_eq!(state.vmm_state.state, VmmState::Failed); - storage_harness.cleanup().await; logctx.cleanup_successful(); } } diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 9bd0ad76497..fa8a11c89d8 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -17,15 +17,14 @@ use illumos_utils::link::VnicAllocator; use illumos_utils::opte::PortManager; use illumos_utils::running_zone::ZoneBuilderFactory; use omicron_common::api::external::ByteCount; -use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::SledVmmState; use omicron_common::api::internal::shared::SledIdentifiers; use omicron_uuid_kinds::PropolisUuid; +use sled_agent_config_reconciler::AvailableDatasetsReceiver; +use sled_agent_config_reconciler::CurrentlyManagedZpoolsReceiver; use sled_agent_types::instance::*; -use sled_storage::manager::StorageHandle; -use sled_storage::resources::AllDisks; use slog::Logger; -use std::collections::{BTreeMap, HashSet}; +use std::collections::BTreeMap; use std::sync::Arc; use tokio::sync::{mpsc, oneshot}; use uuid::Uuid; @@ -66,10 +65,10 @@ pub(crate) struct InstanceManagerServices { pub nexus_client: NexusClient, pub vnic_allocator: VnicAllocator, pub port_manager: PortManager, - pub storage: StorageHandle, pub zone_bundler: ZoneBundler, pub zone_builder_factory: ZoneBuilderFactory, pub metrics_queue: MetricsRequestQueue, + pub available_datasets_rx: AvailableDatasetsReceiver, } // Describes the internals of the "InstanceManager", though most of the @@ -95,7 +94,8 @@ impl InstanceManager { nexus_client: NexusClient, vnic_allocator: VnicAllocator, port_manager: PortManager, - storage: StorageHandle, + currently_managed_zpools_rx: CurrentlyManagedZpoolsReceiver, + available_datasets_rx: AvailableDatasetsReceiver, zone_bundler: ZoneBundler, vmm_reservoir_manager: VmmReservoirManagerHandle, metrics_queue: MetricsRequestQueue, @@ -105,7 +105,8 @@ impl InstanceManager { nexus_client, vnic_allocator, port_manager, - storage, + currently_managed_zpools_rx, + available_datasets_rx, zone_bundler, ZoneBuilderFactory::new(), vmm_reservoir_manager, @@ -123,7 +124,8 @@ impl InstanceManager { nexus_client: NexusClient, vnic_allocator: VnicAllocator, port_manager: PortManager, 
- storage: StorageHandle, + currently_managed_zpools_rx: CurrentlyManagedZpoolsReceiver, + available_datasets_rx: AvailableDatasetsReceiver, zone_bundler: ZoneBundler, zone_builder_factory: ZoneBuilderFactory, vmm_reservoir_manager: VmmReservoirManagerHandle, @@ -142,8 +144,8 @@ impl InstanceManager { jobs: BTreeMap::new(), vnic_allocator, port_manager, - storage_generation: None, - storage, + currently_managed_zpools_rx, + available_datasets_rx, zone_bundler, zone_builder_factory, metrics_queue, @@ -315,23 +317,6 @@ impl InstanceManager { .map_err(|_| Error::FailedSendInstanceManagerClosed)?; rx.await? } - - /// Marks instances failed unless they're using storage from `disks`. - /// - /// This function looks for transient zone filesystem usage on expunged - /// zpools. - pub async fn use_only_these_disks( - &self, - disks: AllDisks, - ) -> Result<(), Error> { - let (tx, rx) = oneshot::channel(); - self.inner - .tx - .send(InstanceManagerRequest::OnlyUseDisks { disks, tx }) - .await - .map_err(|_| Error::FailedSendInstanceManagerClosed)?; - rx.await? - } } // Most requests that can be sent to the "InstanceManagerRunner" task. @@ -386,10 +371,6 @@ enum InstanceManagerRequest { propolis_id: PropolisUuid, tx: oneshot::Sender>, }, - OnlyUseDisks { - disks: AllDisks, - tx: oneshot::Sender>, - }, } // Requests that the instance manager stop processing information about a @@ -426,8 +407,8 @@ struct InstanceManagerRunner { vnic_allocator: VnicAllocator, port_manager: PortManager, - storage_generation: Option, - storage: StorageHandle, + currently_managed_zpools_rx: CurrentlyManagedZpoolsReceiver, + available_datasets_rx: AvailableDatasetsReceiver, zone_bundler: ZoneBundler, zone_builder_factory: ZoneBuilderFactory, metrics_queue: MetricsRequestQueue, @@ -458,6 +439,24 @@ impl InstanceManagerRunner { }, } }, + // If the set of currently-managed zpools has changed, shut down + // any instances due to disks that have disappeared out from + // under them. + result = self.currently_managed_zpools_rx.changed() => { + match result { + Ok(()) => { + self.use_only_currently_managed_zpools().await; + } + Err(_) => { + warn!( + self.log, + "InstanceManager's 'current zpools' channel \ + closed; shutting down", + ); + break; + } + } + } request = self.rx.recv() => { let request_variant = request.as_ref().map(|r| r.to_string()); let result = match request { @@ -495,10 +494,6 @@ impl InstanceManagerRunner { // the state... self.get_instance_state(tx, propolis_id) }, - Some(OnlyUseDisks { disks, tx } ) => { - self.use_only_these_disks(disks).await; - tx.send(Ok(())).map_err(|_| Error::FailedSendClientClosed) - }, None => { warn!(self.log, "InstanceManager's request channel closed; shutting down"); break; @@ -607,10 +602,10 @@ impl InstanceManagerRunner { nexus_client: self.nexus_client.clone(), vnic_allocator: self.vnic_allocator.clone(), port_manager: self.port_manager.clone(), - storage: self.storage.clone(), zone_bundler: self.zone_bundler.clone(), zone_builder_factory: self.zone_builder_factory.clone(), metrics_queue: self.metrics_queue.clone(), + available_datasets_rx: self.available_datasets_rx.clone(), }; let state = crate::instance::InstanceInitialState { @@ -760,24 +755,9 @@ impl InstanceManagerRunner { Ok(()) } - async fn use_only_these_disks(&mut self, disks: AllDisks) { - // Consider the generation number on the incoming request to avoid - // applying old requests. 
- let requested_generation = *disks.generation(); - if let Some(last_gen) = self.storage_generation { - if last_gen >= requested_generation { - // This request looks old, ignore it. - info!(self.log, "use_only_these_disks: Ignoring request"; - "last_gen" => ?last_gen, "requested_gen" => ?requested_generation); - return; - } - } - self.storage_generation = Some(requested_generation); - info!(self.log, "use_only_these_disks: Processing new request"; - "gen" => ?requested_generation); - - let u2_set: HashSet<_> = disks.all_u2_zpools().into_iter().collect(); - + async fn use_only_currently_managed_zpools(&mut self) { + let current_zpools = + self.currently_managed_zpools_rx.current_and_update(); let mut to_remove = vec![]; for (id, instance) in self.jobs.iter() { // If we can read the filesystem pool, consider it. Otherwise, move @@ -792,7 +772,7 @@ impl InstanceManagerRunner { info!(self.log, "use_only_these_disks: Cannot read filesystem pool"; "instance_id" => ?id); continue; }; - if !u2_set.contains(&filesystem_pool) { + if !current_zpools.contains(&filesystem_pool) { to_remove.push(*id); } } diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 3a63ff5c117..97d1493467d 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -35,7 +35,6 @@ pub mod rack_setup; pub mod server; pub mod services; mod sled_agent; -mod storage_monitor; mod support_bundle; mod swap_device; mod updates; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 216961072e5..37b646b8a44 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -20,39 +20,30 @@ use crate::config::Config; use crate::hardware_monitor::HardwareMonitor; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; -use crate::storage_monitor::{StorageMonitor, StorageMonitorHandle}; use crate::zone_bundle::ZoneBundler; use bootstore::schemes::v0 as bootstore; use illumos_utils::zpool::ZpoolName; use key_manager::{KeyManager, StorageKeyRequester}; +use sled_agent_config_reconciler::{ + ConfigReconcilerHandle, RawDisksSender, TimeSyncConfig, +}; use sled_agent_types::zone_bundle::CleanupContext; use sled_agent_zone_images::{ZoneImageSourceResolver, ZoneImageZpools}; use sled_hardware::{HardwareManager, SledMode, UnparsedDisk}; use sled_storage::config::MountConfig; use sled_storage::disk::RawSyntheticDisk; -use sled_storage::manager::{StorageHandle, StorageManager}; -use sled_storage::resources::AllDisks; use slog::{Logger, info}; use std::net::Ipv6Addr; +use std::sync::Arc; use tokio::sync::oneshot; /// A mechanism for interacting with all long running tasks that can be shared /// between the bootstrap-agent and sled-agent code. #[derive(Clone)] pub struct LongRunningTaskHandles { - /// A mechanism for retrieving storage keys. This interacts with the - /// [`KeyManager`] task. In the future, there may be other handles for - /// retrieving different types of keys. Separating the handles limits the - /// access for a given key type to the code that holds the handle. - pub storage_key_requester: StorageKeyRequester, - - /// A mechanism for talking to the [`StorageManager`] which is responsible - /// for establishing zpools on disks and managing their datasets. - pub storage_manager: StorageHandle, - - /// A mechanism for talking to the [`StorageMonitor`], which reacts to disk - /// changes and updates the dump devices. 
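// Illustrative sketch only: the watch-driven cleanup above, reduced to a
// plain `tokio::sync::watch` channel of zpool names. The simplified types
// here (String pools, a `Job` placeholder, dropping instead of terminating)
// are assumptions for the sketch, not the reconciler's real API; the patch's
// actual receiver is `CurrentlyManagedZpoolsReceiver`.
use std::collections::{BTreeMap, HashSet};
use tokio::sync::watch;

struct Job; // placeholder for a running instance

async fn prune_on_zpool_changes(
    mut zpools_rx: watch::Receiver<HashSet<String>>,
    mut jobs: BTreeMap<u64, (String, Job)>, // id -> (filesystem pool, job)
) {
    // `changed()` resolves whenever a new zpool set is published and is
    // cancel-safe, so it can also live in a `select!` arm as above.
    while zpools_rx.changed().await.is_ok() {
        let current = zpools_rx.borrow_and_update().clone();
        // Keep only jobs whose filesystem pool is still managed; the real
        // code terminates the affected instances rather than dropping them.
        jobs.retain(|_id, (pool, _job)| current.contains(pool));
    }
    // Sender side closed: the managing task should shut down.
}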
- pub storage_monitor_handle: StorageMonitorHandle, + /// A handle to the set of tasks managed by the sled-agent-config-reconciler + /// system. + pub config_reconciler: Arc, /// A mechanism for interacting with the hardware device tree pub hardware_manager: HardwareManager, @@ -83,11 +74,18 @@ pub async fn spawn_all_longrunning_tasks( oneshot::Sender, ) { let storage_key_requester = spawn_key_manager(log); - let mut storage_manager = - spawn_storage_manager(log, storage_key_requester.clone()); - let storage_monitor_handle = - spawn_storage_monitor(log, storage_manager.clone()); + let time_sync_config = if let Some(true) = config.skip_timesync { + TimeSyncConfig::Skip + } else { + TimeSyncConfig::Normal + }; + let mut config_reconciler = ConfigReconcilerHandle::new( + MountConfig::default(), + storage_key_requester, + time_sync_config, + log, + ); let nongimlet_observed_disks = config.nongimlet_observed_disks.clone().unwrap_or(vec![]); @@ -95,38 +93,35 @@ pub async fn spawn_all_longrunning_tasks( let hardware_manager = spawn_hardware_manager(log, sled_mode, nongimlet_observed_disks).await; - // Start monitoring for hardware changes + // Start monitoring for hardware changes, adding some synthetic disks if + // necessary. + let raw_disks_tx = config_reconciler.raw_disks_tx(); + upsert_synthetic_disks_if_needed(&log, &raw_disks_tx, &config).await; let (sled_agent_started_tx, service_manager_ready_tx) = - spawn_hardware_monitor(log, &hardware_manager, &storage_manager); - - // Add some synthetic disks if necessary. - upsert_synthetic_disks_if_needed(&log, &storage_manager, &config).await; + spawn_hardware_monitor(log, &hardware_manager, raw_disks_tx); // Wait for the boot disk so that we can work with any ledgers, // such as those needed by the bootstore and sled-agent info!(log, "Waiting for boot disk"); - let (disk_id, boot_zpool) = storage_manager.wait_for_boot_disk().await; + let disk_id = config_reconciler.wait_for_boot_disk().await; info!(log, "Found boot disk {:?}", disk_id); let all_disks = storage_manager.get_latest_disks().await; let bootstore = spawn_bootstore_tasks( log, - &all_disks, + &config_reconciler, &hardware_manager, global_zone_bootstrap_ip, ) .await; - let zone_bundler = - spawn_zone_bundler_tasks(log, &mut storage_manager).await; + let zone_bundler = spawn_zone_bundler_tasks(log, &config_reconciler); let zone_image_resolver = make_zone_image_resolver(log, &all_disks, &boot_zpool); ( LongRunningTaskHandles { - storage_key_requester, - storage_manager, - storage_monitor_handle, + config_reconciler: Arc::new(config_reconciler), hardware_manager, bootstore, zone_bundler, @@ -146,32 +141,6 @@ fn spawn_key_manager(log: &Logger) -> StorageKeyRequester { storage_key_requester } -fn spawn_storage_manager( - log: &Logger, - key_requester: StorageKeyRequester, -) -> StorageHandle { - info!(log, "Starting StorageManager"); - let (manager, handle) = - StorageManager::new(log, MountConfig::default(), key_requester); - tokio::spawn(async move { - manager.run().await; - }); - handle -} - -fn spawn_storage_monitor( - log: &Logger, - storage_handle: StorageHandle, -) -> StorageMonitorHandle { - info!(log, "Starting StorageMonitor"); - let (storage_monitor, handle) = - StorageMonitor::new(log, MountConfig::default(), storage_handle); - tokio::spawn(async move { - storage_monitor.run().await; - }); - handle -} - async fn spawn_hardware_manager( log: &Logger, sled_mode: SledMode, @@ -197,11 +166,11 @@ async fn spawn_hardware_manager( fn spawn_hardware_monitor( log: &Logger, 
hardware_manager: &HardwareManager, - storage_handle: &StorageHandle, + raw_disks_tx: RawDisksSender, ) -> (oneshot::Sender, oneshot::Sender) { info!(log, "Starting HardwareMonitor"); let (mut monitor, sled_agent_started_tx, service_manager_ready_tx) = - HardwareMonitor::new(log, hardware_manager, storage_handle); + HardwareMonitor::new(log, hardware_manager, raw_disks_tx); tokio::spawn(async move { monitor.run().await; }); @@ -210,12 +179,16 @@ fn spawn_hardware_monitor( async fn spawn_bootstore_tasks( log: &Logger, - all_disks: &AllDisks, + config_reconciler: &ConfigReconcilerHandle, hardware_manager: &HardwareManager, global_zone_bootstrap_ip: Ipv6Addr, ) -> bootstore::NodeHandle { let config = new_bootstore_config( - all_disks, + &config_reconciler + .internal_disks_rx() + .current() + .all_cluster_datasets() + .collect::>(), hardware_manager.baseboard(), global_zone_bootstrap_ip, ) @@ -240,12 +213,16 @@ async fn spawn_bootstore_tasks( // `ZoneBundler::new` spawns a periodic cleanup task that runs indefinitely async fn spawn_zone_bundler_tasks( log: &Logger, - storage_handle: &mut StorageHandle, + config_reconciler: &ConfigReconcilerHandle, ) -> ZoneBundler { info!(log, "Starting ZoneBundler related tasks"); let log = log.new(o!("component" => "ZoneBundler")); - ZoneBundler::new(log, storage_handle.clone(), CleanupContext::default()) - .await + ZoneBundler::new( + log, + config_reconciler.internal_disks_rx().clone(), + config_reconciler.available_datasets_rx(), + CleanupContext::default(), + ) } fn make_zone_image_resolver( @@ -262,7 +239,7 @@ fn make_zone_image_resolver( async fn upsert_synthetic_disks_if_needed( log: &Logger, - storage_manager: &StorageHandle, + raw_disks_tx: &RawDisksSender, config: &Config, ) { if let Some(vdevs) = &config.vdevs { @@ -275,7 +252,7 @@ async fn upsert_synthetic_disks_if_needed( let disk = RawSyntheticDisk::load(vdev, i.try_into().unwrap()) .expect("Failed to parse synthetic disk") .into(); - storage_manager.detected_raw_disk(disk).await.await.unwrap(); + raw_disks_tx.add_or_update_raw_disk(disk, log); } } } diff --git a/sled-agent/src/probe_manager.rs b/sled-agent/src/probe_manager.rs index 75729e3a872..c4f97ed22ac 100644 --- a/sled-agent/src/probe_manager.rs +++ b/sled-agent/src/probe_manager.rs @@ -10,8 +10,8 @@ use nexus_client::types::{ BackgroundTasksActivateRequest, ProbeExternalIp, ProbeInfo, }; use omicron_common::api::external::{ - Generation, VpcFirewallRuleAction, VpcFirewallRuleDirection, - VpcFirewallRulePriority, VpcFirewallRuleStatus, + VpcFirewallRuleAction, VpcFirewallRuleDirection, VpcFirewallRulePriority, + VpcFirewallRuleStatus, }; use omicron_common::api::internal::shared::{ NetworkInterface, ResolvedVpcFirewallRule, @@ -19,9 +19,10 @@ use omicron_common::api::internal::shared::{ use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid}; use rand::SeedableRng; use rand::prelude::IteratorRandom; -use sled_storage::dataset::ZONE_DATASET; -use sled_storage::manager::StorageHandle; -use sled_storage::resources::AllDisks; +use sled_agent_config_reconciler::{ + AvailableDatasetsReceiver, CurrentlyManagedZpools, + CurrentlyManagedZpoolsReceiver, +}; use slog::{Logger, error, warn}; use std::collections::{HashMap, HashSet}; use std::hash::{Hash, Hasher}; @@ -50,7 +51,6 @@ pub(crate) struct ProbeManager { } struct RunningProbes { - storage_generation: Option, zones: HashMap, } @@ -60,10 +60,10 @@ pub(crate) struct ProbeManagerInner { log: Logger, sled_id: Uuid, vnic_allocator: VnicAllocator, - storage: StorageHandle, port_manager: 
PortManager, metrics_queue: MetricsRequestQueue, running_probes: Mutex, + available_datasets_rx: AvailableDatasetsReceiver, zones_api: Arc, } @@ -73,9 +73,9 @@ impl ProbeManager { sled_id: Uuid, nexus_client: NexusClient, etherstub: Etherstub, - storage: StorageHandle, port_manager: PortManager, metrics_queue: MetricsRequestQueue, + available_datasets_rx: AvailableDatasetsReceiver, log: Logger, ) -> Self { Self { @@ -87,69 +87,24 @@ impl ProbeManager { Arc::new(illumos_utils::dladm::Dladm::real_api()), ), running_probes: Mutex::new(RunningProbes { - storage_generation: None, zones: HashMap::new(), }), nexus_client, log, sled_id, - storage, port_manager, metrics_queue, + available_datasets_rx, zones_api: Arc::new(illumos_utils::zone::Zones::real_api()), }), } } - pub(crate) async fn run(&self) { - self.inner.run().await; - } - - /// Removes any probes using filesystem roots on zpools that are not - /// contained in the set of "disks". - pub(crate) async fn use_only_these_disks(&self, disks: &AllDisks) { - let u2_set: HashSet<_> = disks.all_u2_zpools().into_iter().collect(); - let mut probes = self.inner.running_probes.lock().await; - - // Consider the generation number on the incoming request to avoid - // applying old requests. - let requested_generation = *disks.generation(); - if let Some(last_gen) = probes.storage_generation { - if last_gen >= requested_generation { - // This request looks old, ignore it. - info!(self.inner.log, "use_only_these_disks: Ignoring request"; - "last_gen" => ?last_gen, "requested_gen" => ?requested_generation); - return; - } - } - probes.storage_generation = Some(requested_generation); - info!(self.inner.log, "use_only_these_disks: Processing new request"; - "gen" => ?requested_generation); - - let to_remove = probes - .zones - .iter() - .filter_map(|(id, probe)| { - let probe_pool = match probe.root_zpool() { - ZpoolOrRamdisk::Zpool(zpool_name) => zpool_name, - ZpoolOrRamdisk::Ramdisk => { - info!( - self.inner.log, - "use_only_these_disks: removing probe on ramdisk"; - "id" => ?id, - ); - return None; - } - }; - - if !u2_set.contains(probe_pool) { Some(*id) } else { None } - }) - .collect::>(); - - for probe_id in to_remove { - info!(self.inner.log, "use_only_these_disks: Removing probe"; "probe_id" => ?probe_id); - self.inner.remove_probe_locked(&mut probes, probe_id).await; - } + pub(crate) async fn run( + &self, + currently_managed_zpools_rx: CurrentlyManagedZpoolsReceiver, + ) { + self.inner.run(currently_managed_zpools_rx).await; } } @@ -214,18 +169,55 @@ impl TryFrom for ProbeState { impl ProbeManagerInner { /// Run the probe manager. If it's already running this is a no-op. - async fn run(self: &Arc) { + async fn run( + self: &Arc, + currently_managed_zpools_rx: CurrentlyManagedZpoolsReceiver, + ) { let mut join_handle = self.join_handle.lock().await; if join_handle.is_none() { - *join_handle = Some(self.clone().reconciler()) + *join_handle = + Some(self.clone().reconciler(currently_managed_zpools_rx)) } } /// Run the reconciler loop on a background thread. - fn reconciler(self: Arc) -> JoinHandle<()> { + fn reconciler( + self: Arc, + mut currently_managed_zpools_rx: CurrentlyManagedZpoolsReceiver, + ) -> JoinHandle<()> { tokio::spawn(async move { loop { - sleep(RECONCILIATION_INTERVAL).await; + let sleep_fut = sleep(RECONCILIATION_INTERVAL); + tokio::pin!(sleep_fut); + + // Wait until the next reconciliation tick, but handle any + // changes to the set of disks in the meantime. + loop { + tokio::select! 
{ + _ = &mut sleep_fut => break, + + // Cancel-safe per docs on `changed()` + result = currently_managed_zpools_rx.changed() => { + match result { + Ok(()) => { + self.use_only_these_disks( + ¤tly_managed_zpools_rx + .current_and_update() + ).await; + continue; + } + Err(_) => { + warn!( + self.log, + "ProbeManager's 'current zpools' \ + channel closed; shutting down", + ); + return; + } + } + } + } + } // Collect the target and current state. Use set operations // to determine what probes need to be added, removed and/or @@ -268,6 +260,37 @@ impl ProbeManagerInner { }) } + /// Removes any probes using filesystem roots on zpools that are not + /// contained in the set of "disks". + async fn use_only_these_disks(&self, disks: &CurrentlyManagedZpools) { + let mut probes = self.running_probes.lock().await; + + let to_remove = probes + .zones + .iter() + .filter_map(|(id, probe)| { + let probe_pool = match probe.root_zpool() { + ZpoolOrRamdisk::Zpool(zpool_name) => zpool_name, + ZpoolOrRamdisk::Ramdisk => { + info!( + self.log, + "use_only_these_disks: removing probe on ramdisk"; + "id" => ?id, + ); + return None; + } + }; + + if !disks.contains(probe_pool) { Some(*id) } else { None } + }) + .collect::>(); + + for probe_id in to_remove { + info!(self.log, "use_only_these_disks: Removing probe"; "probe_id" => ?probe_id); + self.remove_probe_locked(&mut probes, probe_id).await; + } + } + /// Add a set of probes to this sled. /// /// Returns the number of inserted probes. @@ -291,12 +314,9 @@ impl ProbeManagerInner { /// boots the probe zone. async fn add_probe(self: &Arc, probe: &ProbeState) -> Result<()> { let mut rng = rand::rngs::StdRng::from_entropy(); - let current_disks = self - .storage - .get_latest_disks() - .await - .all_u2_mountpoints(ZONE_DATASET); - let zone_root_path = current_disks + let zone_root_path = self + .available_datasets_rx + .all_mounted_zone_root_datasets() .into_iter() .choose(&mut rng) .ok_or_else(|| anyhow!("u2 not found"))?; diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index ad1778501bb..436cd4503fc 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -117,6 +117,7 @@ use serde::{Deserialize, Serialize}; use sled_agent_client::{ Client as SledAgentClient, Error as SledAgentError, types as SledAgentTypes, }; +use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::early_networking::{ EarlyNetworkConfig, EarlyNetworkConfigBody, }; @@ -127,8 +128,6 @@ use sled_agent_types::rack_ops::RssStep; use sled_agent_types::sled::StartSledAgentRequest; use sled_agent_types::time_sync::TimeSync; use sled_hardware_types::underlay::BootstrapInterface; -use sled_storage::dataset::CONFIG_DATASET; -use sled_storage::manager::StorageHandle; use slog::Logger; use slog_error_chain::InlineErrorChain; use std::collections::{BTreeMap, BTreeSet, btree_map}; @@ -257,15 +256,14 @@ impl RackSetupService { /// Arguments: /// - `log`: The logger. /// - `config`: The config file, which is used to setup the rack. 
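// Illustrative sketch only: the "tick, but react to disk changes while
// waiting" loop used by the probe reconciler, again with a plain
// `watch::Receiver<HashSet<String>>` standing in for the real
// `CurrentlyManagedZpoolsReceiver` (a simplifying assumption).
use std::collections::HashSet;
use std::time::Duration;
use tokio::sync::watch;
use tokio::time::sleep;

async fn reconciler_loop(mut zpools_rx: watch::Receiver<HashSet<String>>) {
    loop {
        let sleep_fut = sleep(Duration::from_secs(60));
        tokio::pin!(sleep_fut);

        // Wait out the interval, servicing zpool changes as they arrive.
        loop {
            tokio::select! {
                _ = &mut sleep_fut => break,
                // `changed()` is cancel-safe: if the timer wins this race,
                // no change notification is marked as seen and lost.
                result = zpools_rx.changed() => {
                    if result.is_err() {
                        return; // sender closed; shut down
                    }
                    let _current = zpools_rx.borrow_and_update().clone();
                    // ...remove probes whose pool is gone, then keep waiting.
                    continue;
                }
            }
        }

        // ...periodic reconciliation of target vs. current probes goes here.
    }
}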
- /// - `storage_manager`: A handle for interacting with the storage manager - /// task + /// - `internal_disks_rx`: Tells us about available internal disks /// - `local_bootstrap_agent`: Communication channel by which we can send /// commands to our local bootstrap-agent (e.g., to start sled-agents) /// - `bootstore` - A handle to call bootstore APIs pub(crate) fn new( log: Logger, config: Config, - storage_manager: StorageHandle, + internal_disks_rx: InternalDisksReceiver, local_bootstrap_agent: BootstrapAgentHandle, bootstore: bootstore::NodeHandle, step_tx: watch::Sender, @@ -275,7 +273,7 @@ impl RackSetupService { if let Err(e) = svc .run( &config, - &storage_manager, + &internal_disks_rx, local_bootstrap_agent, bootstore, step_tx, @@ -375,86 +373,9 @@ impl ServiceInner { "datasets" => ?sled_config.datasets, "zones" => ?sled_config.zones, ); - let result = client.omicron_config_put(&sled_config).await; - let error = match result { - Ok(response) => { - let response = response.into_inner(); - - // An HTTP OK may contain _partial_ success: check whether - // we got any individual disk failures, and split those out - // into transient/permanent cases based on whether they - // indicate we should retry. - let disk_errors = - response.disks.into_iter().filter_map(|status| { - status.err.map(|err| (status.identity, err)) - }); - let mut transient_errors = Vec::new(); - let mut permanent_errors = Vec::new(); - for (identity, error) in disk_errors { - if error.retryable() { - transient_errors.push(format!( - "Retryable error initializing disk \ - {} / {} / {}: {}", - identity.vendor, - identity.model, - identity.serial, - InlineErrorChain::new(&error) - )); - } else { - permanent_errors.push(format!( - "Non-retryable error initializing disk \ - {} / {} / {}: {}", - identity.vendor, - identity.model, - identity.serial, - InlineErrorChain::new(&error) - )); - } - } - if !permanent_errors.is_empty() { - return Err(BackoffError::permanent( - SetupServiceError::DiskInitializationPermanent { - permanent_errors, - }, - )); - } - if !transient_errors.is_empty() { - return Err(BackoffError::transient( - SetupServiceError::DiskInitializationTransient { - transient_errors, - }, - )); - } - - // No individual disk errors reported; all disks were - // initialized. Check for any dataset errors; these are not - // retryable. - let dataset_errors = response - .datasets - .into_iter() - .filter_map(|status| { - status.err.map(|err| { - format!( - "Error initializing dataset {}: {err}", - status.dataset_name.full_name() - ) - }) - }) - .collect::>(); - if !dataset_errors.is_empty() { - return Err(BackoffError::permanent( - SetupServiceError::DatasetInitialization { - errors: dataset_errors, - }, - )); - } - - // No individual dataset errors reported. We don't get - // status for individual zones (any failure there results in - // an HTTP-level error), so everything is good. - return Ok(()); - } - Err(error) => error, + let Err(error) = client.omicron_config_put(&sled_config).await + else { + return Ok(()); }; if let sled_agent_client::Error::ErrorResponse(response) = &error { @@ -502,6 +423,16 @@ impl ServiceInner { Ok(()) } + // Wait until the config reconciler on the target sled has successfully + // reconciled the config at `generation`. 
+ async fn wait_for_config_reconciliation_on_sled( + &self, + _sled_address: SocketAddrV6, + _generation: Generation, + ) -> Result<(), SetupServiceError> { + unimplemented!("needs updated inventory") + } + // Ensure that the desired sled configuration for a particular zone version // is deployed. // @@ -530,21 +461,27 @@ impl ServiceInner { })? .clone(); + // We bump the zone generation as we step through phases of + // RSS; use that as the overall sled config generation. + let generation = zones_config.generation; let sled_config = OmicronSledConfig { - // We bump the zone generation as we step through phases of - // RSS; use that as the overall sled config generation. - generation: zones_config.generation, + generation, disks: config .disks .iter() .map(|c| c.clone().into()) .collect(), datasets: config.datasets.values().cloned().collect(), - zones: zones_config.zones.iter().cloned().collect(), + zones: zones_config.zones.into_iter().collect(), remove_mupdate_override: None, }; self.set_config_on_sled(*sled_address, sled_config).await?; + self.wait_for_config_reconciliation_on_sled( + *sled_address, + generation, + ) + .await?; Ok::<(), SetupServiceError>(()) }), @@ -1121,7 +1058,7 @@ impl ServiceInner { async fn run( &self, config: &Config, - storage_manager: &StorageHandle, + internal_disks_rx: &InternalDisksReceiver, local_bootstrap_agent: BootstrapAgentHandle, bootstore: bootstore::NodeHandle, step_tx: watch::Sender, @@ -1134,19 +1071,18 @@ impl ServiceInner { config.az_subnet(), )?; - let started_marker_paths: Vec = storage_manager - .get_latest_disks() - .await - .all_m2_mountpoints(CONFIG_DATASET) - .into_iter() + let config_dataset_paths = internal_disks_rx + .current() + .all_config_datasets() + .collect::>(); + + let started_marker_paths: Vec = config_dataset_paths + .iter() .map(|p| p.join(RSS_STARTED_FILENAME)) .collect(); - let completed_marker_paths: Vec = storage_manager - .get_latest_disks() - .await - .all_m2_mountpoints(CONFIG_DATASET) - .into_iter() + let completed_marker_paths: Vec = config_dataset_paths + .iter() .map(|p| p.join(RSS_COMPLETED_FILENAME)) .collect(); diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 84a4e33e2ce..31f2f6a70f8 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -20,12 +20,11 @@ //! of what other services Nexus wants to have executing on the sled. //! //! To accomplish this, the following interfaces are exposed: -//! - [ServiceManager::ensure_all_omicron_zones_persistent] exposes an API to -//! request a set of Omicron zones that should persist beyond reboot. +//! - [ServiceManager::start_omicron_zone] exposes an API to start a new Omicron +//! zone. //! - [ServiceManager::activate_switch] exposes an API to specifically enable //! or disable (via [ServiceManager::deactivate_switch]) the switch zone. 
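// Hypothetical sketch only: the reconciliation wait is deliberately left
// `unimplemented!` above until inventory reports reconciliation status. One
// plausible shape is a simple poll loop; `fetch_reconciled_gen` is an assumed
// callback (not an existing sled-agent-client call), and `u64` stands in for
// `Generation`.
use std::time::Duration;
use tokio::time::sleep;

async fn wait_for_reconciliation<F, Fut, E>(
    target_generation: u64,
    mut fetch_reconciled_gen: F,
) -> Result<(), E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<u64, E>>,
{
    loop {
        // Ask the target sled which config generation it last reconciled.
        let reconciled = fetch_reconciled_gen().await?;
        if reconciled >= target_generation {
            return Ok(());
        }
        // Not caught up yet; back off briefly and poll again.
        sleep(Duration::from_secs(1)).await;
    }
}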
-use crate::artifact_store::ArtifactStore; use crate::bootstrap::BootstrapNetworking; use crate::bootstrap::early_networking::{ EarlyNetworkSetup, EarlyNetworkSetupError, @@ -35,7 +34,6 @@ use crate::ddm_reconciler::DdmReconciler; use crate::metrics::MetricsRequestQueue; use crate::params::{DendriteAsic, OmicronZoneTypeExt}; use crate::profile::*; -use crate::zone_bundle::ZoneBundler; use anyhow::anyhow; use camino::{Utf8Path, Utf8PathBuf}; use clickhouse_admin_types::CLICKHOUSE_KEEPER_CONFIG_DIR; @@ -94,41 +92,33 @@ use omicron_common::backoff::{ BackoffError, retry_notify, retry_policy_internal_service_aggressive, }; use omicron_common::disk::{DatasetKind, DatasetName}; -use omicron_common::ledger::{self, Ledger, Ledgerable}; +use omicron_common::ledger::Ledgerable; use omicron_ddm_admin_client::DdmError; use omicron_uuid_kinds::OmicronZoneUuid; use rand::prelude::SliceRandom; -use sled_agent_types::{ - sled::SWITCH_ZONE_BASEBOARD_FILE, time_sync::TimeSync, - zone_bundle::ZoneBundleCause, -}; use sled_agent_zone_images::{ZoneImageSourceResolver, ZoneImageZpools}; +use sled_agent_config_reconciler::InternalDisksReceiver; +use sled_agent_types::sled::SWITCH_ZONE_BASEBOARD_FILE; use sled_hardware::SledMode; use sled_hardware::is_gimlet; use sled_hardware::underlay; use sled_hardware_types::Baseboard; use sled_storage::config::MountConfig; -use sled_storage::dataset::{CONFIG_DATASET, ZONE_DATASET}; -use sled_storage::manager::StorageHandle; +use sled_storage::dataset::{INSTALL_DATASET, ZONE_DATASET}; use slog::Logger; use slog_error_chain::InlineErrorChain; -use std::collections::BTreeMap; -use std::collections::HashSet; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; -use std::str::FromStr; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, OnceLock}; use tokio::io::AsyncWriteExt; use tokio::sync::Mutex; -use tokio::sync::{MutexGuard, oneshot}; +use tokio::sync::oneshot; use tokio::task::JoinHandle; use tufaceous_artifact::ArtifactHash; use uuid::Uuid; use illumos_utils::zone::Zones; -const IPV6_UNSPECIFIED: IpAddr = IpAddr::V6(Ipv6Addr::UNSPECIFIED); - // These are all the same binary. They just reside at different paths. const CLICKHOUSE_SERVER_BINARY: &str = "/opt/oxide/clickhouse_server/clickhouse"; @@ -157,9 +147,6 @@ pub enum Error { #[error("Failed to find device {device}")] MissingDevice { device: String }, - #[error("Failed to access ledger: {0}")] - Ledger(#[from] ledger::Error), - #[error("Sled Agent not initialized yet")] SledAgentNotReady, @@ -387,14 +374,6 @@ impl From for omicron_common::api::external::Error { } } -/// Result of [ServiceManager::load_services] -pub enum LoadServicesResult { - /// We didn't load anything, there wasn't anything to load - NoServicesToLoad, - /// We successfully loaded the zones from our ledger. - ServicesLoaded, -} - fn display_zone_init_errors(errors: &[(String, Box)]) -> String { if errors.len() == 1 { return format!( @@ -513,9 +492,6 @@ impl RealSystemApi { impl SystemApi for RealSystemApi {} -// The filename of the ledger, within the provided directory. -const ZONES_LEDGER_FILENAME: &str = "omicron-zones.json"; - /// Combines the Nexus-provided `OmicronZonesConfig` (which describes what Nexus /// wants for all of its zones) with the locally-determined configuration for /// these zones. @@ -729,57 +705,23 @@ enum SwitchZoneState { }, } -// The return type for `start_omicron_zones`. -// -// When multiple zones are started concurrently, some can fail while others -// succeed. 
This structure allows the function to return this nuanced -// information. -#[must_use] -struct StartZonesResult { - // The set of zones which have successfully started. - new_zones: Vec, - - // The set of (zone name, error) of zones that failed to start. - errors: Vec<(String, Error)>, -} - -// A running zone and the configuration which started it. -#[derive(Debug)] -struct OmicronZone { - runtime: RunningZone, - config: OmicronZoneConfigLocal, -} - -impl OmicronZone { - fn name(&self) -> &str { - self.runtime.name() - } -} - -type ZoneMap = BTreeMap; - /// Manages miscellaneous Sled-local services. pub struct ServiceManagerInner { log: Logger, global_zone_bootstrap_link_local_address: Ipv6Addr, switch_zone: Mutex, sled_mode: SledMode, - time_sync_config: TimeSyncConfig, time_synced: AtomicBool, switch_zone_maghemite_links: Vec, sidecar_revision: SidecarRevision, - // Zones representing running services - zones: Mutex, underlay_vnic_allocator: VnicAllocator, underlay_vnic: EtherstubVnic, bootstrap_vnic_allocator: VnicAllocator, ddm_reconciler: DdmReconciler, sled_info: OnceLock, switch_zone_bootstrap_address: Ipv6Addr, - storage: StorageHandle, - zone_bundler: ZoneBundler, zone_image_resolver: ZoneImageSourceResolver, - ledger_directory_override: OnceLock, + internal_disks_rx: InternalDisksReceiver, system_api: Box, } @@ -795,16 +737,6 @@ struct SledAgentInfo { metrics_queue: MetricsRequestQueue, } -pub(crate) enum TimeSyncConfig { - // Waits for NTP to confirm that time has been synchronized. - Normal, - // Skips timesync unconditionally. - Skip, - // Fails timesync unconditionally. - #[cfg(all(test, target_os = "illumos"))] - Fail, -} - #[derive(Clone)] pub struct ServiceManager { inner: Arc, @@ -909,39 +841,33 @@ impl ServiceManager { /// /// Args: /// - `log`: The logger - /// - `ddm_client`: Client pointed to our localhost ddmd + /// - `ddm_reconciler`: Handle for configuring our localhost ddmd /// - `bootstrap_networking`: Collection of etherstubs/VNICs set up when /// bootstrap agent begins /// - `sled_mode`: The sled's mode of operation (Gimlet vs Scrimlet). - /// - `time_sync_config`: Describes how the sled awaits synced time. /// - `sidecar_revision`: Rev of attached sidecar, if present. /// - `switch_zone_maghemite_links`: List of physical links on which /// maghemite should listen. - /// - `storage`: Shared handle to get the current state of disks/zpools. 
- #[allow(clippy::too_many_arguments)] + /// - `internal_disks_rx`: watch channel for changes to internal disks pub(crate) fn new( log: &Logger, ddm_reconciler: DdmReconciler, bootstrap_networking: BootstrapNetworking, sled_mode: SledMode, - time_sync_config: TimeSyncConfig, sidecar_revision: SidecarRevision, switch_zone_maghemite_links: Vec, - storage: StorageHandle, - zone_bundler: ZoneBundler, zone_image_resolver: ZoneImageSourceResolver, + internal_disks_rx: InternalDisksReceiver, ) -> Self { Self::new_inner( log, ddm_reconciler, bootstrap_networking, sled_mode, - time_sync_config, sidecar_revision, switch_zone_maghemite_links, - storage, - zone_bundler, zone_image_resolver, + internal_disks_rx, RealSystemApi::new(), ) } @@ -952,12 +878,10 @@ impl ServiceManager { ddm_reconciler: DdmReconciler, bootstrap_networking: BootstrapNetworking, sled_mode: SledMode, - time_sync_config: TimeSyncConfig, sidecar_revision: SidecarRevision, switch_zone_maghemite_links: Vec, - storage: StorageHandle, - zone_bundler: ZoneBundler, zone_image_resolver: ZoneImageSourceResolver, + internal_disks_rx: InternalDisksReceiver, system_api: Box, ) -> Self { let log = log.new(o!("component" => "ServiceManager")); @@ -971,11 +895,9 @@ impl ServiceManager { // Load the switch zone if it already exists? switch_zone: Mutex::new(SwitchZoneState::Disabled), sled_mode, - time_sync_config, time_synced: AtomicBool::new(false), sidecar_revision, switch_zone_maghemite_links, - zones: Mutex::new(BTreeMap::new()), underlay_vnic_allocator: VnicAllocator::new( "Service", bootstrap_networking.underlay_etherstub, @@ -991,25 +913,19 @@ impl ServiceManager { sled_info: OnceLock::new(), switch_zone_bootstrap_address: bootstrap_networking .switch_zone_bootstrap_ip, - storage, - zone_bundler, zone_image_resolver, - ledger_directory_override: OnceLock::new(), + internal_disks_rx, system_api, }), } } - #[cfg(all(test, target_os = "illumos"))] - fn override_ledger_directory(&self, path: Utf8PathBuf) { - self.inner.ledger_directory_override.set(path).unwrap(); - } + // TODO-john still needed? #[cfg(all(test, target_os = "illumos"))] fn override_image_directory(&self, path: Utf8PathBuf) { self.inner.zone_image_resolver.override_image_directory(path); } - pub(crate) fn ddm_reconciler(&self) -> &DdmReconciler { &self.inner.ddm_reconciler } @@ -1018,126 +934,6 @@ impl ServiceManager { self.inner.switch_zone_bootstrap_address } - // TODO: This function refers to an old, deprecated format for storing - // service information. It is not deprecated for cleanup purposes, but - // should otherwise not be called in new code. - async fn all_service_ledgers(&self) -> Vec { - pub const SERVICES_LEDGER_FILENAME: &str = "services.json"; - if let Some(dir) = self.inner.ledger_directory_override.get() { - return vec![dir.join(SERVICES_LEDGER_FILENAME)]; - } - let resources = self.inner.storage.get_latest_disks().await; - resources - .all_m2_mountpoints(CONFIG_DATASET) - .into_iter() - .map(|p| p.join(SERVICES_LEDGER_FILENAME)) - .collect() - } - - async fn all_omicron_zone_ledgers(&self) -> Vec { - if let Some(dir) = self.inner.ledger_directory_override.get() { - return vec![dir.join(ZONES_LEDGER_FILENAME)]; - } - let resources = self.inner.storage.get_latest_disks().await; - resources - .all_m2_mountpoints(CONFIG_DATASET) - .into_iter() - .map(|p| p.join(ZONES_LEDGER_FILENAME)) - .collect() - } - - // Loads persistent configuration about any Omicron-managed zones that we're - // supposed to be running. 
- async fn load_ledgered_zones( - &self, - // This argument attempts to ensure that the caller holds the right - // lock. - _map: &MutexGuard<'_, ZoneMap>, - ) -> Result>, Error> { - let log = &self.inner.log; - - // NOTE: This is a function where we used to access zones by "service - // ledgers". This format has since been deprecated, and these files, - // if they exist, should not be used. - // - // We try to clean them up at this spot. Deleting this "removal" code - // in the future should be a safe operation; this is a non-load-bearing - // cleanup. - for path in self.all_service_ledgers().await { - match tokio::fs::remove_file(&path).await { - Ok(_) => (), - Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => (), - Err(e) => { - warn!( - log, - "Failed to delete old service ledger"; - "err" => ?e, - "path" => ?path, - ); - } - } - } - - // Try to load the current software's zone ledger - let ledger_paths = self.all_omicron_zone_ledgers().await; - info!(log, "Loading Omicron zones from: {ledger_paths:?}"); - let maybe_ledger = - Ledger::::new(log, ledger_paths.clone()) - .await; - - let Some(ledger) = maybe_ledger else { - info!(log, "Loading Omicron zones - No zones detected"); - return Ok(None); - }; - - info!( - log, - "Loaded Omicron zones"; - "zones_config" => ?ledger.data() - ); - Ok(Some(ledger)) - } - - // TODO(https://github.com/oxidecomputer/omicron/issues/2973): - // - // The sled agent retries this function indefinitely at the call-site, but - // we could be smarter. - // - // - If we know that disks are missing, we could wait for them - // - We could permanently fail if we are able to distinguish other errors - // more clearly. - pub async fn load_services(&self) -> Result { - let log = &self.inner.log; - let mut existing_zones = self.inner.zones.lock().await; - let Some(mut ledger) = - self.load_ledgered_zones(&existing_zones).await? - else { - // Nothing found -- nothing to do. - info!( - log, - "Loading Omicron zones - \ - no zones nor old-format services found" - ); - return Ok(LoadServicesResult::NoServicesToLoad); - }; - - let zones_config = ledger.data_mut(); - info!( - log, - "Loaded Omicron zones"; - "zones_config" => ?zones_config - ); - let omicron_zones_config = - zones_config.clone().to_omicron_zones_config(); - - self.ensure_all_omicron_zones( - &mut existing_zones, - omicron_zones_config, - ) - .await?; - Ok(LoadServicesResult::ServicesLoaded) - } - /// Sets up "Sled Agent" information, including underlay info. /// /// Any subsequent calls after the first invocation return an error. @@ -1738,6 +1534,33 @@ impl ServiceManager { let zpools = ZoneImageZpools { root: &all_disks.mount_config().root, all_m2_zpools: all_disks.all_m2_zpools(), +/* TODO-john fix this + let zone_image_file_name = match image_source { + OmicronZoneImageSource::InstallDataset => None, + OmicronZoneImageSource::Artifact { hash } => Some(hash.to_string()), + }; + let internal_disks = self.inner.internal_disks_rx.current(); + let zone_image_paths = match image_source { + OmicronZoneImageSource::InstallDataset => { + // Look for the image in the ramdisk first + let mut zone_image_paths = + vec![Utf8PathBuf::from("/opt/oxide")]; + + // If the boot disk exists, look for the image in the "install" + // dataset there too. + if let Some(boot_zpool) = internal_disks.boot_disk_zpool() { + zone_image_paths.push(boot_zpool.dataset_mountpoint( + &internal_disks.mount_config().root, + INSTALL_DATASET, + )); + } + + zone_image_paths + } + OmicronZoneImageSource::Artifact { .. 
} => { + internal_disks.all_artifact_datasets().collect() + } +*/ }; let boot_zpool = all_disks.boot_disk().map(|(_, boot_zpool)| boot_zpool); @@ -3492,13 +3315,13 @@ impl ServiceManager { // - `all_u2_pools` provides a snapshot into durable storage on this sled, // which gives the storage manager an opportunity to validate the zone's // storage configuration against the reality of the current sled. - async fn start_omicron_zone( + pub(crate) async fn start_omicron_zone( &self, mount_config: &MountConfig, zone: &OmicronZoneConfig, time_is_synchronized: bool, - all_u2_pools: &Vec, - ) -> Result { + all_u2_pools: &[ZpoolName], + ) -> Result { // Ensure the zone has been fully removed before we try to boot it. // // This ensures that old "partially booted/stopped" zones do not @@ -3535,343 +3358,7 @@ impl ServiceManager { ) .await?; - Ok(OmicronZone { runtime, config }) - } - - // Concurrently attempts to start all zones identified by requests. - // - // This method is NOT idempotent. - // - // If we try to start ANY zones concurrently, the result is contained - // in the `StartZonesResult` value. This will contain the set of zones which - // were initialized successfully, as well as the set of zones which failed - // to start. - async fn start_omicron_zones( - &self, - mount_config: &MountConfig, - requests: impl Iterator + Clone, - time_is_synchronized: bool, - all_u2_pools: &Vec, - ) -> Result { - if let Some(name) = - requests.clone().map(|zone| zone.zone_name()).duplicates().next() - { - return Err(Error::BadServiceRequest { - service: name, - message: "Should not initialize zone twice".to_string(), - }); - } - - let futures = requests.map(|zone| async move { - self.start_omicron_zone( - mount_config, - &zone, - time_is_synchronized, - all_u2_pools, - ) - .await - .map_err(|err| (zone.zone_name(), err)) - }); - - let results = futures::future::join_all(futures).await; - - let mut new_zones = Vec::new(); - let mut errors = Vec::new(); - for result in results { - match result { - Ok(zone) => { - info!(self.inner.log, "Zone started"; "zone" => zone.name()); - new_zones.push(zone); - } - Err((name, error)) => { - warn!(self.inner.log, "Zone failed to start"; "zone" => &name); - errors.push((name, error)) - } - } - } - Ok(StartZonesResult { new_zones, errors }) - } - - /// Returns the current Omicron zone configuration - pub async fn omicron_zones_list(&self) -> OmicronZonesConfig { - let log = &self.inner.log; - - // We need to take the lock in order for the information in the ledger - // to be up-to-date. - let _existing_zones = self.inner.zones.lock().await; - - // Read the existing set of services from the ledger. - let zone_ledger_paths = self.all_omicron_zone_ledgers().await; - let ledger_data = match Ledger::::new( - log, - zone_ledger_paths.clone(), - ) - .await - { - Some(ledger) => ledger.data().clone(), - None => OmicronZonesConfigLocal::initial(), - }; - - ledger_data.to_omicron_zones_config() - } - - /// Ensures that particular Omicron zones are running - /// - /// These services will be instantiated by this function, and will be - /// recorded to a local file to ensure they start automatically on next - /// boot. - pub async fn ensure_all_omicron_zones_persistent( - &self, - mut request: OmicronZonesConfig, - ) -> Result<(), Error> { - let log = &self.inner.log; - - let mut existing_zones = self.inner.zones.lock().await; - - // Ensure that any zone images from the artifact store are present. 
- for zone in &request.zones { - if let Some(hash) = zone.image_source.artifact_hash() { - if let Err(err) = ArtifactStore::get_from_storage( - &self.inner.storage, - &self.inner.log, - hash, - ) - .await - { - return Err(Error::ZoneArtifactNotFound { - hash, - zone_kind: zone.zone_type.kind().report_str(), - id: zone.id, - err, - }); - } - } - } - - // Read the existing set of services from the ledger. - let zone_ledger_paths = self.all_omicron_zone_ledgers().await; - let mut ledger = match Ledger::::new( - log, - zone_ledger_paths.clone(), - ) - .await - { - Some(ledger) => ledger, - None => Ledger::::new_with( - log, - zone_ledger_paths.clone(), - OmicronZonesConfigLocal::initial(), - ), - }; - - let ledger_zone_config = ledger.data_mut(); - debug!(log, "ensure_all_omicron_zones_persistent"; - "request_generation" => request.generation.to_string(), - "ledger_generation" => - ledger_zone_config.omicron_generation.to_string(), - ); - - // Absolutely refuse to downgrade the configuration. - if ledger_zone_config.omicron_generation > request.generation { - return Err(Error::RequestedZoneConfigOutdated { - requested: request.generation, - current: ledger_zone_config.omicron_generation, - }); - } - - // If the generation is the same as what we're running, but the contents - // aren't, that's a problem, too. - if ledger_zone_config.omicron_generation == request.generation { - // Nexus should send us consistent zone orderings; however, we may - // reorder the zone list inside `ensure_all_omicron_zones`. To avoid - // equality checks failing only because the two lists are ordered - // differently, sort them both here before comparing. - let mut ledger_zones = - ledger_zone_config.clone().to_omicron_zones_config().zones; - - // We sort by ID because we assume no two zones have the same ID. If - // that assumption is wrong, we may return an error here where the - // conflict is soley the list orders, but in such a case that's the - // least of our problems. - ledger_zones.sort_by_key(|z| z.id); - request.zones.sort_by_key(|z| z.id); - - if ledger_zones != request.zones { - return Err(Error::RequestedConfigConflicts( - request.generation, - )); - } - } - - let omicron_generation = request.generation; - let ledger_generation = ledger_zone_config.ledger_generation; - self.ensure_all_omicron_zones(&mut existing_zones, request).await?; - let zones = existing_zones - .values() - .map(|omicron_zone| omicron_zone.config.clone()) - .collect(); - - let new_config = OmicronZonesConfigLocal { - omicron_generation, - ledger_generation, - zones, - }; - - // If the contents of the ledger would be identical, we can avoid - // performing an update and commit. - if *ledger_zone_config == new_config { - return Ok(()); - } - - // Update the zones in the ledger and write it back to both M.2s - *ledger_zone_config = new_config; - ledger.commit().await?; - - Ok(()) - } - - // Ensures that only the following Omicron zones are running. - // - // This method strives to be idempotent. - // - // - Starting and stopping zones is not an atomic operation - it's possible - // that we cannot start a zone after a previous one has been successfully - // created (or destroyed) intentionally. As a result, even in error cases, - // it's possible that the set of `existing_zones` changes. However, this set - // will only change in the direction of `new_request`: zones will only be - // removed if they ARE NOT part of `new_request`, and zones will only be - // added if they ARE part of `new_request`. 
- // - Zones are generally not updated in-place (i.e., two zone configurations - // that differ in any way are treated as entirely distinct), with an - // exception for backfilling the `filesystem_pool`, as long as the new - // request's filesystem pool matches the actual pool for that zones. This - // in-place update is allowed because changing only that property to match - // the runtime system does not require reconfiguring the zone or shutting it - // down and restarting it. - // - This method does not record any information such that these services - // are re-instantiated on boot. - async fn ensure_all_omicron_zones( - &self, - // The MutexGuard here attempts to ensure that the caller has the right - // lock held when calling this function. - existing_zones: &mut MutexGuard<'_, ZoneMap>, - new_request: OmicronZonesConfig, - ) -> Result<(), Error> { - // Do some data-normalization to ensure we can compare the "requested - // set" vs the "existing set" as HashSets. - let ReconciledNewZonesRequest { - zones_to_be_removed, - zones_to_be_added, - } = reconcile_running_zones_with_new_request( - existing_zones, - new_request, - &self.inner.log, - )?; - - // Destroy zones that should not be running - for zone in zones_to_be_removed { - self.zone_bundle_and_try_remove(existing_zones, &zone).await; - } - - // Collect information that's necessary to start new zones - let storage = self.inner.storage.get_latest_disks().await; - let mount_config = storage.mount_config(); - let all_u2_pools = storage.all_u2_zpools(); - let time_is_synchronized = - match self.timesync_get_locked(&existing_zones).await { - // Time is synchronized - Ok(TimeSync { sync: true, .. }) => true, - // Time is not synchronized, or we can't check - _ => false, - }; - - // Concurrently boot all new zones - let StartZonesResult { new_zones, errors } = self - .start_omicron_zones( - mount_config, - zones_to_be_added.iter(), - time_is_synchronized, - &all_u2_pools, - ) - .await?; - - // Add the new zones to our tracked zone set - existing_zones.extend( - new_zones.into_iter().map(|zone| (zone.name().to_string(), zone)), - ); - - // If any zones failed to start, exit with an error - if !errors.is_empty() { - return Err(Error::ZoneEnsure { errors }); - } - Ok(()) - } - - // Attempts to take a zone bundle and remove a zone. - // - // Logs, but does not return an error on failure. - async fn zone_bundle_and_try_remove( - &self, - existing_zones: &mut MutexGuard<'_, ZoneMap>, - zone: &OmicronZoneConfig, - ) { - let log = &self.inner.log; - let expected_zone_name = zone.zone_name(); - let Some(mut zone) = existing_zones.remove(&expected_zone_name) else { - warn!( - log, - "Expected to remove zone, but could not find it"; - "zone_name" => &expected_zone_name, - ); - return; - }; - // Ensure that the sled agent's metrics task is not tracking the zone's - // VNICs or OPTE ports. 
- if let Some(queue) = self.maybe_metrics_queue() { - match queue.untrack_zone_links(&zone.runtime) { - Ok(_) => debug!( - log, - "stopped tracking zone datalinks"; - "zone_name" => &expected_zone_name, - ), - Err(errors) => error!( - log, - "failed to stop tracking zone datalinks"; - "errors" => ?errors, - "zone_name" => &expected_zone_name - ), - } - } - debug!( - log, - "removing an existing zone"; - "zone_name" => &expected_zone_name, - ); - if let Err(e) = self - .inner - .zone_bundler - .create(&zone.runtime, ZoneBundleCause::UnexpectedZone) - .await - { - error!( - log, - "Failed to take bundle of unexpected zone"; - "zone_name" => &expected_zone_name, - InlineErrorChain::new(&e), - ); - } - if let Err(e) = zone.runtime.stop().await { - error!(log, "Failed to stop zone {}: {e}", zone.name()); - } - if let Err(e) = - self.clean_up_after_zone_shutdown(&zone.config.zone).await - { - error!( - log, - "Failed to clean up after stopping zone {}", zone.name(); - InlineErrorChain::new(&e), - ); - } + Ok(runtime) } // Ensures that if a zone is about to be installed, it does not exist. @@ -3969,7 +3456,7 @@ impl ServiceManager { &self, mount_config: &MountConfig, zone: &OmicronZoneConfig, - all_u2_pools: &Vec, + all_u2_pools: &[ZpoolName], ) -> Result { let name = zone.zone_name(); @@ -4068,110 +3555,12 @@ impl ServiceManager { } } - pub async fn timesync_get(&self) -> Result { - let existing_zones = self.inner.zones.lock().await; - self.timesync_get_locked(&existing_zones).await - } - - async fn timesync_get_locked( - &self, - existing_zones: &tokio::sync::MutexGuard<'_, ZoneMap>, - ) -> Result { - let skip_timesync = match &self.inner.time_sync_config { - TimeSyncConfig::Normal => false, - TimeSyncConfig::Skip => true, - #[cfg(all(test, target_os = "illumos"))] - TimeSyncConfig::Fail => { - info!(self.inner.log, "Configured to fail timesync checks"); - return Err(Error::TimeNotSynchronized); - } - }; - - if skip_timesync { - info!(self.inner.log, "Configured to skip timesync checks"); - self.on_time_sync().await; - return Ok(TimeSync { - sync: true, - ref_id: 0, - ip_addr: IPV6_UNSPECIFIED, - stratum: 0, - ref_time: 0.0, - correction: 0.00, - }); - }; - - let ntp_zone_name = - InstalledZone::get_zone_name(ZoneKind::NTP_PREFIX, None); - - let ntp_zone = existing_zones - .iter() - .find(|(name, _)| name.starts_with(&ntp_zone_name)) - .ok_or_else(|| Error::NtpZoneNotReady)? - .1; - - // XXXNTP - This could be replaced with a direct connection to the - // daemon using a patched version of the chrony_candm crate to allow - // a custom server socket path. From the GZ, it should be possible to - // connect to the UNIX socket at - // format!("{}/var/run/chrony/chronyd.sock", ntp_zone.root()) - - match ntp_zone.runtime.run_cmd(&["/usr/bin/chronyc", "-c", "tracking"]) - { - Ok(stdout) => { - let v: Vec<&str> = stdout.split(',').collect(); - - if v.len() > 9 { - let ref_id = u32::from_str_radix(v[0], 16) - .map_err(|_| Error::NtpZoneNotReady)?; - let ip_addr = - IpAddr::from_str(v[1]).unwrap_or(IPV6_UNSPECIFIED); - let stratum = u8::from_str(v[2]) - .map_err(|_| Error::NtpZoneNotReady)?; - let ref_time = f64::from_str(v[3]) - .map_err(|_| Error::NtpZoneNotReady)?; - let correction = f64::from_str(v[4]) - .map_err(|_| Error::NtpZoneNotReady)?; - - // Per `chronyc waitsync`'s implementation, if either the - // reference IP address is not unspecified or the reference - // ID is not 0 or 0x7f7f0101, we are synchronized to a peer. 
- let peer_sync = !ip_addr.is_unspecified() - || (ref_id != 0 && ref_id != 0x7f7f0101); - - let sync = stratum < 10 - && ref_time > 1234567890.0 - && peer_sync - && correction.abs() <= 0.05; - - if sync { - self.on_time_sync().await; - } - - Ok(TimeSync { - sync, - ref_id, - ip_addr, - stratum, - ref_time, - correction, - }) - } else { - Err(Error::NtpZoneNotReady) - } - } - Err(e) => { - error!(self.inner.log, "chronyc command failed: {}", e); - Err(Error::NtpZoneNotReady) - } - } - } - /// Check if the synchronization state of the sled has shifted to true and /// if so, execute the any out-of-band actions that need to be taken. /// /// This function only executes the out-of-band actions once, once the /// synchronization state has shifted to true. - async fn on_time_sync(&self) { + pub(crate) async fn on_time_sync(&self) { if self .inner .time_synced @@ -4988,973 +4377,9 @@ fn internal_dns_addrobj_name(gz_address_index: u32) -> String { format!("internaldns{gz_address_index}") } -#[derive(Debug)] -struct ReconciledNewZonesRequest { - zones_to_be_removed: HashSet, - zones_to_be_added: HashSet, -} - -fn reconcile_running_zones_with_new_request( - existing_zones: &mut MutexGuard<'_, ZoneMap>, - new_request: OmicronZonesConfig, - log: &Logger, -) -> Result { - reconcile_running_zones_with_new_request_impl( - existing_zones - .values_mut() - .map(|z| (&mut z.config.zone, z.runtime.root_zpool())), - new_request, - log, - ) -} - -// Separate helper function for `reconcile_running_zones_with_new_request` that -// allows unit tests to exercise the implementation without having to construct -// a `&mut MutexGuard<'_, ZoneMap>` for `existing_zones`. -fn reconcile_running_zones_with_new_request_impl<'a>( - existing_zones_with_runtime_zpool: impl Iterator< - Item = (&'a mut OmicronZoneConfig, &'a ZpoolOrRamdisk), - >, - new_request: OmicronZonesConfig, - log: &Logger, -) -> Result { - let mut existing_zones_by_id: BTreeMap<_, _> = - existing_zones_with_runtime_zpool - .map(|(zone, zpool)| (zone.id, (zone, zpool))) - .collect(); - let mut zones_to_be_added = HashSet::new(); - let mut zones_to_be_removed = HashSet::new(); - let mut zones_to_update = Vec::new(); - - for zone in new_request.zones.into_iter() { - let Some((existing_zone, runtime_zpool)) = - existing_zones_by_id.remove(&zone.id) - else { - // This zone isn't in the existing set; add it. - zones_to_be_added.insert(zone); - continue; - }; - - // We're already running this zone. If the config hasn't changed, we - // have nothing to do. - if zone == *existing_zone { - continue; - } - - // Special case for fixing #7229. We have an incoming request for a zone - // that we're already running except the config has changed; normally, - // we'd shut the zone down and restart it. However, if we get a new - // request that is: - // - // 1. setting `filesystem_pool`, and - // 2. the config for this zone is otherwise identical, and - // 3. the new `filesystem_pool` matches the pool on which the zone is - // installed - // - // then we don't want to shut the zone down and restart it, because the - // config hasn't actually changed in any meaningful way; this is just - // reconfigurator correcting #7229. - if let Some(new_filesystem_pool) = &zone.filesystem_pool { - let differs_only_by_filesystem_pool = { - // Clone `existing_zone` and mutate its `filesystem_pool` to - // match the new request; if they now match, that's the only - // field that's different. 
- let mut existing = existing_zone.clone(); - existing.filesystem_pool = Some(*new_filesystem_pool); - existing == zone - }; - - let runtime_zpool = match runtime_zpool { - ZpoolOrRamdisk::Zpool(zpool_name) => zpool_name, - ZpoolOrRamdisk::Ramdisk => { - // The only zone we run on the ramdisk is the switch - // zone, for which it isn't possible to get a zone - // request, so it should be fine to put an - // `unreachable!()` here. Out of caution for future - // changes, we'll instead return an error that the - // requested zone is on the ramdisk. - error!( - log, - "fix-7229: unexpectedly received request with a \ - zone config for a zone running on ramdisk"; - "new_config" => ?zone, - "existing_config" => ?existing_zone, - ); - return Err(Error::ZoneIsRunningOnRamdisk { - zone_id: zone.id, - }); - } - }; - - if differs_only_by_filesystem_pool { - if new_filesystem_pool == runtime_zpool { - // Our #7229 special case: the new config is only filling in - // the pool, and it does so correctly. Move on to the next - // zone in the request without adding this zone to either of - // our `zone_to_be_*` sets. - info!( - log, - "fix-7229: accepted new zone config that changes only \ - filesystem_pool"; - "new_config" => ?zone, - ); - - // We should update this `existing_zone`, but delay doing so - // until we've processed all zones (so if there are any - // failures later, we don't return having partially-updated - // the existing zones). - zones_to_update.push((existing_zone, zone)); - continue; - } else { - error!( - log, - "fix-7229: rejected new zone config that changes only \ - filesystem_pool (incorrect pool)"; - "new_config" => ?zone, - "expected_pool" => %runtime_zpool, - ); - return Err(Error::InvalidFilesystemPoolZoneConfig { - zone_id: zone.id, - expected_pool: *runtime_zpool, - got_pool: *new_filesystem_pool, - }); - } - } - } - - // End of #7229 special case: this zone is already running, but the new - // request has changed it in some way. We need to shut it down and - // restart it. - zones_to_be_removed.insert(existing_zone.clone()); - zones_to_be_added.insert(zone); - } - - // Any remaining entries in `existing_zones_by_id` should be shut down. - zones_to_be_removed - .extend(existing_zones_by_id.into_values().map(|(z, _)| z.clone())); - - // All zones have been handled successfully; commit any changes to existing - // zones we found in our "fix 7229" special case above. - let num_zones_updated = zones_to_update.len(); - for (existing_zone, new_zone) in zones_to_update { - *existing_zone = new_zone; - } - - info!( - log, - "ensure_all_omicron_zones: request reconciliation done"; - "num_zones_to_be_removed" => zones_to_be_removed.len(), - "num_zones_to_be_added" => zones_to_be_added.len(), - "num_zones_updated" => num_zones_updated, - ); - Ok(ReconciledNewZonesRequest { zones_to_be_removed, zones_to_be_added }) -} - -#[cfg(all(test, target_os = "illumos"))] -mod illumos_tests { - use crate::metrics; - - use super::*; - use illumos_utils::dladm::{ - BOOTSTRAP_ETHERSTUB_NAME, Etherstub, UNDERLAY_ETHERSTUB_NAME, - UNDERLAY_ETHERSTUB_VNIC_NAME, - }; - - use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; - use omicron_uuid_kinds::OmicronZoneUuid; - use sled_agent_zone_images::ZoneImageZpools; - use sled_storage::manager_test_harness::StorageManagerTestHarness; - use std::{ - net::{Ipv6Addr, SocketAddrV6}, - time::Duration, - }; - use tokio::sync::mpsc::error::TryRecvError; - use uuid::Uuid; - - // Just placeholders. Not used. 
- const GLOBAL_ZONE_BOOTSTRAP_IP: Ipv6Addr = Ipv6Addr::LOCALHOST; - const SWITCH_ZONE_BOOTSTRAP_IP: Ipv6Addr = Ipv6Addr::LOCALHOST; - - const EXPECTED_PORT: u16 = 12223; - - // Timeout within which we must have received a message about a zone's links - // to track. This is very generous. - const LINK_NOTIFICATION_TIMEOUT: Duration = Duration::from_secs(5); - - struct FakeSystemApi { - fake_install_dir: Utf8PathBuf, - dladm: Arc, - zones: Arc, - } - - impl FakeSystemApi { - fn new(fake_install_dir: Utf8PathBuf) -> Box { - Box::new(Self { - fake_install_dir, - dladm: illumos_utils::fakes::dladm::Dladm::new(), - zones: illumos_utils::fakes::zone::Zones::new(), - }) - } - } - - impl SystemApi for FakeSystemApi { - fn fake_install_dir(&self) -> Option<&Utf8Path> { - Some(&self.fake_install_dir) - } - - fn dladm(&self) -> Arc { - self.dladm.clone() - } - - fn zones(&self) -> Arc { - self.zones.clone() - } - } - - fn make_bootstrap_networking_config() -> BootstrapNetworking { - BootstrapNetworking { - bootstrap_etherstub: Etherstub( - BOOTSTRAP_ETHERSTUB_NAME.to_string(), - ), - global_zone_bootstrap_ip: GLOBAL_ZONE_BOOTSTRAP_IP, - global_zone_bootstrap_link_local_ip: GLOBAL_ZONE_BOOTSTRAP_IP, - switch_zone_bootstrap_ip: SWITCH_ZONE_BOOTSTRAP_IP, - underlay_etherstub: Etherstub(UNDERLAY_ETHERSTUB_NAME.to_string()), - underlay_etherstub_vnic: EtherstubVnic( - UNDERLAY_ETHERSTUB_VNIC_NAME.to_string(), - ), - } - } - - // Prepare to call "ensure" for a new service, then actually call "ensure". - async fn ensure_new_service( - mgr: &ServiceManager, - id: OmicronZoneUuid, - generation: Generation, - ) { - let address = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, EXPECTED_PORT, 0, 0); - try_new_service_of_type( - mgr, - id, - generation, - OmicronZoneType::InternalNtp { address }, - ) - .await - .expect("Could not create service"); - } - - async fn try_new_service_of_type( - mgr: &ServiceManager, - id: OmicronZoneUuid, - generation: Generation, - zone_type: OmicronZoneType, - ) -> Result<(), Error> { - mgr.ensure_all_omicron_zones_persistent(OmicronZonesConfig { - generation, - zones: vec![OmicronZoneConfig { - id, - zone_type, - filesystem_pool: None, - image_source: OmicronZoneImageSource::InstallDataset, - }], - }) - .await - } - - // Prepare to call "ensure" for a service which already exists. We should - // return the service without actually installing a new zone. - async fn ensure_existing_service( - mgr: &ServiceManager, - id: OmicronZoneUuid, - generation: Generation, - ) { - let address = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, EXPECTED_PORT, 0, 0); - mgr.ensure_all_omicron_zones_persistent(OmicronZonesConfig { - generation, - zones: vec![OmicronZoneConfig { - id, - zone_type: OmicronZoneType::InternalNtp { address }, - filesystem_pool: None, - image_source: OmicronZoneImageSource::InstallDataset, - }], - }) - .await - .unwrap(); - } - - // Prepare to drop the service manager. - // - // This will shut down all allocated zones, and delete their - // associated VNICs. - async fn drop_service_manager(mgr: ServiceManager) { - // Also send a message to the metrics task that the VNIC has been - // deleted. 
- let queue = mgr.metrics_queue(); - for zone in mgr.inner.zones.lock().await.values() { - if let Err(e) = queue.untrack_zone_links(&zone.runtime) { - error!( - mgr.inner.log, - "failed to stop tracking zone datalinks"; - "errors" => ?e, - ); - } - } - - // Explicitly drop the service manager - drop(mgr); - } - - struct TestConfig { - config_dir: camino_tempfile::Utf8TempDir, - } - - impl TestConfig { - async fn new() -> Self { - let config_dir = camino_tempfile::Utf8TempDir::new().unwrap(); - Self { config_dir } - } - - fn make_config(&self) -> Config { - Config { - sled_identifiers: SledIdentifiers { - rack_id: Uuid::new_v4(), - sled_id: Uuid::new_v4(), - model: "fake-gimlet".to_string(), - revision: 1, - serial: "fake-serial".to_string(), - }, - sidecar_revision: SidecarRevision::Physical( - "rev_whatever_its_a_test".to_string(), - ), - } - } - - fn override_paths(&self, mgr: &ServiceManager) { - let dir = self.config_dir.path(); - mgr.override_ledger_directory(dir.to_path_buf()); - mgr.override_image_directory(dir.to_path_buf()); - - // We test launching "fake" versions of the zones, but the - // logic to find paths relies on checking the existence of - // files. - std::fs::write(dir.join("oximeter.tar.gz"), "Not a real file") - .unwrap(); - std::fs::write(dir.join("ntp.tar.gz"), "Not a real file").unwrap(); - } - } - - async fn setup_storage(log: &Logger) -> StorageManagerTestHarness { - let mut harness = StorageManagerTestHarness::new(&log).await; - let raw_disks = - harness.add_vdevs(&["u2_test.vdev", "m2_test.vdev"]).await; - harness.handle().key_manager_ready().await; - let config = harness.make_config(1, &raw_disks); - let result = harness - .handle() - .omicron_physical_disks_ensure(config.clone()) - .await - .expect("Failed to ensure disks"); - assert!(!result.has_error(), "{:?}", result); - harness - } - - struct LedgerTestHelper<'a> { - log: slog::Logger, - storage_test_harness: StorageManagerTestHarness, - zone_bundler: ZoneBundler, - zone_image_resolver: ZoneImageSourceResolver, - test_config: &'a TestConfig, - } - - impl<'a> LedgerTestHelper<'a> { - async fn new(log: slog::Logger, test_config: &'a TestConfig) -> Self { - let storage_test_harness = setup_storage(&log).await; - let zone_bundler = ZoneBundler::new( - log.clone(), - storage_test_harness.handle().clone(), - Default::default(), - ) - .await; - - let mut storage_manager = storage_test_harness.handle().clone(); - let all_disks = storage_manager.get_latest_disks().await; - let (_, boot_zpool) = storage_manager.wait_for_boot_disk().await; - let zpools = ZoneImageZpools { - root: &all_disks.mount_config().root, - all_m2_zpools: all_disks.all_m2_zpools(), - }; - let zone_image_resolver = - ZoneImageSourceResolver::new(&log, &zpools, &boot_zpool); - - LedgerTestHelper { - log, - storage_test_harness, - zone_bundler, - zone_image_resolver, - test_config, - } - } - - async fn cleanup(&mut self) { - self.storage_test_harness.cleanup().await; - } - - fn new_service_manager( - &self, - system: Box, - ) -> ServiceManager { - self.new_service_manager_with_timesync(TimeSyncConfig::Skip, system) - } - - fn new_service_manager_with_timesync( - &self, - time_sync_config: TimeSyncConfig, - system: Box, - ) -> ServiceManager { - let log = &self.log; - let reconciler = - DdmReconciler::new(Ipv6Subnet::new(Ipv6Addr::LOCALHOST), log) - .expect("created DdmReconciler"); - let mgr = ServiceManager::new_inner( - log, - reconciler, - make_bootstrap_networking_config(), - SledMode::Auto, - time_sync_config, - 
SidecarRevision::Physical("rev-test".to_string()), - vec![], - self.storage_test_harness.handle().clone(), - self.zone_bundler.clone(), - self.zone_image_resolver.clone(), - system, - ); - self.test_config.override_paths(&mgr); - mgr - } - - async fn sled_agent_started( - log: &slog::Logger, - test_config: &TestConfig, - mgr: &ServiceManager, - metrics_queue: MetricsRequestQueue, - ) { - let port_manager = PortManager::new( - log.new(o!("component" => "PortManager")), - Ipv6Addr::new( - 0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, - ), - ); - - mgr.sled_agent_started( - test_config.make_config(), - port_manager, - Ipv6Addr::LOCALHOST, - Uuid::new_v4(), - None, - metrics_queue, - ) - .await - .unwrap(); - } - } - - #[tokio::test] - async fn test_ensure_service() { - let logctx = - omicron_test_utils::dev::test_setup_log("test_ensure_service"); - let test_config = TestConfig::new().await; - let mut helper = - LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - let mgr = helper.new_service_manager(FakeSystemApi::new( - test_config.config_dir.path().to_path_buf(), - )); - let (metrics_queue, mut metrics_rx) = MetricsRequestQueue::for_test(); - LedgerTestHelper::sled_agent_started( - &logctx.log, - &test_config, - &mgr, - metrics_queue, - ) - .await; - - let v1 = Generation::new(); - let found = mgr.omicron_zones_list().await; - assert_eq!(found.generation, v1); - assert!(found.zones.is_empty()); - - let v2 = v1.next(); - let id = OmicronZoneUuid::new_v4(); - ensure_new_service(&mgr, id, v2).await; - - let found = mgr.omicron_zones_list().await; - assert_eq!(found.generation, v2); - assert_eq!(found.zones.len(), 1); - assert_eq!(found.zones[0].id, id); - - // First check that we received the synced sled notification - let synced_message = tokio::time::timeout( - LINK_NOTIFICATION_TIMEOUT, - metrics_rx.recv(), - ).await.expect("Should have received a message about the sled being synced within the timeout") - .expect("Should have received a message about the sled being synced"); - assert_eq!( - synced_message, - metrics::Message::TimeSynced { sled_id: mgr.sled_id() }, - ); - - // Then, check that we received a message about the zone's VNIC. 
- let vnic_message = tokio::time::timeout( - LINK_NOTIFICATION_TIMEOUT, - metrics_rx.recv(), - ) - .await - .expect( - "Should have received a message about the zone's VNIC within the timeout" - ) - .expect("Should have received a message about the zone's VNIC"); - let zone_name = format!("oxz_ntp_{}", id); - assert_eq!( - vnic_message, - metrics::Message::TrackVnic { - zone_name, - name: "oxControlService0".into() - }, - ); - assert_eq!(metrics_rx.try_recv(), Err(TryRecvError::Empty)); - - drop_service_manager(mgr).await; - - helper.cleanup().await; - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_ensure_service_before_timesync() { - let logctx = omicron_test_utils::dev::test_setup_log( - "test_ensure_service_before_timesync", - ); - let test_config = TestConfig::new().await; - let mut helper = - LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - - let mgr = helper.new_service_manager_with_timesync( - TimeSyncConfig::Fail, - FakeSystemApi::new(test_config.config_dir.path().to_path_buf()), - ); - let (metrics_queue, mut metrics_rx) = MetricsRequestQueue::for_test(); - LedgerTestHelper::sled_agent_started( - &logctx.log, - &test_config, - &mgr, - metrics_queue, - ) - .await; - - let v1 = Generation::new(); - let found = mgr.omicron_zones_list().await; - assert_eq!(found.generation, v1); - assert!(found.zones.is_empty()); - - let v2 = v1.next(); - let id = OmicronZoneUuid::new_v4(); - - // Should fail: time has not yet synchronized. - let address = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, EXPECTED_PORT, 0, 0); - let result = try_new_service_of_type( - &mgr, - id, - v2, - OmicronZoneType::Oximeter { address }, - ) - .await; - - // First, ensure this is the right kind of error. - let err = result.unwrap_err(); - let errors = match &err { - Error::ZoneEnsure { errors } => errors, - err => panic!("unexpected result: {err:?}"), - }; - assert_eq!(errors.len(), 1); - assert_matches::assert_matches!( - errors[0].1, - Error::TimeNotSynchronized - ); - - // Ensure we have _not_ received a message about the zone's VNIC, - // because there isn't a zone. - assert_eq!(metrics_rx.try_recv(), Err(TryRecvError::Empty)); - - // Next, ensure this still converts to an "unavail" common error - let common_err = omicron_common::api::external::Error::from(err); - assert_matches::assert_matches!( - common_err, - omicron_common::api::external::Error::ServiceUnavailable { .. } - ); - - // Should succeed: we don't care that time has not yet synchronized (for - // this specific service). 
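The behavior the next call relies on — NTP zones are exempt from the time-sync requirement, while every other zone must wait — can be summarized by a small gate along these lines (a standalone sketch with stand-in types, not the actual ServiceManager code):

#[derive(Debug, PartialEq)]
enum ZoneKind {
    InternalNtp,
    BoundaryNtp,
    Oximeter, // stand-in for "any non-NTP zone"
}

#[derive(Debug, PartialEq)]
enum StartError {
    TimeNotSynchronized,
}

fn time_sync_gate(kind: &ZoneKind, time_synced: bool) -> Result<(), StartError> {
    match kind {
        // NTP zones are what bring time into sync, so they may always start.
        ZoneKind::InternalNtp | ZoneKind::BoundaryNtp => Ok(()),
        // Everything else must wait until time is synchronized.
        _ if time_synced => Ok(()),
        _ => Err(StartError::TimeNotSynchronized),
    }
}

// e.g. time_sync_gate(&ZoneKind::Oximeter, false) == Err(StartError::TimeNotSynchronized)
//      time_sync_gate(&ZoneKind::InternalNtp, false) == Ok(())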
- try_new_service_of_type( - &mgr, - id, - v2, - OmicronZoneType::InternalNtp { address }, - ) - .await - .unwrap(); - - drop_service_manager(mgr).await; - helper.cleanup().await; - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_ensure_service_which_already_exists() { - let logctx = omicron_test_utils::dev::test_setup_log( - "test_ensure_service_which_already_exists", - ); - let test_config = TestConfig::new().await; - let mut helper = - LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - let mgr = helper.new_service_manager(FakeSystemApi::new( - test_config.config_dir.path().to_path_buf(), - )); - let (metrics_queue, mut metrics_rx) = MetricsRequestQueue::for_test(); - LedgerTestHelper::sled_agent_started( - &logctx.log, - &test_config, - &mgr, - metrics_queue, - ) - .await; - - let v2 = Generation::new().next(); - let id = OmicronZoneUuid::new_v4(); - ensure_new_service(&mgr, id, v2).await; - let v3 = v2.next(); - ensure_existing_service(&mgr, id, v3).await; - let found = mgr.omicron_zones_list().await; - assert_eq!(found.generation, v3); - assert_eq!(found.zones.len(), 1); - assert_eq!(found.zones[0].id, id); - - // First, we will get a message about the sled being synced. - let synced_message = tokio::time::timeout( - LINK_NOTIFICATION_TIMEOUT, - metrics_rx.recv(), - ).await.expect("Should have received a message about the sled being synced within the timeout") - .expect("Should have received a message about the sled being synced"); - assert_eq!( - synced_message, - metrics::Message::TimeSynced { sled_id: mgr.sled_id() } - ); - - // In this case, the manager creates the zone once, and then "ensuring" - // it a second time is a no-op. So we simply expect the same message - // sequence as starting a zone for the first time. - let vnic_message = tokio::time::timeout( - LINK_NOTIFICATION_TIMEOUT, - metrics_rx.recv(), - ) - .await - .expect( - "Should have received a message about the zone's VNIC within the timeout" - ) - .expect("Should have received a message about the zone's VNIC"); - let zone_name = format!("oxz_ntp_{}", id); - assert_eq!( - vnic_message, - metrics::Message::TrackVnic { - zone_name, - name: "oxControlService0".into() - }, - ); - assert_eq!(metrics_rx.try_recv(), Err(TryRecvError::Empty)); - - drop_service_manager(mgr).await; - - helper.cleanup().await; - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_services_are_recreated_on_reboot() { - let logctx = omicron_test_utils::dev::test_setup_log( - "test_services_are_recreated_on_reboot", - ); - let test_config = TestConfig::new().await; - let mut helper = - LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - - // First, spin up a ServiceManager, create a new zone, and then tear - // down the ServiceManager. - let mgr = helper.new_service_manager(FakeSystemApi::new( - test_config.config_dir.path().to_path_buf(), - )); - let (metrics_queue, mut metrics_rx) = MetricsRequestQueue::for_test(); - LedgerTestHelper::sled_agent_started( - &logctx.log, - &test_config, - &mgr, - metrics_queue, - ) - .await; - - let v2 = Generation::new().next(); - let id = OmicronZoneUuid::new_v4(); - ensure_new_service(&mgr, id, v2).await; - - let sled_id = mgr.sled_id(); - drop_service_manager(mgr).await; - - // First, we will get a message about the sled being synced. 
- let synced_message = tokio::time::timeout( - LINK_NOTIFICATION_TIMEOUT, - metrics_rx.recv(), - ).await.expect("Should have received a message about the sled being synced within the timeout") - .expect("Should have received a message about the sled being synced"); - assert_eq!(synced_message, metrics::Message::TimeSynced { sled_id }); - - // Check that we received a message about the zone's VNIC. Since the - // manager is being dropped, it should also send a message about the - // VNIC being deleted. - let zone_name = format!("oxz_ntp_{}", id); - for expected_vnic_message in [ - metrics::Message::TrackVnic { - zone_name, - name: "oxControlService0".into(), - }, - metrics::Message::UntrackVnic { name: "oxControlService0".into() }, - ] { - println!( - "Expecting message from manager: {expected_vnic_message:#?}" - ); - let vnic_message = tokio::time::timeout( - LINK_NOTIFICATION_TIMEOUT, - metrics_rx.recv(), - ) - .await - .expect( - "Should have received a message about the zone's VNIC within the timeout" - ) - .expect("Should have received a message about the zone's VNIC"); - assert_eq!(vnic_message, expected_vnic_message,); - } - // Note that the manager has been dropped, so we should get - // disconnected, not empty. - assert_eq!(metrics_rx.try_recv(), Err(TryRecvError::Disconnected)); - - // Before we re-create the service manager - notably, using the same - // config file! - expect that a service gets initialized. - // TODO? - let mgr = helper.new_service_manager(FakeSystemApi::new( - test_config.config_dir.path().to_path_buf(), - )); - let (metrics_queue, mut metrics_rx) = MetricsRequestQueue::for_test(); - LedgerTestHelper::sled_agent_started( - &logctx.log, - &test_config, - &mgr, - metrics_queue, - ) - .await; - - let found = mgr.omicron_zones_list().await; - assert_eq!(found.generation, v2); - assert_eq!(found.zones.len(), 1); - assert_eq!(found.zones[0].id, id); - - // Note that the `omicron_zones_list()` request just returns the - // configured zones, stored in the on-disk ledger. There is nothing - // above that actually ensures that those zones exist, as far as I can - // tell! - assert_eq!(metrics_rx.try_recv(), Err(TryRecvError::Empty)); - - drop_service_manager(mgr).await; - - helper.cleanup().await; - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_services_do_not_persist_without_config() { - let logctx = omicron_test_utils::dev::test_setup_log( - "test_services_do_not_persist_without_config", - ); - let test_config = TestConfig::new().await; - let mut helper = - LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - - // First, spin up a ServiceManager, create a new zone, and then tear - // down the ServiceManager. - let mgr = helper.new_service_manager(FakeSystemApi::new( - test_config.config_dir.path().to_path_buf(), - )); - let metrics_handles = MetricsRequestQueue::for_test(); - LedgerTestHelper::sled_agent_started( - &logctx.log, - &test_config, - &mgr, - metrics_handles.0.clone(), - ) - .await; - - let v1 = Generation::new(); - let v2 = v1.next(); - let id = OmicronZoneUuid::new_v4(); - ensure_new_service(&mgr, id, v2).await; - drop_service_manager(mgr).await; - - // Next, delete the ledger. This means the zone we just created will not - // be remembered on the next initialization. - std::fs::remove_file( - test_config.config_dir.path().join(ZONES_LEDGER_FILENAME), - ) - .unwrap(); - - // Observe that the old service is not re-initialized. 
- let mgr = helper.new_service_manager(FakeSystemApi::new( - test_config.config_dir.path().to_path_buf(), - )); - let metrics_handles = MetricsRequestQueue::for_test(); - LedgerTestHelper::sled_agent_started( - &logctx.log, - &test_config, - &mgr, - metrics_handles.0.clone(), - ) - .await; - - let found = mgr.omicron_zones_list().await; - assert_eq!(found.generation, v1); - assert!(found.zones.is_empty()); - - drop_service_manager(mgr).await; - - helper.cleanup().await; - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_bad_generations() { - // Start like the normal tests. - let logctx = - omicron_test_utils::dev::test_setup_log("test_bad_generations"); - let test_config = TestConfig::new().await; - let mut helper = - LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - let mgr = helper.new_service_manager(FakeSystemApi::new( - test_config.config_dir.path().to_path_buf(), - )); - let metrics_handles = MetricsRequestQueue::for_test(); - LedgerTestHelper::sled_agent_started( - &logctx.log, - &test_config, - &mgr, - metrics_handles.0.clone(), - ) - .await; - - // Like the normal tests, set up a generation with one zone in it. - let v1 = Generation::new(); - let v2 = v1.next(); - let id1 = OmicronZoneUuid::new_v4(); - - let address = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, EXPECTED_PORT, 0, 0); - let mut zones = vec![OmicronZoneConfig { - id: id1, - zone_type: OmicronZoneType::InternalNtp { address }, - filesystem_pool: None, - image_source: OmicronZoneImageSource::InstallDataset, - }]; - - mgr.ensure_all_omicron_zones_persistent(OmicronZonesConfig { - generation: v2, - zones: zones.clone(), - }) - .await - .unwrap(); - - let found = mgr.omicron_zones_list().await; - assert_eq!(found.generation, v2); - assert_eq!(found.zones.len(), 1); - assert_eq!(found.zones[0].id, id1); - - // Make a new list of zones that we're going to try with a bunch of - // different generation numbers. - let id2 = OmicronZoneUuid::new_v4(); - zones.push(OmicronZoneConfig { - id: id2, - zone_type: OmicronZoneType::InternalNtp { address }, - filesystem_pool: None, - image_source: OmicronZoneImageSource::InstallDataset, - }); - - // Now try to apply that list with an older generation number. This - // shouldn't work and the reported state should be unchanged. - let error = mgr - .ensure_all_omicron_zones_persistent(OmicronZonesConfig { - generation: v1, - zones: zones.clone(), - }) - .await - .expect_err("unexpectedly went backwards in zones generation"); - assert!(matches!( - error, - Error::RequestedZoneConfigOutdated { requested, current } - if requested == v1 && current == v2 - )); - let found2 = mgr.omicron_zones_list().await; - assert_eq!(found, found2); - - // Now try to apply that list with the same generation number that we - // used before. This shouldn't work either. - let error = mgr - .ensure_all_omicron_zones_persistent(OmicronZonesConfig { - generation: v2, - zones: zones.clone(), - }) - .await - .expect_err("unexpectedly changed a single zone generation"); - assert!(matches!( - error, - Error::RequestedConfigConflicts(vr) if vr == v2 - )); - let found3 = mgr.omicron_zones_list().await; - assert_eq!(found, found3); - - // But we should be able to apply this new list of zones as long as we - // advance the generation number. 
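For reference, the generation rules these assertions exercise boil down to the comparison sketched below (plain integers and string slices stand in for the real `Generation` and zone types; this is an illustration, not the ledger code):

#[derive(Debug, PartialEq)]
enum GenerationError {
    /// The requested generation is older than the currently ledgered one.
    Outdated { requested: u64, current: u64 },
    /// The requested generation equals the current one but the payload differs.
    Conflict(u64),
}

fn check_generation(
    requested: u64,
    requested_zones: &[&str],
    current: u64,
    current_zones: &[&str],
) -> Result<(), GenerationError> {
    if requested < current {
        Err(GenerationError::Outdated { requested, current })
    } else if requested == current && requested_zones != current_zones {
        Err(GenerationError::Conflict(requested))
    } else {
        // Either an identical retransmit of the current config or a strictly
        // newer generation; both are accepted.
        Ok(())
    }
}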
- let v3 = v2.next(); - mgr.ensure_all_omicron_zones_persistent(OmicronZonesConfig { - generation: v3, - zones: zones.clone(), - }) - .await - .expect("failed to remove all zones in a new generation"); - let found4 = mgr.omicron_zones_list().await; - assert_eq!(found4.generation, v3); - let mut our_zones = zones; - our_zones.sort_by(|a, b| a.id.cmp(&b.id)); - let mut found_zones = found4.zones; - found_zones.sort_by(|a, b| a.id.cmp(&b.id)); - assert_eq!(our_zones, found_zones); - - drop_service_manager(mgr).await; - - helper.cleanup().await; - logctx.cleanup_successful(); - } -} - #[cfg(test)] mod test { use super::*; - use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; - use omicron_uuid_kinds::ZpoolUuid; use sled_agent_types::zone_bundle::ZoneBundleMetadata; #[test] @@ -5987,282 +4412,4 @@ mod test { &serde_json::to_string_pretty(&schema).unwrap(), ); } - - #[test] - fn test_fix_7229_zone_config_reconciliation() { - fn make_omicron_zone_config( - filesystem_pool: Option<&ZpoolName>, - ) -> OmicronZoneConfig { - OmicronZoneConfig { - id: OmicronZoneUuid::new_v4(), - filesystem_pool: filesystem_pool.cloned(), - zone_type: OmicronZoneType::Oximeter { - address: "[::1]:0".parse().unwrap(), - }, - image_source: OmicronZoneImageSource::InstallDataset, - } - } - - let logctx = - omicron_test_utils::dev::test_setup_log("test_ensure_service"); - let log = &logctx.log; - - let some_zpools = (0..10) - .map(|_| ZpoolName::new_external(ZpoolUuid::new_v4())) - .collect::>(); - - // Test 1: We have some zones; the new config makes no changes. - { - let mut existing = vec![ - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[0]), - ), - ( - make_omicron_zone_config(Some(&some_zpools[1])), - ZpoolOrRamdisk::Zpool(some_zpools[1]), - ), - ( - make_omicron_zone_config(Some(&some_zpools[2])), - ZpoolOrRamdisk::Zpool(some_zpools[2]), - ), - ]; - let new_request = OmicronZonesConfig { - generation: Generation::new().next(), - zones: existing.iter().map(|(zone, _)| zone.clone()).collect(), - }; - let reconciled = reconcile_running_zones_with_new_request_impl( - existing.iter_mut().map(|(z, p)| (z, &*p)), - new_request.clone(), - log, - ) - .expect("reconciled successfully"); - assert_eq!(reconciled.zones_to_be_removed, HashSet::new()); - assert_eq!(reconciled.zones_to_be_added, HashSet::new()); - assert_eq!( - existing.iter().map(|(z, _)| z.clone()).collect::>(), - new_request.zones, - ); - } - - // Test 2: We have some zones; the new config changes `filesystem_pool` - // to match our runtime pools (i.e., the #7229 fix). 
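The per-zone decision that Test 2 and the remaining cases exercise can be sketched as follows (a simplified stand-in for the real reconciliation over `OmicronZoneConfig`; only the fields relevant to the comparison are modeled):

#[derive(Clone, Debug, PartialEq)]
struct ZoneConfig {
    filesystem_pool: Option<&'static str>,
    address: &'static str, // stand-in for "everything else in the config"
}

#[derive(Debug, PartialEq)]
enum Decision {
    /// The zone keeps running: either nothing changed, or the only change is
    /// that `filesystem_pool` was filled in with the pool already in use.
    Keep,
    /// The only change fills in `filesystem_pool`, but with the wrong pool.
    RejectWrongPool,
    /// Something else changed: shut the zone down and restart it.
    RemoveAndReAdd,
}

fn reconcile_zone(
    existing: &ZoneConfig,
    new: &ZoneConfig,
    runtime_pool: &'static str,
) -> Decision {
    if existing == new {
        return Decision::Keep;
    }
    // Would the configs be identical if we only filled in the pool?
    let differs_only_by_pool = match new.filesystem_pool {
        Some(_) => {
            let mut patched = existing.clone();
            patched.filesystem_pool = new.filesystem_pool;
            &patched == new
        }
        None => false,
    };
    if differs_only_by_pool {
        if new.filesystem_pool == Some(runtime_pool) {
            Decision::Keep
        } else {
            Decision::RejectWrongPool
        }
    } else {
        Decision::RemoveAndReAdd
    }
}

In the real function a wrong pool aborts the entire request rather than failing one zone, so the set of existing configs is updated all-or-nothing, as Test 4 below verifies.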
- { - let mut existing = vec![ - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[0]), - ), - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[1]), - ), - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[2]), - ), - ]; - let new_request = OmicronZonesConfig { - generation: Generation::new().next(), - zones: existing - .iter() - .enumerate() - .map(|(i, (zone, _))| { - let mut zone = zone.clone(); - zone.filesystem_pool = Some(some_zpools[i]); - zone - }) - .collect(), - }; - let reconciled = reconcile_running_zones_with_new_request_impl( - existing.iter_mut().map(|(z, p)| (z, &*p)), - new_request.clone(), - log, - ) - .expect("reconciled successfully"); - assert_eq!(reconciled.zones_to_be_removed, HashSet::new()); - assert_eq!(reconciled.zones_to_be_added, HashSet::new()); - assert_eq!( - existing.iter().map(|(z, _)| z.clone()).collect::>(), - new_request.zones, - ); - } - - // Test 3: We have some zones; the new config changes `filesystem_pool` - // to match our runtime pools (i.e., the #7229 fix) but also changes - // something else in the config for the final zone; we should attempt to - // remove and re-add that final zone. - { - let mut existing = vec![ - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[0]), - ), - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[1]), - ), - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[2]), - ), - ]; - let new_request = OmicronZonesConfig { - generation: Generation::new().next(), - zones: existing - .iter() - .enumerate() - .map(|(i, (zone, _))| { - let mut zone = zone.clone(); - zone.filesystem_pool = Some(some_zpools[i]); - if i == 2 { - zone.zone_type = OmicronZoneType::Oximeter { - address: "[::1]:10000".parse().unwrap(), - }; - } - zone - }) - .collect(), - }; - let reconciled = reconcile_running_zones_with_new_request_impl( - existing.iter_mut().map(|(z, p)| (z, &*p)), - new_request.clone(), - log, - ) - .expect("reconciled successfully"); - assert_eq!( - reconciled.zones_to_be_removed, - HashSet::from([existing[2].0.clone()]), - ); - assert_eq!( - reconciled.zones_to_be_added, - HashSet::from([new_request.zones[2].clone()]), - ); - // The first two existing zones should have been updated to match - // the new request. - assert_eq!( - Vec::from_iter(existing[..2].iter().map(|(z, _)| z.clone())), - &new_request.zones[..2], - ); - } - - // Test 4: We have some zones; the new config changes `filesystem_pool` - // to match our runtime pools (i.e., the #7229 fix), except the new pool - // on the final zone is incorrect. We should get an error back. 
- { - let mut existing = vec![ - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[0]), - ), - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[1]), - ), - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[2]), - ), - ]; - let existing_orig = - existing.iter().map(|(z, _)| z.clone()).collect::>(); - let new_request = OmicronZonesConfig { - generation: Generation::new().next(), - zones: existing - .iter() - .enumerate() - .map(|(i, (zone, _))| { - let mut zone = zone.clone(); - if i < 2 { - zone.filesystem_pool = Some(some_zpools[i]); - } else { - zone.filesystem_pool = Some(some_zpools[4]); - } - zone - }) - .collect(), - }; - let err = reconcile_running_zones_with_new_request_impl( - existing.iter_mut().map(|(z, p)| (z, &*p)), - new_request.clone(), - log, - ) - .expect_err("should not have reconciled successfully"); - - match err { - Error::InvalidFilesystemPoolZoneConfig { - zone_id, - expected_pool, - got_pool, - } => { - assert_eq!(zone_id, existing[2].0.id); - assert_eq!(expected_pool, some_zpools[2]); - assert_eq!(got_pool, some_zpools[4]); - } - _ => panic!("unexpected error: {err}"), - } - // reconciliation failed, so the contents of our existing configs - // should not have changed (even though a couple of the changes - // were okay, we should either take all or none to maintain - // consistency with the generation-tagged OmicronZonesConfig) - assert_eq!( - existing.iter().map(|(z, _)| z.clone()).collect::>(), - existing_orig, - ); - } - - // Test 5: We have some zones. The new config applies #7229 fix to the - // first zone, doesn't include the remaining zones, and adds some new - // zones. We should see "the remaining zones" removed, the "new zones" - // added, and the 7229-fixed zone not in either set. - { - let mut existing = vec![ - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[0]), - ), - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[1]), - ), - ( - make_omicron_zone_config(None), - ZpoolOrRamdisk::Zpool(some_zpools[2]), - ), - ]; - let new_request = OmicronZonesConfig { - generation: Generation::new().next(), - zones: vec![ - { - let mut z = existing[0].0.clone(); - z.filesystem_pool = Some(some_zpools[0]); - z - }, - make_omicron_zone_config(None), - make_omicron_zone_config(None), - ], - }; - let reconciled = reconcile_running_zones_with_new_request_impl( - existing.iter_mut().map(|(z, p)| (z, &*p)), - new_request.clone(), - log, - ) - .expect("reconciled successfully"); - - assert_eq!( - reconciled.zones_to_be_removed, - HashSet::from_iter( - existing[1..].iter().map(|(z, _)| z.clone()) - ), - ); - assert_eq!( - reconciled.zones_to_be_added, - HashSet::from_iter(new_request.zones[1..].iter().cloned()), - ); - // Only the first existing zone is being kept; ensure it matches the - // new request. 
- assert_eq!(existing[0].0, new_request.zones[0]); - } - logctx.cleanup_successful(); - } } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 59eb99a4d6e..1699c06b3e7 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -25,7 +25,6 @@ use dropshot::TypedBody; use dropshot::endpoint; use nexus_sled_agent_shared::inventory::Inventory; use nexus_sled_agent_shared::inventory::OmicronSledConfig; -use nexus_sled_agent_shared::inventory::OmicronSledConfigResult; use nexus_sled_agent_shared::inventory::SledRole; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::SledVmmState; @@ -35,8 +34,6 @@ use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; use omicron_common::api::internal::shared::{ ResolvedVpcRouteSet, ResolvedVpcRouteState, SwitchPorts, }; -use omicron_common::disk::DatasetsConfig; -use omicron_common::disk::OmicronPhysicalDisksConfig; use range_requests::RequestContextEx; use sled_agent_api::*; use sled_agent_types::boot_disk::BootDiskOsWriteStatus; @@ -339,28 +336,14 @@ impl SledAgentApi for SledAgentSimImpl { )) } - async fn datasets_get( - rqctx: RequestContext, - ) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.datasets_config_list()?)) - } - - async fn omicron_physical_disks_get( - rqctx: RequestContext, - ) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.omicron_physical_disks_list()?)) - } - async fn omicron_config_put( rqctx: RequestContext, body: TypedBody, - ) -> Result, HttpError> { + ) -> Result { let sa = rqctx.context(); let body_args = body.into_inner(); - let result = sa.set_omicron_config(body_args)?; - Ok(HttpResponseOk(result)) + sa.set_omicron_config(body_args)?; + Ok(HttpResponseUpdatedNoContent()) } async fn sled_add( @@ -633,12 +616,6 @@ impl SledAgentApi for SledAgentSimImpl { method_unimplemented() } - async fn zpools_get( - _rqctx: RequestContext, - ) -> Result>, HttpError> { - method_unimplemented() - } - async fn sled_role_get( _rqctx: RequestContext, ) -> Result, HttpError> { diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 61adf7e0c59..46fdaa6a356 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -25,8 +25,8 @@ use dropshot::HttpError; use futures::Stream; use nexus_sled_agent_shared::inventory::{ ConfigReconcilerInventoryStatus, Inventory, InventoryDataset, - InventoryDisk, InventoryZpool, OmicronSledConfig, OmicronSledConfigResult, - OmicronZonesConfig, SledRole, + InventoryDisk, InventoryZpool, OmicronSledConfig, OmicronZonesConfig, + SledRole, }; use omicron_common::api::external::{ ByteCount, DiskState, Error, Generation, ResourceType, @@ -913,7 +913,9 @@ impl SledAgent { pub fn set_omicron_config( &self, config: OmicronSledConfig, - ) -> Result { + ) -> Result<(), HttpError> { + // TODO Update the simulator to work on `OmicronSledConfig` instead of + // the three separate legacy configs let disks_config = OmicronPhysicalDisksConfig { generation: config.generation, disks: config.disks.into_iter().collect(), @@ -926,16 +928,14 @@ impl SledAgent { generation: config.generation, zones: config.zones.into_iter().collect(), }; - let (disks, datasets) = { - let mut storage = self.storage.lock(); - let DisksManagementResult { status: disks } = - storage.omicron_physical_disks_ensure(disks_config)?; - let DatasetsManagementResult { status: datasets } = - 
storage.datasets_ensure(datasets_config)?; - (disks, datasets) - }; + + let mut storage = self.storage.lock(); + let _ = storage.omicron_physical_disks_ensure(disks_config)?; + let _ = storage.datasets_ensure(datasets_config)?; *self.fake_zones.lock().unwrap() = zones_config; - Ok(OmicronSledConfigResult { disks, datasets }) + //*self.sled_config.lock().unwrap() = Some(config); + + Ok(()) } pub fn omicron_zones_list(&self) -> OmicronZonesConfig { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index cff825ba474..737c4fe0168 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -4,26 +4,24 @@ //! Sled agent implementation -use crate::artifact_store::ArtifactStore; +use crate::artifact_store::{self, ArtifactStore}; use crate::boot_disk_os_writer::BootDiskOsWriter; use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::early_networking::EarlyNetworkSetupError; use crate::config::Config; use crate::instance_manager::InstanceManager; use crate::long_running_tasks::LongRunningTaskHandles; -use crate::metrics::MetricsManager; +use crate::metrics::{self, MetricsManager, MetricsRequestQueue}; use crate::nexus::{ NexusClient, NexusNotifierHandle, NexusNotifierInput, NexusNotifierTask, }; -use crate::params::OmicronZoneTypeExt; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; -use crate::storage_monitor::StorageMonitorHandle; use crate::support_bundle::logs::SupportBundleLogs; use crate::support_bundle::storage::SupportBundleManager; use crate::vmm_reservoir::{ReservoirMode, VmmReservoirManager}; -use crate::zone_bundle; use crate::zone_bundle::BundleError; +use crate::zone_bundle::{self, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use derive_more::From; @@ -31,15 +29,17 @@ use dropshot::HttpError; use futures::StreamExt; use futures::stream::FuturesUnordered; use illumos_utils::opte::PortManager; +use illumos_utils::running_zone::RunningZone; +use illumos_utils::zpool::ZpoolName; use nexus_sled_agent_shared::inventory::{ - ConfigReconcilerInventoryStatus, Inventory, InventoryDataset, - InventoryDisk, InventoryZpool, OmicronSledConfig, OmicronSledConfigResult, - OmicronZonesConfig, SledRole, + Inventory, OmicronSledConfig, SledRole, }; use omicron_common::address::{ Ipv6Subnet, SLED_PREFIX, get_sled_address, get_switch_zone_address, }; -use omicron_common::api::external::{ByteCount, ByteCountRangeError, Vni}; +use omicron_common::api::external::{ + ByteCount, ByteCountRangeError, Generation, Vni, +}; use omicron_common::api::internal::nexus::{DiskRuntimeState, SledVmmState}; use omicron_common::api::internal::shared::{ ExternalIpGatewayMap, HostPortConfig, RackNetworkConfig, @@ -49,13 +49,14 @@ use omicron_common::api::internal::shared::{ use omicron_common::backoff::{ BackoffError, retry_notify, retry_policy_internal_service_aggressive, }; -use omicron_common::disk::{ - DatasetsConfig, DatasetsManagementResult, DisksManagementResult, - OmicronPhysicalDisksConfig, -}; +use omicron_common::disk::M2Slot; use omicron_ddm_admin_client::Client as DdmAdminClient; use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid}; -use sled_agent_api::Zpool; +use sled_agent_config_reconciler::{ + ConfigReconcilerHandle, InternalDisksReceiver, LedgerNewConfigError, + LedgerTaskError, ReconcilerInventory, SledAgentArtifactStore, + SledAgentFacilities, TimeSyncStatus, +}; use sled_agent_types::disk::DiskStateRequested; use 
sled_agent_types::early_networking::EarlyNetworkConfig; use sled_agent_types::instance::{ @@ -66,20 +67,21 @@ use sled_agent_types::sled::{BaseboardId, StartSledAgentRequest}; use sled_agent_types::time_sync::TimeSync; use sled_agent_types::zone_bundle::{ BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, - PriorityOrder, StorageLimit, ZoneBundleMetadata, + PriorityOrder, StorageLimit, ZoneBundleCause, ZoneBundleMetadata, }; use sled_diagnostics::SledDiagnosticsCmdError; use sled_diagnostics::SledDiagnosticsCmdOutput; use sled_hardware::{HardwareManager, MemoryReservations, underlay}; use sled_hardware_types::Baseboard; use sled_hardware_types::underlay::BootstrapInterface; -use sled_storage::manager::StorageHandle; +use sled_storage::config::MountConfig; use slog::Logger; use slog_error_chain::InlineErrorChain; use sprockets_tls::keys::SprocketsConfig; use std::collections::BTreeMap; use std::net::{Ipv6Addr, SocketAddrV6}; use std::sync::Arc; +use tufaceous_artifact::ArtifactHash; use uuid::Uuid; use illumos_utils::dladm::Dladm; @@ -115,9 +117,6 @@ pub enum Error { #[error("Failed to operate on underlay device: {0}")] Underlay(#[from] underlay::Error), - #[error("Failed to request firewall rules")] - FirewallRequest(#[source] nexus_client::Error), - #[error(transparent)] Services(#[from] crate::services::Error), @@ -130,9 +129,6 @@ pub enum Error { #[error("Error managing storage: {0}")] Storage(#[from] sled_storage::error::Error), - #[error("Error monitoring storage: {0}")] - StorageMonitor(#[from] crate::storage_monitor::Error), - #[error("Error updating: {0}")] Download(#[from] crate::updates::Error), @@ -171,6 +167,9 @@ pub enum Error { #[error(transparent)] RepoDepotStart(#[from] crate::artifact_store::StartError), + + #[error("Time not yet synchronized")] + TimeNotSynchronized, } impl From for omicron_common::api::external::Error { @@ -338,12 +337,8 @@ struct SledAgentInner { // This is used for idempotence checks during RSS/Add-Sled internal APIs start_request: StartSledAgentRequest, - // Component of Sled Agent responsible for storage and dataset management. - storage: StorageHandle, - - // Component of Sled Agent responsible for monitoring storage and updating - // dump devices. - storage_monitor: StorageMonitorHandle, + // Handle to the sled-agent-config-reconciler system. + config_reconciler: Arc, // Component of Sled Agent responsible for managing Propolis instances. instances: InstanceManager, @@ -357,9 +352,6 @@ struct SledAgentInner { // Other Oxide-controlled services running on this Sled. services: ServiceManager, - // Connection to Nexus. - nexus_client: NexusClient, - // A mechanism for notifiying nexus about sled-agent updates nexus_notifier: NexusNotifierHandle, @@ -382,7 +374,7 @@ struct SledAgentInner { probes: ProbeManager, // Component of Sled Agent responsible for managing the artifact store. - repo_depot: dropshot::HttpServer>, + repo_depot: dropshot::HttpServer>>, } impl SledAgentInner { @@ -430,12 +422,14 @@ impl SledAgent { .cleanup_snapshots() .await; - let storage_manager = &long_running_task_handles.storage_manager; - let boot_disk = storage_manager - .get_latest_disks() - .await - .boot_disk() - .ok_or_else(|| Error::BootDiskNotFound)?; + let config_reconciler = + Arc::clone(&long_running_task_handles.config_reconciler); + let boot_disk_zpool = config_reconciler + .internal_disks_rx() + .current() + .boot_disk_zpool() + .ok_or_else(|| Error::BootDiskNotFound)? 
+ .clone(); // Configure a swap device of the configured size before other system setup. match config.swap_device_size_gb { @@ -443,7 +437,7 @@ impl SledAgent { info!(log, "Requested swap device of size {} GiB", sz); crate::swap_device::ensure_swap_device( &parent_log, - &boot_disk.1, + &boot_disk_zpool, sz, )?; } @@ -456,7 +450,8 @@ impl SledAgent { } info!(log, "Mounting backing filesystems"); - crate::backing_fs::ensure_backing_fs(&parent_log, &boot_disk.1).await?; + crate::backing_fs::ensure_backing_fs(&parent_log, &boot_disk_zpool) + .await?; // TODO-correctness Bootstrap-agent already ensures the underlay // etherstub and etherstub VNIC exist on startup - could it pass them @@ -549,7 +544,8 @@ impl SledAgent { nexus_client.clone(), instance_vnic_allocator, port_manager.clone(), - storage_manager.clone(), + config_reconciler.currently_managed_zpools_rx().clone(), + config_reconciler.available_datasets_rx(), long_running_task_handles.zone_bundler.clone(), vmm_reservoir_manager.clone(), metrics_manager.request_queue(), @@ -598,9 +594,30 @@ impl SledAgent { .await .expect( "Expected an infinite retry loop getting \ - network config from bootstore", + network config from bootstore", ); + let artifact_store = Arc::new( + ArtifactStore::new( + &log, + config_reconciler.internal_disks_rx().clone(), + Some(Arc::clone(&config_reconciler)), + ) + .await, + ); + + // Start reconciling against our ledgered sled config. + config_reconciler.spawn_reconciliation_task( + etherstub_vnic, + ReconcilerFacilities { + service_manager: services.clone(), + metrics_queue: metrics_manager.request_queue(), + zone_bundler: long_running_task_handles.zone_bundler.clone(), + }, + SledAgentArtifactStoreWrapper(Arc::clone(&artifact_store)), + &parent_log, + ); + services .sled_agent_started( svc_config, @@ -612,14 +629,8 @@ impl SledAgent { ) .await?; - let repo_depot = ArtifactStore::new( - &log, - storage_manager.clone(), - Some(services.clone()), - ) - .await - .start(sled_address, &config.dropshot) - .await?; + let repo_depot = + artifact_store.start(sled_address, &config.dropshot).await?; // Spawn a background task for managing notifications to nexus // about this sled-agent. @@ -642,27 +653,26 @@ impl SledAgent { request.body.id.into_untyped_uuid(), nexus_client.clone(), etherstub.clone(), - storage_manager.clone(), port_manager.clone(), metrics_manager.request_queue(), + config_reconciler.available_datasets_rx(), log.new(o!("component" => "ProbeManager")), ); + let currently_managed_zpools_rx = + config_reconciler.currently_managed_zpools_rx().clone(); + let sled_agent = SledAgent { inner: Arc::new(SledAgentInner { id: request.body.id, subnet: request.body.subnet, start_request: request, - storage: long_running_task_handles.storage_manager.clone(), - storage_monitor: long_running_task_handles - .storage_monitor_handle - .clone(), + config_reconciler, instances, probes, hardware: long_running_task_handles.hardware_manager.clone(), port_manager, services, - nexus_client, nexus_notifier: nexus_notifier_handle, rack_network_config, zone_bundler: long_running_task_handles.zone_bundler.clone(), @@ -675,7 +685,7 @@ impl SledAgent { sprockets: config.sprockets.clone(), }; - sled_agent.inner.probes.run().await; + sled_agent.inner.probes.run(currently_managed_zpools_rx).await; // We immediately add a notification to the request queue about our // existence. If inspection of the hardware later informs us that we're @@ -686,66 +696,17 @@ impl SledAgent { Ok(sled_agent) } - /// Load services for which we're responsible. 
- /// - /// Blocks until all services have started, retrying indefinitely on - /// failure. - pub(crate) async fn load_services(&self) { - info!(self.log, "Loading cold boot services"); - retry_notify( - retry_policy_internal_service_aggressive(), - || async { - // Load as many services as we can, and don't exit immediately - // upon failure. - let load_services_result = - self.inner.services.load_services().await.map_err(|err| { - BackoffError::transient(Error::from(err)) - }); - - // If there wasn't any work to do, we're done immediately. - if matches!( - load_services_result, - Ok(services::LoadServicesResult::NoServicesToLoad) - ) { - info!( - self.log, - "load_services exiting early; no services to be loaded" - ); - return Ok(()); - } - - // Otherwise, request firewall rule updates for as many services as - // we can. Note that we still make this request even if we only - // partially load some services. - let firewall_result = self - .request_firewall_update() - .await - .map_err(|err| BackoffError::transient(err)); - - // Only complete if we have loaded all services and firewall - // rules successfully. - load_services_result.and(firewall_result) - }, - |err, delay| { - warn!( - self.log, - "Failed to load services, will retry in {:?}", delay; - "error" => ?err, - ); - }, - ) - .await - .unwrap(); // we retry forever, so this can't fail - } - /// Accesses the [SupportBundleManager] API. pub(crate) fn as_support_bundle_storage(&self) -> SupportBundleManager<'_> { - SupportBundleManager::new(&self.log, self.storage()) + SupportBundleManager::new(&self.log, &*self.inner.config_reconciler) } /// Accesses the [SupportBundleLogs] API. pub(crate) fn as_support_bundle_logs(&self) -> SupportBundleLogs<'_> { - SupportBundleLogs::new(&self.log, self.storage()) + SupportBundleLogs::new( + &self.log, + self.inner.config_reconciler.internal_disks_rx(), + ) } pub(crate) fn switch_zone_underlay_info( @@ -770,20 +731,6 @@ impl SledAgent { self.sprockets.clone() } - /// Requests firewall rules from Nexus. - /// - /// Does not retry upon failure. - async fn request_firewall_update(&self) -> Result<(), Error> { - let sled_id = self.inner.id; - - self.inner - .nexus_client - .sled_firewall_rules_request(&sled_id) - .await - .map_err(|err| Error::FirewallRequest(err))?; - Ok(()) - } - /// Trigger a request to Nexus informing it that the current sled exists, /// with information about the existing set of hardware. pub(crate) async fn notify_nexus_about_self(&self, log: &Logger) { @@ -874,212 +821,13 @@ impl SledAgent { self.inner.zone_bundler.cleanup().await.map_err(Error::from) } - pub async fn datasets_config_list(&self) -> Result { - Ok(self.storage().datasets_config_list().await?) - } - - async fn datasets_ensure( - &self, - config: DatasetsConfig, - ) -> Result { - info!(self.log, "datasets ensure"); - let datasets_result = self.storage().datasets_ensure(config).await?; - info!(self.log, "datasets ensure: Updated storage"); - - // TODO(https://github.com/oxidecomputer/omicron/issues/6177): - // At the moment, we don't actually remove any datasets -- this function - // just adds new datasets. - // - // Once we start removing old datasets, we should probably ensure that - // they are not longer in-use before returning (similar to - // omicron_physical_disks_ensure). - - Ok(datasets_result) - } - - /// Requests the set of physical disks currently managed by the Sled Agent. 
- /// - /// This should be contrasted by the set of disks in the inventory, which - /// may contain a slightly different set, if certain disks are not expected - /// to be in-use by the broader control plane. - pub async fn omicron_physical_disks_list( - &self, - ) -> Result { - Ok(self.storage().omicron_physical_disks_list().await?) - } - - /// Ensures that the specific set of Omicron Physical Disks are running - /// on this sled, and that no other disks are being used by the control - /// plane (with the exception of M.2s, which are always automatically - /// in-use). - async fn omicron_physical_disks_ensure( - &self, - config: OmicronPhysicalDisksConfig, - ) -> Result { - info!(self.log, "physical disks ensure"); - // Tell the storage subsystem which disks should be managed. - let disk_result = - self.storage().omicron_physical_disks_ensure(config).await?; - info!(self.log, "physical disks ensure: Updated storage"); - - // Grab a view of the latest set of disks, alongside a generation - // number. - // - // This generation is at LEAST as high as our last call through - // omicron_physical_disks_ensure. It may actually be higher, if a - // concurrent operation occurred. - // - // "latest_disks" has a generation number, which is important for other - // subcomponents of Sled Agent to consider. If multiple requests to - // ensure disks arrive concurrently, it's important to "only advance - // forward" as requested by Nexus. - // - // For example: if we receive the following requests concurrently: - // - Use Disks {A, B, C}, generation = 1 - // - Use Disks {A, B, C, D}, generation = 2 - // - // If we ignore generation numbers, it's possible that we start using - // "disk D" -- e.g., for instance filesystems -- and then immediately - // delete it when we process the request with "generation 1". - // - // By keeping these requests ordered, we prevent this thrashing, and - // ensure that we always progress towards the last-requested state. - let latest_disks = self.storage().get_latest_disks().await; - let our_gen = latest_disks.generation(); - info!(self.log, "physical disks ensure: Propagating new generation of disks"; "generation" => ?our_gen); - - // Ensure that the StorageMonitor, and the dump devices, have committed - // to start using new disks and stop using old ones. - self.inner.storage_monitor.await_generation(*our_gen).await?; - info!(self.log, "physical disks ensure: Updated storage monitor"); - - // Ensure that the ZoneBundler, if it was creating a bundle referencing - // the old U.2s, has stopped using them. - self.inner.zone_bundler.await_completion_of_prior_bundles().await; - info!(self.log, "physical disks ensure: Updated zone bundler"); - - // Ensure that all probes, at least after our call to - // "omicron_physical_disks_ensure", stop using any disks that - // may have been in-service from before that request. - self.inner.probes.use_only_these_disks(&latest_disks).await; - info!(self.log, "physical disks ensure: Updated probes"); - - // Do the same for instances - mark them failed if they were using - // expunged disks. - self.inner.instances.use_only_these_disks(latest_disks).await?; - info!(self.log, "physical disks ensure: Updated instances"); - - Ok(disk_result) - } - /// Ensures that the specific sets of disks, datasets, and zones specified /// by `config` are running. - /// - /// This method currently blocks while each of disks, datasets, and zones - /// are ensured in that order; a failure on one prevents any attempt to - /// ensure the subsequent step(s). 
pub async fn set_omicron_config( &self, config: OmicronSledConfig, - ) -> Result { - // Until the config-reconciler work lands: unpack the unified config - // into the three split configs for indepenedent ledgering. - let disks_config = OmicronPhysicalDisksConfig { - generation: config.generation, - disks: config.disks.into_iter().collect(), - }; - - let disks = self.omicron_physical_disks_ensure(disks_config).await?; - - // If we only had partial success deploying disks, don't proceed. - if disks.has_error() { - return Ok(OmicronSledConfigResult { - disks: disks.status, - datasets: Vec::new(), - }); - } - - let datasets_config = DatasetsConfig { - generation: config.generation, - datasets: config.datasets.into_iter().map(|d| (d.id, d)).collect(), - }; - - let datasets = self.datasets_ensure(datasets_config).await?; - - // If we only had partial success deploying datasets, don't proceed. - if datasets.has_error() { - return Ok(OmicronSledConfigResult { - disks: disks.status, - datasets: datasets.status, - }); - } - - let zones_config = OmicronZonesConfig { - generation: config.generation, - zones: config.zones.into_iter().collect(), - }; - - self.omicron_zones_ensure(zones_config).await?; - - Ok(OmicronSledConfigResult { - disks: disks.status, - datasets: datasets.status, - }) - } - - /// Ensures that the specific set of Omicron zones are running as configured - /// (and that no other zones are running) - async fn omicron_zones_ensure( - &self, - requested_zones: OmicronZonesConfig, - ) -> Result<(), Error> { - // TODO(https://github.com/oxidecomputer/omicron/issues/6043): - // - If these are the set of filesystems, we should also consider - // removing the ones which are not listed here. - // - It's probably worth sending a bulk request to the storage system, - // rather than requesting individual datasets. - for zone in &requested_zones.zones { - let Some(dataset_name) = zone.dataset_name() else { - continue; - }; - - // NOTE: This code will be deprecated by https://github.com/oxidecomputer/omicron/pull/7160 - // - // However, we need to ensure that all blueprints have datasets - // within them before we can remove this back-fill. - // - // Therefore, we do something hairy here: We ensure the filesystem - // exists, but don't specify any dataset UUID value. - // - // This means that: - // - If the dataset exists and has a UUID, this will be a no-op - // - If the dataset doesn't exist, it'll be created without its - // oxide:uuid zfs property set - // - If a subsequent call to "datasets_ensure" tries to set a UUID, - // it should be able to get set (once). - self.inner.storage.upsert_filesystem(None, dataset_name).await?; - } - - self.inner - .services - .ensure_all_omicron_zones_persistent(requested_zones) - .await?; - Ok(()) - } - - /// Gets the sled's current list of all zpools. 
- pub async fn zpools_get(&self) -> Vec { - self.inner - .storage - .get_latest_disks() - .await - .get_all_zpools() - .into_iter() - .map(|(name, variant)| Zpool { - id: name.id(), - disk_type: variant.into(), - }) - .collect() + ) -> Result, LedgerTaskError> { + self.inner.config_reconciler.set_sled_config(config).await } /// Returns whether or not the sled believes itself to be a scrimlet @@ -1192,7 +940,7 @@ impl SledAgent { todo!("Disk attachment not yet implemented"); } - pub fn artifact_store(&self) -> &ArtifactStore { + pub fn artifact_store(&self) -> &ArtifactStore { &self.inner.repo_depot.app_private() } @@ -1249,7 +997,18 @@ impl SledAgent { /// Gets the sled's current time synchronization state pub async fn timesync_get(&self) -> Result { - self.inner.services.timesync_get().await.map_err(Error::from) + let status = self.inner.config_reconciler.timesync_status(); + + // TODO-cleanup we could give a more specific error cause in the + // `FailedToGetSyncStatus` case. + match status { + TimeSyncStatus::NotYetChecked + | TimeSyncStatus::ConfiguredToSkip + | TimeSyncStatus::FailedToGetSyncStatus(_) => { + Err(Error::TimeNotSynchronized) + } + TimeSyncStatus::TimeSync(time_sync) => Ok(time_sync), + } } pub async fn ensure_scrimlet_host_ports( @@ -1314,8 +1073,15 @@ impl SledAgent { Ok(()) } - pub(crate) fn storage(&self) -> &StorageHandle { - &self.inner.storage + pub(crate) fn boot_image_raw_devfs_path( + &self, + slot: M2Slot, + ) -> Option>> { + self.inner + .config_reconciler + .internal_disks_rx() + .current() + .image_raw_devfs_path(slot) } pub(crate) fn boot_disk_os_writer(&self) -> &BootDiskOsWriter { @@ -1359,94 +1125,14 @@ impl SledAgent { let sled_role = if is_scrimlet { SledRole::Scrimlet } else { SledRole::Gimlet }; - let mut disks = vec![]; - let mut zpools = vec![]; - let mut datasets = vec![]; - let (all_disks, disks_config, datasets_config, omicron_zones) = tokio::join!( - self.storage().get_latest_disks(), - self.storage().omicron_physical_disks_list(), - self.storage().datasets_config_list(), - self.inner.services.omicron_zones_list() - ); - - // RSS asks for our inventory _before_ it sends us an - // `OmicronSledConfig`; echo back the default (empty) disk and dataset - // configs if we have no ledger at all. 
- let disks_config = match disks_config { - Ok(disks_config) => disks_config, - Err(sled_storage::error::Error::LedgerNotFound) => { - OmicronPhysicalDisksConfig::default() - } - Err(err) => return Err(InventoryError::GetDisksConfig(err)), - }; - let datasets_config = match datasets_config { - Ok(datasets_config) => datasets_config, - Err(sled_storage::error::Error::LedgerNotFound) => { - DatasetsConfig::default() - } - Err(err) => return Err(InventoryError::GetDatasetsConfig(err)), - }; - - for (identity, variant, slot, firmware) in all_disks.iter_all() { - disks.push(InventoryDisk { - identity: identity.clone(), - variant, - slot, - active_firmware_slot: firmware.active_slot(), - next_active_firmware_slot: firmware.next_active_slot(), - number_of_firmware_slots: firmware.number_of_slots(), - slot1_is_read_only: firmware.slot1_read_only(), - slot_firmware_versions: firmware.slots().to_vec(), - }); - } - for zpool in all_disks.all_u2_zpools() { - let info = - match illumos_utils::zpool::Zpool::get_info(&zpool.to_string()) - .await - { - Ok(info) => info, - Err(err) => { - warn!( - self.log, - "Failed to access zpool info"; - "zpool" => %zpool, - "err" => %err - ); - continue; - } - }; - - zpools.push(InventoryZpool { - id: zpool.id(), - total_size: ByteCount::try_from(info.size())?, - }); - - let inv_props = match self.storage().datasets_list(zpool).await { - Ok(props) => { - props.into_iter().map(|prop| InventoryDataset::from(prop)) - } - Err(err) => { - warn!( - self.log, - "Failed to access dataset info within zpool"; - "zpool" => %zpool, - "err" => %err - ); - continue; - } - }; - datasets.extend(inv_props); - } - - // Reassemble our combined sled config from its separate pieces. (This - // will go away once we start ledgering the config as a single unit.) - let sled_config = OmicronSledConfig { - generation: omicron_zones.generation, - disks: disks_config.disks.into_iter().collect(), - datasets: datasets_config.datasets.into_values().collect(), - zones: omicron_zones.zones.into_iter().collect(), - remove_mupdate_override: None, - }; + let ReconcilerInventory { + disks, + zpools, + datasets, + ledgered_sled_config, + reconciler_status, + last_reconciliation, + } = self.inner.config_reconciler.inventory(); Ok(Inventory { sled_id, @@ -1459,11 +1145,9 @@ impl SledAgent { disks, zpools, datasets, - // These fields will come from the reconciler once it's integrated. - // For now, we can report our ledgered config but nothing else. 
- ledgered_sled_config: Some(sled_config), - reconciler_status: ConfigReconcilerInventoryStatus::NotYetRun, - last_reconciliation: None, + ledgered_sled_config, + reconciler_status, + last_reconciliation, }) } @@ -1625,3 +1309,77 @@ pub async fn sled_add( info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %baseboard); Ok(()) } + +struct ReconcilerFacilities { + service_manager: ServiceManager, + metrics_queue: MetricsRequestQueue, + zone_bundler: ZoneBundler, +} + +impl SledAgentFacilities for ReconcilerFacilities { + type StartZoneError = services::Error; + type MetricsUntrackZoneLinksError = Vec; + type ZoneBundleCreateError = BundleError; + + async fn on_time_sync(&self) { + self.service_manager.on_time_sync().await + } + + async fn start_omicron_zone( + &self, + zone_config: &OmicronZoneConfig, + mount_config: &MountConfig, + is_time_synchronized: bool, + all_u2_pools: &[ZpoolName], + ) -> Result { + self.service_manager + .start_omicron_zone( + mount_config, + zone_config, + is_time_synchronized, + all_u2_pools, + ) + .await + } + + fn metrics_untrack_zone_links( + &self, + zone: &RunningZone, + ) -> Result<(), Self::MetricsUntrackZoneLinksError> { + self.metrics_queue.untrack_zone_links(zone) + } + + fn ddm_add_internal_dns_prefix(&self, prefix: Ipv6Subnet) { + self.service_manager.ddm_reconciler().add_internal_dns_subnet(prefix); + } + + fn ddm_remove_internal_dns_prefix(&self, prefix: Ipv6Subnet) { + self.service_manager + .ddm_reconciler() + .remove_internal_dns_subnet(prefix); + } + + async fn zone_bundle_create( + &self, + zone: &RunningZone, + cause: ZoneBundleCause, + ) -> Result<(), Self::ZoneBundleCreateError> { + self.zone_bundler.create(zone, cause).await?; + Ok(()) + } +} + +// Workaround wrapper for orphan rules. +struct SledAgentArtifactStoreWrapper(Arc>); + +impl SledAgentArtifactStore for SledAgentArtifactStoreWrapper { + type ArtifactExistsValidationError = artifact_store::Error; + + async fn validate_artifact_exists_in_storage( + &self, + artifact: ArtifactHash, + ) -> Result<(), Self::ArtifactExistsValidationError> { + self.0.get(artifact).await?; + Ok(()) + } +} diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs deleted file mode 100644 index 626d81d54ff..00000000000 --- a/sled-agent/src/storage_monitor.rs +++ /dev/null @@ -1,116 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! A task that listens for storage events from [`sled_storage::manager::StorageManager`] -//! and dispatches them to other parts of the bootstrap agent and sled agent -//! code. - -use omicron_common::api::external::Generation; -use sled_agent_config_reconciler::dump_setup::DumpSetup; -use sled_storage::config::MountConfig; -use sled_storage::manager::StorageHandle; -use sled_storage::resources::AllDisks; -use slog::Logger; -use std::sync::Arc; -use tokio::sync::watch; - -#[derive(thiserror::Error, Debug)] -pub enum Error { - #[error("Storage Monitor no longer running")] - NotRunning, -} - -pub struct StorageMonitor { - log: Logger, - storage_manager: StorageHandle, - - // Invokes dumpadm(8) and savecore(8) when new disks are encountered - dump_setup: DumpSetup, - - tx: watch::Sender, -} - -/// Emits status about storage monitoring. 
-#[derive(Debug, Clone)] -pub struct StorageMonitorStatus { - /// The latest generation of physical disks to be processed - /// by the storage monitor. - pub latest_gen: Option, -} - -impl StorageMonitorStatus { - fn new() -> Self { - Self { latest_gen: None } - } -} - -#[derive(Clone)] -pub struct StorageMonitorHandle { - rx: watch::Receiver, -} - -impl StorageMonitorHandle { - pub async fn await_generation( - &self, - wanted: Generation, - ) -> Result<(), Error> { - self.rx - .clone() - .wait_for(|status| { - let Some(observed) = status.latest_gen else { - return false; - }; - return observed >= wanted; - }) - .await - .map_err(|_| Error::NotRunning)?; - Ok(()) - } -} - -impl StorageMonitor { - pub fn new( - log: &Logger, - mount_config: MountConfig, - storage_manager: StorageHandle, - ) -> (StorageMonitor, StorageMonitorHandle) { - let dump_setup = DumpSetup::new(&log, Arc::new(mount_config)); - let log = log.new(o!("component" => "StorageMonitor")); - let (tx, rx) = watch::channel(StorageMonitorStatus::new()); - ( - StorageMonitor { log, storage_manager, dump_setup, tx }, - StorageMonitorHandle { rx }, - ) - } - - /// Run the main receive loop of the `StorageMonitor` - /// - /// This should be spawned into a tokio task - pub async fn run(mut self) { - loop { - tokio::select! { - disks = self.storage_manager.wait_for_changes() => { - info!( - self.log, - "Received storage manager update"; - "disks" => ?disks - ); - self.handle_resource_update(disks).await; - } - } - } - } - - async fn handle_resource_update(&mut self, updated_disks: AllDisks) { - let generation = updated_disks.generation(); - self.dump_setup - .update_dumpdev_setup( - updated_disks.iter_managed().map(|(_id, disk)| disk), - ) - .await; - self.tx.send_replace(StorageMonitorStatus { - latest_gen: Some(*generation), - }); - } -} diff --git a/sled-agent/src/support_bundle/logs.rs b/sled-agent/src/support_bundle/logs.rs index 3467c8663d0..c68a3bdedde 100644 --- a/sled-agent/src/support_bundle/logs.rs +++ b/sled-agent/src/support_bundle/logs.rs @@ -7,7 +7,7 @@ use camino_tempfile::tempfile_in; use dropshot::HttpError; use range_requests::make_get_response; -use sled_storage::manager::StorageHandle; +use sled_agent_config_reconciler::InternalDisksReceiver; use slog::Logger; use slog_error_chain::InlineErrorChain; use tokio::io::AsyncSeekExt; @@ -43,12 +43,15 @@ impl From for HttpError { pub struct SupportBundleLogs<'a> { log: &'a Logger, - sled_storage: &'a StorageHandle, + internal_disks_rx: &'a InternalDisksReceiver, } impl<'a> SupportBundleLogs<'a> { - pub fn new(log: &'a Logger, sled_storage: &'a StorageHandle) -> Self { - Self { log, sled_storage } + pub fn new( + log: &'a Logger, + internal_disks_rx: &'a InternalDisksReceiver, + ) -> Self { + Self { log, internal_disks_rx } } /// Get a list of zones on a sled containing logs that we want to include in @@ -77,12 +80,9 @@ impl<'a> SupportBundleLogs<'a> { { // We are using an M.2 device for temporary storage to assemble a zip // file made up of all of the discovered zone's logs. 
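The `StorageMonitorHandle` deleted above was essentially a small tokio `watch`-channel gate: publish a monotonically increasing generation, and let callers block until it reaches a target. The config-reconciler's receiver handles now cover the same need, but the underlying pattern is worth one isolated sketch. Stripped down, assuming only `tokio::sync::watch` (illustrative names):

    use tokio::sync::watch;

    /// Block until the published generation reaches `wanted`; Err means
    /// the publishing task has gone away.
    async fn await_generation(
        rx: &watch::Receiver<Option<u64>>,
        wanted: u64,
    ) -> Result<(), ()> {
        rx.clone()
            .wait_for(|current| matches!(current, Some(g) if *g >= wanted))
            .await
            .map(|_| ())
            .map_err(|_| ())
    }

    #[tokio::main]
    async fn main() {
        let (tx, rx) = watch::channel(None);
        // Publisher side, e.g. after handling a disk update:
        tx.send_replace(Some(3u64));
        await_generation(&rx, 3).await.expect("publisher still alive");
    }
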
- let m2_debug_datasets = self - .sled_storage - .get_latest_disks() - .await - .all_sled_diagnostics_directories(); - let tempdir = m2_debug_datasets.first().ok_or(Error::MissingStorage)?; + let current_internal_disks = self.internal_disks_rx.current(); + let mut m2_debug_datasets = current_internal_disks.all_debug_datasets(); + let tempdir = m2_debug_datasets.next().ok_or(Error::MissingStorage)?; let mut tempfile = tempfile_in(tempdir)?; let log = self.log.clone(); diff --git a/sled-agent/src/support_bundle/storage.rs b/sled-agent/src/support_bundle/storage.rs index 4e3d781e146..ad126e9e1b5 100644 --- a/sled-agent/src/support_bundle/storage.rs +++ b/sled-agent/src/support_bundle/storage.rs @@ -14,6 +14,7 @@ use futures::Stream; use futures::StreamExt; use illumos_utils::zfs::DatasetProperties; use omicron_common::api::external::Error as ExternalError; +use omicron_common::api::external::Generation; use omicron_common::disk::CompressionAlgorithm; use omicron_common::disk::DatasetConfig; use omicron_common::disk::DatasetName; @@ -28,15 +29,16 @@ use range_requests::PotentialRange; use range_requests::SingleRange; use sha2::{Digest, Sha256}; use sled_agent_api::*; +use sled_agent_config_reconciler::ConfigReconcilerHandle; +use sled_agent_config_reconciler::DatasetTaskError; use sled_agent_types::support_bundle::BUNDLE_FILE_NAME; use sled_agent_types::support_bundle::BUNDLE_TMP_FILE_NAME_SUFFIX; use sled_storage::manager::NestedDatasetConfig; use sled_storage::manager::NestedDatasetListOptions; use sled_storage::manager::NestedDatasetLocation; -use sled_storage::manager::StorageHandle; use slog::Logger; use slog_error_chain::InlineErrorChain; -use std::borrow::Cow; +use std::collections::BTreeMap; use std::io::Write; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; @@ -86,6 +88,9 @@ pub enum Error { #[error(transparent)] Zip(#[from] ZipError), + + #[error(transparent)] + DatasetTask(#[from] DatasetTaskError), } fn err_str(err: &dyn std::error::Error) -> String { @@ -144,7 +149,6 @@ pub trait LocalStorage: Sync { async fn dyn_ensure_mounted_and_get_mountpoint( &self, dataset: NestedDatasetLocation, - mount_root: &Utf8Path, ) -> Result; /// Returns all nested datasets within an existing dataset @@ -165,19 +169,29 @@ pub trait LocalStorage: Sync { &self, name: NestedDatasetLocation, ) -> Result<(), Error>; - - /// Returns the root filesystem path where datasets are mounted. - /// - /// This is typically "/" in prod, but can be a temporary directory - /// for tests to isolate storage that typically appears globally. - fn zpool_mountpoint_root(&self) -> Cow; } /// This implementation is effectively a pass-through to the real methods #[async_trait] -impl LocalStorage for StorageHandle { +impl LocalStorage for ConfigReconcilerHandle { async fn dyn_datasets_config_list(&self) -> Result { - self.datasets_config_list().await.map_err(|err| err.into()) + // TODO-cleanup This is super gross; add a better API (maybe fetch a + // single dataset by ID, since that's what our caller wants?) 
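The TODO above asks for a narrower API: fetch a single dataset by ID instead of reconstructing a whole `DatasetsConfig`. A rough sketch of what that suggested helper could look like, working purely on the ledgered config this impl already reads; the function is hypothetical and not part of the real `ConfigReconcilerHandle` surface:

    use nexus_sled_agent_shared::inventory::OmicronSledConfig;
    use omicron_common::disk::DatasetConfig;
    use omicron_uuid_kinds::DatasetUuid;

    // Hypothetical lookup shaped the way the TODO suggests; it just
    // narrows what dyn_datasets_config_list() already produces.
    fn dataset_config_by_id(
        sled_config: &OmicronSledConfig,
        dataset_id: DatasetUuid,
    ) -> Option<&DatasetConfig> {
        sled_config.datasets.iter().find(|d| d.id == dataset_id)
    }
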
+ Ok(self + .inventory() + .ledgered_sled_config + .map(|sled_config| DatasetsConfig { + generation: sled_config.generation, + datasets: sled_config + .datasets + .into_iter() + .map(|d| (d.id, d)) + .collect(), + }) + .unwrap_or_else(|| DatasetsConfig { + generation: Generation::new(), + datasets: BTreeMap::new(), + })) } async fn dyn_dataset_get( @@ -205,12 +219,10 @@ impl LocalStorage for StorageHandle { async fn dyn_ensure_mounted_and_get_mountpoint( &self, dataset: NestedDatasetLocation, - mount_root: &Utf8Path, ) -> Result { - dataset - .ensure_mounted_and_get_mountpoint(mount_root) + self.nested_dataset_ensure_mounted(dataset) .await - .map_err(Error::from) + .map_err(|err| err.into()) } async fn dyn_nested_dataset_list( @@ -234,10 +246,6 @@ impl LocalStorage for StorageHandle { ) -> Result<(), Error> { self.nested_dataset_destroy(name).await.map_err(|err| err.into()) } - - fn zpool_mountpoint_root(&self) -> Cow { - Cow::Borrowed(self.mount_config().root.as_path()) - } } /// This implementation allows storage bundles to be stored on simulated storage @@ -257,10 +265,10 @@ impl LocalStorage for crate::sim::Storage { async fn dyn_ensure_mounted_and_get_mountpoint( &self, dataset: NestedDatasetLocation, - mount_root: &Utf8Path, ) -> Result { + let slf = self.lock(); // Simulated storage treats all datasets as mounted. - Ok(dataset.mountpoint(mount_root)) + Ok(dataset.mountpoint(slf.root())) } async fn dyn_nested_dataset_list( @@ -284,10 +292,6 @@ impl LocalStorage for crate::sim::Storage { ) -> Result<(), Error> { self.lock().nested_dataset_destroy(name).map_err(|err| err.into()) } - - fn zpool_mountpoint_root(&self) -> Cow { - Cow::Owned(self.lock().root().to_path_buf()) - } } /// Describes the type of access to the support bundle @@ -512,10 +516,7 @@ impl<'a> SupportBundleManager<'a> { // The dataset for a support bundle exists. let support_bundle_path = self .storage - .dyn_ensure_mounted_and_get_mountpoint( - dataset.name, - &self.storage.zpool_mountpoint_root(), - ) + .dyn_ensure_mounted_and_get_mountpoint(dataset.name) .await? 
.join(BUNDLE_FILE_NAME); @@ -625,13 +626,8 @@ impl<'a> SupportBundleManager<'a> { info!(log, "Dataset does exist for bundle"); // The mounted root of the support bundle dataset - let support_bundle_dir = self - .storage - .dyn_ensure_mounted_and_get_mountpoint( - dataset, - &self.storage.zpool_mountpoint_root(), - ) - .await?; + let support_bundle_dir = + self.storage.dyn_ensure_mounted_and_get_mountpoint(dataset).await?; let support_bundle_path = support_bundle_dir.join(BUNDLE_FILE_NAME); let support_bundle_path_tmp = support_bundle_dir.join(format!( "{}-{BUNDLE_TMP_FILE_NAME_SUFFIX}", @@ -737,13 +733,8 @@ impl<'a> SupportBundleManager<'a> { NestedDatasetLocation { path: support_bundle_id.to_string(), root }; // The mounted root of the support bundle dataset - let support_bundle_dir = self - .storage - .dyn_ensure_mounted_and_get_mountpoint( - dataset, - &self.storage.zpool_mountpoint_root(), - ) - .await?; + let support_bundle_dir = + self.storage.dyn_ensure_mounted_and_get_mountpoint(dataset).await?; let path = support_bundle_dir.join(BUNDLE_FILE_NAME); let f = tokio::fs::File::open(&path).await?; @@ -944,11 +935,80 @@ mod tests { use omicron_common::disk::DatasetsConfig; use omicron_common::zpool_name::ZpoolName; use omicron_test_utils::dev::test_setup_log; + use sled_storage::manager::StorageHandle; use sled_storage::manager_test_harness::StorageManagerTestHarness; use std::collections::BTreeMap; use zip::ZipWriter; use zip::write::SimpleFileOptions; + // TODO-cleanup Should we rework these tests to not use StorageHandle (real + // code now goes through `ConfigReconcilerHandle`)? + #[async_trait] + impl LocalStorage for StorageHandle { + async fn dyn_datasets_config_list( + &self, + ) -> Result { + self.datasets_config_list().await.map_err(|err| err.into()) + } + + fn dyn_dataset_get( + &self, + dataset_name: &String, + ) -> Result { + let Some(dataset) = + illumos_utils::zfs::Zfs::get_dataset_properties( + &[dataset_name.clone()], + illumos_utils::zfs::WhichDatasets::SelfOnly, + ) + .map_err(|err| Error::DatasetLookup(err))? + .pop() + else { + // This should not be possible, unless the "zfs get" command is + // behaving unpredictably. We're only asking for a single dataset, + // so on success, we should see the result of that dataset. + return Err(Error::DatasetLookup(anyhow::anyhow!( + "Zfs::get_dataset_properties returned an empty vec?" 
+ ))); + }; + + Ok(dataset) + } + + async fn dyn_ensure_mounted_and_get_mountpoint( + &self, + dataset: NestedDatasetLocation, + ) -> Result { + dataset + .ensure_mounted_and_get_mountpoint(&self.mount_config().root) + .await + .map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_list( + &self, + name: NestedDatasetLocation, + options: NestedDatasetListOptions, + ) -> Result, Error> { + self.nested_dataset_list(name, options) + .await + .map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_ensure( + &self, + config: NestedDatasetConfig, + ) -> Result<(), Error> { + self.nested_dataset_ensure(config).await.map_err(|err| err.into()) + } + + async fn dyn_nested_dataset_destroy( + &self, + name: NestedDatasetLocation, + ) -> Result<(), Error> { + self.nested_dataset_destroy(name).await.map_err(|err| err.into()) + } + } + struct SingleU2StorageHarness { storage_test_harness: StorageManagerTestHarness, zpool_id: ZpoolUuid, diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 24a13b8b2a2..5df159a6c8b 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -27,9 +27,9 @@ use illumos_utils::zfs::Snapshot; use illumos_utils::zfs::ZFS; use illumos_utils::zfs::Zfs; use illumos_utils::zone::AdmError; +use sled_agent_config_reconciler::AvailableDatasetsReceiver; +use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::zone_bundle::*; -use sled_storage::dataset::U2_DEBUG_DATASET; -use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -139,7 +139,8 @@ pub struct ZoneBundler { // State shared between tasks, e.g., used when creating a bundle in different // tasks or between a creation and cleanup. struct Inner { - storage_handle: StorageHandle, + internal_disks_rx: InternalDisksReceiver, + available_datasets_rx: AvailableDatasetsReceiver, cleanup_context: CleanupContext, last_cleanup_at: Instant, } @@ -167,11 +168,11 @@ impl Inner { // that can exist but do not, i.e., those whose parent datasets already // exist; and returns those. async fn bundle_directories(&self) -> Vec { - let resources = self.storage_handle.get_latest_disks().await; // NOTE: These bundle directories are always stored on M.2s, so we don't // need to worry about synchronizing with U.2 disk expungement at the // callsite. - let expected = resources.all_zone_bundle_directories(); + let internal_disks = self.internal_disks_rx.current(); + let expected = internal_disks.all_zone_bundle_directories(); let mut out = Vec::with_capacity(expected.len()); for each in expected.into_iter() { if tokio::fs::create_dir_all(&each).await.is_ok() { @@ -236,7 +237,8 @@ impl ZoneBundler { /// to clean them up to free up space. pub async fn new( log: Logger, - storage_handle: StorageHandle, + internal_disks_rx: InternalDisksReceiver, + available_datasets_rx: AvailableDatasetsReceiver, cleanup_context: CleanupContext, ) -> Self { // This is compiled out in tests because there's no way to set our @@ -255,7 +257,8 @@ impl ZoneBundler { .expect("Failed to initialize existing ZFS resources"); let notify_cleanup = Arc::new(Notify::new()); let inner = Arc::new(Mutex::new(Inner { - storage_handle, + internal_disks_rx, + available_datasets_rx, cleanup_context, last_cleanup_at: Instant::now(), })); @@ -356,9 +359,9 @@ impl ZoneBundler { // prior bundles have completed. 
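The two receivers threaded through `Inner` above split the bundler's storage lookups by disk class: internal (M.2) disks answer where bundles themselves live, while the available-datasets view answers which U.2 debug datasets should be swept for extra logs. Condensed from the hunks above and below, with types elided (not a literal excerpt):

    // M.2 side: bundle directories always live on internal disks, so U.2
    // expungement can't pull one out from under an in-progress bundle.
    let bundle_dirs = inner.bundle_directories().await;

    // U.2 side: extra log sources are whatever debug datasets happen to
    // be mounted right now.
    let extra_log_dirs: Vec<_> = inner
        .available_datasets_rx
        .all_mounted_debug_datasets()
        .into_iter()
        .map(|pool_path| pool_path.path)
        .collect();
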
let inner = self.inner.lock().await; let storage_dirs = inner.bundle_directories().await; - let resources = inner.storage_handle.get_latest_disks().await; - let extra_log_dirs = resources - .all_u2_mountpoints(U2_DEBUG_DATASET) + let extra_log_dirs = inner + .available_datasets_rx + .all_mounted_debug_datasets() .into_iter() .map(|pool_path| pool_path.path) .collect(); @@ -1766,7 +1769,10 @@ mod illumos_tests { use chrono::TimeZone; use chrono::Timelike; use chrono::Utc; + use omicron_common::disk::DiskIdentity; use rand::RngCore; + use sled_agent_config_reconciler::AvailableDatasetsReceiver; + use sled_agent_config_reconciler::InternalDisksReceiver; use sled_storage::manager_test_harness::StorageManagerTestHarness; use slog::Drain; use slog::Logger; @@ -1891,9 +1897,38 @@ mod illumos_tests { let log = test_logger(); let context = CleanupContext::default(); let resource_wrapper = ResourceWrapper::new(&log).await; + let handle = resource_wrapper.storage_test_harness.handle(); + let all_disks = handle.get_latest_disks().await; + + // Convert from StorageManagerTestHarness to config-reconciler channels. + // Do we want to expand config-reconciler test support and not use + // StorageManagerTestHarness? + let internal_disks_rx = InternalDisksReceiver::fake_static( + Arc::new(all_disks.mount_config().clone()), + all_disks.all_m2_zpools().into_iter().enumerate().map( + |(i, zpool)| { + ( + DiskIdentity { + vendor: format!("test-vendor-{i}"), + model: format!("test-model-{i}"), + serial: format!("test-serial-{i}"), + }, + zpool, + ) + }, + ), + ); + let available_datasets_rx = AvailableDatasetsReceiver::fake_static( + all_disks + .all_m2_zpools() + .into_iter() + .zip(all_disks.all_m2_mountpoints(".")), + ); + let bundler = ZoneBundler::new( log, - resource_wrapper.storage_test_harness.handle().clone(), + internal_disks_rx, + available_datasets_rx, context, ) .await; From ce36cb7b7440aeb532713a72dc279b29253aa5ba Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 30 Apr 2025 15:16:34 -0400 Subject: [PATCH 02/10] use new ConfigReconcilerSpawnToken interface --- sled-agent/src/bootstrap/pre_server.rs | 4 ++++ sled-agent/src/bootstrap/server.rs | 26 +++++++++++++++++++++----- sled-agent/src/long_running_tasks.rs | 18 +++++++++++------- sled-agent/src/server.rs | 3 +++ sled-agent/src/sled_agent.rs | 9 +++++---- 5 files changed, 44 insertions(+), 16 deletions(-) diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 546a523e3e0..094b44e6c31 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -35,6 +35,7 @@ use illumos_utils::zone::Api; use illumos_utils::zone::Zones; use omicron_common::FileKv; use omicron_common::address::Ipv6Subnet; +use sled_agent_config_reconciler::ConfigReconcilerSpawnToken; use sled_hardware::DendriteAsic; use sled_hardware::SledMode; use sled_hardware::underlay; @@ -53,6 +54,7 @@ pub(super) struct BootstrapAgentStartup { pub(super) service_manager: ServiceManager, pub(super) long_running_task_handles: LongRunningTaskHandles, pub(super) sled_agent_started_tx: oneshot::Sender, + pub(super) config_reconciler_spawn_token: ConfigReconcilerSpawnToken, } impl BootstrapAgentStartup { @@ -120,6 +122,7 @@ impl BootstrapAgentStartup { // the process and are used by both the bootstrap agent and sled agent let ( long_running_task_handles, + config_reconciler_spawn_token, sled_agent_started_tx, service_manager_ready_tx, ) = spawn_all_longrunning_tasks( @@ -161,6 +164,7 @@ impl 
BootstrapAgentStartup { service_manager, long_running_task_handles, sled_agent_started_tx, + config_reconciler_spawn_token, }) } } diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 9788bb943ad..d18b42a3466 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -39,6 +39,7 @@ use omicron_ddm_admin_client::DdmError; use omicron_ddm_admin_client::types::EnableStatsRequest; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::RackInitUuid; +use sled_agent_config_reconciler::ConfigReconcilerSpawnToken; use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::rack_init::RackInitializeRequest; use sled_agent_types::sled::StartSledAgentRequest; @@ -180,6 +181,7 @@ impl Server { service_manager, long_running_task_handles, sled_agent_started_tx, + config_reconciler_spawn_token, } = BootstrapAgentStartup::run(config).await?; // Do we have a StartSledAgentRequest stored in the ledger? @@ -246,6 +248,7 @@ impl Server { &config, start_sled_agent_request, long_running_task_handles.clone(), + config_reconciler_spawn_token, service_manager.clone(), &base_log, &startup_log, @@ -261,7 +264,10 @@ impl Server { SledAgentState::ServerStarted(sled_agent_server) } else { - SledAgentState::Bootstrapping(Some(sled_agent_started_tx)) + SledAgentState::Bootstrapping(Some(BootstrappingDependencies { + sled_agent_started_tx, + config_reconciler_spawn_token, + })) }; // Spawn our inner task that handles any future hardware updates and any @@ -303,11 +309,16 @@ impl Server { // bootstrap server). enum SledAgentState { // We're still in the bootstrapping phase, waiting for a sled-agent request. - Bootstrapping(Option>), + Bootstrapping(Option), // ... or the sled agent server is running. ServerStarted(SledAgentServer), } +struct BootstrappingDependencies { + sled_agent_started_tx: oneshot::Sender, + config_reconciler_spawn_token: ConfigReconcilerSpawnToken, +} + #[derive(thiserror::Error, Debug)] pub enum SledAgentServerStartError { #[error("Failed to start sled-agent server: {0}")] @@ -347,6 +358,7 @@ async fn start_sled_agent( config: &SledConfig, request: StartSledAgentRequest, long_running_task_handles: LongRunningTaskHandles, + config_reconciler_spawn_token: ConfigReconcilerSpawnToken, service_manager: ServiceManager, base_log: &Logger, log: &Logger, @@ -398,6 +410,7 @@ async fn start_sled_agent( base_log.clone(), request.clone(), long_running_task_handles.clone(), + config_reconciler_spawn_token, service_manager, ) .await @@ -531,7 +544,7 @@ impl Inner { log: &Logger, ) { match &mut self.state { - SledAgentState::Bootstrapping(sled_agent_started_tx) => { + SledAgentState::Bootstrapping(deps) => { let request_id = request.body.id.into_untyped_uuid(); // Extract from options to satisfy the borrow checker. @@ -540,13 +553,16 @@ impl Inner { // we explicitly unwrap here, and panic on error below. 
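Carrying `ConfigReconcilerSpawnToken` inside `BootstrappingDependencies` keeps the existing take-once shape: both the oneshot sender and the spawn token are single-use values that must be moved out of the bootstrapping state exactly once, when the sled-agent request arrives. A minimal sketch of that shape with stand-in types (none of these are the real ones):

    struct StartedTx;  // stands in for the oneshot sender
    struct SpawnToken; // stands in for ConfigReconcilerSpawnToken

    struct Deps {
        started_tx: StartedTx,
        spawn_token: SpawnToken,
    }

    enum State {
        // Option<_> so the single-use values can be moved out through a
        // `&mut State` without replacing the whole variant first.
        Bootstrapping(Option<Deps>),
        Running,
    }

    fn start(state: &mut State) {
        if let State::Bootstrapping(deps) = state {
            // A second start request would panic here, mirroring the
            // explicit unwrap in the hunk above.
            let Deps { started_tx, spawn_token } = deps.take().unwrap();
            let _ = (started_tx, spawn_token); // hand these to the server
            *state = State::Running;
        }
    }
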
// // See https://github.com/oxidecomputer/omicron/issues/4494 - let sled_agent_started_tx = - sled_agent_started_tx.take().unwrap(); + let BootstrappingDependencies { + sled_agent_started_tx, + config_reconciler_spawn_token, + } = deps.take().unwrap(); let response = match start_sled_agent( &self.config, request, self.long_running_task_handles.clone(), + config_reconciler_spawn_token, self.service_manager.clone(), &self.base_log, &log, diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 37b646b8a44..ff9a6a53251 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -25,7 +25,8 @@ use bootstore::schemes::v0 as bootstore; use illumos_utils::zpool::ZpoolName; use key_manager::{KeyManager, StorageKeyRequester}; use sled_agent_config_reconciler::{ - ConfigReconcilerHandle, RawDisksSender, TimeSyncConfig, + ConfigReconcilerHandle, ConfigReconcilerSpawnToken, RawDisksSender, + TimeSyncConfig, }; use sled_agent_types::zone_bundle::CleanupContext; use sled_agent_zone_images::{ZoneImageSourceResolver, ZoneImageZpools}; @@ -70,6 +71,7 @@ pub async fn spawn_all_longrunning_tasks( config: &Config, ) -> ( LongRunningTaskHandles, + ConfigReconcilerSpawnToken, oneshot::Sender, oneshot::Sender, ) { @@ -80,12 +82,13 @@ pub async fn spawn_all_longrunning_tasks( } else { TimeSyncConfig::Normal }; - let mut config_reconciler = ConfigReconcilerHandle::new( - MountConfig::default(), - storage_key_requester, - time_sync_config, - log, - ); + let (mut config_reconciler, config_reconciler_spawn_token) = + ConfigReconcilerHandle::new( + MountConfig::default(), + storage_key_requester, + time_sync_config, + log, + ); let nongimlet_observed_disks = config.nongimlet_observed_disks.clone().unwrap_or(vec![]); @@ -127,6 +130,7 @@ pub async fn spawn_all_longrunning_tasks( zone_bundler, zone_image_resolver, }, + config_reconciler_spawn_token, sled_agent_started_tx, service_manager_ready_tx, ) diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index f60f367d280..15a5c2f784d 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -12,6 +12,7 @@ use crate::nexus::make_nexus_client; use crate::services::ServiceManager; use internal_dns_resolver::Resolver; use omicron_uuid_kinds::SledUuid; +use sled_agent_config_reconciler::ConfigReconcilerSpawnToken; use sled_agent_types::sled::StartSledAgentRequest; use slog::Logger; use std::net::SocketAddr; @@ -39,6 +40,7 @@ impl Server { log: Logger, request: StartSledAgentRequest, long_running_tasks_handles: LongRunningTaskHandles, + config_reconciler_spawn_token: ConfigReconcilerSpawnToken, services: ServiceManager, ) -> Result { info!(log, "setting up sled agent server"); @@ -61,6 +63,7 @@ impl Server { request, services, long_running_tasks_handles, + config_reconciler_spawn_token, ) .await .map_err(|e| e.to_string())?; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 737c4fe0168..8c9972dec60 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -53,9 +53,9 @@ use omicron_common::disk::M2Slot; use omicron_ddm_admin_client::Client as DdmAdminClient; use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid}; use sled_agent_config_reconciler::{ - ConfigReconcilerHandle, InternalDisksReceiver, LedgerNewConfigError, - LedgerTaskError, ReconcilerInventory, SledAgentArtifactStore, - SledAgentFacilities, TimeSyncStatus, + ConfigReconcilerHandle, ConfigReconcilerSpawnToken, InternalDisksReceiver, + LedgerNewConfigError, 
LedgerTaskError, ReconcilerInventory, + SledAgentArtifactStore, SledAgentFacilities, TimeSyncStatus, }; use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::EarlyNetworkConfig; @@ -403,6 +403,7 @@ impl SledAgent { request: StartSledAgentRequest, services: ServiceManager, long_running_task_handles: LongRunningTaskHandles, + config_reconciler_spawn_token: ConfigReconcilerSpawnToken, ) -> Result { // Pass the "parent_log" to all subcomponents that want to set their own // "component" value. @@ -615,7 +616,7 @@ impl SledAgent { zone_bundler: long_running_task_handles.zone_bundler.clone(), }, SledAgentArtifactStoreWrapper(Arc::clone(&artifact_store)), - &parent_log, + config_reconciler_spawn_token, ); services From 385807fc3da8e27402c8038ff7fca26326ee2e8f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 20 May 2025 12:57:35 -0400 Subject: [PATCH 03/10] catch up to changes on main since last rebase --- Cargo.lock | 1 + sled-agent/config-reconciler/src/handle.rs | 37 ++-- .../config-reconciler/src/internal_disks.rs | 162 ++++++++++++------ sled-agent/config-reconciler/src/lib.rs | 1 + sled-agent/src/bootstrap/pre_server.rs | 1 + sled-agent/src/long_running_tasks.rs | 26 +-- sled-agent/src/services.rs | 47 +---- sled-agent/src/sled_agent.rs | 71 ++++---- sled-agent/src/support_bundle/storage.rs | 97 +++++++---- sled-agent/zone-images/Cargo.toml | 2 + .../zone-images/src/mupdate_override.rs | 134 +++++++-------- sled-agent/zone-images/src/source_resolver.rs | 87 ++-------- 12 files changed, 337 insertions(+), 329 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0723344d8c3..a7826cdd5f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11704,6 +11704,7 @@ dependencies = [ "omicron-workspace-hack", "pretty_assertions", "serde_json", + "sled-agent-config-reconciler", "sled-storage", "slog", "slog-error-chain", diff --git a/sled-agent/config-reconciler/src/handle.rs b/sled-agent/config-reconciler/src/handle.rs index 7a3ae1ac0e8..e33c72f3f97 100644 --- a/sled-agent/config-reconciler/src/handle.rs +++ b/sled-agent/config-reconciler/src/handle.rs @@ -12,7 +12,6 @@ use nexus_sled_agent_shared::inventory::InventoryDisk; use nexus_sled_agent_shared::inventory::InventoryZpool; use nexus_sled_agent_shared::inventory::OmicronSledConfig; use omicron_common::disk::DatasetName; -use omicron_common::disk::DiskIdentity; use sled_agent_api::ArtifactConfig; use sled_storage::config::MountConfig; use sled_storage::disk::Disk; @@ -37,6 +36,7 @@ use sled_storage::dataset::U2_DEBUG_DATASET; use sled_storage::dataset::ZONE_DATASET; use crate::DatasetTaskError; +use crate::InternalDisksWithBootDisk; use crate::LedgerArtifactConfigError; use crate::LedgerNewConfigError; use crate::LedgerTaskError; @@ -277,7 +277,7 @@ impl ConfigReconcilerHandle { } /// Wait for the internal disks task to start managing the boot disk. - pub async fn wait_for_boot_disk(&mut self) -> Arc { + pub async fn wait_for_boot_disk(&mut self) -> InternalDisksWithBootDisk { self.internal_disks_rx.wait_for_boot_disk().await } @@ -343,16 +343,16 @@ impl ConfigReconcilerHandle { .await } - /// Collect inventory fields relevant to config reconciliation. - pub async fn inventory( + /// Return the currently-ledgered [`OmicronSledConfig`]. + /// + /// # Errors + /// + /// Fails if `spawn_reconciliation_task()` has not yet been called or if we + /// have not yet checked the internal disks for a ledgered config. 
+ pub fn ledgered_sled_config( &self, - log: &Logger, - ) -> Result { - let ledgered_sled_config = match self - .ledger_task - .get() - .map(LedgerTaskHandle::current_config) - { + ) -> Result, InventoryError> { + match self.ledger_task.get().map(LedgerTaskHandle::current_config) { // If we haven't yet spawned the ledger task, or we have but // it's still waiting on disks, we don't know whether we have a // ledgered sled config. It's not reasonable to report `None` in @@ -363,12 +363,19 @@ impl ConfigReconcilerHandle { // for the boot disk and spawn the reconciler task before starting // the dropshot server that allows Nexus to collect inventory. None | Some(CurrentSledConfig::WaitingForInternalDisks) => { - return Err(InventoryError::LedgerContentsNotAvailable); + Err(InventoryError::LedgerContentsNotAvailable) } - Some(CurrentSledConfig::WaitingForInitialConfig) => None, - Some(CurrentSledConfig::Ledgered(config)) => Some(config), - }; + Some(CurrentSledConfig::WaitingForInitialConfig) => Ok(None), + Some(CurrentSledConfig::Ledgered(config)) => Ok(Some(config)), + } + } + /// Collect inventory fields relevant to config reconciliation. + pub async fn inventory( + &self, + log: &Logger, + ) -> Result { + let ledgered_sled_config = self.ledgered_sled_config()?; let zpools = self.currently_managed_zpools_rx.to_inventory(log).await; let datasets = self diff --git a/sled-agent/config-reconciler/src/internal_disks.rs b/sled-agent/config-reconciler/src/internal_disks.rs index e9c05adbdb9..96e2ea82c01 100644 --- a/sled-agent/config-reconciler/src/internal_disks.rs +++ b/sled-agent/config-reconciler/src/internal_disks.rs @@ -25,6 +25,7 @@ use sled_hardware::PooledDiskError; use sled_storage::config::MountConfig; use sled_storage::dataset::CLUSTER_DATASET; use sled_storage::dataset::CONFIG_DATASET; +use sled_storage::dataset::INSTALL_DATASET; use sled_storage::dataset::M2_ARTIFACT_DATASET; use sled_storage::dataset::M2_DEBUG_DATASET; use sled_storage::disk::Disk; @@ -38,7 +39,6 @@ use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::future::Future; -use std::mem; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; @@ -70,6 +70,8 @@ enum InternalDisksReceiverInner { impl InternalDisksReceiver { /// Create an `InternalDisksReceiver` that always reports a fixed set of /// disks. + /// + /// The first disk is set as the boot disk. #[cfg(any(test, feature = "testing"))] pub fn fake_static( mount_config: Arc, @@ -77,18 +79,27 @@ impl InternalDisksReceiver { ) -> Self { let inner = InternalDisksReceiverInner::FakeStatic(Arc::new( disks - .map(|(identity, zpool_name)| { - InternalDiskDetails::fake_details(identity, zpool_name) + .enumerate() + .map(|(i, (identity, zpool_name))| { + InternalDiskDetails::fake_details( + identity, + zpool_name, + i == 0, + ) }) .collect(), )); - // We never report errors from our static set; move the sender to a task - // that idles so we don't get recv errors. + // We never report errors from our static set. If there's a Tokio + // runtime, move the sender to a task that idles so we don't get recv + // errors; otherwise, don't bother because no one can await changes on + // `errors_rx` anyway (e.g., if we're being used from a non-tokio test). 
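The tri-state handling in `ledgered_sled_config()` above is the heart of the inventory contract: "haven't looked at the internal disks yet" is an error, while "looked and found nothing ledgered" is a successful `None`. A hypothetical caller-side sketch of what that distinction buys (not real code; the error mapping is illustrative):

    // Hypothetical caller showing why the return type is
    // Result<Option<_>, _> rather than a bare Option.
    fn describe_ledger(
        handle: &ConfigReconcilerHandle,
    ) -> Result<String, String> {
        match handle.ledgered_sled_config() {
            // Too early: the internal disks haven't been checked, so a
            // "no config" answer could be wrong. Callers should retry
            // (or return a 503) rather than treat this as empty.
            Err(err) => Err(format!("ledger not available yet: {err}")),
            // Definitive: disks were checked, nothing is ledgered.
            Ok(None) => Ok("no sled config ledgered yet".to_string()),
            // Definitive: report the ledgered generation.
            Ok(Some(config)) => Ok(format!(
                "sled config ledgered at generation {}",
                config.generation
            )),
        }
    }
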
let (errors_tx, errors_rx) = watch::channel(Arc::default()); - tokio::spawn(async move { - errors_tx.closed().await; - }); + if tokio::runtime::Handle::try_current().is_ok() { + tokio::spawn(async move { + errors_tx.closed().await; + }); + } Self { mount_config, inner, errors_rx } } @@ -109,7 +120,7 @@ impl InternalDisksReceiver { .clone() .into_iter() .map(|(identity, zpool_name)| { - InternalDiskDetails::fake_details(identity, zpool_name) + InternalDiskDetails::fake_details(identity, zpool_name, false) }) .collect(); let (mapped_tx, mapped_rx) = watch::channel(Arc::new(current)); @@ -122,7 +133,9 @@ impl InternalDisksReceiver { .clone() .into_iter() .map(|(identity, zpool_name)| { - InternalDiskDetails::fake_details(identity, zpool_name) + InternalDiskDetails::fake_details( + identity, zpool_name, false, + ) }) .collect(); if mapped_tx.send(Arc::new(remapped)).is_err() { @@ -206,6 +219,20 @@ impl InternalDisksReceiver { InternalDisks { disks, mount_config: Arc::clone(&self.mount_config) } } + /// Get an [`InternalDisksWithBootDisk`], panicking if we have no boot + /// disk. + /// + /// This method is only available to tests; production code should instead + /// use `current()` and/or `wait_for_boot_disk()`. + #[cfg(any(test, feature = "testing"))] + pub fn current_with_boot_disk(&self) -> InternalDisksWithBootDisk { + let disks = self.current(); + InternalDisksWithBootDisk::new(disks).expect( + "current_with_boot_disk() should be called by \ + tests that set up a fake boot disk", + ) + } + /// Get the current set of managed internal disks and mark the returned /// value as seen. /// @@ -264,40 +291,17 @@ impl InternalDisksReceiver { /// Wait until the boot disk is managed, returning its identity. /// /// Internally updates the most-recently-seen value. - pub(crate) async fn wait_for_boot_disk(&mut self) -> Arc { - match &mut self.inner { - InternalDisksReceiverInner::Real(disks_rx) => loop { - let disks = disks_rx.borrow_and_update(); - if let Some(disk) = disks.iter().find(|d| d.is_boot_disk()) { - return Arc::clone(&disk.identity); - } - mem::drop(disks); - - disks_rx - .changed() - .await - .expect("InternalDisks task never dies"); - }, - #[cfg(any(test, feature = "testing"))] - InternalDisksReceiverInner::FakeStatic(disks) => { - if let Some(disk) = disks.iter().find(|d| d.is_boot_disk()) { - return Arc::clone(&disk.id.identity); - } - panic!("fake InternalDisksReceiver has no boot disk") - } - #[cfg(any(test, feature = "testing"))] - InternalDisksReceiverInner::FakeDynamic(disks_rx) => loop { - let disks = disks_rx.borrow_and_update(); - if let Some(disk) = disks.iter().find(|d| d.is_boot_disk()) { - return Arc::clone(&disk.id.identity); - } - mem::drop(disks); - - disks_rx - .changed() - .await - .expect("InternalDisks task never dies"); - }, + pub(crate) async fn wait_for_boot_disk( + &mut self, + ) -> InternalDisksWithBootDisk { + loop { + let current = self.current_and_update(); + if let Some(with_boot_disk) = + InternalDisksWithBootDisk::new(current) + { + return with_boot_disk; + }; + self.changed().await.expect("InternalDisks task never dies"); } } @@ -327,6 +331,12 @@ impl InternalDisks { }) } + pub fn boot_disk_install_dataset(&self) -> Option { + self.boot_disk_zpool().map(|zpool| { + zpool.dataset_mountpoint(&self.mount_config.root, INSTALL_DATASET) + }) + } + pub fn image_raw_devfs_path( &self, slot: M2Slot, @@ -393,6 +403,58 @@ impl InternalDisks { } } +/// An [`InternalDisks`] with a guaranteed-present boot disk. 
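`InternalDisksWithBootDisk` (defined just below) moves the "boot disk is present" guarantee into the type, so code downstream of `wait_for_boot_disk()` never re-handles the missing-boot-disk case. A small usage sketch, assuming the accessors added in this hunk and the slog macros in scope:

    use slog::{Logger, info};

    // Enumerate install datasets with the boot disk distinguished; note
    // that none of the boot-disk accessors return an Option.
    fn log_install_datasets(disks: &InternalDisksWithBootDisk, log: &Logger) {
        info!(
            log, "boot disk install dataset";
            "zpool" => %disks.boot_disk_zpool(),
            "path" => %disks.boot_disk_install_dataset()
        );
        for (zpool, path) in disks.non_boot_disk_install_datasets() {
            info!(
                log, "non-boot install dataset";
                "zpool" => %zpool,
                "path" => %path
            );
        }
    }
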
+pub struct InternalDisksWithBootDisk { + inner: InternalDisks, + boot_disk: InternalDiskDetailsId, +} + +impl InternalDisksWithBootDisk { + fn new(inner: InternalDisks) -> Option { + let boot_disk = inner + .disks + .iter() + .find_map(|d| if d.is_boot_disk() { Some(d.id()) } else { None })?; + Some(Self { inner, boot_disk }) + } + + fn boot_disk(&self) -> &InternalDiskDetails { + match self.inner.disks.get(&self.boot_disk) { + Some(details) => details, + None => unreachable!("boot disk present by construction"), + } + } + + pub fn boot_disk_id(&self) -> &Arc { + &self.boot_disk().id.identity + } + + pub fn boot_disk_zpool(&self) -> ZpoolName { + self.boot_disk().zpool_name + } + + pub fn boot_disk_install_dataset(&self) -> Utf8PathBuf { + self.boot_disk_zpool().dataset_mountpoint( + &self.inner.mount_config().root, + INSTALL_DATASET, + ) + } + + pub fn non_boot_disk_install_datasets( + &self, + ) -> impl Iterator + '_ { + self.inner.disks.iter().filter(|disk| disk.id != self.boot_disk).map( + |disk| { + let dataset = disk.zpool_name.dataset_mountpoint( + &self.inner.mount_config.root, + INSTALL_DATASET, + ); + (disk.zpool_name, dataset) + }, + ) + } +} + // A subset of `Disk` properties. We store this in `InternalDisks` instead of // `Disk`s both to avoid exposing raw `Disk`s outside this crate and to support // easier faking for tests. @@ -452,15 +514,19 @@ impl From<&'_ InternalDisk> for InternalDiskDetails { impl InternalDiskDetails { #[cfg(any(test, feature = "testing"))] - fn fake_details(identity: DiskIdentity, zpool_name: ZpoolName) -> Self { - // We can expand the interface for fake disks if we need to be able to - // specify more of these properties in future tests. + fn fake_details( + identity: DiskIdentity, + zpool_name: ZpoolName, + is_boot_disk: bool, + ) -> Self { Self { id: InternalDiskDetailsId { identity: Arc::new(identity), - is_boot_disk: false, + is_boot_disk, }, zpool_name, + // We can expand the interface for fake disks if we need to be able + // to specify more of these properties in future tests. 
slot: None, raw_devfs_path: None, } diff --git a/sled-agent/config-reconciler/src/lib.rs b/sled-agent/config-reconciler/src/lib.rs index eaa1c6f5f12..24a3e7b4489 100644 --- a/sled-agent/config-reconciler/src/lib.rs +++ b/sled-agent/config-reconciler/src/lib.rs @@ -76,6 +76,7 @@ pub use handle::ReconcilerInventory; pub use handle::TimeSyncConfig; pub use internal_disks::InternalDisks; pub use internal_disks::InternalDisksReceiver; +pub use internal_disks::InternalDisksWithBootDisk; pub use ledger::LedgerArtifactConfigError; pub use ledger::LedgerNewConfigError; pub use ledger::LedgerTaskError; diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 094b44e6c31..283569ade96 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -143,6 +143,7 @@ impl BootstrapAgentStartup { sled_mode, config.sidecar_revision.clone(), config.switch_zone_maghemite_links.clone(), + long_running_task_handles.zone_image_resolver.clone(), long_running_task_handles .config_reconciler .internal_disks_rx() diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index ff9a6a53251..e05eb9aa417 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -22,14 +22,13 @@ use crate::services::ServiceManager; use crate::sled_agent::SledAgent; use crate::zone_bundle::ZoneBundler; use bootstore::schemes::v0 as bootstore; -use illumos_utils::zpool::ZpoolName; use key_manager::{KeyManager, StorageKeyRequester}; use sled_agent_config_reconciler::{ ConfigReconcilerHandle, ConfigReconcilerSpawnToken, RawDisksSender, TimeSyncConfig, }; use sled_agent_types::zone_bundle::CleanupContext; -use sled_agent_zone_images::{ZoneImageSourceResolver, ZoneImageZpools}; +use sled_agent_zone_images::ZoneImageSourceResolver; use sled_hardware::{HardwareManager, SledMode, UnparsedDisk}; use sled_storage::config::MountConfig; use sled_storage::disk::RawSyntheticDisk; @@ -106,10 +105,9 @@ pub async fn spawn_all_longrunning_tasks( // Wait for the boot disk so that we can work with any ledgers, // such as those needed by the bootstore and sled-agent info!(log, "Waiting for boot disk"); - let disk_id = config_reconciler.wait_for_boot_disk().await; - info!(log, "Found boot disk {:?}", disk_id); + let internal_disks = config_reconciler.wait_for_boot_disk().await; + info!(log, "Found boot disk {:?}", internal_disks.boot_disk_id()); - let all_disks = storage_manager.get_latest_disks().await; let bootstore = spawn_bootstore_tasks( log, &config_reconciler, @@ -118,9 +116,8 @@ pub async fn spawn_all_longrunning_tasks( ) .await; - let zone_bundler = spawn_zone_bundler_tasks(log, &config_reconciler); - let zone_image_resolver = - make_zone_image_resolver(log, &all_disks, &boot_zpool); + let zone_bundler = spawn_zone_bundler_tasks(log, &config_reconciler).await; + let zone_image_resolver = ZoneImageSourceResolver::new(log, internal_disks); ( LongRunningTaskHandles { @@ -227,18 +224,7 @@ async fn spawn_zone_bundler_tasks( config_reconciler.available_datasets_rx(), CleanupContext::default(), ) -} - -fn make_zone_image_resolver( - log: &Logger, - all_disks: &AllDisks, - boot_zpool: &ZpoolName, -) -> ZoneImageSourceResolver { - let zpools = ZoneImageZpools { - root: &all_disks.mount_config().root, - all_m2_zpools: all_disks.all_m2_zpools(), - }; - ZoneImageSourceResolver::new(log, &zpools, boot_zpool) + .await } async fn upsert_synthetic_disks_if_needed( diff --git a/sled-agent/src/services.rs 
b/sled-agent/src/services.rs index 31f2f6a70f8..98453b2f8f8 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -96,15 +96,15 @@ use omicron_common::ledger::Ledgerable; use omicron_ddm_admin_client::DdmError; use omicron_uuid_kinds::OmicronZoneUuid; use rand::prelude::SliceRandom; -use sled_agent_zone_images::{ZoneImageSourceResolver, ZoneImageZpools}; use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::sled::SWITCH_ZONE_BASEBOARD_FILE; +use sled_agent_zone_images::ZoneImageSourceResolver; use sled_hardware::SledMode; use sled_hardware::is_gimlet; use sled_hardware::underlay; use sled_hardware_types::Baseboard; use sled_storage::config::MountConfig; -use sled_storage::dataset::{INSTALL_DATASET, ZONE_DATASET}; +use sled_storage::dataset::ZONE_DATASET; use slog::Logger; use slog_error_chain::InlineErrorChain; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; @@ -920,12 +920,6 @@ impl ServiceManager { } } - - // TODO-john still needed? - #[cfg(all(test, target_os = "illumos"))] - fn override_image_directory(&self, path: Utf8PathBuf) { - self.inner.zone_image_resolver.override_image_directory(path); - } pub(crate) fn ddm_reconciler(&self) -> &DdmReconciler { &self.inner.ddm_reconciler } @@ -1530,44 +1524,9 @@ impl ServiceManager { ZoneArgs::Omicron(zone_config) => &zone_config.zone.image_source, ZoneArgs::Switch(_) => &OmicronZoneImageSource::InstallDataset, }; - let all_disks = self.inner.storage.get_latest_disks().await; - let zpools = ZoneImageZpools { - root: &all_disks.mount_config().root, - all_m2_zpools: all_disks.all_m2_zpools(), -/* TODO-john fix this - let zone_image_file_name = match image_source { - OmicronZoneImageSource::InstallDataset => None, - OmicronZoneImageSource::Artifact { hash } => Some(hash.to_string()), - }; - let internal_disks = self.inner.internal_disks_rx.current(); - let zone_image_paths = match image_source { - OmicronZoneImageSource::InstallDataset => { - // Look for the image in the ramdisk first - let mut zone_image_paths = - vec![Utf8PathBuf::from("/opt/oxide")]; - - // If the boot disk exists, look for the image in the "install" - // dataset there too. - if let Some(boot_zpool) = internal_disks.boot_disk_zpool() { - zone_image_paths.push(boot_zpool.dataset_mountpoint( - &internal_disks.mount_config().root, - INSTALL_DATASET, - )); - } - - zone_image_paths - } - OmicronZoneImageSource::Artifact { .. } => { - internal_disks.all_artifact_datasets().collect() - } -*/ - }; - let boot_zpool = - all_disks.boot_disk().map(|(_, boot_zpool)| boot_zpool); let file_source = self.inner.zone_image_resolver.file_source_for( image_source, - &zpools, - boot_zpool.as_ref(), + self.inner.internal_disks_rx.current(), ); let zone_type_str = match &request { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 8c9972dec60..f4349a3c6c9 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -4,14 +4,14 @@ //! 
Sled agent implementation -use crate::artifact_store::{self, ArtifactStore}; +use crate::artifact_store::ArtifactStore; use crate::boot_disk_os_writer::BootDiskOsWriter; use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::early_networking::EarlyNetworkSetupError; use crate::config::Config; use crate::instance_manager::InstanceManager; use crate::long_running_tasks::LongRunningTaskHandles; -use crate::metrics::{self, MetricsManager, MetricsRequestQueue}; +use crate::metrics::{MetricsManager, MetricsRequestQueue}; use crate::nexus::{ NexusClient, NexusNotifierHandle, NexusNotifierInput, NexusNotifierTask, }; @@ -22,6 +22,7 @@ use crate::support_bundle::storage::SupportBundleManager; use crate::vmm_reservoir::{ReservoirMode, VmmReservoirManager}; use crate::zone_bundle::BundleError; use crate::zone_bundle::{self, ZoneBundler}; +use anyhow::anyhow; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use derive_more::From; @@ -31,15 +32,14 @@ use futures::stream::FuturesUnordered; use illumos_utils::opte::PortManager; use illumos_utils::running_zone::RunningZone; use illumos_utils::zpool::ZpoolName; +use itertools::Itertools as _; use nexus_sled_agent_shared::inventory::{ - Inventory, OmicronSledConfig, SledRole, + Inventory, OmicronSledConfig, OmicronZoneConfig, SledRole, }; use omicron_common::address::{ Ipv6Subnet, SLED_PREFIX, get_sled_address, get_switch_zone_address, }; -use omicron_common::api::external::{ - ByteCount, ByteCountRangeError, Generation, Vni, -}; +use omicron_common::api::external::{ByteCount, ByteCountRangeError, Vni}; use omicron_common::api::internal::nexus::{DiskRuntimeState, SledVmmState}; use omicron_common::api::internal::shared::{ ExternalIpGatewayMap, HostPortConfig, RackNetworkConfig, @@ -71,7 +71,9 @@ use sled_agent_types::zone_bundle::{ }; use sled_diagnostics::SledDiagnosticsCmdError; use sled_diagnostics::SledDiagnosticsCmdOutput; -use sled_hardware::{HardwareManager, MemoryReservations, underlay}; +use sled_hardware::{ + HardwareManager, MemoryReservations, PooledDiskError, underlay, +}; use sled_hardware_types::Baseboard; use sled_hardware_types::underlay::BootstrapInterface; use sled_storage::config::MountConfig; @@ -84,7 +86,7 @@ use std::sync::Arc; use tufaceous_artifact::ArtifactHash; use uuid::Uuid; -use illumos_utils::dladm::Dladm; +use illumos_utils::dladm::{Dladm, EtherstubVnic}; use illumos_utils::zone::Api; use illumos_utils::zone::Zones; @@ -295,18 +297,15 @@ pub enum InventoryError { // system. #[error(transparent)] BadByteCount(#[from] ByteCountRangeError), - #[error("failed to get current ledgered disks")] - GetDisksConfig(#[source] sled_storage::error::Error), - #[error("failed to get current ledgered datasets")] - GetDatasetsConfig(#[source] sled_storage::error::Error), + #[error(transparent)] + InventoryError(#[from] sled_agent_config_reconciler::InventoryError), } impl From for omicron_common::api::external::Error { fn from(inventory_error: InventoryError) -> Self { match inventory_error { e @ (InventoryError::BadByteCount(..) - | InventoryError::GetDisksConfig(_) - | InventoryError::GetDatasetsConfig(_)) => { + | InventoryError::InventoryError(_)) => { omicron_common::api::external::Error::internal_error( &InlineErrorChain::new(&e).to_string(), ) @@ -609,8 +608,8 @@ impl SledAgent { // Start reconciling against our ledgered sled config. 
config_reconciler.spawn_reconciliation_task( - etherstub_vnic, ReconcilerFacilities { + etherstub_vnic, service_manager: services.clone(), metrics_queue: metrics_manager.request_queue(), zone_bundler: long_running_task_handles.zone_bundler.clone(), @@ -1133,7 +1132,7 @@ impl SledAgent { ledgered_sled_config, reconciler_status, last_reconciliation, - } = self.inner.config_reconciler.inventory(); + } = self.inner.config_reconciler.inventory(&self.log).await?; Ok(Inventory { sled_id, @@ -1312,15 +1311,16 @@ pub async fn sled_add( } struct ReconcilerFacilities { + etherstub_vnic: EtherstubVnic, service_manager: ServiceManager, metrics_queue: MetricsRequestQueue, zone_bundler: ZoneBundler, } impl SledAgentFacilities for ReconcilerFacilities { - type StartZoneError = services::Error; - type MetricsUntrackZoneLinksError = Vec; - type ZoneBundleCreateError = BundleError; + fn underlay_vnic(&self) -> &EtherstubVnic { + &self.etherstub_vnic + } async fn on_time_sync(&self) { self.service_manager.on_time_sync().await @@ -1332,26 +1332,35 @@ impl SledAgentFacilities for ReconcilerFacilities { mount_config: &MountConfig, is_time_synchronized: bool, all_u2_pools: &[ZpoolName], - ) -> Result { - self.service_manager + ) -> anyhow::Result { + let zone = self + .service_manager .start_omicron_zone( mount_config, zone_config, is_time_synchronized, all_u2_pools, ) - .await + .await?; + Ok(zone) } fn metrics_untrack_zone_links( &self, zone: &RunningZone, - ) -> Result<(), Self::MetricsUntrackZoneLinksError> { - self.metrics_queue.untrack_zone_links(zone) - } - - fn ddm_add_internal_dns_prefix(&self, prefix: Ipv6Subnet) { - self.service_manager.ddm_reconciler().add_internal_dns_subnet(prefix); + ) -> anyhow::Result<()> { + match self.metrics_queue.untrack_zone_links(zone) { + Ok(()) => Ok(()), + Err(errors) => { + let mut errors = + errors.iter().map(|err| InlineErrorChain::new(err)); + Err(anyhow!( + "{} errors untracking zone links: {}", + errors.len(), + errors.join(", ") + )) + } + } } fn ddm_remove_internal_dns_prefix(&self, prefix: Ipv6Subnet) { @@ -1364,7 +1373,7 @@ impl SledAgentFacilities for ReconcilerFacilities { &self, zone: &RunningZone, cause: ZoneBundleCause, - ) -> Result<(), Self::ZoneBundleCreateError> { + ) -> anyhow::Result<()> { self.zone_bundler.create(zone, cause).await?; Ok(()) } @@ -1374,12 +1383,10 @@ impl SledAgentFacilities for ReconcilerFacilities { struct SledAgentArtifactStoreWrapper(Arc>); impl SledAgentArtifactStore for SledAgentArtifactStoreWrapper { - type ArtifactExistsValidationError = artifact_store::Error; - async fn validate_artifact_exists_in_storage( &self, artifact: ArtifactHash, - ) -> Result<(), Self::ArtifactExistsValidationError> { + ) -> anyhow::Result<()> { self.0.get(artifact).await?; Ok(()) } diff --git a/sled-agent/src/support_bundle/storage.rs b/sled-agent/src/support_bundle/storage.rs index ad126e9e1b5..0c64b625d8b 100644 --- a/sled-agent/src/support_bundle/storage.rs +++ b/sled-agent/src/support_bundle/storage.rs @@ -14,7 +14,6 @@ use futures::Stream; use futures::StreamExt; use illumos_utils::zfs::DatasetProperties; use omicron_common::api::external::Error as ExternalError; -use omicron_common::api::external::Generation; use omicron_common::disk::CompressionAlgorithm; use omicron_common::disk::DatasetConfig; use omicron_common::disk::DatasetName; @@ -31,6 +30,11 @@ use sha2::{Digest, Sha256}; use sled_agent_api::*; use sled_agent_config_reconciler::ConfigReconcilerHandle; use sled_agent_config_reconciler::DatasetTaskError; +use 
sled_agent_config_reconciler::InventoryError; +use sled_agent_config_reconciler::NestedDatasetDestroyError; +use sled_agent_config_reconciler::NestedDatasetEnsureError; +use sled_agent_config_reconciler::NestedDatasetListError; +use sled_agent_config_reconciler::NestedDatasetMountError; use sled_agent_types::support_bundle::BUNDLE_FILE_NAME; use sled_agent_types::support_bundle::BUNDLE_TMP_FILE_NAME_SUFFIX; use sled_storage::manager::NestedDatasetConfig; @@ -38,7 +42,6 @@ use sled_storage::manager::NestedDatasetListOptions; use sled_storage::manager::NestedDatasetLocation; use slog::Logger; use slog_error_chain::InlineErrorChain; -use std::collections::BTreeMap; use std::io::Write; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; @@ -91,6 +94,21 @@ pub enum Error { #[error(transparent)] DatasetTask(#[from] DatasetTaskError), + + #[error("Could not access ledgered sled config")] + LedgeredSledConfig(#[source] InventoryError), + + #[error(transparent)] + NestedDatasetMountError(#[from] NestedDatasetMountError), + + #[error(transparent)] + NestedDatasetEnsureError(#[from] NestedDatasetEnsureError), + + #[error(transparent)] + NestedDatasetDestroyError(#[from] NestedDatasetDestroyError), + + #[error(transparent)] + NestedDatasetListError(#[from] NestedDatasetListError), } fn err_str(err: &dyn std::error::Error) -> String { @@ -154,7 +172,7 @@ pub trait LocalStorage: Sync { /// Returns all nested datasets within an existing dataset async fn dyn_nested_dataset_list( &self, - name: NestedDatasetLocation, + name: DatasetName, options: NestedDatasetListOptions, ) -> Result, Error>; @@ -177,21 +195,20 @@ impl LocalStorage for ConfigReconcilerHandle { async fn dyn_datasets_config_list(&self) -> Result { // TODO-cleanup This is super gross; add a better API (maybe fetch a // single dataset by ID, since that's what our caller wants?) - Ok(self - .inventory() - .ledgered_sled_config - .map(|sled_config| DatasetsConfig { - generation: sled_config.generation, - datasets: sled_config - .datasets - .into_iter() - .map(|d| (d.id, d)) - .collect(), - }) - .unwrap_or_else(|| DatasetsConfig { - generation: Generation::new(), - datasets: BTreeMap::new(), - })) + let sled_config = + self.ledgered_sled_config().map_err(Error::LedgeredSledConfig)?; + let sled_config = match sled_config { + Some(config) => config, + None => return Ok(DatasetsConfig::default()), + }; + Ok(DatasetsConfig { + generation: sled_config.generation, + datasets: sled_config + .datasets + .into_iter() + .map(|d| (d.id, d)) + .collect(), + }) } async fn dyn_dataset_get( @@ -222,29 +239,39 @@ impl LocalStorage for ConfigReconcilerHandle { ) -> Result { self.nested_dataset_ensure_mounted(dataset) .await - .map_err(|err| err.into()) + .map_err(Error::from)? + .map_err(Error::from) } async fn dyn_nested_dataset_list( &self, - name: NestedDatasetLocation, + name: DatasetName, options: NestedDatasetListOptions, ) -> Result, Error> { - self.nested_dataset_list(name, options).await.map_err(|err| err.into()) + self.nested_dataset_list(name, options) + .await + .map_err(Error::from)? + .map_err(Error::from) } async fn dyn_nested_dataset_ensure( &self, config: NestedDatasetConfig, ) -> Result<(), Error> { - self.nested_dataset_ensure(config).await.map_err(|err| err.into()) + self.nested_dataset_ensure(config) + .await + .map_err(Error::from)? 
+ .map_err(Error::from) } async fn dyn_nested_dataset_destroy( &self, name: NestedDatasetLocation, ) -> Result<(), Error> { - self.nested_dataset_destroy(name).await.map_err(|err| err.into()) + self.nested_dataset_destroy(name) + .await + .map_err(Error::from)? + .map_err(Error::from) } } @@ -273,10 +300,15 @@ impl LocalStorage for crate::sim::Storage { async fn dyn_nested_dataset_list( &self, - name: NestedDatasetLocation, + name: DatasetName, options: NestedDatasetListOptions, ) -> Result, Error> { - self.lock().nested_dataset_list(name, options).map_err(|err| err.into()) + self.lock() + .nested_dataset_list( + NestedDatasetLocation { path: String::new(), root: name }, + options, + ) + .map_err(|err| err.into()) } async fn dyn_nested_dataset_ensure( @@ -493,12 +525,10 @@ impl<'a> SupportBundleManager<'a> { ) -> Result, Error> { let root = self.get_mounted_dataset_config(zpool_id, dataset_id).await?.name; - let dataset_location = - NestedDatasetLocation { path: String::from(""), root }; let datasets = self .storage .dyn_nested_dataset_list( - dataset_location, + root, NestedDatasetListOptions::ChildrenOnly, ) .await?; @@ -986,12 +1016,15 @@ mod tests { async fn dyn_nested_dataset_list( &self, - name: NestedDatasetLocation, + name: DatasetName, options: NestedDatasetListOptions, ) -> Result, Error> { - self.nested_dataset_list(name, options) - .await - .map_err(|err| err.into()) + self.nested_dataset_list( + NestedDatasetLocation { path: String::new(), root: name }, + options, + ) + .await + .map_err(|err| err.into()) } async fn dyn_nested_dataset_ensure( diff --git a/sled-agent/zone-images/Cargo.toml b/sled-agent/zone-images/Cargo.toml index 5c388a19dd2..cd63d879001 100644 --- a/sled-agent/zone-images/Cargo.toml +++ b/sled-agent/zone-images/Cargo.toml @@ -16,6 +16,7 @@ nexus-sled-agent-shared.workspace = true omicron-common.workspace = true omicron-workspace-hack.workspace = true serde_json.workspace = true +sled-agent-config-reconciler.workspace = true sled-storage.workspace = true slog.workspace = true slog-error-chain.workspace = true @@ -26,3 +27,4 @@ camino-tempfile-ext.workspace = true dropshot.workspace = true omicron-uuid-kinds.workspace = true pretty_assertions.workspace = true +sled-agent-config-reconciler = { workspace = true, features = ["testing"] } diff --git a/sled-agent/zone-images/src/mupdate_override.rs b/sled-agent/zone-images/src/mupdate_override.rs index 14ec5fd4c72..e89cd2aca92 100644 --- a/sled-agent/zone-images/src/mupdate_override.rs +++ b/sled-agent/zone-images/src/mupdate_override.rs @@ -11,14 +11,13 @@ use std::fs::FileType; use std::io; use std::sync::Arc; -use crate::ZoneImageZpools; use camino::Utf8Path; use camino::Utf8PathBuf; use id_map::IdMap; use id_map::IdMappable; use illumos_utils::zpool::ZpoolName; use omicron_common::update::MupdateOverrideInfo; -use sled_storage::dataset::INSTALL_DATASET; +use sled_agent_config_reconciler::InternalDisksWithBootDisk; use slog::debug; use slog::error; use slog::info; @@ -58,11 +57,9 @@ impl AllMupdateOverrides { /// be authoritative). Consider extracting this out into something generic. 
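`read_all()` below leans on a per-dataset helper, `read_mupdate_override()`, that this diff does not show. Judging from the tests further down, its semantics are roughly: a missing install dataset is an error, a present dataset with no override file is `Ok(None)`, and a malformed file is a deserialize error. A hedged sketch of that shape; the function and the file-name literal are placeholders, not the real constants:

    use anyhow::bail;
    use camino::Utf8Path;
    use omicron_common::update::MupdateOverrideInfo;

    fn read_override_sketch(
        dataset_dir: &Utf8Path,
    ) -> anyhow::Result<Option<MupdateOverrideInfo>> {
        if !dataset_dir.is_dir() {
            // A missing install dataset is an error, not "no override".
            bail!("install dataset {dataset_dir} is not a directory");
        }
        // Placeholder file name; the real code uses a shared constant.
        let path = dataset_dir.join("mupdate-override.json");
        match std::fs::read_to_string(&path) {
            Ok(contents) => Ok(Some(serde_json::from_str(&contents)?)),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None),
            Err(err) => Err(err.into()),
        }
    }
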
pub(crate) fn read_all( log: &slog::Logger, - zpools: &ZoneImageZpools<'_>, - boot_zpool: &ZpoolName, + internal_disks: InternalDisksWithBootDisk, ) -> Self { - let dataset = - boot_zpool.dataset_mountpoint(zpools.root, INSTALL_DATASET); + let dataset = internal_disks.boot_disk_install_dataset(); let (boot_disk_path, boot_disk_res) = read_mupdate_override(log, &dataset); @@ -70,18 +67,12 @@ impl AllMupdateOverrides { // Now read the file from all other disks. We attempt to make sure they // match up and will log a warning if they don't, though (until we have // a better story on transient failures) it's not fatal. - let non_boot_zpools = zpools - .all_m2_zpools - .iter() - .filter(|&zpool_name| zpool_name != boot_zpool); - let non_boot_disks_overrides = non_boot_zpools - .map(|zpool_name| { - let dataset = - zpool_name.dataset_mountpoint(zpools.root, INSTALL_DATASET); - + let non_boot_datasets = internal_disks.non_boot_disk_install_datasets(); + let non_boot_disks_overrides = non_boot_datasets + .map(|(zpool_name, dataset)| { let (path, res) = read_mupdate_override(log, &dataset); MupdateOverrideNonBootInfo { - zpool_name: *zpool_name, + zpool_name, path, result: MupdateOverrideNonBootResult::new( res, @@ -92,7 +83,7 @@ impl AllMupdateOverrides { .collect(); let ret = Self { - boot_zpool: *boot_zpool, + boot_zpool: internal_disks.boot_disk_zpool(), boot_disk_path, boot_disk_override: boot_disk_res, non_boot_disk_overrides: non_boot_disks_overrides, @@ -483,9 +474,12 @@ mod tests { use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; use dropshot::test_util::LogContext; + use omicron_common::disk::DiskIdentity; use omicron_uuid_kinds::MupdateOverrideUuid; use omicron_uuid_kinds::ZpoolUuid; use pretty_assertions::assert_eq; + use sled_agent_config_reconciler::InternalDisksReceiver; + use sled_storage::config::MountConfig; use std::collections::BTreeSet; use std::io; use std::sync::LazyLock; @@ -536,6 +530,33 @@ mod tests { static OVERRIDE_2_UUID: MupdateOverrideUuid = MupdateOverrideUuid::from_u128(0x20588f8f_c680_4101_afc7_820226d03ada); + fn make_internal_disks( + root: &Utf8Path, + boot_zpool: ZpoolName, + other_zpools: &[ZpoolName], + ) -> InternalDisksWithBootDisk { + let identity_from_zpool = |zpool: ZpoolName| DiskIdentity { + vendor: "mupdate-override-tests".to_string(), + model: "fake-disk".to_string(), + serial: zpool.id().to_string(), + }; + let mount_config = MountConfig { + root: root.to_path_buf(), + synthetic_disk_root: root.to_path_buf(), + }; + InternalDisksReceiver::fake_static( + Arc::new(mount_config), + std::iter::once((identity_from_zpool(boot_zpool), boot_zpool)) + .chain( + other_zpools + .iter() + .copied() + .map(|pool| (identity_from_zpool(pool), pool)), + ), + ) + .current_with_boot_disk() + } + /// Boot disk present / no other disks. (This produces a warning, but is /// otherwise okay.) 
#[test] @@ -551,12 +572,9 @@ mod tests { .write_str(&serde_json::to_string(&override_info).unwrap()) .unwrap(); - let zpools = ZoneImageZpools { - root: dir.path(), - all_m2_zpools: vec![BOOT_ZPOOL], - }; + let internal_disks = make_internal_disks(dir.path(), BOOT_ZPOOL, &[]); let overrides = - AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + AllMupdateOverrides::read_all(&logctx.log, internal_disks); assert_eq!( overrides.boot_disk_override.as_ref().unwrap().as_ref(), Some(&override_info) @@ -583,13 +601,11 @@ mod tests { .write_str(&serde_json::to_string(&override_info).unwrap()) .unwrap(); - let zpools = ZoneImageZpools { - root: dir.path(), - all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], - }; + let internal_disks = + make_internal_disks(dir.path(), BOOT_ZPOOL, &[NON_BOOT_ZPOOL]); let overrides = - AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + AllMupdateOverrides::read_all(&logctx.log, internal_disks); assert_eq!( overrides.boot_disk_override.as_ref().unwrap().as_ref(), Some(&override_info) @@ -621,13 +637,11 @@ mod tests { dir.child(&BOOT_PATHS.install_dataset).create_dir_all().unwrap(); dir.child(&NON_BOOT_PATHS.install_dataset).create_dir_all().unwrap(); - let zpools = ZoneImageZpools { - root: dir.path(), - all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], - }; + let internal_disks = + make_internal_disks(dir.path(), BOOT_ZPOOL, &[NON_BOOT_ZPOOL]); let overrides = - AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + AllMupdateOverrides::read_all(&logctx.log, internal_disks); assert_eq!( overrides.boot_disk_override.as_ref().unwrap().as_ref(), None, @@ -662,13 +676,11 @@ mod tests { // Create the directory, but not the override JSON within it. dir.child(&NON_BOOT_PATHS.install_dataset).create_dir_all().unwrap(); - let zpools = ZoneImageZpools { - root: dir.path(), - all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], - }; + let internal_disks = + make_internal_disks(dir.path(), BOOT_ZPOOL, &[NON_BOOT_ZPOOL]); let overrides = - AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + AllMupdateOverrides::read_all(&logctx.log, internal_disks); assert_eq!( overrides.boot_disk_override.as_ref().unwrap().as_ref(), Some(&override_info) @@ -706,12 +718,10 @@ mod tests { .write_str(&serde_json::to_string(&override_info).unwrap()) .unwrap(); - let zpools = ZoneImageZpools { - root: dir.path(), - all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], - }; + let internal_disks = + make_internal_disks(dir.path(), BOOT_ZPOOL, &[NON_BOOT_ZPOOL]); let overrides = - AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + AllMupdateOverrides::read_all(&logctx.log, internal_disks); assert_eq!( overrides.boot_disk_override.as_ref().unwrap().as_ref(), None, @@ -752,12 +762,10 @@ mod tests { .write_str(&serde_json::to_string(&override_info_2).unwrap()) .expect("failed to write override json"); - let zpools = ZoneImageZpools { - root: dir.path(), - all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], - }; + let internal_disks = + make_internal_disks(dir.path(), BOOT_ZPOOL, &[NON_BOOT_ZPOOL]); let overrides = - AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + AllMupdateOverrides::read_all(&logctx.log, internal_disks); assert_eq!( overrides.boot_disk_override.as_ref().unwrap().as_ref(), Some(&override_info), @@ -798,12 +806,10 @@ mod tests { .create_dir_all() .unwrap(); - let zpools = ZoneImageZpools { - root: dir.path(), - all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], - }; + let internal_disks = + 
make_internal_disks(dir.path(), BOOT_ZPOOL, &[NON_BOOT_ZPOOL]); let overrides = - AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + AllMupdateOverrides::read_all(&logctx.log, internal_disks); assert_eq!( overrides.boot_disk_override.as_ref().unwrap_err(), &dataset_missing_error( @@ -841,12 +847,10 @@ mod tests { dir.child(&BOOT_PATHS.install_dataset).touch().unwrap(); dir.child(&NON_BOOT_PATHS.install_dataset).touch().unwrap(); - let zpools = ZoneImageZpools { - root: dir.path(), - all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], - }; + let internal_disks = + make_internal_disks(dir.path(), BOOT_ZPOOL, &[NON_BOOT_ZPOOL]); let overrides = - AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + AllMupdateOverrides::read_all(&logctx.log, internal_disks); assert_eq!( overrides.boot_disk_override.as_ref().unwrap_err(), &dataset_not_dir_error( @@ -892,17 +896,13 @@ mod tests { // Read error (empty file). dir.child(&NON_BOOT_3_PATHS.override_json).touch().unwrap(); - let zpools = ZoneImageZpools { - root: dir.path(), - all_m2_zpools: vec![ - BOOT_ZPOOL, - NON_BOOT_ZPOOL, - NON_BOOT_2_ZPOOL, - NON_BOOT_3_ZPOOL, - ], - }; + let internal_disks = make_internal_disks( + dir.path(), + BOOT_ZPOOL, + &[NON_BOOT_ZPOOL, NON_BOOT_2_ZPOOL, NON_BOOT_3_ZPOOL], + ); let overrides = - AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + AllMupdateOverrides::read_all(&logctx.log, internal_disks); assert_eq!( overrides.boot_disk_override.as_ref().unwrap_err(), &deserialize_error(dir.path(), &BOOT_PATHS.override_json, "",), diff --git a/sled-agent/zone-images/src/source_resolver.rs b/sled-agent/zone-images/src/source_resolver.rs index 576e1676ad0..0ef868960ae 100644 --- a/sled-agent/zone-images/src/source_resolver.rs +++ b/sled-agent/zone-images/src/source_resolver.rs @@ -5,12 +5,10 @@ //! Zone image lookup. use crate::AllMupdateOverrides; -use camino::Utf8Path; use camino::Utf8PathBuf; -use illumos_utils::zpool::ZpoolName; use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; -use sled_storage::dataset::INSTALL_DATASET; -use sled_storage::dataset::M2_ARTIFACT_DATASET; +use sled_agent_config_reconciler::InternalDisks; +use sled_agent_config_reconciler::InternalDisksWithBootDisk; use slog::o; use std::sync::Arc; use std::sync::Mutex; @@ -29,16 +27,6 @@ pub struct ZoneImageFileSource { pub search_paths: Vec, } -/// A description of zpools to examine for zone images. -pub struct ZoneImageZpools<'a> { - /// The root directory, typically `/`. - pub root: &'a Utf8Path, - - /// The full set of M.2 zpools that are currently known. Must be non-empty, - /// but it can include the boot zpool. - pub all_m2_zpools: Vec, -} - /// Resolves [`OmicronZoneImageSource`] instances into file names and search /// paths. /// @@ -53,33 +41,25 @@ impl ZoneImageSourceResolver { /// Creates a new `ZoneImageSourceResolver`. pub fn new( log: &slog::Logger, - zpools: &ZoneImageZpools<'_>, - boot_zpool: &ZpoolName, + internal_disks: InternalDisksWithBootDisk, ) -> Self { Self { inner: Arc::new(Mutex::new(ResolverInner::new( - log, zpools, boot_zpool, + log, + internal_disks, ))), } } - /// Overrides the image directory with another one. - /// - /// Intended for testing. - pub fn override_image_directory(&self, path: Utf8PathBuf) { - self.inner.lock().unwrap().override_image_directory(path); - } - /// Returns a [`ZoneImageFileSource`] consisting of the file name, plus a /// list of potential paths to search, for a zone image. 
pub fn file_source_for( &self, image_source: &OmicronZoneImageSource, - zpools: &ZoneImageZpools<'_>, - boot_zpool: Option<&ZpoolName>, + internal_disks: InternalDisks, ) -> ZoneImageFileSource { let inner = self.inner.lock().unwrap(); - inner.file_source_for(image_source, zpools, boot_zpool) + inner.file_source_for(image_source, internal_disks) } } @@ -99,39 +79,20 @@ struct ResolverInner { impl ResolverInner { fn new( log: &slog::Logger, - zpools: &ZoneImageZpools<'_>, - boot_zpool: &ZpoolName, + internal_disks: InternalDisksWithBootDisk, ) -> Self { let log = log.new(o!("component" => "ZoneImageSourceResolver")); let mupdate_overrides = - AllMupdateOverrides::read_all(&log, zpools, boot_zpool); + AllMupdateOverrides::read_all(&log, internal_disks); Self { log, image_directory_override: None, mupdate_overrides } } - fn override_image_directory( - &mut self, - image_directory_override: Utf8PathBuf, - ) { - if let Some(dir) = &self.image_directory_override { - // Allow idempotent sets to the same directory -- some tests do - // this. - if image_directory_override != *dir { - panic!( - "image_directory_override already set to `{dir}`, \ - attempting to set it to `{image_directory_override}`" - ); - } - } - self.image_directory_override = Some(image_directory_override); - } - fn file_source_for( &self, image_source: &OmicronZoneImageSource, - zpools: &ZoneImageZpools<'_>, - boot_zpool: Option<&ZpoolName>, + internal_disks: InternalDisks, ) -> ZoneImageFileSource { let file_name = match image_source { OmicronZoneImageSource::InstallDataset => { @@ -153,33 +114,17 @@ impl ResolverInner { // If the boot disk exists, look for the image in the "install" // dataset on the boot zpool. - if let Some(boot_zpool) = boot_zpool { - zone_image_paths.push( - boot_zpool - .dataset_mountpoint(zpools.root, INSTALL_DATASET), - ); + if let Some(path) = internal_disks.boot_disk_install_dataset() { + zone_image_paths.push(path); } zone_image_paths } OmicronZoneImageSource::Artifact { .. } => { - // Search both artifact datasets, but look on the boot disk first. - // This iterator starts with the zpool for the boot disk (if it - // exists), and then is followed by all other zpools. - let zpool_iter = boot_zpool.into_iter().chain( - zpools - .all_m2_zpools - .iter() - .filter(|zpool| Some(zpool) != boot_zpool.as_ref()), - ); - zpool_iter - .map(|zpool| { - zpool.dataset_mountpoint( - zpools.root, - M2_ARTIFACT_DATASET, - ) - }) - .collect() + // Search both artifact datasets. This iterator starts with the + // dataset for the boot disk (if it exists), and then is followed + // by all other disks. 
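For context on how a `ZoneImageFileSource` is meant to be consumed: the caller looks for `file_name` under each entry of `search_paths` in order, so listing the boot disk's datasets first gives them priority. A self-contained sketch of that lookup (stand-in struct and hypothetical paths, not the real consumer):

    use std::path::PathBuf;

    // Stand-in for ZoneImageFileSource: a file name plus ordered search paths.
    struct FileSource {
        file_name: String,
        search_paths: Vec<PathBuf>,
    }

    /// Return the first search path that actually contains `file_name`.
    fn resolve(source: &FileSource) -> Option<PathBuf> {
        source
            .search_paths
            .iter()
            .map(|dir| dir.join(&source.file_name))
            .find(|candidate| candidate.exists())
    }

    fn main() {
        let source = FileSource {
            file_name: "oximeter.tar.gz".to_string(),
            search_paths: vec![
                PathBuf::from("/fake/boot/install"),
                PathBuf::from("/fake/other/install"),
            ],
        };
        match resolve(&source) {
            Some(path) => println!("would use image at {}", path.display()),
            None => println!("image not found on any search path"),
        }
    }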
+ internal_disks.all_artifact_datasets().collect() } }; From 2abceb78cbc6712363a93d81706ce48b9abc31ae Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 20 May 2025 17:40:52 -0400 Subject: [PATCH 04/10] fill in some TODOs that were waiting on reconciler --- nexus-sled-agent-shared/src/inventory.rs | 49 +++++++++++++++++++ nexus/inventory/src/examples.rs | 27 ++-------- nexus/reconfigurator/planning/src/planner.rs | 49 +++++++++---------- nexus/reconfigurator/planning/src/system.rs | 29 ++++++++--- .../background/tasks/sync_service_zone_nat.rs | 24 ++++----- 5 files changed, 108 insertions(+), 70 deletions(-) diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 8495c774c49..eae39f4d61f 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -128,6 +128,55 @@ pub struct ConfigReconcilerInventory { pub zones: BTreeMap, } +impl ConfigReconcilerInventory { + /// Iterate over all running zones as reported by the last reconciliation + /// result. + /// + /// This includes zones that are both present in `last_reconciled_config` + /// and whose status in `zones` indicates "successfully running". + pub fn running_omicron_zones( + &self, + ) -> impl Iterator { + self.zones.iter().filter_map(|(zone_id, result)| { + match result { + ConfigReconcilerInventoryResult::Ok => (), + ConfigReconcilerInventoryResult::Err { .. } => return None, + }; + self.last_reconciled_config.zones.get(zone_id) + }) + } + + /// Given a sled config, produce a reconciler result that sled-agent could + /// have emitted if reconciliation succeeded. + /// + /// This method should only be used by tests and dev tools; real code should + /// look at the actual `last_reconciliation` value from the parent + /// [`Inventory`]. + pub fn debug_assume_success(config: OmicronSledConfig) -> Self { + let external_disks = config + .disks + .iter() + .map(|d| (d.id, ConfigReconcilerInventoryResult::Ok)) + .collect(); + let datasets = config + .datasets + .iter() + .map(|d| (d.id, ConfigReconcilerInventoryResult::Ok)) + .collect(); + let zones = config + .zones + .iter() + .map(|z| (z.id, ConfigReconcilerInventoryResult::Ok)) + .collect(); + Self { + last_reconciled_config: config, + external_disks, + datasets, + zones, + } + } +} + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)] #[serde(tag = "result", rename_all = "snake_case")] pub enum ConfigReconcilerInventoryResult { diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index a7603195273..07af0890f46 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -16,7 +16,6 @@ use gateway_client::types::SpType; use gateway_types::rot::RotSlot; use nexus_sled_agent_shared::inventory::Baseboard; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory; -use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; use nexus_sled_agent_shared::inventory::Inventory; use nexus_sled_agent_shared::inventory::InventoryDataset; @@ -643,29 +642,9 @@ pub fn sled_agent( ledgered_sled_config: Option, ) -> Inventory { // Assume the `ledgered_sled_config` was reconciled successfully. 
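The two new inventory helpers pair naturally: `debug_assume_success` fabricates the reconciler output for a config, and `running_omicron_zones` is what consumers iterate over (the planner's NTP check later in this series does exactly that). A short usage sketch, assuming the `nexus-sled-agent-shared` types above are in scope:

    use nexus_sled_agent_shared::inventory::{
        ConfigReconcilerInventory, OmicronSledConfig,
    };

    /// Would an inventory that successfully reconciled `config` report a
    /// running NTP zone?
    fn has_running_ntp(config: OmicronSledConfig) -> bool {
        // Pretend sled-agent reconciled `config` with no errors...
        let reconciled = ConfigReconcilerInventory::debug_assume_success(config);
        // ...then ask which zones are actually running.
        reconciled.running_omicron_zones().any(|z| z.zone_type.is_ntp())
    }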
- let last_reconciliation = ledgered_sled_config.clone().map(|config| { - let external_disks = config - .disks - .iter() - .map(|d| (d.id, ConfigReconcilerInventoryResult::Ok)) - .collect(); - let datasets = config - .datasets - .iter() - .map(|d| (d.id, ConfigReconcilerInventoryResult::Ok)) - .collect(); - let zones = config - .zones - .iter() - .map(|z| (z.id, ConfigReconcilerInventoryResult::Ok)) - .collect(); - ConfigReconcilerInventory { - last_reconciled_config: config, - external_disks, - datasets, - zones, - } - }); + let last_reconciliation = ledgered_sled_config + .clone() + .map(ConfigReconcilerInventory::debug_assume_success); let reconciler_status = if last_reconciliation.is_some() { ConfigReconcilerInventoryStatus::Idle { diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 04b61483705..9f308c1fac6 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -214,17 +214,18 @@ impl<'a> Planner<'a> { // The sled is not expunged. We have to see if the inventory // reflects the parent blueprint disk generation. If it does // then we mark any expunged disks decommissioned. + // + // TODO-correctness We inspect `last_reconciliation` here to confirm + // that the config reconciler has acted on the generation we're waiting + // for. This might be overly conservative - would it be okay to act on + // `ledgered_sled_config` instead, and decommission disks as long as the + // sled knows it should stop using it (even if it hasn't yet)? let Some(seen_generation) = self .inventory .sled_agents .get(&sled_id) - // TODO-correctness For now this is correct, because sled-agent - // doesn't ledger a config until it's applied it. However, once - // https://github.com/oxidecomputer/omicron/pull/8064 lands, - // sled-agent will ledger a config and then later reconcile it; do - // we need to wait for that reconciliation to decommission disks? - .and_then(|sa| sa.ledgered_sled_config.as_ref()) - .map(|config| config.generation) + .and_then(|sa| sa.last_reconciliation.as_ref()) + .map(|reconciled| reconciled.last_reconciled_config.generation) else { // There is no current inventory for the sled agent, so we cannot // decommission any disks. @@ -424,15 +425,10 @@ impl<'a> Planner<'a> { as_of_generation, ready_for_cleanup, } if !ready_for_cleanup => { - // TODO-correctness For now this is correct, because - // sled-agent doesn't ledger a config until it's applied it. - // However, as a part of landing - // https://github.com/oxidecomputer/omicron/pull/8064, - // this needs to change to check the last reconciled config - // instead of just the ledgered config. - if let Some(config) = &sled_inv.ledgered_sled_config { - if config.generation >= as_of_generation - && !config.zones.contains_key(&zone.id) + if let Some(reconciled) = &sled_inv.last_reconciliation { + if reconciled.last_reconciled_config.generation + >= as_of_generation + && !reconciled.zones.contains_key(&zone.id) { zones_ready_for_cleanup.push(zone.id); } @@ -569,22 +565,23 @@ impl<'a> Planner<'a> { // NTP zone), we'll need to be careful how we do it to avoid a // problem here. // - // TODO-cleanup Once - // https://github.com/oxidecomputer/omicron/pull/8064 lands, the - // above comment will be overly conservative; sled-agent won't - // reject configs just because time isn't sync'd yet. We may be able - // to remove this check entirely. 
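The planner change above reduces to one gate: don't decommission anything until inventory shows the sled has reconciled at least the generation that expunged it. Pulled out as a standalone predicate, that check looks roughly like this (a sketch only; the planner keeps it inline):

    use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory;
    use omicron_common::api::external::Generation;

    /// True once the sled has reconciled a config at least as new as
    /// `required`, i.e. it has actually acted on the expungement.
    fn sled_caught_up(
        last_reconciliation: Option<&ConfigReconcilerInventory>,
        required: Generation,
    ) -> bool {
        last_reconciliation
            .map(|r| r.last_reconciled_config.generation >= required)
            .unwrap_or(false)
    }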
(It's probably also fine to keep - // it for now; removing it just saves us an extra planning iteration - // when adding a new sled.) + // TODO-cleanup The above comment is now overly conservative; + // sled-agent won't reject configs just because time isn't sync'd + // yet. We may be able to remove this check entirely, but we'd need + // to do some testing to confirm no surprises. (It's probably also + // fine to keep it for now; removing it just saves us an extra + // planning iteration when adding a new sled.) let has_ntp_inventory = self .inventory .sled_agents .get(&sled_id) .map(|sled_agent| { - sled_agent.ledgered_sled_config.as_ref().map_or( + sled_agent.last_reconciliation.as_ref().map_or( false, - |config| { - config.zones.iter().any(|z| z.zone_type.is_ntp()) + |reconciliation| { + reconciliation + .running_omicron_zones() + .any(|z| z.zone_type.is_ntp()) }, ) }) diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 12219daaf81..0d4bf6b2916 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -6,6 +6,7 @@ //! associated inventory collections and blueprints use anyhow::{Context, anyhow, bail, ensure}; +use chrono::Utc; use gateway_client::types::RotState; use gateway_client::types::SpState; use indexmap::IndexMap; @@ -13,6 +14,7 @@ use ipnet::Ipv6Net; use ipnet::Ipv6Subnets; use nexus_inventory::CollectionBuilder; use nexus_sled_agent_shared::inventory::Baseboard; +use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus; use nexus_sled_agent_shared::inventory::Inventory; use nexus_sled_agent_shared::inventory::InventoryDataset; @@ -57,6 +59,7 @@ use std::fmt::Debug; use std::net::Ipv4Addr; use std::net::Ipv6Addr; use std::sync::Arc; +use std::time::Duration; /// Describes an actual or synthetic Oxide rack for planning and testing /// @@ -389,10 +392,17 @@ impl SystemDescription { })?; let sled = Arc::make_mut(sled); - sled.inventory_sled_agent.ledgered_sled_config = Some(sled_config); + sled.inventory_sled_agent.ledgered_sled_config = + Some(sled_config.clone()); + + // Present results as though the reconciler has successfully completed. 
sled.inventory_sled_agent.reconciler_status = - ConfigReconcilerInventoryStatus::NotYetRun; - sled.inventory_sled_agent.last_reconciliation = None; + ConfigReconcilerInventoryStatus::Idle { + completed_at: Utc::now(), + ran_for: Duration::from_secs(5), + }; + sled.inventory_sled_agent.last_reconciliation = + Some(ConfigReconcilerInventory::debug_assume_success(sled_config)); Ok(self) } @@ -716,9 +726,16 @@ impl Sled { }) .collect(), datasets: vec![], - ledgered_sled_config: Some(sled_config), - reconciler_status: ConfigReconcilerInventoryStatus::NotYetRun, - last_reconciliation: None, + ledgered_sled_config: Some(sled_config.clone()), + reconciler_status: ConfigReconcilerInventoryStatus::Idle { + completed_at: Utc::now(), + ran_for: Duration::from_secs(5), + }, + last_reconciliation: Some( + ConfigReconcilerInventory::debug_assume_success( + sled_config, + ), + ), } }; diff --git a/nexus/src/app/background/tasks/sync_service_zone_nat.rs b/nexus/src/app/background/tasks/sync_service_zone_nat.rs index a1c1869fed9..34798bb9335 100644 --- a/nexus/src/app/background/tasks/sync_service_zone_nat.rs +++ b/nexus/src/app/background/tasks/sync_service_zone_nat.rs @@ -126,21 +126,17 @@ impl BackgroundTask for ServiceZoneNatTracker { let sled_address = oxnet::Ipv6Net::host_net(*sled.ip); // TODO-correctness Looking at inventory here is a little - // sketchy. We currently check the most-recently-ledgered zones - // which tells us what services sled-agent things it's supposed - // to be running. It might be better to check either: - // - // * `sa.last_reconciliation` (to know what zones are actually - // running; this requires - // https://github.com/oxidecomputer/omicron/pull/8064 landing) - // if the goal is to sync what's actually on the sled - // * a rendezvous table populated by reconfigurator if the goal - // is to sync with what's Nexus thinks is supposed to be - // running on the sled + // sketchy. We check the last reconciliation result, which + // should be a view of what zones are actually running on the + // sled. But maybe it would be better to act on a rendezvous + // table populated by reconfigurator if the goal is to sync with + // what's Nexus thinks is supposed to be running on the sled? let zones = sa - .ledgered_sled_config - .map(|config| config.zones) - .unwrap_or_default(); + .last_reconciliation + .iter() + .flat_map(|reconciliation| { + reconciliation.running_omicron_zones().cloned() + }); for zone in zones { let zone_type: OmicronZoneType = zone.zone_type; From 92b491662992e90f35c4dbc191d38f372783c175 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 21 May 2025 16:26:24 -0400 Subject: [PATCH 05/10] fix failing planner tests --- nexus/reconfigurator/planning/src/planner.rs | 186 ++++++++++++------- 1 file changed, 120 insertions(+), 66 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 9f308c1fac6..07b1c86756d 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -420,23 +420,37 @@ impl<'a> Planner<'a> { .blueprint .current_sled_zones(sled_id, BlueprintZoneDisposition::is_expunged) { - match zone.disposition { + // If this is a zone still waiting for cleanup, what generation of + // sled config was it expunged? 
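The NAT-sync change relies on the "Option as a zero-or-one collection" idiom: iterating the `Option` yields the reconciliation at most once, so `flat_map` produces an empty zone iterator for sleds that have never reconciled, with no separate `match` at the call site. A self-contained illustration of the same shape:

    // A sled that has reconciled reports its zone names; one that hasn't
    // reports nothing, without a special-case branch at the call site.
    fn running_zone_names(
        last_reconciliation: Option<&Vec<String>>,
    ) -> impl Iterator<Item = String> + '_ {
        last_reconciliation
            .into_iter()
            .flat_map(|zones| zones.iter().cloned())
    }

    fn main() {
        let reconciled = vec!["ntp".to_string(), "nexus".to_string()];
        assert_eq!(running_zone_names(Some(&reconciled)).count(), 2);
        assert_eq!(running_zone_names(None).count(), 0);
    }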
+ let as_of_generation = match zone.disposition { BlueprintZoneDisposition::Expunged { as_of_generation, ready_for_cleanup, - } if !ready_for_cleanup => { - if let Some(reconciled) = &sled_inv.last_reconciliation { - if reconciled.last_reconciled_config.generation - >= as_of_generation - && !reconciled.zones.contains_key(&zone.id) - { - zones_ready_for_cleanup.push(zone.id); - } - } - } + } if !ready_for_cleanup => as_of_generation, BlueprintZoneDisposition::InService | BlueprintZoneDisposition::Expunged { .. } => continue, + }; + + // If the sled hasn't done any reconciliation, wait until it has. + let Some(reconciliation) = &sled_inv.last_reconciliation else { + continue; + }; + + // If the sled hasn't reconciled a new-enough generation, wait until + // it has. + if reconciliation.last_reconciled_config.generation + < as_of_generation + { + continue; } + + // If the sled hasn't shut down the zone, wait until it has. + if reconciliation.zones.contains_key(&zone.id) { + continue; + } + + // All checks passed: we can mark this zone as ready for cleanup. + zones_ready_for_cleanup.push(zone.id); } if !zones_ready_for_cleanup.is_empty() { @@ -1011,6 +1025,8 @@ pub(crate) mod test { use clickhouse_admin_types::ClickhouseKeeperClusterMembership; use clickhouse_admin_types::KeeperId; use expectorate::assert_contents; + use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory; + use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; use nexus_types::deployment::BlueprintDatasetDisposition; use nexus_types::deployment::BlueprintDiffSummary; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; @@ -2133,9 +2149,10 @@ pub(crate) mod test { .sled_agents .get_mut(&sled_id) .unwrap() - .ledgered_sled_config + .last_reconciliation .as_mut() .unwrap() + .last_reconciled_config .generation = Generation::from_u32(3); let blueprint3 = Planner::new_based_on( @@ -4005,6 +4022,14 @@ pub(crate) mod test { // Use our example system. let (mut collection, input, blueprint1) = example(&log, TEST_NAME); + // Don't start more internal DNS zones (which the planner would, as a + // side effect of our test details). + let input = { + let mut builder = input.into_builder(); + builder.policy_mut().target_internal_dns_zone_count = 0; + builder.build() + }; + // Find a Nexus zone we'll use for our test. let (sled_id, nexus_config) = blueprint1 .sleds @@ -4037,7 +4062,7 @@ pub(crate) mod test { // Run the planner. It should expunge all zones on the disk we just // expunged, including our Nexus zone, but not mark them as ready for // cleanup yet. - let blueprint2 = Planner::new_based_on( + let mut blueprint2 = Planner::new_based_on( logctx.log.clone(), &blueprint1, &input, @@ -4049,6 +4074,27 @@ pub(crate) mod test { .plan() .expect("planned"); + // Mark the disk we expected as "ready for cleanup"; this isn't what + // we're testing, and failing to do this will interfere with some of the + // checks we do below. + for mut disk in + blueprint2.sleds.get_mut(&sled_id).unwrap().disks.iter_mut() + { + match disk.disposition { + BlueprintPhysicalDiskDisposition::InService => (), + BlueprintPhysicalDiskDisposition::Expunged { + as_of_generation, + .. + } => { + disk.disposition = + BlueprintPhysicalDiskDisposition::Expunged { + as_of_generation, + ready_for_cleanup: true, + }; + } + } + } + // Helper to extract the Nexus zone's disposition in a blueprint. 
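Rewritten as guard clauses, the zone-cleanup condition is a simple conjunction; it could equally be phrased as a predicate like the sketch below (the planner keeps the `continue`-based form so each early exit carries its own comment):

    use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory;
    use omicron_common::api::external::Generation;
    use omicron_uuid_kinds::OmicronZoneUuid;

    /// An expunged zone may be marked `ready_for_cleanup` once the sled has
    /// reconciled the generation that expunged it and no longer reports the
    /// zone at all.
    fn zone_ready_for_cleanup(
        reconciliation: &ConfigReconcilerInventory,
        zone_id: OmicronZoneUuid,
        as_of_generation: Generation,
    ) -> bool {
        reconciliation.last_reconciled_config.generation >= as_of_generation
            && !reconciliation.zones.contains_key(&zone_id)
    }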
let get_nexus_disposition = |bp: &Blueprint| { bp.sleds.get(&sled_id).unwrap().zones.iter().find_map(|z| { @@ -4057,8 +4103,8 @@ pub(crate) mod test { }; // This sled's config generation should have been bumped... - let bp2_generation = - blueprint2.sleds.get(&sled_id).unwrap().sled_agent_generation; + let bp2_config = blueprint2.sleds.get(&sled_id).unwrap().clone(); + let bp2_sled_config = bp2_config.clone().into_in_service_sled_config(); assert_eq!( blueprint1 .sleds @@ -4066,24 +4112,25 @@ pub(crate) mod test { .unwrap() .sled_agent_generation .next(), - bp2_generation + bp2_sled_config.generation ); // ... and the Nexus should should have the disposition we expect. assert_eq!( get_nexus_disposition(&blueprint2), Some(BlueprintZoneDisposition::Expunged { - as_of_generation: bp2_generation, + as_of_generation: bp2_sled_config.generation, ready_for_cleanup: false, }) ); // Running the planner again should make no changes until the inventory - // reports that the zone is not running and that the sled has seen a - // new-enough generation. Try three variants that should do nothing: + // reports that the zone is not running and that the sled has reconciled + // a new-enough generation. Try these variants: // - // * same inventory as above - // * inventory reports a new generation (but zone still running) - // * inventory reports zone not running (but still the old generation) + // * same inventory as above (expect no changes) + // * new config is ledgered but not reconciled (expect no changes) + // * new config is reconciled, but zone is in an error state (expect + // no changes) eprintln!("planning with no inventory change..."); assert_planning_makes_no_changes( &logctx.log, @@ -4092,15 +4139,7 @@ pub(crate) mod test { &collection, TEST_NAME, ); - // TODO-cleanup These checks depend on `last_reconciled_config`, which - // is not yet populated; uncomment these and check them by mutating - // `last_reconciled_config` once - // https://github.com/oxidecomputer/omicron/pull/8064 lands. We could - // just mutate `ledgered_sled_config` in the meantime (as this - // commented-out code does below), but that's not really checking what - // we care about. - /* - eprintln!("planning with generation bump but zone still running..."); + eprintln!("planning with config ledgered but not reconciled..."); assert_planning_makes_no_changes( &logctx.log, &blueprint2, @@ -4111,15 +4150,15 @@ pub(crate) mod test { .sled_agents .get_mut(&sled_id) .unwrap() - .ledgered_sled_config - .as_mut() - .unwrap() - .generation = bp2_generation; + .ledgered_sled_config = Some(bp2_sled_config.clone()); collection }, TEST_NAME, ); - eprintln!("planning with zone gone but generation not bumped..."); + eprintln!( + "planning with config ledgered but \ + zones failed to shut down..." + ); assert_planning_makes_no_changes( &logctx.log, &blueprint2, @@ -4130,28 +4169,42 @@ pub(crate) mod test { .sled_agents .get_mut(&sled_id) .unwrap() - .ledgered_sled_config - .as_mut() + .ledgered_sled_config = Some(bp2_sled_config.clone()); + let mut reconciliation = + ConfigReconcilerInventory::debug_assume_success( + bp2_sled_config.clone(), + ); + // For all the zones that are in bp2_config but not + // bp2_sled_config (i.e., zones that should have been shut + // down), insert an error result in the reconciliation. 
+ for zone_id in bp2_config.zones.keys() { + if !reconciliation.zones.contains_key(zone_id) { + reconciliation.zones.insert( + *zone_id, + ConfigReconcilerInventoryResult::Err { + message: "failed to shut down".to_string(), + }, + ); + } + } + collection + .sled_agents + .get_mut(&sled_id) .unwrap() - .zones - .retain(|z| z.id != nexus_config.id); + .last_reconciliation = Some(reconciliation); collection }, TEST_NAME, ); - */ // Now make both changes to the inventory. { - let config = &mut collection - .sled_agents - .get_mut(&sled_id) - .unwrap() - .ledgered_sled_config - .as_mut() - .unwrap(); - config.generation = bp2_generation; - config.zones.retain(|z| z.id != nexus_config.id); + let config = collection.sled_agents.get_mut(&sled_id).unwrap(); + config.ledgered_sled_config = Some(bp2_sled_config.clone()); + config.last_reconciliation = + Some(ConfigReconcilerInventory::debug_assume_success( + bp2_sled_config.clone(), + )); } // Run the planner. It mark our Nexus zone as ready for cleanup now that @@ -4171,7 +4224,7 @@ pub(crate) mod test { assert_eq!( get_nexus_disposition(&blueprint3), Some(BlueprintZoneDisposition::Expunged { - as_of_generation: bp2_generation, + as_of_generation: bp2_sled_config.generation, ready_for_cleanup: true, }) ); @@ -4180,7 +4233,7 @@ pub(crate) mod test { // since it doesn't affect what's sent to sled-agent. assert_eq!( blueprint3.sleds.get(&sled_id).unwrap().sled_agent_generation, - bp2_generation + bp2_sled_config.generation ); assert_planning_makes_no_changes( @@ -4260,8 +4313,12 @@ pub(crate) mod test { }; // This sled's config generation should have been bumped... - let bp2_generation = - blueprint2.sleds.get(&sled_id).unwrap().sled_agent_generation; + let bp2_config = blueprint2 + .sleds + .get(&sled_id) + .unwrap() + .clone() + .into_in_service_sled_config(); assert_eq!( blueprint1 .sleds @@ -4269,13 +4326,13 @@ pub(crate) mod test { .unwrap() .sled_agent_generation .next(), - bp2_generation + bp2_config.generation ); // ... and the DNS zone should should have the disposition we expect. assert_eq!( get_dns_disposition(&blueprint2), Some(BlueprintZoneDisposition::Expunged { - as_of_generation: bp2_generation, + as_of_generation: bp2_config.generation, ready_for_cleanup: false, }) ); @@ -4293,15 +4350,12 @@ pub(crate) mod test { // Make the inventory changes necessary for cleanup to proceed. { - let config = &mut collection - .sled_agents - .get_mut(&sled_id) - .unwrap() - .ledgered_sled_config - .as_mut() - .unwrap(); - config.generation = bp2_generation; - config.zones.retain(|z| z.id != internal_dns_config.id); + let config = &mut collection.sled_agents.get_mut(&sled_id).unwrap(); + config.ledgered_sled_config = Some(bp2_config.clone()); + config.last_reconciliation = + Some(ConfigReconcilerInventory::debug_assume_success( + bp2_config.clone(), + )); } // Run the planner. 
It should mark our internal DNS zone as ready for @@ -4323,7 +4377,7 @@ pub(crate) mod test { assert_eq!( get_dns_disposition(&blueprint3), Some(BlueprintZoneDisposition::Expunged { - as_of_generation: bp2_generation, + as_of_generation: bp2_config.generation, ready_for_cleanup: true, }) ); From 78c60a366b65fb190db977dd3023300c1e0140c4 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 21 May 2025 16:37:27 -0400 Subject: [PATCH 06/10] expectorate --- dev-tools/omdb/tests/test_all_output.rs | 1 - dev-tools/omdb/tests/usage_errors.out | 28 ------------------------- 2 files changed, 29 deletions(-) diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 5ed7a606fda..c8441924d7f 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -107,7 +107,6 @@ async fn test_omdb_usage_errors() { &["nexus", "sleds"], &["sled-agent"], &["sled-agent", "zones"], - &["sled-agent", "zpools"], &["oximeter", "--help"], &["oxql", "--help"], // Mispelled argument diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 68ff3c59272..472e2b3cc9b 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -909,8 +909,6 @@ Usage: omdb sled-agent [OPTIONS] Commands: zones print information about zones - zpools print information about zpools - datasets print information about datasets bootstore print information about the local bootstore node help Print this message or the help of the given subcommand(s) @@ -949,32 +947,6 @@ Connection Options: --sled-agent-url URL of the Sled internal API [env: OMDB_SLED_AGENT_URL=] --dns-server [env: OMDB_DNS_SERVER=] -Safety Options: - -w, --destructive Allow potentially-destructive subcommands -============================================= -EXECUTING COMMAND: omdb ["sled-agent", "zpools"] -termination: Exited(2) ---------------------------------------------- -stdout: ---------------------------------------------- -stderr: -print information about zpools - -Usage: omdb sled-agent zpools [OPTIONS] - -Commands: - list Print list of all zpools managed by the sled agent - help Print this message or the help of the given subcommand(s) - -Options: - --log-level log level filter [env: LOG_LEVEL=] [default: warn] - --color Color output [default: auto] [possible values: auto, always, never] - -h, --help Print help - -Connection Options: - --sled-agent-url URL of the Sled internal API [env: OMDB_SLED_AGENT_URL=] - --dns-server [env: OMDB_DNS_SERVER=] - Safety Options: -w, --destructive Allow potentially-destructive subcommands ============================================= From 6b543b821e51179d2cb94a6bbd0082a985af5298 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 22 May 2025 11:22:48 -0400 Subject: [PATCH 07/10] flesh out RSS wait_for_config_reconciliation_on_sled() --- sled-agent/src/rack_setup/service.rs | 126 ++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 4 deletions(-) diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 436cd4503fc..abd07712d2c 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -89,7 +89,8 @@ use nexus_client::{ Client as NexusClient, Error as NexusError, types as NexusTypes, }; use nexus_sled_agent_shared::inventory::{ - OmicronSledConfig, OmicronZoneConfig, OmicronZoneType, OmicronZonesConfig, + ConfigReconcilerInventoryResult, OmicronSledConfig, OmicronZoneConfig, + OmicronZoneType, 
OmicronZonesConfig,
 };
 use nexus_types::deployment::{
     Blueprint, BlueprintDatasetConfig, BlueprintDatasetDisposition,
@@ -200,6 +201,9 @@ pub enum SetupServiceError {
     #[error("Error making HTTP request to Sled Agent: {0}")]
     SledApi(#[from] SledAgentError),
 
+    #[error("Sled config not yet reconciled: {0}")]
+    ConfigNotYetReconciled(String),
+
     #[error("Error making HTTP request to Nexus: {0}")]
     NexusApi(#[from] NexusError),
 
@@ -427,10 +431,124 @@ impl ServiceInner {
     // reconciled the config at `generation`.
     async fn wait_for_config_reconciliation_on_sled(
         &self,
-        _sled_address: SocketAddrV6,
-        _generation: Generation,
+        sled_address: SocketAddrV6,
+        generation: Generation,
     ) -> Result<(), SetupServiceError> {
-        unimplemented!("needs updated inventory")
+        let dur = std::time::Duration::from_secs(60);
+        let client = reqwest::ClientBuilder::new()
+            .connect_timeout(dur)
+            .build()
+            .map_err(SetupServiceError::HttpClient)?;
+        let log = self.log.new(o!("sled_address" => sled_address.to_string()));
+        let client = SledAgentClient::new_with_client(
+            &format!("http://{}", sled_address),
+            client,
+            log.clone(),
+        );
+
+        let inv_check = || async {
+            info!(log, "attempting to read sled's inventory");
+            let inventory = match client.inventory().await {
+                Ok(response) => response.into_inner(),
+                Err(error) => {
+                    // TODO Many other codes here should not be retried. See
+                    // omicron#4578.
+                    return Err(BackoffError::transient(
+                        SetupServiceError::SledApi(error),
+                    ));
+                }
+            };
+
+            // Has this sled's reconciler run at all?
+            let Some(last_reconciliation) = inventory.last_reconciliation
+            else {
+                return Err(BackoffError::transient(
+                    SetupServiceError::ConfigNotYetReconciled(
+                        "no reconciliation state available".to_string(),
+                    ),
+                ));
+            };
+
+            // Has it attempted to reconcile our target generation?
+            let reconciled_gen =
+                last_reconciliation.last_reconciled_config.generation;
+
+            if reconciled_gen < generation {
+                return Err(BackoffError::transient(
+                    SetupServiceError::ConfigNotYetReconciled(format!(
+                        "reconciled generation {reconciled_gen} lower than \
+                         desired generation {generation}",
+                    )),
+                ));
+            }
+
+            // Were there any errors during reconciliation? Check for disk,
+            // dataset, and zone errors.
+            let mut errors = Vec::new();
+
+            for (disk_id, result) in &last_reconciliation.external_disks {
+                match result {
+                    ConfigReconcilerInventoryResult::Ok => (),
+                    ConfigReconcilerInventoryResult::Err { message } => {
+                        errors.push(format!(
+                            "reconciliation for disk {disk_id} failed: {message}"
+                        ));
+                    }
+                }
+            }
+            for (dataset_id, result) in &last_reconciliation.datasets {
+                match result {
+                    ConfigReconcilerInventoryResult::Ok => (),
+                    ConfigReconcilerInventoryResult::Err { message } => {
+                        errors.push(format!(
+                            "reconciliation for dataset {dataset_id} failed: \
+                             {message}"
+                        ));
+                    }
+                }
+            }
+            for (zone_id, result) in &last_reconciliation.zones {
+                match result {
+                    ConfigReconcilerInventoryResult::Ok => (),
+                    ConfigReconcilerInventoryResult::Err { message } => {
+                        errors.push(format!(
+                            "reconciliation for zone {zone_id} failed: {message}"
+                        ));
+                    }
+                }
+            }
+
+            if errors.is_empty() {
+                Ok(())
+            } else {
+                // We treat all of these as transient (although it's possible
+                // some are permanent - we have no means for recovering from
+                // permanent errors during RSS other than clean-slating and
+                // starting over, so it's safer to treat these as transient and
+                // let operators investigate if things are stuck).
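The three per-kind loops above are structurally identical; if this check grows, they could collapse into one generic pass along these lines (a suggestion only, assuming the ID types implement `Display`, which the typed UUIDs formatted inline above do):

    use std::fmt::Display;

    use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult;

    /// Push a human-readable line for every entry of one kind ("disk",
    /// "dataset", "zone") that failed to reconcile.
    fn push_reconciliation_errors<'a, K: Display + 'a>(
        kind: &str,
        results: impl IntoIterator<
            Item = (&'a K, &'a ConfigReconcilerInventoryResult),
        >,
        errors: &mut Vec<String>,
    ) {
        for (id, result) in results {
            if let ConfigReconcilerInventoryResult::Err { message } = result {
                errors.push(format!(
                    "reconciliation for {kind} {id} failed: {message}"
                ));
            }
        }
    }

Each loop then becomes a call like `push_reconciliation_errors("disk", &last_reconciliation.external_disks, &mut errors);`.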
+ Err(BackoffError::transient( + SetupServiceError::ConfigNotYetReconciled( + errors.join(", "), + ), + )) + } + }; + let log_failure = |error, delay| { + warn!( + log, + "sled config not yet reconciled"; + "error" => #%error, + "retry_after" => ?delay, + ); + }; + retry_notify( + retry_policy_internal_service_aggressive(), + inv_check, + log_failure, + ) + .await?; + + Ok(()) } // Ensure that the desired sled configuration for a particular zone version From 6bce90060258fce1faa30a0a0217e66b5d67420e Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 22 May 2025 11:23:07 -0400 Subject: [PATCH 08/10] cargo fmt --- nexus-sled-agent-shared/src/inventory.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index eae39f4d61f..4a775d7ab39 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -168,12 +168,7 @@ impl ConfigReconcilerInventory { .iter() .map(|z| (z.id, ConfigReconcilerInventoryResult::Ok)) .collect(); - Self { - last_reconciled_config: config, - external_disks, - datasets, - zones, - } + Self { last_reconciled_config: config, external_disks, datasets, zones } } } From 8ff4ae3feac08c016d703770c5cf781c464e1944 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 22 May 2025 11:30:33 -0400 Subject: [PATCH 09/10] clippy --- sled-agent/config-reconciler/src/handle.rs | 4 ++-- sled-agent/config-reconciler/src/internal_disks.rs | 4 ++-- sled-agent/src/services.rs | 2 ++ sled-agent/src/sled_agent.rs | 3 +-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/sled-agent/config-reconciler/src/handle.rs b/sled-agent/config-reconciler/src/handle.rs index e33c72f3f97..e70fd7fab59 100644 --- a/sled-agent/config-reconciler/src/handle.rs +++ b/sled-agent/config-reconciler/src/handle.rs @@ -475,7 +475,7 @@ impl AvailableDatasetsReceiver { AvailableDatasetsReceiverInner::FakeStatic(pools) => pools .iter() .map(|(pool, path)| PathInPool { - pool: ZpoolOrRamdisk::Zpool(pool.clone()), + pool: ZpoolOrRamdisk::Zpool(*pool), path: path.join(U2_DEBUG_DATASET), }) .collect(), @@ -498,7 +498,7 @@ impl AvailableDatasetsReceiver { AvailableDatasetsReceiverInner::FakeStatic(pools) => pools .iter() .map(|(pool, path)| PathInPool { - pool: ZpoolOrRamdisk::Zpool(pool.clone()), + pool: ZpoolOrRamdisk::Zpool(*pool), path: path.join(ZONE_DATASET), }) .collect(), diff --git a/sled-agent/config-reconciler/src/internal_disks.rs b/sled-agent/config-reconciler/src/internal_disks.rs index 96e2ea82c01..0d0102047d7 100644 --- a/sled-agent/config-reconciler/src/internal_disks.rs +++ b/sled-agent/config-reconciler/src/internal_disks.rs @@ -325,9 +325,9 @@ impl InternalDisks { &self.mount_config } - pub fn boot_disk_zpool(&self) -> Option<&ZpoolName> { + pub fn boot_disk_zpool(&self) -> Option { self.disks.iter().find_map(|d| { - if d.is_boot_disk() { Some(&d.zpool_name) } else { None } + if d.is_boot_disk() { Some(d.zpool_name) } else { None } }) } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 98453b2f8f8..0400aff0b3f 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -848,7 +848,9 @@ impl ServiceManager { /// - `sidecar_revision`: Rev of attached sidecar, if present. /// - `switch_zone_maghemite_links`: List of physical links on which /// maghemite should listen. 
+ /// - `zone_image_resolver`: how to find Omicron zone images /// - `internal_disks_rx`: watch channel for changes to internal disks + #[allow(clippy::too_many_arguments)] pub(crate) fn new( log: &Logger, ddm_reconciler: DdmReconciler, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index f4349a3c6c9..93c0b9e4730 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -428,8 +428,7 @@ impl SledAgent { .internal_disks_rx() .current() .boot_disk_zpool() - .ok_or_else(|| Error::BootDiskNotFound)? - .clone(); + .ok_or_else(|| Error::BootDiskNotFound)?; // Configure a swap device of the configured size before other system setup. match config.swap_device_size_gb { From 3ab3e791aec829d1d33f3a740203df56313670c2 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 22 May 2025 12:17:08 -0400 Subject: [PATCH 10/10] illumos test fix --- sled-agent/src/support_bundle/storage.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/support_bundle/storage.rs b/sled-agent/src/support_bundle/storage.rs index 0c64b625d8b..34e6bf37480 100644 --- a/sled-agent/src/support_bundle/storage.rs +++ b/sled-agent/src/support_bundle/storage.rs @@ -981,7 +981,7 @@ mod tests { self.datasets_config_list().await.map_err(|err| err.into()) } - fn dyn_dataset_get( + async fn dyn_dataset_get( &self, dataset_name: &String, ) -> Result { @@ -990,6 +990,7 @@ mod tests { &[dataset_name.clone()], illumos_utils::zfs::WhichDatasets::SelfOnly, ) + .await .map_err(|err| Error::DatasetLookup(err))? .pop() else {