From d3c41c2680775b9ded7dcac5d9eb87db912f115d Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 22 Jun 2023 13:52:59 -0700 Subject: [PATCH 01/21] implements clone fallback implements a clone fallback path when `clone3` is blocked with ENOSYS Removed the use of clone3 crate. Our usecase for the clone3 crate is very specific and do not need to support everything. I can easily replace the clone3 crate with a few lines of code, so I did it in this PR. We usually favors safe syscall wrappers like nix over libc, but I believe the save is worth it here. We use the raw clone syscall instead of glibc wrapper. Details are in the comments. Using raw syscalls for both clone and clone3 actually makes the code well organized. For our usecase, they share the input types. Unit tests using seccomp profile to simulate clone3 being blocked are added. Signed-off-by: yihuaf --- Cargo.lock | 125 +++++----------- crates/libcontainer/Cargo.toml | 1 - crates/libcontainer/src/process/fork.rs | 189 ++++++++++++++++++++---- 3 files changed, 205 insertions(+), 110 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3289cc88b..376f4dfad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,7 +43,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "once_cell", "version_check", ] @@ -191,7 +191,7 @@ checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" dependencies = [ "addr2line 0.20.0", "cc", - "cfg-if 1.0.0", + "cfg-if", "libc", "miniz_oxide", "object 0.31.1", @@ -407,12 +407,6 @@ dependencies = [ "nom", ] -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.0" @@ -495,16 +489,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "clone3" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee4e061ea30800291ca09663878f3953840a69b08ce244b3e8b26e894d9f60f" -dependencies = [ - "bitflags 1.3.2", - "uapi", -] - [[package]] name = "cmake" version = "0.1.50" @@ -565,7 +549,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9847f90f32a50b0dcbd68bc23ff242798b13080b97b0569f6ed96a45ce4cf2cd" dependencies = [ "autocfg", - "cfg-if 1.0.0", + "cfg-if", "libc", "scopeguard", "windows-sys 0.33.0", @@ -577,7 +561,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -786,7 +770,7 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -801,7 +785,7 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-channel", "crossbeam-deque", "crossbeam-epoch", @@ -815,7 +799,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", ] @@ -825,7 +809,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] @@ -837,7 +821,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" dependencies = [ "autocfg", - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", "memoffset 0.9.0", "scopeguard", @@ -849,7 +833,7 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", ] @@ -859,7 +843,7 @@ version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -947,7 +931,7 @@ version = "5.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "hashbrown 0.12.3", "lock_api", "once_cell", @@ -1038,7 +1022,7 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "339ee130d97a610ea5a5872d2bbb130fdf68884ff09d3028b81bec8a1ac23bbc" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "dirs-sys-next", ] @@ -1097,7 +1081,7 @@ version = "0.8.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -1218,7 +1202,7 @@ version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b0377f1edc77dbd1118507bc7a66e4ab64d2b90c66f90726dc801e73a8c68f9" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "rustix 0.38.1", "windows-sys 0.48.0", ] @@ -1239,7 +1223,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "redox_syscall 0.2.16", "windows-sys 0.48.0", @@ -1457,7 +1441,7 @@ version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "js-sys", "libc", "wasi", @@ -1765,7 +1749,7 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -1981,7 +1965,6 @@ dependencies = [ "bitflags 2.3.3", "caps", "chrono", - "clone3", "fastrand 2.0.0", "futures", "libc", @@ -2020,7 +2003,7 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "winapi", ] @@ -2220,7 +2203,7 @@ version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c84490118f2ee2d74570d114f3d0493cbf02790df303d2707606c3e14e07c96" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "downcast", "fragile", "lazy_static", @@ -2235,7 +2218,7 @@ version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ce75669015c4f47b289fd4d4f56e894e4c96003ffdf3ac51313126f94c6cbb" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "proc-macro2", "quote", "syn 1.0.109", @@ -2272,7 +2255,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags 1.3.2", - "cfg-if 1.0.0", + "cfg-if", "libc", "memoffset 0.7.1", "pin-utils", @@ -2419,7 +2402,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ "bitflags 1.3.2", - "cfg-if 1.0.0", + "cfg-if", "foreign-types", "libc", "once_cell", @@ -2478,7 +2461,7 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "redox_syscall 0.3.5", "smallvec", @@ -3469,7 +3452,7 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "cpufeatures", "digest", ] @@ -3728,7 +3711,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" dependencies = [ "autocfg", - "cfg-if 1.0.0", + "cfg-if", "fastrand 1.9.0", "redox_syscall 0.3.5", "rustix 0.37.19", @@ -3803,7 +3786,7 @@ version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "once_cell", ] @@ -3991,7 +3974,7 @@ version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -4084,32 +4067,6 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" -[[package]] -name = "uapi" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "019450240401d342e2a5bc47f7fbaeb002a38fe18197b83788750d7ffb143274" -dependencies = [ - "cc", - "cfg-if 0.1.10", - "libc", - "uapi-proc", -] - -[[package]] -name = "uapi-proc" -version = "0.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54de46f980cea7b2ae8d8f7f9f1c35cf7062c68343e99345ef73758f8e60975a" -dependencies = [ - "lazy_static", - "libc", - "proc-macro2", - "quote", - "regex", - "syn 1.0.109", -] - [[package]] name = "unicase" version = "2.6.0" @@ -4448,7 +4405,7 @@ version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "wasm-bindgen-macro", ] @@ -4496,7 +4453,7 @@ version = "0.4.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "js-sys", "wasm-bindgen", "web-sys", @@ -4605,7 +4562,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea790bcdfb4e6e9d1e5ddf75b4699aac62b078fcc9f27f44e1748165ceea67bf" dependencies = [ "bytes", - "cfg-if 1.0.0", + "cfg-if", "derivative", "indexmap 1.9.3", "js-sys", @@ -4633,7 +4590,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f093937725e242e5529fed27e08ff836c011a9ecc22e6819fb818c2ac6ff5f88" dependencies = [ "backtrace", - "cfg-if 1.0.0", + "cfg-if", "enum-iterator", "enumset", "lazy_static", @@ -4705,7 +4662,7 @@ checksum = "a1e000c2cbd4f9805427af5f3b3446574caf89ab3a1e66c2f3579fbde22b072b" dependencies = [ "backtrace", "cc", - "cfg-if 1.0.0", + "cfg-if", "corosensei", "dashmap", "derivative", @@ -4734,7 +4691,7 @@ dependencies = [ "async-trait", "bincode", "bytes", - "cfg-if 1.0.0", + "cfg-if", "cooked-waker", "dashmap", "derivative", @@ -4789,7 +4746,7 @@ dependencies = [ "anyhow", "bitflags 1.3.2", "byteorder", - "cfg-if 1.0.0", + "cfg-if", "num_enum", "serde", "time 0.2.27", @@ -4843,7 +4800,7 @@ dependencies = [ "async-trait", "bincode", "bumpalo", - "cfg-if 1.0.0", + "cfg-if", "encoding_rs", "fxprof-processed-profile", "indexmap 1.9.3", @@ -4877,7 +4834,7 @@ version = "10.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "284466ef356ce2d909bc0ad470b60c4d0df5df2de9084457e118131b3c779b92" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -4989,7 +4946,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14309cbdf2c395258b124a24757c727403070c0465a28bcc780c4f82f4bca5ff" dependencies = [ "cc", - "cfg-if 1.0.0", + "cfg-if", "rustix 0.37.19", "wasmtime-asm-macros", "windows-sys 0.48.0", @@ -5004,7 +4961,7 @@ dependencies = [ "addr2line 0.19.0", "anyhow", "bincode", - "cfg-if 1.0.0", + "cfg-if", "cpp_demangle", "gimli 0.27.2", "ittapi", @@ -5038,7 +4995,7 @@ version = "10.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b49ceb7e2105a8ebe5614d7bbab6f6ef137a284e371633af60b34925493081f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "windows-sys 0.48.0", ] @@ -5051,7 +5008,7 @@ checksum = "3a5de4762421b0b2b19e02111ca403632852b53e506e03b4b227ffb0fbfa63c2" dependencies = [ "anyhow", "cc", - "cfg-if 1.0.0", + "cfg-if", "encoding_rs", "indexmap 1.9.3", "libc", diff --git a/crates/libcontainer/Cargo.toml b/crates/libcontainer/Cargo.toml index aaf1ea0e4..9d31b687a 100644 --- a/crates/libcontainer/Cargo.toml +++ b/crates/libcontainer/Cargo.toml @@ -36,7 +36,6 @@ libseccomp = { version = "0.3.0", optional=true } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" rust-criu = "0.4.0" -clone3 = "0.2.3" regex = "1.9.0" thiserror = "1.0.43" tracing = { version = "0.1.37", features = ["attributes"]} diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index a3557a531..3c9975447 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -8,8 +8,6 @@ pub enum CloneError { Clone(#[source] nix::Error), } -type Result = std::result::Result; - #[derive(Debug, thiserror::Error)] pub enum CallbackError { #[error(transparent)] @@ -19,11 +17,12 @@ pub enum CallbackError { #[error(transparent)] InitProcess(#[from] crate::process::container_init_process::InitProcessError), // Need a fake error for testing - #[error("unknown")] #[cfg(test)] + #[error("unknown")] Test, } +type Result = std::result::Result; type CallbackResult = std::result::Result; // Fork/Clone a sibling process that shares the same parent as the calling @@ -37,14 +36,22 @@ pub fn container_clone_sibling CallbackResult>( child_name: &str, cb: F, ) -> Result { - let mut clone = clone3::Clone3::default(); // Note: normally, an exit signal is required, but when using // `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set. // The older `clone` will not return EINVAL in this case. Instead it ignores - // the exit signal bits in the glibc wrapper. - clone.flag_parent(); + // the exit signal bits in the glibc wrapper. Therefore, we explicitly set + // the exit_signal to None here, so this works for both version of clone. + container_clone(child_name, cb, libc::CLONE_PARENT as u64, None) +} - container_clone(child_name, cb, clone) +// Execute the cb in another process. Make the fork works more like thread_spawn +// or clone, so it is easier to reason. Compared to clone call, fork is easier +// to use since fork will magically take care of all the variable copying. If +// using clone, we would have to manually make sure all the variables are +// correctly send to the new process, especially Rust borrow checker will be a +// lot of hassel to deal with every details. +pub fn container_fork CallbackResult>(child_name: &str, cb: F) -> Result { + container_clone(child_name, cb, 0, Some(SIGCHLD as u64)) } // A simple clone wrapper to clone3 so we can share this logic in different @@ -54,20 +61,18 @@ pub fn container_clone_sibling CallbackResult>( fn container_clone CallbackResult>( child_name: &str, cb: F, - mut clone_cmd: clone3::Clone3, + flags: u64, + exit_signal: Option, ) -> Result { // Return the child's pid in case of parent/calling process, and for the // cloned process, run the callback function, and exit with the same exit // code returned by the callback. If there was any error when trying to run // callback, exit with -1 - match unsafe { - clone_cmd - .call() - .map_err(|err| CloneError::Clone(nix::errno::from_i32(err.0)))? - } { + match clone_wrapper(flags, exit_signal) { + -1 => Err(CloneError::Clone(nix::Error::last())), 0 => { - prctl::set_name(child_name).expect("failed to set name"); // Inside the cloned process + prctl::set_name(child_name).expect("failed to set name"); let ret = match cb() { Err(error) => { tracing::debug!("failed to run child process in clone: {:?}", error); @@ -81,18 +86,89 @@ fn container_clone CallbackResult>( } } -// Execute the cb in another process. Make the fork works more like thread_spawn -// or clone, so it is easier to reason. Compared to clone call, fork is easier -// to use since fork will magically take care of all the variable copying. If -// using clone, we would have to manually make sure all the variables are -// correctly send to the new process, especially Rust borrow checker will be a -// lot of hassel to deal with every details. -pub fn container_fork CallbackResult>(child_name: &str, cb: F) -> Result { - // Using `clone3` to mimic the effect of `fork`. - let mut clone = clone3::Clone3::default(); - clone.exit_signal(SIGCHLD as u64); +#[repr(C)] +struct clone3_args { + flags: u64, + pidfd: u64, + child_tid: u64, + parent_tid: u64, + exit_signal: u64, + stack: u64, + stack_size: u64, + tls: u64, + set_tid: u64, + set_tid_size: u64, + cgroup: u64, +} + +// clone_wrapper wraps the logic of using `clone3` with fallback behavior when +// `clone3` is either not available or blocked. While `libcontainer` maintains a +// minimum kernel version where `clone3` is available, we have found that in +// real life, places would choose to block `clone3`. This is mostly due to +// seccomp profile can't effectively filter `clone3` calls because the clone +// flags are inside the clone_args, not part of the variables like the `clone` +// call. Therefore, we try `clone3` first, but fallback to `clone` when ENOSYS +// is returned. +fn clone_wrapper(flags: u64, exit_signal: Option) -> i32 { + let mut args = clone3_args { + flags, + pidfd: 0, + child_tid: 0, + parent_tid: 0, + exit_signal: exit_signal.unwrap_or(0), + stack: 0, + stack_size: 0, + tls: 0, + set_tid: 0, + set_tid_size: 0, + cgroup: 0, + }; + let args_ptr = &mut args as *mut clone3_args; + let args_size = std::mem::size_of::(); + // We strategically choose to use the raw syscall here because it is simpler + // for our usecase. We don't have to care about all the other usecases that + // clone syscalls supports in general. + match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } { + -1 if nix::Error::last() == nix::Error::ENOSYS => { + // continue to fallback to clone syscall + } + ret => { + return ret as i32; + } + }; + + let ret = unsafe { + // We choose to use the raw clone syscall here instead of the glibc + // wrapper version for the following reasons: + // + // 1. the raw syscall behaves more like the fork and clone3 call, so the + // substitution is more natural in the case of a fallback. We do not + // need to create a new function for the child to execute. Like fork and + // clone3, the clone raw syscall will start the child from the point of + // clone call. + // + // 2. the raw clone syscall can take null or 0 for the child stack as + // arguement. The syscall will do copy on write with the existing stack + // and takes care of child stack allocation. Correctly allocate a child + // stack is a pain when we previously implemented the logic using the + // glibc clone wrapper. + // + // The strategically use of the raw clone syscall is safe here because + // we are using a specific subset of the clone flags to launch + // processes. Unlike the general clone syscall where a number of + // usecases are supported such as launching thread, we want a behavior + // that is more similar to fork. + libc::syscall( + libc::SYS_clone, + flags | exit_signal.unwrap_or(0), // flags + 0, // stack + 0, // parent_tid + 0, // child_tid + 0, // tls + ) + }; - container_clone(child_name, cb, clone) + ret as i32 } #[cfg(test)] @@ -180,4 +256,67 @@ mod test { Ok(()) } + + // This test depends on libseccomp to work. + #[cfg(feature = "libseccomp")] + #[test] + fn test_clone_fallback() -> Result<()> { + use crate::test_utils::TestCallbackError; + use oci_spec::runtime::{ + Arch, LinuxSeccompAction, LinuxSeccompBuilder, LinuxSyscallBuilder, + }; + + fn has_clone3() -> bool { + // We use the probe syscall to check if the kernel supports clone3 or + // seccomp has successfully blocked clone3. + let res = unsafe { libc::syscall(libc::SYS_clone3, 0, 0) }; + let err = (res == -1) + .then(std::io::Error::last_os_error) + .expect("probe syscall should not succeed"); + err.raw_os_error() != Some(libc::ENOSYS) + } + + // To test the fallback behavior, we will create a seccomp rule that + // blocks `clone3` as ENOSYS. + let syscall = LinuxSyscallBuilder::default() + .names(vec![String::from("clone3")]) + .action(LinuxSeccompAction::ScmpActErrno) + .errno_ret(libc::ENOSYS as u32) + .build()?; + let seccomp_profile = LinuxSeccompBuilder::default() + .default_action(LinuxSeccompAction::ScmpActAllow) + .architectures(vec![Arch::ScmpArchNative]) + .syscalls(vec![syscall]) + .build()?; + + crate::test_utils::test_in_child_process(|| { + // We use seccomp to block `clone3` + let _ = prctl::set_no_new_privileges(true); + crate::seccomp::initialize_seccomp(&seccomp_profile) + .expect("failed to initialize seccomp"); + + if has_clone3() { + return Err(TestCallbackError::Custom( + "clone3 is not blocked by seccomp".into(), + )); + } + + let pid = container_fork("test:child", || Ok(0)).map_err(|err| err.to_string())?; + match waitpid(pid, None).expect("wait pid failed.") { + WaitStatus::Exited(p, status) => { + assert_eq!(pid, p); + assert_eq!(status, 0); + } + _ => { + return Err(TestCallbackError::Custom( + "failed to wait on the child process".into(), + )); + } + }; + + Ok(()) + })?; + + Ok(()) + } } From 599eddb8fe4245a9976ce9f4ebd31c6f5d07b65b Mon Sep 17 00:00:00 2001 From: yihuaf Date: Sun, 2 Jul 2023 22:37:15 -0700 Subject: [PATCH 02/21] this compiles but doesn't work Signed-off-by: yihuaf --- .../src/process/container_init_process.rs | 2 +- .../process/container_intermediate_process.rs | 12 +- .../src/process/container_main_process.rs | 4 +- crates/libcontainer/src/process/fork.rs | 308 ++++++++++++------ 4 files changed, 218 insertions(+), 108 deletions(-) diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 3bdbb6a32..86d2065b0 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -486,7 +486,7 @@ pub fn container_init_process( } }; - set_supplementary_gids(proc.user(), args.rootless, syscall).map_err(|err| { + set_supplementary_gids(proc.user(), &args.rootless, syscall).map_err(|err| { tracing::error!(?err, "failed to set supplementary gids"); err })?; diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 5a9706f02..857473aa8 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -36,11 +36,11 @@ pub fn container_intermediate_process( intermediate_chan: &mut (channel::IntermediateSender, channel::IntermediateReceiver), init_chan: &mut (channel::InitSender, channel::InitReceiver), main_sender: &mut channel::MainSender, -) -> Result { +) -> Result<()> { let (inter_sender, inter_receiver) = intermediate_chan; let (init_sender, init_receiver) = init_chan; - let command = &args.syscall; - let spec = &args.spec; + let command = args.syscall; + let spec = args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; @@ -115,6 +115,7 @@ pub fn container_intermediate_process( // configuration. The youki main process can decide what to do with the init // process and the intermediate process can just exit safely after the job // is done. + let pid = fork::container_clone_sibling("youki:[2:INIT]", || { // We are inside the forked process here. The first thing we have to do // is to close any unused senders, since fork will make a dup for all @@ -131,7 +132,7 @@ pub fn container_intermediate_process( IntermediateProcessError::Channel(err) })?; match container_init_process(args, main_sender, init_receiver) { - Ok(_) => Ok(0), + Ok(_) => Ok(()), Err(e) => { if let ContainerType::TenantContainer { exec_notify_fd } = args.container_type { let buf = format!("{e}"); @@ -183,7 +184,8 @@ pub fn container_intermediate_process( tracing::error!("failed to close unused init sender: {}", err); err })?; - Ok(pid) + + Ok(()) } fn apply_cgroups< diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index 29d8f5584..35848b042 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -41,7 +41,7 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo let inter_chan = &mut channel::intermediate_channel()?; let init_chan = &mut channel::init_channel()?; - let intermediate_pid = fork::container_fork("youki:[1:INTER]", || { + let intermediate_pid = fork::container_clone("youki:[1:INTER]", || { container_intermediate_process::container_intermediate_process( container_args, inter_chan, @@ -49,7 +49,7 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo main_sender, )?; - Ok(0) + Ok(()) }) .map_err(|err| { tracing::error!("failed to fork intermediate process: {}", err); diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index 3c9975447..2444df232 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -1,11 +1,28 @@ +use std::{ffi::c_int, num::NonZeroUsize}; + use libc::SIGCHLD; -use nix::unistd::Pid; +use nix::{ + sys::{mman, resource}, + unistd::Pid, +}; use prctl; #[derive(Debug, thiserror::Error)] pub enum CloneError { - #[error("failed to clone process using clone3")] + #[error("failed to clone process")] Clone(#[source] nix::Error), + #[error("failed to get system memory page size")] + PageSize(#[source] nix::Error), + #[error("failed to get resource limit")] + ResourceLimit(#[source] nix::Error), + #[error("the stack size is zero")] + ZeroStackSize, + #[error("failed to allocate stack")] + StackAllocation(#[source] nix::Error), + #[error("failed to create stack guard page")] + GuardPage(#[source] nix::Error), + #[error("unkown error code {0}")] + UnknownErrno(i32), } #[derive(Debug, thiserror::Error)] @@ -22,9 +39,12 @@ pub enum CallbackError { Test, } -type Result = std::result::Result; type CallbackResult = std::result::Result; +// The clone callback is used in clone call. It is a boxed closure and it needs +// to trasfer the ownership of related memory to the new process. +type CloneCb = Box i32>; + // Fork/Clone a sibling process that shares the same parent as the calling // process. This is used to launch the container init process so the parent // process of the calling process can receive ownership of the process. If we @@ -32,16 +52,16 @@ type CallbackResult = std::result::Result; // youki main process) will exit and the init process will be re-parented to the // process 1 (system init process), which is not the right behavior of what we // look for. -pub fn container_clone_sibling CallbackResult>( +pub fn container_clone_sibling<'a, F: FnMut() -> CallbackResult<()> + 'a>( child_name: &str, cb: F, -) -> Result { +) -> Result { // Note: normally, an exit signal is required, but when using // `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set. // The older `clone` will not return EINVAL in this case. Instead it ignores // the exit signal bits in the glibc wrapper. Therefore, we explicitly set // the exit_signal to None here, so this works for both version of clone. - container_clone(child_name, cb, libc::CLONE_PARENT as u64, None) + clone_internal(child_name, cb, libc::CLONE_PARENT as u64, None) } // Execute the cb in another process. Make the fork works more like thread_spawn @@ -50,66 +70,64 @@ pub fn container_clone_sibling CallbackResult>( // using clone, we would have to manually make sure all the variables are // correctly send to the new process, especially Rust borrow checker will be a // lot of hassel to deal with every details. -pub fn container_fork CallbackResult>(child_name: &str, cb: F) -> Result { - container_clone(child_name, cb, 0, Some(SIGCHLD as u64)) +pub fn container_clone<'a, F: FnMut() -> CallbackResult<()> + 'a>( + child_name: &str, + cb: F, +) -> Result { + clone_internal(child_name, cb, 0, Some(SIGCHLD as u64)) } -// A simple clone wrapper to clone3 so we can share this logic in different -// fork/clone situations. We decided to minimally support kernel version >= 5.4, -// and `clone3` requires only kernel version >= 5.3. Therefore, we don't need to -// fall back to `clone` or `fork`. -fn container_clone CallbackResult>( +fn clone_internal<'a, F: FnMut() -> CallbackResult<()> + 'a>( child_name: &str, - cb: F, + mut cb: F, flags: u64, exit_signal: Option, -) -> Result { - // Return the child's pid in case of parent/calling process, and for the - // cloned process, run the callback function, and exit with the same exit - // code returned by the callback. If there was any error when trying to run - // callback, exit with -1 - match clone_wrapper(flags, exit_signal) { - -1 => Err(CloneError::Clone(nix::Error::last())), - 0 => { - // Inside the cloned process - prctl::set_name(child_name).expect("failed to set name"); - let ret = match cb() { - Err(error) => { - tracing::debug!("failed to run child process in clone: {:?}", error); - -1 - } - Ok(exit_code) => exit_code, - }; - std::process::exit(ret); +) -> Result { + let mut child_closure = || { + if let Err(ret) = prctl::set_name(child_name) { + tracing::error!(?ret, "failed to set name for child process"); + return ret; } - pid => Ok(Pid::from_raw(pid)), + match cb() { + Ok(_) => 0, + Err(err) => { + tracing::error!(?err, "failed to run callback in clone"); + -1 + } + } + }; + + match clone3(&mut child_closure, flags, exit_signal) { + Ok(pid) => return Ok(pid), + Err(CloneError::Clone(nix::Error::ENOSYS)) => { + tracing::debug!("clone3 is not supported, fallback to clone"); + let pid = clone(&mut child_closure, flags, exit_signal)?; + + Ok(pid) + } + Err(err) => return Err(err), } } -#[repr(C)] -struct clone3_args { +fn clone3<'a, F: FnMut() -> i32 + 'a>( + mut cb: F, flags: u64, - pidfd: u64, - child_tid: u64, - parent_tid: u64, - exit_signal: u64, - stack: u64, - stack_size: u64, - tls: u64, - set_tid: u64, - set_tid_size: u64, - cgroup: u64, -} - -// clone_wrapper wraps the logic of using `clone3` with fallback behavior when -// `clone3` is either not available or blocked. While `libcontainer` maintains a -// minimum kernel version where `clone3` is available, we have found that in -// real life, places would choose to block `clone3`. This is mostly due to -// seccomp profile can't effectively filter `clone3` calls because the clone -// flags are inside the clone_args, not part of the variables like the `clone` -// call. Therefore, we try `clone3` first, but fallback to `clone` when ENOSYS -// is returned. -fn clone_wrapper(flags: u64, exit_signal: Option) -> i32 { + exit_signal: Option, +) -> Result { + #[repr(C)] + struct clone3_args { + flags: u64, + pidfd: u64, + child_tid: u64, + parent_tid: u64, + exit_signal: u64, + stack: u64, + stack_size: u64, + tls: u64, + set_tid: u64, + set_tid_size: u64, + cgroup: u64, + } let mut args = clone3_args { flags, pidfd: 0, @@ -125,50 +143,139 @@ fn clone_wrapper(flags: u64, exit_signal: Option) -> i32 { }; let args_ptr = &mut args as *mut clone3_args; let args_size = std::mem::size_of::(); - // We strategically choose to use the raw syscall here because it is simpler - // for our usecase. We don't have to care about all the other usecases that - // clone syscalls supports in general. + // For now, we can only use clone3 as a kernel syscall. Libc wrapper is not + // available yet. match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } { - -1 if nix::Error::last() == nix::Error::ENOSYS => { - // continue to fallback to clone syscall + -1 => { + return Err(CloneError::Clone(nix::Error::last())); } - ret => { - return ret as i32; + 0 => { + // Inside the cloned process, continue in the cloned process, we + // execute the callback and exit with the return code. + let ret = cb(); + std::process::exit(ret); } + ret if ret >= 0 => return Ok(Pid::from_raw(ret as i32)), + ret => return Err(CloneError::UnknownErrno(ret as i32)), + }; +} + +/// clone uses syscall clone(2) to create a new process for the container init +/// process. Using clone syscall gives us better control over how to can create +/// the new container process, where we can enter into namespaces directly instead +/// of using unshare and fork. This call will only create one new process, instead +/// of two using fork. +fn clone<'a, F: FnMut() -> i32 + 'a>( + cb: F, + flags: u64, + exit_signal: Option, +) -> Result { + const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M + const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K + + // Use sysconf to find the page size. If there is an error, we assume + // the default 4K page size. + let page_size = nix::unistd::sysconf(nix::unistd::SysconfVar::PAGE_SIZE) + .map_err(CloneError::PageSize)? + .map(|size| size as usize) + .unwrap_or(DEFAULT_PAGE_SIZE); + + // Find out the default stack max size through getrlimit. + let (rlim_cur, _) = + resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?; + // mmap will return ENOMEM if stack size is unlimited when we create the + // child stack, so we need to set a reasonable default stack size. + let default_stack_size = if rlim_cur != u64::MAX { + rlim_cur as usize + } else { + tracing::debug!( + "stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)" + ); + DEFAULT_STACK_SIZE }; - let ret = unsafe { - // We choose to use the raw clone syscall here instead of the glibc - // wrapper version for the following reasons: - // - // 1. the raw syscall behaves more like the fork and clone3 call, so the - // substitution is more natural in the case of a fallback. We do not - // need to create a new function for the child to execute. Like fork and - // clone3, the clone raw syscall will start the child from the point of - // clone call. - // - // 2. the raw clone syscall can take null or 0 for the child stack as - // arguement. The syscall will do copy on write with the existing stack - // and takes care of child stack allocation. Correctly allocate a child - // stack is a pain when we previously implemented the logic using the - // glibc clone wrapper. - // - // The strategically use of the raw clone syscall is safe here because - // we are using a specific subset of the clone flags to launch - // processes. Unlike the general clone syscall where a number of - // usecases are supported such as launching thread, we want a behavior - // that is more similar to fork. - libc::syscall( - libc::SYS_clone, - flags | exit_signal.unwrap_or(0), // flags - 0, // stack - 0, // parent_tid - 0, // child_tid - 0, // tls + // Using the clone syscall requires us to create the stack space for the + // child process instead of taken cared for us like fork call. We use mmap + // here to create the stack. Instead of guessing how much space the child + // process needs, we allocate through mmap to the system default limit, + // which is 8MB on most of the linux system today. This is OK since mmap + // will only researve the address space upfront, instead of allocating + // physical memory upfront. The stack will grow as needed, up to the size + // researved, so no wasted memory here. Lastly, the child stack only needs + // to support the container init process set up code in Youki. When Youki + // calls exec into the container payload, exec will reset the stack. Note, + // do not use MAP_GROWSDOWN since it is not well supported. + // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html + let child_stack = unsafe { + mman::mmap( + None, + NonZeroUsize::new(default_stack_size).ok_or(CloneError::ZeroStackSize)?, + mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE, + mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK, + -1, + 0, ) + .map_err(CloneError::StackAllocation)? }; + unsafe { + // Consistant with how pthread_create sets up the stack, we create a + // guard page of 1 page, to protect the child stack collision. Note, for + // clone call, the child stack will grow downward, so the bottom of the + // child stack is in the beginning. + mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE) + .map_err(CloneError::GuardPage)?; + }; + + // Since the child stack for clone grows downward, we need to pass in + // the top of the stack address. + let child_stack_top = unsafe { child_stack.add(default_stack_size) }; + + // Combine the clone flags with exit signals. + let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int; + + // We are passing the boxed closure "cb" into the clone function as the a + // function pointer in C. The box closure in Rust is both a function pointer + // and a struct. However, when casting the box closure into libc::c_void, + // the function pointer will be lost. Therefore, to work around the issue, + // we double box the closure. This is consistant with how std::unix::thread + // handles the closure. + // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs + let data = Box::into_raw(Box::new(Box::new(cb))); + // The main is a wrapper function passed into clone call below. The "data" + // arg is actually a raw pointer to a Box closure. so here, we re-box the + // pointer back into a box closure so the main takes ownership of the + // memory. Then we can call the closure passed in. + extern "C" fn main(data: *mut libc::c_void) -> libc::c_int { + unsafe { Box::from_raw(data as *mut CloneCb)() as i32 } + } - ret as i32 + // The nix::sched::clone wrapper doesn't provide the right interface. Using + // the clone syscall is one of the rare cases where we don't want rust to + // manage the child stack memory. Instead, we want to use c_void directly + // here. Therefore, here we are using libc::clone syscall directly for + // better control. The child stack will be cleaned when exec is called or + // the child process terminates. The nix wrapper also does not treat the + // closure memory correctly. The wrapper implementation fails to pass the + // right ownership to the new child process. + // Ref: https://github.com/nix-rust/nix/issues/919 + // Ref: https://github.com/nix-rust/nix/pull/920 + match unsafe { + libc::clone( + main, + child_stack_top, + combined_flags, + data as *mut libc::c_void, + ) + } { + -1 => { + // Since the clone call failed, the closure passed in didn't get + // consumed. To complete the circle, we can safely box up the + // closure again and let rust manage this memory for us. + unsafe { drop(Box::from_raw(data)) }; + Err(CloneError::Clone(nix::Error::last())) + } + pid => Ok(Pid::from_raw(pid)), + } } #[cfg(test)] @@ -182,7 +289,7 @@ mod test { #[test] fn test_container_fork() -> Result<()> { - let pid = container_fork("test:child", || Ok(0))?; + let pid = container_clone("test:child", || Ok(()))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -195,7 +302,7 @@ mod test { #[test] fn test_container_err_fork() -> Result<()> { - let pid = container_fork("test:child", || Err(CallbackError::Test))?; + let pid = container_clone("test:child", || Err(CallbackError::Test))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -247,7 +354,7 @@ mod test { unistd::ForkResult::Child => { // Inside the forked process. We call `container_clone` and pass // the pid to the parent process. - let pid = container_clone_sibling("test:child", || Ok(0))?; + let pid = container_clone_sibling("test:child", || Ok(()))?; sender.send(pid.as_raw())?; sender.close()?; std::process::exit(0); @@ -301,16 +408,17 @@ mod test { )); } - let pid = container_fork("test:child", || Ok(0)).map_err(|err| err.to_string())?; + let pid = container_clone("test:child", || Ok(())).map_err(|err| err.to_string())?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); assert_eq!(status, 0); } - _ => { - return Err(TestCallbackError::Custom( - "failed to wait on the child process".into(), - )); + status => { + return Err(TestCallbackError::Custom(format!( + "failed to wait on child process: {:?}", + status + ))); } }; From 63108769d8ffec4465060dbc5229f3d093ed31b8 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Mon, 3 Jul 2023 12:05:47 -0700 Subject: [PATCH 03/21] move notify listener creation to init process Signed-off-by: yihuaf --- .../libcontainer/src/container/builder_impl.rs | 8 +------- crates/libcontainer/src/process/args.rs | 5 +++-- .../src/process/container_init_process.rs | 17 ++++++++++------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 59fcc5f99..2b37e2750 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -90,12 +90,6 @@ impl<'a> ContainerBuilderImpl<'a> { } } - // Need to create the notify socket before we pivot root, since the unix - // domain socket used here is outside of the rootfs of container. During - // exec, need to create the socket before we enter into existing mount - // namespace. - let notify_socket: NotifyListener = NotifyListener::new(&self.notify_path)?; - // If Out-of-memory score adjustment is set in specification. set the score // value for the current process check // https://dev.to/rrampage/surviving-the-linux-oom-killer-2ki9 for some more @@ -139,7 +133,7 @@ impl<'a> ContainerBuilderImpl<'a> { spec: self.spec, rootfs: &self.rootfs, console_socket: self.console_socket, - notify_socket, + notify_socket_path: self.notify_path.to_owned(), preserve_fds: self.preserve_fds, container: &self.container, rootless: &self.rootless, diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index 44258975e..2500efd24 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use crate::rootless::Rootless; use crate::workload::ExecutorManager; -use crate::{container::Container, notify_socket::NotifyListener, syscall::Syscall}; +use crate::{container::Container, syscall::Syscall}; #[derive(Debug, Copy, Clone)] pub enum ContainerType { @@ -13,6 +13,7 @@ pub enum ContainerType { TenantContainer { exec_notify_fd: RawFd }, } +#[derive(Clone)] pub struct ContainerArgs<'a> { /// Indicates if an init or a tenant container should be created pub container_type: ContainerType, @@ -25,7 +26,7 @@ pub struct ContainerArgs<'a> { /// Socket to communicate the file descriptor of the ptty pub console_socket: Option, /// The Unix Domain Socket to communicate container start - pub notify_socket: NotifyListener, + pub notify_socket_path: PathBuf, /// File descriptors preserved/passed to the container init process. pub preserve_fds: i32, /// Container state diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 86d2065b0..75454eff5 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -341,6 +341,11 @@ pub fn container_init_process( let hooks = spec.hooks().as_ref(); let container = args.container.as_ref(); let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; + // Need to create the notify socket before we pivot root, since the unix + // domain socket used here is outside of the rootfs of container. During + // exec, need to create the socket before we enter into existing mount + // namespace. + let notify_listener = notify_socket::NotifyListener::new(args.notify_socket_path.as_ref())?; setsid().map_err(|err| { tracing::error!(?err, "failed to setsid to create a session"); @@ -652,13 +657,11 @@ pub fn container_init_process( })?; // listing on the notify socket for container start command - args.notify_socket - .wait_for_container_start() - .map_err(|err| { - tracing::error!(?err, "failed to wait for container start"); - err - })?; - args.notify_socket.close().map_err(|err| { + notify_listener.wait_for_container_start().map_err(|err| { + tracing::error!(?err, "failed to wait for container start"); + err + })?; + notify_listener.close().map_err(|err| { tracing::error!(?err, "failed to close notify socket"); err })?; From deecf4f4aac60075b79e09037a220f907dfeec31 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Mon, 3 Jul 2023 13:11:22 -0700 Subject: [PATCH 04/21] refactored the creation of cgroup manager Signed-off-by: yihuaf --- crates/libcgroups/src/common.rs | 31 +++++++++++-------- crates/libcgroups/src/systemd/manager.rs | 6 ++-- crates/libcgroups/src/v1/manager.rs | 4 +-- crates/libcgroups/src/v2/manager.rs | 6 ++-- .../src/container/builder_impl.rs | 24 +++++++------- .../src/container/container_delete.rs | 9 +++--- .../src/container/container_events.rs | 9 +++--- .../src/container/container_kill.rs | 29 ++++++++++------- .../src/container/container_pause.rs | 8 +++-- .../src/container/container_resume.rs | 8 +++-- crates/libcontainer/src/process/args.rs | 6 ++-- .../process/container_intermediate_process.rs | 3 +- crates/youki/src/commands/mod.rs | 13 ++++---- 13 files changed, 87 insertions(+), 69 deletions(-) diff --git a/crates/libcgroups/src/common.rs b/crates/libcgroups/src/common.rs index aba574f58..84c690dd1 100644 --- a/crates/libcgroups/src/common.rs +++ b/crates/libcgroups/src/common.rs @@ -316,17 +316,22 @@ pub enum CreateCgroupSetupError { Systemd(#[from] systemd::manager::SystemdManagerError), } -pub fn create_cgroup_manager>( - cgroup_path: P, - systemd_cgroup: bool, - container_name: &str, +#[derive(Clone)] +pub struct CgroupConfig { + pub cgroup_path: PathBuf, + pub systemd_cgroup: bool, + pub container_name: String, +} + +pub fn create_cgroup_manager( + config: &CgroupConfig, ) -> Result { let cgroup_setup = get_cgroup_setup().map_err(|err| match err { GetCgroupSetupError::WrappedIo(err) => CreateCgroupSetupError::WrappedIo(err), GetCgroupSetupError::NonDefault => CreateCgroupSetupError::NonDefault, GetCgroupSetupError::FailedToDetect => CreateCgroupSetupError::FailedToDetect, })?; - let cgroup_path = cgroup_path.into(); + let cgroup_path = config.cgroup_path.as_path(); match cgroup_setup { CgroupSetup::Legacy | CgroupSetup::Hybrid => { @@ -334,17 +339,17 @@ pub fn create_cgroup_manager>( } CgroupSetup::Unified => { // ref https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#cgroups-path - if cgroup_path.is_absolute() || !systemd_cgroup { + if cgroup_path.is_absolute() || !config.systemd_cgroup { return Ok(create_v2_cgroup_manager(cgroup_path)?.any()); } - Ok(create_systemd_cgroup_manager(cgroup_path, container_name)?.any()) + Ok(create_systemd_cgroup_manager(cgroup_path, config.container_name.as_str())?.any()) } } } #[cfg(feature = "v1")] fn create_v1_cgroup_manager( - cgroup_path: PathBuf, + cgroup_path: &Path, ) -> Result { tracing::info!("cgroup manager V1 will be used"); v1::manager::Manager::new(cgroup_path) @@ -352,14 +357,14 @@ fn create_v1_cgroup_manager( #[cfg(not(feature = "v1"))] fn create_v1_cgroup_manager( - _cgroup_path: PathBuf, + _cgroup_path: &Path, ) -> Result { Err(v1::manager::V1ManagerError::NotEnabled) } #[cfg(feature = "v2")] fn create_v2_cgroup_manager( - cgroup_path: PathBuf, + cgroup_path: &Path, ) -> Result { tracing::info!("cgroup manager V2 will be used"); v2::manager::Manager::new(DEFAULT_CGROUP_ROOT.into(), cgroup_path) @@ -367,14 +372,14 @@ fn create_v2_cgroup_manager( #[cfg(not(feature = "v2"))] fn create_v2_cgroup_manager( - _cgroup_path: PathBuf, + _cgroup_path: &Path, ) -> Result { Err(v2::manager::V2ManagerError::NotEnabled) } #[cfg(feature = "systemd")] fn create_systemd_cgroup_manager( - cgroup_path: PathBuf, + cgroup_path: &Path, container_name: &str, ) -> Result { if !systemd::booted() { @@ -399,7 +404,7 @@ fn create_systemd_cgroup_manager( #[cfg(not(feature = "systemd"))] fn create_systemd_cgroup_manager( - _cgroup_path: PathBuf, + _cgroup_path: &Path, _container_name: &str, ) -> Result { Err(systemd::manager::SystemdManagerError::NotEnabled) diff --git a/crates/libcgroups/src/systemd/manager.rs b/crates/libcgroups/src/systemd/manager.rs index 21a2e60d8..b94cdcdbb 100644 --- a/crates/libcgroups/src/systemd/manager.rs +++ b/crates/libcgroups/src/systemd/manager.rs @@ -176,11 +176,11 @@ pub enum SystemdManagerError { impl Manager { pub fn new( root_path: PathBuf, - cgroups_path: PathBuf, + cgroups_path: &Path, container_name: String, use_system: bool, ) -> Result { - let mut destructured_path = cgroups_path.as_path().try_into()?; + let mut destructured_path = cgroups_path.try_into()?; ensure_parent_unit(&mut destructured_path, use_system); let client = match use_system { @@ -191,7 +191,7 @@ impl Manager { let (cgroups_path, delegation_boundary) = Self::construct_cgroups_path(&destructured_path, &client)?; let full_path = root_path.join_safely(&cgroups_path)?; - let fs_manager = FsManager::new(root_path.clone(), cgroups_path.clone())?; + let fs_manager = FsManager::new(root_path.clone(), &cgroups_path)?; Ok(Manager { root_path, diff --git a/crates/libcgroups/src/v1/manager.rs b/crates/libcgroups/src/v1/manager.rs index e1376a883..022ffbbcb 100644 --- a/crates/libcgroups/src/v1/manager.rs +++ b/crates/libcgroups/src/v1/manager.rs @@ -82,10 +82,10 @@ pub enum V1ManagerError { impl Manager { /// Constructs a new cgroup manager with cgroups_path being relative to the root of the subsystem - pub fn new(cgroup_path: PathBuf) -> Result { + pub fn new(cgroup_path: &Path) -> Result { let mut subsystems = HashMap::::new(); for subsystem in CONTROLLERS { - if let Ok(subsystem_path) = Self::get_subsystem_path(&cgroup_path, subsystem) { + if let Ok(subsystem_path) = Self::get_subsystem_path(cgroup_path, subsystem) { subsystems.insert(*subsystem, subsystem_path); } else { tracing::warn!("cgroup {} not supported on this system", subsystem); diff --git a/crates/libcgroups/src/v2/manager.rs b/crates/libcgroups/src/v2/manager.rs index 9aa5b8d56..b23d6f9e0 100644 --- a/crates/libcgroups/src/v2/manager.rs +++ b/crates/libcgroups/src/v2/manager.rs @@ -84,12 +84,12 @@ pub struct Manager { impl Manager { /// Constructs a new cgroup manager with root path being the mount point /// of a cgroup v2 fs and cgroup path being a relative path from the root - pub fn new(root_path: PathBuf, cgroup_path: PathBuf) -> Result { - let full_path = root_path.join_safely(&cgroup_path)?; + pub fn new(root_path: PathBuf, cgroup_path: &Path) -> Result { + let full_path = root_path.join_safely(cgroup_path)?; Ok(Self { root_path, - cgroup_path, + cgroup_path: cgroup_path.to_owned(), full_path, }) } diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 2b37e2750..7613a0c05 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -2,7 +2,6 @@ use super::{Container, ContainerStatus}; use crate::{ error::{LibcontainerError, MissingSpecError}, hooks, - notify_socket::NotifyListener, process::{ self, args::{ContainerArgs, ContainerType}, @@ -73,11 +72,11 @@ impl<'a> ContainerBuilderImpl<'a> { &self.container_id, self.rootless.is_some(), ); - let cmanager = libcgroups::common::create_cgroup_manager( - cgroups_path, - self.use_systemd || self.rootless.is_some(), - &self.container_id, - )?; + let cgroup_config = libcgroups::common::CgroupConfig { + cgroup_path: cgroups_path, + systemd_cgroup: self.use_systemd || self.rootless.is_some(), + container_name: self.container_id.to_owned(), + }; let process = self .spec .process() @@ -137,7 +136,7 @@ impl<'a> ContainerBuilderImpl<'a> { preserve_fds: self.preserve_fds, container: &self.container, rootless: &self.rootless, - cgroup_manager: cmanager, + cgroup_config, detached: self.detached, executor_manager: &self.executor_manager, }; @@ -178,11 +177,12 @@ impl<'a> ContainerBuilderImpl<'a> { &self.container_id, self.rootless.is_some(), ); - let cmanager = libcgroups::common::create_cgroup_manager( - cgroups_path, - self.use_systemd || self.rootless.is_some(), - &self.container_id, - )?; + let cmanager = + libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { + cgroup_path: cgroups_path.to_owned(), + systemd_cgroup: self.use_systemd || self.rootless.is_some(), + container_name: self.container_id.to_string(), + })?; let mut errors = Vec::new(); diff --git a/crates/libcontainer/src/container/container_delete.rs b/crates/libcontainer/src/container/container_delete.rs index 99a3cce3a..7fc46dcc8 100644 --- a/crates/libcontainer/src/container/container_delete.rs +++ b/crates/libcontainer/src/container/container_delete.rs @@ -82,11 +82,12 @@ impl Container { // remove the cgroup created for the container // check https://man7.org/linux/man-pages/man7/cgroups.7.html // creating and removing cgroups section for more information on cgroups - let use_systemd = self.systemd(); let cmanager = libcgroups::common::create_cgroup_manager( - &config.cgroup_path, - use_systemd, - self.id(), + &libcgroups::common::CgroupConfig { + cgroup_path: config.cgroup_path.to_owned(), + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + }, )?; cmanager.remove().map_err(|err| { tracing::error!(cgroup_path = ?config.cgroup_path, "failed to remove cgroup due to: {err:?}"); diff --git a/crates/libcontainer/src/container/container_events.rs b/crates/libcontainer/src/container/container_events.rs index 4d45bf889..33e0df020 100644 --- a/crates/libcontainer/src/container/container_events.rs +++ b/crates/libcontainer/src/container/container_events.rs @@ -34,11 +34,12 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); - let cgroup_manager = - libcgroups::common::create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path.to_owned(), + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; match stats { true => { let stats = cgroup_manager.stats()?; diff --git a/crates/libcontainer/src/container/container_kill.rs b/crates/libcontainer/src/container/container_kill.rs index 9853d7c78..66d126dce 100644 --- a/crates/libcontainer/src/container/container_kill.rs +++ b/crates/libcontainer/src/container/container_kill.rs @@ -1,6 +1,6 @@ use super::{Container, ContainerStatus}; use crate::{error::LibcontainerError, signal::Signal}; -use libcgroups::common::{create_cgroup_manager, get_cgroup_setup, CgroupManager}; +use libcgroups::common::{get_cgroup_setup, CgroupManager}; use nix::sys::signal::{self}; impl Container { @@ -79,10 +79,14 @@ impl Container { match get_cgroup_setup()? { libcgroups::common::CgroupSetup::Legacy | libcgroups::common::CgroupSetup::Hybrid => { - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); - let cmanger = create_cgroup_manager(cgroups_path, use_systemd, self.id())?; - cmanger.freeze(libcgroups::common::FreezerState::Thawed)?; + let cmanager = libcgroups::common::create_cgroup_manager( + &libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path.to_owned(), + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + }, + )?; + cmanager.freeze(libcgroups::common::FreezerState::Thawed)?; } libcgroups::common::CgroupSetup::Unified => {} } @@ -92,11 +96,14 @@ impl Container { fn kill_all_processes>(&self, signal: S) -> Result<(), LibcontainerError> { let signal = signal.into().into_raw(); - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); - let cmanger = create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + let cmanager = + libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path.to_owned(), + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; - if let Err(e) = cmanger.freeze(libcgroups::common::FreezerState::Frozen) { + if let Err(e) = cmanager.freeze(libcgroups::common::FreezerState::Frozen) { tracing::warn!( err = ?e, id = ?self.id(), @@ -104,7 +111,7 @@ impl Container { ); } - let pids = cmanger.get_all_pids()?; + let pids = cmanager.get_all_pids()?; pids.iter() .try_for_each(|&pid| { tracing::debug!("kill signal {} to {}", signal, pid); @@ -118,7 +125,7 @@ impl Container { } }) .map_err(LibcontainerError::OtherSyscall)?; - if let Err(err) = cmanger.freeze(libcgroups::common::FreezerState::Thawed) { + if let Err(err) = cmanager.freeze(libcgroups::common::FreezerState::Thawed) { tracing::warn!( err = ?err, id = ?self.id(), diff --git a/crates/libcontainer/src/container/container_pause.rs b/crates/libcontainer/src/container/container_pause.rs index aa2074656..5be1326a3 100644 --- a/crates/libcontainer/src/container/container_pause.rs +++ b/crates/libcontainer/src/container/container_pause.rs @@ -33,10 +33,12 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); let cmanager = - libcgroups::common::create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path.to_owned(), + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; cmanager.freeze(FreezerState::Frozen)?; tracing::debug!("saving paused status"); diff --git a/crates/libcontainer/src/container/container_resume.rs b/crates/libcontainer/src/container/container_resume.rs index 0275f2bfc..0087ecfe5 100644 --- a/crates/libcontainer/src/container/container_resume.rs +++ b/crates/libcontainer/src/container/container_resume.rs @@ -35,10 +35,12 @@ impl Container { return Err(LibcontainerError::IncorrectStatus); } - let cgroups_path = self.spec()?.cgroup_path; - let use_systemd = self.systemd(); let cmanager = - libcgroups::common::create_cgroup_manager(cgroups_path, use_systemd, self.id())?; + libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { + cgroup_path: self.spec()?.cgroup_path.to_owned(), + systemd_cgroup: self.systemd(), + container_name: self.id().to_string(), + })?; // resume the frozen container cmanager.freeze(FreezerState::Thawed)?; diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index 2500efd24..9c6615b3e 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -1,4 +1,4 @@ -use libcgroups::common::AnyCgroupManager; +use libcgroups::common::CgroupConfig; use oci_spec::runtime::Spec; use std::os::unix::prelude::RawFd; use std::path::PathBuf; @@ -33,8 +33,8 @@ pub struct ContainerArgs<'a> { pub container: &'a Option, /// Options for rootless containers pub rootless: &'a Option>, - /// Cgroup Manager - pub cgroup_manager: AnyCgroupManager, + /// Cgroup Manager Config + pub cgroup_config: CgroupConfig, /// If the container is to be run in detached mode pub detached: bool, /// Manage the functions that actually run on the container diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 857473aa8..a838dacb0 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -43,6 +43,7 @@ pub fn container_intermediate_process( let spec = args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; + let cgroup_manager = libcgroups::common::create_cgroup_manager(&args.cgroup_config).unwrap(); // this needs to be done before we create the init process, so that the init // process will already be captured by the cgroup. It also needs to be done @@ -55,7 +56,7 @@ pub fn container_intermediate_process( // the cgroup of the process will form the root of the cgroup hierarchy in // the cgroup namespace. apply_cgroups( - &args.cgroup_manager, + &cgroup_manager, linux.resources().as_ref(), matches!(args.container_type, ContainerType::InitContainer), )?; diff --git a/crates/youki/src/commands/mod.rs b/crates/youki/src/commands/mod.rs index 7cd3669ac..520e004cb 100644 --- a/crates/youki/src/commands/mod.rs +++ b/crates/youki/src/commands/mod.rs @@ -58,12 +58,11 @@ fn create_cgroup_manager>( container_id: &str, ) -> Result { let container = load_container(root_path, container_id)?; - let cgroups_path = container.spec()?.cgroup_path; - let systemd_cgroup = container.systemd(); + let cgroup_config = libcgroups::common::CgroupConfig { + cgroup_path: container.spec()?.cgroup_path, + systemd_cgroup: container.systemd(), + container_name: container.id().to_string(), + }; - Ok(libcgroups::common::create_cgroup_manager( - cgroups_path, - systemd_cgroup, - container.id(), - )?) + Ok(libcgroups::common::create_cgroup_manager(&cgroup_config)?) } From a628cbdb9ecad58294abb65749384c2c46980c9e Mon Sep 17 00:00:00 2001 From: yihuaf Date: Mon, 3 Jul 2023 14:48:47 -0700 Subject: [PATCH 05/21] fix syscall reference and lifetime Signed-off-by: yihuaf --- crates/libcontainer/src/container/builder.rs | 43 +++++++++---------- .../src/container/builder_impl.rs | 4 +- .../src/container/init_builder.rs | 8 ++-- .../src/container/tenant_builder.rs | 8 ++-- crates/libcontainer/src/process/args.rs | 5 ++- .../src/process/container_init_process.rs | 14 +++--- .../process/container_intermediate_process.rs | 2 +- crates/libcontainer/src/syscall/syscall.rs | 31 ++++++++++--- 8 files changed, 68 insertions(+), 47 deletions(-) diff --git a/crates/libcontainer/src/container/builder.rs b/crates/libcontainer/src/container/builder.rs index 80d0f8b94..a656e732a 100644 --- a/crates/libcontainer/src/container/builder.rs +++ b/crates/libcontainer/src/container/builder.rs @@ -1,4 +1,5 @@ use crate::error::{ErrInvalidID, LibcontainerError}; +use crate::syscall::syscall::SyscallType; use crate::workload::default::DefaultExecutor; use crate::workload::{Executor, ExecutorManager}; use crate::{syscall::Syscall, utils::PathBufExt}; @@ -6,13 +7,13 @@ use std::path::PathBuf; use super::{init_builder::InitContainerBuilder, tenant_builder::TenantContainerBuilder}; -pub struct ContainerBuilder<'a> { +pub struct ContainerBuilder { /// Id of the container pub(super) container_id: String, /// Root directory for container state pub(super) root_path: PathBuf, /// Interface to operating system primitives - pub(super) syscall: &'a dyn Syscall, + pub(super) syscall: SyscallType, /// File which will be used to communicate the pid of the /// container process to the higher level runtime pub(super) pid_file: Option, @@ -45,7 +46,7 @@ pub struct ContainerBuilder<'a> { /// .as_init("/var/run/docker/bundle") /// .build(); /// ``` -impl<'a> ContainerBuilder<'a> { +impl ContainerBuilder { /// Generates the base configuration for a container which can be /// transformed into either a init container or a tenant container /// @@ -61,7 +62,7 @@ impl<'a> ContainerBuilder<'a> { /// create_syscall().as_ref(), /// ); /// ``` - pub fn new(container_id: String, syscall: &'a dyn Syscall) -> Self { + pub fn new(container_id: String, syscall: SyscallType) -> Self { let root_path = PathBuf::from("/run/youki"); Self { container_id, @@ -129,7 +130,7 @@ impl<'a> ContainerBuilder<'a> { /// .build(); /// ``` #[allow(clippy::wrong_self_convention)] - pub fn as_tenant(self) -> TenantContainerBuilder<'a> { + pub fn as_tenant(self) -> TenantContainerBuilder { TenantContainerBuilder::new(self) } @@ -150,7 +151,7 @@ impl<'a> ContainerBuilder<'a> { /// .build(); /// ``` #[allow(clippy::wrong_self_convention)] - pub fn as_init>(self, bundle: P) -> InitContainerBuilder<'a> { + pub fn as_init>(self, bundle: P) -> InitContainerBuilder { InitContainerBuilder::new(self, bundle.into()) } @@ -276,8 +277,8 @@ impl<'a> ContainerBuilder<'a> { #[cfg(test)] mod tests { - use crate::container::builder::ContainerBuilder; use crate::syscall::syscall::create_syscall; + use crate::{container::builder::ContainerBuilder, syscall::syscall::SyscallType}; use anyhow::{Context, Result}; use std::path::PathBuf; @@ -285,42 +286,41 @@ mod tests { fn test_failable_functions() -> Result<()> { let root_path_temp_dir = tempfile::tempdir().context("failed to create temp dir")?; let pid_file_temp_dir = tempfile::tempdir().context("failed to create temp dir")?; - let syscall = create_syscall(); + let syscall = SyscallType::default(); - ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall.as_ref()) + ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall) .with_root_path(root_path_temp_dir.path())? .with_pid_file(Some(pid_file_temp_dir.path().join("fake.pid")))? .with_console_socket(Some("/var/run/docker/sock.tty")) .as_init("/var/run/docker/bundle"); // accept None pid file. - ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall.as_ref()) - .with_pid_file::(None)?; + ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall).with_pid_file::(None)?; // accept absolute root path which does not exist let abs_root_path = PathBuf::from("/not/existing/path"); - let path_builder = ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall.as_ref()) + let path_builder = ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall) .with_root_path(&abs_root_path) .context("build container")?; assert_eq!(path_builder.root_path, abs_root_path); // accept relative root path which does not exist let cwd = std::env::current_dir().context("get current dir")?; - let path_builder = ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall.as_ref()) + let path_builder = ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall) .with_root_path("./not/existing/path") .context("build container")?; assert_eq!(path_builder.root_path, cwd.join("not/existing/path")); // accept absolute pid path which does not exist let abs_pid_path = PathBuf::from("/not/existing/path"); - let path_builder = ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall.as_ref()) + let path_builder = ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall) .with_pid_file(Some(&abs_pid_path)) .context("build container")?; assert_eq!(path_builder.pid_file, Some(abs_pid_path)); // accept relative pid path which does not exist let cwd = std::env::current_dir().context("get current dir")?; - let path_builder = ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall.as_ref()) + let path_builder = ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall) .with_pid_file(Some("./not/existing/path")) .context("build container")?; assert_eq!(path_builder.pid_file, Some(cwd.join("not/existing/path"))); @@ -330,22 +330,21 @@ mod tests { #[test] fn test_validate_id() -> Result<()> { - let syscall = create_syscall(); + let syscall = SyscallType::default(); // validate container_id - let result = ContainerBuilder::new("$#".to_owned(), syscall.as_ref()).validate_id(); + let result = ContainerBuilder::new("$#".to_owned(), syscall).validate_id(); assert!(result.is_err()); - let result = ContainerBuilder::new(".".to_owned(), syscall.as_ref()).validate_id(); + let result = ContainerBuilder::new(".".to_owned(), syscall).validate_id(); assert!(result.is_err()); - let result = ContainerBuilder::new("..".to_owned(), syscall.as_ref()).validate_id(); + let result = ContainerBuilder::new("..".to_owned(), syscall).validate_id(); assert!(result.is_err()); - let result = ContainerBuilder::new("...".to_owned(), syscall.as_ref()).validate_id(); + let result = ContainerBuilder::new("...".to_owned(), syscall).validate_id(); assert!(result.is_ok()); - let result = - ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall.as_ref()).validate_id(); + let result = ContainerBuilder::new("74f1a4cb3801".to_owned(), syscall).validate_id(); assert!(result.is_ok()); Ok(()) } diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 7613a0c05..b7a27b6e0 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -8,7 +8,7 @@ use crate::{ intel_rdt::delete_resctrl_subdirectory, }, rootless::Rootless, - syscall::Syscall, + syscall::{syscall::SyscallType, Syscall}, utils, workload::ExecutorManager, }; @@ -21,7 +21,7 @@ pub(super) struct ContainerBuilderImpl<'a> { /// Flag indicating if an init or a tenant container should be created pub container_type: ContainerType, /// Interface to operating system primitives - pub syscall: &'a dyn Syscall, + pub syscall: SyscallType, /// Flag indicating if systemd should be used for cgroup management pub use_systemd: bool, /// Id of the container diff --git a/crates/libcontainer/src/container/init_builder.rs b/crates/libcontainer/src/container/init_builder.rs index b18a0b0ad..22d647a6f 100644 --- a/crates/libcontainer/src/container/init_builder.rs +++ b/crates/libcontainer/src/container/init_builder.rs @@ -20,17 +20,17 @@ use super::{ }; // Builder that can be used to configure the properties of a new container -pub struct InitContainerBuilder<'a> { - base: ContainerBuilder<'a>, +pub struct InitContainerBuilder { + base: ContainerBuilder, bundle: PathBuf, use_systemd: bool, detached: bool, } -impl<'a> InitContainerBuilder<'a> { +impl InitContainerBuilder { /// Generates the base configuration for a new container from which /// configuration methods can be chained - pub(super) fn new(builder: ContainerBuilder<'a>, bundle: PathBuf) -> Self { + pub(super) fn new(builder: ContainerBuilder, bundle: PathBuf) -> Self { Self { base: builder, bundle, diff --git a/crates/libcontainer/src/container/tenant_builder.rs b/crates/libcontainer/src/container/tenant_builder.rs index ebdff13ea..fa8392613 100644 --- a/crates/libcontainer/src/container/tenant_builder.rs +++ b/crates/libcontainer/src/container/tenant_builder.rs @@ -32,8 +32,8 @@ const TENANT_TTY: &str = "tenant-tty-"; /// Builder that can be used to configure the properties of a process /// that will join an existing container sandbox -pub struct TenantContainerBuilder<'a> { - base: ContainerBuilder<'a>, +pub struct TenantContainerBuilder { + base: ContainerBuilder, env: HashMap, cwd: Option, args: Vec, @@ -43,11 +43,11 @@ pub struct TenantContainerBuilder<'a> { detached: bool, } -impl<'a> TenantContainerBuilder<'a> { +impl TenantContainerBuilder { /// Generates the base configuration for a process that will join /// an existing container sandbox from which configuration methods /// can be chained - pub(super) fn new(builder: ContainerBuilder<'a>) -> Self { + pub(super) fn new(builder: ContainerBuilder) -> Self { Self { base: builder, env: HashMap::new(), diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index 9c6615b3e..e0447c25d 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -3,9 +3,10 @@ use oci_spec::runtime::Spec; use std::os::unix::prelude::RawFd; use std::path::PathBuf; +use crate::container::Container; use crate::rootless::Rootless; +use crate::syscall::syscall::SyscallType; use crate::workload::ExecutorManager; -use crate::{container::Container, syscall::Syscall}; #[derive(Debug, Copy, Clone)] pub enum ContainerType { @@ -18,7 +19,7 @@ pub struct ContainerArgs<'a> { /// Indicates if an init or a tenant container should be created pub container_type: ContainerType, /// Interface to operating system primitives - pub syscall: &'a dyn Syscall, + pub syscall: SyscallType, /// OCI compliant runtime spec pub spec: &'a Spec, /// Root filesystem of the container diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 75454eff5..091845360 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -332,7 +332,7 @@ pub fn container_init_process( main_sender: &mut channel::MainSender, init_receiver: &mut channel::InitReceiver, ) -> Result<()> { - let syscall = args.syscall; + let syscall = args.syscall.create_syscall(); let spec = args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let proc = spec.process().as_ref().ok_or(MissingSpecError::Process)?; @@ -359,7 +359,7 @@ pub fn container_init_process( })?; } - apply_rest_namespaces(&namespaces, spec, syscall)?; + apply_rest_namespaces(&namespaces, spec, syscall.as_ref())?; if let Some(true) = proc.no_new_privileges() { let _ = prctl::set_no_new_privileges(true); @@ -457,7 +457,7 @@ pub fn container_init_process( if let Some(paths) = linux.readonly_paths() { // mount readonly path for path in paths { - readonly_path(Path::new(path), syscall).map_err(|err| { + readonly_path(Path::new(path), syscall.as_ref()).map_err(|err| { tracing::error!(?err, ?path, "failed to set readonly path"); err })?; @@ -467,7 +467,7 @@ pub fn container_init_process( if let Some(paths) = linux.masked_paths() { // mount masked path for path in paths { - masked_path(Path::new(path), linux.mount_label(), syscall).map_err(|err| { + masked_path(Path::new(path), linux.mount_label(), syscall.as_ref()).map_err(|err| { tracing::error!(?err, ?path, "failed to set masked path"); err })?; @@ -491,7 +491,7 @@ pub fn container_init_process( } }; - set_supplementary_gids(proc.user(), &args.rootless, syscall).map_err(|err| { + set_supplementary_gids(proc.user(), &args.rootless, syscall.as_ref()).map_err(|err| { tracing::error!(?err, "failed to set supplementary gids"); err })?; @@ -581,12 +581,12 @@ pub fn container_init_process( tracing::warn!("seccomp not available, unable to enforce no_new_privileges!") } - capabilities::reset_effective(syscall).map_err(|err| { + capabilities::reset_effective(syscall.as_ref()).map_err(|err| { tracing::error!(?err, "failed to reset effective capabilities"); InitProcessError::SyscallOther(err) })?; if let Some(caps) = proc.capabilities() { - capabilities::drop_privileges(caps, syscall).map_err(|err| { + capabilities::drop_privileges(caps, syscall.as_ref()).map_err(|err| { tracing::error!(?err, "failed to drop capabilities"); InitProcessError::SyscallOther(err) })?; diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index a838dacb0..be1d92a55 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -39,7 +39,7 @@ pub fn container_intermediate_process( ) -> Result<()> { let (inter_sender, inter_receiver) = intermediate_chan; let (init_sender, init_receiver) = init_chan; - let command = args.syscall; + let command = args.syscall.create_syscall(); let spec = args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; diff --git a/crates/libcontainer/src/syscall/syscall.rs b/crates/libcontainer/src/syscall/syscall.rs index 587963839..3d3e38360 100644 --- a/crates/libcontainer/src/syscall/syscall.rs +++ b/crates/libcontainer/src/syscall/syscall.rs @@ -56,10 +56,31 @@ pub trait Syscall { ) -> Result<()>; } -pub fn create_syscall() -> Box { - if cfg!(test) { - Box::::default() - } else { - Box::new(LinuxSyscall) +#[derive(Clone, Copy)] +pub enum SyscallType { + Linux, + Test, +} + +impl Default for SyscallType { + fn default() -> Self { + if cfg!(test) { + SyscallType::Test + } else { + SyscallType::Linux + } } } + +impl SyscallType { + pub fn create_syscall(&self) -> Box { + match self { + SyscallType::Linux => Box::new(LinuxSyscall), + SyscallType::Test => Box::new(TestHelperSyscall::default()), + } + } +} + +pub fn create_syscall() -> Box { + SyscallType::default().create_syscall() +} From de914ac9e3e5f1f6bc06fb946dd15e68455104be Mon Sep 17 00:00:00 2001 From: yihuaf Date: Mon, 3 Jul 2023 15:26:43 -0700 Subject: [PATCH 06/21] fixed spec and rootfs not using reference Signed-off-by: yihuaf --- crates/libcontainer/src/container/builder_impl.rs | 4 ++-- crates/libcontainer/src/process/args.rs | 4 ++-- .../libcontainer/src/process/container_init_process.rs | 10 +++++----- .../src/process/container_intermediate_process.rs | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index b7a27b6e0..f725bfb14 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -129,8 +129,8 @@ impl<'a> ContainerBuilderImpl<'a> { let container_args = ContainerArgs { container_type: self.container_type, syscall: self.syscall, - spec: self.spec, - rootfs: &self.rootfs, + spec: self.spec.to_owned(), + rootfs: self.rootfs.to_owned(), console_socket: self.console_socket, notify_socket_path: self.notify_path.to_owned(), preserve_fds: self.preserve_fds, diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index e0447c25d..95d7de395 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -21,9 +21,9 @@ pub struct ContainerArgs<'a> { /// Interface to operating system primitives pub syscall: SyscallType, /// OCI compliant runtime spec - pub spec: &'a Spec, + pub spec: Spec, /// Root filesystem of the container - pub rootfs: &'a PathBuf, + pub rootfs: PathBuf, /// Socket to communicate the file descriptor of the ptty pub console_socket: Option, /// The Unix Domain Socket to communicate container start diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 091845360..823e9f7b2 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -333,11 +333,11 @@ pub fn container_init_process( init_receiver: &mut channel::InitReceiver, ) -> Result<()> { let syscall = args.syscall.create_syscall(); - let spec = args.spec; + let spec = &args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let proc = spec.process().as_ref().ok_or(MissingSpecError::Process)?; let mut envs: Vec = proc.env().as_ref().unwrap_or(&vec![]).clone(); - let rootfs_path = args.rootfs; + let rootfs_path = &args.rootfs; let hooks = spec.hooks().as_ref(); let container = args.container.as_ref(); let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; @@ -359,7 +359,7 @@ pub fn container_init_process( })?; } - apply_rest_namespaces(&namespaces, spec, syscall.as_ref())?; + apply_rest_namespaces(&namespaces, &spec, syscall.as_ref())?; if let Some(true) = proc.no_new_privileges() { let _ = prctl::set_no_new_privileges(true); @@ -379,7 +379,7 @@ pub fn container_init_process( let rootfs = RootFS::new(); rootfs .prepare_rootfs( - spec, + &spec, rootfs_path, bind_service, namespaces.get(LinuxNamespaceType::Cgroup)?.is_some(), @@ -678,7 +678,7 @@ pub fn container_init_process( } if proc.args().is_some() { - args.executor_manager.exec(spec).map_err(|err| { + args.executor_manager.exec(&spec).map_err(|err| { tracing::error!(?err, "failed to execute payload"); err })?; diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index be1d92a55..20ab84ade 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -40,7 +40,7 @@ pub fn container_intermediate_process( let (inter_sender, inter_receiver) = intermediate_chan; let (init_sender, init_receiver) = init_chan; let command = args.syscall.create_syscall(); - let spec = args.spec; + let spec = &args.spec; let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; let cgroup_manager = libcgroups::common::create_cgroup_manager(&args.cgroup_config).unwrap(); From a32c3ed4fee6139f624d944f0315c83e430afaf8 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Mon, 3 Jul 2023 17:20:12 -0700 Subject: [PATCH 07/21] refactored the executor to be clone-able Signed-off-by: yihuaf --- crates/libcontainer/src/container/builder.rs | 22 +++----- .../src/container/builder_impl.rs | 8 +-- .../src/container/init_builder.rs | 2 +- .../src/container/tenant_builder.rs | 2 +- crates/libcontainer/src/process/args.rs | 4 +- .../src/process/container_init_process.rs | 4 +- crates/libcontainer/src/workload/default.rs | 19 ++----- crates/libcontainer/src/workload/mod.rs | 50 ++----------------- crates/youki/src/commands/create.rs | 9 ++-- crates/youki/src/commands/exec.rs | 9 ++-- crates/youki/src/commands/run.rs | 9 ++-- crates/youki/src/workload/executor.rs | 30 ++++++++--- crates/youki/src/workload/wasmedge.rs | 35 +++++++------ crates/youki/src/workload/wasmer.rs | 39 +++++++-------- crates/youki/src/workload/wasmtime.rs | 35 ++++++------- 15 files changed, 107 insertions(+), 170 deletions(-) diff --git a/crates/libcontainer/src/container/builder.rs b/crates/libcontainer/src/container/builder.rs index a656e732a..66aee07a6 100644 --- a/crates/libcontainer/src/container/builder.rs +++ b/crates/libcontainer/src/container/builder.rs @@ -1,8 +1,7 @@ use crate::error::{ErrInvalidID, LibcontainerError}; use crate::syscall::syscall::SyscallType; -use crate::workload::default::DefaultExecutor; -use crate::workload::{Executor, ExecutorManager}; -use crate::{syscall::Syscall, utils::PathBufExt}; +use crate::utils::PathBufExt; +use crate::workload::{self, Executor}; use std::path::PathBuf; use super::{init_builder::InitContainerBuilder, tenant_builder::TenantContainerBuilder}; @@ -23,7 +22,7 @@ pub struct ContainerBuilder { pub(super) preserve_fds: i32, /// Manage the functions that actually run on the container /// Default executes the specified execution of a generic command - pub(super) executor_manager: ExecutorManager, + pub(super) executor: Executor, } /// Builder that can be used to configure the common properties of @@ -71,9 +70,7 @@ impl ContainerBuilder { pid_file: None, console_socket: None, preserve_fds: 0, - executor_manager: ExecutorManager { - executors: vec![Box::::default()], - }, + executor: workload::default::get_executor(), } } @@ -263,21 +260,14 @@ impl ContainerBuilder { /// ) /// .with_executor(vec![Box::::default()]); /// ``` - pub fn with_executor( - mut self, - executors: Vec>, - ) -> Result { - if executors.is_empty() { - return Err(LibcontainerError::NoExecutors); - }; - self.executor_manager = ExecutorManager { executors }; + pub fn with_executor(mut self, executor: Executor) -> Result { + self.executor = executor; Ok(self) } } #[cfg(test)] mod tests { - use crate::syscall::syscall::create_syscall; use crate::{container::builder::ContainerBuilder, syscall::syscall::SyscallType}; use anyhow::{Context, Result}; use std::path::PathBuf; diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index f725bfb14..8fffb28ff 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -8,9 +8,9 @@ use crate::{ intel_rdt::delete_resctrl_subdirectory, }, rootless::Rootless, - syscall::{syscall::SyscallType, Syscall}, + syscall::syscall::SyscallType, utils, - workload::ExecutorManager, + workload::Executor, }; use libcgroups::common::CgroupManager; use nix::unistd::Pid; @@ -46,7 +46,7 @@ pub(super) struct ContainerBuilderImpl<'a> { /// If the container is to be run in detached mode pub detached: bool, /// Default executes the specified execution of a generic command - pub executor_manager: ExecutorManager, + pub executor: Executor, } impl<'a> ContainerBuilderImpl<'a> { @@ -138,7 +138,7 @@ impl<'a> ContainerBuilderImpl<'a> { rootless: &self.rootless, cgroup_config, detached: self.detached, - executor_manager: &self.executor_manager, + executor: self.executor.clone(), }; let (init_pid, need_to_clean_up_intel_rdt_dir) = diff --git a/crates/libcontainer/src/container/init_builder.rs b/crates/libcontainer/src/container/init_builder.rs index 22d647a6f..73e514baa 100644 --- a/crates/libcontainer/src/container/init_builder.rs +++ b/crates/libcontainer/src/container/init_builder.rs @@ -106,7 +106,7 @@ impl InitContainerBuilder { container: Some(container.clone()), preserve_fds: self.base.preserve_fds, detached: self.detached, - executor_manager: self.base.executor_manager, + executor: self.base.executor, }; builder_impl.create()?; diff --git a/crates/libcontainer/src/container/tenant_builder.rs b/crates/libcontainer/src/container/tenant_builder.rs index fa8392613..07e10cf21 100644 --- a/crates/libcontainer/src/container/tenant_builder.rs +++ b/crates/libcontainer/src/container/tenant_builder.rs @@ -139,7 +139,7 @@ impl TenantContainerBuilder { container: None, preserve_fds: self.base.preserve_fds, detached: self.detached, - executor_manager: self.base.executor_manager, + executor: self.base.executor, }; let pid = builder_impl.create()?; diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index 95d7de395..9201bd165 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -6,7 +6,7 @@ use std::path::PathBuf; use crate::container::Container; use crate::rootless::Rootless; use crate::syscall::syscall::SyscallType; -use crate::workload::ExecutorManager; +use crate::workload::Executor; #[derive(Debug, Copy, Clone)] pub enum ContainerType { @@ -39,5 +39,5 @@ pub struct ContainerArgs<'a> { /// If the container is to be run in detached mode pub detached: bool, /// Manage the functions that actually run on the container - pub executor_manager: &'a ExecutorManager, + pub executor: Executor, } diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 823e9f7b2..5f61ec480 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -69,7 +69,7 @@ pub enum InitProcessError { #[error(transparent)] NotifyListener(#[from] notify_socket::NotifyListenerError), #[error(transparent)] - Workload(#[from] workload::ExecutorManagerError), + Workload(#[from] workload::ExecutorError), } type Result = std::result::Result; @@ -678,7 +678,7 @@ pub fn container_init_process( } if proc.args().is_some() { - args.executor_manager.exec(&spec).map_err(|err| { + (args.executor)(&spec).map_err(|err| { tracing::error!(?err, "failed to execute payload"); err })?; diff --git a/crates/libcontainer/src/workload/default.rs b/crates/libcontainer/src/workload/default.rs index 923c96dae..f6c4b6161 100644 --- a/crates/libcontainer/src/workload/default.rs +++ b/crates/libcontainer/src/workload/default.rs @@ -5,13 +5,8 @@ use oci_spec::runtime::Spec; use super::{Executor, ExecutorError, EMPTY}; -const EXECUTOR_NAME: &str = "default"; - -#[derive(Default)] -pub struct DefaultExecutor {} - -impl Executor for DefaultExecutor { - fn exec(&self, spec: &Spec) -> Result<(), ExecutorError> { +pub fn get_executor() -> Executor { + return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { tracing::debug!("executing workload with default handler"); let args = spec .process() @@ -41,13 +36,5 @@ impl Executor for DefaultExecutor { // After execvp is called, the process is replaced with the container // payload through execvp, so it should never reach here. unreachable!(); - } - - fn can_handle(&self, _: &Spec) -> bool { - true - } - - fn name(&self) -> &'static str { - EXECUTOR_NAME - } + }); } diff --git a/crates/libcontainer/src/workload/mod.rs b/crates/libcontainer/src/workload/mod.rs index bf06909c4..bae41e04d 100644 --- a/crates/libcontainer/src/workload/mod.rs +++ b/crates/libcontainer/src/workload/mod.rs @@ -12,52 +12,8 @@ pub enum ExecutorError { Execution(#[from] Box), #[error("{0}")] Other(String), + #[error("{0} executor can't handle spec")] + CantHandle(&'static str), } -pub trait Executor { - /// Executes the workload - fn exec(&self, spec: &Spec) -> Result<(), ExecutorError>; - - /// Checks if the handler is able to handle the workload - fn can_handle(&self, spec: &Spec) -> bool; - - /// The name of the handler - fn name(&self) -> &'static str; -} - -#[derive(Debug, thiserror::Error)] -pub enum ExecutorManagerError { - #[error("failed executor {name}")] - ExecutionFailed { - source: Box, - name: String, - }, - #[error("failed to find an executor that satisfies all requirements")] - NoExecutorFound, -} - -/// Manage the functions that actually run on the container -pub struct ExecutorManager { - pub executors: Vec>, -} - -impl ExecutorManager { - pub fn exec(&self, spec: &Spec) -> Result<(), ExecutorManagerError> { - if self.executors.is_empty() { - return Err(ExecutorManagerError::NoExecutorFound); - }; - - for executor in self.executors.iter() { - if executor.can_handle(spec) { - return executor.exec(spec).map_err(|e| { - tracing::error!(err = ?e, name = ?executor.name(), "failed to execute workload"); - ExecutorManagerError::ExecutionFailed { - source: e.into(), - name: executor.name().to_string(), - } - }); - } - } - Err(ExecutorManagerError::NoExecutorFound) - } -} +pub type Executor = Box Result<(), ExecutorError>>; diff --git a/crates/youki/src/commands/create.rs b/crates/youki/src/commands/create.rs index 7aea7c45f..b37410746 100644 --- a/crates/youki/src/commands/create.rs +++ b/crates/youki/src/commands/create.rs @@ -2,10 +2,10 @@ use anyhow::Result; use std::path::PathBuf; -use libcontainer::{container::builder::ContainerBuilder, syscall::syscall::create_syscall}; +use libcontainer::{container::builder::ContainerBuilder, syscall::syscall::SyscallType}; use liboci_cli::Create; -use crate::workload::executor::default_executors; +use crate::workload::executor::default_executor; // One thing to note is that in the end, container is just another process in Linux // it has specific/different control group, namespace, using which program executing in it @@ -13,9 +13,8 @@ use crate::workload::executor::default_executors; // it is running, it is just another process, and has attributes such as pid, file descriptors, etc. // associated with it like any other process. pub fn create(args: Create, root_path: PathBuf, systemd_cgroup: bool) -> Result<()> { - let syscall = create_syscall(); - ContainerBuilder::new(args.container_id.clone(), syscall.as_ref()) - .with_executor(default_executors())? + ContainerBuilder::new(args.container_id.clone(), SyscallType::default()) + .with_executor(default_executor())? .with_pid_file(args.pid_file.as_ref())? .with_console_socket(args.console_socket.as_ref()) .with_root_path(root_path)? diff --git a/crates/youki/src/commands/exec.rs b/crates/youki/src/commands/exec.rs index f3ac65fa9..4d244e737 100644 --- a/crates/youki/src/commands/exec.rs +++ b/crates/youki/src/commands/exec.rs @@ -2,15 +2,14 @@ use anyhow::Result; use nix::sys::wait::{waitpid, WaitStatus}; use std::path::PathBuf; -use libcontainer::{container::builder::ContainerBuilder, syscall::syscall::create_syscall}; +use libcontainer::{container::builder::ContainerBuilder, syscall::syscall::SyscallType}; use liboci_cli::Exec; -use crate::workload::executor::default_executors; +use crate::workload::executor::default_executor; pub fn exec(args: Exec, root_path: PathBuf) -> Result { - let syscall = create_syscall(); - let pid = ContainerBuilder::new(args.container_id.clone(), syscall.as_ref()) - .with_executor(default_executors())? + let pid = ContainerBuilder::new(args.container_id.clone(), SyscallType::default()) + .with_executor(default_executor())? .with_root_path(root_path)? .with_console_socket(args.console_socket.as_ref()) .with_pid_file(args.pid_file.as_ref())? diff --git a/crates/youki/src/commands/run.rs b/crates/youki/src/commands/run.rs index 3a7c83d97..75c08d0ae 100644 --- a/crates/youki/src/commands/run.rs +++ b/crates/youki/src/commands/run.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use anyhow::{Context, Result}; -use libcontainer::{container::builder::ContainerBuilder, syscall::syscall::create_syscall}; +use libcontainer::{container::builder::ContainerBuilder, syscall::syscall::SyscallType}; use liboci_cli::Run; use nix::{ sys::{ @@ -12,12 +12,11 @@ use nix::{ unistd::Pid, }; -use crate::workload::executor::default_executors; +use crate::workload::executor::default_executor; pub fn run(args: Run, root_path: PathBuf, systemd_cgroup: bool) -> Result { - let syscall = create_syscall(); - let mut container = ContainerBuilder::new(args.container_id.clone(), syscall.as_ref()) - .with_executor(default_executors())? + let mut container = ContainerBuilder::new(args.container_id.clone(), SyscallType::default()) + .with_executor(default_executor())? .with_pid_file(args.pid_file.as_ref())? .with_console_socket(args.console_socket.as_ref()) .with_root_path(root_path)? diff --git a/crates/youki/src/workload/executor.rs b/crates/youki/src/workload/executor.rs index e38e33640..faccc8316 100644 --- a/crates/youki/src/workload/executor.rs +++ b/crates/youki/src/workload/executor.rs @@ -1,13 +1,27 @@ -use libcontainer::workload::{default::DefaultExecutor, Executor}; +use libcontainer::workload::{Executor, ExecutorError}; +use oci_spec::runtime::Spec; -pub fn default_executors() -> Vec> { - vec![ +pub fn default_executor() -> Executor { + return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { #[cfg(feature = "wasm-wasmer")] - Box::::default(), + match super::wasmer::get_executor()(spec) { + Ok(_) => return Ok(()), + Err(ExecutorError::CantHandle(_)) => (), + Err(err) => return Err(err), + } #[cfg(feature = "wasm-wasmedge")] - Box::::default(), + match super::wasmedge::get_executor()(spec) { + Ok(_) => return Ok(()), + Err(ExecutorError::CantHandle(_)) => (), + Err(err) => return Err(err), + } #[cfg(feature = "wasm-wasmtime")] - Box::::default(), - Box::::default(), - ] + match super::wasmtime::get_executor()(spec) { + Ok(_) => return Ok(()), + Err(ExecutorError::CantHandle(_)) => (), + Err(err) => return Err(err), + } + + libcontainer::workload::default::get_executor()(spec) + }); } diff --git a/crates/youki/src/workload/wasmedge.rs b/crates/youki/src/workload/wasmedge.rs index 9ab50a34f..536fc15cd 100644 --- a/crates/youki/src/workload/wasmedge.rs +++ b/crates/youki/src/workload/wasmedge.rs @@ -8,11 +8,14 @@ use libcontainer::workload::{Executor, ExecutorError}; const EXECUTOR_NAME: &str = "wasmedge"; -#[derive(Default)] -pub struct WasmEdgeExecutor {} +pub fn get_executor() -> Executor { + return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { + if !can_handle(spec) { + return Err(ExecutorError::CantHandle(EXECUTOR_NAME)); + } + + tracing::debug!("executing workload with wasmedge handler"); -impl Executor for WasmEdgeExecutor { - fn exec(&self, spec: &Spec) -> Result<(), ExecutorError> { // parse wasi parameters let args = get_args(spec); let mut cmd = args[0].clone(); @@ -55,25 +58,21 @@ impl Executor for WasmEdgeExecutor { .map_err(|err| ExecutorError::Execution(err))?; Ok(()) - } - - fn can_handle(&self, spec: &Spec) -> bool { - if let Some(annotations) = spec.annotations() { - if let Some(handler) = annotations.get("run.oci.handler") { - return handler == "wasm"; - } + }); +} - if let Some(variant) = annotations.get("module.wasm.image/variant") { - return variant == "compat"; - } +fn can_handle(spec: &Spec) -> bool { + if let Some(annotations) = spec.annotations() { + if let Some(handler) = annotations.get("run.oci.handler") { + return handler == "wasm"; } - false + if let Some(variant) = annotations.get("module.wasm.image/variant") { + return variant == "compat"; + } } - fn name(&self) -> &'static str { - EXECUTOR_NAME - } + false } fn get_args(spec: &Spec) -> &[String] { diff --git a/crates/youki/src/workload/wasmer.rs b/crates/youki/src/workload/wasmer.rs index 1fd5d8ce0..417d7db49 100644 --- a/crates/youki/src/workload/wasmer.rs +++ b/crates/youki/src/workload/wasmer.rs @@ -6,11 +6,12 @@ use libcontainer::workload::{Executor, ExecutorError, EMPTY}; const EXECUTOR_NAME: &str = "wasmer"; -#[derive(Default)] -pub struct WasmerExecutor {} +pub fn get_executor() -> Executor { + return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { + if !can_handle(spec) { + return Err(ExecutorError::CantHandle(EXECUTOR_NAME)); + } -impl Executor for WasmerExecutor { - fn exec(&self, spec: &Spec) -> Result<(), ExecutorError> { tracing::debug!("executing workload with wasmer handler"); let process = spec.process().as_ref(); @@ -75,25 +76,21 @@ impl Executor for WasmerExecutor { wasi_env.cleanup(&mut store, None); Ok(()) - } - - fn can_handle(&self, spec: &Spec) -> bool { - if let Some(annotations) = spec.annotations() { - if let Some(handler) = annotations.get("run.oci.handler") { - return handler == "wasm"; - } + }); +} - if let Some(variant) = annotations.get("module.wasm.image/variant") { - return variant == "compat"; - } +fn can_handle(spec: &Spec) -> bool { + if let Some(annotations) = spec.annotations() { + if let Some(handler) = annotations.get("run.oci.handler") { + return handler == "wasm"; } - false + if let Some(variant) = annotations.get("module.wasm.image/variant") { + return variant == "compat"; + } } - fn name(&self) -> &'static str { - EXECUTOR_NAME - } + false } #[cfg(test)] @@ -112,7 +109,7 @@ mod tests { .build() .context("build spec")?; - assert!(WasmerExecutor::default().can_handle(&spec)); + assert!(can_handle(&spec)); Ok(()) } @@ -126,7 +123,7 @@ mod tests { .build() .context("build spec")?; - assert!(WasmerExecutor::default().can_handle(&spec)); + assert!(can_handle(&spec)); Ok(()) } @@ -135,7 +132,7 @@ mod tests { fn test_can_handle_no_execute() -> Result<()> { let spec = SpecBuilder::default().build().context("build spec")?; - assert!(!WasmerExecutor::default().can_handle(&spec)); + assert!(!can_handle(&spec)); Ok(()) } diff --git a/crates/youki/src/workload/wasmtime.rs b/crates/youki/src/workload/wasmtime.rs index afa0e4e9d..508a0078b 100644 --- a/crates/youki/src/workload/wasmtime.rs +++ b/crates/youki/src/workload/wasmtime.rs @@ -6,12 +6,13 @@ use libcontainer::workload::{Executor, ExecutorError, EMPTY}; const EXECUTOR_NAME: &str = "wasmtime"; -#[derive(Default)] -pub struct WasmtimeExecutor {} +pub fn get_executor() -> Executor { + return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { + if !can_handle(spec) { + return Err(ExecutorError::CantHandle(EXECUTOR_NAME)); + } -impl Executor for WasmtimeExecutor { - fn exec(&self, spec: &Spec) -> Result<(), ExecutorError> { - tracing::info!("Executing workload with wasmtime handler"); + tracing::debug!("executing workload with wasmtime handler"); let process = spec.process().as_ref(); let args = spec @@ -85,23 +86,19 @@ impl Executor for WasmtimeExecutor { start .call(&mut store, &[], &mut []) .map_err(|err| ExecutorError::Execution(err.into())) - } - - fn can_handle(&self, spec: &Spec) -> bool { - if let Some(annotations) = spec.annotations() { - if let Some(handler) = annotations.get("run.oci.handler") { - return handler == "wasm"; - } + }); +} - if let Some(variant) = annotations.get("module.wasm.image/variant") { - return variant == "compat"; - } +fn can_handle(spec: &Spec) -> bool { + if let Some(annotations) = spec.annotations() { + if let Some(handler) = annotations.get("run.oci.handler") { + return handler == "wasm"; } - false + if let Some(variant) = annotations.get("module.wasm.image/variant") { + return variant == "compat"; + } } - fn name(&self) -> &'static str { - EXECUTOR_NAME - } + false } From 22083d8a9a455d0d367efc5bcd8dda84db02a83a Mon Sep 17 00:00:00 2001 From: yihuaf Date: Mon, 3 Jul 2023 17:28:41 -0700 Subject: [PATCH 08/21] remove the rest of reference from container_args Signed-off-by: yihuaf --- .../src/container/builder_impl.rs | 12 +++++----- .../src/container/init_builder.rs | 2 +- .../src/container/tenant_builder.rs | 2 +- crates/libcontainer/src/process/args.rs | 6 ++--- .../src/process/container_main_process.rs | 4 ++-- crates/libcontainer/src/rootless.rs | 22 +++++++++---------- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index 8fffb28ff..dd6526ee4 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -17,7 +17,7 @@ use nix::unistd::Pid; use oci_spec::runtime::Spec; use std::{fs, io::Write, os::unix::prelude::RawFd, path::PathBuf}; -pub(super) struct ContainerBuilderImpl<'a> { +pub(super) struct ContainerBuilderImpl { /// Flag indicating if an init or a tenant container should be created pub container_type: ContainerType, /// Interface to operating system primitives @@ -27,7 +27,7 @@ pub(super) struct ContainerBuilderImpl<'a> { /// Id of the container pub container_id: String, /// OCI compliant runtime spec - pub spec: &'a Spec, + pub spec: Spec, /// Root filesystem of the container pub rootfs: PathBuf, /// File which will be used to communicate the pid of the @@ -36,7 +36,7 @@ pub(super) struct ContainerBuilderImpl<'a> { /// Socket to communicate the file descriptor of the ptty pub console_socket: Option, /// Options for rootless containers - pub rootless: Option>, + pub rootless: Option, /// Path to the Unix Domain Socket to communicate container start pub notify_path: PathBuf, /// Container state @@ -49,7 +49,7 @@ pub(super) struct ContainerBuilderImpl<'a> { pub executor: Executor, } -impl<'a> ContainerBuilderImpl<'a> { +impl ContainerBuilderImpl { pub(super) fn create(&mut self) -> Result { match self.run_container() { Ok(pid) => Ok(pid), @@ -134,8 +134,8 @@ impl<'a> ContainerBuilderImpl<'a> { console_socket: self.console_socket, notify_socket_path: self.notify_path.to_owned(), preserve_fds: self.preserve_fds, - container: &self.container, - rootless: &self.rootless, + container: self.container.to_owned(), + rootless: self.rootless.to_owned(), cgroup_config, detached: self.detached, executor: self.executor.clone(), diff --git a/crates/libcontainer/src/container/init_builder.rs b/crates/libcontainer/src/container/init_builder.rs index 73e514baa..d30213053 100644 --- a/crates/libcontainer/src/container/init_builder.rs +++ b/crates/libcontainer/src/container/init_builder.rs @@ -99,7 +99,7 @@ impl InitContainerBuilder { pid_file: self.base.pid_file, console_socket: csocketfd, use_systemd: self.use_systemd, - spec: &spec, + spec, rootfs, rootless, notify_path, diff --git a/crates/libcontainer/src/container/tenant_builder.rs b/crates/libcontainer/src/container/tenant_builder.rs index 07e10cf21..ae32cc583 100644 --- a/crates/libcontainer/src/container/tenant_builder.rs +++ b/crates/libcontainer/src/container/tenant_builder.rs @@ -132,7 +132,7 @@ impl TenantContainerBuilder { pid_file: self.base.pid_file, console_socket: csocketfd, use_systemd, - spec: &spec, + spec, rootfs, rootless, notify_path: notify_path.clone(), diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index 9201bd165..16919bad8 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -15,7 +15,7 @@ pub enum ContainerType { } #[derive(Clone)] -pub struct ContainerArgs<'a> { +pub struct ContainerArgs { /// Indicates if an init or a tenant container should be created pub container_type: ContainerType, /// Interface to operating system primitives @@ -31,9 +31,9 @@ pub struct ContainerArgs<'a> { /// File descriptors preserved/passed to the container init process. pub preserve_fds: i32, /// Container state - pub container: &'a Option, + pub container: Option, /// Options for rootless containers - pub rootless: &'a Option>, + pub rootless: Option, /// Cgroup Manager Config pub cgroup_config: CgroupConfig, /// If the container is to be run in detached mode diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index 35848b042..7176b9e8e 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -222,7 +222,7 @@ mod tests { let tmp = tempfile::tempdir()?; let id_mapper = RootlessIDMapper::new_test(tmp.path().to_path_buf()); let rootless = Rootless { - uid_mappings: Some(&uid_mappings), + uid_mappings: Some(uid_mappings), privileged: true, rootless_id_mapper: id_mapper.clone(), ..Default::default() @@ -277,7 +277,7 @@ mod tests { let tmp = tempfile::tempdir()?; let id_mapper = RootlessIDMapper::new_test(tmp.path().to_path_buf()); let rootless = Rootless { - gid_mappings: Some(&gid_mappings), + gid_mappings: Some(gid_mappings), rootless_id_mapper: id_mapper.clone(), ..Default::default() }; diff --git a/crates/libcontainer/src/rootless.rs b/crates/libcontainer/src/rootless.rs index 31ec7b23e..87eff2495 100644 --- a/crates/libcontainer/src/rootless.rs +++ b/crates/libcontainer/src/rootless.rs @@ -122,15 +122,15 @@ pub enum MappingError { } #[derive(Debug, Clone, Default)] -pub struct Rootless<'a> { +pub struct Rootless { /// Location of the newuidmap binary pub newuidmap: Option, /// Location of the newgidmap binary pub newgidmap: Option, /// Mappings for user ids - pub(crate) uid_mappings: Option<&'a Vec>, + pub(crate) uid_mappings: Option>, /// Mappings for group ids - pub(crate) gid_mappings: Option<&'a Vec>, + pub(crate) gid_mappings: Option>, /// Info on the user namespaces pub user_namespace: Option, /// Is rootless container requested by a privileged user @@ -139,8 +139,8 @@ pub struct Rootless<'a> { pub rootless_id_mapper: RootlessIDMapper, } -impl<'a> Rootless<'a> { - pub fn new(spec: &'a Spec) -> Result>> { +impl Rootless { + pub fn new(spec: &Spec) -> Result> { let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?; let namespaces = Namespaces::try_from(linux.namespaces().as_ref()) .map_err(ValidateSpecError::Namespaces)?; @@ -176,7 +176,7 @@ impl<'a> Rootless<'a> { pub fn write_uid_mapping(&self, target_pid: Pid) -> Result<()> { tracing::debug!("write UID mapping for {:?}", target_pid); - if let Some(uid_mappings) = self.uid_mappings { + if let Some(uid_mappings) = self.uid_mappings.as_ref() { write_id_mapping( target_pid, self.rootless_id_mapper.get_uid_path(&target_pid).as_path(), @@ -189,7 +189,7 @@ impl<'a> Rootless<'a> { pub fn write_gid_mapping(&self, target_pid: Pid) -> Result<()> { tracing::debug!("write GID mapping for {:?}", target_pid); - if let Some(gid_mappings) = self.gid_mappings { + if let Some(gid_mappings) = self.gid_mappings.as_ref() { write_id_mapping( target_pid, self.rootless_id_mapper.get_gid_path(&target_pid).as_path(), @@ -205,10 +205,10 @@ impl<'a> Rootless<'a> { } } -impl<'a> TryFrom<&'a Linux> for Rootless<'a> { +impl TryFrom<&Linux> for Rootless { type Error = RootlessError; - fn try_from(linux: &'a Linux) -> Result { + fn try_from(linux: &Linux) -> Result { let namespaces = Namespaces::try_from(linux.namespaces().as_ref()) .map_err(ValidateSpecError::Namespaces)?; let user_namespace = namespaces @@ -217,8 +217,8 @@ impl<'a> TryFrom<&'a Linux> for Rootless<'a> { Ok(Self { newuidmap: None, newgidmap: None, - uid_mappings: linux.uid_mappings().as_ref(), - gid_mappings: linux.gid_mappings().as_ref(), + uid_mappings: linux.uid_mappings().to_owned(), + gid_mappings: linux.gid_mappings().to_owned(), user_namespace: user_namespace.cloned(), privileged: nix::unistd::geteuid().is_root(), rootless_id_mapper: RootlessIDMapper::new(), From 46fb4d44041566eab462d9ef06dc482f43e6576b Mon Sep 17 00:00:00 2001 From: yihuaf Date: Mon, 3 Jul 2023 20:21:06 -0700 Subject: [PATCH 09/21] put enough clone to make the closure work Signed-off-by: yihuaf --- crates/libcontainer/src/channel.rs | 3 +- crates/libcontainer/src/process/channel.rs | 6 ++ .../process/container_intermediate_process.rs | 79 ++++++++++--------- .../src/process/container_main_process.rs | 43 +++++----- crates/libcontainer/src/process/fork.rs | 66 ++++++++-------- crates/libcontainer/src/process/message.rs | 2 +- 6 files changed, 108 insertions(+), 91 deletions(-) diff --git a/crates/libcontainer/src/channel.rs b/crates/libcontainer/src/channel.rs index 797dee188..a772a10d3 100644 --- a/crates/libcontainer/src/channel.rs +++ b/crates/libcontainer/src/channel.rs @@ -18,12 +18,13 @@ pub enum ChannelError { #[error("channel connection broken")] BrokenChannel, } - +#[derive(Clone)] pub struct Receiver { receiver: RawFd, phantom: PhantomData, } +#[derive(Clone)] pub struct Sender { sender: RawFd, phantom: PhantomData, diff --git a/crates/libcontainer/src/process/channel.rs b/crates/libcontainer/src/process/channel.rs index f05bf205d..5a03c4840 100644 --- a/crates/libcontainer/src/process/channel.rs +++ b/crates/libcontainer/src/process/channel.rs @@ -40,6 +40,7 @@ pub fn main_channel() -> Result<(MainSender, MainReceiver), ChannelError> { Ok((MainSender { sender }, MainReceiver { receiver })) } +#[derive(Clone)] pub struct MainSender { sender: Sender, } @@ -87,6 +88,7 @@ impl MainSender { } } +#[derive(Clone)] pub struct MainReceiver { receiver: Receiver, } @@ -193,6 +195,7 @@ pub fn intermediate_channel() -> Result<(IntermediateSender, IntermediateReceive )) } +#[derive(Clone)] pub struct IntermediateSender { sender: Sender, } @@ -212,6 +215,7 @@ impl IntermediateSender { } } +#[derive(Clone)] pub struct IntermediateReceiver { receiver: Receiver, } @@ -248,6 +252,7 @@ pub fn init_channel() -> Result<(InitSender, InitReceiver), ChannelError> { Ok((InitSender { sender }, InitReceiver { receiver })) } +#[derive(Clone)] pub struct InitSender { sender: Sender, } @@ -266,6 +271,7 @@ impl InitSender { } } +#[derive(Clone)] pub struct InitReceiver { receiver: Receiver, } diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 20ab84ade..063ffcba5 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -108,6 +108,48 @@ pub fn container_intermediate_process( namespaces.unshare_or_setns(pid_namespace)?; } + let cb: fork::CloneCb = Box::new({ + let args = args.clone(); + let init_sender = init_sender.clone(); + let inter_sender = inter_sender.clone(); + let mut main_sender = main_sender.clone(); + let mut init_receiver = init_receiver.clone(); + move || { + // We are inside the forked process here. The first thing we have to do + // is to close any unused senders, since fork will make a dup for all + // the socket. + init_sender.close().map_err(|err| { + tracing::error!("failed to close receiver in init process: {}", err); + IntermediateProcessError::Channel(err) + })?; + inter_sender.close().map_err(|err| { + tracing::error!( + "failed to close sender in the intermediate process: {}", + err + ); + IntermediateProcessError::Channel(err) + })?; + match container_init_process(&args, &mut main_sender, &mut init_receiver) { + Ok(_) => Ok(()), + Err(e) => { + if let ContainerType::TenantContainer { exec_notify_fd } = args.container_type { + let buf = format!("{e}"); + write(exec_notify_fd, buf.as_bytes()).map_err(|err| { + tracing::error!("failed to write to exec notify fd: {}", err); + IntermediateProcessError::ExecNotify(err) + })?; + close(exec_notify_fd).map_err(|err| { + tracing::error!("failed to close exec notify fd: {}", err); + IntermediateProcessError::ExecNotify(err) + })?; + } + tracing::error!("failed to initialize container process: {e}"); + Err(e.into()) + } + } + } + }); + // We have to record the pid of the init process. The init process will be // inside the pid namespace, so we can't rely on the init process to send us // the correct pid. We also want to clone the init process as a sibling @@ -116,42 +158,7 @@ pub fn container_intermediate_process( // configuration. The youki main process can decide what to do with the init // process and the intermediate process can just exit safely after the job // is done. - - let pid = fork::container_clone_sibling("youki:[2:INIT]", || { - // We are inside the forked process here. The first thing we have to do - // is to close any unused senders, since fork will make a dup for all - // the socket. - init_sender.close().map_err(|err| { - tracing::error!("failed to close receiver in init process: {}", err); - IntermediateProcessError::Channel(err) - })?; - inter_sender.close().map_err(|err| { - tracing::error!( - "failed to close sender in the intermediate process: {}", - err - ); - IntermediateProcessError::Channel(err) - })?; - match container_init_process(args, main_sender, init_receiver) { - Ok(_) => Ok(()), - Err(e) => { - if let ContainerType::TenantContainer { exec_notify_fd } = args.container_type { - let buf = format!("{e}"); - write(exec_notify_fd, buf.as_bytes()).map_err(|err| { - tracing::error!("failed to write to exec notify fd: {}", err); - IntermediateProcessError::ExecNotify(err) - })?; - close(exec_notify_fd).map_err(|err| { - tracing::error!("failed to close exec notify fd: {}", err); - IntermediateProcessError::ExecNotify(err) - })?; - } - tracing::error!("failed to initialize container process: {e}"); - Err(e.into()) - } - } - }) - .map_err(|err| { + let pid = fork::container_clone_sibling("youki:[2:INIT]", cb).map_err(|err| { tracing::error!("failed to fork init process: {}", err); IntermediateProcessError::InitProcess(err) })?; diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index 7176b9e8e..b1da9f882 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -37,34 +37,41 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo // cloned process, we have to be deligent about closing any unused channel. // At minimum, we have to close down any unused senders. The corresponding // receivers will be cleaned up once the senders are closed down. - let (main_sender, main_receiver) = &mut channel::main_channel()?; - let inter_chan = &mut channel::intermediate_channel()?; - let init_chan = &mut channel::init_channel()?; + let (main_sender, mut main_receiver) = channel::main_channel()?; + let inter_chan = channel::intermediate_channel()?; + let init_chan = channel::init_channel()?; - let intermediate_pid = fork::container_clone("youki:[1:INTER]", || { - container_intermediate_process::container_intermediate_process( - container_args, - inter_chan, - init_chan, - main_sender, - )?; + let cb: fork::CloneCb = Box::new({ + let container_args = container_args.clone(); + let mut main_sender = main_sender.clone(); + let mut inter_chan = inter_chan.clone(); + let mut init_chan = init_chan.clone(); + move || { + container_intermediate_process::container_intermediate_process( + &container_args, + &mut inter_chan, + &mut init_chan, + &mut main_sender, + )?; - Ok(()) - }) - .map_err(|err| { + Ok(()) + } + }); + + let intermediate_pid = fork::container_clone("youki:[1:INTER]", cb).map_err(|err| { tracing::error!("failed to fork intermediate process: {}", err); ProcessError::IntermediateProcessFailed(err) })?; // Close down unused fds. The corresponding fds are duplicated to the - // child process during fork. + // child process during clone. main_sender.close().map_err(|err| { tracing::error!("failed to close unused sender: {}", err); err })?; - let (inter_sender, inter_receiver) = inter_chan; - let (init_sender, init_receiver) = init_chan; + let (mut inter_sender, inter_receiver) = inter_chan; + let (mut init_sender, init_receiver) = init_chan; // If creating a rootless container, the intermediate process will ask // the main process to set up uid and gid mapping, once the intermediate @@ -106,8 +113,8 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo crate::process::seccomp_listener::sync_seccomp( seccomp, &state, - init_sender, - main_receiver, + &mut init_sender, + &mut main_receiver, )?; } diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index 2444df232..58ab5e92f 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -41,9 +41,14 @@ pub enum CallbackError { type CallbackResult = std::result::Result; -// The clone callback is used in clone call. It is a boxed closure and it needs -// to trasfer the ownership of related memory to the new process. -type CloneCb = Box i32>; +// This callback signature is used in the public rust interface with easier to +// use rust error types. +pub type CloneCb = Box CallbackResult<()>>; + +// The clone callback is used in clone system call. It is a boxed closure and it +// needs to trasfer the ownership of related memory to the new process. The +// interface returns i32 which is typical for C functions. +type CloneSystemCb = Box i32>; // Fork/Clone a sibling process that shares the same parent as the calling // process. This is used to launch the container init process so the parent @@ -52,10 +57,7 @@ type CloneCb = Box i32>; // youki main process) will exit and the init process will be re-parented to the // process 1 (system init process), which is not the right behavior of what we // look for. -pub fn container_clone_sibling<'a, F: FnMut() -> CallbackResult<()> + 'a>( - child_name: &str, - cb: F, -) -> Result { +pub fn container_clone_sibling(child_name: &str, cb: CloneCb) -> Result { // Note: normally, an exit signal is required, but when using // `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set. // The older `clone` will not return EINVAL in this case. Instead it ignores @@ -70,21 +72,20 @@ pub fn container_clone_sibling<'a, F: FnMut() -> CallbackResult<()> + 'a>( // using clone, we would have to manually make sure all the variables are // correctly send to the new process, especially Rust borrow checker will be a // lot of hassel to deal with every details. -pub fn container_clone<'a, F: FnMut() -> CallbackResult<()> + 'a>( - child_name: &str, - cb: F, -) -> Result { +pub fn container_clone(child_name: &str, cb: CloneCb) -> Result { clone_internal(child_name, cb, 0, Some(SIGCHLD as u64)) } -fn clone_internal<'a, F: FnMut() -> CallbackResult<()> + 'a>( +fn clone_internal( child_name: &str, - mut cb: F, + cb: CloneCb, flags: u64, exit_signal: Option, ) -> Result { - let mut child_closure = || { - if let Err(ret) = prctl::set_name(child_name) { + // Prepare a owned string for the closure + let child_name = child_name.to_owned(); + let child_closure = Box::new(move || { + if let Err(ret) = prctl::set_name(child_name.as_str()) { tracing::error!(?ret, "failed to set name for child process"); return ret; } @@ -95,25 +96,23 @@ fn clone_internal<'a, F: FnMut() -> CallbackResult<()> + 'a>( -1 } } - }; + }); - match clone3(&mut child_closure, flags, exit_signal) { + match clone3(child_closure, flags, exit_signal) { Ok(pid) => return Ok(pid), Err(CloneError::Clone(nix::Error::ENOSYS)) => { tracing::debug!("clone3 is not supported, fallback to clone"); - let pid = clone(&mut child_closure, flags, exit_signal)?; + // let pid = clone(child_closure, flags, exit_signal)?; + + // Ok(pid) - Ok(pid) + todo!() } Err(err) => return Err(err), } } -fn clone3<'a, F: FnMut() -> i32 + 'a>( - mut cb: F, - flags: u64, - exit_signal: Option, -) -> Result { +fn clone3(cb: CloneSystemCb, flags: u64, exit_signal: Option) -> Result { #[repr(C)] struct clone3_args { flags: u64, @@ -165,11 +164,7 @@ fn clone3<'a, F: FnMut() -> i32 + 'a>( /// the new container process, where we can enter into namespaces directly instead /// of using unshare and fork. This call will only create one new process, instead /// of two using fork. -fn clone<'a, F: FnMut() -> i32 + 'a>( - cb: F, - flags: u64, - exit_signal: Option, -) -> Result { +fn clone(cb: CloneSystemCb, flags: u64, exit_signal: Option) -> Result { const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K @@ -240,13 +235,13 @@ fn clone<'a, F: FnMut() -> i32 + 'a>( // we double box the closure. This is consistant with how std::unix::thread // handles the closure. // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs - let data = Box::into_raw(Box::new(Box::new(cb))); + let data = Box::into_raw(Box::new(cb)); // The main is a wrapper function passed into clone call below. The "data" // arg is actually a raw pointer to a Box closure. so here, we re-box the // pointer back into a box closure so the main takes ownership of the // memory. Then we can call the closure passed in. extern "C" fn main(data: *mut libc::c_void) -> libc::c_int { - unsafe { Box::from_raw(data as *mut CloneCb)() as i32 } + unsafe { Box::from_raw(data as *mut CloneSystemCb)() as i32 } } // The nix::sched::clone wrapper doesn't provide the right interface. Using @@ -289,7 +284,7 @@ mod test { #[test] fn test_container_fork() -> Result<()> { - let pid = container_clone("test:child", || Ok(()))?; + let pid = container_clone("test:child", Box::new(|| Ok(())))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -302,7 +297,7 @@ mod test { #[test] fn test_container_err_fork() -> Result<()> { - let pid = container_clone("test:child", || Err(CallbackError::Test))?; + let pid = container_clone("test:child", Box::new(|| Err(CallbackError::Test)))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -354,7 +349,7 @@ mod test { unistd::ForkResult::Child => { // Inside the forked process. We call `container_clone` and pass // the pid to the parent process. - let pid = container_clone_sibling("test:child", || Ok(()))?; + let pid = container_clone_sibling("test:child", Box::new(|| Ok(())))?; sender.send(pid.as_raw())?; sender.close()?; std::process::exit(0); @@ -408,7 +403,8 @@ mod test { )); } - let pid = container_clone("test:child", || Ok(())).map_err(|err| err.to_string())?; + let pid = container_clone("test:child", Box::new(|| Ok(()))) + .map_err(|err| err.to_string())?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); diff --git a/crates/libcontainer/src/process/message.rs b/crates/libcontainer/src/process/message.rs index 8de9de7c8..9b04723f4 100644 --- a/crates/libcontainer/src/process/message.rs +++ b/crates/libcontainer/src/process/message.rs @@ -2,7 +2,7 @@ use core::fmt; use serde::{Deserialize, Serialize}; /// Used as a wrapper for messages to be sent between child and parent processes -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Clone)] pub enum Message { IntermediateReady(i32), InitReady, From 2ce495319caa403aea58277a5d4bd23590566468 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Mon, 3 Jul 2023 23:59:08 -0700 Subject: [PATCH 10/21] implemented the rest of the clone Signed-off-by: yihuaf --- .../process/container_intermediate_process.rs | 56 +++++++------ .../src/process/container_main_process.rs | 31 ++++--- crates/libcontainer/src/process/fork.rs | 80 +++++-------------- 3 files changed, 70 insertions(+), 97 deletions(-) diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 063ffcba5..e515f2520 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -8,6 +8,7 @@ use procfs::process::Process; use super::args::{ContainerArgs, ContainerType}; use super::container_init_process::container_init_process; +use super::fork::CloneCb; #[derive(Debug, thiserror::Error)] pub enum IntermediateProcessError { @@ -108,47 +109,50 @@ pub fn container_intermediate_process( namespaces.unshare_or_setns(pid_namespace)?; } - let cb: fork::CloneCb = Box::new({ + let cb: CloneCb = { let args = args.clone(); let init_sender = init_sender.clone(); let inter_sender = inter_sender.clone(); let mut main_sender = main_sender.clone(); let mut init_receiver = init_receiver.clone(); - move || { + Box::new(move || { + if let Err(ret) = prctl::set_name("youki:[2:INIT]") { + tracing::error!(?ret, "failed to set name for child process"); + return ret; + } + // We are inside the forked process here. The first thing we have to do // is to close any unused senders, since fork will make a dup for all // the socket. - init_sender.close().map_err(|err| { - tracing::error!("failed to close receiver in init process: {}", err); - IntermediateProcessError::Channel(err) - })?; - inter_sender.close().map_err(|err| { - tracing::error!( - "failed to close sender in the intermediate process: {}", - err - ); - IntermediateProcessError::Channel(err) - })?; + if let Err(err) = init_sender.close() { + tracing::error!(?err, "failed to close receiver in init process"); + return -1; + } + if let Err(err) = inter_sender.close() { + tracing::error!(?err, "failed to close sender in the intermediate process"); + return -1; + } match container_init_process(&args, &mut main_sender, &mut init_receiver) { - Ok(_) => Ok(()), + Ok(_) => 0, Err(e) => { if let ContainerType::TenantContainer { exec_notify_fd } = args.container_type { let buf = format!("{e}"); - write(exec_notify_fd, buf.as_bytes()).map_err(|err| { - tracing::error!("failed to write to exec notify fd: {}", err); - IntermediateProcessError::ExecNotify(err) - })?; - close(exec_notify_fd).map_err(|err| { - tracing::error!("failed to close exec notify fd: {}", err); - IntermediateProcessError::ExecNotify(err) - })?; + if let Err(err) = write(exec_notify_fd, buf.as_bytes()) { + tracing::error!(?err, "failed to write to exec notify fd"); + return -1; + } + if let Err(err) = close(exec_notify_fd) { + tracing::error!(?err, "failed to close exec notify fd"); + return -1; + } } + tracing::error!("failed to initialize container process: {e}"); - Err(e.into()) + -1 } } - } - }); + }) + }; // We have to record the pid of the init process. The init process will be // inside the pid namespace, so we can't rely on the init process to send us @@ -158,7 +162,7 @@ pub fn container_intermediate_process( // configuration. The youki main process can decide what to do with the init // process and the intermediate process can just exit safely after the job // is done. - let pid = fork::container_clone_sibling("youki:[2:INIT]", cb).map_err(|err| { + let pid = fork::container_clone_sibling(cb).map_err(|err| { tracing::error!("failed to fork init process: {}", err); IntermediateProcessError::InitProcess(err) })?; diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index b1da9f882..d192b9aff 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -1,6 +1,8 @@ use crate::{ process::{ - args::ContainerArgs, channel, container_intermediate_process, fork, + args::ContainerArgs, + channel, container_intermediate_process, + fork::{self, CloneCb}, intel_rdt::setup_intel_rdt, }, rootless::Rootless, @@ -41,24 +43,33 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo let inter_chan = channel::intermediate_channel()?; let init_chan = channel::init_channel()?; - let cb: fork::CloneCb = Box::new({ + let cb: CloneCb = { let container_args = container_args.clone(); let mut main_sender = main_sender.clone(); let mut inter_chan = inter_chan.clone(); let mut init_chan = init_chan.clone(); - move || { - container_intermediate_process::container_intermediate_process( + Box::new(move || { + if let Err(ret) = prctl::set_name("youki:[1:INTER]") { + tracing::error!(?ret, "failed to set name for child process"); + return ret; + } + + match container_intermediate_process::container_intermediate_process( &container_args, &mut inter_chan, &mut init_chan, &mut main_sender, - )?; - - Ok(()) - } - }); + ) { + Ok(_) => 0, + Err(err) => { + tracing::error!(?err, "failed to run intermediate process"); + -1 + } + } + }) + }; - let intermediate_pid = fork::container_clone("youki:[1:INTER]", cb).map_err(|err| { + let intermediate_pid = fork::container_clone(cb).map_err(|err| { tracing::error!("failed to fork intermediate process: {}", err); ProcessError::IntermediateProcessFailed(err) })?; diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index 58ab5e92f..360cc6862 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -5,7 +5,6 @@ use nix::{ sys::{mman, resource}, unistd::Pid, }; -use prctl; #[derive(Debug, thiserror::Error)] pub enum CloneError { @@ -25,30 +24,10 @@ pub enum CloneError { UnknownErrno(i32), } -#[derive(Debug, thiserror::Error)] -pub enum CallbackError { - #[error(transparent)] - IntermediateProcess( - #[from] crate::process::container_intermediate_process::IntermediateProcessError, - ), - #[error(transparent)] - InitProcess(#[from] crate::process::container_init_process::InitProcessError), - // Need a fake error for testing - #[cfg(test)] - #[error("unknown")] - Test, -} - -type CallbackResult = std::result::Result; - -// This callback signature is used in the public rust interface with easier to -// use rust error types. -pub type CloneCb = Box CallbackResult<()>>; - // The clone callback is used in clone system call. It is a boxed closure and it // needs to trasfer the ownership of related memory to the new process. The // interface returns i32 which is typical for C functions. -type CloneSystemCb = Box i32>; +pub type CloneCb = Box i32>; // Fork/Clone a sibling process that shares the same parent as the calling // process. This is used to launch the container init process so the parent @@ -57,13 +36,13 @@ type CloneSystemCb = Box i32>; // youki main process) will exit and the init process will be re-parented to the // process 1 (system init process), which is not the right behavior of what we // look for. -pub fn container_clone_sibling(child_name: &str, cb: CloneCb) -> Result { +pub fn container_clone_sibling(cb: CloneCb) -> Result { // Note: normally, an exit signal is required, but when using // `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set. // The older `clone` will not return EINVAL in this case. Instead it ignores // the exit signal bits in the glibc wrapper. Therefore, we explicitly set // the exit_signal to None here, so this works for both version of clone. - clone_internal(child_name, cb, libc::CLONE_PARENT as u64, None) + clone_internal(cb, libc::CLONE_PARENT as u64, None) } // Execute the cb in another process. Make the fork works more like thread_spawn @@ -72,47 +51,28 @@ pub fn container_clone_sibling(child_name: &str, cb: CloneCb) -> Result Result { - clone_internal(child_name, cb, 0, Some(SIGCHLD as u64)) +pub fn container_clone(cb: CloneCb) -> Result { + clone_internal(cb, 0, Some(SIGCHLD as u64)) } fn clone_internal( - child_name: &str, - cb: CloneCb, + mut cb: CloneCb, flags: u64, exit_signal: Option, ) -> Result { - // Prepare a owned string for the closure - let child_name = child_name.to_owned(); - let child_closure = Box::new(move || { - if let Err(ret) = prctl::set_name(child_name.as_str()) { - tracing::error!(?ret, "failed to set name for child process"); - return ret; - } - match cb() { - Ok(_) => 0, - Err(err) => { - tracing::error!(?err, "failed to run callback in clone"); - -1 - } - } - }); - - match clone3(child_closure, flags, exit_signal) { + match clone3(&mut cb, flags, exit_signal) { Ok(pid) => return Ok(pid), Err(CloneError::Clone(nix::Error::ENOSYS)) => { tracing::debug!("clone3 is not supported, fallback to clone"); - // let pid = clone(child_closure, flags, exit_signal)?; - - // Ok(pid) + let pid = clone(cb, flags, exit_signal)?; - todo!() + Ok(pid) } Err(err) => return Err(err), } } -fn clone3(cb: CloneSystemCb, flags: u64, exit_signal: Option) -> Result { +fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result { #[repr(C)] struct clone3_args { flags: u64, @@ -149,10 +109,9 @@ fn clone3(cb: CloneSystemCb, flags: u64, exit_signal: Option) -> Result { - // Inside the cloned process, continue in the cloned process, we - // execute the callback and exit with the return code. - let ret = cb(); - std::process::exit(ret); + // Inside the cloned process, we execute the callback and exit with + // the return code. + std::process::exit(cb()); } ret if ret >= 0 => return Ok(Pid::from_raw(ret as i32)), ret => return Err(CloneError::UnknownErrno(ret as i32)), @@ -164,7 +123,7 @@ fn clone3(cb: CloneSystemCb, flags: u64, exit_signal: Option) -> Result) -> Result { +fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result { const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K @@ -241,7 +200,7 @@ fn clone(cb: CloneSystemCb, flags: u64, exit_signal: Option) -> Result libc::c_int { - unsafe { Box::from_raw(data as *mut CloneSystemCb)() as i32 } + unsafe { Box::from_raw(data as *mut CloneCb)() } } // The nix::sched::clone wrapper doesn't provide the right interface. Using @@ -284,7 +243,7 @@ mod test { #[test] fn test_container_fork() -> Result<()> { - let pid = container_clone("test:child", Box::new(|| Ok(())))?; + let pid = container_clone(Box::new(|| 0))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -297,7 +256,7 @@ mod test { #[test] fn test_container_err_fork() -> Result<()> { - let pid = container_clone("test:child", Box::new(|| Err(CallbackError::Test)))?; + let pid = container_clone(Box::new(|| -1))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -349,7 +308,7 @@ mod test { unistd::ForkResult::Child => { // Inside the forked process. We call `container_clone` and pass // the pid to the parent process. - let pid = container_clone_sibling("test:child", Box::new(|| Ok(())))?; + let pid = container_clone_sibling(Box::new(|| 0))?; sender.send(pid.as_raw())?; sender.close()?; std::process::exit(0); @@ -403,8 +362,7 @@ mod test { )); } - let pid = container_clone("test:child", Box::new(|| Ok(()))) - .map_err(|err| err.to_string())?; + let pid = container_clone(Box::new(|| 0)).map_err(|err| err.to_string())?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); From 552bed92c04f88848d5b166df0e7df67c73e8ecd Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 00:13:41 -0700 Subject: [PATCH 11/21] fix comments Signed-off-by: yihuaf --- crates/libcontainer/src/process/fork.rs | 34 +++++++++++++------------ 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index 360cc6862..08bd39f2e 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -24,12 +24,15 @@ pub enum CloneError { UnknownErrno(i32), } -// The clone callback is used in clone system call. It is a boxed closure and it -// needs to trasfer the ownership of related memory to the new process. The -// interface returns i32 which is typical for C functions. +/// The callback function used in clone system call. The return value is i32 +/// which is consistent with C functions return code. The closure is boxed +/// because we will have to correctly transfer the ownership of the closure +/// memory on the heap to the new process. The trait has to be `FnMut` because +/// we need to be able to call the closure multiple times, once in clone3 and +/// once in clone if fallback is required. pub type CloneCb = Box i32>; -// Fork/Clone a sibling process that shares the same parent as the calling +// Clone a sibling process that shares the same parent as the calling // process. This is used to launch the container init process so the parent // process of the calling process can receive ownership of the process. If we // clone a child process as the init process, the calling process (likely the @@ -45,16 +48,12 @@ pub fn container_clone_sibling(cb: CloneCb) -> Result { clone_internal(cb, libc::CLONE_PARENT as u64, None) } -// Execute the cb in another process. Make the fork works more like thread_spawn -// or clone, so it is easier to reason. Compared to clone call, fork is easier -// to use since fork will magically take care of all the variable copying. If -// using clone, we would have to manually make sure all the variables are -// correctly send to the new process, especially Rust borrow checker will be a -// lot of hassel to deal with every details. +// Clone a child process and execute the callback. pub fn container_clone(cb: CloneCb) -> Result { clone_internal(cb, 0, Some(SIGCHLD as u64)) } +// An internal wrapper to manage the clone3 vs clone fallback logic. fn clone_internal( mut cb: CloneCb, flags: u64, @@ -62,6 +61,7 @@ fn clone_internal( ) -> Result { match clone3(&mut cb, flags, exit_signal) { Ok(pid) => return Ok(pid), + // For now, we decide to only fallback on ENOSYS Err(CloneError::Clone(nix::Error::ENOSYS)) => { tracing::debug!("clone3 is not supported, fallback to clone"); let pid = clone(cb, flags, exit_signal)?; @@ -72,6 +72,9 @@ fn clone_internal( } } +// Unlike the clone call, clone3 is currently using the kernel syscall, mimicing +// the interface of fork. There is not need to explicitly manage the memory, so +// we can safely passin the callback closure as reference. fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result { #[repr(C)] struct clone3_args { @@ -103,7 +106,11 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result(); // For now, we can only use clone3 as a kernel syscall. Libc wrapper is not - // available yet. + // available yet. This can have undefined behavior because libc authors do + // not like people calling kernel syscall to directly create processes. Libc + // does perform additional bookkeeping when calling clone or fork. So far, + // we have not observed any issues with calling clone3 directly, but we + // should keep an eye on it. match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } { -1 => { return Err(CloneError::Clone(nix::Error::last())); @@ -118,11 +125,6 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result) -> Result { const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K From e8979073bfe8187f23ed375e7ca77eff542a3839 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 00:27:41 -0700 Subject: [PATCH 12/21] fix doc test Signed-off-by: yihuaf --- crates/libcontainer/src/container/builder.rs | 48 ++++++++----------- .../src/container/container_delete.rs | 5 +- .../src/container/container_events.rs | 5 +- .../src/container/container_kill.rs | 5 +- .../src/container/container_pause.rs | 5 +- .../src/container/container_resume.rs | 5 +- .../src/container/container_start.rs | 5 +- 7 files changed, 32 insertions(+), 46 deletions(-) diff --git a/crates/libcontainer/src/container/builder.rs b/crates/libcontainer/src/container/builder.rs index 66aee07a6..5667c08a0 100644 --- a/crates/libcontainer/src/container/builder.rs +++ b/crates/libcontainer/src/container/builder.rs @@ -32,12 +32,11 @@ pub struct ContainerBuilder { /// /// ```no_run /// use libcontainer::container::builder::ContainerBuilder; -/// use libcontainer::syscall::syscall::create_syscall; -/// use libcontainer::workload::default::DefaultExecutor; +/// use libcontainer::syscall::syscall::SyscallType; /// /// ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), -/// create_syscall().as_ref(), +/// SyscallType::default(), /// ) /// .with_root_path("/run/containers/youki").expect("invalid root path") /// .with_pid_file(Some("/var/run/docker.pid")).expect("invalid pid file") @@ -53,12 +52,11 @@ impl ContainerBuilder { /// /// ```no_run /// use libcontainer::container::builder::ContainerBuilder; - /// use libcontainer::syscall::syscall::create_syscall; - /// use libcontainer::workload::default::DefaultExecutor; + /// use libcontainer::syscall::syscall::SyscallType; /// /// let builder = ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ); /// ``` pub fn new(container_id: String, syscall: SyscallType) -> Self { @@ -115,12 +113,11 @@ impl ContainerBuilder { /// /// ```no_run /// # use libcontainer::container::builder::ContainerBuilder; - /// # use libcontainer::syscall::syscall::create_syscall; - /// # use libcontainer::workload::default::DefaultExecutor; + /// # use libcontainer::syscall::syscall::SyscallType; /// /// ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .as_tenant() /// .with_container_args(vec!["sleep".to_owned(), "9001".to_owned()]) @@ -136,12 +133,11 @@ impl ContainerBuilder { /// /// ```no_run /// # use libcontainer::container::builder::ContainerBuilder; - /// # use libcontainer::syscall::syscall::create_syscall; - /// # use libcontainer::workload::default::DefaultExecutor; + /// # use libcontainer::syscall::syscall::SyscallType; /// /// ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .as_init("/var/run/docker/bundle") /// .with_systemd(false) @@ -157,12 +153,11 @@ impl ContainerBuilder { /// /// ```no_run /// # use libcontainer::container::builder::ContainerBuilder; - /// # use libcontainer::syscall::syscall::create_syscall; - /// # use libcontainer::workload::default::DefaultExecutor; + /// # use libcontainer::syscall::syscall::SyscallType; /// /// ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .with_root_path("/run/containers/youki").expect("invalid root path"); /// ``` @@ -182,12 +177,11 @@ impl ContainerBuilder { /// /// ```no_run /// # use libcontainer::container::builder::ContainerBuilder; - /// # use libcontainer::syscall::syscall::create_syscall; - /// # use libcontainer::workload::default::DefaultExecutor; + /// # use libcontainer::syscall::syscall::SyscallType; /// /// ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .with_pid_file(Some("/var/run/docker.pid")).expect("invalid pid file"); /// ``` @@ -212,12 +206,11 @@ impl ContainerBuilder { /// /// ```no_run /// # use libcontainer::container::builder::ContainerBuilder; - /// # use libcontainer::syscall::syscall::create_syscall; - /// # use libcontainer::workload::default::DefaultExecutor; + /// # use libcontainer::syscall::syscall::SyscallType; /// /// ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .with_console_socket(Some("/var/run/docker/sock.tty")); /// ``` @@ -232,12 +225,11 @@ impl ContainerBuilder { /// /// ```no_run /// # use libcontainer::container::builder::ContainerBuilder; - /// # use libcontainer::syscall::syscall::create_syscall; - /// # use libcontainer::workload::default::DefaultExecutor; + /// # use libcontainer::syscall::syscall::SyscallType; /// /// ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .with_preserved_fds(5); /// ``` @@ -251,14 +243,14 @@ impl ContainerBuilder { /// /// ```no_run /// # use libcontainer::container::builder::ContainerBuilder; - /// # use libcontainer::syscall::syscall::create_syscall; - /// # use libcontainer::workload::default::DefaultExecutor; + /// # use libcontainer::syscall::syscall::SyscallType; + /// # use libcontainer::workload::default::get_executor; /// /// ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) - /// .with_executor(vec![Box::::default()]); + /// .with_executor(get_executor()); /// ``` pub fn with_executor(mut self, executor: Executor) -> Result { self.executor = executor; diff --git a/crates/libcontainer/src/container/container_delete.rs b/crates/libcontainer/src/container/container_delete.rs index 7fc46dcc8..322a47f7d 100644 --- a/crates/libcontainer/src/container/container_delete.rs +++ b/crates/libcontainer/src/container/container_delete.rs @@ -13,13 +13,12 @@ impl Container { /// /// ```no_run /// use libcontainer::container::builder::ContainerBuilder; - /// use libcontainer::syscall::syscall::create_syscall; - /// use libcontainer::workload::default::DefaultExecutor; + /// use libcontainer::syscall::syscall::SyscallType; /// /// # fn main() -> anyhow::Result<()> { /// let mut container = ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .as_init("/var/run/docker/bundle") /// .build()?; diff --git a/crates/libcontainer/src/container/container_events.rs b/crates/libcontainer/src/container/container_events.rs index 33e0df020..6a4200755 100644 --- a/crates/libcontainer/src/container/container_events.rs +++ b/crates/libcontainer/src/container/container_events.rs @@ -12,13 +12,12 @@ impl Container { /// /// ```no_run /// use libcontainer::container::builder::ContainerBuilder; - /// use libcontainer::syscall::syscall::create_syscall; - /// use libcontainer::workload::default::DefaultExecutor; + /// use libcontainer::syscall::syscall::SyscallType; /// /// # fn main() -> anyhow::Result<()> { /// let mut container = ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .as_init("/var/run/docker/bundle") /// .build()?; diff --git a/crates/libcontainer/src/container/container_kill.rs b/crates/libcontainer/src/container/container_kill.rs index 66d126dce..63b2ab175 100644 --- a/crates/libcontainer/src/container/container_kill.rs +++ b/crates/libcontainer/src/container/container_kill.rs @@ -10,14 +10,13 @@ impl Container { /// /// ```no_run /// use libcontainer::container::builder::ContainerBuilder; - /// use libcontainer::syscall::syscall::create_syscall; - /// use libcontainer::workload::default::DefaultExecutor; + /// use libcontainer::syscall::syscall::SyscallType; /// use nix::sys::signal::Signal; /// /// # fn main() -> anyhow::Result<()> { /// let mut container = ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .as_init("/var/run/docker/bundle") /// .build()?; diff --git a/crates/libcontainer/src/container/container_pause.rs b/crates/libcontainer/src/container/container_pause.rs index 5be1326a3..592056056 100644 --- a/crates/libcontainer/src/container/container_pause.rs +++ b/crates/libcontainer/src/container/container_pause.rs @@ -10,13 +10,12 @@ impl Container { /// /// ```no_run /// use libcontainer::container::builder::ContainerBuilder; - /// use libcontainer::syscall::syscall::create_syscall; - /// use libcontainer::workload::default::DefaultExecutor; + /// use libcontainer::syscall::syscall::SyscallType; /// /// # fn main() -> anyhow::Result<()> { /// let mut container = ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .as_init("/var/run/docker/bundle") /// .build()?; diff --git a/crates/libcontainer/src/container/container_resume.rs b/crates/libcontainer/src/container/container_resume.rs index 0087ecfe5..eb0a9a419 100644 --- a/crates/libcontainer/src/container/container_resume.rs +++ b/crates/libcontainer/src/container/container_resume.rs @@ -11,13 +11,12 @@ impl Container { /// /// ```no_run /// use libcontainer::container::builder::ContainerBuilder; - /// use libcontainer::syscall::syscall::create_syscall; - /// use libcontainer::workload::default::DefaultExecutor; + /// use libcontainer::syscall::syscall::SyscallType; /// /// # fn main() -> anyhow::Result<()> { /// let mut container = ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .as_init("/var/run/docker/bundle") /// .build()?; diff --git a/crates/libcontainer/src/container/container_start.rs b/crates/libcontainer/src/container/container_start.rs index 5ce9ac4a6..c6805511c 100644 --- a/crates/libcontainer/src/container/container_start.rs +++ b/crates/libcontainer/src/container/container_start.rs @@ -15,13 +15,12 @@ impl Container { /// /// ```no_run /// use libcontainer::container::builder::ContainerBuilder; - /// use libcontainer::syscall::syscall::create_syscall; - /// use libcontainer::workload::default::DefaultExecutor; + /// use libcontainer::syscall::syscall::SyscallType; /// /// # fn main() -> anyhow::Result<()> { /// let mut container = ContainerBuilder::new( /// "74f1a4cb3801".to_owned(), - /// create_syscall().as_ref(), + /// SyscallType::default(), /// ) /// .as_init("/var/run/docker/bundle") /// .build()?; From 9428e0ef8565b3ac61be64c7634487f966cbe46a Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 00:38:55 -0700 Subject: [PATCH 13/21] spellcheck Signed-off-by: yihuaf --- crates/libcontainer/src/process/fork.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index 08bd39f2e..6195d31d5 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -20,7 +20,7 @@ pub enum CloneError { StackAllocation(#[source] nix::Error), #[error("failed to create stack guard page")] GuardPage(#[source] nix::Error), - #[error("unkown error code {0}")] + #[error("unknown error code {0}")] UnknownErrno(i32), } @@ -72,9 +72,9 @@ fn clone_internal( } } -// Unlike the clone call, clone3 is currently using the kernel syscall, mimicing +// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking // the interface of fork. There is not need to explicitly manage the memory, so -// we can safely passin the callback closure as reference. +// we can safely passing the callback closure as reference. fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result { #[repr(C)] struct clone3_args { @@ -155,9 +155,9 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result) -> Result) -> Result Date: Tue, 4 Jul 2023 00:44:09 -0700 Subject: [PATCH 14/21] fix lint Signed-off-by: yihuaf --- crates/libcontainer/src/container/builder_impl.rs | 2 +- .../libcontainer/src/container/container_events.rs | 2 +- .../libcontainer/src/container/container_kill.rs | 4 ++-- .../libcontainer/src/container/container_pause.rs | 2 +- .../libcontainer/src/container/container_resume.rs | 2 +- .../src/process/container_init_process.rs | 6 +++--- crates/libcontainer/src/process/fork.rs | 14 ++++++-------- crates/libcontainer/src/syscall/syscall.rs | 2 +- crates/libcontainer/src/workload/default.rs | 4 ++-- crates/youki/src/workload/executor.rs | 6 ++++-- crates/youki/src/workload/wasmedge.rs | 4 ++-- crates/youki/src/workload/wasmer.rs | 4 ++-- crates/youki/src/workload/wasmtime.rs | 4 ++-- 13 files changed, 28 insertions(+), 28 deletions(-) diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index dd6526ee4..fd1c1f64d 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -179,7 +179,7 @@ impl ContainerBuilderImpl { ); let cmanager = libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { - cgroup_path: cgroups_path.to_owned(), + cgroup_path: cgroups_path, systemd_cgroup: self.use_systemd || self.rootless.is_some(), container_name: self.container_id.to_string(), })?; diff --git a/crates/libcontainer/src/container/container_events.rs b/crates/libcontainer/src/container/container_events.rs index 6a4200755..6fcfec320 100644 --- a/crates/libcontainer/src/container/container_events.rs +++ b/crates/libcontainer/src/container/container_events.rs @@ -35,7 +35,7 @@ impl Container { let cgroup_manager = libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path.to_owned(), + cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), })?; diff --git a/crates/libcontainer/src/container/container_kill.rs b/crates/libcontainer/src/container/container_kill.rs index 63b2ab175..8a122950b 100644 --- a/crates/libcontainer/src/container/container_kill.rs +++ b/crates/libcontainer/src/container/container_kill.rs @@ -80,7 +80,7 @@ impl Container { | libcgroups::common::CgroupSetup::Hybrid => { let cmanager = libcgroups::common::create_cgroup_manager( &libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path.to_owned(), + cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), }, @@ -97,7 +97,7 @@ impl Container { let signal = signal.into().into_raw(); let cmanager = libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path.to_owned(), + cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), })?; diff --git a/crates/libcontainer/src/container/container_pause.rs b/crates/libcontainer/src/container/container_pause.rs index 592056056..c35302b66 100644 --- a/crates/libcontainer/src/container/container_pause.rs +++ b/crates/libcontainer/src/container/container_pause.rs @@ -34,7 +34,7 @@ impl Container { let cmanager = libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path.to_owned(), + cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), })?; diff --git a/crates/libcontainer/src/container/container_resume.rs b/crates/libcontainer/src/container/container_resume.rs index eb0a9a419..d2165ec80 100644 --- a/crates/libcontainer/src/container/container_resume.rs +++ b/crates/libcontainer/src/container/container_resume.rs @@ -36,7 +36,7 @@ impl Container { let cmanager = libcgroups::common::create_cgroup_manager(&libcgroups::common::CgroupConfig { - cgroup_path: self.spec()?.cgroup_path.to_owned(), + cgroup_path: self.spec()?.cgroup_path, systemd_cgroup: self.systemd(), container_name: self.id().to_string(), })?; diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 5f61ec480..915a3698f 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -359,7 +359,7 @@ pub fn container_init_process( })?; } - apply_rest_namespaces(&namespaces, &spec, syscall.as_ref())?; + apply_rest_namespaces(&namespaces, spec, syscall.as_ref())?; if let Some(true) = proc.no_new_privileges() { let _ = prctl::set_no_new_privileges(true); @@ -379,7 +379,7 @@ pub fn container_init_process( let rootfs = RootFS::new(); rootfs .prepare_rootfs( - &spec, + spec, rootfs_path, bind_service, namespaces.get(LinuxNamespaceType::Cgroup)?.is_some(), @@ -678,7 +678,7 @@ pub fn container_init_process( } if proc.args().is_some() { - (args.executor)(&spec).map_err(|err| { + (args.executor)(spec).map_err(|err| { tracing::error!(?err, "failed to execute payload"); err })?; diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index 6195d31d5..7981784ea 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -60,7 +60,7 @@ fn clone_internal( exit_signal: Option, ) -> Result { match clone3(&mut cb, flags, exit_signal) { - Ok(pid) => return Ok(pid), + Ok(pid) => Ok(pid), // For now, we decide to only fallback on ENOSYS Err(CloneError::Clone(nix::Error::ENOSYS)) => { tracing::debug!("clone3 is not supported, fallback to clone"); @@ -68,7 +68,7 @@ fn clone_internal( Ok(pid) } - Err(err) => return Err(err), + Err(err) => Err(err), } } @@ -112,17 +112,15 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result { - return Err(CloneError::Clone(nix::Error::last())); - } + -1 => Err(CloneError::Clone(nix::Error::last())), 0 => { // Inside the cloned process, we execute the callback and exit with // the return code. std::process::exit(cb()); } - ret if ret >= 0 => return Ok(Pid::from_raw(ret as i32)), - ret => return Err(CloneError::UnknownErrno(ret as i32)), - }; + ret if ret >= 0 => Ok(Pid::from_raw(ret as i32)), + ret => Err(CloneError::UnknownErrno(ret as i32)), + } } fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result { diff --git a/crates/libcontainer/src/syscall/syscall.rs b/crates/libcontainer/src/syscall/syscall.rs index 3d3e38360..106bde0df 100644 --- a/crates/libcontainer/src/syscall/syscall.rs +++ b/crates/libcontainer/src/syscall/syscall.rs @@ -76,7 +76,7 @@ impl SyscallType { pub fn create_syscall(&self) -> Box { match self { SyscallType::Linux => Box::new(LinuxSyscall), - SyscallType::Test => Box::new(TestHelperSyscall::default()), + SyscallType::Test => Box::::default(), } } } diff --git a/crates/libcontainer/src/workload/default.rs b/crates/libcontainer/src/workload/default.rs index f6c4b6161..b59a3e6b5 100644 --- a/crates/libcontainer/src/workload/default.rs +++ b/crates/libcontainer/src/workload/default.rs @@ -6,7 +6,7 @@ use oci_spec::runtime::Spec; use super::{Executor, ExecutorError, EMPTY}; pub fn get_executor() -> Executor { - return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { + Box::new(|spec: &Spec| -> Result<(), ExecutorError> { tracing::debug!("executing workload with default handler"); let args = spec .process() @@ -36,5 +36,5 @@ pub fn get_executor() -> Executor { // After execvp is called, the process is replaced with the container // payload through execvp, so it should never reach here. unreachable!(); - }); + }) } diff --git a/crates/youki/src/workload/executor.rs b/crates/youki/src/workload/executor.rs index faccc8316..fbf893936 100644 --- a/crates/youki/src/workload/executor.rs +++ b/crates/youki/src/workload/executor.rs @@ -2,7 +2,7 @@ use libcontainer::workload::{Executor, ExecutorError}; use oci_spec::runtime::Spec; pub fn default_executor() -> Executor { - return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { + Box::new(|spec: &Spec| -> Result<(), ExecutorError> { #[cfg(feature = "wasm-wasmer")] match super::wasmer::get_executor()(spec) { Ok(_) => return Ok(()), @@ -22,6 +22,8 @@ pub fn default_executor() -> Executor { Err(err) => return Err(err), } + // Leave the default executor as the last option, which executes normal + // container workloads. libcontainer::workload::default::get_executor()(spec) - }); + }) } diff --git a/crates/youki/src/workload/wasmedge.rs b/crates/youki/src/workload/wasmedge.rs index 536fc15cd..eb813e3a6 100644 --- a/crates/youki/src/workload/wasmedge.rs +++ b/crates/youki/src/workload/wasmedge.rs @@ -9,7 +9,7 @@ use libcontainer::workload::{Executor, ExecutorError}; const EXECUTOR_NAME: &str = "wasmedge"; pub fn get_executor() -> Executor { - return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { + Box::new(|spec: &Spec| -> Result<(), ExecutorError> { if !can_handle(spec) { return Err(ExecutorError::CantHandle(EXECUTOR_NAME)); } @@ -58,7 +58,7 @@ pub fn get_executor() -> Executor { .map_err(|err| ExecutorError::Execution(err))?; Ok(()) - }); + }) } fn can_handle(spec: &Spec) -> bool { diff --git a/crates/youki/src/workload/wasmer.rs b/crates/youki/src/workload/wasmer.rs index 417d7db49..1f0a4b824 100644 --- a/crates/youki/src/workload/wasmer.rs +++ b/crates/youki/src/workload/wasmer.rs @@ -7,7 +7,7 @@ use libcontainer::workload::{Executor, ExecutorError, EMPTY}; const EXECUTOR_NAME: &str = "wasmer"; pub fn get_executor() -> Executor { - return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { + Box::new(|spec: &Spec| -> Result<(), ExecutorError> { if !can_handle(spec) { return Err(ExecutorError::CantHandle(EXECUTOR_NAME)); } @@ -76,7 +76,7 @@ pub fn get_executor() -> Executor { wasi_env.cleanup(&mut store, None); Ok(()) - }); + }) } fn can_handle(spec: &Spec) -> bool { diff --git a/crates/youki/src/workload/wasmtime.rs b/crates/youki/src/workload/wasmtime.rs index 508a0078b..6834ea617 100644 --- a/crates/youki/src/workload/wasmtime.rs +++ b/crates/youki/src/workload/wasmtime.rs @@ -7,7 +7,7 @@ use libcontainer::workload::{Executor, ExecutorError, EMPTY}; const EXECUTOR_NAME: &str = "wasmtime"; pub fn get_executor() -> Executor { - return Box::new(|spec: &Spec| -> Result<(), ExecutorError> { + Box::new(|spec: &Spec| -> Result<(), ExecutorError> { if !can_handle(spec) { return Err(ExecutorError::CantHandle(EXECUTOR_NAME)); } @@ -86,7 +86,7 @@ pub fn get_executor() -> Executor { start .call(&mut store, &[], &mut []) .map_err(|err| ExecutorError::Execution(err.into())) - }); + }) } fn can_handle(spec: &Spec) -> bool { From db3c1798915d1da8b39d5349745ee44c250f9733 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 01:18:38 -0700 Subject: [PATCH 15/21] fix feature test Signed-off-by: yihuaf --- crates/libcontainer/src/process/container_main_process.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index d192b9aff..38f7c0d60 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -82,7 +82,10 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo })?; let (mut inter_sender, inter_receiver) = inter_chan; + #[cfg(feature = "libseccomp")] let (mut init_sender, init_receiver) = init_chan; + #[cfg(not(feature = "libseccomp"))] + let (init_sender, init_receiver) = init_chan; // If creating a rootless container, the intermediate process will ask // the main process to set up uid and gid mapping, once the intermediate From d07801a1de7239351bb215682b088157b84329cf Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 01:18:46 -0700 Subject: [PATCH 16/21] fix Signed-off-by: yihuaf --- crates/libcontainer/src/notify_socket.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/libcontainer/src/notify_socket.rs b/crates/libcontainer/src/notify_socket.rs index 9daf16b89..61764cda0 100644 --- a/crates/libcontainer/src/notify_socket.rs +++ b/crates/libcontainer/src/notify_socket.rs @@ -9,7 +9,7 @@ pub const NOTIFY_FILE: &str = "notify.sock"; #[derive(Debug, thiserror::Error)] pub enum NotifyListenerError { - #[error("failed to chdir while creating notify socket")] + #[error("failed to chdir while creating notify socket: {path}")] Chdir { source: nix::Error, path: PathBuf }, #[error("invalid path: {0}")] InvalidPath(PathBuf), From 5be4fc83d63198c208eeb7183a80dbedae6d48be Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 01:57:44 -0700 Subject: [PATCH 17/21] fix notify listener Signed-off-by: yihuaf --- crates/libcontainer/src/container/builder_impl.rs | 11 ++++++++++- crates/libcontainer/src/notify_socket.rs | 14 +++++++++++++- crates/libcontainer/src/process/args.rs | 3 ++- .../src/process/container_init_process.rs | 6 +----- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index fd1c1f64d..d4b7dde64 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -123,6 +123,15 @@ impl ContainerBuilderImpl { prctl::set_dumpable(false).unwrap(); } + // Need to create the notify socket before we pivot root, since the unix + // domain socket used here is outside of the rootfs of container. During + // exec, need to create the socket before we enter into existing mount + // namespace. We also need to create to socket before entering into the + // user namespace in the case that the path is located in paths only + // root can access. + let notify_listener = + crate::notify_socket::NotifyListener::new(self.notify_path.as_path())?; + // This container_args will be passed to the container processes, // therefore we will have to move all the variable by value. Since self // is a shared reference, we have to clone these variables here. @@ -132,7 +141,7 @@ impl ContainerBuilderImpl { spec: self.spec.to_owned(), rootfs: self.rootfs.to_owned(), console_socket: self.console_socket, - notify_socket_path: self.notify_path.to_owned(), + notify_socket: notify_listener, preserve_fds: self.preserve_fds, container: self.container.to_owned(), rootless: self.rootless.to_owned(), diff --git a/crates/libcontainer/src/notify_socket.rs b/crates/libcontainer/src/notify_socket.rs index 61764cda0..004027532 100644 --- a/crates/libcontainer/src/notify_socket.rs +++ b/crates/libcontainer/src/notify_socket.rs @@ -1,6 +1,7 @@ use nix::unistd::{self, close}; use std::env; use std::io::prelude::*; +use std::os::fd::FromRawFd; use std::os::unix::io::AsRawFd; use std::os::unix::net::{UnixListener, UnixStream}; use std::path::{Path, PathBuf}; @@ -9,7 +10,7 @@ pub const NOTIFY_FILE: &str = "notify.sock"; #[derive(Debug, thiserror::Error)] pub enum NotifyListenerError { - #[error("failed to chdir while creating notify socket: {path}")] + #[error("failed to chdir {path} while creating notify socket: {source}")] Chdir { source: nix::Error, path: PathBuf }, #[error("invalid path: {0}")] InvalidPath(PathBuf), @@ -43,6 +44,7 @@ pub struct NotifyListener { impl NotifyListener { pub fn new(socket_path: &Path) -> Result { + tracing::debug!(?socket_path, "create notify listener"); // Unix domain socket has a maximum length of 108, different from // normal path length of 255. Due to how docker create the path name // to the container working directory, there is a high chance that @@ -56,6 +58,7 @@ impl NotifyListener { .file_name() .ok_or_else(|| NotifyListenerError::InvalidPath(socket_path.to_owned()))?; let cwd = env::current_dir().map_err(NotifyListenerError::GetCwd)?; + tracing::debug!(?cwd, "the cwd to create the notify socket"); unistd::chdir(workdir).map_err(|e| NotifyListenerError::Chdir { source: e, path: workdir.to_owned(), @@ -93,6 +96,15 @@ impl NotifyListener { } } +impl Clone for NotifyListener { + fn clone(&self) -> Self { + let fd = self.socket.as_raw_fd(); + // This is safe because we just duplicate a valid fd. + let socket = unsafe { UnixListener::from_raw_fd(fd) }; + Self { socket } + } +} + pub struct NotifySocket { path: PathBuf, } diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index 16919bad8..56f3b149a 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -4,6 +4,7 @@ use std::os::unix::prelude::RawFd; use std::path::PathBuf; use crate::container::Container; +use crate::notify_socket::NotifyListener; use crate::rootless::Rootless; use crate::syscall::syscall::SyscallType; use crate::workload::Executor; @@ -27,7 +28,7 @@ pub struct ContainerArgs { /// Socket to communicate the file descriptor of the ptty pub console_socket: Option, /// The Unix Domain Socket to communicate container start - pub notify_socket_path: PathBuf, + pub notify_socket: NotifyListener, /// File descriptors preserved/passed to the container init process. pub preserve_fds: i32, /// Container state diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 915a3698f..ce01635d2 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -341,11 +341,7 @@ pub fn container_init_process( let hooks = spec.hooks().as_ref(); let container = args.container.as_ref(); let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; - // Need to create the notify socket before we pivot root, since the unix - // domain socket used here is outside of the rootfs of container. During - // exec, need to create the socket before we enter into existing mount - // namespace. - let notify_listener = notify_socket::NotifyListener::new(args.notify_socket_path.as_ref())?; + let notify_listener = &args.notify_socket; setsid().map_err(|err| { tracing::error!(?err, "failed to setsid to create a session"); From 7cb9e478b09968aa8e86e302a8e45b25c11923a6 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 10:38:16 -0700 Subject: [PATCH 18/21] rebase Signed-off-by: yihuaf --- crates/youki/src/workload/executor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/youki/src/workload/executor.rs b/crates/youki/src/workload/executor.rs index fbf893936..9565c16b5 100644 --- a/crates/youki/src/workload/executor.rs +++ b/crates/youki/src/workload/executor.rs @@ -1,5 +1,5 @@ +use libcontainer::oci_spec::runtime::Spec; use libcontainer::workload::{Executor, ExecutorError}; -use oci_spec::runtime::Spec; pub fn default_executor() -> Executor { Box::new(|spec: &Spec| -> Result<(), ExecutorError> { From e4899ad31bee1628c32d82c28988bcd6a9f117c0 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 10:51:59 -0700 Subject: [PATCH 19/21] address review Signed-off-by: yihuaf --- crates/libcontainer/src/container/builder.rs | 4 ++-- crates/libcontainer/src/container/builder_impl.rs | 2 +- crates/libcontainer/src/process/args.rs | 2 +- crates/libcontainer/src/process/container_init_process.rs | 2 +- crates/youki/src/commands/create.rs | 2 +- crates/youki/src/commands/exec.rs | 2 +- crates/youki/src/commands/run.rs | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/crates/libcontainer/src/container/builder.rs b/crates/libcontainer/src/container/builder.rs index 5667c08a0..c568146a2 100644 --- a/crates/libcontainer/src/container/builder.rs +++ b/crates/libcontainer/src/container/builder.rs @@ -252,9 +252,9 @@ impl ContainerBuilder { /// ) /// .with_executor(get_executor()); /// ``` - pub fn with_executor(mut self, executor: Executor) -> Result { + pub fn with_executor(mut self, executor: Executor) -> Self { self.executor = executor; - Ok(self) + self } } diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index d4b7dde64..787b1cdf0 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -141,7 +141,7 @@ impl ContainerBuilderImpl { spec: self.spec.to_owned(), rootfs: self.rootfs.to_owned(), console_socket: self.console_socket, - notify_socket: notify_listener, + notify_listener, preserve_fds: self.preserve_fds, container: self.container.to_owned(), rootless: self.rootless.to_owned(), diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index 56f3b149a..51530d2cd 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -28,7 +28,7 @@ pub struct ContainerArgs { /// Socket to communicate the file descriptor of the ptty pub console_socket: Option, /// The Unix Domain Socket to communicate container start - pub notify_socket: NotifyListener, + pub notify_listener: NotifyListener, /// File descriptors preserved/passed to the container init process. pub preserve_fds: i32, /// Container state diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index ce01635d2..51086097c 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -341,7 +341,7 @@ pub fn container_init_process( let hooks = spec.hooks().as_ref(); let container = args.container.as_ref(); let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?; - let notify_listener = &args.notify_socket; + let notify_listener = &args.notify_listener; setsid().map_err(|err| { tracing::error!(?err, "failed to setsid to create a session"); diff --git a/crates/youki/src/commands/create.rs b/crates/youki/src/commands/create.rs index b37410746..009a7854d 100644 --- a/crates/youki/src/commands/create.rs +++ b/crates/youki/src/commands/create.rs @@ -14,7 +14,7 @@ use crate::workload::executor::default_executor; // associated with it like any other process. pub fn create(args: Create, root_path: PathBuf, systemd_cgroup: bool) -> Result<()> { ContainerBuilder::new(args.container_id.clone(), SyscallType::default()) - .with_executor(default_executor())? + .with_executor(default_executor()) .with_pid_file(args.pid_file.as_ref())? .with_console_socket(args.console_socket.as_ref()) .with_root_path(root_path)? diff --git a/crates/youki/src/commands/exec.rs b/crates/youki/src/commands/exec.rs index 4d244e737..859e88a76 100644 --- a/crates/youki/src/commands/exec.rs +++ b/crates/youki/src/commands/exec.rs @@ -9,7 +9,7 @@ use crate::workload::executor::default_executor; pub fn exec(args: Exec, root_path: PathBuf) -> Result { let pid = ContainerBuilder::new(args.container_id.clone(), SyscallType::default()) - .with_executor(default_executor())? + .with_executor(default_executor()) .with_root_path(root_path)? .with_console_socket(args.console_socket.as_ref()) .with_pid_file(args.pid_file.as_ref())? diff --git a/crates/youki/src/commands/run.rs b/crates/youki/src/commands/run.rs index 75c08d0ae..5f0e33124 100644 --- a/crates/youki/src/commands/run.rs +++ b/crates/youki/src/commands/run.rs @@ -16,7 +16,7 @@ use crate::workload::executor::default_executor; pub fn run(args: Run, root_path: PathBuf, systemd_cgroup: bool) -> Result { let mut container = ContainerBuilder::new(args.container_id.clone(), SyscallType::default()) - .with_executor(default_executor())? + .with_executor(default_executor()) .with_pid_file(args.pid_file.as_ref())? .with_console_socket(args.console_socket.as_ref()) .with_root_path(root_path)? From 5b3a931069f504d86b84e7fc465a9bfeb82a0ba5 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 20:00:33 -0700 Subject: [PATCH 20/21] fix Signed-off-by: yihuaf --- scripts/musl_test.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/musl_test.sh b/scripts/musl_test.sh index 7e1dcfdd4..3007b097c 100755 --- a/scripts/musl_test.sh +++ b/scripts/musl_test.sh @@ -6,13 +6,13 @@ test_musl() { cargo +nightly build \ -Zbuild-std \ --target $(uname -m)-unknown-linux-musl \ - --package libcontainer \ - --no-default-features -F v2 + --package "$1" \ + --no-default-features -F "$2" cargo +nightly test \ -Zbuild-std \ --target $(uname -m)-unknown-linux-musl \ - --package libcontainer \ - --no-default-features -F v2 + --package "$1" \ + --no-default-features -F "$2" } test_musl "libcontainer" "v1" From 64c81337fd36a34a2222bf59b1428a99ef157efc Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 4 Jul 2023 20:51:46 -0700 Subject: [PATCH 21/21] fix Signed-off-by: yihuaf --- crates/libcontainer/src/process/fork.rs | 41 +++++++++++++++---------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index 7981784ea..114a22440 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -25,11 +25,14 @@ pub enum CloneError { } /// The callback function used in clone system call. The return value is i32 -/// which is consistent with C functions return code. The closure is boxed -/// because we will have to correctly transfer the ownership of the closure -/// memory on the heap to the new process. The trait has to be `FnMut` because -/// we need to be able to call the closure multiple times, once in clone3 and -/// once in clone if fallback is required. +/// which is consistent with C functions return code. The trait has to be +/// `FnMut` because we need to be able to call the closure multiple times, once +/// in clone3 and once in clone if fallback is required. The closure is boxed +/// because we need to store the closure on heap, not stack in the case of +/// `clone`. Unlike `fork` or `clone3`, the `clone` glibc wrapper requires us to +/// pass in a child stack, which is empty. By storing the closure in heap, we +/// can then in the new process to re-box the heap memory back to a closure +/// correctly. pub type CloneCb = Box i32>; // Clone a sibling process that shares the same parent as the calling @@ -196,9 +199,9 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result libc::c_int { unsafe { Box::from_raw(data as *mut CloneCb)() } } @@ -213,22 +216,26 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result { - // Since the clone call failed, the closure passed in didn't get - // consumed. To complete the circle, we can safely box up the - // closure again and let rust manage this memory for us. - unsafe { drop(Box::from_raw(data)) }; - Err(CloneError::Clone(nix::Error::last())) - } - pid => Ok(Pid::from_raw(pid)), + }; + + // After the clone returns, the heap memory associated with the Box closure + // is duplicated in the cloned process. Therefore, we can safely re-box the + // closure from the raw pointer and let rust to continue managing the + // memory. We call drop here explicitly to avoid the warning that the + // closure is not used. This is correct since the closure is called in the + // cloned process, not the parent process. + unsafe { drop(Box::from_raw(data)) }; + match ret { + -1 => Err(CloneError::Clone(nix::Error::last())), + pid if ret > 0 => Ok(Pid::from_raw(pid)), + _ => unreachable!("clone returned a negative pid {ret}"), } }