diff --git a/Cargo.toml b/Cargo.toml index 9e14abf..d1b01c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,6 @@ [workspace] members = [ "lockc", + "lockc-uprobes", "xtask", ] diff --git a/contrib/etc/lockc/lockc.toml b/contrib/etc/lockc/lockc.toml index c993ea2..4e29596 100644 --- a/contrib/etc/lockc/lockc.toml +++ b/contrib/etc/lockc/lockc.toml @@ -16,6 +16,8 @@ allowed_paths_mount_restricted = [ "/var/run/container", # Storage directory used by CRI containerd. "/run/containerd/io.containerd.runtime.v1.linux", + # Storage directory used by CRI containerd. + "/run/containerd/io.containerd.runtime.v2.task", # Data directory used by docker. "/var/lib/docker/containers", # Sandbox directory used by containerd. @@ -74,6 +76,54 @@ allowed_paths_mount_restricted = [ "/sys/fs/cgroup/systemd/kubepods.slice", # Cgroup v2 hierarchy (used by systemd) for kubelet. "/sys/fs/cgroup/unified/kubepods.slice", + # Block I/O controller for kubelet. + "/sys/fs/cgroup/blkio/kubepods-besteffort", + # CPU accounting controller for kubelet. + "/sys/fs/cgroup/cpu,cpuacct/kubepods-besteffort", + # Cpusets for libpod for kubelet. + "/sys/fs/cgroup/cpuset/kubepods-besteffort", + # Device allowlist controller for kubelet. + "/sys/fs/cgroup/devices/kubepods-besteffort", + # Cgroup freezer for kubelet. + "/sys/fs/cgroup/freezer/kubepods-besteffort", + # HugeTLB controller for kubelet. + "/sys/fs/cgroup/hugetlb/kubepods-besteffort", + # Memory controller for kubelet. + "/sys/fs/cgroup/memory/kubepods-besteffort", + # Network classifier and priority controller for kubelet. + "/sys/fs/cgroup/net_cls,net_prio/kubepods-besteffort", + # Perf event controller for kubelet. + "/sys/fs/cgroup/perf_event/kubepods-besteffort", + # Process number controller for kubelet. + "/sys/fs/cgroup/pids/kubepods-besteffort", + # Cgroup v1 hierarchy (used by systemd) for kubelet. + "/sys/fs/cgroup/systemd/kubepods-besteffort", + # Cgroup v2 hierarchy (used by systemd) for kubelet. 
+ "/sys/fs/cgroup/unified/kubepods-besteffort", + # Block I/O controller for containerd. + "/sys/fs/cgroup/blkio/system.slice/containerd.service", + # CPU accounting controller for containerd. + "/sys/fs/cgroup/cpu,cpuacct/system.slice/containerd.service", + # Cpusets for libpod for containerd. + "/sys/fs/cgroup/cpuset/system.slice/containerd.service", + # Device allowlist controller for containerd. + "/sys/fs/cgroup/devices/system.slice/containerd.service", + # Cgroup freezer for containerd. + "/sys/fs/cgroup/freezer/system.slice/containerd.service", + # HugeTLB controller for containerd. + "/sys/fs/cgroup/hugetlb/system.slice/containerd.service", + # Memory controller for containerd. + "/sys/fs/cgroup/memory/system.slice/containerd.service", + # Network classifier and priority controller for containerd. + "/sys/fs/cgroup/net_cls,net_prio/system.slice/containerd.service", + # Perf event controller for containerd. + "/sys/fs/cgroup/perf_event/system.slice/containerd.service", + # Process number controller for containerd. + "/sys/fs/cgroup/pids/system.slice/containerd.service", + # Cgroup v1 hierarchy (used by systemd) for containerd. + "/sys/fs/cgroup/systemd/system.slice/containerd.service", + # Cgroup v2 hierarchy (used by systemd) for containerd. + "/sys/fs/cgroup/unified/system.slice/containerd.service", # Block I/O controller for docker. "/sys/fs/cgroup/blkio/docker", # CPU accounting controller for docker. @@ -121,6 +171,8 @@ allowed_paths_mount_baseline = [ "/var/run/container", # Storage directory used by CRI containerd. "/run/containerd/io.containerd.runtime.v1.linux", + # Storage directory used by CRI containerd. + "/run/containerd/io.containerd.runtime.v2.task", # Data directory used by docker. "/var/lib/docker/containers", # Sandbox directory used by containerd. @@ -179,6 +231,54 @@ allowed_paths_mount_baseline = [ "/sys/fs/cgroup/systemd/kubepods.slice", # Cgroup v2 hierarchy (used by systemd) for kubelet. 
"/sys/fs/cgroup/unified/kubepods.slice", + # Block I/O controller for kubelet. + "/sys/fs/cgroup/blkio/kubepods-besteffort", + # CPU accounting controller for kubelet. + "/sys/fs/cgroup/cpu,cpuacct/kubepods-besteffort", + # Cpusets for libpod for kubelet. + "/sys/fs/cgroup/cpuset/kubepods-besteffort", + # Device allowlist controller for kubelet. + "/sys/fs/cgroup/devices/kubepods-besteffort", + # Cgroup freezer for kubelet. + "/sys/fs/cgroup/freezer/kubepods-besteffort", + # HugeTLB controller for kubelet. + "/sys/fs/cgroup/hugetlb/kubepods-besteffort", + # Memory controller for kubelet. + "/sys/fs/cgroup/memory/kubepods-besteffort", + # Network classifier and priority controller for kubelet. + "/sys/fs/cgroup/net_cls,net_prio/kubepods-besteffort", + # Perf event controller for kubelet. + "/sys/fs/cgroup/perf_event/kubepods-besteffort", + # Process number controller for kubelet. + "/sys/fs/cgroup/pids/kubepods-besteffort", + # Cgroup v1 hierarchy (used by systemd) for kubelet. + "/sys/fs/cgroup/systemd/kubepods-besteffort", + # Cgroup v2 hierarchy (used by systemd) for kubelet. + "/sys/fs/cgroup/unified/kubepods-besteffort", + # Block I/O controller for containerd. + "/sys/fs/cgroup/blkio/system.slice/containerd.service", + # CPU accounting controller for containerd. + "/sys/fs/cgroup/cpu,cpuacct/system.slice/containerd.service", + # Cpusets for libpod for containerd. + "/sys/fs/cgroup/cpuset/system.slice/containerd.service", + # Device allowlist controller for containerd. + "/sys/fs/cgroup/devices/system.slice/containerd.service", + # Cgroup freezer for containerd. + "/sys/fs/cgroup/freezer/system.slice/containerd.service", + # HugeTLB controller for containerd. + "/sys/fs/cgroup/hugetlb/system.slice/containerd.service", + # Memory controller for containerd. + "/sys/fs/cgroup/memory/system.slice/containerd.service", + # Network classifier and priority controller for containerd. 
+ "/sys/fs/cgroup/net_cls,net_prio/system.slice/containerd.service", + # Perf event controller for containerd. + "/sys/fs/cgroup/perf_event/system.slice/containerd.service", + # Process number controller for containerd. + "/sys/fs/cgroup/pids/system.slice/containerd.service", + # Cgroup v1 hierarchy (used by systemd) for containerd. + "/sys/fs/cgroup/systemd/system.slice/containerd.service", + # Cgroup v2 hierarchy (used by systemd) for containerd. + "/sys/fs/cgroup/unified/system.slice/containerd.service", # Block I/O controller for docker. "/sys/fs/cgroup/blkio/docker", # CPU accounting controller for docker. @@ -213,6 +313,15 @@ allowed_paths_mount_baseline = [ ] allowed_paths_access_restricted = [ + "cgroup:", + "ipc:", + "mnt:", + "net:", + "pid:", + "pipe:", + "time:", + "user:", + "uts:", "/bin", "/dev/console", "/dev/full", @@ -224,14 +333,27 @@ allowed_paths_access_restricted = [ "/etc", "/home", "/lib", + "/lib64", + "/pause", "/proc", + "/run", "/sys/fs/cgroup", + "/sys/kernel/mm", "/tmp", "/usr", "/var", ] allowed_paths_access_baseline = [ + "cgroup:", + "ipc:", + "mnt:", + "net:", + "pid:", + "pipe:", + "time:", + "user:", + "uts:", "/bin", "/dev/console", "/dev/full", @@ -243,8 +365,12 @@ allowed_paths_access_baseline = [ "/etc", "/home", "/lib", + "/lib64", + "/pause", "/proc", + "/run", "/sys/fs/cgroup", + "/sys/kernel/mm", "/tmp", "/usr", "/var", @@ -252,9 +378,9 @@ allowed_paths_access_baseline = [ denied_paths_access_restricted = [ "/proc/acpi", + "/proc/sys", ] denied_paths_access_baseline = [ "/proc/acpi", - "/proc/sys", ] diff --git a/contrib/guestfs/build.sh b/contrib/guestfs/build.sh index f76ce6e..04785f8 100755 --- a/contrib/guestfs/build.sh +++ b/contrib/guestfs/build.sh @@ -37,10 +37,6 @@ set -eux virt-customize -a \ ${LOCKC_IMAGE} \ - --mkdir /etc/containerd \ - --mkdir /etc/docker \ - --copy-in provision/etc/containerd/config.toml:/etc/containerd/ \ - --copy-in provision/etc/docker/daemon.json:/etc/docker/ \ --copy-in 
provision/etc/modules-load.d/99-k8s.conf:/etc/modules-load.d/ \ --copy-in provision/etc/sysctl.d/99-k8s.conf:/etc/sysctl.d/ \ --copy-in provision/systemd/containerd.service:/etc/systemd/system/ \ diff --git a/contrib/guestfs/provision/etc/containerd/config.toml b/contrib/guestfs/provision/etc/containerd/config.toml deleted file mode 100644 index e80a669..0000000 --- a/contrib/guestfs/provision/etc/containerd/config.toml +++ /dev/null @@ -1,69 +0,0 @@ -root = "/var/lib/containerd" -state = "/run/containerd" -oom_score = 0 - -[grpc] - address = "/run/containerd/containerd.sock" - uid = 0 - gid = 0 - max_recv_message_size = 16777216 - max_send_message_size = 16777216 - -[debug] - address = "" - uid = 0 - gid = 0 - level = "" - -[metrics] - address = "" - grpc_histogram = false - -[cgroup] - path = "" - -[plugins] - [plugins.cgroups] - no_prometheus = false - [plugins.cri] - stream_server_address = "" - stream_server_port = "10010" - enable_selinux = false - sandbox_image = "k8s.gcr.io/pause:latest" - stats_collect_period = 10 - systemd_cgroup = true - enable_tls_streaming = false - max_container_log_line_size = 16384 - [plugins.cri.containerd] - snapshotter = "overlayfs" - no_pivot = true - [plugins.cri.containerd.default_runtime] - runtime_type = "io.containerd.runtime.v1.linux" - runtime_engine = "" - runtime_root = "" - [plugins.cri.containerd.untrusted_workload_runtime] - runtime_type = "" - runtime_engine = "" - runtime_root = "" - [plugins.cri.cni] - bin_dir = "/opt/cni/bin" - conf_dir = "/etc/cni/net.d" - conf_template = "" - [plugins.cri.registry] - [plugins.cri.registry.mirrors] - [plugins.cri.registry.mirrors."docker.io"] - endpoint = ["https://registry-1.docker.io"] - [plugins.diff-service] - default = ["walking"] - [plugins.linux] - shim = "containerd-shim" - runtime = "/usr/local/bin/lockc-runc-wrapper" - runtime_root = "" - no_shim = false - shim_debug = false - [plugins.scheduler] - pause_threshold = 0.02 - deletion_threshold = 0 - mutation_threshold = 
100 - schedule_delay = "0s" - startup_delay = "100ms" diff --git a/contrib/guestfs/provision/etc/docker/daemon.json b/contrib/guestfs/provision/etc/docker/daemon.json deleted file mode 100644 index 5cfa689..0000000 --- a/contrib/guestfs/provision/etc/docker/daemon.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "log-level": "warn", - "log-driver": "json-file", - "log-opts": { - "max-size": "10m", - "max-file": "5" - }, - "default-runtime": "runc-lockc", - "runtimes": { - "runc-lockc": { - "path": "/usr/local/bin/lockc-runc-wrapper" - } - } -} diff --git a/contrib/guestfs/provision/provision.sh b/contrib/guestfs/provision/provision.sh index 2986183..bed178c 100755 --- a/contrib/guestfs/provision/provision.sh +++ b/contrib/guestfs/provision/provision.sh @@ -84,8 +84,6 @@ EOF ### Rebuild initrd with dracut mkinitrd -mv /etc/containerd/config.toml.rpmorig /etc/containerd/config.toml - systemctl enable containerd systemctl enable docker diff --git a/contrib/systemd/lockcd.service.in b/contrib/systemd/lockcd.service.in index 85cbfb2..f3b7082 100644 --- a/contrib/systemd/lockcd.service.in +++ b/contrib/systemd/lockcd.service.in @@ -3,7 +3,9 @@ Description=lockc daemon After=network-online.target [Service] -Type=oneshot +Type=simple +Restart=always +RestartSec=1 ExecStart={{ bindir }}/lockcd StandardOutput=journal diff --git a/docs/src/architecture.md b/docs/src/architecture.md index 1e9a09a..8ac3395 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -11,7 +11,7 @@ The project consists of 3 parts: policies per container/pod is yet to be implemented) - **lockcd** - the userspace program (written in Rust) - loads the BPF programs into the kernel, pins them in BPFFS + - monitors runc processes, registers new containers and determines which + policy should be applied on a container - in future, it's going to serve as the configuration manager and log collector -- **lockc-runc-wrapper** - a wraper for runc which registers new containers - and determines which 
policy should be applied on a container diff --git a/docs/src/policies/mount.md b/docs/src/policies/mount.md index b8270ce..8ffbc57 100644 --- a/docs/src/policies/mount.md +++ b/docs/src/policies/mount.md @@ -20,6 +20,6 @@ By default, with the **baseline** policy level, this is a good example of not allowed behavior: ```bash -# podman --runtime $(pwd)/build/src/lockc-runc-wrapper run -ti -v /:/rootfs --rm registry.opensuse.org/opensuse/toolbox:latest +# docker run -ti -v /:/rootfs --rm registry.opensuse.org/opensuse/toolbox:latest Error: container create failed (no logs from conmon): EOF ``` diff --git a/docs/src/policies/syslog.md b/docs/src/policies/syslog.md index 1bfbb0b..b6112d4 100644 --- a/docs/src/policies/syslog.md +++ b/docs/src/policies/syslog.md @@ -11,7 +11,7 @@ By default, with the **baseline** policy level, checking the kernel logs from the container is not allowed: ```bash -# podman --runtime $(pwd)/build/src/lockc-runc-wrapper run -ti --rm registry.opensuse.org/opensuse/toolbox:latest +# docker run -it --rm registry.opensuse.org/opensuse/toolbox:latest b10f9fa4a385:/ # dmesg dmesg: read kernel buffer failed: Operation not permitted ``` diff --git a/docs/src/use/README.md b/docs/src/use/README.md index dcead78..47b15b8 100644 --- a/docs/src/use/README.md +++ b/docs/src/use/README.md @@ -32,8 +32,9 @@ sudo bpftool prog btf_id 18711 ``` -To check if containers get "hardened" by lockc, check if you are able to -see the kernel logs from inside the container wrapped by **lockc-runc-wrapper**. +To check if containers get "hardened" by lockc, check if you are able to see +parts of container filesystem which are restricted by lockc. 
+ Example: ```bash diff --git a/lockc-uprobes/Cargo.toml b/lockc-uprobes/Cargo.toml new file mode 100644 index 0000000..106169e --- /dev/null +++ b/lockc-uprobes/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "lockc-uprobes" +version = "0.1.0" +edition = "2021" + +license = "Apache-2.0" + +[dependencies] +libc = "0.2" diff --git a/lockc-uprobes/src/lib.rs b/lockc-uprobes/src/lib.rs new file mode 100644 index 0000000..48e691a --- /dev/null +++ b/lockc-uprobes/src/lib.rs @@ -0,0 +1,13 @@ +use libc::pid_t; + +#[no_mangle] +#[inline(never)] +pub extern "C" fn add_container(_retp: *mut i32, _container_id: u32, _pid: pid_t, _policy: i32) {} + +#[no_mangle] +#[inline(never)] +pub extern "C" fn delete_container(_retp: *mut i32, _container_id: u32) {} + +#[no_mangle] +#[inline(never)] +pub extern "C" fn add_process(_retp: *mut i32, _container_id: u32, _pid: pid_t) {} diff --git a/lockc/Cargo.toml b/lockc/Cargo.toml index 28aa6c6..1e9a9e3 100644 --- a/lockc/Cargo.toml +++ b/lockc/Cargo.toml @@ -22,25 +22,32 @@ bindgen = "0.59" byteorder = "1.4" chrono = { version = "0.4", default-features = false, features = ["clock"] } config = { version = "0.11", default-features = false, features = ["toml"] } +ctrlc = "3.2" dirs = "4.0" +fanotify-rs = { git = "https://github.com/vadorovsky/fanotify-rs", branch = "fix-pid-type" } futures = "0.3" +goblin = "0.4" kube = "0.63" k8s-openapi = { version = "0.13", default-features = false, features = ["v1_21"] } lazy_static = "1.4" libc = { version = "0.2", features = [ "extra_traits" ] } libbpf-rs = "0.14" +lockc-uprobes = { path = "../lockc-uprobes" } log = "0.4" log4rs = "1.0" nix = "0.23" plain = "0.2" procfs = "0.11" regex = { version = "1.5", default-features = false, features = ["perf"] } +scopeguard = "1.1" serde = "1.0" serde_json = "1.0" +simplelog = "0.11" sysctl = "0.4" thiserror = "1.0" tokio = { version = "1.7", features = ["macros", "process", "rt-multi-thread"] } uuid = { version = "0.8", default-features = false, features = 
["v4"] } +which = "4.2" [build-dependencies] anyhow = "1.0" diff --git a/lockc/src/bin/lockc-runc-wrapper.rs b/lockc/src/bin/lockc-runc-wrapper.rs deleted file mode 100644 index ca9bb3a..0000000 --- a/lockc/src/bin/lockc-runc-wrapper.rs +++ /dev/null @@ -1,409 +0,0 @@ -use std::{convert::TryFrom, fs, io, path}; - -use k8s_openapi::api::core::v1; -use log::{info, LevelFilter, SetLoggerError}; -use log4rs::append::file::FileAppender; -use log4rs::config::{runtime::ConfigErrors, Appender, Config, Root}; -use uuid::Uuid; - -// TODO: To be used for cri-o. -// static ANNOTATION_K8S_LABELS: &str = "io.kubernetes.cri-o.Labels"; - -// static LABEL_NAMESPACE: &str = "io.kubernetes.pod.namespace"; -static LABEL_POLICY_ENFORCE: &str = "pod-security.kubernetes.io/enforce"; -// static LABEL_POLICY_AUDIT: &str = "pod-security.kubernetes.io/audit"; -// static LABEL_POLICY_WARN: &str = "pod-security.kubernetes.io/warn"; - -static ANNOTATION_CONTAINERD_LOG_DIRECTORY: &str = "io.kubernetes.cri.sandbox-log-directory"; -static ANNOTATION_CONTAINERD_SANDBOX_ID: &str = "io.kubernetes.cri.sandbox-id"; - -#[derive(thiserror::Error, Debug)] -enum ContainerError { - #[error("could not retrieve the runc status")] - Status(#[from] std::io::Error), - - #[error("could not format")] - Format(#[from] std::fmt::Error), - - #[error("could not convert bytes to utf-8 string")] - Utf8(#[from] std::string::FromUtf8Error), - - #[error("could not parse JSON")] - Json(#[from] serde_json::Error), - - #[error("could not find sandbox container bundle directory")] - BundleDirError, -} - -fn container_namespace>( - container_bundle: P, -) -> Result, ContainerError> { - let bundle_path = container_bundle.as_ref(); - let config_path = bundle_path.join("config.json"); - let f = std::fs::File::open(config_path)?; - let r = std::io::BufReader::new(f); - - let config: serde_json::Value = serde_json::from_reader(r)?; - let annotations_o = config["annotations"].as_object(); - - match annotations_o { - Some(annotations) 
=> { - // containerd - if annotations.contains_key(ANNOTATION_CONTAINERD_LOG_DIRECTORY) { - // containerd doesn't expose k8s namespaces directly. They have - // to be parsed from the log directory path, where the first - // part of the filename is the namespace. - let log_directory = annotations[ANNOTATION_CONTAINERD_LOG_DIRECTORY] - .as_str() - .unwrap(); - let log_path = std::path::PathBuf::from(log_directory); - let file_name = log_path.file_name().unwrap().to_str().unwrap(); - let mut splitter = file_name.split('_'); - let namespace = splitter.next().unwrap().to_string(); - - return Ok(Some(namespace)); - } else if annotations.contains_key(ANNOTATION_CONTAINERD_SANDBOX_ID) { - // When a container is running as a part of a previously created - // pod, the log directory path has to be retrieved from the - // sandbox container. - let sandbox_id = annotations[ANNOTATION_CONTAINERD_SANDBOX_ID] - .as_str() - .unwrap(); - - // Go one directory up from the current bundle. - let mut ancestors = bundle_path.ancestors(); - ancestors.next(); - match ancestors.next() { - Some(v) => { - // Then go to sandbox_id directory (sandbox's bundle). - let new_bundle = v.join(sandbox_id); - return container_namespace(new_bundle); - } - None => return Err(ContainerError::BundleDirError), - } - } - Ok(None) - } - None => Ok(None), - } -} - -/// Finds the policy for the given Kubernetes namespace. If none, the baseline -/// policy is returned. Otherwise checks the Kubernetes namespace labels. -async fn policy_label( - namespace_o: Option, -) -> Result { - // Apply the privileged policy for kube-system containers immediately. - // Otherwise the core k8s components (apiserver, scheduler) won't be able - // to run. - // If container has no k8s namespace, apply the baseline policy. 
- let namespace_s = match namespace_o { - Some(v) if v.as_str() == "kube-system" => { - return Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_PRIVILEGED) - } - Some(v) => v, - None => return Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_BASELINE), - }; - - let kubeconfig = - kube::config::Kubeconfig::read_from(std::path::Path::new("/etc/kubernetes/admin.conf"))?; - let options = kube::config::KubeConfigOptions::default(); - let config = kube::config::Config::from_custom_kubeconfig(kubeconfig, &options).await?; - let client = kube::Client::try_from(config)?; - - let namespaces: kube::api::Api = kube::api::Api::all(client); - let namespace = namespaces.get(&namespace_s).await?; - - match namespace.metadata.labels { - Some(v) => match v.get(LABEL_POLICY_ENFORCE) { - Some(v) => match v.as_str() { - "restricted" => { - Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_RESTRICTED) - } - "baseline" => Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_BASELINE), - "privileged" => { - Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_PRIVILEGED) - } - _ => Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_BASELINE), - }, - None => Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_BASELINE), - }, - None => Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_BASELINE), - } -} - -use serde::Deserialize; - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -struct Mount { - destination: String, - r#type: String, - source: String, - options: Vec -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -struct Mounts { - mounts: Vec -} - -fn docker_config>( - container_bundle: P, -) -> Result { - let bundle_path = container_bundle.as_ref(); - let config_path = bundle_path.join("config.json"); - let f = std::fs::File::open(config_path)?; - let r = std::io::BufReader::new(f); - - let m: Mounts = serde_json::from_reader(r)?; - - for test in m.mounts { - let source: Vec<&str> = 
test.source.split('/').collect(); - if source.len() > 1 && source[ source.len() - 1 ] == "hostname" { - let config_v2= str::replace(&test.source, "hostname", "config.v2.json"); - return Ok(std::path::PathBuf::from(config_v2)); - } - } - - Err(ContainerError::BundleDirError) -} - -use serde_json::Value; - -fn docker_label>( - docker_bundle: P, -) -> Result { - let config_path = docker_bundle.as_ref(); - let f = std::fs::File::open(config_path)?; - let r = std::io::BufReader::new(f); - - let l: Value = serde_json::from_reader(r)?; - - let x = l["Config"]["Labels"]["org.lockc.policy"].as_str(); - - match x { - Some(x) => match x { - "restricted" => { - Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_RESTRICTED) - } - "baseline" => Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_BASELINE), - "privileged" => { - Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_PRIVILEGED) - } - _ => Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_BASELINE) - } - None => Ok(lockc::bpfstructs::container_policy_level_POLICY_LEVEL_BASELINE), - } -} - -/// Types of options (prepositioned by `--`). -enum OptParsingAction { - /// Option not followed by a positional argument. - NoPositional, - /// Option followed by a positional argument we don't want to store. - Skip, - /// --bundle option which we want to store. - Bundle, -} - -/// Types of positional arguments. -enum ArgParsingAction { - /// Argument we don't want to store. - None, - /// Container ID which we want to store. - ContainerId, -} - -/// Types of actions performed on the container, defined by a runc subcommand. -enum ContainerAction { - /// Types we don't explicitly handle, except of registering the process as - /// containerized. - Other, - /// Action of creating the container, when we want to register the new - /// container. - Create, - /// Action of deleting the container, when we want to remove the registered - /// container. 
- Delete, - /// Action of starting the container, when we want to detect and apply a - /// policy. - Start, -} - -#[derive(thiserror::Error, Debug)] -enum SetupLoggingError { - #[error(transparent)] - Config(#[from] ConfigErrors), - - #[error(transparent)] - IO(#[from] io::Error), - - #[error(transparent)] - SetLogger(#[from] SetLoggerError), -} - -fn setup_logging() -> Result<(), SetupLoggingError> { - let log_dir = path::Path::new("/var") - .join("log") - .join("lockc-runc-wrapper"); - - fs::create_dir_all(log_dir.clone())?; - let log_file = FileAppender::builder() - .build(log_dir.join(format!("{}.log", Uuid::new_v4()))) - .unwrap(); - let config = Config::builder() - .appender(Appender::builder().build("log_file", Box::new(log_file))) - .build( - Root::builder() - .appender("log_file") - .build(LevelFilter::Info), - )?; - log4rs::init_config(config)?; - - Ok(()) -} - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - setup_logging()?; - - let pid = nix::unistd::getpid(); - let pid_u = u32::try_from(libc::pid_t::from(pid))?; - - let mut opt_parsing_action = OptParsingAction::NoPositional; - let mut arg_parsing_action = ArgParsingAction::None; - let mut container_action = ContainerAction::Other; - - let mut container_bundle_o: Option = None; - let mut container_id_o: Option = None; - - let mut cmd = tokio::process::Command::new("runc"); - for arg in std::env::args().skip(1) { - info!("argument: {}", arg.clone()); - cmd.arg(arg.clone()); - - match arg.as_str() { - // Options which are followed with a positional arguments we don't - // want to store. - "--log" => opt_parsing_action = OptParsingAction::Skip, - "--log-format" => opt_parsing_action = OptParsingAction::Skip, - "--pid-file" => opt_parsing_action = OptParsingAction::Skip, - "--console-socket" => opt_parsing_action = OptParsingAction::Skip, - "--root" => opt_parsing_action = OptParsingAction::Skip, - // We want to explicitly store the value of --bundle and --root - // options. 
- "--bundle" => opt_parsing_action = OptParsingAction::Bundle, - _ => {} - } - if arg.as_str().starts_with('-') { - // After handling the option, start parsing the next argument. - continue; - } - - match opt_parsing_action { - OptParsingAction::NoPositional => {} - OptParsingAction::Skip => { - opt_parsing_action = OptParsingAction::NoPositional; - continue; - } - OptParsingAction::Bundle => { - container_bundle_o = Some(arg.clone()); - opt_parsing_action = OptParsingAction::NoPositional; - continue; - } - } - match arg_parsing_action { - ArgParsingAction::None => {} - ArgParsingAction::ContainerId => { - container_id_o = Some(arg.clone()); - arg_parsing_action = ArgParsingAction::None; - continue; - } - } - - match arg.as_str() { - "checkpoint" => arg_parsing_action = ArgParsingAction::ContainerId, - "create" => { - arg_parsing_action = ArgParsingAction::ContainerId; - container_action = ContainerAction::Create; - } - "delete" => { - arg_parsing_action = ArgParsingAction::ContainerId; - container_action = ContainerAction::Delete; - } - "events" => arg_parsing_action = ArgParsingAction::ContainerId, - "exec" => arg_parsing_action = ArgParsingAction::ContainerId, - "kill" => arg_parsing_action = ArgParsingAction::ContainerId, - "pause" => arg_parsing_action = ArgParsingAction::ContainerId, - "ps" => arg_parsing_action = ArgParsingAction::ContainerId, - "restore" => arg_parsing_action = ArgParsingAction::ContainerId, - "resume" => arg_parsing_action = ArgParsingAction::ContainerId, - "run" => arg_parsing_action = ArgParsingAction::ContainerId, - "start" => { - arg_parsing_action = ArgParsingAction::ContainerId; - container_action = ContainerAction::Start; - } - "state" => arg_parsing_action = ArgParsingAction::ContainerId, - "update" => arg_parsing_action = ArgParsingAction::ContainerId, - _ => {} - } - } - - match container_action { - ContainerAction::Other => { - match container_id_o { - Some(v) => { - let container_key = lockc::hash(&v)?; - 
lockc::add_process(container_key, pid_u)?; - cmd.status().await?; - lockc::delete_process(pid_u)?; - } - None => { - // The purpose of this fake "container" is only to allow the runc - // subcommand to be detected as wrapped and thus allowed by - // the LSM program to execute. It's only to handle subcommands - // like `init`, `list` or `spec`, so we make it restricted. - lockc::add_container( - 0, - pid_u, - lockc::bpfstructs::container_policy_level_POLICY_LEVEL_RESTRICTED, - )?; - cmd.status().await?; - lockc::delete_container(0)?; - } - } - } - ContainerAction::Create => { - let container_key = lockc::hash(&container_id_o.unwrap())?; - let container_bundle = match container_bundle_o { - Some(v) => std::path::PathBuf::from(v), - None => std::env::current_dir()?, - }; - - let policy; - let runc_bundle = container_bundle.clone(); - let namespace = container_namespace(container_bundle); - match namespace { - Ok(n) => policy = policy_label(n).await?, - Err(_) => { - let docker_conf = docker_config(runc_bundle)?; - policy = docker_label(docker_conf)?; - } - }; - lockc::add_container(container_key, pid_u, policy)?; - cmd.status().await?; - } - ContainerAction::Delete => { - let container_key = lockc::hash(&container_id_o.unwrap())?; - lockc::delete_container(container_key)?; - cmd.status().await?; - } - ContainerAction::Start => { - cmd.status().await?; - } - } - - Ok(()) -} diff --git a/lockc/src/bin/lockcd.rs b/lockc/src/bin/lockcd.rs index 33d806b..6eee21a 100644 --- a/lockc/src/bin/lockcd.rs +++ b/lockc/src/bin/lockcd.rs @@ -1,13 +1,31 @@ -use std::path; +use std::{env, path}; use chrono::prelude::*; +use log::debug; +use simplelog::{ColorChoice, ConfigBuilder, LevelFilter, TermLogger, TerminalMode}; fn main() -> anyhow::Result<()> { - let sys_lsm_path = path::Path::new("/sys") - .join("kernel") - .join("security") - .join("lsm"); - lockc::check_bpf_lsm_enabled(sys_lsm_path)?; + let log_level = match env::var("LOCKC_DEBUG") { + Ok(_) => LevelFilter::Debug, + Err(_) 
=> LevelFilter::Info, + }; + TermLogger::init( + LevelFilter::Debug, + ConfigBuilder::new() + .set_target_level(log_level) + .set_location_level(log_level) + .build(), + TerminalMode::Mixed, + ColorChoice::Auto, + )?; + + if env::var("LOCKC_CHECK_LSM_SKIP").is_err() { + let sys_lsm_path = path::Path::new("/sys") + .join("kernel") + .join("security") + .join("lsm"); + lockc::check_bpf_lsm_enabled(sys_lsm_path)?; + } let now = Utc::now(); let dirname = now.format("%s").to_string(); @@ -20,8 +38,12 @@ fn main() -> anyhow::Result<()> { let path_base_ts = path_base.join(&dirname); - lockc::load_programs(path_base_ts)?; + let _skel = lockc::BpfContext::new(path_base_ts)?; + debug!("initialized BPF skeleton, loaded programs"); lockc::cleanup(path_base, &dirname)?; + debug!("cleaned up old BPF programs"); + + lockc::runc::RuncWatcher::new()?.work_loop()?; Ok(()) } diff --git a/lockc/src/bpf/limits.h b/lockc/src/bpf/limits.h index e3db791..a36974d 100644 --- a/lockc/src/bpf/limits.h +++ b/lockc/src/bpf/limits.h @@ -9,7 +9,7 @@ /* Our arbitrary path length limit. */ #define PATH_LEN 64 -#define PATH_MAX_LIMIT 64 +#define PATH_MAX_LIMIT 128 /* Max length of task name (comm). */ #define TASK_COMM_LEN 16 diff --git a/lockc/src/bpf/lockc.bpf.c b/lockc/src/bpf/lockc.bpf.c index 925ac3f..1de5c1f 100644 --- a/lockc/src/bpf/lockc.bpf.c +++ b/lockc/src/bpf/lockc.bpf.c @@ -223,10 +223,10 @@ int BPF_PROG(syslog_audit, int type, int ret_prev) } /* - * callback_ctx - input/output data for the `check_allowed_paths` callback + * paths_callback_ctx - input/output data for the `check_allowed_paths` callback * function. */ -struct callback_ctx { +struct paths_callback_ctx { /* Input path to compare all the allowed paths with. */ unsigned char *path; /* Output whether a match was found. 
*/ @@ -242,13 +242,13 @@ struct callback_ctx { * @allowed_path: the checked BPF map element * @data: input/output data shared between this callback and the BPF program * - * Return: 0 if the match was found and next iterations should be stopped. - * 1 if the match was not found and the search for a possible match should be + * Return: 1 if the match was found and next iterations should be stopped. + * 0 if the match was not found and the search for a possible match should be * continued. */ static u64 check_paths(struct bpf_map *map, u32 *key, struct accessed_path *allowed_path, - struct callback_ctx *data) + struct paths_callback_ctx *data) { /* * Shouldn't happen, but if in any case the checked path is NULL, skip @@ -356,7 +356,7 @@ int BPF_PROG(mount_audit, const char *dev_name, const struct path *path, ret = -EFAULT; goto out; } - struct callback_ctx cb = { + struct paths_callback_ctx cb = { .found = false, .path = dev_name_safe }; @@ -449,9 +449,9 @@ int BPF_PROG(open_audit, struct file *file, int ret_prev) bpf_printk("open: restricted: allow /\n"); goto out; } - struct callback_ctx cb = { + struct paths_callback_ctx cb = { .found = false, - .path = d_path_buf + .path = d_path_buf, }; /* @@ -474,7 +474,7 @@ int BPF_PROG(open_audit, struct file *file, int ret_prev) * If anyone can show or contribute the better solution, I owe them a * beer! */ - switch (policy_level){ + switch (policy_level) { case POLICY_LEVEL_RESTRICTED: bpf_for_each_map_elem(&denied_paths_access_restricted, check_paths, &cb, 0); @@ -517,4 +517,145 @@ int BPF_PROG(open_audit, struct file *file, int ret_prev) return ret; } +/* + * add_container - uprobe program triggered by lockc-runc-wrapper adding a new + * container. It registers that new container in BPF maps. 
+ * + * This program is inspired by bpfcontain-rs project and its similar uprobe + * program: + * https://github.com/willfindlay/bpfcontain-rs/blob/ba4fde80b6bc75ef340dd22ac921206b18e350ab/src/bpf/bpfcontain.bpf.c#L2291-L2315 + */ +SEC("uprobe/add_container") +int BPF_KPROBE(add_container, int *retp, u32 container_id, pid_t pid, int policy) +{ + int ret = 0; + int err; + struct container c = { + .policy_level = policy, + }; + + err = bpf_map_update_elem(&containers, &container_id, &c, 0); + if (err < 0) { + bpf_printk("adding container: containers: error: %d\n", err); + ret = err; + goto out; + } + + struct process p = { + .container_id = container_id, + }; + + err = bpf_map_update_elem(&processes, &pid, &p, 0); + if (err < 0) { + bpf_printk("adding container: processes: error: %d\n", err); + ret = err; + goto out; + } + bpf_printk("adding container: success\n"); + +out: + bpf_probe_write_user(retp, &ret, sizeof(ret)); + return ret; +} + +/* + * processes_callback_ctx - input data for the `clean_processes` callback + * function. + */ +struct processes_callback_ctx { + u32 container_id; + int err; +}; + +/* + * clean_processes - callback function which removes all the processes + * associated with the given container (ID). It's supposed to be called on the + * processes BPF map when deleting a container. + */ +static u64 clean_processes(struct bpf_map *map, pid_t *key, + struct process *process, + struct processes_callback_ctx *data) +{ + int err; + + if (unlikely(process == NULL)) + return 0; + + if (process->container_id == data->container_id) { + err = bpf_map_delete_elem(map, key); + if (err < 0) { + bpf_printk("clean_processes: could not delete process, " + "err: %d\n", err); + data->err = err; + /* Continue removing next elements anyway. */ + return 0; + } + } + + return 0; +} + +/* + * delete_container - uprobe program triggered by lockc-runc-wrapper deleting a + * container. It removes information about that container and its processes from + * BPF maps. 
+ */ +SEC("uprobe/delete_container") +int BPF_KPROBE(delete_container, int *retp, u32 container_id) +{ + int ret = 0; + int err; + err = bpf_map_delete_elem(&containers, &container_id); + struct processes_callback_ctx cb = { + .container_id = container_id, + .err = 0, + }; + bpf_for_each_map_elem(&processes, clean_processes, &cb, 0); + + /* Handle errors later, after attempting to remove everything. */ + if (err < 0) { + bpf_printk("deleting container: error: %d\n", err); + ret = err; + goto out; + } + if (cb.err < 0) { + bpf_printk("deleting container: callbacks: error: %d\n", + cb.err); + ret = cb.err; + goto out; + } + bpf_printk("deleting container: success\n"); + +out: + bpf_probe_write_user(retp, &ret, sizeof(ret)); + return ret; +} + +/* + * add_process - uprobe program triggered by lockc-runc-wrapper adding a new + * process to the container when i.e. exec-ing a new process by runc. It + * registers that new process in the BPF map. + */ +SEC("uprobe/add_process") +int BPF_KPROBE(add_process, int *retp, u32 container_id, pid_t pid) +{ + int ret = 0; + int err; + struct process p = { + .container_id = container_id, + }; + + err = bpf_map_update_elem(&processes, &pid, &p, 0); + if (err < 0) { + bpf_printk("adding process: error: %d\n", err); + ret = err; + goto out; + } + bpf_printk("adding process: success\n"); + +out: + bpf_probe_write_user(retp, &ret, sizeof(ret)); + return 0; +} + char __license[] SEC("license") = "GPL"; diff --git a/lockc/src/lib.rs b/lockc/src/lib.rs index 4a83b32..1588b90 100644 --- a/lockc/src/lib.rs +++ b/lockc/src/lib.rs @@ -5,20 +5,32 @@ #[macro_use] extern crate lazy_static; -use std::{fs, io, io::prelude::*, num, path}; +use std::{ + fs, + io::{self, prelude::*}, + num, path, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + thread, time, +}; use byteorder::{NativeEndian, WriteBytesExt}; use sysctl::Sysctl; use bpfstructs::BpfStruct; - +use lockc_uprobes::{add_container, add_process, delete_container}; +use 
uprobe_ext::FindSymbolUprobeExt; #[rustfmt::skip] mod bpf; use bpf::*; pub mod bpfstructs; +pub mod runc; mod settings; +mod uprobe_ext; lazy_static! { static ref SETTINGS: settings::Settings = settings::Settings::new().unwrap(); @@ -154,11 +166,18 @@ fn get_pid_max() -> Result { Ok(pid_max) } +pub struct BpfContext<'a> { + pub skel: LockcSkel<'a>, +} + #[derive(thiserror::Error, Debug)] -pub enum LoadProgramError { +pub enum NewBpfContextError { #[error(transparent)] Libbpf(#[from] libbpf_rs::Error), + #[error(transparent)] + AttachUprobeAddr(#[from] uprobe_ext::AttachUprobeAddrError), + #[error(transparent)] GetPidMax(#[from] GetPidMaxError), @@ -169,109 +188,173 @@ pub enum LoadProgramError { InitRuntimes(#[from] InitRuntimesError), } -/// Performs the following BPF-related operations: -/// - loading BPF programs -/// - resizing PID-related BPF maps -/// - pinning BPF maps in BPFFS -/// - pinning BPF programs in BPFFS -/// - attaching BPF programs, creating links -/// - pinning links in BPFFS -/// -/// All entities pinned in BPFFS have the dedicated directory signed with a -/// timestamp. The reason behind it is to be able to still keep running -/// previous instances of BPF programs while we are in the process of loading -/// new programs. This is done to ensure that **some** instance of BPF programs -/// is always running and that containers are secured. -/// -/// TODO: The concept described above still has one hole - the contents of old -/// BPF maps is not migrated in any way. We need to come up with some sane copy -/// mechanism. 
-pub fn load_programs>(path_base_ts_r: P) -> Result<(), LoadProgramError> { - let path_base_ts = path_base_ts_r.as_ref(); - let skel_builder = LockcSkelBuilder::default(); - let mut open_skel = skel_builder.open()?; - - let pid_max = get_pid_max()?; - open_skel.maps_mut().containers().set_max_entries(pid_max)?; - open_skel.maps_mut().processes().set_max_entries(pid_max)?; - - let mut skel = open_skel.load()?; - - let mut path_map_runtimes = path_base_ts.join("map_runtimes"); - skel.maps_mut().runtimes().pin(&mut path_map_runtimes)?; - - init_runtimes(skel.maps_mut().runtimes())?; - - let path_map_containers = path_base_ts.join("map_containers"); - skel.maps_mut().containers().pin(path_map_containers)?; - - let path_map_processes = path_base_ts.join("map_processes"); - skel.maps_mut().processes().pin(path_map_processes)?; - - let path_map_allowed_paths_mount_restricted = - path_base_ts.join("map_allowed_paths_mount_restricted"); - skel.maps_mut() - .allowed_paths_mount_restricted() - .pin(path_map_allowed_paths_mount_restricted)?; - - let path_map_allowed_paths_mount_baseline = - path_base_ts.join("map_allowed_paths_mount_baseline"); - skel.maps_mut() - .allowed_paths_mount_baseline() - .pin(path_map_allowed_paths_mount_baseline)?; - - let path_map_allowed_paths_access_restricted = - path_base_ts.join("map_allowed_paths_access_restricted"); - skel.maps_mut() - .allowed_paths_access_restricted() - .pin(path_map_allowed_paths_access_restricted)?; - - let path_map_allowed_paths_access_baseline = - path_base_ts.join("map_allowed_paths_access_baseline"); - skel.maps_mut() - .allowed_paths_access_baseline() - .pin(path_map_allowed_paths_access_baseline)?; - - init_allowed_paths(skel.maps_mut())?; - - let path_program_fork = path_base_ts.join("prog_fork"); - skel.progs_mut() - .sched_process_fork() - .pin(path_program_fork)?; - - let path_program_clone = path_base_ts.join("prog_clone_audit"); - skel.progs_mut().clone_audit().pin(path_program_clone)?; - - let 
path_program_syslog = path_base_ts.join("prog_syslog_audit"); - skel.progs_mut().syslog_audit().pin(path_program_syslog)?; - - let path_program_mount = path_base_ts.join("prog_mount_audit"); - skel.progs_mut().mount_audit().pin(path_program_mount)?; - - let path_program_open = path_base_ts.join("prog_open_audit"); - skel.progs_mut().open_audit().pin(path_program_open)?; - - let mut link_fork = skel.progs_mut().sched_process_fork().attach()?; - let path_link_fork = path_base_ts.join("link_fork"); - link_fork.pin(path_link_fork)?; - - let mut link_clone = skel.progs_mut().clone_audit().attach_lsm()?; - let path_link_clone = path_base_ts.join("link_clone_audit"); - link_clone.pin(path_link_clone)?; - - let mut link_syslog = skel.progs_mut().syslog_audit().attach_lsm()?; - let path_link_syslog = path_base_ts.join("link_syslog_audit"); - link_syslog.pin(path_link_syslog)?; +impl<'a> BpfContext<'a> { + /// Performs the following BPF-related operations: + /// - loading BPF programs + /// - resizing PID-related BPF maps + /// - pinning BPF maps in BPFFS + /// - pinning BPF programs in BPFFS + /// - attaching BPF programs, creating links + /// - pinning links in BPFFS + /// + /// All entities pinned in BPFFS have the dedicated directory signed with a + /// timestamp. The reason behind it is to be able to still keep running + /// previous instances of BPF programs while we are in the process of loading + /// new programs. This is done to ensure that **some** instance of BPF programs + /// is always running and that containers are secured. + /// + /// TODO: The concept described above still has one hole - the contents of old + /// BPF maps is not migrated in any way. We need to come up with some sane copy + /// mechanism. 
+ pub fn new>(path_base_ts_r: P) -> Result { + let path_base_ts = path_base_ts_r.as_ref(); + let skel_builder = LockcSkelBuilder::default(); + let mut open_skel = skel_builder.open()?; + + let pid_max = get_pid_max()?; + open_skel.maps_mut().containers().set_max_entries(pid_max)?; + open_skel.maps_mut().processes().set_max_entries(pid_max)?; + + let mut skel = open_skel.load()?; + + let mut path_map_runtimes = path_base_ts.join("map_runtimes"); + skel.maps_mut().runtimes().pin(&mut path_map_runtimes)?; + + init_runtimes(skel.maps_mut().runtimes())?; + + let path_map_containers = path_base_ts.join("map_containers"); + skel.maps_mut().containers().pin(path_map_containers)?; + + let path_map_processes = path_base_ts.join("map_processes"); + skel.maps_mut().processes().pin(path_map_processes)?; + + let path_map_allowed_paths_mount_restricted = + path_base_ts.join("map_allowed_paths_mount_restricted"); + skel.maps_mut() + .allowed_paths_mount_restricted() + .pin(path_map_allowed_paths_mount_restricted)?; + + let path_map_allowed_paths_mount_baseline = + path_base_ts.join("map_allowed_paths_mount_baseline"); + skel.maps_mut() + .allowed_paths_mount_baseline() + .pin(path_map_allowed_paths_mount_baseline)?; + + let path_map_allowed_paths_access_restricted = + path_base_ts.join("map_allowed_paths_access_restricted"); + skel.maps_mut() + .allowed_paths_access_restricted() + .pin(path_map_allowed_paths_access_restricted)?; + + let path_map_allowed_paths_access_baseline = + path_base_ts.join("map_allowed_paths_access_baseline"); + skel.maps_mut() + .allowed_paths_access_baseline() + .pin(path_map_allowed_paths_access_baseline)?; + + init_allowed_paths(skel.maps_mut())?; + + let path_program_fork = path_base_ts.join("prog_fork"); + skel.progs_mut() + .sched_process_fork() + .pin(path_program_fork)?; + + let path_program_clone = path_base_ts.join("prog_clone_audit"); + skel.progs_mut().clone_audit().pin(path_program_clone)?; + + let path_program_syslog = 
path_base_ts.join("prog_syslog_audit"); + skel.progs_mut().syslog_audit().pin(path_program_syslog)?; + + let path_program_mount = path_base_ts.join("prog_mount_audit"); + skel.progs_mut().mount_audit().pin(path_program_mount)?; + + let path_program_open = path_base_ts.join("prog_open_audit"); + skel.progs_mut().open_audit().pin(path_program_open)?; + + let path_program_add_container = path_base_ts.join("prog_add_container"); + skel.progs_mut() + .add_container() + .pin(path_program_add_container)?; + + let path_program_delete_container = path_base_ts.join("prog_delete_container"); + skel.progs_mut() + .delete_container() + .pin(path_program_delete_container)?; + + let path_program_add_process = path_base_ts.join("prog_add_process"); + skel.progs_mut() + .add_process() + .pin(path_program_add_process)?; + + let mut link_fork = skel.progs_mut().sched_process_fork().attach()?; + let path_link_fork = path_base_ts.join("link_fork"); + link_fork.pin(path_link_fork)?; + + let mut link_clone = skel.progs_mut().clone_audit().attach_lsm()?; + let path_link_clone = path_base_ts.join("link_clone_audit"); + link_clone.pin(path_link_clone)?; + + let mut link_syslog = skel.progs_mut().syslog_audit().attach_lsm()?; + let path_link_syslog = path_base_ts.join("link_syslog_audit"); + link_syslog.pin(path_link_syslog)?; + + let mut link_mount = skel.progs_mut().mount_audit().attach_lsm()?; + let path_link_mount = path_base_ts.join("link_mount_audit"); + link_mount.pin(path_link_mount)?; + + let mut link_open = skel.progs_mut().open_audit().attach_lsm()?; + let path_link_open = path_base_ts.join("link_open_audit"); + link_open.pin(path_link_open)?; + + let link_add_container = skel.progs_mut().add_container().attach_uprobe_addr( + false, + -1, + add_container as *const () as usize, + )?; + skel.links.add_container = link_add_container.into(); + // NOTE(vadorovsky): Currently it's impossible to pin uprobe links, but + // it would be REALLY NICE to be able to do so. 
+ // let path_link_add_container = path_base_ts.join("link_add_container"); + // link_add_container.pin(path_link_add_container)?; + + let link_delete_container = skel.progs_mut().delete_container().attach_uprobe_addr( + false, + -1, + delete_container as *const () as usize, + )?; + skel.links.delete_container = link_delete_container.into(); + // NOTE(vadorovsky): Currently it's impossible to pin uprobe links, but + // it would be REALLY NICE to be able to do so. + // let path_link_delete_container = path_base_ts.join("link_delete_container"); + // link_delete_container.pin(path_link_delete_container)?; + + let link_add_process = skel.progs_mut().add_process().attach_uprobe_addr( + false, + -1, + add_process as *const () as usize, + )?; + skel.links.add_process = link_add_process.into(); + // NOTE(vadorovsky): Currently it's impossible to pin uprobe links, but + // it would be REALLY NICE to be able to do so. + // let path_link_add_process = path_base_ts.join("link_add_process"); + // link_add_process.pin(path_link_add_process)?; - let mut link_mount = skel.progs_mut().mount_audit().attach_lsm()?; - let path_link_mount = path_base_ts.join("link_mount_audit"); - link_mount.pin(path_link_mount)?; + Ok(BpfContext { skel }) + } - let mut link_open = skel.progs_mut().open_audit().attach_lsm()?; - let path_link_open = path_base_ts.join("link_open_audit"); - link_open.pin(path_link_open)?; + pub fn work_loop(&self) -> Result<(), ctrlc::Error> { + let running = Arc::new(AtomicBool::new(true)); + let r = running.clone(); + ctrlc::set_handler(move || { + r.store(false, Ordering::SeqCst); + })?; + while running.load(Ordering::SeqCst) { + eprint!("."); + thread::sleep(time::Duration::from_secs(1)); + } - Ok(()) + Ok(()) + } } #[derive(thiserror::Error, Debug)] @@ -345,35 +428,6 @@ pub enum ReusedMapsOperationError { SkelReusedMapsError(#[from] SkelReusedMapsError), } -/// Adds a new container and its first associated process into BPF maps. 
-pub fn add_container( - container_key: u32, - pid: u32, - level: bpfstructs::container_policy_level, -) -> Result<(), ReusedMapsOperationError> { - let mut skel = skel_reused_maps()?; - - bpfstructs::container { - policy_level: level, - } - .map_update(skel.maps_mut().containers(), container_key)?; - - bpfstructs::process { - container_id: container_key, - } - .map_update(skel.maps_mut().processes(), pid)?; - - Ok(()) -} - -/// Deletes the given container from BPF map. -pub fn delete_container(container_key: u32) -> Result<(), ReusedMapsOperationError> { - let mut skel = skel_reused_maps()?; - bpfstructs::map_delete(skel.maps_mut().containers(), container_key)?; - - Ok(()) -} - /// Writes the given policy to the container info in BPF map. pub fn write_policy( container_id: &str, @@ -390,19 +444,6 @@ pub fn write_policy( Ok(()) } -/// Adds the given process as a container's member in the BPF map. After this -/// action, LSM BPF programs are going to enforce policies on that process. -pub fn add_process(container_key: u32, pid: u32) -> Result<(), ReusedMapsOperationError> { - let mut skel = skel_reused_maps()?; - - bpfstructs::process { - container_id: container_key, - } - .map_update(skel.maps_mut().processes(), pid)?; - - Ok(()) -} - /// Removes the given process from BPF map. 
pub fn delete_process(pid: u32) -> Result<(), ReusedMapsOperationError> { let mut skel = skel_reused_maps()?; @@ -517,9 +558,9 @@ mod tests { // https://github.com/rancher-sandbox/lockc/issues/65 #[test] #[ignore] - fn test_load_programs() { + fn test_bpf_context() { let _cleanup = PathBase::new(); - assert!(load_programs(PATH_BASE).is_ok()); + assert!(BpfContext::new(PATH_BASE).is_ok()); } #[test] diff --git a/lockc/src/runc.rs b/lockc/src/runc.rs new file mode 100644 index 0000000..0ffecea --- /dev/null +++ b/lockc/src/runc.rs @@ -0,0 +1,639 @@ +use std::{collections::HashMap, fs, io, os::unix::fs::PermissionsExt, path::Path, string::String}; + +use fanotify::{ + high_level::{Event, Fanotify, FanotifyMode, FanotifyResponse}, + low_level::FAN_OPEN_EXEC_PERM, +}; +use k8s_openapi::api::core::v1; +use log::{debug, error}; +use nix::poll::{poll, PollFd, PollFlags}; +use procfs::{process::Process, ProcError}; +use scopeguard::defer; +use serde::Deserialize; +use serde_json::Value; +use thiserror::Error; +use tokio::runtime::Builder; + +use crate::{ + bpfstructs::{ + container_policy_level, container_policy_level_POLICY_LEVEL_BASELINE, + container_policy_level_POLICY_LEVEL_PRIVILEGED, + container_policy_level_POLICY_LEVEL_RESTRICTED, + }, + hash, HashError, +}; +use lockc_uprobes::{add_container, add_process, delete_container}; + +// static LABEL_NAMESPACE: &str = "io.kubernetes.pod.namespace"; +static LABEL_POLICY_ENFORCE: &str = "pod-security.kubernetes.io/enforce"; +// static LABEL_POLICY_AUDIT: &str = "pod-security.kubernetes.io/audit"; +// static LABEL_POLICY_WARN: &str = "pod-security.kubernetes.io/warn"; + +static ANNOTATION_CONTAINERD_LOG_DIRECTORY: &str = "io.kubernetes.cri.sandbox-log-directory"; +static ANNOTATION_CONTAINERD_SANDBOX_ID: &str = "io.kubernetes.cri.sandbox-id"; + +/// Type of Kubernetes container determined by annotations. +enum KubernetesContainerType { + /// Containerd CRI, main container with own log directory. 
+ ContainerdMain, + /// Containerd CRI, part of another sandbox which has its own log + /// directory. + ContainerdPartOfSandbox, + /// Unknown type of Kubernetes annotations. + Unknown, +} + +fn kubernetes_type(annotations: HashMap) -> KubernetesContainerType { + if annotations.contains_key(ANNOTATION_CONTAINERD_LOG_DIRECTORY) { + return KubernetesContainerType::ContainerdMain; + } else if annotations.contains_key(ANNOTATION_CONTAINERD_SANDBOX_ID) { + return KubernetesContainerType::ContainerdPartOfSandbox; + } + KubernetesContainerType::Unknown +} + +/// Type of container by engine/runtime. +enum ContainerType { + Docker, + KubernetesContainerd, + Unknown, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +struct Mount { + destination: String, + r#type: String, + source: String, + options: Vec, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +struct ContainerConfig { + mounts: Vec, + annotations: Option>, +} + +#[derive(Error, Debug)] +pub enum ContainerError { + #[error(transparent)] + Status(#[from] io::Error), + + #[error(transparent)] + Json(#[from] serde_json::Error), + + #[error("could not get the file name of container log file")] + LogFileName, + + #[error("could not parse k8s namespace")] + K8sNamespace, +} + +fn container_type_data>( + container_bundle: P, +) -> Result<(ContainerType, Option), ContainerError> { + let bundle_path = container_bundle.as_ref(); + let config_path = bundle_path.join("config.json"); + let f = fs::File::open(config_path.clone())?; + let r = io::BufReader::new(f); + + let config: ContainerConfig = serde_json::from_reader(r)?; + + // Kubernetes + if let Some(annotations) = config.annotations { + debug!( + "detected kubernetes container with bundle {}, config {}", + bundle_path.display(), + config_path.display(), + ); + match kubernetes_type(annotations.clone()) { + KubernetesContainerType::ContainerdMain => { + // containerd doesn't expose k8s namespaces directly. 
They have + // to be parsed from the log directory path, where the first + // part of the filename is the namespace. + let log_directory = &annotations[ANNOTATION_CONTAINERD_LOG_DIRECTORY]; + debug!( + "detected k8s+containerd container with log directory {}", + log_directory + ); + let log_path = std::path::PathBuf::from(log_directory); + let file_name = log_path + .file_name() + .ok_or(ContainerError::LogFileName)? + .to_str() + .ok_or(ContainerError::LogFileName)?; + let mut splitter = file_name.split('_'); + let namespace = splitter + .next() + .ok_or(ContainerError::K8sNamespace)? + .to_string(); + + return Ok((ContainerType::KubernetesContainerd, Some(namespace))); + } + KubernetesContainerType::ContainerdPartOfSandbox => { + // When a container is running as a part of a previously created + // pod, the log directory path has to be retrieved from the + // sandbox container. + let sandbox_id = &annotations[ANNOTATION_CONTAINERD_SANDBOX_ID]; + debug!( + "detected k8s+containerd container with sandbox id {}", + sandbox_id + ); + + // Go one directory up from the current bundle. + let mut ancestors = bundle_path.ancestors(); + ancestors.next(); + if let Some(v) = ancestors.next() { + // Then go to sandbox_id directory (sandbox's bundle). + let new_bundle = v.join(sandbox_id); + return container_type_data(new_bundle); + } + } + KubernetesContainerType::Unknown => {} + } + // TODO(vadorovsky): Support more Kubernetes CRI implementations. + // They all come with their own annotations, so we will have to + // handle more keys here. 
+ } + + // Docker + for mount in config.mounts { + let source: Vec<&str> = mount.source.split('/').collect(); + if source.len() > 1 && source[source.len() - 1] == "hostname" { + let config_v2 = str::replace(&mount.source, "hostname", "config.v2.json"); + debug!("detected docker container with config path {}", config_v2); + return Ok((ContainerType::Docker, Some(config_v2))); + } + } + + Ok((ContainerType::Unknown, None)) +} + +/// Finds the policy for the given Kubernetes namespace. If none, the baseline +/// policy is returned. Otherwise checks the Kubernetes namespace labels. +async fn policy_kubernetes(namespace: String) -> Result { + // Apply the privileged policy for kube-system containers immediately. + // Otherwise the core k8s components (apiserver, scheduler) won't be able + // to run. + // If container has no k8s namespace, apply the baseline policy. + if namespace.as_str() == "kube-system" { + return Ok(container_policy_level_POLICY_LEVEL_PRIVILEGED); + } + + let client = kube::Client::try_default().await?; + + let namespaces: kube::api::Api = kube::api::Api::all(client); + let namespace = namespaces.get(&namespace).await?; + + match namespace.metadata.labels { + Some(v) => match v.get(LABEL_POLICY_ENFORCE) { + Some(v) => match v.as_str() { + "restricted" => Ok(container_policy_level_POLICY_LEVEL_RESTRICTED), + "baseline" => Ok(container_policy_level_POLICY_LEVEL_BASELINE), + "privileged" => Ok(container_policy_level_POLICY_LEVEL_PRIVILEGED), + _ => Ok(container_policy_level_POLICY_LEVEL_BASELINE), + }, + None => Ok(container_policy_level_POLICY_LEVEL_BASELINE), + }, + None => Ok(container_policy_level_POLICY_LEVEL_BASELINE), + } +} + +#[derive(Error, Debug)] +pub enum PolicyKubernetesSyncError { + #[error(transparent)] + IO(#[from] io::Error), + + #[error(transparent)] + Kube(#[from] kube::Error), +} + +/// Makes the `policy_label_sync` function synchronous. 
We use it together with +/// poll(2) syscall, which is definitely not meant for multithreaded code. +fn policy_kubernetes_sync( + namespace: String, +) -> Result { + match Builder::new_current_thread() + .enable_all() + .build()? + .block_on(policy_kubernetes(namespace)) + { + Ok(p) => Ok(p), + Err(e) => Err(PolicyKubernetesSyncError::from(e)), + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +struct Mounts { + mounts: Vec, +} + +fn policy_docker>( + docker_bundle: P, +) -> Result { + let config_path = docker_bundle.as_ref(); + let f = std::fs::File::open(config_path)?; + let r = std::io::BufReader::new(f); + + let l: Value = serde_json::from_reader(r)?; + + let x = l["Config"]["Labels"]["org.lockc.policy"].as_str(); + + match x { + Some(x) => match x { + "restricted" => Ok(container_policy_level_POLICY_LEVEL_RESTRICTED), + "baseline" => Ok(container_policy_level_POLICY_LEVEL_BASELINE), + "privileged" => Ok(container_policy_level_POLICY_LEVEL_PRIVILEGED), + _ => Ok(container_policy_level_POLICY_LEVEL_BASELINE), + }, + None => Ok(container_policy_level_POLICY_LEVEL_BASELINE), + } +} + +enum ShimOptParsingAction { + NoPositional, + Skip, + ContainerId, +} + +enum ShimContainerAction { + Other, + Delete, +} + +/// Types of options (prepositioned by `--`). +enum OptParsingAction { + /// Option not followed by a positional argument. + NoPositional, + /// Option followed by a positional argument we don't want to store. + Skip, + /// --bundle option which we want to store. + Bundle, +} + +/// Types of positional arguments. +enum ArgParsingAction { + /// Argument we don't want to store. + None, + /// Container ID which we want to store. + ContainerId, +} + +/// Types of actions performed on the container, defined by a runc subcommand. +enum ContainerAction { + /// Types we don't explicitly handle, except of registering the process as + /// containerized. 
+ Other, + /// Action of creating the container, when we want to register the new + /// container. + Create, + /// Action of deleting the container, when we want to remove the registered + /// container. + Delete, +} + +#[derive(Error, Debug)] +pub enum UprobeError { + #[error("failed to call into uprobe, BPF programs are most likely not running")] + Call, + + #[error("BPF program error")] + BPF, + + #[error("unknown uprobe error")] + Unknown, +} + +fn check_uprobe_ret(ret: i32) -> Result<(), UprobeError> { + match ret { + 0 => Ok(()), + n if n == -libc::EAGAIN => Err(UprobeError::Call), + n if n == -libc::EINVAL => Err(UprobeError::BPF), + _ => Err(UprobeError::Unknown), + } +} + +pub struct RuncWatcher { + fd: Fanotify, +} + +#[derive(Error, Debug)] +pub enum HandleRuncEventError { + #[error(transparent)] + IO(#[from] io::Error), + + #[error(transparent)] + Errno(#[from] nix::errno::Errno), + + #[error(transparent)] + Proc(#[from] ProcError), + + #[error(transparent)] + Container(#[from] ContainerError), + + #[error(transparent)] + Hash(#[from] HashError), + + #[error(transparent)] + PolicyKubernetes(#[from] PolicyKubernetesSyncError), + + #[error(transparent)] + Uprobe(#[from] UprobeError), + + #[error("container data missing")] + ContainerData, + + #[error("container ID missing")] + ContainerID, +} + +impl RuncWatcher { + pub fn new() -> Result { + let runc_paths = vec![ + "/usr/bin/runc", + "/usr/sbin/runc", + "/usr/local/bin/runc", + "/usr/local/sbin/runc", + "/host/usr/bin/runc", + "/host/usr/sbin/runc", + "/host/usr/local/bin/runc", + "/host/usr/local/sbin/runc", + ]; + let fd = Fanotify::new_with_nonblocking(FanotifyMode::CONTENT); + + for runc_path in runc_paths { + debug!("checking runc path {}", runc_path); + let p = Path::new(&runc_path); + if p.exists() { + let metadata = p.metadata()?; + + // When the source for host mount in Kubernetes does not + // exists, an empty directory is created. Also, directories + // contain an executable bit. 
Skip directories before any other + // checks. + if metadata.is_dir() { + continue; + } + + // If the file is executable. + if metadata.permissions().mode() & 0o111 != 0 { + debug!( + "runc path {} exists and is an excecutable binary", + runc_path + ); + fd.add_path(FAN_OPEN_EXEC_PERM, runc_path)?; + debug!("added runc path {} to fanotify", runc_path); + } + } + } + + Ok(RuncWatcher { fd }) + } + + fn handle_containerd_shim_event( + &self, + containerd_shim_process: Process, + ) -> Result<(), HandleRuncEventError> { + let mut opt_parsing_action = ShimOptParsingAction::NoPositional; + let mut container_action = ShimContainerAction::Other; + + let mut container_id_o: Option = None; + + for arg in containerd_shim_process.cmdline()? { + debug!("containerd-shim argument: {}", arg); + match arg.as_str() { + "-address" => opt_parsing_action = ShimOptParsingAction::Skip, + "-bundle" => opt_parsing_action = ShimOptParsingAction::Skip, + "-id" => opt_parsing_action = ShimOptParsingAction::ContainerId, + "-namespace" => opt_parsing_action = ShimOptParsingAction::Skip, + "-publish-binary" => opt_parsing_action = ShimOptParsingAction::Skip, + _ => {} + } + if arg.starts_with('-') { + continue; + } + + match opt_parsing_action { + ShimOptParsingAction::NoPositional => {} + ShimOptParsingAction::Skip => { + opt_parsing_action = ShimOptParsingAction::NoPositional; + continue; + } + ShimOptParsingAction::ContainerId => { + container_id_o = Some(arg); + opt_parsing_action = ShimOptParsingAction::NoPositional; + continue; + } + } + + if arg.as_str() == "delete" { + container_action = ShimContainerAction::Delete + } + } + + match container_action { + ShimContainerAction::Other => {} + ShimContainerAction::Delete => { + let container_key = + hash(&container_id_o.ok_or(HandleRuncEventError::ContainerID)?)?; + debug!("deleting container with key {}", container_key); + + let mut ret: i32 = -libc::EAGAIN; + delete_container(&mut ret as *mut i32, container_key); + check_uprobe_ret(ret)?; + 
} + } + + Ok(()) + } + + fn handle_runc_event(&self, runc_process: Process) -> Result<(), HandleRuncEventError> { + let mut opt_parsing_action = OptParsingAction::NoPositional; + let mut arg_parsing_action = ArgParsingAction::None; + let mut container_action = ContainerAction::Other; + + let mut container_bundle_o: Option = None; + let mut container_id_o: Option = None; + + // for arg in cmdline.split(CMDLINE_DELIMITER) { + for arg in runc_process.cmdline()? { + debug!("runc argument: {}", arg); + match arg.as_str() { + // Options which are followed with a positional arguments we don't + // want to store. + "--log" => opt_parsing_action = OptParsingAction::Skip, + "--log-format" => opt_parsing_action = OptParsingAction::Skip, + "--pid-file" => opt_parsing_action = OptParsingAction::Skip, + "--console-socket" => opt_parsing_action = OptParsingAction::Skip, + "--root" => opt_parsing_action = OptParsingAction::Skip, + // We want to explicitly store the value of --bundle and --root + // options. + "--bundle" => opt_parsing_action = OptParsingAction::Bundle, + _ => {} + } + if arg.starts_with('-') { + // After handling the option, start parsing the next argument. 
+ continue; + } + + match opt_parsing_action { + OptParsingAction::NoPositional => {} + OptParsingAction::Skip => { + opt_parsing_action = OptParsingAction::NoPositional; + continue; + } + OptParsingAction::Bundle => { + container_bundle_o = Some(arg); + opt_parsing_action = OptParsingAction::NoPositional; + continue; + } + } + match arg_parsing_action { + ArgParsingAction::None => {} + ArgParsingAction::ContainerId => { + container_id_o = Some(arg); + arg_parsing_action = ArgParsingAction::None; + continue; + } + } + + match arg.as_str() { + "checkpoint" => arg_parsing_action = ArgParsingAction::ContainerId, + "create" => { + arg_parsing_action = ArgParsingAction::ContainerId; + container_action = ContainerAction::Create; + } + "delete" => { + arg_parsing_action = ArgParsingAction::ContainerId; + container_action = ContainerAction::Delete; + } + "events" => arg_parsing_action = ArgParsingAction::ContainerId, + "exec" => arg_parsing_action = ArgParsingAction::ContainerId, + "kill" => arg_parsing_action = ArgParsingAction::ContainerId, + "pause" => arg_parsing_action = ArgParsingAction::ContainerId, + "ps" => arg_parsing_action = ArgParsingAction::ContainerId, + "restore" => arg_parsing_action = ArgParsingAction::ContainerId, + "resume" => arg_parsing_action = ArgParsingAction::ContainerId, + "run" => arg_parsing_action = ArgParsingAction::ContainerId, + "start" => { + arg_parsing_action = ArgParsingAction::ContainerId; + } + "state" => arg_parsing_action = ArgParsingAction::ContainerId, + "update" => arg_parsing_action = ArgParsingAction::ContainerId, + _ => {} + } + } + + match container_action { + ContainerAction::Other => { + debug!("other container action"); + if let Some(v) = container_id_o { + let container_key = hash(&v)?; + + let mut ret: i32 = -libc::EAGAIN; + add_process(&mut ret as *mut i32, container_key, runc_process.pid); + check_uprobe_ret(ret)?; + } + } + ContainerAction::Create => { + let container_id = 
container_id_o.ok_or(HandleRuncEventError::ContainerID)?; + let container_key = hash(&container_id)?; + debug!( + "creating container with id {} key {}", + container_id, container_key + ); + let container_bundle = match container_bundle_o { + Some(v) => std::path::PathBuf::from(v), + None => std::env::current_dir()?, + }; + + // let policy; + let (container_type, container_data) = container_type_data(container_bundle)?; + let policy: container_policy_level = match container_type { + ContainerType::Docker => { + policy_docker(container_data.ok_or(HandleRuncEventError::ContainerData)?)? + } + ContainerType::KubernetesContainerd => policy_kubernetes_sync( + container_data.ok_or(HandleRuncEventError::ContainerData)?, + )?, + ContainerType::Unknown => container_policy_level_POLICY_LEVEL_BASELINE, + }; + + let mut ret: i32 = -libc::EAGAIN; + add_container( + &mut ret as *mut i32, + container_key, + runc_process.pid, + policy, + ); + check_uprobe_ret(ret)?; + } + ContainerAction::Delete => { + let container_id = container_id_o.ok_or(HandleRuncEventError::ContainerID)?; + let container_key = hash(&container_id)?; + debug!( + "deleting container with id {} key {}", + container_id, container_key + ); + + let mut ret: i32 = -libc::EAGAIN; + delete_container(&mut ret as *mut i32, container_key); + check_uprobe_ret(ret)?; + } + } + + Ok(()) + } + + fn handle_event(&self, event: Event) -> Result<(), HandleRuncEventError> { + // Let the process execute again + defer!(self.fd.send_response(event.fd, FanotifyResponse::Allow)); + + debug!("received fanotify event: {:#?}", event); + + let p = Process::new(event.pid)?; + + // Usually fanotify receives two notifications about executing runc: + // 1) from containerd-shim (or similar) + // 2) from runc + // We are interested in parsing only runc arguments rather than + // containerd-shim. 
+ let comm = p.stat()?.comm; + debug!("event's process comm: {}", comm); + match comm.as_str() { + "runc" => { + self.handle_runc_event(p)?; + } + "containerd-shim" => { + self.handle_containerd_shim_event(p)?; + } + _ => {} + } + + Ok(()) + } + + pub fn work_loop(&self) -> Result<(), HandleRuncEventError> { + let mut fds = [PollFd::new(self.fd.as_raw_fd(), PollFlags::POLLIN)]; + loop { + let poll_num = poll(&mut fds, -1)?; + if poll_num > 0 { + for event in self.fd.read_event() { + match self.handle_event(event) { + Ok(_) => {} + Err(e) => { + error!("failed to handle event: {}", e); + } + }; + } + } else { + debug!("poll_num <= 0!"); + break; + } + } + + Ok(()) + } +} diff --git a/lockc/src/settings.rs b/lockc/src/settings.rs index 2d0aec6..64ec37f 100644 --- a/lockc/src/settings.rs +++ b/lockc/src/settings.rs @@ -14,6 +14,8 @@ static DIR_STORAGE_DOCKER_OVERLAY2: &str = "/var/lib/docker/overlay2"; static DIR_STORAGE_CONTAINERD: &str = "/var/run/container"; /// Storage directory used by CRI containerd. static DIR_STORAGE_CRI_CONTAINERD: &str = "/run/containerd/io.containerd.runtime.v1.linux"; +/// Storage directory used by CRI containerd. +static DIR_STORAGE_CRI_CONTAINERD2: &str = "/run/containerd/io.containerd.runtime.v2.task"; /// Data directory used by docker. static DIR_DATA_DOCKER: &str = "/var/lib/docker/containers"; @@ -55,29 +57,88 @@ static DIR_CGROUP_SYSTEMD_LIBPOD: &str = "/sys/fs/cgroup/systemd/machine.slice"; /// Cgroup v2 hierarchy (used by systemd) for libpod (podman, cri-o). static DIR_CGROUP_UNIFIED_LIBPOD: &str = "/sys/fs/cgroup/unified/machine.slice"; /// Block I/O controller for kubelet. -static DIR_CGROUP_BLKIO_K8S: &str = "/sys/fs/cgroup/blkio/kubepods.slice"; +static DIR_CGROUP_BLKIO_K8S1: &str = "/sys/fs/cgroup/blkio/kubepods.slice"; /// CPU accounting controller for kubelet. 
-static DIR_CGROUP_CPU_K8S: &str = "/sys/fs/cgroup/cpu,cpuacct/kubepods.slice"; +static DIR_CGROUP_CPU_K8S1: &str = "/sys/fs/cgroup/cpu,cpuacct/kubepods.slice"; /// Cpusets for libpod for kubelet. -static DIR_CGROUP_CPUSET_K8S: &str = "/sys/fs/cgroup/cpuset/kubepods.slice"; +static DIR_CGROUP_CPUSET_K8S1: &str = "/sys/fs/cgroup/cpuset/kubepods.slice"; /// Device allowlist controller for kubelet. -static DIR_CGROUP_DEVICES_K8S: &str = "/sys/fs/cgroup/devices/kubepods.slice"; +static DIR_CGROUP_DEVICES_K8S1: &str = "/sys/fs/cgroup/devices/kubepods.slice"; /// Cgroup freezer for kubelet. -static DIR_CGROUP_FREEZER_K8S: &str = "/sys/fs/cgroup/freezer/kubepods.slice"; +static DIR_CGROUP_FREEZER_K8S1: &str = "/sys/fs/cgroup/freezer/kubepods.slice"; /// HugeTLB controller for kubelet. -static DIR_CGROUP_HUGETLB_K8S: &str = "/sys/fs/cgroup/hugetlb/kubepods.slice"; +static DIR_CGROUP_HUGETLB_K8S1: &str = "/sys/fs/cgroup/hugetlb/kubepods.slice"; /// Memory controller for kubelet. -static DIR_CGROUP_MEMORY_K8S: &str = "/sys/fs/cgroup/memory/kubepods.slice"; +static DIR_CGROUP_MEMORY_K8S1: &str = "/sys/fs/cgroup/memory/kubepods.slice"; /// Network classifier and priority controller for kubelet. -static DIR_CGROUP_NET_K8S: &str = "/sys/fs/cgroup/net_cls,net_prio/kubepods.slice"; +static DIR_CGROUP_NET_K8S1: &str = "/sys/fs/cgroup/net_cls,net_prio/kubepods.slice"; /// Perf event controller for kubelet. -static DIR_CGROUP_PERF_K8S: &str = "/sys/fs/cgroup/perf_event/kubepods.slice"; +static DIR_CGROUP_PERF_K8S1: &str = "/sys/fs/cgroup/perf_event/kubepods.slice"; /// Process number controller for kubelet. -static DIR_CGROUP_PIDS_K8S: &str = "/sys/fs/cgroup/pids/kubepods.slice"; +static DIR_CGROUP_PIDS_K8S1: &str = "/sys/fs/cgroup/pids/kubepods.slice"; /// Cgroup v1 hierarchy (used by systemd) for kubelet. 
-static DIR_CGROUP_SYSTEMD_K8S: &str = "/sys/fs/cgroup/systemd/kubepods.slice"; +static DIR_CGROUP_SYSTEMD_K8S1: &str = "/sys/fs/cgroup/systemd/kubepods.slice"; /// Cgroup v2 hierarchy (used by systemd) for kubelet. -static DIR_CGROUP_UNIFIED_K8S: &str = "/sys/fs/cgroup/unified/kubepods.slice"; +static DIR_CGROUP_UNIFIED_K8S1: &str = "/sys/fs/cgroup/unified/kubepods.slice"; +/// Block I/O controller for kubelet. +static DIR_CGROUP_BLKIO_K8S2: &str = "/sys/fs/cgroup/blkio/kubepods-besteffort"; +/// CPU accounting controller for kubelet. +static DIR_CGROUP_CPU_K8S2: &str = "/sys/fs/cgroup/cpu,cpuacct/kubepods-besteffort"; +/// Cpusets for libpod for kubelet. +static DIR_CGROUP_CPUSET_K8S2: &str = "/sys/fs/cgroup/cpuset/kubepods-besteffort"; +/// Device allowlist controller for kubelet. +static DIR_CGROUP_DEVICES_K8S2: &str = "/sys/fs/cgroup/devices/kubepods-besteffort"; +/// Cgroup freezer for kubelet. +static DIR_CGROUP_FREEZER_K8S2: &str = "/sys/fs/cgroup/freezer/kubepods-besteffort"; +/// HugeTLB controller for kubelet. +static DIR_CGROUP_HUGETLB_K8S2: &str = "/sys/fs/cgroup/hugetlb/kubepods-besteffort"; +/// Memory controller for kubelet. +static DIR_CGROUP_MEMORY_K8S2: &str = "/sys/fs/cgroup/memory/kubepods-besteffort"; +/// Network classifier and priority controller for kubelet. +static DIR_CGROUP_NET_K8S2: &str = "/sys/fs/cgroup/net_cls,net_prio/kubepods-besteffort"; +/// Perf event controller for kubelet. +static DIR_CGROUP_PERF_K8S2: &str = "/sys/fs/cgroup/perf_event/kubepods-besteffort"; +/// Process number controller for kubelet. +static DIR_CGROUP_PIDS_K8S2: &str = "/sys/fs/cgroup/pids/kubepods-besteffort"; +/// Cgroup v1 hierarchy (used by systemd) for kubelet. +static DIR_CGROUP_SYSTEMD_K8S2: &str = "/sys/fs/cgroup/systemd/kubepods-besteffort"; +/// Cgroup v2 hierarchy (used by systemd) for kubelet. +static DIR_CGROUP_UNIFIED_K8S2: &str = "/sys/fs/cgroup/unified/kubepods-besteffort"; +/// Block I/O controller for containerd. 
+static DIR_CGROUP_BLKIO_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/blkio/system.slice/containerd.service"; +/// CPU accounting controller for containerd. +static DIR_CGROUP_CPU_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/cpu,cpuacct/system.slice/containerd.service"; +/// Cpusets for libpod for containerd. +static DIR_CGROUP_CPUSET_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/cpuset/system.slice/containerd.service"; +/// Device allowlist controller for containerd. +static DIR_CGROUP_DEVICES_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/devices/system.slice/containerd.service"; +/// Cgroup freezer for containerd. +static DIR_CGROUP_FREEZER_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/freezer/system.slice/containerd.service"; +/// HugeTLB controller for containerd. +static DIR_CGROUP_HUGETLB_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/hugetlb/system.slice/containerd.service"; +/// Memory controller for containerd. +static DIR_CGROUP_MEMORY_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/memory/system.slice/containerd.service"; +/// Network classifier and priority controller for containerd. +static DIR_CGROUP_NET_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/net_cls,net_prio/system.slice/containerd.service"; +/// Perf event controller for containerd. +static DIR_CGROUP_PERF_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/perf_event/system.slice/containerd.service"; +/// Process number controller for containerd. +static DIR_CGROUP_PIDS_CONTAINERD_K8S: &str = "/sys/fs/cgroup/pids/system.slice/containerd.service"; +/// Cgroup v1 hierarchy (used by systemd) for containerd. +static DIR_CGROUP_SYSTEMD_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/systemd/system.slice/containerd.service"; +/// Cgroup v2 hierarchy (used by systemd) for containerd. +static DIR_CGROUP_UNIFIED_CONTAINERD_K8S: &str = + "/sys/fs/cgroup/unified/system.slice/containerd.service"; /// Block I/O controller for docker. static DIR_CGROUP_BLKIO_DOCKER: &str = "/sys/fs/cgroup/blkio/docker"; /// CPU accounting controller for docker. 
@@ -109,6 +170,24 @@ static DIR_PODS_KUBELET: &str = "/var/lib/kubelet/pods"; static DIR_HOME: &str = "/home"; static DIR_VAR_DATA: &str = "/var/data"; +/// Cgroup file, i.e. cgroup:[4026531835] +static GROUP: &str = "cgroup:"; +/// IPC namespace file, i.e. ipc:[4026531839] +static NS_IPC: &str = "ipc:"; +/// Mount namespace file, i.e. mnt:[4026531840] +static NS_MNT: &str = "mnt:"; +/// Network namespace file, i.e. net:[4026531992] +static NS_NET: &str = "net:"; +/// PID namespace file, i.e. pid:[4026531836] +static NS_PID: &str = "pid:"; +/// Pipe +static PIPE: &str = "pipe:"; +/// Time namespace file. i.e. time:[4026531834] +static NS_TIME: &str = "time:"; +/// User namespace file, i.e. user:[4026531837] +static NS_USER: &str = "user:"; +/// UTS namespace file, i.e. uts:[4026531838] +static NS_UTS: &str = "uts:"; static DIR_BIN: &str = "/bin"; static DIR_DEV_CONSOLE: &str = "/dev/console"; static DIR_DEV_FULL: &str = "/dev/full"; @@ -119,8 +198,12 @@ static DIR_DEV_URANDOM: &str = "/dev/urandom"; static DIR_DEV_ZERO: &str = "/dev/zero"; static DIR_ETC: &str = "/etc"; static DIR_LIB: &str = "/lib"; +static DIR_LIB64: &str = "/lib64"; +static PAUSE: &str = "/pause"; static DIR_PROC: &str = "/proc"; +static DIR_RUN: &str = "/run"; static DIR_CGROUP: &str = "/sys/fs/cgroup"; +static DIR_MM: &str = "/sys/kernel/mm"; static DIR_TMP: &str = "/tmp"; static DIR_USR: &str = "/usr"; static DIR_VAR: &str = "/var"; @@ -177,6 +260,7 @@ impl Settings { DIR_STORAGE_DOCKER_OVERLAY2.to_string(), DIR_STORAGE_CONTAINERD.to_string(), DIR_STORAGE_CRI_CONTAINERD.to_string(), + DIR_STORAGE_CRI_CONTAINERD2.to_string(), DIR_DATA_DOCKER.to_string(), DIR_SANDBOXES_CRI_CONTAINERD1.to_string(), DIR_SANDBOXES_CRI_CONTAINERD2.to_string(), @@ -194,18 +278,42 @@ impl Settings { DIR_CGROUP_PIDS_LIBPOD.to_string(), DIR_CGROUP_SYSTEMD_LIBPOD.to_string(), DIR_CGROUP_UNIFIED_LIBPOD.to_string(), - DIR_CGROUP_BLKIO_K8S.to_string(), - DIR_CGROUP_CPU_K8S.to_string(), - DIR_CGROUP_CPUSET_K8S.to_string(), 
- DIR_CGROUP_DEVICES_K8S.to_string(), - DIR_CGROUP_FREEZER_K8S.to_string(), - DIR_CGROUP_HUGETLB_K8S.to_string(), - DIR_CGROUP_MEMORY_K8S.to_string(), - DIR_CGROUP_NET_K8S.to_string(), - DIR_CGROUP_PERF_K8S.to_string(), - DIR_CGROUP_PIDS_K8S.to_string(), - DIR_CGROUP_SYSTEMD_K8S.to_string(), - DIR_CGROUP_UNIFIED_K8S.to_string(), + DIR_CGROUP_BLKIO_K8S1.to_string(), + DIR_CGROUP_CPU_K8S1.to_string(), + DIR_CGROUP_CPUSET_K8S1.to_string(), + DIR_CGROUP_DEVICES_K8S1.to_string(), + DIR_CGROUP_FREEZER_K8S1.to_string(), + DIR_CGROUP_HUGETLB_K8S1.to_string(), + DIR_CGROUP_MEMORY_K8S1.to_string(), + DIR_CGROUP_NET_K8S1.to_string(), + DIR_CGROUP_PERF_K8S1.to_string(), + DIR_CGROUP_PIDS_K8S1.to_string(), + DIR_CGROUP_SYSTEMD_K8S1.to_string(), + DIR_CGROUP_UNIFIED_K8S1.to_string(), + DIR_CGROUP_BLKIO_K8S2.to_string(), + DIR_CGROUP_CPU_K8S2.to_string(), + DIR_CGROUP_CPUSET_K8S2.to_string(), + DIR_CGROUP_DEVICES_K8S2.to_string(), + DIR_CGROUP_FREEZER_K8S2.to_string(), + DIR_CGROUP_HUGETLB_K8S2.to_string(), + DIR_CGROUP_MEMORY_K8S2.to_string(), + DIR_CGROUP_NET_K8S2.to_string(), + DIR_CGROUP_PERF_K8S2.to_string(), + DIR_CGROUP_PIDS_K8S2.to_string(), + DIR_CGROUP_SYSTEMD_K8S2.to_string(), + DIR_CGROUP_UNIFIED_K8S2.to_string(), + DIR_CGROUP_BLKIO_CONTAINERD_K8S.to_string(), + DIR_CGROUP_CPU_CONTAINERD_K8S.to_string(), + DIR_CGROUP_CPUSET_CONTAINERD_K8S.to_string(), + DIR_CGROUP_DEVICES_CONTAINERD_K8S.to_string(), + DIR_CGROUP_FREEZER_CONTAINERD_K8S.to_string(), + DIR_CGROUP_HUGETLB_CONTAINERD_K8S.to_string(), + DIR_CGROUP_MEMORY_CONTAINERD_K8S.to_string(), + DIR_CGROUP_NET_CONTAINERD_K8S.to_string(), + DIR_CGROUP_PERF_CONTAINERD_K8S.to_string(), + DIR_CGROUP_PIDS_CONTAINERD_K8S.to_string(), + DIR_CGROUP_SYSTEMD_CONTAINERD_K8S.to_string(), + DIR_CGROUP_UNIFIED_CONTAINERD_K8S.to_string(), DIR_CGROUP_BLKIO_DOCKER.to_string(), DIR_CGROUP_CPU_DOCKER.to_string(), DIR_CGROUP_CPUSET_DOCKER.to_string(), @@ -230,6 +338,7 @@ impl Settings { DIR_STORAGE_DOCKER_OVERLAY2.to_string(), 
DIR_STORAGE_CONTAINERD.to_string(), DIR_STORAGE_CRI_CONTAINERD.to_string(), + DIR_STORAGE_CRI_CONTAINERD2.to_string(), DIR_DATA_DOCKER.to_string(), DIR_SANDBOXES_CRI_CONTAINERD1.to_string(), DIR_SANDBOXES_CRI_CONTAINERD2.to_string(), @@ -247,18 +356,42 @@ impl Settings { DIR_CGROUP_PIDS_LIBPOD.to_string(), DIR_CGROUP_SYSTEMD_LIBPOD.to_string(), DIR_CGROUP_UNIFIED_LIBPOD.to_string(), - DIR_CGROUP_BLKIO_K8S.to_string(), - DIR_CGROUP_CPU_K8S.to_string(), - DIR_CGROUP_CPUSET_K8S.to_string(), - DIR_CGROUP_DEVICES_K8S.to_string(), - DIR_CGROUP_FREEZER_K8S.to_string(), - DIR_CGROUP_HUGETLB_K8S.to_string(), - DIR_CGROUP_MEMORY_K8S.to_string(), - DIR_CGROUP_NET_K8S.to_string(), - DIR_CGROUP_PERF_K8S.to_string(), - DIR_CGROUP_PIDS_K8S.to_string(), - DIR_CGROUP_SYSTEMD_K8S.to_string(), - DIR_CGROUP_UNIFIED_K8S.to_string(), + DIR_CGROUP_BLKIO_K8S1.to_string(), + DIR_CGROUP_CPU_K8S1.to_string(), + DIR_CGROUP_CPUSET_K8S1.to_string(), + DIR_CGROUP_DEVICES_K8S1.to_string(), + DIR_CGROUP_FREEZER_K8S1.to_string(), + DIR_CGROUP_HUGETLB_K8S1.to_string(), + DIR_CGROUP_MEMORY_K8S1.to_string(), + DIR_CGROUP_NET_K8S1.to_string(), + DIR_CGROUP_PERF_K8S1.to_string(), + DIR_CGROUP_PIDS_K8S1.to_string(), + DIR_CGROUP_SYSTEMD_K8S1.to_string(), + DIR_CGROUP_UNIFIED_K8S1.to_string(), + DIR_CGROUP_BLKIO_K8S2.to_string(), + DIR_CGROUP_CPU_K8S2.to_string(), + DIR_CGROUP_CPUSET_K8S2.to_string(), + DIR_CGROUP_DEVICES_K8S2.to_string(), + DIR_CGROUP_FREEZER_K8S2.to_string(), + DIR_CGROUP_HUGETLB_K8S2.to_string(), + DIR_CGROUP_MEMORY_K8S2.to_string(), + DIR_CGROUP_NET_K8S2.to_string(), + DIR_CGROUP_PERF_K8S2.to_string(), + DIR_CGROUP_PIDS_K8S2.to_string(), + DIR_CGROUP_SYSTEMD_K8S2.to_string(), + DIR_CGROUP_UNIFIED_K8S2.to_string(), + DIR_CGROUP_BLKIO_CONTAINERD_K8S.to_string(), + DIR_CGROUP_CPU_CONTAINERD_K8S.to_string(), + DIR_CGROUP_CPUSET_CONTAINERD_K8S.to_string(), + DIR_CGROUP_DEVICES_CONTAINERD_K8S.to_string(), + DIR_CGROUP_FREEZER_CONTAINERD_K8S.to_string(), + 
DIR_CGROUP_HUGETLB_CONTAINERD_K8S.to_string(), + DIR_CGROUP_MEMORY_CONTAINERD_K8S.to_string(), + DIR_CGROUP_NET_CONTAINERD_K8S.to_string(), + DIR_CGROUP_PERF_CONTAINERD_K8S.to_string(), + DIR_CGROUP_PIDS_CONTAINERD_K8S.to_string(), + DIR_CGROUP_SYSTEMD_CONTAINERD_K8S.to_string(), + DIR_CGROUP_UNIFIED_CONTAINERD_K8S.to_string(), DIR_CGROUP_BLKIO_DOCKER.to_string(), DIR_CGROUP_CPU_DOCKER.to_string(), DIR_CGROUP_CPUSET_DOCKER.to_string(), @@ -280,6 +413,15 @@ impl Settings { s.set( "allowed_paths_access_restricted", vec![ + GROUP.to_string(), + NS_IPC.to_string(), + NS_MNT.to_string(), + NS_NET.to_string(), + NS_PID.to_string(), + PIPE.to_string(), + NS_TIME.to_string(), + NS_USER.to_string(), + NS_UTS.to_string(), DIR_BIN.to_string(), DIR_DEV_CONSOLE.to_string(), DIR_DEV_FULL.to_string(), @@ -291,8 +433,12 @@ impl Settings { DIR_ETC.to_string(), DIR_HOME.to_string(), DIR_LIB.to_string(), + DIR_LIB64.to_string(), + PAUSE.to_string(), DIR_PROC.to_string(), + DIR_RUN.to_string(), DIR_CGROUP.to_string(), + DIR_MM.to_string(), DIR_TMP.to_string(), DIR_USR.to_string(), DIR_VAR.to_string(), @@ -301,6 +447,15 @@ impl Settings { s.set( "allowed_paths_access_baseline", vec![ + GROUP.to_string(), + NS_IPC.to_string(), + NS_MNT.to_string(), + NS_NET.to_string(), + NS_PID.to_string(), + PIPE.to_string(), + NS_TIME.to_string(), + NS_USER.to_string(), + NS_UTS.to_string(), DIR_BIN.to_string(), DIR_DEV_CONSOLE.to_string(), DIR_DEV_FULL.to_string(), @@ -312,8 +467,12 @@ impl Settings { DIR_ETC.to_string(), DIR_HOME.to_string(), DIR_LIB.to_string(), + DIR_LIB64.to_string(), + PAUSE.to_string(), DIR_PROC.to_string(), + DIR_RUN.to_string(), DIR_CGROUP.to_string(), + DIR_MM.to_string(), DIR_TMP.to_string(), DIR_USR.to_string(), DIR_VAR.to_string(), @@ -321,11 +480,11 @@ impl Settings { )?; s.set( "denied_paths_access_restricted", - vec![DIR_PROC_ACPI.to_string()], + vec![DIR_PROC_ACPI.to_string(), DIR_PROC_SYS.to_string()], )?; s.set( "denied_paths_access_baseline", - 
vec![DIR_PROC_ACPI.to_string(), DIR_PROC_SYS.to_string()], + vec![DIR_PROC_ACPI.to_string()], )?; s.merge(config::File::with_name("/etc/lockc/lockc.toml").required(false))?; diff --git a/lockc/src/uprobe_ext.rs b/lockc/src/uprobe_ext.rs new file mode 100644 index 0000000..bd02196 --- /dev/null +++ b/lockc/src/uprobe_ext.rs @@ -0,0 +1,167 @@ +//! Extensions for libbpf-rs uprobe functionality. Specifically, we add a higher level +//! interface for resolving symbols from ELF binaries for uprobe attachment as well as +//! attaching uprobes to a function address in our current address space. +//! +//! Based on a similar module in bpfcontain-rs: +//! https://github.com/willfindlay/bpfcontain-rs/blob/ba4fde80b6bc75ef340dd22ac921206b18e350ab/src/uprobe_ext.rs + +use std::{fs::read, io, path::Path}; + +use goblin::elf::{Elf, Sym}; + +use procfs::process::Process; +use thiserror::Error; + +/// Resolves symbols from an ELF file +/// Based on https://github.com/ingraind/redbpf/blob/main/redbpf/src/symbols.rs +struct SymbolResolver<'a> { + elf: Elf<'a>, +} + +#[derive(Error, Debug)] +pub enum FindInFileError { + #[error(transparent)] + IO(#[from] io::Error), + + #[error(transparent)] + Goblin(#[from] goblin::error::Error), + + #[error("failed to find symbol")] + NotFound, +} + +impl<'a> SymbolResolver<'a> { + /// Find a symbol offset within a file specified by `pathname` + pub fn find_in_file(pathname: &Path, symbol: &str) -> Result<usize, FindInFileError> { + let bytes = read(pathname)?; + let resolver = Self::parse(&bytes)?; + let offset = resolver.find_offset(symbol); + match offset { + Some(o) => Ok(o), + None => Err(FindInFileError::NotFound), + } + } + + /// Parse an ELF file and return a [`SymbolResolver`] + pub fn parse(bytes: &[u8]) -> Result<SymbolResolver, goblin::error::Error> { + let elf = Elf::parse(bytes)?; + Ok(SymbolResolver { elf }) + } + + /// Resolve a symbol in the ELF file + fn resolve_sym(&self, symbol: &str) -> Option<Sym> { + self.elf.syms.iter().find(|sym| { + self.elf + .strtab + .get_at(sym.st_name) + .map(|sym| sym 
== symbol) + .unwrap_or(false) + }) + } + + /// Find the offset of a symbol in the ELF file + pub fn find_offset(&self, symbol: &str) -> Option<usize> { + self.resolve_sym(symbol).map(|sym| sym.st_value as usize) + } +} + +#[derive(Error, Debug)] +pub enum AttachUprobeSymbolError { + #[error(transparent)] + Libbpf(#[from] libbpf_rs::Error), + + #[error(transparent)] + FindInFile(#[from] FindInFileError), +} + +#[derive(thiserror::Error, Debug)] +pub enum AttachUprobeAddrError { + #[error(transparent)] + Libbpf(#[from] libbpf_rs::Error), + + #[error(transparent)] + Proc(#[from] procfs::ProcError), + + #[error("failed to find executable region")] + NotFound, +} + +pub trait FindSymbolUprobeExt { + fn attach_uprobe_symbol( + &mut self, + retprobe: bool, + pid: i32, + pathname: &Path, + symbol: &str, + ) -> Result<libbpf_rs::Link, AttachUprobeSymbolError>; + + fn attach_uprobe_addr( + &mut self, + retprobe: bool, + pid: i32, + addr: usize, + ) -> Result<libbpf_rs::Link, AttachUprobeAddrError>; +} + +impl FindSymbolUprobeExt for libbpf_rs::Program { + /// Attach a uprobe to a symbol within another binary. + fn attach_uprobe_symbol( + &mut self, + retprobe: bool, + pid: i32, + pathname: &Path, + symbol: &str, + ) -> Result<libbpf_rs::Link, AttachUprobeSymbolError> { + // Find symbol in the ELF file + let offset = SymbolResolver::find_in_file(pathname, symbol)?; + + // Use the offset we found to attach the probe + match self.attach_uprobe(retprobe, pid, pathname, offset) { + Ok(link) => Ok(link), + Err(e) => Err(AttachUprobeSymbolError::from(e)), + } + } + + /// Attach a uprobe to an address within our own address space. + fn attach_uprobe_addr( + &mut self, + retprobe: bool, + pid: i32, + addr: usize, + ) -> Result<libbpf_rs::Link, AttachUprobeAddrError> { + // Find the offset + let base_addr = get_base_addr()?; + let offset = addr - base_addr; + + let pathname = "/proc/self/exe"; + + // Use the offset we found to attach the probe + match self.attach_uprobe(retprobe, pid, pathname, offset) { + Ok(link) => Ok(link), + Err(e) => Err(AttachUprobeAddrError::from(e)), + } + } +} + +/// Find our base load address. 
We use /proc/self/maps for this. +fn get_base_addr() -> Result<usize, AttachUprobeAddrError> { + let me = Process::myself()?; + let maps = me.maps()?; + + for entry in maps { + if entry.perms.contains("r-xp") { + return Ok((entry.address.0 - entry.offset) as usize); + } + } + + Err(AttachUprobeAddrError::NotFound) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn get_base_addr_smoke_test() { + get_base_addr().expect("Calling get_base_addr failed"); + } +}