diff --git a/.github/workflows/e2e_hypernode.yaml b/.github/workflows/e2e_hypernode.yaml
new file mode 100644
index 0000000000..c80beda026
--- /dev/null
+++ b/.github/workflows/e2e_hypernode.yaml
@@ -0,0 +1,60 @@
+name: E2E Hypernode
+
+on:
+  push:
+    branches:
+      - master
+    tags:
+  pull_request:
+
+jobs:
+  e2e_hypernode:
+    runs-on: ubuntu-24.04
+    name: E2E about Hypernode
+    timeout-minutes: 40
+    steps:
+      - name: Install Go
+        uses: actions/setup-go@v4
+        with:
+          go-version: 1.22.x
+
+      - name: Install musl
+        run: |
+          wget https://musl.libc.org/releases/musl-1.2.1.tar.gz
+          tar -xf musl-1.2.1.tar.gz && cd musl-1.2.1
+          ./configure
+          make && sudo make install
+
+      - uses: actions/cache@v2
+        with:
+          path: ~/go/pkg/mod
+          key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
+
+      - name: Install dependencies
+        run: |
+          GO111MODULE="on" go install sigs.k8s.io/kind@v0.24.0
+          curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.31.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl
+
+      - name: Install kwok
+        run: |
+          helm repo add kwok https://kwok.sigs.k8s.io/charts/
+          helm repo update
+          helm upgrade --namespace kube-system --install kwok kwok/kwok
+          helm upgrade --install kwok kwok/stage-fast
+          # Delete the pod-complete stage so kwok does not move volcano job pods to a completed state.
+          kubectl delete stage pod-complete
+
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Run E2E Tests
+        run: |
+          export ARTIFACTS_PATH=${{ github.workspace }}/e2e-hypernode-logs
+          make e2e-test-hypernode CC=/usr/local/musl/bin/musl-gcc
+
+      - name: Upload e2e hypernode logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: volcano_e2e_hypernode_logs
+          path: ${{ github.workspace }}/e2e-hypernode-logs
diff --git a/Makefile b/Makefile
index 9175e95756..f46fd09574 100644
--- a/Makefile
+++ b/Makefile
@@ -142,6 +142,9 @@ e2e-test-vcctl: vcctl images
 
 e2e-test-stress: images
 	E2E_TYPE=STRESS ./hack/run-e2e-kind.sh
 
+e2e-test-hypernode: images
+	E2E_TYPE=HYPERNODE ./hack/run-e2e-kind.sh
+
 generate-yaml: init manifests
 	./hack/generate-yaml.sh TAG=${RELEASE_VER} CRD_VERSION=${CRD_VERSION}
diff --git a/hack/lib/install.sh b/hack/lib/install.sh
index 4f77920d91..63496ded4a 100644
--- a/hack/lib/install.sh
+++ b/hack/lib/install.sh
@@ -98,4 +98,12 @@ function install-ginkgo-if-not-exist {
   else
     echo -n "Found ginkgo, version: " && ginkgo version
   fi
-}
\ No newline at end of file
+}
+
+function install-kwok-with-helm {
+  helm repo add kwok https://kwok.sigs.k8s.io/charts/
+  helm upgrade --namespace kube-system --install kwok kwok/kwok
+  helm upgrade --install kwok kwok/stage-fast
+  # Delete the pod-complete stage so kwok does not move volcano job pods to a completed state.
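+  # (The stage-fast chart ships a pod-complete Stage that flips long-running fake pods into a
+  # terminal phase, which would break specs that expect volcano job pods to stay Running.)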
+  kubectl delete stage pod-complete
+}
diff --git a/hack/run-e2e-kind.sh b/hack/run-e2e-kind.sh
index dc04e57ab3..67bcbd76f2 100755
--- a/hack/run-e2e-kind.sh
+++ b/hack/run-e2e-kind.sh
@@ -31,6 +31,57 @@ export CLUSTER_CONTEXT=("--name" "${CLUSTER_NAME}")
 
 export KIND_OPT=${KIND_OPT:="--config ${VK_ROOT}/hack/e2e-kind-config.yaml"}
 
+# kwok node config
+export KWOK_NODE_CPU=${KWOK_NODE_CPU:-4}          # 4 cores
+export KWOK_NODE_MEMORY=${KWOK_NODE_MEMORY:-4Gi}  # 4Gi of memory
+
+# generate kwok node config
+function generate-kwok-node-config() {
+  local node_index=$1
+  local cpu=$2
+  local memory=$3
+
+  cat <<EOF > "${VK_ROOT}/hack/kwok-node-${node_index}.yaml"
+apiVersion: v1
+kind: Node
+metadata:
+  annotations:
+    node.alpha.kubernetes.io/ttl: "0"
+    kwok.x-k8s.io/node: fake
+  labels:
+    beta.kubernetes.io/arch: amd64
+    beta.kubernetes.io/os: linux
+    kubernetes.io/arch: amd64
+    kubernetes.io/hostname: kwok-node-${node_index}
+    kubernetes.io/os: linux
+    kubernetes.io/role: agent
+    node-role.kubernetes.io/agent: ""
+    type: kwok
+  name: kwok-node-${node_index}
+spec:
+  taints:
+    - effect: NoSchedule
+      key: kwok.x-k8s.io/node
+      value: fake
+status:
+  capacity:
+    cpu: "${cpu}"
+    memory: "${memory}"
+    pods: "110"
+  allocatable:
+    cpu: "${cpu}"
+    memory: "${memory}"
+    pods: "110"
+EOF
+}
+
+# install kwok nodes
+function install-kwok-nodes() {
+  local node_count=$1
+  for i in $(seq 0 $((node_count - 1))); do
+    generate-kwok-node-config "$i" "${KWOK_NODE_CPU}" "${KWOK_NODE_MEMORY}"
+    kubectl apply -f "${VK_ROOT}/hack/kwok-node-${i}.yaml"
+  done
+}
 
 function install-volcano {
   install-helm
@@ -121,6 +172,7 @@ source "${VK_ROOT}/hack/lib/install.sh"
 check-prerequisites
 
 kind-up-cluster
+install-kwok-with-helm
 
 if [[ -z ${KUBECONFIG+x} ]]; then
   export KUBECONFIG="${HOME}/.kube/config"
@@ -166,6 +218,12 @@ case ${E2E_TYPE} in
     echo "Running stress e2e suite..."
    KUBECONFIG=${KUBECONFIG} GOOS=${OS} ginkgo -r --slow-spec-threshold='30s' --progress ./test/e2e/stress/
    ;;
+"HYPERNODE")
+    echo "Creating 8 kwok nodes for 3-tier topology"
+    install-kwok-nodes 8
+    echo "Running hypernode e2e suite..."
+    KUBECONFIG=${KUBECONFIG} GOOS=${OS} ginkgo -r --slow-spec-threshold='30s' --progress ./test/e2e/hypernode/
+    ;;
 esac
 
 if [[ $? -ne 0 ]]; then
diff --git a/test/e2e/hypernode/e2e_test.go b/test/e2e/hypernode/e2e_test.go
new file mode 100644
index 0000000000..7e42ce26c4
--- /dev/null
+++ b/test/e2e/hypernode/e2e_test.go
@@ -0,0 +1,13 @@
+package hypernode
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
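+// TestE2E wires gomega's fail handler into ginkgo and runs the hypernode specs.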
"github.com/onsi/gomega" +) + +func TestE2E(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Volcano Network Topology Test Suite") +} diff --git a/test/e2e/hypernode/main_test.go b/test/e2e/hypernode/main_test.go new file mode 100644 index 0000000000..7080119f45 --- /dev/null +++ b/test/e2e/hypernode/main_test.go @@ -0,0 +1,21 @@ +package hypernode + +import ( + "os" + "testing" + + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + + vcclient "volcano.sh/apis/pkg/client/clientset/versioned" + e2eutil "volcano.sh/volcano/test/e2e/util" +) + +func TestMain(m *testing.M) { + home := e2eutil.HomeDir() + configPath := e2eutil.KubeconfigPath(home) + config, _ := clientcmd.BuildConfigFromFlags(e2eutil.MasterURL(), configPath) + e2eutil.VcClient = vcclient.NewForConfigOrDie(config) + e2eutil.KubeClient = kubernetes.NewForConfigOrDie(config) + os.Exit(m.Run()) +} diff --git a/test/e2e/hypernode/network_topology_test.go b/test/e2e/hypernode/network_topology_test.go new file mode 100644 index 0000000000..c0fdeb195b --- /dev/null +++ b/test/e2e/hypernode/network_topology_test.go @@ -0,0 +1,204 @@ +package hypernode + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + + batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1" + topologyv1alpha1 "volcano.sh/apis/pkg/apis/topology/v1alpha1" + e2eutil "volcano.sh/volcano/test/e2e/util" +) + +var _ = Describe("Network Topology Tests", func() { + var ctx *e2eutil.TestContext + + BeforeEach(func() { + ctx = e2eutil.InitTestContext(e2eutil.Options{ + NodesNumLimit: 8, // Need 8 nodes for the 3-tier topology + }) + + // Setup the 3-tier topology structure + By("Setup 3-tier hypernodes") + hyperNodes := []struct { + name string + nodes []string + tier int + }{ + // Tier-1 + {"s0", []string{"kwok-node-0", "kwok-node-1"}, 1}, + {"s1", []string{"kwok-node-2", "kwok-node-3"}, 1}, + {"s2", []string{"kwok-node-4", "kwok-node-5"}, 1}, + {"s3", []string{"kwok-node-6", "kwok-node-7"}, 1}, + // Tier-2 + {"s4", []string{"s0", "s1"}, 2}, + {"s5", []string{"s2", "s3"}, 2}, + // Tier-3 + {"s6", []string{"s4", "s5"}, 3}, + } + + for _, hn := range hyperNodes { + spec := &topologyv1alpha1.HyperNode{ + ObjectMeta: metav1.ObjectMeta{ + Name: hn.name, + }, + Spec: topologyv1alpha1.HyperNodeSpec{ + Tier: hn.tier, + Members: []topologyv1alpha1.MemberSpec{ + { + Type: topologyv1alpha1.MemberTypeNode, + Selector: topologyv1alpha1.MemberSelector{ + ExactMatch: &topologyv1alpha1.ExactMatch{ + Name: hn.nodes[0], + }, + }, + }, + { + Type: topologyv1alpha1.MemberTypeNode, + Selector: topologyv1alpha1.MemberSelector{ + ExactMatch: &topologyv1alpha1.ExactMatch{ + Name: hn.nodes[1], + }, + }, + }, + }, + }, + } + + err := e2eutil.SetupHyperNode(ctx, spec) + Expect(err).NotTo(HaveOccurred()) + } + + // Wait for all hypernodes to be ready + By("Wait for hypernodes to be ready") + for _, hn := range hyperNodes { + Eventually(func() error { + _, err := ctx.Vcclient.TopologyV1alpha1().HyperNodes().Get(context.TODO(), hn.name, metav1.GetOptions{}) + return err + }, 30*time.Second, time.Second).Should(BeNil()) + } + }) + + AfterEach(func() { + e2eutil.CleanupTestContext(ctx) + }) + + Context("Hard Mode Tests", func() { + It("Case 1.1: Schedule to node-0 and node-1 when resources are enough", func() { + By("Create job that fits in s0's resources") + tolerations := []v1.Toleration{ + { + Key: "kwok.x-k8s.io/node", + Operator: 
+			tolerations := []v1.Toleration{
+				{
+					Key:      "kwok.x-k8s.io/node",
+					Operator: v1.TolerationOpEqual,
+					Value:    "fake",
+					Effect:   v1.TaintEffectNoSchedule,
+				},
+			}
+
+			// Pin pod-0 and pod-1 onto kwok-node-0 and kwok-node-1 so that these two nodes
+			// end up with a higher binpack score than the other kwok nodes.
+			pod0 := e2eutil.CreatePod(ctx, e2eutil.PodSpec{
+				Name:        "pod-0",
+				Node:        "kwok-node-0",
+				Req:         e2eutil.CPU1Mem1,
+				Tolerations: tolerations,
+			})
+			pod1 := e2eutil.CreatePod(ctx, e2eutil.PodSpec{
+				Name:        "pod-1",
+				Node:        "kwok-node-1",
+				Req:         e2eutil.CPU1Mem1,
+				Tolerations: tolerations,
+			})
+
+			By("Wait for pod-0 and pod-1 to be ready")
+			err := e2eutil.WaitPodReady(ctx, pod0)
+			Expect(err).NotTo(HaveOccurred())
+			err = e2eutil.WaitPodReady(ctx, pod1)
+			Expect(err).NotTo(HaveOccurred())
+
+			defer func() {
+				ctx.Kubeclient.CoreV1().Pods(pod0.Namespace).Delete(context.TODO(), pod0.Name, metav1.DeleteOptions{})
+				ctx.Kubeclient.CoreV1().Pods(pod1.Namespace).Delete(context.TODO(), pod1.Name, metav1.DeleteOptions{})
+			}()
+
+			job := &e2eutil.JobSpec{
+				Name: "job-1",
+				NetworkTopology: &batchv1alpha1.NetworkTopologySpec{
+					Mode:               batchv1alpha1.HardNetworkTopologyMode,
+					HighestTierAllowed: ptr.To(1),
+				},
+				Tasks: []e2eutil.TaskSpec{
+					{
+						Name:        "task-1",
+						Img:         e2eutil.DefaultNginxImage,
+						Req:         e2eutil.CPU2Mem2,
+						Min:         2,
+						Rep:         2,
+						Tolerations: tolerations,
+					},
+				},
+			}
+			topologyJob := e2eutil.CreateJob(ctx, job)
+
+			By("Wait for job running")
+			err = e2eutil.WaitJobReady(ctx, topologyJob)
+			Expect(err).NotTo(HaveOccurred())
+
+			By("Verify pods are scheduled to kwok-node-0 and kwok-node-1")
+			err = e2eutil.VerifyPodScheduling(ctx, topologyJob, []string{"kwok-node-0", "kwok-node-1"})
+			Expect(err).NotTo(HaveOccurred())
+		})
+
+		It("Case 1.2: Schedule across node-0 and node-1 when node-0 resources are insufficient", func() {
+			By("Create placeholder pod to consume resources on node-0")
+			// Create fake nodes using kwok with the topology structure.
+			// Set up hypernode s0 containing node-0 and node-1.
+			// Create a job that requires more resources than are available on node-0.
+			// Verify pods are scheduled across node-0 and node-1.
+			Skip("TODO: Implement this test")
+		})
+
+		It("Case 1.3: Pods remain pending when no hypernode has sufficient resources", func() {
+			// Create fake nodes using kwok with the topology structure.
+			// Set up hypernodes s0-s3 with limited resources.
+			// Create a job requiring more resources than any hypernode can provide.
+			// Verify pods remain in the Pending state.
+			Skip("TODO: Implement this test")
+		})
+	})
+
+	Context("Tier Tests", func() {
+		It("Case 2: Schedule to tier 2 when tier 1 resources are insufficient", func() {
+			// Create fake nodes using kwok with the topology structure.
+			// Set up tier 1 hypernodes with insufficient resources.
+			// Set up tier 2 hypernode s4 with some existing pods and sufficient resources.
+			// Create a job.
+			// Verify all pods are scheduled to hypernode s4.
+			Skip("TODO: Implement this test")
+		})
+
+		It("Case 3: Schedule to same hypernode when partial pods already running", func() {
+			// Create fake nodes using kwok with the topology structure.
+			// Set up hypernode s4 with some pods from the job already running.
+			// Create the remaining pods for the same job.
+			// Verify the new pods are scheduled to the same hypernode s4.
+			Skip("TODO: Implement this test")
+		})
+	})
+
+	Context("Soft Mode Tests", func() {
+		It("Case 4: Schedule to single hypernode in soft mode", func() {
+			// Create fake nodes using kwok with the topology structure.
+			// Set up tier 1 with insufficient resources.
+			// Set up tier 2 hypernodes s4 and s5 with sufficient resources.
+			// Create a job requiring 4 nodes' worth of resources.
+			// Verify all pods are scheduled to either s4 or s5, but not both.
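+			// Assumption: in soft mode the topology constraint is best-effort, so the job
+			// should still be scheduled even if it cannot fit into a single hypernode.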
+			Skip("TODO: Implement this test")
+		})
+	})
+})
diff --git a/test/e2e/util/hypernode.go b/test/e2e/util/hypernode.go
new file mode 100644
index 0000000000..d5baf6dcbf
--- /dev/null
+++ b/test/e2e/util/hypernode.go
@@ -0,0 +1,69 @@
+package util
+
+import (
+	"context"
+	"fmt"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
+	topologyv1alpha1 "volcano.sh/apis/pkg/apis/topology/v1alpha1"
+)
+
+// SetupHyperNode creates a hypernode with the given configuration.
+func SetupHyperNode(ctx *TestContext, spec *topologyv1alpha1.HyperNode) error {
+	hyperNode := &topologyv1alpha1.HyperNode{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: spec.Name,
+		},
+		Spec: spec.Spec,
+	}
+
+	_, err := ctx.Vcclient.TopologyV1alpha1().HyperNodes().Create(context.TODO(), hyperNode, metav1.CreateOptions{})
+	if err != nil {
+		return fmt.Errorf("failed to create hypernode %s: %v", spec.Name, err)
+	}
+
+	return nil
+}
+
+// VerifyPodScheduling verifies that the job's pods are scheduled onto the expected nodes.
+func VerifyPodScheduling(ctx *TestContext, job *batchv1alpha1.Job, expectedNodes []string) error {
+	// Get all pods of the job.
+	pods := GetTasksOfJob(ctx, job)
+	if len(pods) == 0 {
+		return fmt.Errorf("no pods found for job %s", job.Name)
+	}
+
+	// Build a set for quick lookup of the expected nodes.
+	nodeSet := make(map[string]bool)
+	for _, node := range expectedNodes {
+		nodeSet[node] = true
+	}
+
+	// Verify each pod is scheduled to one of the expected nodes.
+	for _, pod := range pods {
+		if pod.Spec.NodeName == "" {
+			return fmt.Errorf("pod %s/%s is not scheduled", pod.Namespace, pod.Name)
+		}
+		if !nodeSet[pod.Spec.NodeName] {
+			return fmt.Errorf("pod %s/%s is scheduled to unexpected node %s, expected nodes: %v",
+				pod.Namespace, pod.Name, pod.Spec.NodeName, expectedNodes)
+		}
+	}
+
+	return nil
+}
+
+// CleanupHyperNodes deletes all hypernode resources in the cluster.
+func CleanupHyperNodes(ctx *TestContext) error {
+	err := ctx.Vcclient.TopologyV1alpha1().HyperNodes().DeleteCollection(
+		context.TODO(),
+		metav1.DeleteOptions{},
+		metav1.ListOptions{},
+	)
+	if err != nil {
+		return fmt.Errorf("failed to delete hypernodes: %v", err)
+	}
+	return nil
+}
diff --git a/test/e2e/util/job.go b/test/e2e/util/job.go
index 3b3e16a8df..60a8f1601d 100644
--- a/test/e2e/util/job.go
+++ b/test/e2e/util/job.go
@@ -73,6 +73,8 @@ type JobSpec struct {
 	MinSuccess *int32
 	// job max retry
 	MaxRetry int32
+	// network topology constraints of the job (mode hard or soft, highest tier allowed)
+	NetworkTopology *batchv1alpha1.NetworkTopologySpec
 }
 
 func Namespace(context *TestContext, job *JobSpec) string {
@@ -202,6 +204,7 @@ func CreateJobInner(ctx *TestContext, jobSpec *JobSpec) (*batchv1alpha1.Job, err
 			TTLSecondsAfterFinished: jobSpec.TTL,
 			MinSuccess:              jobSpec.MinSuccess,
 			MaxRetry:                jobSpec.MaxRetry,
+			NetworkTopology:         jobSpec.NetworkTopology,
 		},
 	}
diff --git a/test/e2e/util/pod.go b/test/e2e/util/pod.go
new file mode 100644
index 0000000000..4bef143250
--- /dev/null
+++ b/test/e2e/util/pod.go
@@ -0,0 +1,53 @@
+package util
+
+import (
+	"context"
+	"time"
+
+	. "github.com/onsi/gomega"
"github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" +) + +type PodSpec struct { + Name string + Node string + Req v1.ResourceList + Tolerations []v1.Toleration +} + +func CreatePod(ctx *TestContext, spec PodSpec) *v1.Pod { + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: spec.Name, + Namespace: ctx.Namespace, + }, + Spec: v1.PodSpec{ + NodeName: spec.Node, + Containers: []v1.Container{ + { + Image: DefaultNginxImage, + Name: spec.Name, + ImagePullPolicy: v1.PullIfNotPresent, + }, + }, + Tolerations: spec.Tolerations, + }, + } + + pod, err := ctx.Kubeclient.CoreV1().Pods(ctx.Namespace).Create(context.TODO(), pod, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred(), "failed to create pod %s", spec.Name) + + return pod +} + +func WaitPodReady(ctx *TestContext, pod *v1.Pod) error { + return wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { + pod, err := ctx.Kubeclient.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{}) + if err != nil { + return false, err + } + return pod.Status.Phase == v1.PodRunning, nil + }) +} diff --git a/test/e2e/util/util.go b/test/e2e/util/util.go index b08b6ffce1..5cb30bb1fc 100644 --- a/test/e2e/util/util.go +++ b/test/e2e/util/util.go @@ -204,8 +204,12 @@ func FileExist(name string) bool { func CleanupTestContext(ctx *TestContext) { By("Cleaning up test context") + // Clean up hypernodes first + err := CleanupHyperNodes(ctx) + Expect(err).NotTo(HaveOccurred(), "failed to clean up hypernodes") + foreground := metav1.DeletePropagationForeground - err := ctx.Kubeclient.CoreV1().Namespaces().Delete(context.TODO(), ctx.Namespace, metav1.DeleteOptions{ + err = ctx.Kubeclient.CoreV1().Namespaces().Delete(context.TODO(), ctx.Namespace, metav1.DeleteOptions{ PropagationPolicy: &foreground, }) Expect(err).NotTo(HaveOccurred(), "failed to delete namespace")