Skip to content

Commit

Permalink
Add hyperNode controller framework and provider
Browse files Browse the repository at this point in the history
Signed-off-by: Monokaix <changxuzheng@huawei.com>
  • Loading branch information
Monokaix committed Feb 20, 2025
1 parent 1d69621 commit bbe2df1
Show file tree
Hide file tree
Showing 15 changed files with 572 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,5 @@ vendor

# helm dependency files
installer/helm/chart/volcano/requirements.lock

*.so
11 changes: 8 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ IMAGE_PREFIX=volcanosh
CRD_OPTIONS ?= "crd:crdVersions=v1,generateEmbeddedObjectMeta=true"
CRD_OPTIONS_EXCLUDE_DESCRIPTION=${CRD_OPTIONS}",maxDescLen=0"
CC ?= "gcc"
MUSL_CC ?= "/usr/local/musl/bin/musl-gcc"
SUPPORT_PLUGINS ?= "no"
CRD_VERSION ?= v1
BUILDX_OUTPUT_TYPE ?= "docker"
Expand Down Expand Up @@ -73,13 +74,17 @@ init:

vc-scheduler: init
if [ ${SUPPORT_PLUGINS} = "yes" ];then\
CC=${CC} CGO_ENABLED=1 go build -ldflags ${LD_FLAGS} -o ${BIN_DIR}/vc-scheduler ./cmd/scheduler;\
CC=${MUSL_CC} CGO_ENABLED=1 go build -ldflags ${LD_FLAGS} -o ${BIN_DIR}/vc-scheduler ./cmd/scheduler;\
else\
CC=${CC} CGO_ENABLED=0 go build -ldflags ${LD_FLAGS} -o ${BIN_DIR}/vc-scheduler ./cmd/scheduler;\
fi;

vc-controller-manager: init
CC=${CC} CGO_ENABLED=0 go build -ldflags ${LD_FLAGS} -o ${BIN_DIR}/vc-controller-manager ./cmd/controller-manager
if [ ${SUPPORT_PLUGINS} = "yes" ];then\
CC=${MUSL_CC} CGO_ENABLED=1 go build -ldflags ${LD_FLAGS} -o ${BIN_DIR}/vc-controller-manager ./cmd/controller-manager;\
else\
CC=${CC} CGO_ENABLED=0 go build -ldflags ${LD_FLAGS} -o ${BIN_DIR}/vc-controller-manager ./cmd/controller-manager;\
fi;

vc-webhook-manager: init
CC=${CC} CGO_ENABLED=0 go build -ldflags ${LD_FLAGS} -o ${BIN_DIR}/vc-webhook-manager ./cmd/webhook-manager
Expand All @@ -94,7 +99,7 @@ vcctl: init
image_bins: vc-scheduler vc-controller-manager vc-webhook-manager vc-agent

images:
for name in controller-manager scheduler webhook-manager agent; do\
for name in controller-manager ; do\
docker buildx build -t "${IMAGE_PREFIX}/vc-$$name:$(TAG)" . -f ./installer/dockerfile/$$name/Dockerfile --output=type=${BUILDX_OUTPUT_TYPE} --platform ${DOCKER_PLATFORMS} --build-arg APK_MIRROR=${APK_MIRROR} --build-arg OPEN_EULER_IMAGE_TAG=${OPEN_EULER_IMAGE_TAG}; \
done

Expand Down
3 changes: 2 additions & 1 deletion Makefile.def
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ GitSHA=`git rev-parse HEAD`
Date=`date "+%Y-%m-%d %H:%M:%S"`
RELEASE_VER=v1.11.0-network-topology-preview.0
OPEN_EULER_IMAGE_TAG ?= 22.03-lts-sp2
LD_FLAGS=" \
LD_FLAGS="\
-linkmode=external \
-X '${REPO_PATH}/pkg/version.GitSHA=${GitSHA}' \
-X '${REPO_PATH}/pkg/version.Built=${Date}' \
-X '${REPO_PATH}/pkg/version.Version=${RELEASE_VER}'"
Expand Down
4 changes: 4 additions & 0 deletions cmd/controller-manager/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ type ServerOption struct {
// Case3: "-gc-controller,-job-controller,-jobflow-controller,-jobtemplate-controller,-pg-controller,-queue-controller"
// to disable specific controllers,
Controllers []string
// HyperNodeProviderDir is the directory of hyperNode provider, vc controller
// read .so file in this directory to load hyperNode provider plugins.
HyperNodeProviderDir string
}

type DecryptFunc func(c *ServerOption) error
Expand Down Expand Up @@ -129,6 +132,7 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet, knownControllers []string) {
fs.Uint32Var(&s.WorkerThreadsForQueue, "worker-threads-for-queue", defaultQueueWorkers, "The number of threads syncing queue operations. The larger the number, the faster the queue processing, but requires more CPU load.")
fs.StringSliceVar(&s.Controllers, "controllers", []string{defaultControllers}, fmt.Sprintf("Specify controller gates. Use '*' for all controllers, all knownController: %s ,and we can use "+
"'-' to disable controllers, e.g. \"-job-controller,-queue-controller\" to disable job and queue controllers.", knownControllers))
fs.StringVar(&s.HyperNodeProviderDir, "hypernode-provider-dir", "", "The directory of hyperNode provider, vc controller read .so file in this directory to load hyperNode provider plugin.")
}

// CheckOptionOrDie checks all options and returns all errors if they are invalid.
Expand Down
1 change: 1 addition & 0 deletions cmd/controller-manager/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ func startControllers(config *rest.Config, opt *options.ServerOption) func(ctx c
controllerOpt.WorkerThreadsForQueue = opt.WorkerThreadsForQueue
controllerOpt.WorkerThreadsForGC = opt.WorkerThreadsForGC
controllerOpt.Config = config
controllerOpt.HyperNodeProviderDir = opt.HyperNodeProviderDir

return func(ctx context.Context) {
framework.ForeachController(func(c framework.Controller) {
Expand Down
1 change: 1 addition & 0 deletions cmd/controller-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"volcano.sh/volcano/cmd/controller-manager/app/options"
"volcano.sh/volcano/pkg/controllers/framework"
_ "volcano.sh/volcano/pkg/controllers/garbagecollector"
_ "volcano.sh/volcano/pkg/controllers/hypernode"
_ "volcano.sh/volcano/pkg/controllers/job"
_ "volcano.sh/volcano/pkg/controllers/jobflow"
_ "volcano.sh/volcano/pkg/controllers/jobtemplate"
Expand Down
31 changes: 31 additions & 0 deletions example/hypernode-provider/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Builder stage: compiles musl, the example hyperNode provider plugin and a
# plugin-enabled vc-controller-manager binary.
FROM golang:1.22.2 AS builder

WORKDIR /go/src/volcano.sh/

# Install musl.
# The tarball is fetched over HTTPS so the toolchain source cannot be tampered
# with in transit. No USER directive is set in this stage, so it runs as the
# base image's default user and `make install` does not need sudo.
RUN wget https://musl.libc.org/releases/musl-1.2.1.tar.gz && \
    tar -xf musl-1.2.1.tar.gz && \
    cd musl-1.2.1 && \
    ./configure && make && make install

# Copy the module files first so the dependency download layer is cached
# independently of source changes.
COPY go.mod go.sum ./

RUN go mod download

# COPY is sufficient for a plain local directory copy.
COPY . volcano

# Build plugin
RUN cd volcano && CC=/usr/local/musl/bin/musl-gcc CGO_ENABLED=1 \
    go build -buildmode=plugin -ldflags '-linkmode=external' \
    -o example/hypernode-provider/example-provider.so example/hypernode-provider/example_provider.go

# Build vc controller base manager image with plugin enabled
RUN cd volcano && SUPPORT_PLUGINS=yes make vc-controller-manager

# Build vc controller manager image with plugin
FROM volcanosh/vc-controller-manager:latest
COPY --from=builder /go/src/volcano.sh/volcano/_output/bin/vc-controller-manager /vc-controller-manager
COPY --from=builder /go/src/volcano.sh/volcano/example/hypernode-provider/example-provider.so /
15 changes: 15 additions & 0 deletions example/hypernode-provider/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

# Overview

As stated in the [network topology aware scheduling](https://github.com/volcano-sh/volcano/blob/master/docs/design/Network%20Topology%20Aware%20Scheduling.md#network-topology-generation-and-update) design, the network topology details of each datacenter are different. Besides creating and updating the hyperNodes CRD manually, Volcano should also support a method that auto-discovers the network topology and updates the hyperNodes CRD automatically. This needs cooperation with the underlying hardware vendors: the Volcano controller provides the basic hyperNodes reconcile framework and exposes an interface for interacting with the hardware vendors, while the vendors behave as hyperNode providers and reconcile hyperNodes, e.g. creating/updating them and reporting their health status. Through this, Volcano can adapt to any hardware vendor that supports a network topology auto-discovery tool; vendors only need to focus on the auto-discovery mechanism, while Volcano supplies the basic framework and integrates them in an extensible way. This is similar to the [cloud provider mechanism](https://github.com/kubernetes/cloud-provider) in Kubernetes.

# How to use

## Write your provider
Write your code locally and implement the `Plugin` interface defined in `pkg/controllers/hypernode/provider/interface.go`. There are two critical parameters that you need to pay attention to:
- `eventCh chan<- Event`: Vendors should send hyperNode create/update/delete events to this channel, and the Volcano controller will communicate with the API Server to store them.
- `replyCh <-chan Reply`: Volcano replies to vendor providers with errors through this channel when an unexpected error occurs while communicating with the API Server and the retry does not succeed. Providers should be aware of that and should resend the event or perform fault-tolerant processing.

There is an example in `example/hypernode-provider/example_provider.go` that demonstrates how to write a provider.

## Build the provider with volcano controller
112 changes: 112 additions & 0 deletions example/hypernode-provider/example_provider.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
Copyright 2025 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"strconv"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
"volcano.sh/apis/pkg/apis/topology/v1alpha1"
topologyinformerv1alpha1 "volcano.sh/apis/pkg/client/informers/externalversions/topology/v1alpha1"

"volcano.sh/volcano/pkg/controllers/hypernode/provider"
)

// New builds the example hyperNode provider with a fresh stop channel and
// returns it as a provider.Plugin.
func New() provider.Plugin {
	p := new(exampleProvider)
	p.stopChan = make(chan struct{})
	return p
}

// exampleProvider is an example provider of hyperNodes.
// It shows how a vendor plugin can push hyperNode events to the controller
// and consume the replies it gets back.
type exampleProvider struct {
// stopChan is closed by Stop; sendEvent checks it to know when to exit.
stopChan chan struct{}
// hyperNodeInformer is the informer handed to the provider in Start.
hyperNodeInformer topologyinformerv1alpha1.HyperNodeInformer
}

// Name reports the identifier of this example vendor provider.
func (e *exampleProvider) Name() string {
	const providerName = "example-provider"
	return providerName
}

// Start launches the example provider.
//
// It remembers the supplied informer and spawns two goroutines: one that
// consumes replies from replyCh and one that publishes hyperNode events on
// eventCh. It does not wait for either goroutine and always returns nil.
func (e *exampleProvider) Start(eventCh chan<- provider.Event, replyCh <-chan provider.Reply, informer topologyinformerv1alpha1.HyperNodeInformer) error {
	e.hyperNodeInformer = informer

	// Reply consumer first, then the event producer.
	go e.receiveReply(replyCh)
	go e.sendEvent(eventCh)

	return nil
}

// Stop shuts the provider down by closing its stop channel, which signals
// the goroutines watching it to exit. It always returns nil.
func (e *exampleProvider) Stop() error {
	klog.InfoS("exampleProvider stopped")

	// Closing (rather than sending) wakes every receiver on stopChan.
	close(e.stopChan)

	return nil
}

// sendEvent publishes a fixed number of example hyperNode add events on
// eventCh, pausing between sends to imitate a vendor's discovery interval.
//
// The hyperNodes are named "hypernode-0" .. "hypernode-2", each with a single
// exact-match node member "node-<i>". The function returns once all events
// have been sent or as soon as the provider is stopped, whichever comes first.
func (e *exampleProvider) sendEvent(eventCh chan<- provider.Event) {
	const (
		hyperNodeCount = 3
		sendInterval   = 5 * time.Second // analog sending interval
	)

	// The counter is declared in the loop header so it survives across
	// iterations; declaring it inside the body would reset it every time and
	// the loop would never terminate.
	for i := 0; i < hyperNodeCount; i++ {
		suffix := strconv.Itoa(i)
		hn := v1alpha1.HyperNode{
			ObjectMeta: metav1.ObjectMeta{
				Name: "hypernode-" + suffix,
			},
			Spec: v1alpha1.HyperNodeSpec{
				Tier: 1,
				Members: []v1alpha1.MemberSpec{
					{
						Type: v1alpha1.MemberTypeNode,
						Selector: v1alpha1.MemberSelector{
							ExactMatch: &v1alpha1.ExactMatch{
								Name: "node-" + suffix,
							},
						},
					},
				},
			},
		}
		event := provider.Event{Type: provider.EventAdd, HyperNode: hn}

		// Do not block forever on the send if the provider is stopped while
		// the receiver is not draining the channel.
		select {
		case <-e.stopChan:
			klog.InfoS("Stop signal received, exiting sendEvent")
			return
		case eventCh <- event:
			klog.InfoS("Successfully sent add event", "event", event.Type, "hyperNodeName", hn.Name)
		}

		// Wait for the next send, but wake up immediately on stop.
		select {
		case <-e.stopChan:
			klog.InfoS("Stop signal received, exiting sendEvent")
			return
		case <-time.After(sendInterval):
		}
	}
}

// receiveReply logs every reply delivered on replyCh; each reply carries the
// error and hyperNode name of an event that could not be processed.
//
// The goroutine exits when replyCh is closed or when the provider is stopped,
// so it does not outlive Stop if the reply channel is never closed.
func (e *exampleProvider) receiveReply(replyCh <-chan provider.Reply) {
	for {
		select {
		case <-e.stopChan:
			klog.InfoS("Stop signal received, exiting receiveReply")
			return
		case reply, ok := <-replyCh:
			if !ok {
				klog.InfoS("Reply channel closed, exiting receiveReply")
				return
			}
			klog.ErrorS(reply.Error, "Failed to process hyperNode event", "hyperNodeName", reply.HyperNodeName)
		}
	}
}
3 changes: 3 additions & 0 deletions installer/helm/chart/volcano/templates/controllers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ rules:
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "create", "update", "watch"]
- apiGroups: ["topology.volcano.sh"]
resources: ["hypernodes", "hypernodes/status"]
verbs: ["list", "watch", "get", "create", "delete", "update", "patch"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
3 changes: 3 additions & 0 deletions installer/volcano-development.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4410,6 +4410,9 @@ rules:
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "create", "update", "watch"]
- apiGroups: ["topology.volcano.sh"]
resources: ["hypernodes", "hypernodes/status"]
verbs: ["list", "watch", "get", "create", "delete", "update", "patch"]
---
# Source: volcano/templates/controllers.yaml
kind: ClusterRoleBinding
Expand Down
2 changes: 2 additions & 0 deletions pkg/controllers/framework/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ type ControllerOption struct {
// Config holds the common attributes that can be passed to a Kubernetes client
// and controllers registered by the users can use it.
Config *rest.Config
// HyperNodeProviderDir specifies the directory where the hyperNode provider plugins are stored.
HyperNodeProviderDir string
}

// Controller is the interface of all controllers.
Expand Down
73 changes: 73 additions & 0 deletions pkg/controllers/hypernode/hypernode_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
Copyright 2025 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package hypernode

import (
"k8s.io/klog/v2"
vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
vcinformer "volcano.sh/apis/pkg/client/informers/externalversions"
topologyinformerv1alpha1 "volcano.sh/apis/pkg/client/informers/externalversions/topology/v1alpha1"
topologylisterv1alpha1 "volcano.sh/apis/pkg/client/listers/topology/v1alpha1"

"volcano.sh/volcano/pkg/controllers/framework"
"volcano.sh/volcano/pkg/controllers/hypernode/provider"
)

// init registers the hyperNode controller with the controller framework when
// this package is imported.
func init() {
	framework.RegisterController(new(hyperNodeController))
}

const (
// name is the controller name reported by hyperNodeController.Name.
name = "hyperNode-controller"
)

// hyperNodeController is the controller registered with the framework for
// hyperNodes; it starts the informers and hands control to the provider.
type hyperNodeController struct {
// vcClient is the Volcano clientset taken from the controller options.
vcClient vcclientset.Interface
// vcInformerFactory is the shared informer factory started in Run.
vcInformerFactory vcinformer.SharedInformerFactory
// hyperNodeInformer is the HyperNode informer obtained from the factory.
hyperNodeInformer topologyinformerv1alpha1.HyperNodeInformer
// hyperNodeLister is the lister backed by hyperNodeInformer.
hyperNodeLister topologylisterv1alpha1.HyperNodeLister
// provider is created in Initialize; its Provision method is launched by Run.
provider provider.Provider
}

// Run starts the shared informers, waits for their caches to sync and then
// launches the provider's Provision loop in a separate goroutine.
// If any informer cache fails to sync, the failure is logged and Run returns
// without starting the provider.
func (hn *hyperNodeController) Run(stopCh <-chan struct{}) {
	hn.vcInformerFactory.Start(stopCh)

	syncResults := hn.vcInformerFactory.WaitForCacheSync(stopCh)
	for typ, synced := range syncResults {
		if synced {
			continue
		}
		klog.ErrorS(nil, "Failed to sync cache", "type", typ)
		return
	}

	go hn.provider.Provision(stopCh)
}

// Name reports the name under which the hyperNode controller is known.
func (hn *hyperNodeController) Name() string { return name }

// Initialize wires the controller from the framework options: it stores the
// Volcano client and shared informer factory, derives the HyperNode informer
// and lister from that factory, and creates the provider using the configured
// provider directory. It always returns nil.
func (hn *hyperNodeController) Initialize(opt *framework.ControllerOption) error {
	informerFactory := opt.VCSharedInformerFactory
	hyperNodes := informerFactory.Topology().V1alpha1().HyperNodes()

	hn.vcClient = opt.VolcanoClient
	hn.vcInformerFactory = informerFactory
	hn.hyperNodeInformer = hyperNodes
	hn.hyperNodeLister = hyperNodes.Lister()
	hn.provider = provider.NewProvider(hn.vcClient, informerFactory, opt.HyperNodeProviderDir)

	return nil
}
Loading

0 comments on commit bbe2df1

Please # to comment.