diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index bed511c0a..aed0ead88 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@v4
- name: Checkout submodule
uses: Mushus/checkout-submodule@v1.0.1
diff --git a/README.md b/README.md
index 54f981ff4..175e0260b 100644
--- a/README.md
+++ b/README.md
@@ -157,12 +157,16 @@ sudo systemctl daemon-reload && systemctl restart containerd
Label your nodes
-Label your GPU nodes for scheduling with HAMi by adding the label "gpu=on". Without this label, the nodes cannot be managed by our scheduler.
+Finally, label the nodes that need to use GPUs. There are currently two ways to label a node — manually, or automatically — controlled by the `autoValidatorDriver` variable.
+- Manual labeling: set `autoValidatorDriver` to `false` and run the following command to label the node.
```
-kubectl label nodes {nodeid} gpu=on
+$ kubectl label nodes {nodeid} hami.io/nvidia-selector=on
```
+- Automatic labeling: set `autoValidatorDriver` to `true`, and a `daemonset` will be started that automatically detects whether the `NVIDIA` driver on the node is ready. If it is ready, the node is labeled `hami.io/nvidia-selector=on`; otherwise it is labeled `hami.io/nvidia-selector=off`.
+
+
### Install and Uninstall
diff --git a/README_cn.md b/README_cn.md
index dc7d7f2e1..b637ba521 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -138,12 +138,15 @@ systemctl daemon-reload && systemctl restart containerd
为GPU节点打上标签
-最后,你需要将所有要使用到的GPU节点打上gpu=on标签,否则该节点不会被调度到
+最后,在需要使用`GPU`的节点上打上`label`,目前提供了两种方式给节点打`label`,通过控制`autoValidatorDriver`变量来手动或者自动打`label`。
+- 手动打`label`,通过如下命令在节点上打`label`,并设置`autoValidatorDriver`为`false`。
```
-$ kubectl label nodes {nodeid} gpu=on
+$ kubectl label nodes {nodeid} hami.io/nvidia-selector=on
```
+- 自动打`label`,通过设置`autoValidatorDriver`为`true`,会启动一个`daemonset`,自动探测节点上的`NVIDIA`驱动是否准备好,准备好则给节点设置`hami.io/nvidia-selector=on` label,否则为`hami.io/nvidia-selector=off` label.
+
### 安装,更新与卸载
diff --git a/charts/hami/templates/device-plugin/daemonsetvalidator-configmap.yaml b/charts/hami/templates/device-plugin/daemonsetvalidator-configmap.yaml
new file mode 100644
index 000000000..a0dc3cbe1
--- /dev/null
+++ b/charts/hami/templates/device-plugin/daemonsetvalidator-configmap.yaml
@@ -0,0 +1,52 @@
+{{- if .Values.devicePlugin.autoValidatorDriver }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: driver-validator-configmap
+  labels:
+    app.kubernetes.io/component: 4pd-driver-validator
+data:
+  validator.sh: |
+    #!/bin/sh
+    # Detect whether the NVIDIA driver comes from the host or from a driver
+    # container, then label the node (hami.io/nvidia-selector=on|off) so the
+    # scheduler only places GPU workloads on nodes whose driver is ready.
+    if [ -f "/host/usr/bin/nvidia-smi" ]; then
+      driverRoot="/host"
+      isHostDriver="true"
+    else
+      driverRoot="/run/nvidia/driver"
+      isHostDriver="false"
+    fi
+
+    # Container-provided driver: wait until the driver container has created
+    # its root mount at /run/nvidia/driver before probing anything.
+    if [ "$isHostDriver" = "false" ]; then
+      while [ ! -d "/run/nvidia/driver" ]; do
+        echo "current node $NODE_NAME has no GPU device"
+        sleep 10
+      done
+    fi
+
+    sleep_interval=5
+    # Keep the node labelled "off" until the driver container reports ready.
+    if [ "$isHostDriver" = "false" ]; then
+      while true; do
+        if stat /run/nvidia/validations/.driver-ctr-ready > /dev/null 2>&1; then
+          break
+        fi
+        kubectl label node $NODE_NAME hami.io/nvidia-selector="off" --overwrite
+        sleep $sleep_interval
+      done
+    fi
+    # Label the node "on" once nvidia-smi succeeds inside the driver root.
+    while true; do
+      if chroot "$driverRoot" nvidia-smi; then
+        sleep 30
+        kubectl label node $NODE_NAME hami.io/nvidia-selector="on" --overwrite
+        break
+      fi
+      sleep $sleep_interval
+    done
+    echo "validations are successful"; sleep infinity
+{{- end }}
diff --git a/charts/hami/templates/device-plugin/daemonsetvalidator.yaml b/charts/hami/templates/device-plugin/daemonsetvalidator.yaml
new file mode 100644
index 000000000..693de896d
--- /dev/null
+++ b/charts/hami/templates/device-plugin/daemonsetvalidator.yaml
@@ -0,0 +1,73 @@
+{{- if .Values.devicePlugin.autoValidatorDriver }}
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: driver-validator
+  labels:
+    app.kubernetes.io/component: 4pd-driver-validator
+    {{- with .Values.global.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- if .Values.global.annotations }}
+  annotations: {{ toYaml .Values.global.annotations | nindent 4 }}
+  {{- end }}
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: 4pd-driver-validator
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: 4pd-driver-validator
+        4pd.io/webhook: ignore
+    spec:
+      serviceAccountName: {{ include "4pd-vgpu.device-plugin" . }}
+      priorityClassName: system-node-critical
+      containers:
+        - name: driver-validator
+          image: '{{ .Values.devicePlugin.image }}:{{ .Values.version }}'
+          # Run the script through sh explicitly: ConfigMap volumes are
+          # mounted with mode 0644, so the file itself is not executable.
+          command:
+            - /bin/sh
+            - /scripts/validator.sh
+          env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          volumeMounts:
+            - name: host-root
+              mountPath: /host
+              mountPropagation: HostToContainer
+              readOnly: true
+            - name: driver-install-path
+              mountPath: /run/nvidia/driver
+              mountPropagation: HostToContainer
+              readOnly: true
+            # The validator only stat()s the ready-file; Bidirectional
+            # propagation would require a privileged container.
+            - name: run-nvidia-validations
+              mountPath: /run/nvidia/validations
+              mountPropagation: HostToContainer
+            - name: validator-configmap
+              mountPath: /scripts
+          resources:
+            limits:
+              cpu: 10m
+              # "Mi" is a valid quantity suffix; "Mb" is rejected by the API server.
+              memory: 50Mi
+      volumes:
+        - name: driver-install-path
+          hostPath:
+            path: /run/nvidia/driver
+        - name: host-root
+          hostPath:
+            path: /
+        - name: validator-configmap
+          configMap:
+            name: driver-validator-configmap
+        - name: run-nvidia-validations
+          hostPath:
+            path: /run/nvidia/validations
+            type: DirectoryOrCreate
+{{- end }}
diff --git a/charts/hami/values.yaml b/charts/hami/values.yaml
index aae46d1ed..03cc190aa 100644
--- a/charts/hami/values.yaml
+++ b/charts/hami/values.yaml
@@ -112,10 +112,10 @@ devicePlugin:
podAnnotations: {}
nvidianodeSelector:
- gpu: "on"
+ hami.io/nvidia-selector: "on"
mlunodeSelector:
mlu: "on"
hygonnodeSelector:
dcu: "on"
tolerations: []
-
+ autoValidatorDriver: false
diff --git a/docker/Dockerfile b/docker/Dockerfile
index e6b4308bb..d842aa676 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -16,9 +16,12 @@ RUN wget https://cmake.org/files/v3.19/cmake-3.19.8-Linux-x86_64.tar.gz
 RUN tar -xf cmake-3.19.8-Linux-x86_64.tar.gz
 RUN cp /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake3
 ENV PATH="/libvgpu/cmake-3.19.8-Linux-x86_64/bin:${PATH}"
-RUN apt-get -y install openssl libssl-dev
+RUN apt-get -y install openssl libssl-dev curl
+ARG KUBERCTL_VERSION
+RUN curl -fL https://dl.k8s.io/release/${KUBERCTL_VERSION}/bin/linux/amd64/kubectl --output /bin/kubectl && chmod +x /bin/kubectl
 RUN bash ./build.sh
+
 FROM nvidia/cuda:12.2.0-base-ubuntu22.04
 ENV NVIDIA_DISABLE_REQUIRE="true"
 ENV NVIDIA_VISIBLE_DEVICES=all
@@ -32,6 +35,7 @@ COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin
 COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh
 COPY ./lib /k8s-vgpu/lib
 COPY --from=NVBUILD /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/
+COPY --from=NVBUILD /bin/kubectl /bin/kubectl
 COPY ./lib/mlu/cntopo /usr/bin/
 COPY ./lib/mlu/libcndev.so /usr/lib/
diff --git a/go.sum b/go.sum
index fd7cd579e..f0bab5e89 100644
--- a/go.sum
+++ b/go.sum
@@ -1211,6 +1211,7 @@ github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfn
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/pty v1.1.3/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA=
diff --git a/hack/build.sh b/hack/build.sh
index 6c0736e94..aa6f2ad19 100755
--- a/hack/build.sh
+++ b/hack/build.sh
@@ -25,6 +25,7 @@ export LATEST_VERSION="latest"
export GOLANG_IMAGE="golang:1.21-bullseye"
export NVIDIA_IMAGE="nvidia/cuda:12.2.0-devel-ubuntu20.04"
export DEST_DIR="/usr/local"
+export KUBERCTL_VERSION="v1.29.0"
IMAGE=${IMAGE-"projecthami/hami"}
@@ -34,7 +35,7 @@ function go_build() {
}
function docker_build() {
- docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile .
+ docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} --build-arg KUBERCTL_VERSION=${KUBERCTL_VERSION} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile .
docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${SHORT_VERSION}"
docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${LATEST_VERSION}"
}