From ca232fa327db711b59d3b6a26c695580c4d7d60f Mon Sep 17 00:00:00 2001 From: lengrongfu Date: Wed, 7 Feb 2024 13:05:49 +0800 Subject: [PATCH] add validator nvidia driver whether install success Signed-off-by: lengrongfu --- .github/workflows/ci.yaml | 2 +- README.md | 8 ++- README_cn.md | 7 +- .../daemonsetvalidator-configmap.yaml | 51 ++++++++++++++ .../device-plugin/daemonsetvalidator.yaml | 70 +++++++++++++++++++ charts/hami/values.yaml | 4 +- docker/Dockerfile | 6 +- go.sum | 1 + hack/build.sh | 3 +- 9 files changed, 143 insertions(+), 9 deletions(-) create mode 100644 charts/hami/templates/device-plugin/daemonsetvalidator-configmap.yaml create mode 100644 charts/hami/templates/device-plugin/daemonsetvalidator.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bed511c0a..aed0ead88 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Checkout submodule uses: Mushus/checkout-submodule@v1.0.1 diff --git a/README.md b/README.md index 54f981ff4..175e0260b 100644 --- a/README.md +++ b/README.md @@ -157,12 +157,16 @@ sudo systemctl daemon-reload && systemctl restart containerd
Label your nodes -Label your GPU nodes for scheduling with HAMi by adding the label "gpu=on". Without this label, the nodes cannot be managed by our scheduler. +Finally, put a label on each node that needs to use GPU. Currently, there are two ways to label a node: manually, or automatically, controlled by the `autoValidatorDriver` variable. +- Manually put `label`, use the following command to put `label` on the node, and set `autoValidatorDriver` to `false`. ``` -kubectl label nodes {nodeid} gpu=on +$ kubectl label nodes {nodeid} hami.io/nvidia-selector=on ``` +- Automatically put `label`. By setting `autoValidatorDriver` to `true`, a `daemonset` will be started to automatically detect whether the `NVIDIA` driver on the node is ready. If it is ready, set `hami.io/nvidia-selector=on` label, otherwise `hami.io/nvidia-selector=off` label. + +
### Install and Uninstall diff --git a/README_cn.md b/README_cn.md index dc7d7f2e1..b637ba521 100644 --- a/README_cn.md +++ b/README_cn.md @@ -138,12 +138,15 @@ systemctl daemon-reload && systemctl restart containerd
为GPU节点打上标签 -最后，你需要将所有要使用到的GPU节点打上gpu=on标签，否则该节点不会被调度到 +最后，在需要使用`GPU`的节点上打上`label`，目前提供了两种方式给节点打`label`，通过控制`autoValidatorDriver`变量来手动或者自动打`label`。 +- 手动打`label`，通过如下命令在节点上打`label`，并设置`autoValidatorDriver`为`false`。 ``` -$ kubectl label nodes {nodeid} gpu=on +$ kubectl label nodes {nodeid} hami.io/nvidia-selector=on ``` +- 自动打`label`，通过设置`autoValidatorDriver`为`true`，会启动一个`daemonset`，自动探测节点上的`NVIDIA`驱动是否准备好，准备好则给节点设置`hami.io/nvidia-selector=on` label，否则为`hami.io/nvidia-selector=off` label. +
### 安装，更新与卸载 diff --git a/charts/hami/templates/device-plugin/daemonsetvalidator-configmap.yaml b/charts/hami/templates/device-plugin/daemonsetvalidator-configmap.yaml new file mode 100644 index 000000000..a0dc3cbe1 --- /dev/null +++ b/charts/hami/templates/device-plugin/daemonsetvalidator-configmap.yaml @@ -0,0 +1,51 @@ +{{- if .Values.devicePlugin.autoValidatorDriver }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: driver-validator-configmap + labels: + app.kubernetes.io/component: 4pd-driver-validator +data: + validator.sh: | + #!/bin/sh + if [ -f "/host/usr/bin/nvidia-smi" ]; then + driverRoot="/host" + isHostDriver="true" + else + driverRoot="/run/nvidia/driver" + isHostDriver="false" + fi + + # check whether the directory /run/nvidia/driver exists + if [ "$isHostDriver" = "false" ]; then + while [ ! -d "/run/nvidia/driver" ]; do + echo current node $NODE_NAME not GPU device; + sleep 10 + done + fi + + sleep_interval=5 + if [ "$isHostDriver" = "false" ]; then + while true; do + stat /run/nvidia/validations/.driver-ctr-ready + if [ $? -ne 0 ]; then + kubectl label node $NODE_NAME hami.io/nvidia-selector="off" --overwrite + sleep $sleep_interval + else + break + fi + done + fi + while true; do + chroot "$driverRoot" nvidia-smi + if [ $? 
-eq 0 ]; then + sleep 30 + kubectl label node $NODE_NAME hami.io/nvidia-selector="on" --overwrite + break + else + sleep $sleep_interval + fi + done + echo validations are successful; sleep infinity + + {{- end }} \ No newline at end of file diff --git a/charts/hami/templates/device-plugin/daemonsetvalidator.yaml b/charts/hami/templates/device-plugin/daemonsetvalidator.yaml new file mode 100644 index 000000000..693de896d --- /dev/null +++ b/charts/hami/templates/device-plugin/daemonsetvalidator.yaml @@ -0,0 +1,70 @@ +{{- if .Values.devicePlugin.autoValidatorDriver }} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: driver-validator + labels: + app.kubernetes.io/component: 4pd-driver-validator + {{- with .Values.global.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- if .Values.global.annotations }} + annotations: {{ toYaml .Values.global.annotations | nindent 4}} + {{- end }} +spec: + selector: + matchLabels: + app.kubernetes.io/component: 4pd-driver-validator + template: + metadata: + labels: + app.kubernetes.io/component: 4pd-driver-validator + 4pd.io/webhook: ignore + spec: + serviceAccountName: {{ include "4pd-vgpu.device-plugin" . 
}} + priorityClassName: system-node-critical + containers: + - name: driver-validator + image: '{{ .Values.devicePlugin.image }}:{{ .Values.version }}' + command: + - /bin/sh + - -c + - sh /scripts/validator.sh + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - mountPath: /host + mountPropagation: HostToContainer + name: host-root + readOnly: true + - mountPath: /run/nvidia/driver + mountPropagation: HostToContainer + name: driver-install-path + readOnly: true + - mountPath: /run/nvidia/validations + mountPropagation: Bidirectional + name: run-nvidia-validations + - name: validator-configmap + mountPath: /scripts + resources: + limits: + cpu: 10m + memory: 50Mi + volumes: + - hostPath: + path: /run/nvidia/driver + name: driver-install-path + - hostPath: + path: / + name: host-root + - name: validator-configmap + configMap: + name: driver-validator-configmap + - hostPath: + path: /run/nvidia/validations + type: DirectoryOrCreate + name: run-nvidia-validations +{{- end }} \ No newline at end of file diff --git a/charts/hami/values.yaml b/charts/hami/values.yaml index aae46d1ed..03cc190aa 100644 --- a/charts/hami/values.yaml +++ b/charts/hami/values.yaml @@ -112,10 +112,10 @@ devicePlugin: podAnnotations: {} nvidianodeSelector: - gpu: "on" + hami.io/nvidia-selector: "on" mlunodeSelector: mlu: "on" hygonnodeSelector: dcu: "on" tolerations: [] - + autoValidatorDriver: false diff --git a/docker/Dockerfile b/docker/Dockerfile index e6b4308bb..d842aa676 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,9 +16,12 @@ RUN wget https://cmake.org/files/v3.19/cmake-3.19.8-Linux-x86_64.tar.gz RUN tar -xf cmake-3.19.8-Linux-x86_64.tar.gz RUN cp /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake3 ENV PATH="/libvgpu/cmake-3.19.8-Linux-x86_64/bin:${PATH}" -RUN apt-get -y install openssl libssl-dev +RUN apt-get -y install openssl libssl-dev curl +ARG KUBERCTL_VERSION +RUN curl 
https://storage.googleapis.com/kubernetes-release/release/${KUBERCTL_VERSION}/bin/linux/amd64/kubectl --output /bin/kubectl && chmod u+x /bin/kubectl RUN bash ./build.sh + FROM nvidia/cuda:12.2.0-base-ubuntu22.04 ENV NVIDIA_DISABLE_REQUIRE="true" ENV NVIDIA_VISIBLE_DEVICES=all @@ -32,6 +35,7 @@ COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh COPY ./lib /k8s-vgpu/lib COPY --from=NVBUILD /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/ +COPY --from=NVBUILD /bin/kubectl /bin/kubectl COPY ./lib/mlu/cntopo /usr/bin/ COPY ./lib/mlu/libcndev.so /usr/lib/ diff --git a/go.sum b/go.sum index fd7cd579e..f0bab5e89 100644 --- a/go.sum +++ b/go.sum @@ -1211,6 +1211,7 @@ github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfn github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.3/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= diff --git a/hack/build.sh b/hack/build.sh index 6c0736e94..aa6f2ad19 100755 --- a/hack/build.sh +++ b/hack/build.sh @@ -25,6 +25,7 @@ export LATEST_VERSION="latest" export GOLANG_IMAGE="golang:1.21-bullseye" export NVIDIA_IMAGE="nvidia/cuda:12.2.0-devel-ubuntu20.04" export DEST_DIR="/usr/local" +export KUBERCTL_VERSION="v1.29.0" IMAGE=${IMAGE-"projecthami/hami"} @@ -34,7 +35,7 @@ function go_build() { } function docker_build() { - docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} -t "${IMAGE}:${VERSION}" -f 
docker/Dockerfile . + docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} --build-arg KUBERCTL_VERSION=${KUBERCTL_VERSION} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile . docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${SHORT_VERSION}" docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${LATEST_VERSION}" }