add validator to check whether the NVIDIA driver installed successfully
Signed-off-by: lengrongfu <lenronfu@gmail.com>
lengrongfu committed Feb 21, 2024
1 parent 59f1afc commit ca232fa
Showing 9 changed files with 143 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v4

- name: Checkout submodule
uses: Mushus/checkout-submodule@v1.0.1
8 changes: 6 additions & 2 deletions README.md
@@ -157,12 +157,16 @@ sudo systemctl daemon-reload && systemctl restart containerd

<details> <summary> Label your nodes </summary>

Label your GPU nodes for scheduling with HAMi by adding the label "gpu=on". Without this label, the nodes cannot be managed by our scheduler.
Finally, label the nodes that need to use the GPU. There are currently two ways to do this: label the node manually, or let it be labeled automatically, controlled by the `autoValidatorDriver` variable.

- Manually apply the `label`: set `autoValidatorDriver` to `false` and label the node with the following command.
```
kubectl label nodes {nodeid} gpu=on
$ kubectl label nodes {nodeid} hami.io/nvidia-selector=on
```

- Automatically apply the `label`: setting `autoValidatorDriver` to `true` starts a `daemonset` that detects whether the `NVIDIA` driver on the node is ready. If it is ready, the node gets the `hami.io/nvidia-selector=on` label; otherwise it gets `hami.io/nvidia-selector=off`.
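With either approach, the resulting label can be checked directly; a quick sketch (the node name is a placeholder):

```shell
# List the nodes the HAMi scheduler will consider
kubectl get nodes -l hami.io/nvidia-selector=on

# Show the label value on one node: prints "on", "off", or nothing if unlabeled
kubectl get node {nodeid} -o jsonpath='{.metadata.labels.hami\.io/nvidia-selector}'
```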


</details>

### Install and Uninstall
7 changes: 5 additions & 2 deletions README_cn.md
@@ -138,12 +138,15 @@ systemctl daemon-reload && systemctl restart containerd

<details> <summary> Label your GPU nodes </summary>

Finally, you need to label every GPU node you want to use with gpu=on; otherwise the node will not be scheduled.
Finally, apply a `label` to the nodes that need to use the `GPU`. There are currently two ways to label a node: manually, or automatically, controlled by the `autoValidatorDriver` variable.

- Manually apply the `label`: set `autoValidatorDriver` to `false` and label the node with the following command.
```
$ kubectl label nodes {nodeid} gpu=on
$ kubectl label nodes {nodeid} hami.io/nvidia-selector=on
```

- Automatically apply the `label`: setting `autoValidatorDriver` to `true` starts a `daemonset` that detects whether the `NVIDIA` driver on the node is ready. If it is ready, the node is given the `hami.io/nvidia-selector=on` label; otherwise `hami.io/nvidia-selector=off`.

</details>

### 安装,更新与卸载
@@ -0,0 +1,51 @@
{{- if .Values.devicePlugin.autoValidatorDriver }}
apiVersion: v1
kind: ConfigMap
metadata:
name: driver-validator-configmap
labels:
app.kubernetes.io/component: 4pd-driver-validator
data:
validator.sh: |
#!/bin/sh
if [ -f "/host/usr/bin/nvidia-smi" ]; then
driverRoot="/host"
isHostDriver="true"
else
driverRoot="/run/nvidia/driver"
isHostDriver="false"
fi
# check whether the /run/nvidia/driver directory exists
if [ "$isHostDriver" = "false" ]; then
while [ ! -d "/run/nvidia/driver" ]; do
echo "current node $NODE_NAME has no GPU device"
sleep 10
done
fi
sleep_interval=5
if [ "$isHostDriver" = "false" ]; then
while true; do
stat /run/nvidia/validations/.driver-ctr-ready
if [ $? -ne 0 ]; then
kubectl label node $NODE_NAME hami.io/nvidia-selector="off" --overwrite
sleep $sleep_interval
else
break
fi
done
fi
while true; do
chroot "$driverRoot" nvidia-smi
if [ $? -eq 0 ]; then
sleep 30
kubectl label node $NODE_NAME hami.io/nvidia-selector="on" --overwrite
break
else
sleep $sleep_interval
fi
done
echo "validations are successful"; sleep infinity
{{- end }}
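The branch at the top of this script, host-installed driver versus containerized driver, can be exercised on its own; a minimal POSIX-sh sketch of that detection logic (paths as in the ConfigMap above):

```shell
#!/bin/sh
# Mirror the validator's driver-location branch: a driver installed on the
# host exposes nvidia-smi under the host root mount, while a containerized
# driver appears under /run/nvidia/driver.
detect_driver_root() {
    if [ -f "/host/usr/bin/nvidia-smi" ]; then
        echo "/host"
    elif [ -d "/run/nvidia/driver" ]; then
        echo "/run/nvidia/driver"
    else
        echo ""  # no driver visible yet; the real validator keeps polling
    fi
}

root="$(detect_driver_root)"
if [ -n "$root" ]; then
    echo "driver root: $root"
else
    echo "no NVIDIA driver detected"
fi
```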
70 changes: 70 additions & 0 deletions charts/hami/templates/device-plugin/daemonsetvalidator.yaml
@@ -0,0 +1,70 @@
{{- if .Values.devicePlugin.autoValidatorDriver }}
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: driver-validator
labels:
app.kubernetes.io/component: 4pd-driver-validator
{{- with .Values.global.labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- if .Values.global.annotations }}
annotations: {{ toYaml .Values.global.annotations | nindent 4}}
{{- end }}
spec:
selector:
matchLabels:
app.kubernetes.io/component: 4pd-driver-validator
template:
metadata:
labels:
app.kubernetes.io/component: 4pd-driver-validator
4pd.io/webhook: ignore
spec:
serviceAccountName: {{ include "4pd-vgpu.device-plugin" . }}
priorityClassName: system-node-critical
containers:
- name: driver-validator
image: '{{ .Values.devicePlugin.image }}:{{ .Values.version }}'
command:
- /bin/sh
- -c
- /scripts/validator.sh
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- mountPath: /host
mountPropagation: HostToContainer
name: host-root
readOnly: true
- mountPath: /run/nvidia/driver
mountPropagation: HostToContainer
name: driver-install-path
readOnly: true
- mountPath: /run/nvidia/validations
mountPropagation: Bidirectional
name: run-nvidia-validations
- name: validator-configmap
mountPath: /scripts
resources:
limits:
cpu: 10m
memory: 50Mi
volumes:
- hostPath:
path: /run/nvidia/driver
name: driver-install-path
- hostPath:
path: /
name: host-root
- name: validator-configmap
configMap:
name: driver-validator-configmap
- hostPath:
path: /run/nvidia/validations
type: DirectoryOrCreate
name: run-nvidia-validations
{{- end }}
4 changes: 2 additions & 2 deletions charts/hami/values.yaml
@@ -112,10 +112,10 @@ devicePlugin:

podAnnotations: {}
nvidianodeSelector:
gpu: "on"
hami.io/nvidia-selector: "on"
mlunodeSelector:
mlu: "on"
hygonnodeSelector:
dcu: "on"
tolerations: []

autoValidatorDriver: false
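This default keeps the validator off. As a usage sketch (the release name and chart path here are illustrative, not from this commit), the daemonset can be enabled by overriding the value at install or upgrade time:

```shell
# Enable the driver validator daemonset at install time
helm install hami ./charts/hami --set devicePlugin.autoValidatorDriver=true

# Or flip it on for an existing release, keeping other overrides
helm upgrade hami ./charts/hami --reuse-values --set devicePlugin.autoValidatorDriver=true
```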
6 changes: 5 additions & 1 deletion docker/Dockerfile
@@ -16,9 +16,12 @@ RUN wget https://cmake.org/files/v3.19/cmake-3.19.8-Linux-x86_64.tar.gz
RUN tar -xf cmake-3.19.8-Linux-x86_64.tar.gz
RUN cp /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake /libvgpu/cmake-3.19.8-Linux-x86_64/bin/cmake3
ENV PATH="/libvgpu/cmake-3.19.8-Linux-x86_64/bin:${PATH}"
RUN apt-get -y install openssl libssl-dev
RUN apt-get -y install openssl libssl-dev curl
ARG KUBERCTL_VERSION
RUN curl https://storage.googleapis.com/kubernetes-release/release/${KUBERCTL_VERSION}/bin/linux/amd64/kubectl --output /bin/kubectl && chmod u+x /bin/kubectl
RUN bash ./build.sh


FROM nvidia/cuda:12.2.0-base-ubuntu22.04
ENV NVIDIA_DISABLE_REQUIRE="true"
ENV NVIDIA_VISIBLE_DEVICES=all
@@ -32,6 +35,7 @@ COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin
COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh
COPY ./lib /k8s-vgpu/lib
COPY --from=NVBUILD /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/
COPY --from=NVBUILD /bin/kubectl /bin/kubectl
COPY ./lib/mlu/cntopo /usr/bin/
COPY ./lib/mlu/libcndev.so /usr/lib/

1 change: 1 addition & 0 deletions go.sum
@@ -1211,6 +1211,7 @@ github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfn
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/pty v1.1.3/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA=
3 changes: 2 additions & 1 deletion hack/build.sh
@@ -25,6 +25,7 @@ export LATEST_VERSION="latest"
export GOLANG_IMAGE="golang:1.21-bullseye"
export NVIDIA_IMAGE="nvidia/cuda:12.2.0-devel-ubuntu20.04"
export DEST_DIR="/usr/local"
export KUBERCTL_VERSION="v1.29.0"

IMAGE=${IMAGE-"projecthami/hami"}

Expand All @@ -34,7 +35,7 @@ function go_build() {
}

function docker_build() {
docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile .
docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} --build-arg KUBERCTL_VERSION=${KUBERCTL_VERSION} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile .
docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${SHORT_VERSION}"
docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${LATEST_VERSION}"
}