Merge pull request #1070 from NVIDIA/gfd-imex-init-container
Add init container to GFD for handling imex nodes config mount
cdesiniotis authored Oct 28, 2024
2 parents 09efdfe + 22941b5 commit a25a1d5
Showing 4 changed files with 77 additions and 65 deletions.
22 changes: 0 additions & 22 deletions assets/gpu-feature-discovery/0500_configmap.yaml

This file was deleted.

@@ -33,9 +33,51 @@ spec:
        securityContext:
          privileged: true
        volumeMounts:
        - name: run-nvidia
          mountPath: /run/nvidia
          mountPropagation: Bidirectional
        - name: run-nvidia-validations
          mountPath: /run/nvidia/validations
          mountPropagation: HostToContainer
      - name: gpu-feature-discovery-imex-init
        image: "FILLED BY THE OPERATOR"
        command: ["/bin/bash", "-c"]
        args:
        - |
          until [[ -f /run/nvidia/validations/driver-ready ]]
          do
            echo "waiting for the driver validations to be ready..."
            sleep 5
          done
          set -o allexport
          cat /run/nvidia/validations/driver-ready
          . /run/nvidia/validations/driver-ready
          IMEX_NODES_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg
          if [[ -f /config/${IMEX_NODES_CONFIG_FILE} ]]; then
            echo "Removing cached IMEX nodes config"
            rm -f /config/${IMEX_NODES_CONFIG_FILE}
          fi
          if [[ ! -f ${DRIVER_ROOT_CTR_PATH}/${IMEX_NODES_CONFIG_FILE} ]]; then
            echo "No IMEX nodes config path detected; Skipping"
            exit 0
          fi
          echo "Copying IMEX nodes config"
          mkdir -p $(dirname /config/${IMEX_NODES_CONFIG_FILE})
          cp ${DRIVER_ROOT_CTR_PATH}/${IMEX_NODES_CONFIG_FILE} /config/${IMEX_NODES_CONFIG_FILE}
        securityContext:
          privileged: true
        volumeMounts:
        - name: config
          mountPath: /config
        - name: run-nvidia-validations
          mountPath: /run/nvidia/validations
          mountPropagation: HostToContainer
        - name: host-root
          mountPath: /host/etc
          subPath: etc
          readOnly: true
        - name: driver-install-dir
          mountPath: /driver-root/etc
          subPath: etc
          readOnly: true
      - name: config-manager-init
        image: "FILLED BY THE OPERATOR"
        command: ["config-manager"]
@@ -62,12 +104,13 @@ spec:
value: ""
- name: PROCESS_TO_SIGNAL
value: ""
volumeMounts:
- name: config
mountPath: /config
containers:
- image: "FILLED BY THE OPERATOR"
name: gpu-feature-discovery
command: [ "/bin/bash", "-c" ]
args:
- /bin/entrypoint.sh
command: ["gpu-feature-discovery"]
env:
- name: GFD_SLEEP_INTERVAL
value: 60s
@@ -82,24 +125,13 @@ spec:
            fieldRef:
              fieldPath: spec.nodeName
        volumeMounts:
        - name: gpu-feature-discovery-entrypoint
          readOnly: true
          mountPath: /bin/entrypoint.sh
          subPath: entrypoint.sh
        - name: output-dir
          mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
        - name: dmi-info-dir
          mountPath: "/sys/class/dmi/id"
          readOnly: true
        - name: run-nvidia-validations
          mountPath: /run/nvidia/validations
        - name: driver-install-dir
          mountPath: /driver-root
          mountPropagation: HostToContainer
        - name: host-root
          mountPath: /host
          readOnly: true
          mountPropagation: HostToContainer
        - name: config
          mountPath: /config
        securityContext:
          privileged: true
      - image: "FILLED BY THE OPERATOR"
@@ -130,24 +162,19 @@ spec:
value: "1" # SIGHUP
- name: PROCESS_TO_SIGNAL
value: "gpu-feature-discovery"
volumeMounts:
- name: config
mountPath: /config
volumes:
- name: gpu-feature-discovery-entrypoint
configMap:
name: gpu-feature-discovery-entrypoint
defaultMode: 448
- name: output-dir
hostPath:
path: "/etc/kubernetes/node-feature-discovery/features.d"
- name: dmi-info-dir
hostPath:
path: "/sys/class/dmi/id"
- name: run-nvidia
hostPath:
path: "/run/nvidia"
type: Directory
- name: run-nvidia-validations
hostPath:
path: /run/nvidia/validations
path: "/run/nvidia/validations"
type: DirectoryOrCreate
- name: host-root
hostPath:
@@ -156,3 +183,5 @@ spec:
        hostPath:
          path: /run/nvidia/driver
          type: DirectoryOrCreate
      - name: config
        emptyDir: {}
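
Aside (not part of the commit): the new gpu-feature-discovery-imex-init container above is essentially a wait-then-copy step — block until the driver validation sentinel appears, source it to learn the driver root, then refresh the IMEX nodes config in the shared /config emptyDir. A minimal standalone sketch of that flow, assuming /run/nvidia/validations/driver-ready is a KEY=value file that defines DRIVER_ROOT_CTR_PATH (the committed script sources it and then relies on that variable):

    #!/usr/bin/env bash
    # Illustrative sketch of the imex-init flow; not the committed script.
    SENTINEL=/run/nvidia/validations/driver-ready        # written once driver validation passes
    IMEX_NODES_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg
    SHARED_CONFIG_DIR=/config                            # emptyDir shared with the GFD container

    # Block until the driver validation sentinel exists.
    until [[ -f "${SENTINEL}" ]]; do
      echo "waiting for the driver validations to be ready..."
      sleep 5
    done

    # Export every KEY=value pair in the sentinel (assumed to include DRIVER_ROOT_CTR_PATH).
    set -o allexport
    . "${SENTINEL}"
    set +o allexport
    : "${DRIVER_ROOT_CTR_PATH:?driver-ready did not define DRIVER_ROOT_CTR_PATH}"

    # Drop any stale cached copy, then copy the nodes config from the driver root, if present.
    rm -f "${SHARED_CONFIG_DIR}${IMEX_NODES_CONFIG_FILE}"
    if [[ -f "${DRIVER_ROOT_CTR_PATH}${IMEX_NODES_CONFIG_FILE}" ]]; then
      mkdir -p "$(dirname "${SHARED_CONFIG_DIR}${IMEX_NODES_CONFIG_FILE}")"
      cp "${DRIVER_ROOT_CTR_PATH}${IMEX_NODES_CONFIG_FILE}" "${SHARED_CONFIG_DIR}${IMEX_NODES_CONFIG_FILE}"
    else
      echo "No IMEX nodes config path detected; Skipping"
    fi

The /config directory is the config emptyDir declared at the end of this manifest; it is also mounted into the main gpu-feature-discovery container and the config-manager sidecar, so the copied nodes_config.cfg is visible to both.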
10 changes: 10 additions & 0 deletions assets/state-device-plugin/0500_daemonset.yaml
@@ -61,6 +61,9 @@ spec:
value: ""
- name: PROCESS_TO_SIGNAL
value: ""
volumeMounts:
- name: config
mountPath: /config
containers:
- image: "FILLED BY THE OPERATOR"
name: nvidia-device-plugin
@@ -107,6 +110,8 @@ spec:
          mountPath: /dev/shm
        - name: mps-root
          mountPath: /mps
        - name: config
          mountPath: /config
      - image: "FILLED BY THE OPERATOR"
        name: config-manager
        command: ["config-manager"]
@@ -135,6 +140,9 @@ spec:
value: "1" # SIGHUP
- name: PROCESS_TO_SIGNAL
value: "nvidia-device-plugin"
volumeMounts:
- name: config
mountPath: /config
volumes:
- name: nvidia-device-plugin-entrypoint
configMap:
@@ -165,3 +173,5 @@ spec:
      - name: mps-shm
        hostPath:
          path: /run/nvidia/mps/shm
      - name: config
        emptyDir: {}
25 changes: 10 additions & 15 deletions controllers/object_controls.go
@@ -879,6 +879,14 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
	}
	obj.Spec.Template.Spec.Containers[0].Image = img

	// update image for IMEX init container
	for i, initCtr := range obj.Spec.Template.Spec.InitContainers {
		if initCtr.Name == "gpu-feature-discovery-imex-init" {
			obj.Spec.Template.Spec.InitContainers[i].Image = img
			break
		}
	}

	// update image pull policy
	obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.GPUFeatureDiscovery.ImagePullPolicy)

@@ -2432,10 +2440,7 @@ func isCustomPluginConfigSet(pluginConfig *gpuv1.DevicePluginConfig) bool {

// adds shared volume mounts required for custom plugin config provided via a ConfigMap
func addSharedMountsForPluginConfig(container *corev1.Container, config *gpuv1.DevicePluginConfig) {
	emptyDirMount := corev1.VolumeMount{Name: "config", MountPath: "/config"}
	configVolMount := corev1.VolumeMount{Name: config.Name, MountPath: "/available-configs"}

	container.VolumeMounts = append(container.VolumeMounts, emptyDirMount)
	container.VolumeMounts = append(container.VolumeMounts, configVolMount)
}

@@ -2471,15 +2476,14 @@ func handleDevicePluginConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
			continue
		}
		setContainerEnv(&obj.Spec.Template.Spec.Containers[i], "CONFIG_FILE", "/config/config.yaml")
		// setup sharedvolume(emptydir) for main container
		// add configmap volume mount
		addSharedMountsForPluginConfig(&obj.Spec.Template.Spec.Containers[i], config.DevicePlugin.Config)
	}
	// Enable process ns sharing for PID access
	shareProcessNamespace := true
	obj.Spec.Template.Spec.ShareProcessNamespace = &shareProcessNamespace
	// setup volumes from configmap and shared emptyDir
	// add configmap volume
	obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, createConfigMapVolume(config.DevicePlugin.Config.Name, nil))
	obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, createEmptyDirVolume("config"))

	// apply env/volume changes to initContainer
	err := transformConfigManagerInitContainer(obj, config)
@@ -3107,15 +3111,6 @@ func createConfigMapVolume(configMapName string, itemsToInclude []corev1.KeyToPa
	return corev1.Volume{Name: configMapName, VolumeSource: volumeSource}
}

func createEmptyDirVolume(volumeName string) corev1.Volume {
	return corev1.Volume{
		Name: volumeName,
		VolumeSource: corev1.VolumeSource{
			EmptyDir: &corev1.EmptyDirVolumeSource{},
		},
	}
}

func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	driverIndex := 0
	driverCtrFound := false