From 0d96d67ea15c481203860e32489c74a57773312d Mon Sep 17 00:00:00 2001
From: "rongfu.leng"
Date: Thu, 8 Aug 2024 18:10:33 +0800
Subject: [PATCH] add memory-gi-unit feature

Add a --memory-gi-unit scheduler flag. When it is set, the admission
webhook calls the new Devices.ResourceMemoryUnitConversion method before
MutateAdmission. The Ascend, Hygon and NVIDIA implementations multiply
their memory limit and request by 1024 and round down; the Cambricon and
Iluvatar implementations return the resources unchanged.

Signed-off-by: rongfu.leng
---
 cmd/scheduler/main.go            |  1 +
 pkg/device/ascend/ascend310p.go  | 14 +++++
 pkg/device/ascend/device.go      | 14 +++++
 pkg/device/cambricon/device.go   |  5 ++
 pkg/device/devices.go            |  3 ++
 pkg/device/hygon/device.go       | 15 ++++++
 pkg/device/iluvatar/device.go    |  5 ++
 pkg/device/nvidia/device.go      | 14 +++++
 pkg/device/nvidia/device_test.go | 89 ++++++++++++++++++++++++++++++++
 pkg/scheduler/config/config.go   |  3 +-
 pkg/scheduler/webhook.go         |  3 ++
 11 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/cmd/scheduler/main.go b/cmd/scheduler/main.go
index 9dad76c8e..41ec4ab74 100644
--- a/cmd/scheduler/main.go
+++ b/cmd/scheduler/main.go
@@ -62,6 +62,7 @@ func init() {
 	rootCmd.Flags().StringVar(&config.GPUSchedulerPolicy, "gpu-scheduler-policy", policy.GPUSchedulerPolicySpread.String(), "GPU scheduler policy")
 	rootCmd.Flags().StringVar(&config.MetricsBindAddress, "metrics-bind-address", ":9395", "The TCP address that the scheduler should bind to for serving prometheus metrics(e.g. 127.0.0.1:9395, :9395)")
 	rootCmd.Flags().StringToStringVar(&config.NodeLabelSelector, "node-label-selector", nil, "key=value pairs separated by commas")
+	rootCmd.Flags().BoolVar(&config.MemoryUserGIUnit, "memory-gi-unit", false, "use Gi as the unit of device memory resources")
 	rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet())
 	rootCmd.AddCommand(version.VersionCmd)
 	rootCmd.Flags().AddGoFlagSet(util.InitKlogFlags())
diff --git a/pkg/device/ascend/ascend310p.go b/pkg/device/ascend/ascend310p.go
index 2c4a0253f..7093eee3f 100644
--- a/pkg/device/ascend/ascend310p.go
+++ b/pkg/device/ascend/ascend310p.go
@@ -20,6 +20,7 @@ import (
 	"errors"
 	"flag"
 	"fmt"
+	"math"
 	"strconv"
 	"strings"
 	"time"
@@ -246,3 +247,16 @@ func (dev *Ascend310P) GenerateResourceRequests(ctr *corev1.Container) util.Cont
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+// ResourceMemoryUnitConversion multiplies the Ascend310PResourceMemory entry of
+// Limits and Requests, when present, by 1024 and rounds the result down to an
+// integer. The entries are overwritten in the maps carried by resources.
+func (dev *Ascend310P) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	if v, ok := resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)]; ok {
+		resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)] = *resource.NewQuantity(int64(math.Floor(v.AsApproximateFloat64()*1024)), resource.BinarySI)
+	}
+	if v, ok := resources.Requests[corev1.ResourceName(Ascend310PResourceMemory)]; ok {
+		resources.Requests[corev1.ResourceName(Ascend310PResourceMemory)] = *resource.NewQuantity(int64(math.Floor(v.AsApproximateFloat64()*1024)), resource.BinarySI)
+	}
+	return resources
+}
diff --git a/pkg/device/ascend/device.go b/pkg/device/ascend/device.go
index 9671b2a5c..b77f4bee1 100644
--- a/pkg/device/ascend/device.go
+++ b/pkg/device/ascend/device.go
@@ -20,6 +20,7 @@ import (
 	"errors"
 	"flag"
 	"fmt"
+	"math"
 	"strconv"
 	"strings"
 	"time"
@@ -231,3 +232,16 @@ func (dev *AscendDevices) GenerateResourceRequests(ctr *corev1.Container) util.C
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+// ResourceMemoryUnitConversion multiplies the AscendResourceMemory entry of
+// Limits and Requests, when present, by 1024 and rounds the result down to an
+// integer. The entries are overwritten in the maps carried by resources.
+func (dev *AscendDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	if v, ok := resources.Limits[corev1.ResourceName(AscendResourceMemory)]; ok {
+		resources.Limits[corev1.ResourceName(AscendResourceMemory)] = *resource.NewQuantity(int64(math.Floor(v.AsApproximateFloat64()*1024)), resource.BinarySI)
+	}
+	if v, ok := resources.Requests[corev1.ResourceName(AscendResourceMemory)]; ok {
+		resources.Requests[corev1.ResourceName(AscendResourceMemory)] = *resource.NewQuantity(int64(math.Floor(v.AsApproximateFloat64()*1024)), resource.BinarySI)
+	}
+	return resources
+}
diff --git a/pkg/device/cambricon/device.go b/pkg/device/cambricon/device.go
index 14e9e157f..0f0760f7d 100644
--- a/pkg/device/cambricon/device.go
+++ b/pkg/device/cambricon/device.go
@@ -299,3 +299,8 @@ func (dev *CambriconDevices) PatchAnnotations(annoinput *map[string]string, pd u
 	}
 	return *annoinput
 }
+
+// ResourceMemoryUnitConversion returns resources unchanged.
+func (dev *CambriconDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	return resources
+}
diff --git a/pkg/device/devices.go b/pkg/device/devices.go
index 7d2359ee8..e452162c3 100644
--- a/pkg/device/devices.go
+++ b/pkg/device/devices.go
@@ -39,6 +39,9 @@ import (
 
 type Devices interface {
 	MutateAdmission(ctr *corev1.Container) (bool, error)
+	// ResourceMemoryUnitConversion returns resources with the device's memory
+	// limit and request rescaled; implementations may return it unchanged.
+	ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements
 	CheckHealth(devType string, n *corev1.Node) (bool, bool)
 	NodeCleanUp(nn string) error
 	GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error)
diff --git a/pkg/device/hygon/device.go b/pkg/device/hygon/device.go
index 9b85e4948..7be95793a 100644
--- a/pkg/device/hygon/device.go
+++ b/pkg/device/hygon/device.go
@@ -19,12 +19,14 @@ package hygon
 import (
 	"errors"
 	"flag"
+	"math"
 	"strings"
 
 	"github.com/Project-HAMi/HAMi/pkg/api"
 	"github.com/Project-HAMi/HAMi/pkg/util"
 
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/klog/v2"
 )
 
@@ -233,3 +235,16 @@ func (dev *DCUDevices) PatchAnnotations(annoinput *map[string]string, pd util.Po
 	}
 	return *annoinput
 }
+
+// ResourceMemoryUnitConversion multiplies the HygonResourceMemory entry of
+// Limits and Requests, when present, by 1024 and rounds the result down to an
+// integer. The entries are overwritten in the maps carried by resources.
+func (dev *DCUDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	if v, ok := resources.Limits[corev1.ResourceName(HygonResourceMemory)]; ok {
+		resources.Limits[corev1.ResourceName(HygonResourceMemory)] = *resource.NewQuantity(int64(math.Floor(v.AsApproximateFloat64()*1024)), resource.BinarySI)
+	}
+	if v, ok := resources.Requests[corev1.ResourceName(HygonResourceMemory)]; ok {
+		resources.Requests[corev1.ResourceName(HygonResourceMemory)] = *resource.NewQuantity(int64(math.Floor(v.AsApproximateFloat64()*1024)), resource.BinarySI)
+	}
+	return resources
+}
diff --git a/pkg/device/iluvatar/device.go b/pkg/device/iluvatar/device.go
index 194ccfda9..5d361772e 100644
--- a/pkg/device/iluvatar/device.go
+++ b/pkg/device/iluvatar/device.go
@@ -213,3 +213,8 @@ func (dev *IluvatarDevices) GenerateResourceRequests(ctr *corev1.Container) util
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+// ResourceMemoryUnitConversion returns resources unchanged.
+func (dev *IluvatarDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	return resources
+}
diff --git a/pkg/device/nvidia/device.go b/pkg/device/nvidia/device.go
index 86db3dd1f..e9b5bf35d 100644
--- a/pkg/device/nvidia/device.go
+++ b/pkg/device/nvidia/device.go
@@ -20,6 +20,7 @@ import (
 	"errors"
 	"flag"
 	"fmt"
+	"math"
 	"strconv"
 	"strings"
 
@@ -319,3 +320,16 @@ func (dev *NvidiaGPUDevices) GenerateResourceRequests(ctr *corev1.Container) uti
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+// ResourceMemoryUnitConversion multiplies the ResourceMem entry of Limits and
+// Requests, when present, by 1024 and rounds the result down to an integer.
+// The entries are overwritten in the maps carried by resources.
+func (dev *NvidiaGPUDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	if v, ok := resources.Limits[corev1.ResourceName(ResourceMem)]; ok {
+		resources.Limits[corev1.ResourceName(ResourceMem)] = *resource.NewQuantity(int64(math.Floor(v.AsApproximateFloat64()*1024)), resource.BinarySI)
+	}
+	if v, ok := resources.Requests[corev1.ResourceName(ResourceMem)]; ok {
+		resources.Requests[corev1.ResourceName(ResourceMem)] = *resource.NewQuantity(int64(math.Floor(v.AsApproximateFloat64()*1024)), resource.BinarySI)
+	}
+	return resources
+}
diff --git a/pkg/device/nvidia/device_test.go b/pkg/device/nvidia/device_test.go
index b48fe2668..e26eb1417 100644
--- a/pkg/device/nvidia/device_test.go
+++ b/pkg/device/nvidia/device_test.go
@@ -208,6 +208,7 @@ func Test_CheckUUID(t *testing.T) {
 }
 
 func Test_CheckType(t *testing.T) {
+
 	gpuDevices := &NvidiaGPUDevices{}
 	tests := []struct {
 		name string
@@ -275,3 +276,91 @@ func Test_CheckType(t *testing.T) {
 		})
 	}
 }
+
+// Test_ResourceMemoryUnitConversion checks that nvidia.com/gpumem limits and
+// requests are multiplied by 1024 and floored, and other entries are kept.
+func Test_ResourceMemoryUnitConversion(t *testing.T) {
+	tests := []struct {
+		name string
+		args corev1.ResourceRequirements
+		want corev1.ResourceRequirements
+	}{
+		{
+			name: "not set memory field",
+			args: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"cpu":            resource.MustParse("1"),
+					"nvidia.com/gpu": resource.MustParse("2"),
+				},
+			},
+			want: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"cpu":            resource.MustParse("1"),
+					"nvidia.com/gpu": resource.MustParse("2"),
+				},
+			},
+		},
+		{
+			name: "memory limits set 1Gi",
+			args: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1"),
+				},
+			},
+			want: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1024"),
+				},
+			},
+		},
+		{
+			name: "memory limits set 0.001Gi",
+			args: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("0.001"),
+				},
+			},
+			want: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1"),
+				},
+			},
+		},
+
+		{
+			name: "memory limits set 1Gi and requests set 0.001Gi",
+			args: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1"),
+				},
+				Requests: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("0.001"),
+				},
+			},
+			want: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1024"),
+				},
+				Requests: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1"),
+				},
+			},
+		},
+	}
+	gpuDevices := &NvidiaGPUDevices{}
+	ResourceMem = "nvidia.com/gpumem"
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			got := gpuDevices.ResourceMemoryUnitConversion(test.args)
+			assert.DeepEqual(t, got, test.want)
+		})
+	}
+}
diff --git a/pkg/scheduler/config/config.go b/pkg/scheduler/config/config.go
index 4ac95f4b2..3dc99dd16 100644
--- a/pkg/scheduler/config/config.go
+++ b/pkg/scheduler/config/config.go
@@ -30,7 +30,8 @@ var (
 	NodeSchedulerPolicy = policy.NodeSchedulerPolicyBinpack.String()
 	// GPUSchedulerPolicy is config this scheduler GPU to use `binpack` or `spread`. default value is spread.
 	GPUSchedulerPolicy = policy.GPUSchedulerPolicySpread.String()
-
 	// NodeLabelSelector is scheduler filter node by node label.
 	NodeLabelSelector map[string]string
+	// MemoryUserGIUnit reports whether device memory resources are given in Gi.
+	MemoryUserGIUnit bool
 )
diff --git a/pkg/scheduler/webhook.go b/pkg/scheduler/webhook.go
index dd72727b9..4c4802c6c 100644
--- a/pkg/scheduler/webhook.go
+++ b/pkg/scheduler/webhook.go
@@ -71,6 +71,9 @@ func (h *webhook) Handle(_ context.Context, req admission.Request) admission.Res
 			}
 		}
 		for _, val := range device.GetDevices() {
+			if config.MemoryUserGIUnit {
+				c.Resources = val.ResourceMemoryUnitConversion(c.Resources)
+			}
 			found, err := val.MutateAdmission(c)
 			if err != nil {
 				klog.Errorf("validating pod failed:%s", err.Error())