Skip to content

Commit

Permalink
add memory-gi-unit feature
Browse files Browse the repository at this point in the history
Signed-off-by: rongfu.leng <lenronfu@gmail.com>
  • Loading branch information
lengrongfu committed Aug 9, 2024
1 parent 901b43a commit 0d96d67
Show file tree
Hide file tree
Showing 11 changed files with 147 additions and 1 deletion.
1 change: 1 addition & 0 deletions cmd/scheduler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func init() {
rootCmd.Flags().StringVar(&config.GPUSchedulerPolicy, "gpu-scheduler-policy", policy.GPUSchedulerPolicySpread.String(), "GPU scheduler policy")
rootCmd.Flags().StringVar(&config.MetricsBindAddress, "metrics-bind-address", ":9395", "The TCP address that the scheduler should bind to for serving prometheus metrics(e.g. 127.0.0.1:9395, :9395)")
rootCmd.Flags().StringToStringVar(&config.NodeLabelSelector, "node-label-selector", nil, "key=value pairs separated by commas")
rootCmd.Flags().BoolVar(&config.MemoryUserGIUnit, "memory-gb-unit", false, "set memory resource unit is Gi")
rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet())
rootCmd.AddCommand(version.VersionCmd)
rootCmd.Flags().AddGoFlagSet(util.InitKlogFlags())
Expand Down
11 changes: 11 additions & 0 deletions pkg/device/ascend/ascend310p.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"errors"
"flag"
"fmt"
"math"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -246,3 +247,13 @@ func (dev *Ascend310P) GenerateResourceRequests(ctr *corev1.Container) util.Cont
}
return util.ContainerDeviceRequest{}
}

// ResourceMemoryUnitConversion multiplies the Ascend310P memory resource found
// in resources by 1024 and rounds the product down to a whole number.
//
// Limits and Requests are processed independently, in that order; a list that
// does not set the resource is left untouched. The ResourceList maps are
// updated in place and the same requirements value is returned.
//
// NOTE(review): presumably this turns a Gi-denominated value into Mi — confirm
// against the caller.
func (dev *Ascend310P) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
	memName := corev1.ResourceName(Ascend310PResourceMemory)
	for _, list := range []corev1.ResourceList{resources.Limits, resources.Requests} {
		quantity, found := list[memName]
		if !found {
			continue
		}
		scaled := int64(math.Floor(quantity.AsApproximateFloat64() * 1024))
		list[memName] = *resource.NewQuantity(scaled, resource.BinarySI)
	}
	return resources
}
11 changes: 11 additions & 0 deletions pkg/device/ascend/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"errors"
"flag"
"fmt"
"math"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -231,3 +232,13 @@ func (dev *AscendDevices) GenerateResourceRequests(ctr *corev1.Container) util.C
}
return util.ContainerDeviceRequest{}
}

// ResourceMemoryUnitConversion multiplies the Ascend memory resource found in
// resources by 1024 and rounds the product down to a whole number.
//
// Limits and Requests are processed independently, in that order; a list that
// does not set the resource is left untouched. The ResourceList maps are
// updated in place and the same requirements value is returned.
//
// NOTE(review): presumably this turns a Gi-denominated value into Mi — confirm
// against the caller.
func (dev *AscendDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
	memName := corev1.ResourceName(AscendResourceMemory)
	for _, list := range []corev1.ResourceList{resources.Limits, resources.Requests} {
		quantity, found := list[memName]
		if !found {
			continue
		}
		scaled := int64(math.Floor(quantity.AsApproximateFloat64() * 1024))
		list[memName] = *resource.NewQuantity(scaled, resource.BinarySI)
	}
	return resources
}
4 changes: 4 additions & 0 deletions pkg/device/cambricon/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,3 +299,7 @@ func (dev *CambriconDevices) PatchAnnotations(annoinput *map[string]string, pd u
}
return *annoinput
}

// ResourceMemoryUnitConversion returns resources unchanged; this device type
// performs no memory unit conversion.
func (dev *CambriconDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
	return resources
}
1 change: 1 addition & 0 deletions pkg/device/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (

type Devices interface {
MutateAdmission(ctr *corev1.Container) (bool, error)
ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements
CheckHealth(devType string, n *corev1.Node) (bool, bool)
NodeCleanUp(nn string) error
GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error)
Expand Down
12 changes: 12 additions & 0 deletions pkg/device/hygon/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ package hygon
import (
"errors"
"flag"
math "math"
"strings"

"github.com/Project-HAMi/HAMi/pkg/api"
"github.com/Project-HAMi/HAMi/pkg/util"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/klog/v2"
)

Expand Down Expand Up @@ -233,3 +235,13 @@ func (dev *DCUDevices) PatchAnnotations(annoinput *map[string]string, pd util.Po
}
return *annoinput
}

// ResourceMemoryUnitConversion multiplies the Hygon DCU memory resource found
// in resources by 1024 and rounds the product down to a whole number.
//
// Limits and Requests are processed independently, in that order; a list that
// does not set the resource is left untouched. The ResourceList maps are
// updated in place and the same requirements value is returned.
//
// NOTE(review): presumably this turns a Gi-denominated value into Mi — confirm
// against the caller.
func (dev *DCUDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
	memName := corev1.ResourceName(HygonResourceMemory)
	for _, list := range []corev1.ResourceList{resources.Limits, resources.Requests} {
		quantity, found := list[memName]
		if !found {
			continue
		}
		scaled := int64(math.Floor(quantity.AsApproximateFloat64() * 1024))
		list[memName] = *resource.NewQuantity(scaled, resource.BinarySI)
	}
	return resources
}
4 changes: 4 additions & 0 deletions pkg/device/iluvatar/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,7 @@ func (dev *IluvatarDevices) GenerateResourceRequests(ctr *corev1.Container) util
}
return util.ContainerDeviceRequest{}
}

// ResourceMemoryUnitConversion returns resources unchanged; this device type
// performs no memory unit conversion.
func (dev *IluvatarDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
	return resources
}
11 changes: 11 additions & 0 deletions pkg/device/nvidia/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"errors"
"flag"
"fmt"
"math"
"strconv"
"strings"

Expand Down Expand Up @@ -319,3 +320,13 @@ func (dev *NvidiaGPUDevices) GenerateResourceRequests(ctr *corev1.Container) uti
}
return util.ContainerDeviceRequest{}
}

// ResourceMemoryUnitConversion multiplies the NVIDIA GPU memory resource (named
// by ResourceMem) found in resources by 1024 and rounds the product down to a
// whole number.
//
// Limits and Requests are processed independently, in that order; a list that
// does not set the resource is left untouched. The ResourceList maps are
// updated in place and the same requirements value is returned.
//
// NOTE(review): presumably this turns a Gi-denominated value into Mi — confirm
// against the caller.
func (dev *NvidiaGPUDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
	memName := corev1.ResourceName(ResourceMem)
	for _, list := range []corev1.ResourceList{resources.Limits, resources.Requests} {
		quantity, found := list[memName]
		if !found {
			continue
		}
		scaled := int64(math.Floor(quantity.AsApproximateFloat64() * 1024))
		list[memName] = *resource.NewQuantity(scaled, resource.BinarySI)
	}
	return resources
}
87 changes: 87 additions & 0 deletions pkg/device/nvidia/device_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ func Test_CheckUUID(t *testing.T) {
}

func Test_CheckType(t *testing.T) {

gpuDevices := &NvidiaGPUDevices{}
tests := []struct {
name string
Expand Down Expand Up @@ -275,3 +276,89 @@ func Test_CheckType(t *testing.T) {
})
}
}

// Test_ResourceMemoryUnitConversion checks that
// NvidiaGPUDevices.ResourceMemoryUnitConversion multiplies the
// "nvidia.com/gpumem" entries of Limits and Requests by 1024 (rounding down)
// and leaves requirements without that resource untouched.
func Test_ResourceMemoryUnitConversion(t *testing.T) {
	tests := []struct {
		name string
		args corev1.ResourceRequirements
		want corev1.ResourceRequirements
	}{
		{
			name: "not set memory field",
			args: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					"cpu":            resource.MustParse("1"),
					"nvidia.com/gpu": resource.MustParse("2"),
				},
			},
			want: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					"cpu":            resource.MustParse("1"),
					"nvidia.com/gpu": resource.MustParse("2"),
				},
			},
		},
		{
			name: "memory limits set 1Gi",
			args: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					"nvidia.com/gpu":    resource.MustParse("2"),
					"nvidia.com/gpumem": resource.MustParse("1"),
				},
			},
			want: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					"nvidia.com/gpu":    resource.MustParse("2"),
					"nvidia.com/gpumem": resource.MustParse("1024"),
				},
			},
		},
		{
			// 0.001 * 1024 = 1.024, which is rounded down to 1.
			name: "memory limits set 0.001Gi",
			args: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					"nvidia.com/gpu":    resource.MustParse("2"),
					"nvidia.com/gpumem": resource.MustParse("0.001"),
				},
			},
			want: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					"nvidia.com/gpu":    resource.MustParse("2"),
					"nvidia.com/gpumem": resource.MustParse("1"),
				},
			},
		},
		{
			// Limits carry 1 and Requests carry 0.001; both lists must be
			// converted independently.
			name: "memory limits and request set 0.001Gi",
			args: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					"nvidia.com/gpu":    resource.MustParse("2"),
					"nvidia.com/gpumem": resource.MustParse("1"),
				},
				Requests: corev1.ResourceList{
					"nvidia.com/gpu":    resource.MustParse("2"),
					"nvidia.com/gpumem": resource.MustParse("0.001"),
				},
			},
			want: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					"nvidia.com/gpu":    resource.MustParse("2"),
					"nvidia.com/gpumem": resource.MustParse("1024"),
				},
				Requests: corev1.ResourceList{
					"nvidia.com/gpu":    resource.MustParse("2"),
					"nvidia.com/gpumem": resource.MustParse("1"),
				},
			},
		},
	}

	// ResourceMem is package-level state; restore it when the test finishes
	// so the override does not leak into other tests in this package.
	previousResourceMem := ResourceMem
	ResourceMem = "nvidia.com/gpumem"
	defer func() { ResourceMem = previousResourceMem }()

	gpuDevices := &NvidiaGPUDevices{}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			got := gpuDevices.ResourceMemoryUnitConversion(test.args)
			assert.DeepEqual(t, got, test.want)
		})
	}
}
3 changes: 2 additions & 1 deletion pkg/scheduler/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ var (
NodeSchedulerPolicy = policy.NodeSchedulerPolicyBinpack.String()
// GPUSchedulerPolicy is config this scheduler GPU to use `binpack` or `spread`. default value is spread.
GPUSchedulerPolicy = policy.GPUSchedulerPolicySpread.String()

// NodeLabelSelector is scheduler filter node by node label.
NodeLabelSelector map[string]string
// MemoryUserGIUnit indicates whether the memory resource unit is Gi.
MemoryUserGIUnit bool = false
)
3 changes: 3 additions & 0 deletions pkg/scheduler/webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ func (h *webhook) Handle(_ context.Context, req admission.Request) admission.Res
}
}
for _, val := range device.GetDevices() {
if config.MemoryUserGIUnit {
c.Resources = val.ResourceMemoryUnitConversion(c.Resources)
}
found, err := val.MutateAdmission(c)
if err != nil {
klog.Errorf("validating pod failed:%s", err.Error())
Expand Down

0 comments on commit 0d96d67

Please sign in to comment.