Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into self-monitoring
Browse files (browse the repository at this point in the history)
  • Loading branch information
JasonHe-WQ committed Jun 19, 2024
2 parents 61b68b1 + 7ed4a58 commit b2118aa
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 25 deletions.
26 changes: 26 additions & 0 deletions examples/nvidia/use_as_normal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
apiVersion: v1
kind: Pod
metadata:
name: gpu-pod1
spec:
containers:
- name: ubuntu-container
image: ubuntu:18.04
command: ["bash", "-c", "sleep 86400"]
resources:
limits:
nvidia.com/gpu: 2 # requesting 2 vGPUs
---
apiVersion: v1
kind: Pod
metadata:
name: gpu-pod2
spec:
containers:
- name: ubuntu-container
image: ubuntu:18.04
command: ["bash", "-c", "sleep 86400"]
resources:
limits:
nvidia.com/gpu: 2 # requesting 2 vGPUs
# gpu-pod1 and gpu-pod2 will NOT share the same GPU
2 changes: 1 addition & 1 deletion examples/nvidia/use_exclusive_card.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ spec:
limits:
nvidia.com/gpu: 2 # requesting 2 vGPUs
nvidia.com/gpumem-percentage: 100 # Each vGPU contains 100% of the entire GPU device memory (Optional,Integer)
nvidia.com/gpucores: 100 # Each vGPU uses 30% of the entire GPU cores(Optional,Integer)
nvidia.com/gpucores: 100 # Each vGPU uses 100% of the entire GPU cores(Optional,Integer)
6 changes: 4 additions & 2 deletions pkg/scheduler/policy/gpu_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,11 @@ func (ds *DeviceListsScore) ComputeScore(requests util.ContainerDeviceRequests)
}
mem += container.Memreq
}
useScore := float32(request+ds.Device.Used) / float32(ds.Device.Count)
klog.V(2).Infof("device %s user %d, userCore %d, userMem %d,", ds.Device.ID, ds.Device.Used, ds.Device.Usedcores, ds.Device.Usedmem)

usedScore := float32(request+ds.Device.Used) / float32(ds.Device.Count)
coreScore := float32(core+ds.Device.Usedcores) / float32(ds.Device.Totalcore)
memScore := float32(mem+ds.Device.Usedmem) / float32(ds.Device.Totalmem)
ds.Score = float32(Weight) * (useScore + coreScore + memScore)
ds.Score = float32(Weight) * (usedScore + coreScore + memScore)
klog.V(2).Infof("device %s computer score is %f", ds.Device.ID, ds.Score)
}
16 changes: 8 additions & 8 deletions pkg/scheduler/policy/node_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,24 +51,24 @@ func (l NodeScoreList) Less(i, j int) bool {
}

func (ns *NodeScore) ComputeScore(devices DeviceUsageList) {
request, core, mem := int32(0), int32(0), int32(0)
// current user having request resource
user, userCore, userMem := int32(0), int32(0), int32(0)
used, usedCore, usedMem := int32(0), int32(0), int32(0)
for _, device := range devices.DeviceLists {
user += device.Device.Used
userCore += device.Device.Usedcores
userMem += device.Device.Usedmem
used += device.Device.Used
usedCore += device.Device.Usedcores
usedMem += device.Device.Usedmem
}
klog.V(2).Infof("node %s used %d, usedCore %d, usedMem %d,", ns.NodeID, used, usedCore, usedMem)

total, totalCore, totalMem := int32(0), int32(0), int32(0)
for _, deviceLists := range devices.DeviceLists {
total += deviceLists.Device.Count
totalCore += deviceLists.Device.Totalcore
totalMem += deviceLists.Device.Totalmem
}
useScore := float32(request+user) / float32(total)
coreScore := float32(core+userCore) / float32(totalCore)
memScore := float32(mem+userMem) / float32(totalMem)
useScore := float32(used) / float32(total)
coreScore := float32(usedCore) / float32(totalCore)
memScore := float32(usedMem) / float32(totalMem)
ns.Score = float32(Weight) * (useScore + coreScore + memScore)
klog.V(2).Infof("node %s computer score is %f", ns.NodeID, ns.Score)
}
4 changes: 2 additions & 2 deletions pkg/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
for _, val := range device.GetDevices() {
err = val.LockNode(node, current)
if err != nil {
goto RelaseNodeLocks
goto ReleaseNodeLocks
}
}
/*
Expand Down Expand Up @@ -419,7 +419,7 @@ func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.Exten
klog.Infoln("After Binding Process")
return res, nil
}
RelaseNodeLocks:
ReleaseNodeLocks:
klog.InfoS("bind failed", "err", err.Error())
for _, val := range device.GetDevices() {
val.ReleaseNodeLock(node, current)
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/score.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ func (s *Scheduler) calcScore(nodes *map[string]*NodeUsage, nums util.PodDeviceR
for nodeID, node := range *nodes {
viewStatus(*node)
score := policy.NodeScore{NodeID: nodeID, Devices: make(util.PodDevices), Score: 0}
score.ComputeScore(node.Devices)

//This loop is for different container request
ctrfit := false
Expand Down Expand Up @@ -225,7 +226,6 @@ func (s *Scheduler) calcScore(nodes *map[string]*NodeUsage, nums util.PodDeviceR
}

if ctrfit {
score.ComputeScore(node.Devices)
res.NodeList = append(res.NodeList, &score)
}
}
Expand Down
22 changes: 11 additions & 11 deletions pkg/scheduler/score_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 5.25,
Score: 0,
},
},
},
Expand Down Expand Up @@ -242,7 +242,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 20.25,
Score: 15,
},
},
},
Expand Down Expand Up @@ -353,7 +353,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 2.625,
Score: 0,
},
},
},
Expand Down Expand Up @@ -464,7 +464,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 10.125,
Score: 7.5,
},
},
},
Expand Down Expand Up @@ -582,7 +582,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 5.25,
Score: 0,
},
},
},
Expand Down Expand Up @@ -700,7 +700,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 20.25,
Score: 15,
},
},
},
Expand Down Expand Up @@ -818,7 +818,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 5.25,
Score: 0,
},
{
NodeID: "node2",
Expand All @@ -835,7 +835,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 5.25,
Score: 0,
},
},
},
Expand Down Expand Up @@ -953,7 +953,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 20.25,
Score: 15,
},
{
NodeID: "node2",
Expand All @@ -970,7 +970,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 5.25,
Score: 0,
},
},
},
Expand Down Expand Up @@ -1081,7 +1081,7 @@ func Test_calcScore(t *testing.T) {
},
},
},
Score: 5.25,
Score: 0,
},
},
},
Expand Down

0 comments on commit b2118aa

Please sign in to comment.