Skip to content

Commit 9c3cd1b

Browse files
committed
feat: Partitionable Devices Support
1 parent 3956443 commit 9c3cd1b

File tree

2 files changed

+41
-16
lines changed

2 files changed

+41
-16
lines changed

cluster-autoscaler/simulator/dynamicresources/utils/utilization.go

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -101,25 +101,31 @@ func calculatePoolUtil(unallocated, allocated []resourceapi.Device, resourceSlic
101101
}
102102

103103
// we want to find the counter that is most utilized, since it is the "bottleneck" of the pool
104-
var maxUtilization float64
105-
if devicesWithoutCounters == 0 {
106-
maxUtilization = 0
107-
} else {
108-
maxUtilization = float64(allocatedDevicesWithoutCounters) / float64(devicesWithoutCounters)
104+
var partitionableUtilization float64 = 0
105+
var atomicDevicesUtilization float64 = 0
106+
if devicesWithoutCounters != 0 {
107+
atomicDevicesUtilization = partitionableUtilization
108+
}
109+
if len(TotalConsumedCounters) == 0 {
110+
return atomicDevicesUtilization
109111
}
110112
for counterSet, counters := range TotalConsumedCounters {
111113
for counterName, totalValue := range counters {
114+
if totalValue.IsZero() {
115+
continue
116+
}
112117
if allocatedSet, exists := allocatedConsumedCounters[counterSet]; exists {
113-
if allocatedValue, exists := allocatedSet[counterName]; exists && !totalValue.IsZero() {
118+
if allocatedValue, exists := allocatedSet[counterName]; exists {
114119
utilization := float64(allocatedValue.Value()) / float64(totalValue.Value())
115-
if utilization > maxUtilization {
116-
maxUtilization = utilization
120+
if utilization > partitionableUtilization {
121+
partitionableUtilization = utilization
117122
}
118123
}
119124
}
120125
}
121126
}
122-
return maxUtilization
127+
// when a pool has both atomic and partitionable devices, we sum their utilizations since they are mutually exclusive
128+
return partitionableUtilization + atomicDevicesUtilization
123129
}
124130

125131
// calculateConsumedCounters calculates the total counters consumed by a list of devices

cluster-autoscaler/simulator/dynamicresources/utils/utilization_test.go

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,10 @@ func TestDynamicResourceUtilization(t *testing.T) {
143143
wantHighestUtilizationName: apiv1.ResourceName(fmt.Sprintf("%s/%s", fooDriver, "pool1")),
144144
},
145145
{
146-
testName: "",
146+
testName: "partitionable devices, 2/4 partitions used",
147147
nodeInfo: framework.NewNodeInfo(node,
148148
mergeLists(
149-
testResourceSlicesWithPartionableDevices(fooDriver, "pool1", "node", 2, 4),
149+
testResourceSlicesWithPartionableDevices(fooDriver, "pool1", "gpu-0", "node", 2, 4),
150150
),
151151
mergeLists(
152152
testPodsWithCustomClaims(fooDriver, "pool1", "node", []string{"gpu-0-partition-0", "gpu-0-partition-1"}),
@@ -160,6 +160,25 @@ func TestDynamicResourceUtilization(t *testing.T) {
160160
wantHighestUtilization: 0.5,
161161
wantHighestUtilizationName: apiv1.ResourceName(fmt.Sprintf("%s/%s", fooDriver, "pool1")),
162162
},
163+
{
164+
testName: "multi-GPU partitionable devices, 2/8 partitions used",
165+
nodeInfo: framework.NewNodeInfo(node,
166+
mergeLists(
167+
testResourceSlicesWithPartionableDevices(fooDriver, "pool1", "gpu-0", "node", 2, 4),
168+
testResourceSlicesWithPartionableDevices(fooDriver, "pool1", "gpu-1", "node", 0, 4),
169+
),
170+
mergeLists(
171+
testPodsWithCustomClaims(fooDriver, "pool1", "node", []string{"gpu-0-partition-0", "gpu-0-partition-1"}),
172+
)...,
173+
),
174+
wantUtilization: map[string]map[string]float64{
175+
fooDriver: {
176+
"pool1": 0.25,
177+
},
178+
},
179+
wantHighestUtilization: 0.25,
180+
wantHighestUtilizationName: apiv1.ResourceName(fmt.Sprintf("%s/%s", fooDriver, "pool1")),
181+
},
163182
} {
164183
if tc.testName != "" {
165184
continue
@@ -212,22 +231,22 @@ func testResourceSlices(driverName, poolName, nodeName string, poolGen, deviceCo
212231
return result
213232
}
214233

215-
func testResourceSlicesWithPartionableDevices(driverName, poolName, nodeName string, poolGen, partitionCount int) []*resourceapi.ResourceSlice {
234+
func testResourceSlicesWithPartionableDevices(driverName, poolName, deviceName, nodeName string, poolGen, partitionCount int) []*resourceapi.ResourceSlice {
216235
sliceName := fmt.Sprintf("%s-%s-slice", driverName, poolName)
217236
var devices []resourceapi.Device
218237
for i := 0; i < partitionCount; i++ {
219238
devices = append(
220239
devices,
221240
resourceapi.Device{
222-
Name: fmt.Sprintf("gpu-0-partition-%d", i),
241+
Name: fmt.Sprintf("%s-partition-%d", deviceName, i),
223242
Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{
224243
"memory": {
225244
Value: resource.MustParse("10Gi"),
226245
},
227246
},
228247
ConsumesCounters: []resourceapi.DeviceCounterConsumption{
229248
{
230-
CounterSet: "gpu-0-counter-set",
249+
CounterSet: fmt.Sprintf("%s-counter-set", deviceName),
231250
Counters: map[string]resourceapi.Counter{
232251
"memory": {
233252
Value: resource.MustParse("10Gi"),
@@ -240,15 +259,15 @@ func testResourceSlicesWithPartionableDevices(driverName, poolName, nodeName str
240259
}
241260
devices = append(devices,
242261
resourceapi.Device{
243-
Name: "gpu-0",
262+
Name: deviceName,
244263
Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{
245264
"memory": {
246265
Value: resource.MustParse(fmt.Sprintf("%dGi", 10*partitionCount)),
247266
},
248267
},
249268
ConsumesCounters: []resourceapi.DeviceCounterConsumption{
250269
{
251-
CounterSet: "gpu-0-counter-set",
270+
CounterSet: fmt.Sprintf("%s-counter-set", deviceName),
252271
Counters: map[string]resourceapi.Counter{
253272
"memory": {
254273
Value: resource.MustParse(fmt.Sprintf("%dGi", 10*partitionCount)),

0 commit comments

Comments
 (0)