Skip to content

Commit c22fe12

Browse files
authored
Add multiple GPU support #760 (#924)
1 parent b10cd0d commit c22fe12

File tree

134 files changed

+4872
-711
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

134 files changed

+4872
-711
lines changed

VERSION.in

+1-1
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
0.12
1+
0.13

cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java

+3-1
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,9 @@ public class DispatchFrame extends FrameEntity implements FrameInterface {
4343
public int maxCores;
4444
public boolean threadable;
4545
public long minMemory;
46-
public long minGpu;
46+
public int minGpus;
47+
public int maxGpus;
48+
public long minGpuMemory;
4749

4850
public String services;
4951
}

cuebot/src/main/java/com/imageworks/spcue/DispatchHost.java

+26-13
Original file line number | Diff line number | Diff line change
@@ -35,13 +35,16 @@ public class DispatchHost extends Entity
3535
public int cores;
3636
public int idleCores;
3737

38+
public int gpus;
39+
public int idleGpus;
40+
3841
// Basically an 0 = auto, 1 = all.
3942
public int threadMode;
4043

4144
public long memory;
4245
public long idleMemory;
43-
public long gpu;
44-
public long idleGpu;
46+
public long gpuMemory;
47+
public long idleGpuMemory;
4548
public String tags;
4649
public String os;
4750

@@ -53,11 +56,13 @@ public class DispatchHost extends Entity
5356
* booked to this host.
5457
*/
5558
public int strandedCores = 0;
59+
public int strandedGpus = 0;
5660

5761
// To reserve resources for future gpu job
5862
long idleMemoryOrig = 0;
5963
int idleCoresOrig = 0;
60-
long idleGpuOrig = 0;
64+
long idleGpuMemoryOrig = 0;
65+
int idleGpusOrig = 0;
6166

6267
public String getHostId() {
6368
return id;
@@ -72,41 +77,47 @@ public String getFacilityId() {
7277
}
7378

7479
@Override
75-
public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) {
80+
public boolean hasAdditionalResources(int minCores, long minMemory, int minGpus, long minGpuMemory) {
7681

7782
if (idleCores < minCores) {
7883
return false;
7984
}
8085
else if (idleMemory < minMemory) {
8186
return false;
8287
}
83-
else if (idleGpu < minGpu) {
88+
else if (idleGpus < minGpus) {
89+
return false;
90+
}
91+
else if (idleGpuMemory < minGpuMemory) {
8492
return false;
8593
}
8694

8795
return true;
8896
}
8997

9098
@Override
91-
public void useResources(int coreUnits, long memory, long gpu) {
99+
public void useResources(int coreUnits, long memory, int gpuUnits, long gpuMemory) {
92100
idleCores = idleCores - coreUnits;
93101
idleMemory = idleMemory - memory;
94-
idleGpu = idleGpu - gpu;
102+
idleGpus = idleGpus - gpuUnits;
103+
idleGpuMemory = idleGpuMemory - gpuMemory;
95104
}
96105

97106
/**
98107
* If host has idle gpu, remove enough resources to book a gpu frame later.
99108
*
100109
*/
101110
public void removeGpu() {
102-
if (idleGpu > 0 && idleGpuOrig == 0) {
111+
if (idleGpuMemory > 0 && idleGpuMemoryOrig == 0) {
103112
idleMemoryOrig = idleMemory;
104113
idleCoresOrig = idleCores;
105-
idleGpuOrig = idleGpu;
114+
idleGpuMemoryOrig = idleGpuMemory;
115+
idleGpusOrig = idleGpus;
106116

107117
idleMemory = idleMemory - Math.min(CueUtil.GB4, idleMemory);
108118
idleCores = idleCores - Math.min(100, idleCores);
109-
idleGpu = 0;
119+
idleGpuMemory = idleGpuMemory - Math.min(CueUtil.GB4, idleGpuMemory);
120+
idleGpus = idleGpus - Math.min(1, idleGpus);
110121
}
111122
}
112123

@@ -115,14 +126,16 @@ public void removeGpu() {
115126
*
116127
*/
117128
public void restoreGpu() {
118-
if (idleGpuOrig > 0) {
129+
if (idleGpuMemoryOrig > 0) {
119130
idleMemory = idleMemoryOrig;
120131
idleCores = idleCoresOrig;
121-
idleGpu = idleGpuOrig;
132+
idleGpuMemory = idleGpuMemoryOrig;
133+
idleGpus = idleGpusOrig;
122134

123135
idleMemoryOrig = 0;
124136
idleCoresOrig = 0;
125-
idleGpuOrig = 0;
137+
idleGpuMemoryOrig = 0;
138+
idleGpusOrig = 0;
126139
}
127140
}
128141
}

cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java

+27
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,9 @@ public class ExecutionSummary {
2828
public long coreTime;
2929
public long coreTimeSuccess;
3030
public long coreTimeFail;
31+
public long gpuTime;
32+
public long gpuTimeSuccess;
33+
public long gpuTimeFail;
3134
public long highMemoryKb;
3235

3336
public long getHighMemoryKb() {
@@ -69,5 +72,29 @@ public long getCoreTimeFail() {
6972
public void setCoreTimeFail(long coreTimeFail) {
7073
this.coreTimeFail = coreTimeFail;
7174
}
75+
76+
public long getGpuTime() {
77+
return gpuTime;
78+
}
79+
80+
public void setGpuTime(long gpuTime) {
81+
this.gpuTime = gpuTime;
82+
}
83+
84+
public long getGpuTimeSuccess() {
85+
return gpuTimeSuccess;
86+
}
87+
88+
public void setGpuTimeSuccess(long gpuTimeSuccess) {
89+
this.gpuTimeSuccess = gpuTimeSuccess;
90+
}
91+
92+
public long getGpuTimeFail() {
93+
return gpuTimeFail;
94+
}
95+
96+
public void setGpuTimeFail(long gpuTimeFail) {
97+
this.gpuTimeFail = gpuTimeFail;
98+
}
7299
}
73100

cuebot/src/main/java/com/imageworks/spcue/GroupDetail.java

+5
Original file line number | Diff line number | Diff line change
@@ -23,11 +23,16 @@ public class GroupDetail extends Entity implements GroupInterface, DepartmentInt
2323

2424
public int jobMinCores = -1;
2525
public int jobMaxCores = -1;
26+
public int jobMinGpus = -1;
27+
public int jobMaxGpus = -1;
2628
public int jobPriority = -1;
2729

2830
public int minCores = -1;
2931
public int maxCores = -1;
3032

33+
public int minGpus = -1;
34+
public int maxGpus = -1;
35+
3136
public String parentId = null;
3237
public String showId;
3338
public String deptId;

cuebot/src/main/java/com/imageworks/spcue/HostEntity.java

+12-8
Original file line number | Diff line number | Diff line change
@@ -36,10 +36,12 @@ public class HostEntity extends Entity implements HostInterface {
3636
public int procs;
3737
public int cores;
3838
public int idleCores;
39-
public int memory;
40-
public int idleMemory;
41-
public int gpu;
42-
public int idleGpu;
39+
public long memory;
40+
public long idleMemory;
41+
public int gpus;
42+
public int idleGpus;
43+
public long gpuMemory;
44+
public long idleGpuMemory;
4345

4446
public boolean unlockAtBoot;
4547

@@ -57,10 +59,12 @@ public HostEntity(Host grpcHost) {
5759
this.nimbyEnabled = grpcHost.getNimbyEnabled();
5860
this.cores = (int) grpcHost.getCores();
5961
this.idleCores = (int) grpcHost.getIdleCores();
60-
this.memory = (int) grpcHost.getMemory();
61-
this.idleMemory = (int) grpcHost.getIdleMemory();
62-
this.gpu = (int) grpcHost.getGpu();
63-
this.idleGpu = (int) grpcHost.getIdleGpu();
62+
this.memory = grpcHost.getMemory();
63+
this.idleMemory = grpcHost.getIdleMemory();
64+
this.gpus = (int) grpcHost.getGpus();
65+
this.idleGpus = (int) grpcHost.getIdleGpus();
66+
this.gpuMemory = grpcHost.getGpuMemory();
67+
this.idleGpuMemory = grpcHost.getIdleGpuMemory();
6468
}
6569

6670
public String getHostId() {

cuebot/src/main/java/com/imageworks/spcue/Inherit.java

+2
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,8 @@ public enum Inherit {
2828
Priority,
2929
MinCores,
3030
MaxCores,
31+
MinGpus,
32+
MaxGpus,
3133
All
3234
}
3335

cuebot/src/main/java/com/imageworks/spcue/JobDetail.java

+5-2
Original file line number | Diff line number | Diff line change
@@ -46,12 +46,15 @@ public class JobDetail extends JobEntity implements JobInterface, DepartmentInte
4646
public int priority = 1;
4747
public int minCoreUnits = 100;
4848
public int maxCoreUnits = 200000;
49+
public int minGpuUnits = 0;
50+
public int maxGpuUnits = 1000;
4951
public boolean isLocal = false;
5052
public String localHostName;
5153
public int localMaxCores;
52-
public int localMaxMemory;
54+
public long localMaxMemory;
5355
public int localThreadNumber;
54-
public int localMaxGpu;
56+
public int localMaxGpus;
57+
public long localMaxGpuMemory;
5558

5659
public String getDepartmentId() {
5760
return deptId;

cuebot/src/main/java/com/imageworks/spcue/LayerDetail.java

+15-5
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@ public class LayerDetail extends LayerEntity implements LayerInterface {
3232
public LayerType type;
3333
public int minimumCores;
3434
public int maximumCores;
35+
public int minimumGpus;
36+
public int maximumGpus;
3537
public boolean isThreadable;
3638
public long minimumMemory;
37-
public long minimumGpu;
39+
public long minimumGpuMemory;
3840
public int chunkSize;
3941
public int timeout;
4042
public int timeout_llu;
@@ -116,12 +118,20 @@ public void setMinimumMemory(long minimumMemory) {
116118
this.minimumMemory = minimumMemory;
117119
}
118120

119-
public long getMinimumGpu() {
120-
return minimumGpu;
121+
public int getMinimumGpus() {
122+
return minimumGpus;
121123
}
122124

123-
public void setMinimumGpu(long minimumGpu) {
124-
this.minimumGpu = minimumGpu;
125+
public void setMinimumGpus(int minimumGpus) {
126+
this.minimumGpus = minimumGpus;
127+
}
128+
129+
public long getMinimumGpuMemory() {
130+
return minimumGpuMemory;
131+
}
132+
133+
public void setMinimumGpuMemory(long minimumGpuMemory) {
134+
this.minimumGpuMemory = minimumGpuMemory;
125135
}
126136

127137
public int getChunkSize() {

cuebot/src/main/java/com/imageworks/spcue/LocalHostAssignment.java

+39-16
Original file line number | Diff line number | Diff line change
@@ -35,11 +35,13 @@ public class LocalHostAssignment extends Entity
3535

3636
private int idleCoreUnits;
3737
private long idleMemory;
38-
private long idleGpu;
38+
private int idleGpuUnits;
39+
private long idleGpuMemory;
3940

4041
private long maxMemory;
41-
private long maxGpu;
42+
private long maxGpuMemory;
4243
private int maxCoreUnits;
44+
private int maxGpuUnits;
4345

4446
private int threads;
4547

@@ -52,34 +54,39 @@ public class LocalHostAssignment extends Entity
5254

5355
public LocalHostAssignment() { }
5456

55-
public LocalHostAssignment(int maxCores, int threads, long maxMemory, long maxGpu) {
57+
public LocalHostAssignment(int maxCores, int threads, long maxMemory, int maxGpus, long maxGpuMemory) {
5658
this.maxCoreUnits = maxCores;
5759
this.threads = threads;
5860
this.maxMemory = maxMemory;
59-
this.maxGpu = maxGpu;
61+
this.maxGpuUnits = maxGpus;
62+
this.maxGpuMemory = maxGpuMemory;
6063
}
6164

6265
@Override
63-
public boolean hasAdditionalResources(int minCores, long minMemory, long minGpu) {
66+
public boolean hasAdditionalResources(int minCores, long minMemory, int minGpus, long minGpuMemory) {
6467

6568
if (idleCoreUnits < minCores) {
6669
return false;
6770
}
6871
else if (idleMemory < minMemory) {
6972
return false;
7073
}
71-
else if (idleGpu < minGpu) {
74+
else if (idleGpuUnits < minGpus) {
75+
return false;
76+
}
77+
else if (idleGpuMemory < minGpuMemory) {
7278
return false;
7379
}
7480

7581
return true;
7682
}
7783

7884
@Override
79-
public void useResources(int coreUnits, long memory, long gpu) {
85+
public void useResources(int coreUnits, long memory, int gpuUnits, long gpuMemory) {
8086
idleCoreUnits = idleCoreUnits - coreUnits;
8187
idleMemory = idleMemory - memory;
82-
idleGpu = idleGpu - gpu;
88+
idleGpuUnits = idleGpuUnits - gpuUnits;
89+
idleGpuMemory = idleGpuMemory - gpuMemory;
8390
}
8491

8592
public int getThreads() {
@@ -110,16 +117,24 @@ public long getIdleMemory() {
110117
return this.idleMemory;
111118
}
112119

113-
public long getMaxGpu() {
114-
return maxGpu;
120+
public int getMaxGpuUnits() {
121+
return maxGpuUnits;
122+
}
123+
124+
public void setMaxGpuUnits(int maxGpuUnits) {
125+
this.maxGpuUnits = maxGpuUnits;
126+
}
127+
128+
public long getMaxGpuMemory() {
129+
return maxGpuMemory;
115130
}
116131

117-
public void setMaxGpu(long maxGpu) {
118-
this.maxGpu = maxGpu;
132+
public void setMaxGpuMemory(long maxGpuMemory) {
133+
this.maxGpuMemory = maxGpuMemory;
119134
}
120135

121-
public long getIdleGpu() {
122-
return this.idleGpu;
136+
public long getIdleGpuMemory() {
137+
return this.idleGpuMemory;
123138
}
124139

125140
public int getIdleCoreUnits() {
@@ -134,8 +149,16 @@ public void setIdleMemory(long idleMemory) {
134149
this.idleMemory = idleMemory;
135150
}
136151

137-
public void setIdleGpu(long idleGpu) {
138-
this.idleGpu = idleGpu;
152+
public int getIdleGpuUnits() {
153+
return this.idleGpuUnits;
154+
}
155+
156+
public void setIdleGpuUnits(int idleGpuUnits) {
157+
this.idleGpuUnits = idleGpuUnits;
158+
}
159+
160+
public void setIdleGpuMemory(long idleGpuMemory) {
161+
this.idleGpuMemory = idleGpuMemory;
139162
}
140163

141164
public String getHostId() {

0 commit comments

Comments
 (0)