Skip to content

Commit 027d853

Browse files
Replace DispatchQueue and BookingQueue with HealthyThreadPool (#1035)
* Update dispatchQuery to use min_cores Sorting jobs only by priority causes a situation where low-priority jobs can get starved by a constant flow of high-priority jobs. The new formula adds a modifier to the sorting rank to take into account the number of cores the job is requesting and also the number of days the job has been waiting in the queue. Priority numbers over 200 will mostly override the formula and work as priority-only scheduling. sort = priority + (100 * (1 - (job.cores/job.int_min_cores))) + (age in days) Besides that, also take layer_int_cores_min into account when filtering folder_resource limitations to avoid allocating more cores than the folder limits. (cherry picked from commit 566411aeeddc60983a30eabe121fd03263d05525) * Revert "Update dispatchQuery to use min_cores" This reverts commit 2eb4936 * Replace DispatchQueue and BookingQueue with HealthyThreadPool Queues will not inherit from ThreadPoolExecutor; instead they will manage an instance of HealthyThreadPool, which is a ThreadPoolExecutor that handles health checks, termination and repeated tasks. With this, the booking queue should be able to self-heal when locked threads happen. * Remove trackit reference * Refactor HostReportQueue to use guava Cache Use a guava cache to store only the last version of a HostReport per host. * Configure HostReportQueue on opencue.properties * Fix unit tests * Fix unit tests * This unit test is not actually testing anything useful The test doesn't make sense with the new thread pool and will also cause problems whenever a user changes a config property. Co-authored-by: Roula O'Regan <[email protected]>
1 parent 6da47ba commit 027d853

37 files changed

+781
-363
lines changed

cuebot/src/main/java/com/imageworks/spcue/config/AppConfig.java

-1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,5 @@ public ServletRegistrationBean<JobLaunchServlet> jobLaunchServlet() {
6565
b.setServlet(new JobLaunchServlet());
6666
return b;
6767
}
68-
6968
}
7069

cuebot/src/main/java/com/imageworks/spcue/dispatcher/BookingQueue.java

+61-87
Original file line numberDiff line numberDiff line change
@@ -19,128 +19,102 @@
1919

2020
package com.imageworks.spcue.dispatcher;
2121

22-
import java.util.concurrent.LinkedBlockingQueue;
23-
import java.util.concurrent.ThreadPoolExecutor;
24-
import java.util.concurrent.TimeUnit;
25-
import java.util.concurrent.atomic.AtomicBoolean;
26-
2722
import com.google.common.cache.Cache;
2823
import com.google.common.cache.CacheBuilder;
29-
import com.imageworks.spcue.dispatcher.commands.DispatchBookHost;
30-
import com.imageworks.spcue.util.CueUtil;
24+
import com.imageworks.spcue.dispatcher.commands.KeyRunnable;
3125

3226
import org.apache.log4j.Logger;
3327
import org.springframework.beans.factory.annotation.Autowired;
3428
import org.springframework.core.env.Environment;
3529

36-
public class BookingQueue extends ThreadPoolExecutor {
37-
38-
private static final Logger logger = Logger.getLogger(BookingQueue.class);
39-
40-
private static final int THREADS_KEEP_ALIVE_SECONDS = 10;
30+
public class BookingQueue {
4131

42-
private int queueCapacity;
43-
private int baseSleepTimeMillis = 400;
44-
private AtomicBoolean isShutdown = new AtomicBoolean(false);
32+
private final int healthThreshold;
33+
private final int minUnhealthyPeriodMin;
34+
private final int queueCapacity;
35+
private final int corePoolSize;
36+
private final int maxPoolSize;
37+
private static final int BASE_SLEEP_TIME_MILLIS = 300;
4538

46-
private QueueRejectCounter rejectCounter = new QueueRejectCounter();
39+
private static final Logger logger = Logger.getLogger("HEALTH");
40+
private HealthyThreadPool healthyThreadPool;
4741

48-
private Cache<String, DispatchBookHost> bookingCache = CacheBuilder.newBuilder()
49-
.expireAfterWrite(3, TimeUnit.MINUTES)
50-
// Invalidate entries that got executed by the threadpool and lost their reference
51-
.weakValues()
52-
.build();
53-
54-
private BookingQueue(int corePoolSize, int maxPoolSize, int queueCapacity, int sleepTimeMs) {
55-
super(corePoolSize, maxPoolSize, THREADS_KEEP_ALIVE_SECONDS,
56-
TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(queueCapacity));
42+
public BookingQueue(int healthThreshold, int minUnhealthyPeriodMin, int queueCapacity,
43+
int corePoolSize, int maxPoolSize) {
44+
this.healthThreshold = healthThreshold;
45+
this.minUnhealthyPeriodMin = minUnhealthyPeriodMin;
5746
this.queueCapacity = queueCapacity;
58-
this.baseSleepTimeMillis = sleepTimeMs;
59-
this.setRejectedExecutionHandler(rejectCounter);
60-
logger.info("BookingQueue" +
61-
" core:" + getCorePoolSize() +
62-
" max:" + getMaximumPoolSize() +
63-
" capacity:" + queueCapacity +
64-
" sleepTimeMs:" + sleepTimeMs);
47+
this.corePoolSize = corePoolSize;
48+
this.maxPoolSize = maxPoolSize;
49+
initThreadPool();
6550
}
6651

67-
@Autowired
68-
public BookingQueue(Environment env, String propertyKeyPrefix, int sleepTimeMs) {
69-
this(CueUtil.getIntProperty(env, propertyKeyPrefix, "core_pool_size"),
70-
CueUtil.getIntProperty(env, propertyKeyPrefix, "max_pool_size"),
71-
CueUtil.getIntProperty(env, propertyKeyPrefix, "queue_capacity"),
72-
sleepTimeMs);
52+
public void initThreadPool() {
53+
healthyThreadPool = new HealthyThreadPool(
54+
"BookingQueue",
55+
healthThreshold,
56+
minUnhealthyPeriodMin,
57+
queueCapacity,
58+
corePoolSize,
59+
maxPoolSize,
60+
BASE_SLEEP_TIME_MILLIS);
7361
}
7462

75-
public void execute(DispatchBookHost r) {
76-
if (isShutdown.get()) {
77-
return;
78-
}
79-
if (bookingCache.getIfPresent(r.getKey()) == null){
80-
bookingCache.put(r.getKey(), r);
81-
super.execute(r);
63+
public boolean isHealthy() {
64+
try {
65+
if (!healthyThreadPool.isHealthyOrShutdown()) {
66+
logger.warn("BookingQueue: Unhealthy queue terminated, starting a new one");
67+
initThreadPool();
68+
}
69+
} catch (InterruptedException e) {
70+
// TODO: evaluate crashing the whole Spring Boot context here
71+
// to force a container restart cycle
72+
logger.error("Failed to restart BookingThreadPool", e);
73+
return false;
8274
}
75+
76+
return true;
77+
}
78+
79+
public void execute(KeyRunnable r) {
80+
healthyThreadPool.execute(r);
8381
}
8482

8583
public long getRejectedTaskCount() {
86-
return rejectCounter.getRejectCount();
84+
return healthyThreadPool.getRejectedTaskCount();
8785
}
8886

8987
public int getQueueCapacity() {
9088
return queueCapacity;
9189
}
9290

9391
public void shutdown() {
94-
if (!isShutdown.getAndSet(true)) {
95-
logger.info("clearing out booking queue: " + this.getQueue().size());
96-
this.getQueue().clear();
97-
}
92+
healthyThreadPool.shutdown();
93+
}
9894

95+
public int getSize() {
96+
return healthyThreadPool.getQueue().size();
9997
}
10098

101-
/**
102-
* Lowers the sleep time as the queue grows.
103-
*
104-
* @return
105-
*/
106-
public int sleepTime() {
107-
if (!isShutdown.get()) {
108-
int sleep = (int) (baseSleepTimeMillis - (((this.getQueue().size () /
109-
(float) queueCapacity) * baseSleepTimeMillis)) * 2);
110-
if (sleep < 0) {
111-
sleep = 0;
112-
}
113-
return sleep;
114-
} else {
115-
return 0;
116-
}
99+
public int getRemainingCapacity() {
100+
return healthyThreadPool.getQueue().remainingCapacity();
117101
}
118102

119-
protected void beforeExecute(Thread t, Runnable r) {
120-
super.beforeExecute(t, r);
121-
if (isShutdown()) {
122-
this.remove(r);
123-
} else {
124-
try {
125-
Thread.sleep(sleepTime());
126-
} catch (InterruptedException e) {
127-
logger.info("booking queue was interrupted.");
128-
Thread.currentThread().interrupt();
129-
}
130-
}
103+
public int getActiveCount() {
104+
return healthyThreadPool.getActiveCount();
131105
}
132106

133-
protected void afterExecute(Runnable r, Throwable t) {
134-
super.afterExecute(r, t);
107+
public long getCompletedTaskCount() {
108+
return healthyThreadPool.getCompletedTaskCount();
109+
}
135110

136-
// Invalidate cache to avoid having to wait for GC to mark processed entries collectible
137-
DispatchBookHost h = (DispatchBookHost)r;
138-
bookingCache.invalidate(h.getKey());
111+
public long getCorePoolSize() {
112+
return corePoolSize;
113+
}
139114

140-
if (sleepTime() < 100) {
141-
logger.info("BookingQueue cleanup executed.");
142-
getQueue().clear();
143-
}
115+
public long getMaximumPoolSize() {
116+
return maxPoolSize;
144117
}
118+
145119
}
146120

cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchQueue.java

+51-60
Original file line numberDiff line numberDiff line change
@@ -23,93 +23,84 @@
2323
import java.util.concurrent.atomic.AtomicLong;
2424

2525
import org.apache.log4j.Logger;
26-
import org.springframework.core.task.TaskExecutor;
27-
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
26+
import com.imageworks.spcue.dispatcher.commands.KeyRunnable;
2827

2928
public class DispatchQueue {
3029

31-
private TaskExecutor dispatchPool;
32-
private ThreadPoolTaskExecutor _dispatchPool;
33-
private String name = "Default";
34-
private AtomicBoolean isShutdown = new AtomicBoolean(false);
35-
36-
private final AtomicLong tasksRun = new AtomicLong(0);
37-
private final AtomicLong tasksRejected = new AtomicLong(0);
30+
private int healthThreshold;
31+
private int minUnhealthyPeriodMin;
32+
private int queueCapacity;
33+
private int corePoolSize;
34+
private int maxPoolSize;
3835

39-
private static final Logger logger = Logger.getLogger(DispatchQueue.class);
40-
41-
public DispatchQueue() {}
36+
private static final Logger logger = Logger.getLogger("HEALTH");
37+
private String name = "Default";
38+
private HealthyThreadPool healthyDispatchPool;
4239

43-
public DispatchQueue(String name) {
40+
public DispatchQueue(String name, int healthThreshold, int minUnhealthyPeriodMin, int queueCapacity,
41+
int corePoolSize, int maxPoolSize) {
4442
this.name = name;
43+
this.healthThreshold = healthThreshold;
44+
this.minUnhealthyPeriodMin = minUnhealthyPeriodMin;
45+
this.queueCapacity = queueCapacity;
46+
this.corePoolSize = corePoolSize;
47+
this.maxPoolSize = maxPoolSize;
48+
initThreadPool();
4549
}
4650

47-
public void execute(Runnable r) {
51+
public void initThreadPool() {
52+
healthyDispatchPool = new HealthyThreadPool(
53+
name,
54+
healthThreshold,
55+
minUnhealthyPeriodMin,
56+
queueCapacity,
57+
corePoolSize,
58+
maxPoolSize);
59+
}
60+
61+
public boolean isHealthy() {
4862
try {
49-
if (!isShutdown.get()) {
50-
this.dispatchPool.execute(r);
51-
tasksRun.addAndGet(1);
63+
if (!healthyDispatchPool.isHealthyOrShutdown()) {
64+
logger.warn("DispatchQueue_" + name + ": Unhealthy queue terminated, starting a new one");
65+
initThreadPool();
5266
}
53-
} catch (Exception e) {
54-
long rejection = tasksRejected.addAndGet(1);
55-
logger.warn("Warning, dispatch queue - [" + name + "] rejected, " + e);
56-
throw new DispatchQueueTaskRejectionException(
57-
"Warning, dispatch queue [" + name + " rejected task #"
58-
+ rejection);
67+
} catch (InterruptedException e) {
68+
// TODO: evaluate crashing the whole Spring Boot context here
69+
// to force a container restart cycle
70+
logger.error("DispatchQueue_" + name + ":Failed to restart DispatchThreadPool", e);
71+
return false;
5972
}
60-
}
6173

62-
public int getMaxPoolSize() {
63-
return _dispatchPool.getMaxPoolSize();
74+
return true;
6475
}
6576

66-
public int getActiveThreadCount() {
67-
return _dispatchPool.getActiveCount();
77+
public void execute(KeyRunnable r) {
78+
healthyDispatchPool.execute(r);
6879
}
6980

70-
public int getWaitingCount() {
71-
return _dispatchPool.getThreadPoolExecutor().getQueue().size();
81+
public long getRejectedTaskCount() {
82+
return healthyDispatchPool.getRejectedTaskCount();
7283
}
7384

74-
public int getRemainingCapacity() {
75-
return _dispatchPool.getThreadPoolExecutor().getQueue().remainingCapacity();
85+
public void shutdown() {
86+
healthyDispatchPool.shutdown();
7687
}
7788

78-
public long getTotalDispatched() {
79-
return tasksRun.get();
89+
public int getSize() {
90+
return healthyDispatchPool.getQueue().size();
8091
}
8192

82-
public long getTotalRejected() {
83-
return tasksRejected.get();
93+
public int getRemainingCapacity() {
94+
return healthyDispatchPool.getQueue().remainingCapacity();
8495
}
8596

86-
public TaskExecutor getDispatchPool() {
87-
return dispatchPool;
97+
public int getActiveCount() {
98+
return healthyDispatchPool.getActiveCount();
8899
}
89100

90-
public void setDispatchPool(TaskExecutor dispatchPool) {
91-
this.dispatchPool = dispatchPool;
92-
this._dispatchPool = (ThreadPoolTaskExecutor) dispatchPool;
101+
public long getCompletedTaskCount() {
102+
return healthyDispatchPool.getCompletedTaskCount();
93103
}
94104

95-
public void shutdown() {
96-
if (!isShutdown.getAndSet(true)) {
97-
logger.info("Shutting down thread pool " + name + ", currently "
98-
+ getActiveThreadCount() + " active threads.");
99-
final long startTime = System.currentTimeMillis();
100-
while (getWaitingCount() != 0 && getActiveThreadCount() != 0) {
101-
try {
102-
if (System.currentTimeMillis() - startTime > 10000) {
103-
throw new InterruptedException(name
104-
+ " thread pool failed to shutdown properly");
105-
}
106-
Thread.sleep(250);
107-
} catch (InterruptedException e) {
108-
Thread.currentThread().interrupt();
109-
break;
110-
}
111-
}
112-
}
113-
}
114105
}
115106

cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java

+6-4
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import com.imageworks.spcue.VirtualProc;
4040
import com.imageworks.spcue.dispatcher.commands.DispatchBookHost;
4141
import com.imageworks.spcue.dispatcher.commands.DispatchNextFrame;
42+
import com.imageworks.spcue.dispatcher.commands.KeyRunnable;
4243
import com.imageworks.spcue.grpc.host.LockState;
4344
import com.imageworks.spcue.grpc.job.FrameExitStatus;
4445
import com.imageworks.spcue.grpc.job.FrameState;
@@ -158,10 +159,11 @@ public void handleFrameCompleteReport(final FrameCompleteReport report) {
158159
final LayerDetail layer = jobManager.getLayerDetail(report.getFrame().getLayerId());
159160
final DispatchFrame frame = jobManager.getDispatchFrame(report.getFrame().getFrameId());
160161
final FrameState newFrameState = determineFrameState(job, layer, frame, report);
161-
162+
final String key = proc.getJobId() + "_" + report.getFrame().getLayerId() +
163+
"_" + report.getFrame().getFrameId();
162164
if (dispatchSupport.stopFrame(frame, newFrameState, report.getExitStatus(),
163165
report.getFrame().getMaxRss())) {
164-
dispatchQueue.execute(new Runnable() {
166+
dispatchQueue.execute(new KeyRunnable(key) {
165167
@Override
166168
public void run() {
167169
try {
@@ -182,7 +184,7 @@ public void run() {
182184
* properties.
183185
*/
184186
if (redirectManager.hasRedirect(proc)) {
185-
dispatchQueue.execute(new Runnable() {
187+
dispatchQueue.execute(new KeyRunnable(key) {
186188
@Override
187189
public void run() {
188190
try {
@@ -195,7 +197,7 @@ public void run() {
195197
});
196198
}
197199
else {
198-
dispatchQueue.execute(new Runnable() {
200+
dispatchQueue.execute(new KeyRunnable(key) {
199201
@Override
200202
public void run() {
201203
try {

0 commit comments

Comments
 (0)