Skip to content

Commit b606a59

Browse files
[cuebot] Prevent booking frames on hosts with no temp space. (#1306)
1 parent baa122a commit b606a59

39 files changed

+489
-81
lines changed

cuebot/src/main/java/com/imageworks/spcue/dao/CommentDao.java

+22
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
import com.imageworks.spcue.HostInterface;
2424
import com.imageworks.spcue.JobInterface;
2525

26+
import java.util.List;
27+
2628
public interface CommentDao {
2729

2830
/**
@@ -32,6 +34,26 @@ public interface CommentDao {
3234
*/
3335
public void deleteComment(String id);
3436

37+
/**
38+
* Deletes comments using host, user, and subject
39+
*
40+
* @param host
41+
* @param user
42+
* @param subject
43+
* @return boolean: returns true if one or more comments were deleted
44+
*/
45+
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject);
46+
47+
/**
48+
* Get comments using host, user, and subject
49+
*
50+
* @param host
51+
* @param user
52+
* @param subject
53+
* @return List<CommentDetail>
54+
*/
55+
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject);
56+
3557
/**
3658
* Retrieves the specified comment.
3759
*

cuebot/src/main/java/com/imageworks/spcue/dao/HostDao.java

+8
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,14 @@ public interface HostDao {
7878
*/
7979
void updateHostState(HostInterface host, HardwareState state);
8080

81+
/**
82+
* Updates the free temporary directory space (mcp) of a host
83+
*
84+
* @param host
85+
* @param freeTempDir
86+
*/
87+
void updateHostFreeTempDir(HostInterface host, Long freeTempDir);
88+
8189
/**
8290
* returns a full host detail
8391
*

cuebot/src/main/java/com/imageworks/spcue/dao/postgres/CommentDaoJdbc.java

+13
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import java.sql.ResultSet;
2323
import java.sql.SQLException;
24+
import java.util.List;
2425
import java.util.Map;
2526

2627
import org.springframework.jdbc.core.RowMapper;
@@ -71,6 +72,18 @@ public CommentDetail mapRow(ResultSet rs, int row) throws SQLException {
7172
}
7273
};
7374

75+
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject) {
76+
return getJdbcTemplate().update(
77+
"DELETE FROM comments WHERE pk_host=? AND str_user=? AND str_subject=?",
78+
host.getHostId(), user, subject) > 0;
79+
}
80+
81+
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject) {
82+
return getJdbcTemplate().query(
83+
"SELECT * FROM comments WHERE pk_host=? AND str_user=? AND str_subject=?",
84+
COMMENT_DETAIL_MAPPER, host.getHostId(), user, subject);
85+
}
86+
7487
public CommentDetail getCommentDetail(String id) {
7588
return getJdbcTemplate().queryForObject(
7689
"SELECT * FROM comments WHERE pk_comment=?",

cuebot/src/main/java/com/imageworks/spcue/dao/postgres/HostDaoJdbc.java

+7
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,13 @@ public void updateHostState(HostInterface host, HardwareState state) {
523523
state.toString(), host.getHostId());
524524
}
525525

526+
@Override
527+
public void updateHostFreeTempDir(HostInterface host, Long freeTempDir) {
528+
getJdbcTemplate().update(
529+
"UPDATE host_stat SET int_mcp_free=? WHERE pk_host=?",
530+
freeTempDir, host.getHostId());
531+
}
532+
526533
@Override
527534
public void updateHostSetAllocation(HostInterface host, AllocationInterface alloc) {
528535

cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java

+73-6
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,13 @@
2929

3030
import org.apache.logging.log4j.Logger;
3131
import org.apache.logging.log4j.LogManager;
32+
import org.springframework.beans.factory.annotation.Autowired;
33+
import org.springframework.core.env.Environment;
3234
import org.springframework.core.task.TaskRejectedException;
3335
import org.springframework.dao.DataAccessException;
3436
import org.springframework.dao.EmptyResultDataAccessException;
3537

38+
import com.imageworks.spcue.CommentDetail;
3639
import com.imageworks.spcue.DispatchHost;
3740
import com.imageworks.spcue.FrameInterface;
3841
import com.imageworks.spcue.JobEntity;
@@ -57,6 +60,7 @@
5760
import com.imageworks.spcue.rqd.RqdClient;
5861
import com.imageworks.spcue.rqd.RqdClientException;
5962
import com.imageworks.spcue.service.BookingManager;
63+
import com.imageworks.spcue.service.CommentManager;
6064
import com.imageworks.spcue.service.HostManager;
6165
import com.imageworks.spcue.service.JobManager;
6266
import com.imageworks.spcue.service.JobManagerSupport;
@@ -80,6 +84,14 @@ public class HostReportHandler {
8084
private JobManagerSupport jobManagerSupport;
8185
private JobDao jobDao;
8286
private LayerDao layerDao;
87+
@Autowired
88+
private Environment env;
89+
@Autowired
90+
private CommentManager commentManager;
91+
// Comment constants
92+
private static final String SUBJECT_COMMENT_FULL_TEMP_DIR = "Host set to REPAIR for not having enough storage " +
93+
"space on the temporary directory (mcp)";
94+
private static final String CUEBOT_COMMENT_USER = "cuebot";
8395

8496
/**
8597
* Boolean to toggle if this class is accepting data or not.
@@ -156,7 +168,7 @@ public void handleHostReport(HostReport report, boolean isBoot) {
156168
rhost.getLoad(), new Timestamp(rhost.getBootTime() * 1000l),
157169
rhost.getAttributesMap().get("SP_OS"));
158170

159-
changeHardwareState(host, report.getHost().getState(), isBoot);
171+
changeHardwareState(host, report.getHost().getState(), isBoot, report.getHost().getFreeMcp());
160172
changeNimbyState(host, report.getHost());
161173

162174
/**
@@ -221,7 +233,14 @@ public void handleHostReport(HostReport report, boolean isBoot) {
221233
}
222234
}
223235

224-
if (host.idleCores < Dispatcher.CORE_POINTS_RESERVED_MIN) {
236+
// The minimum amount of free space in the temporary directory to book a host
237+
Long minBookableFreeTempDir = env.getRequiredProperty("dispatcher.min_bookable_free_temp_dir_kb", Long.class);
238+
239+
if (minBookableFreeTempDir != -1 && report.getHost().getFreeMcp() < minBookableFreeTempDir) {
240+
msg = String.format("%s doens't have enough free space in the temporary directory (mcp), %dMB needs %dMB",
241+
host.name, (report.getHost().getFreeMcp()/1024), (minBookableFreeTempDir/1024));
242+
}
243+
else if (host.idleCores < Dispatcher.CORE_POINTS_RESERVED_MIN) {
225244
msg = String.format("%s doesn't have enough idle cores, %d needs %d",
226245
host.name, host.idleCores, Dispatcher.CORE_POINTS_RESERVED_MIN);
227246
}
@@ -231,7 +250,7 @@ else if (host.idleMemory < Dispatcher.MEM_RESERVED_MIN) {
231250
}
232251
else if (report.getHost().getFreeMem() < CueUtil.MB512) {
233252
msg = String.format("%s doens't have enough free system mem, %d needs %d",
234-
host.name, report.getHost().getFreeMem(), Dispatcher.MEM_RESERVED_MIN);
253+
host.name, report.getHost().getFreeMem(), Dispatcher.MEM_RESERVED_MIN);
235254
}
236255
else if(!host.hardwareState.equals(HardwareState.UP)) {
237256
msg = host + " is not in the Up state.";
@@ -309,13 +328,61 @@ else if (!dispatchSupport.isCueBookable(host)) {
309328
* updated with a boot report. If the state is Repair, then state is
310329
* never updated via RQD.
311330
*
331+
*
332+
* Prevent cue frames from booking on hosts with full temporary directories.
333+
*
334+
* Change host state to REPAIR or UP according to the amount of free space
335+
* in the temporary directory:
336+
* - Set the host state to REPAIR, when the amount of free space in the
337+
* temporary directory is less than the minimum required. Add a comment with
338+
* subject: SUBJECT_COMMENT_FULL_TEMP_DIR
339+
* - Set the host state to UP, when the amount of free space in the temporary directory
340+
* is greater than or equal to the minimum required and the host has a comment with
341+
* subject: SUBJECT_COMMENT_FULL_TEMP_DIR
342+
*
312343
* @param host
313344
* @param reportState
314345
* @param isBoot
346+
* @param freeTempDir
315347
*/
316-
private void changeHardwareState(DispatchHost host,
317-
HardwareState reportState, boolean isBoot) {
348+
private void changeHardwareState(DispatchHost host, HardwareState reportState, boolean isBoot, long freeTempDir) {
349+
350+
// The minimum amount of free space in the temporary directory to book a host
351+
Long minBookableFreeTempDir = env.getRequiredProperty("dispatcher.min_bookable_free_temp_dir_kb", Long.class);
352+
353+
// Prevent cue frames from booking on hosts with full temporary directories
354+
if (minBookableFreeTempDir != -1) {
355+
if (host.hardwareState == HardwareState.UP && freeTempDir < minBookableFreeTempDir) {
356+
357+
// Insert a comment indicating that the Host status = Repair with reason = Full temporary directory
358+
CommentDetail c = new CommentDetail();
359+
c.subject = SUBJECT_COMMENT_FULL_TEMP_DIR;
360+
c.user = CUEBOT_COMMENT_USER;
361+
c.timestamp = null;
362+
c.message = "Host " + host.getName() + " marked as REPAIR. The current amount of free space in the " +
363+
"temporary directory (mcp) is " + (freeTempDir/1024) + "MB. It must have at least "
364+
+ (minBookableFreeTempDir/1024) + "MB of free space in temporary directory";
365+
commentManager.addComment(host, c);
318366

367+
// Set the host state to REPAIR
368+
hostManager.setHostState(host, HardwareState.REPAIR);
369+
host.hardwareState = HardwareState.REPAIR;
370+
371+
return;
372+
} else if (host.hardwareState == HardwareState.REPAIR && freeTempDir >= minBookableFreeTempDir) {
373+
// Check if the host with REPAIR status has comments with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and
374+
// user=CUEBOT_COMMENT_USER and delete the comments, if they exist
375+
boolean commentsDeleted = commentManager.deleteCommentByHostUserAndSubject(host,
376+
CUEBOT_COMMENT_USER, SUBJECT_COMMENT_FULL_TEMP_DIR);
377+
378+
if (commentsDeleted) {
379+
// Set the host state to UP
380+
hostManager.setHostState(host, HardwareState.UP);
381+
host.hardwareState = HardwareState.UP;
382+
return;
383+
}
384+
}
385+
}
319386

320387
// If the states are the same there is no reason to do this update.
321388
if (host.hardwareState.equals(reportState)) {
@@ -374,7 +441,7 @@ private void changeNimbyState(DispatchHost host, RenderHost rh) {
374441
* locked if all cores are locked.
375442
*
376443
* @param host DispatchHost
377-
* @param renderHost RenderHost
444+
* @param coreInfo CoreDetail
378445
*/
379446
private void changeLockState(DispatchHost host, CoreDetail coreInfo) {
380447
if (host.lockState == LockState.LOCKED) {

cuebot/src/main/java/com/imageworks/spcue/service/CommentManager.java

+22
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
import com.imageworks.spcue.HostInterface;
2424
import com.imageworks.spcue.JobInterface;
2525

26+
import java.util.List;
27+
2628
public interface CommentManager {
2729

2830
/**
@@ -47,6 +49,26 @@ public interface CommentManager {
4749
*/
4850
public void deleteComment(String id);
4951

52+
/**
53+
* Deletes comments using host, user, and subject
54+
*
55+
* @param host
56+
* @param user
57+
* @param subject
58+
* @return boolean: returns true if one or more comments were deleted
59+
*/
60+
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject);
61+
62+
/**
63+
* Get comments using host, user, and subject
64+
*
65+
* @param host
66+
* @param user
67+
* @param subject
68+
* @return List<CommentDetail>
69+
*/
70+
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject);
71+
5072
/**
5173
*
5274
* @param id

cuebot/src/main/java/com/imageworks/spcue/service/CommentManagerService.java

+12
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
import com.imageworks.spcue.ShowEntity;
2929
import com.imageworks.spcue.dao.CommentDao;
3030

31+
import java.util.List;
32+
3133
@Transactional
3234
public class CommentManagerService implements CommentManager {
3335

@@ -55,6 +57,16 @@ public void deleteComment(String id) {
5557
commentDao.deleteComment(id);
5658
}
5759

60+
@Transactional(propagation = Propagation.REQUIRED)
61+
public boolean deleteCommentByHostUserAndSubject(HostInterface host, String user, String subject) {
62+
return commentDao.deleteCommentByHostUserAndSubject(host, user, subject);
63+
}
64+
65+
@Transactional(propagation = Propagation.REQUIRED)
66+
public List<CommentDetail> getCommentsByHostUserAndSubject(HostInterface host, String user, String subject) {
67+
return commentDao.getCommentsByHostUserAndSubject(host, user, subject);
68+
}
69+
5870
@Transactional(propagation = Propagation.REQUIRED)
5971
public void setCommentSubject(String id, String subject) {
6072
commentDao.updateCommentSubject(id, subject);

cuebot/src/main/java/com/imageworks/spcue/service/HostManager.java

+8
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,14 @@ public interface HostManager {
6262
*/
6363
void setHostState(HostInterface host, HardwareState state);
6464

65+
/**
66+
* Updates the free temporary directory (mcp) of a host.
67+
*
68+
* @param host HostInterface
69+
* @param freeTempDir Long
70+
*/
71+
void setHostFreeTempDir(HostInterface host, Long freeTempDir);
72+
6573
/**
6674
* Return true if the host is swapping hard enough
6775
* that killing frames will save the entire machine.

cuebot/src/main/java/com/imageworks/spcue/service/HostManagerService.java

+5
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ public void setHostState(HostInterface host, HardwareState state) {
9393
hostDao.updateHostState(host, state);
9494
}
9595

96+
@Override
97+
public void setHostFreeTempDir(HostInterface host, Long freeTempDir) {
98+
hostDao.updateHostFreeTempDir(host, freeTempDir);
99+
}
100+
96101
@Override
97102
@Transactional(propagation = Propagation.REQUIRED, readOnly=true)
98103
public boolean isSwapping(HostInterface host) {

cuebot/src/main/resources/opencue.properties

+6
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,12 @@ dispatcher.report_queue.max_pool_size=8
110110
# Queue capacity for handling Host Report.
111111
dispatcher.report_queue.queue_capacity=1000
112112

113+
# The minimum amount of free space in the temporary directory (mcp) to book a host.
114+
# E.g: 1G = 1048576 kB => dispatcher.min_bookable_free_temp_dir_kb=1048576
115+
# Default = -1 (deactivated)
116+
# If equal to -1, the feature is turned off
117+
dispatcher.min_bookable_free_temp_dir_kb=-1
118+
113119
# Number of threads to keep in the pool for kill frame operation.
114120
dispatcher.kill_queue.core_pool_size=6
115121
# Maximum number of threads to allow in the pool for kill frame operation.

cuebot/src/test/java/com/imageworks/spcue/test/dao/criteria/ProcSearchTests.java

+4-2
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import com.imageworks.spcue.service.HostManager;
5151
import com.imageworks.spcue.service.JobLauncher;
5252
import com.imageworks.spcue.service.JobManager;
53+
import com.imageworks.spcue.util.CueUtil;
5354

5455
import static org.assertj.core.api.Assertions.assertThat;
5556
import static org.junit.Assert.assertEquals;
@@ -209,11 +210,12 @@ private void launchJobs() {
209210
private RenderHost.Builder buildRenderHost() {
210211
return RenderHost.newBuilder()
211212
.setBootTime(1192369572)
212-
.setFreeMcp(76020)
213+
// The minimum amount of free space in the temporary directory to book a host.
214+
.setFreeMcp(CueUtil.GB)
213215
.setFreeMem(53500)
214216
.setFreeSwap(20760)
215217
.setLoad(1)
216-
.setTotalMcp(195430)
218+
.setTotalMcp(CueUtil.GB4)
217219
.setTotalMem(8173264)
218220
.setTotalSwap(20960)
219221
.setNimbyEnabled(false)

cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/BookingDaoTests.java

+3-2
Original file line numberDiff line numberDiff line change
@@ -96,11 +96,12 @@ public DispatchHost createHost() {
9696
RenderHost host = RenderHost.newBuilder()
9797
.setName("test_host")
9898
.setBootTime(1192369572)
99-
.setFreeMcp(76020)
99+
// The minimum amount of free space in the temporary directory to book a host.
100+
.setFreeMcp(CueUtil.GB)
100101
.setFreeMem(53500)
101102
.setFreeSwap(20760)
102103
.setLoad(1)
103-
.setTotalMcp(195430)
104+
.setTotalMcp(CueUtil.GB4)
104105
.setTotalMem((int) CueUtil.GB16)
105106
.setTotalSwap((int) CueUtil.GB16)
106107
.setNimbyEnabled(false)

cuebot/src/test/java/com/imageworks/spcue/test/dao/postgres/CommentDaoTests.java

+3-2
Original file line numberDiff line numberDiff line change
@@ -140,11 +140,12 @@ public void testInsertCommentOnHost() {
140140
RenderHost host = RenderHost.newBuilder()
141141
.setName("boo")
142142
.setBootTime(1192369572)
143-
.setFreeMcp(76020)
143+
// The minimum amount of free space in the temporary directory to book a host.
144+
.setFreeMcp(CueUtil.GB)
144145
.setFreeMem(15290520)
145146
.setFreeSwap(2076)
146147
.setLoad(1)
147-
.setTotalMcp(19543)
148+
.setTotalMcp(CueUtil.GB4)
148149
.setTotalMem(15290520)
149150
.setTotalSwap(2096)
150151
.setNimbyEnabled(false)

0 commit comments

Comments
 (0)