29
29
30
30
import org .apache .logging .log4j .Logger ;
31
31
import org .apache .logging .log4j .LogManager ;
32
+ import org .springframework .beans .factory .annotation .Autowired ;
33
+ import org .springframework .core .env .Environment ;
32
34
import org .springframework .core .task .TaskRejectedException ;
33
35
import org .springframework .dao .DataAccessException ;
34
36
import org .springframework .dao .EmptyResultDataAccessException ;
35
37
38
+ import com .imageworks .spcue .CommentDetail ;
36
39
import com .imageworks .spcue .DispatchHost ;
37
40
import com .imageworks .spcue .FrameInterface ;
38
41
import com .imageworks .spcue .JobEntity ;
57
60
import com .imageworks .spcue .rqd .RqdClient ;
58
61
import com .imageworks .spcue .rqd .RqdClientException ;
59
62
import com .imageworks .spcue .service .BookingManager ;
63
+ import com .imageworks .spcue .service .CommentManager ;
60
64
import com .imageworks .spcue .service .HostManager ;
61
65
import com .imageworks .spcue .service .JobManager ;
62
66
import com .imageworks .spcue .service .JobManagerSupport ;
@@ -80,6 +84,14 @@ public class HostReportHandler {
80
84
private JobManagerSupport jobManagerSupport ;
81
85
private JobDao jobDao ;
82
86
private LayerDao layerDao ;
87
+ @ Autowired
88
+ private Environment env ;
89
+ @ Autowired
90
+ private CommentManager commentManager ;
91
+ // Comment constants
92
+ private static final String SUBJECT_COMMENT_FULL_TEMP_DIR = "Host set to REPAIR for not having enough storage " +
93
+ "space on the temporary directory (mcp)" ;
94
+ private static final String CUEBOT_COMMENT_USER = "cuebot" ;
83
95
84
96
/**
85
97
* Boolean to toggle if this class is accepting data or not.
@@ -156,7 +168,7 @@ public void handleHostReport(HostReport report, boolean isBoot) {
156
168
rhost .getLoad (), new Timestamp (rhost .getBootTime () * 1000l ),
157
169
rhost .getAttributesMap ().get ("SP_OS" ));
158
170
159
- changeHardwareState (host , report .getHost ().getState (), isBoot );
171
+ changeHardwareState (host , report .getHost ().getState (), isBoot , report . getHost (). getFreeMcp () );
160
172
changeNimbyState (host , report .getHost ());
161
173
162
174
/**
@@ -221,7 +233,14 @@ public void handleHostReport(HostReport report, boolean isBoot) {
221
233
}
222
234
}
223
235
224
- if (host .idleCores < Dispatcher .CORE_POINTS_RESERVED_MIN ) {
236
+ // The minimum amount of free space in the temporary directory to book a host
237
+ Long minBookableFreeTempDir = env .getRequiredProperty ("dispatcher.min_bookable_free_temp_dir_kb" , Long .class );
238
+
239
+ if (minBookableFreeTempDir != -1 && report .getHost ().getFreeMcp () < minBookableFreeTempDir ) {
240
+ msg = String .format ("%s doens't have enough free space in the temporary directory (mcp), %dMB needs %dMB" ,
241
+ host .name , (report .getHost ().getFreeMcp ()/1024 ), (minBookableFreeTempDir /1024 ));
242
+ }
243
+ else if (host .idleCores < Dispatcher .CORE_POINTS_RESERVED_MIN ) {
225
244
msg = String .format ("%s doesn't have enough idle cores, %d needs %d" ,
226
245
host .name , host .idleCores , Dispatcher .CORE_POINTS_RESERVED_MIN );
227
246
}
@@ -231,7 +250,7 @@ else if (host.idleMemory < Dispatcher.MEM_RESERVED_MIN) {
231
250
}
232
251
else if (report .getHost ().getFreeMem () < CueUtil .MB512 ) {
233
252
msg = String .format ("%s doens't have enough free system mem, %d needs %d" ,
234
- host .name , report .getHost ().getFreeMem (), Dispatcher .MEM_RESERVED_MIN );
253
+ host .name , report .getHost ().getFreeMem (), Dispatcher .MEM_RESERVED_MIN );
235
254
}
236
255
else if (!host .hardwareState .equals (HardwareState .UP )) {
237
256
msg = host + " is not in the Up state." ;
@@ -309,13 +328,61 @@ else if (!dispatchSupport.isCueBookable(host)) {
309
328
* updated with a boot report. If the state is Repair, then state is
310
329
* never updated via RQD.
311
330
*
331
+ *
332
+ * Prevent cue frames from booking on hosts with full temporary directories.
333
+ *
334
+ * Change host state to REPAIR or UP according to the amount of free space
335
+ * in the temporary directory:
336
+ * - Set the host state to REPAIR, when the amount of free space in the
337
+ * temporary directory is less than the minimum required. Add a comment with
338
+ * subject: SUBJECT_COMMENT_FULL_TEMP_DIR
339
+ * - Set the host state to UP, when the amount of free space in the temporary directory
340
+ * is greater than or equal to the minimum required and the host has a comment with
341
+ * subject: SUBJECT_COMMENT_FULL_TEMP_DIR
342
+ *
312
343
* @param host
313
344
* @param reportState
314
345
* @param isBoot
346
+ * @param freeTempDir
315
347
*/
316
- private void changeHardwareState (DispatchHost host ,
317
- HardwareState reportState , boolean isBoot ) {
348
+ private void changeHardwareState (DispatchHost host , HardwareState reportState , boolean isBoot , long freeTempDir ) {
349
+
350
+ // The minimum amount of free space in the temporary directory to book a host
351
+ Long minBookableFreeTempDir = env .getRequiredProperty ("dispatcher.min_bookable_free_temp_dir_kb" , Long .class );
352
+
353
+ // Prevent cue frames from booking on hosts with full temporary directories
354
+ if (minBookableFreeTempDir != -1 ) {
355
+ if (host .hardwareState == HardwareState .UP && freeTempDir < minBookableFreeTempDir ) {
356
+
357
+ // Insert a comment indicating that the Host status = Repair with reason = Full temporary directory
358
+ CommentDetail c = new CommentDetail ();
359
+ c .subject = SUBJECT_COMMENT_FULL_TEMP_DIR ;
360
+ c .user = CUEBOT_COMMENT_USER ;
361
+ c .timestamp = null ;
362
+ c .message = "Host " + host .getName () + " marked as REPAIR. The current amount of free space in the " +
363
+ "temporary directory (mcp) is " + (freeTempDir /1024 ) + "MB. It must have at least "
364
+ + (minBookableFreeTempDir /1024 ) + "MB of free space in temporary directory" ;
365
+ commentManager .addComment (host , c );
318
366
367
+ // Set the host state to REPAIR
368
+ hostManager .setHostState (host , HardwareState .REPAIR );
369
+ host .hardwareState = HardwareState .REPAIR ;
370
+
371
+ return ;
372
+ } else if (host .hardwareState == HardwareState .REPAIR && freeTempDir >= minBookableFreeTempDir ) {
373
+ // Check if the host with REPAIR status has comments with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and
374
+ // user=CUEBOT_COMMENT_USER and delete the comments, if they exist
375
+ boolean commentsDeleted = commentManager .deleteCommentByHostUserAndSubject (host ,
376
+ CUEBOT_COMMENT_USER , SUBJECT_COMMENT_FULL_TEMP_DIR );
377
+
378
+ if (commentsDeleted ) {
379
+ // Set the host state to UP
380
+ hostManager .setHostState (host , HardwareState .UP );
381
+ host .hardwareState = HardwareState .UP ;
382
+ return ;
383
+ }
384
+ }
385
+ }
319
386
320
387
// If the states are the same there is no reason to do this update.
321
388
if (host .hardwareState .equals (reportState )) {
@@ -374,7 +441,7 @@ private void changeNimbyState(DispatchHost host, RenderHost rh) {
374
441
* locked if all cores are locked.
375
442
*
376
443
* @param host DispatchHost
377
- * @param renderHost RenderHost
444
+ * @param coreInfo CoreDetail
378
445
*/
379
446
private void changeLockState (DispatchHost host , CoreDetail coreInfo ) {
380
447
if (host .lockState == LockState .LOCKED ) {
0 commit comments