Skip to content

Commit a314c6c

Browse files
Kill job reason (#1367)
Adding requester information to JobKillRequest. This feature requires more information from job kill actions requested through the API. Link the Issue(s) this Pull Request is related to: This feature is motivated by a situation where a script was misusing the API and calling kill on all the jobs for a show on a regular basis. Without this feature, finding where the requests were coming from was a big endeavor. Summarize your change: This change requires that a kill request also provide username, pid, host_kill and reason. --------- Signed-off-by: Diego Tavares <[email protected]> Co-authored-by: Roula O'Regan <[email protected]>
1 parent 7da773a commit a314c6c

File tree

16 files changed

+223
-97
lines changed

16 files changed

+223
-97
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ htmlcov/
1212
/.env
1313
.envrc
1414
.vscode
15-
.venv/
15+
.venv/
16+
.eggs/*

VERSION.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.25
1+
0.26

cuebot/src/main/java/com/imageworks/spcue/Source.java

+21-1
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,35 @@
2525
/**
 * Describes who or what triggered an operation such as a job or frame kill.
 *
 * <p>Besides the free-form {@link #source} text, a kill request carries the
 * requester's user name, process id, host name and a reason. Requester fields
 * given to the five-argument constructor are never stored as {@code null};
 * a missing value is stored as the empty string so callers can safely write
 * {@code source.getReason().isEmpty()}.
 */
public class Source {

    /** Free-form description of the originating request. */
    public String source = "unknown";
    /** Name of the user who issued the request; empty when not provided. */
    public String username = "";
    /** Process id of the requesting client, as text; empty when not provided. */
    public String pid = "";
    /** Name of the host the request was issued from; empty when not provided. */
    public String host_kill = "";
    /** Reason given for the request; empty when not provided. */
    public String reason = "";

    /** Creates a source with the default "unknown" description and empty requester fields. */
    public Source() {}

    /**
     * Creates a source with only a description; requester fields stay empty.
     *
     * @param source free-form description of the originating request
     */
    public Source(String source) {
        this.source = source;
    }

    /**
     * Creates a fully described source.
     *
     * @param source free-form description of the originating request
     * @param username name of the requesting user, {@code null} is treated as empty
     * @param pid process id of the requesting client, {@code null} is treated as empty
     * @param host_kill host the request came from, {@code null} is treated as empty
     * @param reason reason for the request, {@code null} is treated as empty
     */
    public Source(String source, String username, String pid, String host_kill, String reason) {
        this.source = source;
        this.username = orEmpty(username);
        this.pid = orEmpty(pid);
        this.host_kill = orEmpty(host_kill);
        this.reason = orEmpty(reason);
    }

    /**
     * Returns the reason given for the request.
     *
     * @return the reason, or the empty string when none was provided; never {@code null}
     */
    public String getReason() {
        // The field is public and may have been reassigned to null after construction.
        return orEmpty(this.reason);
    }

    /**
     * Formats the requester details on one line followed by the source
     * description on the next line.
     */
    @Override
    public String toString() {
        return "User: " + this.username +
               ", Pid: " + this.pid +
               ", Hostname: " + this.host_kill +
               ", Reason: " + this.reason +
               "\n" + this.source;
    }

    /** Maps {@code null} to the empty string and returns any other value unchanged. */
    private static String orEmpty(String value) {
        return value == null ? "" : value;
    }
}
3959

cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java

+5-2
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,9 @@ public void kill(JobKillRequest request, StreamObserver<JobKillResponse> respons
275275
try {
276276
setupJobData(request.getJob());
277277
manageQueue.execute(new DispatchJobComplete(job,
278-
new Source(request.toString()), true, jobManagerSupport));
278+
new Source(request.toString(), request.getUsername(), request.getPid(),
279+
request.getHostKill(), request.getReason()),
280+
true, jobManagerSupport));
279281
responseObserver.onNext(JobKillResponse.newBuilder().build());
280282
responseObserver.onCompleted();
281283
}
@@ -486,7 +488,8 @@ public void killFrames(JobKillFramesRequest request, StreamObserver<JobKillFrame
486488
manageQueue.execute(
487489
new DispatchKillFrames(
488490
frameSearchFactory.create(job, request.getReq()),
489-
new Source(request.toString()),
491+
new Source(request.toString(), request.getUsername(), request.getPid(),
492+
request.getHostKill(), request.getReason()),
490493
jobManagerSupport));
491494
responseObserver.onNext(JobKillFramesResponse.newBuilder().build());
492495
responseObserver.onCompleted();

cuebot/src/main/java/com/imageworks/spcue/servant/ManageLayer.java

+5-6
Original file line numberDiff line numberDiff line change
@@ -197,12 +197,11 @@ public void getFrames(LayerGetFramesRequest request, StreamObserver<LayerGetFram
197197
@Override
198198
public void killFrames(LayerKillFramesRequest request, StreamObserver<LayerKillFramesResponse> responseObserver) {
199199
updateLayer(request.getLayer());
200-
if (attemptChange(env, property, jobManager, layer, responseObserver)) {
201-
manageQueue.execute(new DispatchKillFrames(frameSearch,
202-
new Source(request.toString()), jobManagerSupport));
203-
responseObserver.onNext(LayerKillFramesResponse.newBuilder().build());
204-
responseObserver.onCompleted();
205-
}
200+
manageQueue.execute(new DispatchKillFrames(frameSearch,
201+
new Source(request.toString(), request.getUsername(), request.getPid(),
202+
request.getHostKill(), request.getReason()), jobManagerSupport));
203+
responseObserver.onNext(LayerKillFramesResponse.newBuilder().build());
204+
responseObserver.onCompleted();
206205
}
207206

208207
@Override

cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java

+59-53
Original file line numberDiff line numberDiff line change
@@ -75,76 +75,82 @@ public void queueShutdownJob(JobInterface job, Source source, boolean isManualKi
7575

7676
public boolean shutdownJob(JobInterface job, Source source, boolean isManualKill) {
7777

78-
if (jobManager.shutdownJob(job)) {
79-
80-
/*
81-
* Satisfy any dependencies on just the
82-
* job record, not layers or frames.
83-
*/
84-
satisfyWhatDependsOn(job);
85-
86-
if (departmentManager.isManaged(job)) {
87-
departmentManager.syncJobsWithTask(job);
88-
}
89-
90-
if (isManualKill) {
78+
if (isManualKill && source.getReason().isEmpty()) {
79+
logger.info(job.getName() + "/" + job.getId() +
80+
" **Invalid Job Kill Request** for " + source.toString());
81+
}
82+
else {
83+
if (jobManager.shutdownJob(job)) {
9184

92-
logger.info(job.getName() + "/" + job.getId() +
93-
" is being manually killed by " + source.toString());
85+
/*
86+
* Satisfy any dependencies on just the
87+
* job record, not layers or frames.
88+
*/
89+
satisfyWhatDependsOn(job);
9490

95-
/**
96-
* Sleep a bit here in case any frames were
97-
* dispatched during the job shutdown process.
98-
*/
99-
try {
100-
Thread.sleep(3000);
101-
} catch (InterruptedException e1) {
102-
logger.info(job.getName() + "/" + job.getId() +
103-
" shutdown thread was interrupted.");
104-
Thread.currentThread().interrupt();
91+
if (departmentManager.isManaged(job)) {
92+
departmentManager.syncJobsWithTask(job);
10593
}
10694

107-
FrameSearchInterface search = frameSearchFactory.create(job);
108-
FrameSearchCriteria newCriteria = search.getCriteria();
109-
FrameStateSeq states = newCriteria.getStates().toBuilder()
110-
.addFrameStates(FrameState.RUNNING)
111-
.build();
112-
search.setCriteria(newCriteria.toBuilder().setStates(states).build());
95+
if (isManualKill) {
11396

114-
for (FrameInterface frame: jobManager.findFrames(search)) {
97+
logger.info(job.getName() + "/" + job.getId() +
98+
" is being manually killed by " + source.toString());
11599

116-
VirtualProc proc = null;
100+
/**
101+
* Sleep a bit here in case any frames were
102+
* dispatched during the job shutdown process.
103+
*/
117104
try {
118-
proc = hostManager.findVirtualProc(frame);
119-
}
120-
catch (DataAccessException e) {
121-
logger.warn("Unable to find proc to kill frame " + frame +
122-
" on job shutdown operation, " + e);
105+
Thread.sleep(3000);
106+
} catch (InterruptedException e1) {
107+
logger.info(job.getName() + "/" + job.getId() +
108+
" shutdown thread was interrupted.");
109+
Thread.currentThread().interrupt();
123110
}
124111

125-
if (manualStopFrame(frame, FrameState.WAITING)) {
112+
FrameSearchInterface search = frameSearchFactory.create(job);
113+
FrameSearchCriteria newCriteria = search.getCriteria();
114+
FrameStateSeq states = newCriteria.getStates().toBuilder()
115+
.addFrameStates(FrameState.RUNNING)
116+
.build();
117+
search.setCriteria(newCriteria.toBuilder().setStates(states).build());
118+
119+
for (FrameInterface frame: jobManager.findFrames(search)) {
120+
121+
VirtualProc proc = null;
126122
try {
127-
if (proc != null) {
128-
kill(proc, source);
129-
}
130-
} catch (DataAccessException e) {
131-
logger.warn("Failed to kill frame " + frame +
123+
proc = hostManager.findVirtualProc(frame);
124+
}
125+
catch (DataAccessException e) {
126+
logger.warn("Unable to find proc to kill frame " + frame +
132127
" on job shutdown operation, " + e);
133128
}
134-
catch (Exception e) {
135-
logger.warn("error killing frame: " + frame);
129+
130+
if (manualStopFrame(frame, FrameState.WAITING)) {
131+
try {
132+
if (proc != null) {
133+
kill(proc, source);
134+
}
135+
} catch (DataAccessException e) {
136+
logger.warn("Failed to kill frame " + frame +
137+
" on job shutdown operation, " + e);
138+
}
139+
catch (Exception e) {
140+
logger.warn("error killing frame: " + frame);
141+
}
136142
}
137143
}
138144
}
139-
}
140145

141-
/*
142-
* Send mail after all frames have been stopped or else the email
143-
* will have inaccurate numbers.
144-
*/
145-
emailSupport.sendShutdownEmail(job);
146+
/*
147+
* Send mail after all frames have been stopped or else the email
148+
* will have inaccurate numbers.
149+
*/
150+
emailSupport.sendShutdownEmail(job);
146151

147-
return true;
152+
return true;
153+
}
148154
}
149155

150156
return false;

cuegui/cuegui/MenuActions.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from builtins import filter
2525
from builtins import str
2626
from builtins import object
27+
import getpass
2728
import glob
2829
import subprocess
2930
import time
@@ -65,7 +66,8 @@
6566
TITLE = 0
6667
TOOLTIP = 1
6768
ICON = 2
68-
69+
DEFAULT_JOB_KILL_REASON = "Manual Job Kill Request in Cuegui by " + getpass.getuser()
70+
DEFAULT_FRAME_KILL_REASON = "Manual Frame(s) Kill Request in Cuegui by " + getpass.getuser()
6971

7072
# pylint: disable=missing-function-docstring,no-self-use,unused-argument
7173

@@ -368,7 +370,7 @@ def kill(self, rpcObjects=None):
368370
if cuegui.Utils.questionBoxYesNo(self._caller, "Kill jobs?", msg,
369371
[job.data.name for job in jobs]):
370372
for job in jobs:
371-
job.kill()
373+
job.kill(reason=DEFAULT_JOB_KILL_REASON)
372374
self.killDependents(jobs)
373375
self._update()
374376

@@ -384,7 +386,7 @@ def killDependents(self, jobs):
384386
sorted([dep.name() for dep in dependents])):
385387
for depJob in dependents:
386388
try:
387-
depJob.kill()
389+
depJob.kill(reason=DEFAULT_JOB_KILL_REASON)
388390
except opencue.exception.CueException as e:
389391
errMsg = "Failed to kill depending job: %s - %s" % (depJob.name(), e)
390392
logger.warning(errMsg)
@@ -769,7 +771,7 @@ def kill(self, rpcObjects=None):
769771
"Kill ALL frames in selected layers?",
770772
[layer.data.name for layer in layers]):
771773
for layer in layers:
772-
layer.kill()
774+
layer.kill(reason=DEFAULT_FRAME_KILL_REASON)
773775
self._update()
774776

775777
eat_info = ["&Eat", None, "eat"]
@@ -1080,7 +1082,8 @@ def kill(self, rpcObjects=None):
10801082
if cuegui.Utils.questionBoxYesNo(self._caller, "Confirm",
10811083
"Kill selected frames?",
10821084
names):
1083-
self._getSource().killFrames(name=names)
1085+
self._getSource().killFrames(reason=DEFAULT_FRAME_KILL_REASON,
1086+
name=names)
10841087
self._update()
10851088

10861089
markAsWaiting_info = ["Mark as &waiting", None, "configure"]

cuegui/cuegui/plugins/StuckFramePlugin.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@
2323
from builtins import str
2424
from builtins import map
2525
import datetime
26-
import re
26+
import getpass
2727
import os
28-
from datetime import datetime
29-
import time
30-
import socket
28+
import re
3129
import signal
30+
import socket
31+
import time
3232
import yaml
3333

3434
from qtpy import QtGui
@@ -63,7 +63,7 @@
6363
LLU_COLUMN = 3
6464
RUNTIME_COLUMN = 4
6565
LASTLINE_COLUMN = 7
66-
66+
DEFAULT_FRAME_KILL_REASON = "Manual Frame Kill Request in Cuegui by " + getpass.getuser()
6767

6868
class StuckWidget(cuegui.AbstractDockWidget.AbstractDockWidget):
6969
"""This builds what is displayed on the dock widget"""
@@ -1345,7 +1345,7 @@ def logKill(self):
13451345
if cuegui.Utils.questionBoxYesNo(self, "Confirm", "Kill selected frames?", names):
13461346
self.log()
13471347
for frame in self.selectedObjects():
1348-
frame.kill()
1348+
frame.kill(reason=DEFAULT_FRAME_KILL_REASON)
13491349
self.remove()
13501350

13511351
def retryFrame(self):

cuegui/tests/MenuActions_tests.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -958,7 +958,9 @@ def test_kill(self, yesNoMock):
958958

959959
self.frame_actions.kill(rpcObjects=[frame])
960960

961-
self.job.killFrames.assert_called_with(name=[frame_name])
961+
self.job.killFrames.assert_called_with(
962+
name=[frame_name],
963+
reason="Manual Frame(s) Kill Request in Cuegui by root")
962964

963965
@mock.patch('cuegui.Utils.questionBoxYesNo', return_value=True)
964966
def test_markAsWaiting(self, yesNoMock):

proto/job.proto

+16
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,10 @@ message FrameGetWhatThisDependsOnResponse {
885885
// Kill
886886
message FrameKillRequest {
    // Frame to kill.
    Frame frame = 1;
    // Name of the user requesting the kill.
    string username = 2;
    // Process id of the requesting client, sent as text.
    string pid = 3;
    // Name of the host the kill request is issued from.
    string host_kill = 4;
    // Free-form explanation of why the frame is being killed.
    string reason = 5;
}
889893

890894
message FrameKillResponse {} // Empty
@@ -1283,6 +1287,10 @@ message JobIsJobPendingResponse {
12831287
// Kill
12841288
message JobKillRequest {
    // Job to kill.
    Job job = 1;
    // Name of the user requesting the kill.
    string username = 2;
    // Process id of the requesting client, sent as text.
    string pid = 3;
    // Name of the host the kill request is issued from.
    string host_kill = 4;
    // Free-form explanation of why the job is being killed. Cuebot logs a
    // manual job kill with an empty reason as invalid and does not shut the
    // job down.
    string reason = 5;
}
12871295

12881296
message JobKillResponse {} // Empty
@@ -1291,6 +1299,10 @@ message JobKillResponse {} // Empty
12911299
message JobKillFramesRequest {
    // Job whose frames are to be killed.
    Job job = 1;
    // Search criteria selecting which frames of the job to kill.
    FrameSearchCriteria req = 2;
    // Name of the user requesting the kill.
    string username = 3;
    // Process id of the requesting client, sent as text.
    string pid = 4;
    // Name of the host the kill request is issued from.
    string host_kill = 5;
    // Free-form explanation of why the frames are being killed.
    string reason = 6;
}
12951307

12961308
message JobKillFramesResponse {} // Empty
@@ -1613,6 +1625,10 @@ message LayerGetWhatThisDependsOnResponse {
16131625
// KillFrames
16141626
message LayerKillFramesRequest {
    // Layer whose frames are to be killed.
    Layer layer = 1;
    // Name of the user requesting the kill.
    string username = 2;
    // Process id of the requesting client, sent as text.
    string pid = 3;
    // Name of the host the kill request is issued from.
    string host_kill = 4;
    // Free-form explanation of why the frames are being killed.
    string reason = 5;
}
16171633

16181634
message LayerKillFramesResponse {} // Empty

pycue/opencue/wrappers/frame.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""Module for classes related to frames."""
1616

1717
import enum
18+
import getpass
1819
import time
1920
import os
2021

@@ -59,10 +60,18 @@ def eat(self):
5960
if self.data.state != job_pb2.FrameState.Value('EATEN'):
6061
self.stub.Eat(job_pb2.FrameEatRequest(frame=self.data), timeout=Cuebot.Timeout)
6162

62-
def kill(self):
63+
def kill(self, username=None, pid=None, host_kill=None, reason=None):
    """Kills the frame.

    A kill request is only sent when the frame is in the RUNNING state;
    otherwise this call does nothing.

    :type  username: str
    :param username: name of the user requesting the kill; defaults to the
                     current login name
    :type  pid: int or str
    :param pid: id of the requesting process; defaults to the id of the
                current process
    :type  host_kill: str
    :param host_kill: name of the host the request is sent from; defaults to
                      this machine's network name
    :type  reason: str
    :param reason: explanation of why the frame is being killed
    """
    if self.data.state != job_pb2.FrameState.Value('RUNNING'):
        return

    # os.uname() only exists on Unix; platform.node() reports the same network
    # name and is available on every platform, so Windows clients work too.
    import platform  # pylint: disable=import-outside-toplevel

    # Requester details are resolved only when a request is actually sent.
    request = job_pb2.FrameKillRequest(
        frame=self.data,
        username=username or getpass.getuser(),
        pid=str(pid or os.getpid()),
        host_kill=host_kill or platform.node(),
        reason=reason)
    self.stub.Kill(request, timeout=Cuebot.Timeout)
6675

6776
def retry(self):
6877
"""Retries the frame."""

0 commit comments

Comments
 (0)