Skip to content

Commit aec8793

Browse files
committed
BF: CS-1922 shepherd_cmd does not work with qrsh jobs
1 parent 195b359 commit aec8793

7 files changed

Lines changed: 62 additions & 48 deletions

File tree

doc/markdown/man/man5/sge_conf.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,19 @@ Changing *shepherd_cmd* will take immediate effect. The default for *shepherd_cm
613613

614614
The global configuration entry for this value may be overwritten by the execution host local configuration.
615615

616+
In a shepherd wrapper, make sure to call the actual shepherd binary with exactly the arguments that were passed to the wrapper, for example:
617+
618+
```bash
619+
#!/bin/sh
620+
621+
ARCH=`$SGE_ROOT/util/arch`
622+
SHEPHERD="$SGE_ROOT/bin/$ARCH/sge_shepherd"
623+
624+
# Do the shepherd wrapper specific actions here.
625+
626+
exec "$SHEPHERD" "$@"
627+
```
628+
616629
## gid_range
617630

618631
The *gid_range* is a comma separated list of range expressions of the form n-m (n as well as m are integer numbers

source/daemons/common/err_trace.cc

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
*
2828
* All Rights Reserved.
2929
*
30-
* Portions of this software are Copyright (c) 2023-2025 HPC-Gridware GmbH
30+
* Portions of this software are Copyright (c) 2026 HPC-Gridware GmbH
3131
*
3232
************************************************************************/
3333
/*___INFO__MARK_END__*/
@@ -92,7 +92,7 @@ static pthread_mutex_t g_trace_mutex;
9292
extern pid_t coshepherd_pid;
9393
extern int shepherd_state; /* holds exit status for shepherd_error() */
9494
extern bool g_new_interactive_job_support;
95-
int foreground = 1; /* usability of stderr/out */
95+
bool foreground = true; // make shepherd_trace() write to both trace file and stdout for debugging
9696

9797
/* Forward declaration of static functions */
9898

@@ -329,8 +329,7 @@ int shepherd_trace(const char *format, ...)
329329
sge_dstring_vsprintf(&message, format, ap);
330330
va_end(ap);
331331

332-
ret = sh_str2file(header_str, sge_dstring_get_string(&message),
333-
shepherd_trace_fp);
332+
ret = sh_str2file(header_str, sge_dstring_get_string(&message), shepherd_trace_fp);
334333

335334
if (foreground) {
336335
printf("%s%s\n", header_str, sge_dstring_get_string(&message));
@@ -549,6 +548,10 @@ int is_shepherd_trace_fd(int fd)
549548
}
550549
}
551550

551+
int get_shepherd_trace_fp() {
552+
return fileno(shepherd_trace_fp);
553+
}
554+
552555
/****** count_exit_status *****************************************************
553556
* NAME
554557
* count_exit_status() -- Return the number of lines in the exit status file
@@ -816,7 +819,7 @@ static FILE* shepherd_trace_init_intern(st_shepherd_file_t shepherd_file)
816819
return nullptr;
817820
}
818821

819-
/* To avoid to block stdin, stdout or stderr, dup the fd until it is >= 3 */
822+
/* To avoid blocking stdin, stdout or stderr, dup the fd until it is >= 3 */
820823
if (fd<3) {
821824
dup_fd(&fd);
822825
}

source/daemons/common/err_trace.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@
2828
*
2929
* All Rights Reserved.
3030
*
31-
* Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH
31+
* Portions of this software are Copyright (c) 2026 HPC-Gridware GmbH
3232
*
3333
************************************************************************/
3434
/*___INFO__MARK_END__*/
3535

3636
typedef int (*tShepherd_trace)(const char *format, ...);
3737

38-
extern int foreground; /* != 0 if we can write to stderr/out */
38+
extern bool foreground; // make shepherd_trace() write to both trace file and stdout for debugging
3939

4040
void shepherd_trace_init();
4141
void shepherd_trace_exit();
@@ -50,5 +50,6 @@ void shepherd_error(int do_exit, const char *format, ...);
5050
void shepherd_error_ptr(const char *text);
5151
void shepherd_write_exit_status( const char *exit_status );
5252

53-
int is_shepherd_trace_fd( int fd );
54-
int count_exit_status();
53+
int get_shepherd_trace_fp();
54+
int is_shepherd_trace_fd(int fd);
55+
int count_exit_status();

source/daemons/execd/exec_job.cc

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
*
2828
* All Rights Reserved.
2929
*
30-
* Portions of this software are Copyright (c) 2023-2026 HPC-Gridware GmbH
30+
* Portions of this software are Copyright (c) 2026 HPC-Gridware GmbH
3131
*
3232
************************************************************************/
3333
/*___INFO__MARK_END__*/
@@ -1996,7 +1996,11 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s
19961996
DPRINTF("CHILD - About to exec shepherd wrapper job ->%s< under queue -<%s<\n",
19971997
lGetString(jep, JB_job_name),
19981998
lGetString(master_q, QU_full_name));
1999-
execlp(shepherd_cmd, ps_name, nullptr);
1999+
if (ISTRACE) {
2000+
execlp(shepherd_cmd, ps_name, nullptr);
2001+
} else {
2002+
execlp(shepherd_cmd, ps_name, "-bg", nullptr);
2003+
}
20002004
} else if (mconf_get_do_credentials() && ocs::Bootstrap::has_security_mode(ocs::Bootstrap::BS_SEC_MODE_DCE)) {
20012005
DPRINTF("CHILD - About to exec DCE shepherd wrapper job ->%s< under queue -<%s<\n",
20022006
lGetString(jep, JB_job_name),
@@ -2008,10 +2012,11 @@ int sge_exec_job(lListElem *jep, lListElem *jatep, lListElem *petep, char *err_s
20082012
lGetString(jep, JB_job_name),
20092013
lGetString(master_q, QU_full_name));
20102014

2011-
if (ISTRACE)
2015+
if (ISTRACE) {
20122016
execlp(shepherd_path, ps_name, nullptr);
2013-
else
2017+
} else {
20142018
execlp(shepherd_path, ps_name, "-bg", nullptr);
2019+
}
20152020
} else {
20162021
char commandline[2048];
20172022

source/daemons/shepherd/builtin_starter.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
*
3030
* Portions of this software are Copyright (c) 2011-2012 Univa Corporation
3131
*
32-
* Portions of this software are Copyright (c) 2023-2025 HPC-Gridware GmbH
32+
* Portions of this software are Copyright (c) 2026 HPC-Gridware GmbH
3333
*
3434
************************************************************************/
3535
/*___INFO__MARK_END__*/
@@ -175,7 +175,7 @@ void son(const char *childname, char *script_file, int truncate_stderr_out, bool
175175
int pty;
176176
bool is_the_job = strcmp(childname, "job") == 0;
177177

178-
foreground = 0; /* VX sends SIGTTOU if trace messages go to foreground */
178+
foreground = false; // VX sends SIGTTOU if trace messages go to foreground
179179

180180
/* From here only the son --------------------------------------*/
181181
if (script_file == nullptr) {
@@ -427,7 +427,7 @@ void son(const char *childname, char *script_file, int truncate_stderr_out, bool
427427
}
428428
}
429429
}
430-
foreground = 0;
430+
foreground = false;
431431

432432
/* We have different possibilities to start the job script:
433433
* - We can start it as login shell or not

source/daemons/shepherd/ocs_shepherd_pty.cc

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -458,40 +458,35 @@ pid_t fork_no_pty(int *fd_pipe_in, int *fd_pipe_out,
458458

459459
ret = pipe(fd_pipe_in);
460460
if (ret == -1) {
461-
sge_dstring_sprintf(err_msg, "can't create pipe for stdin: %d: %s",
462-
errno, strerror(errno));
461+
sge_dstring_sprintf(err_msg, "can't create pipe for stdin: %d: %s", errno, strerror(errno));
463462
return -1;
464463
}
465464

466465
ret = pipe(fd_pipe_out);
467466
if (ret == -1) {
468-
sge_dstring_sprintf(err_msg, "can't create pipe for stdout: %d: %s",
469-
errno, strerror(errno));
467+
sge_dstring_sprintf(err_msg, "can't create pipe for stdout: %d: %s", errno, strerror(errno));
470468
return -1;
471469
}
472470

473471
ret = pipe(fd_pipe_err);
474472
if (ret == -1) {
475-
sge_dstring_sprintf(err_msg, "can't create pipe for stderr: %d: %s",
476-
errno, strerror(errno));
473+
sge_dstring_sprintf(err_msg, "can't create pipe for stderr: %d: %s", errno, strerror(errno));
477474
return -1;
478475
}
479476

480477
if ((pid = fork()) < 0) {
481478
return -1;
482479
} else if (pid == 0) { /* child */
483480
if (setsid() < 0) {
484-
sge_dstring_sprintf(err_msg, "setsid() error: %d, %s",
485-
errno, strerror(errno));
481+
sge_dstring_sprintf(err_msg, "setsid() error: %d, %s", errno, strerror(errno));
486482
return -1;
487483
}
488484

489485
/* attach pipes to stdin/stdout/stderr of child */
490486
close(fd_pipe_in[1]);
491487
fd_pipe_in[1] = -1;
492488
if ((dup2(fd_pipe_in[0], STDIN_FILENO)) != STDIN_FILENO) {
493-
sge_dstring_sprintf(err_msg, "dup2 to stdin error: %d, %s",
494-
errno, strerror(errno));
489+
sge_dstring_sprintf(err_msg, "dup2 to stdin error: %d, %s", errno, strerror(errno));
495490
return -1;
496491
}
497492
close(fd_pipe_in[0]);
@@ -500,8 +495,7 @@ pid_t fork_no_pty(int *fd_pipe_in, int *fd_pipe_out,
500495
close(fd_pipe_out[0]);
501496
fd_pipe_out[0] = -1;
502497
if ((dup2(fd_pipe_out[1], STDOUT_FILENO)) != STDOUT_FILENO) {
503-
sge_dstring_sprintf(err_msg, "dup2 to stdout error: %d, %s",
504-
errno, strerror(errno));
498+
sge_dstring_sprintf(err_msg, "dup2 to stdout error: %d, %s", errno, strerror(errno));
505499
return -1;
506500
}
507501
close(fd_pipe_out[1]);
@@ -510,8 +504,7 @@ pid_t fork_no_pty(int *fd_pipe_in, int *fd_pipe_out,
510504
close(fd_pipe_err[0]);
511505
fd_pipe_out[0] = -1;
512506
if ((dup2(fd_pipe_err[1], STDERR_FILENO)) != STDERR_FILENO) {
513-
sge_dstring_sprintf(err_msg, "dup2 to stderr error: %d, %s",
514-
errno, strerror(errno));
507+
sge_dstring_sprintf(err_msg, "dup2 to stderr error: %d, %s", errno, strerror(errno));
515508
return -1;
516509
}
517510
close(fd_pipe_err[1]);
@@ -526,5 +519,3 @@ pid_t fork_no_pty(int *fd_pipe_in, int *fd_pipe_out,
526519
}
527520
DRETURN(pid);
528521
}
529-
530-

source/daemons/shepherd/shepherd.cc

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
*
3030
* Portions of this software are Copyright (c) 2011 Univa Corporation
3131
*
32-
* Portions of this software are Copyright (c) 2023-2026 HPC-Gridware GmbH
32+
* Portions of this software are Copyright (c) 2026 HPC-Gridware GmbH
3333
*
3434
************************************************************************/
3535
/*___INFO__MARK_END__*/
@@ -301,19 +301,19 @@ static int wait_until_parent_has_registered_to_server(int fd_pipe_to_child[])
301301
SIGIGNORE(SIGWINCH);
302302

303303
/* close parents end of our copy of the pipe */
304-
shepherd_trace("child: closing parents end of the pipe_to_child");
304+
shepherd_trace("child: closing parent side ends of pipes: %d", fd_pipe_to_child[1]);
305305
close(fd_pipe_to_child[1]);
306306
fd_pipe_to_child[1] = -1;
307307

308308
/* wait until parent has registered at the server */
309-
shepherd_trace("child: trying to read from parent through the pipe_to_child");
309+
shepherd_trace("child: trying to read from parent through the pipe_to_child[0] = %d", fd_pipe_to_child[0]);
310310
ret = read(fd_pipe_to_child[0], tmpbuf, 11);
311311
if (ret <= 0) {
312-
shepherd_trace("child: error communicating with parent: %d, %s",
313-
errno, strerror(errno));
312+
shepherd_trace("child: error communicating with parent: %d, %s", errno, strerror(errno));
314313
ret = -1;
315314
} else {
316315
/* close other side of our copy of the pipe */
316+
shepherd_trace("child: closing child side ends of pipes: %d", fd_pipe_to_child[0]);
317317
close(fd_pipe_to_child[0]);
318318
fd_pipe_to_child[0] = -1;
319319
shepherd_trace("child: parent sent us '%s'", tmpbuf);
@@ -699,8 +699,8 @@ int main(int argc, char **argv)
699699
shepherd_error(1, "can't read cwd - getcwd failed: %s", strerror(errno));
700700
}
701701

702-
if (argc >= 2 && !strcmp("-bg", argv[1])) {
703-
foreground = 0; /* no output to stderr */
702+
if (argc >= 2 && strcmp("-bg", argv[1]) == 0) {
703+
foreground = false; // no shepherd_trace() output to stdout
704704
}
705705

706706
set_shepherd_signal_mask();
@@ -1128,8 +1128,14 @@ static int start_child(
11281128
}
11291129
}
11301130

1131-
if (pid==0) { /* child */
1131+
if (pid == 0) { // child
11321132
if (g_new_interactive_job_support && is_interactive) {
1133+
// When we are running a builtin interactive job, in foreground mode shepherd_trace() would
1134+
// print the messages to stdout which is redirected to a pipe to the shepherd parent.
1135+
// The output would then be forwarded to the qrsh client.
1136+
// Disable foreground mode.
1137+
foreground = false;
1138+
11331139
// Why do we wait until the connection to qrsh is up?
11341140
// To avoid starting the job when the qrsh client has terminated in the meantime?
11351141
ret = wait_until_parent_has_registered_to_server(fd_pipe_to_child);
@@ -1383,9 +1389,7 @@ static int start_child(
13831389
}
13841390

13851391
/******* write usage to file "usage" ************/
1386-
shepherd_write_usage_file(wait_status, exit_status,
1387-
child_signal, start_time,
1388-
end_time, &rusage);
1392+
shepherd_write_usage_file(wait_status, exit_status, child_signal, start_time, end_time, &rusage);
13891393

13901394
/* this is SEMPA stuff */
13911395
notify_tasker(exit_status);
@@ -1528,9 +1532,6 @@ dstring *dstr_error /* OUT: error message - if any */
15281532
int remote_port = 0;
15291533
int exit_status = -1;
15301534

1531-
/* close child's end of the pipe */
1532-
shepherd_trace("parent: closing child's end of the pipe");
1533-
15341535
/* read destination host and port from config */
15351536
ret = get_remote_host_and_port_from_config(&remote_host, &remote_port, dstr_error);
15361537
if (ret != 0 || remote_host == nullptr || remote_port == 0) {
@@ -2764,6 +2765,8 @@ static int start_async_command(const char *descr, char *cmd)
27642765
if ((pid = fork()) == -1) {
27652766
shepherd_trace("can't fork for starting %s command", descr);
27662767
} else if (pid == 0) {
2768+
foreground = false;
2769+
27672770
int use_qsub_gid;
27682771
gid_t gid;
27692772
char *tmp_str;
@@ -2808,8 +2811,6 @@ static int start_async_command(const char *descr, char *cmd)
28082811
exit(1);
28092812
}
28102813

2811-
foreground = 0;
2812-
28132814
cwd = get_conf_val("cwd");
28142815
if (sge_chdir(cwd)) {
28152816
shepherd_trace("%s: can't chdir to %s", descr, cwd);

0 commit comments

Comments
 (0)