2929 *
3030 * Portions of this software are Copyright (c) 2011 Univa Corporation.
3131 *
32- * Portions of this software are Copyright (c) 2023 -2025 HPC-Gridware GmbH
32+ * Portions of this software are Copyright (c) 2024 -2025 HPC-Gridware GmbH
3333 *
3434 ************************************************************************/
3535/* ___INFO__MARK_END__*/
6565#include " sge_ijs_comm.h"
6666
6767extern sig_atomic_t received_signal;
68-
6968/*
7069 * TODO: Cleanup / Headers
7170 * This is just slightly modified copy of the gdi commlib error handling,
@@ -309,6 +308,7 @@ int my_log_list_flush_list(cl_raw_list_t* list_p) {
309308*
310309* INPUTS
311310* dstring *err_msg - Gets the error reason in case of error.
311+ * cl_log_func_t commlib_log_func - a commlib logging function which will print CL_LOG messages (nullptr: commlib logging stays off)
312312*
313313* RESULT
314314* int - COMM_RETVAL_OK:
@@ -324,19 +324,20 @@ int my_log_list_flush_list(cl_raw_list_t* list_p) {
324324* SEE ALSO
325325* communication/comm_cleanup_lib()
326326*******************************************************************************/
327- int comm_init_lib (dstring *err_msg)
327+ int comm_init_lib (dstring *err_msg, cl_log_func_t commlib_log_func )
328328{
329329 int ret, ret_val = COMM_RETVAL_OK ;
330330
331331 DENTER (TOP_LAYER );
332332
333- /*
334- * To enable commlib logging to a file (see my_log_list_flush_list()
335- * for the file path), exchange this line with the one below.
336- * Caution: On some architectures, logging causes problems!
337- */
338- /* ret = cl_com_setup_commlib(CL_RW_THREAD, CL_LOG_DEBUG, my_log_list_flush_list);*/
339- ret = cl_com_setup_commlib (CL_RW_THREAD , CL_LOG_OFF , nullptr );
333+ // When we pass a logging function to see commlib logging
334+ // (in sge_shepherd, when compiled with EXTENSIVE_TRACING)
335+ // we want to see INFO logging.
336+ cl_log_type debug_level = CL_LOG_OFF ;
337+ if (commlib_log_func != nullptr ) {
338+ debug_level = CL_LOG_INFO ;
339+ }
340+ ret = cl_com_setup_commlib (CL_RW_THREAD , debug_level, commlib_log_func);
340341 if (ret != CL_RETVAL_OK ) {
341342 sge_dstring_sprintf (err_msg, cl_get_error_text (ret));
342343 DPRINTF (" cl_com_setup_commlib() failed: %s (%d)\n " , sge_dstring_get_string (err_msg), ret);
@@ -768,9 +769,9 @@ int comm_ignore_timeouts(bool b_ignore, dstring *err_msg)
768769
769770 cl_com_ignore_timeouts (b_ignore);
770771 if (ret != CL_RETVAL_OK ) {
771- sge_dstring_sprintf (err_msg, cl_get_error_text (ret));
772- DPRINTF (" cl_com_ignore_timeouts() failed: %s (%d)\n " , sge_dstring_get_string (err_msg), ret);
773- ret_val = COMM_CANT_SET_IGNORE_TIMEOUTS ;
772+ sge_dstring_sprintf (err_msg, cl_get_error_text (ret));
773+ DPRINTF (" cl_com_ignore_timeouts() failed: %s (%d)\n " , sge_dstring_get_string (err_msg), ret);
774+ ret_val = COMM_CANT_SET_IGNORE_TIMEOUTS ;
774775 }
775776 DRETURN (ret_val);
776777}
@@ -1223,40 +1224,44 @@ unsigned long comm_write_message(COMM_HANDLE *handle,
12231224*******************************************************************************/
12241225int comm_flush_write_messages (COMM_HANDLE *handle, dstring *err_msg)
12251226{
1226- unsigned long elems = 0 ;
1227- int ret = 0 , retries = 0 ;
1227+ int retries = 0 ;
12281228
1229- elems = cl_com_messages_in_send_queue (handle);
1229+ unsigned long elems = cl_com_messages_in_send_queue (handle);
12301230 while (elems > 0 ) {
12311231 /*
12321232 * Don't set the cl_commlib_trigger()-call to be blocking and
12331233 * get rid of the usleep() - it's much slower!
12341234 * The last cl_commlib_trigger()-call will take 1 s.
12351235 */
1236- ret = cl_commlib_trigger (handle, 0 );
1236+ int trigger_ret = cl_commlib_trigger (handle, 0 );
12371237 /*
12381238 * Bail out if trigger fails with an error that indicates that we
12391239 * won't be able to send the messages in the near future.
12401240 */
1241- if (ret != CL_RETVAL_OK &&
1242- ret != CL_RETVAL_SELECT_TIMEOUT &&
1243- ret != CL_RETVAL_SELECT_INTERRUPT ) {
1244- sge_dstring_sprintf (err_msg, cl_get_error_text (ret));
1245- retries = ret;
1246- break ;
1241+ if (trigger_ret != CL_RETVAL_OK &&
1242+ trigger_ret != CL_RETVAL_SELECT_TIMEOUT &&
1243+ trigger_ret != CL_RETVAL_SELECT_INTERRUPT &&
1244+ trigger_ret != CL_RETVAL_THREADS_ENABLED ) {
1245+ sge_dstring_sprintf (err_msg, cl_get_error_text (trigger_ret));
1246+ sge_dstring_sprintf_append (err_msg, " - after %d retries" , retries);
1247+ return trigger_ret;
12471248 }
1249+
12481250 elems = cl_com_messages_in_send_queue (handle);
12491251 /*
12501252 * We just tried to send the messages and it wasn't possible to send
12511253 * all messages - give the network some time to recover.
1254+ * @todo CS-1739 cl_commlib_trigger() does *not* wait until all messages are sent!
1255+ * @todo Shall we have a maximum number of retries? A timeout?
1256+ * But if the qrsh client is suspended, we probably need to wait until it is unsuspended again.
12521257 */
12531258 /* TODO (NEW): make this work correctly by calling check_client_alive */
12541259 if (elems > 0 ) {
12551260 usleep (10000 );
1256- retries-- ;
1261+ retries++ ;
12571262 }
12581263 }
1259- return retries;
1264+ return - retries;
12601265}
12611266
12621267/* ***** sge_ijs_comm/comm_recv_message() **************************************
@@ -1323,7 +1328,7 @@ int comm_recv_message(COMM_HANDLE *handle, bool b_synchron,
13231328 nullptr , /* unresolved_hostname, */
13241329 nullptr , /* component_name, */
13251330 0 , /* component_id, */
1326- false ,
1331+ b_synchron ,
13271332 0 ,
13281333 &message,
13291334 &sender);
@@ -1356,7 +1361,7 @@ int comm_recv_message(COMM_HANDLE *handle, bool b_synchron,
13561361 }
13571362 }
13581363
1359- if (sender != nullptr ) {
1364+ if (sender != nullptr ) {
13601365 cl_com_free_endpoint (&sender);
13611366 }
13621367
@@ -1407,6 +1412,10 @@ int comm_recv_message(COMM_HANDLE *handle, bool b_synchron,
14071412 }
14081413 }
14091414 } else {
1415+ // @todo CS-1739 do we need the cl_commlib_trigger, when we are using multi-threaded commlib?
1416+ // if b_synchron is 0, then it does essentially nothing
1417+ // otherwise it waits, until a message is available - the same which is done by cl_commlib_receive_message()
1418+ // itself
14101419 cl_commlib_trigger (handle, b_synchron);
14111420 }
14121421 DRETURN (ret_val);
0 commit comments