Skip to content
8 changes: 8 additions & 0 deletions doc/markdown/man/man5/sge_conf.md
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,14 @@ Sets the time interval for spooling the sharetree usage. The default is set to 0
colon-separated string or seconds. There is no setting to turn the sharetree spooling off. (e.g.
STREE_SPOOL_INTERVAL=00:02:00)

***STREE_TICK_INTERVAL***

Sets the cadence at which share-tree usage is decayed and the resulting share-tree state is republished to
clients such as the scheduler and *sge_share_mon*. The value is given in seconds or as a colon-separated time
string. The valid range is 1 to 300 seconds; values above the upper bound are clamped on use, and values
\<= 0 are rejected with a warning and reset to the default. The default value is 5 seconds. Changes take
effect within seconds. (e.g. STREE_TICK_INTERVAL=00:00:30)

***MAX_JOB_DELETION_TIME***

Sets the value of how long the qmaster will spend deleting jobs. After this time, the qmaster will continue with
Expand Down
2 changes: 2 additions & 0 deletions source/daemons/qmaster/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ set(QMASTER_SOURCES
ocs_BaseAccountingFileWriter.cc
ocs_BaseReportingFileWriter.cc
ocs_CategoryQmaster.cc
ocs_FinishedJob.cc
ocs_SharetreeUsage.cc
ocs_JsonAccountingFileWriter.cc
ocs_JsonReportingFileWriter.cc
ocs_MirrorReaderDataStore.cc
Expand Down
15 changes: 15 additions & 0 deletions source/daemons/qmaster/configuration_qmaster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
#include "configuration_qmaster.h"
#include "ocs_ReportingFileWriter.h"
#include "sge.h"
#include "ocs_SharetreeUsage.h"
#include "sge_persistence_qmaster.h"
#include "sge_userprj_qmaster.h"
#include "reschedule.h"
Expand Down Expand Up @@ -395,6 +396,20 @@ sge_mod_configuration(lListElem *aConf, lList **anAnswer, const char *aUser, con

// The maximum number of events clients might have changed.
sge_set_max_dynamic_event_clients(mconf_get_max_dynamic_event_clients());

// CS-1239: STREE_TICK_INTERVAL may have changed. The pending one-time
// tick event was scheduled with the *previous* interval, so without
// an explicit re-schedule a tighter new interval would not take effect
// until the already-queued event fires - which can be up to 300 s away.
// Drop the queued event and re-queue at +5 s so the new value is in
// effect within seconds.
sge_reschedule_sharetree_tick();

// CS-1239: same story for STREE_SPOOL_INTERVAL and the dirty-objects
// flush event - the already-queued event keeps its original 240 s
// schedule unless we explicitly re-arm it. issue_1385 pins
// STREE_SPOOL_INTERVAL low then reads the spool file within seconds.
sge_reschedule_sharetree_spool();
}

/* invalidate configuration cache */
Expand Down
12 changes: 10 additions & 2 deletions source/daemons/qmaster/job_exit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,19 @@ sge_job_exit(lListElem *jr, lListElem *jep, lListElem *jatep, monitoring_t *moni
ocs::ReportingFileWriter::create_job_logs(nullptr, timestamp, JL_DELETED, MSG_EXECD, hostname, jr, jep, jatep, nullptr,
MSG_LOG_JREMOVED);

/* CS-1239: snapshot what AR cleanup needs because the commit below now also
* buries the ja_task (and possibly the job), invalidating jep/jatep. The AR
* removal check must run AFTER the commit because qinstance_slots_used() on
* the AR's reserved queues only drops once sge_clear_granted_resources()
* inside sge_commit_job() has released this job's bookings. */
uint32_t job_ar_id = lGetUlong(jep, JB_ar);
bool job_was_deleted = (lGetUlong(jatep, JAT_state) & JDELETED) == JDELETED;

sge_commit_job(jep, jatep, jr, COMMIT_ST_FINISHED_FAILED_EE, COMMIT_DEFAULT | COMMIT_NEVER_RAN, monitor, gdi_session);

if (lGetUlong(jep, JB_ar) != 0 && (lGetUlong(jatep, JAT_state) & JDELETED) == JDELETED) {
if (job_ar_id != 0 && job_was_deleted) {
/* get AR and remove it if no other jobs are debited */
lListElem *ar = ar_list_locate(master_ar_list, lGetUlong(jep, JB_ar));
lListElem *ar = ar_list_locate(master_ar_list, job_ar_id);

if (ar != nullptr && lGetUlong(ar, AR_state) == AR_DELETED) {
const lListElem *ar_queue;
Expand Down
203 changes: 203 additions & 0 deletions source/daemons/qmaster/ocs_FinishedJob.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/*___INFO__MARK_BEGIN_NEW__*/
/***************************************************************************
*
* Copyright 2026 HPC-Gridware GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
***************************************************************************/
/*___INFO__MARK_END_NEW__*/

#include "uti/sge_time.h"

#include "sgeobj/ocs_DataStore.h"
#include "sgeobj/ocs_Usage.h"
#include "sgeobj/sge_job.h"
#include "sgeobj/sge_ja_task.h"
#include "sgeobj/sge_schedd_conf.h"
#include "sgeobj/sge_usage.h"
#include "sgeobj/sge_userprj.h"

#include "ocs_SharetreeUsage.h"
#include "ocs_FinishedJob.h"
#include "sge_persistence_qmaster.h"

namespace {
/* Remove the debited-job-usage entry for the given job number from the
* passed list, if present. Mirrors the "UPU_old_usage_list == nullptr"
* signal in the ORT_update_user_usage / _project_usage handlers, which is
* how the order-driven path tells qmaster to drop the entry: with the
* worker-thread path the qmaster IS the one doing the modification, so
* the drop is direct. */
void drop_debited_job(lListElem *userprj, int upu_field, uint32_t job_number) {
lList *upu_list = lGetListRW(userprj, upu_field);
if (upu_list == nullptr) {
return;
}
lListElem *upu = lGetElemUlongRW(upu_list, UPU_job_number, job_number);
if (upu != nullptr) {
lRemoveElem(upu_list, &upu);
}
}

/* Bump UU_/PR_version so consumers that key off it see a change.
*
* Do NOT bump UU_/PR_usage_seqno: that field is the periodic decay
* handler's idempotency stamp - decay_userprj_usage skips when
* seqno == stored_seqno. The handler's tet_decay_run counter also
* starts at 0 and ++s by 1, so a booking bump would silently match a
* tick and the decay would be a no-op. (That collision produced
* u0 == u1 bit-exact in the sharetree_decay_periodic test.) */
void bump_change_markers(lListElem *userprj, int version_field) {
lAddUlong(userprj, version_field, 1);
}
}

int
sge_book_finished_job_usage(lListElem *jep, lListElem *jatep, monitoring_t * /* monitor */, uint64_t gdi_session) {
if (jep == nullptr || jatep == nullptr) {
return -1;
}

/* Idempotency guard: skip if this ja_task has already been booked.
* The "finished_jobs" entry in JAT_scaled_usage_list is set below as
* part of the booking; if it is already there, the booking has run
* before and we must not double-count. Mirrors the pre-CS-1239 check
* in sge_calc_tickets ("we already handled this finished job").
* Reschedule is safe: sge_give_jobs.cc clears JAT_scaled_usage_list on
* COMMIT_ST_RESCHEDULED / COMMIT_ST_USER_RESCHEDULED so the next exit
* re-seeds. The guard is defensive coverage for future paths that may
* re-invoke this helper (e.g. CS-1908 finished-job retention). */
if (lGetSubStr(jatep, UA_name, "finished_jobs", JAT_scaled_usage_list) != nullptr) {
return 0;
}

/* CS-1239: skip the entire booking when no master share tree is configured.
* The summed UU_/PR_/UPP_usage is consumed only by the share-tree allocator
* and by sge_share_mon, both of which are dormant in this case. This
* restores the pre-CS-1239 implicit gate ("no UU/PR objects -> no work";
* see calc_usage.md) by checking the direct indicator. Trade-off: a
* qconf -Astree after a quiet period starts the share-tree allocator from
* zero history for one tick. Matching gate is in sge_sharetree_tick_handler.
* Caller holds LOCK_GLOBAL write (always via sge_commit_job), so reading
* the SHARETREE master list without an extra lock is safe. */
if (lFirst(*ocs::DataStore::get_master_list(SGE_TYPE_SHARETREE)) == nullptr) {
return 0;
}

const lList *master_user_list = *ocs::DataStore::get_master_list(SGE_TYPE_USER);
const lList *master_project_list = *ocs::DataStore::get_master_list(SGE_TYPE_PROJECT);

lListElem *user = user_list_locate(master_user_list, lGetString(jep, JB_owner));
lListElem *project = nullptr;
const char *project_name = lGetString(jep, JB_project);
if (project_name != nullptr && *project_name != '\0') {
project = prj_list_locate(master_project_list, project_name);
}

/* 1. catch up decay since last tick, then sum the finished job's usage
* into UU_/PR_/UPP_. Worker thread's sc_state_t.decay_constant is
* thread-local and uninitialized by default (== 0), so without this
* refresh decay_usage would multiply by 0 and zero everything. Same
* gap previously hit on the scheduler thread (fixed in
* sge_sharetree_tick_handler). */
ocs::Usage::calculate_default_decay_constant(sconf_get_halftime());

const uint64_t curr_time = sge_get_gmt64();
lList *decay_list = ocs::Usage::get_decay_list();
lList *usage_weight_list = sconf_get_usage_weight_list();

/* Seed "finished_jobs"=1 onto the job's scaled_usage_list before we
* sum it into UU_/PR_/UPP_. The pending-priority calculation in
* sge_sort_pending_job_nodes (sgeee.cc) reads the leaf's
* STN_usage_list "finished_jobs" entry and adds it to job_count when
* computing per-pending-job tickets. calc_node_usage propagates the
* project's "finished_jobs" PR_usage entry up to the leaf's
* STN_usage_list. Without this seed the counter is never written
* anywhere and the formula treats every leaf as "0 finished" - the
* leaf with the fewest pending jobs wins by per-job dilution alone,
* starving leaves with many pending (project3 in the sharetree
* test). */
lList *scaled = lGetListRW(jatep, JAT_scaled_usage_list);
if (scaled == nullptr) {
lSetList(jatep, JAT_scaled_usage_list, lCreateList("scaled_usage", UA_Type));
scaled = lGetListRW(jatep, JAT_scaled_usage_list);
}
usage_list_set_double_usage(scaled, "finished_jobs", 1.0);

/* CS-1239 (test convergence fix): the decay between the previous TET
* tick and this finish time must apply to the historical UU_/PR_usage
* BEFORE we sum the new job's scaled_usage onto it. Otherwise the
* stamp jumps forward to curr_time but the historical part is "stuck
* at the previous-tick decay state" - the next tick decays by
* (next_tick - curr_time) which loses the (curr_time - prev_tick)
* interval. For projects that finish often (project2 in the sharetree
* test) this loss accumulates: combined_usage stays artificially high,
* shr stays high relative to truth, allocator keeps picking project2,
* test fails. seqno = (u_long)-1 tells decay_userprj_usage to update
* the stamp but NOT update the stored seqno - the TET tick owns the
* seqno namespace, we just want a catch-up decay tick here. */
ocs::Usage::decay_and_sum_usage(jep, jatep, nullptr, user, project,
decay_list, usage_weight_list,
static_cast<u_long>(-1), curr_time);
lFreeList(&usage_weight_list);

const uint32_t job_number = lGetUlong(jep, JB_job_number);

/* 2. drop the per-job debited entry the sum step just appended (the job
* is gone; no further deltas to compute), and bump change markers so
* consumers see "something happened to this object". */
if (user != nullptr) {
drop_debited_job(user, UU_debited_job_usage, job_number);
bump_change_markers(user, UU_version);
}
if (project != nullptr) {
drop_debited_job(project, PR_debited_job_usage, job_number);
bump_change_markers(project, PR_version);
}

/* 3. mark user / project dirty for both event emission and spool. The
* sgeE_USER_MOD / sgeE_PROJECT_MOD events are deferred to the share-
* tree tick handler (sge_sharetree_tick_handler), which batches them
* with sgeE_NEW_SHARETREE inside one event-master transaction so the
* scheduler mirror sees user/project usage and tree state as one
* atomic snapshot (pre-CS-1239 ORT_share_tree invariant restored).
* Spool is deferred to the share-tree spool handler at its own
* cadence. */
if (user != nullptr) {
const char *uname = lGetString(user, UU_name);
ocs::SharetreeUsage::mark_user_event_dirty(uname);
ocs::SharetreeUsage::mark_user_spool_dirty(uname);
}
if (project != nullptr) {
const char *pname = lGetString(project, PR_name);
ocs::SharetreeUsage::mark_project_event_dirty(pname);
ocs::SharetreeUsage::mark_project_spool_dirty(pname);
}

/* CS-1239 step 5: this finish moved usage in UU_/PR_/UPP_, which means
* the master share tree (read by sge_share_mon and propagated to mirror
* clients via sgeE_NEW_SHARETREE) is now stale. Flag it so the TET
* share-tree tick handler republishes on its next tick. Cheap (one
* bool write under the LOCK_GLOBAL we already hold). */
if (user != nullptr || project != nullptr) {
ocs::SharetreeUsage::mark_share_tree_dirty();
}

/* Bury (sge_bury_job) is performed by the caller (sge_commit_job's
* COMMIT_ST_FINISHED_FAILED_EE case) right after this helper returns.
* Keeping this function a pure "book the usage" routine keeps its
* pre-/post-conditions reviewable in isolation. */

return 0;
}
58 changes: 58 additions & 0 deletions source/daemons/qmaster/ocs_FinishedJob.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#pragma once
/*___INFO__MARK_BEGIN_NEW__*/
/***************************************************************************
*
* Copyright 2026 HPC-Gridware GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
***************************************************************************/
/*___INFO__MARK_END_NEW__*/

#include <cstdint>

#include "cull/cull.h"
#include "uti/sge_monitor.h"

/** Apply the per-finish bookkeeping that, in the pre-CS-1239 scheme, was
* generated by the scheduler thread as the three orders ORT_update_user_usage,
* ORT_update_project_usage, and ORT_remove_job, and applied in worker threads
* via sge_follow_order.
*
* Called from inside sge_commit_job's COMMIT_ST_FINISHED_FAILED_EE case, after
* the JAT_status = JFINISHED transition and before sge_bury_job. External
* callers should never invoke this helper directly - sge_commit_job is the
* contract.
*
* Preconditions: caller holds LOCK_GLOBAL (write); JAT_status is JFINISHED.
*
* Steps:
* - resolve user / project refs;
* - sum the job's final scaled usage into UU_usage (or
* user.UU_project[project].UPP_usage when project is set), UU_long_term_usage
* / UPP_long_term_usage, PR_usage, and PR_long_term_usage. Catches up the
* decay since the last TET tick first, so the historical part stays in sync;
* - drop the job's entry from UU_debited_job_usage / PR_debited_job_usage
* (the job is gone, the tracking record must follow);
* - bump UU_version / PR_version (the change marker). UU_/PR_usage_seqno
* is deliberately NOT touched: it is the periodic decay handler's
* idempotency stamp; sharing it with the change marker would silently
* collide with the handler's run counter and skip decay ticks;
* - mark the (user, project) dirty for the TET sharetree-spool handler,
* which batches the sgeE_USER_MOD / sgeE_PROJECT_MOD events and the spool
* writes with sgeE_NEW_SHARETREE under one event-master transaction;
* - mark the master share tree dirty so the next TET tick republishes it.
*
* Returns 0 on success, a negative value on a hard error (missing jep/jatep).
*/
int sge_book_finished_job_usage(lListElem *jep, lListElem *jatep, monitoring_t *monitor, uint64_t gdi_session);
Loading
Loading