Skip to content

Commit 57c7247

Browse files
committed
feat: prevent excessive scheduling with min_scheduling_interval
1 parent b7e4d9b commit 57c7247

2 files changed

Lines changed: 39 additions & 6 deletions

File tree

datajoint/autopopulate.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -448,14 +448,45 @@ def _Jobs(self):
448448
def jobs(self):
449449
return self._Jobs & {"table_name": self.target.table_name}
450450

451-
def schedule_jobs(self, *restrictions, purge_invalid_jobs=True):
451+
def schedule_jobs(self, *restrictions, purge_invalid_jobs=True, min_scheduling_interval=None):
452452
"""
453-
Schedule new jobs for this autopopulate table
454-
:param restrictions: a list of restrictions each restrict
455-
(table.key_source - target.proj())
456-
:param purge_invalid_jobs: if True, remove invalid entry from the jobs table (potentially expensive operation)
457-
:return:
453+
Schedule new jobs for this autopopulate table by finding keys that need computation.
454+
455+
This method implements an optimization strategy to avoid excessive scheduling:
456+
1. First checks if any jobs were scheduled recently (within min_scheduling_interval)
457+
2. If recent jobs exist, skips scheduling to prevent database load
458+
3. Otherwise, finds keys that need computation and schedules them
459+
460+
The method also optionally purges invalid jobs (jobs that no longer exist in key_source)
461+
to maintain database cleanliness.
462+
463+
Args:
464+
restrictions: a list of restrictions each restrict (table.key_source - target.proj())
465+
purge_invalid_jobs: if True, remove invalid entry from the jobs table (potentially expensive operation)
466+
min_scheduling_interval: minimum time in seconds that must have passed since last job scheduling.
467+
If None, uses the value from dj.config["min_scheduling_interval"] (default: None)
468+
469+
Returns:
470+
None
458471
"""
472+
if min_scheduling_interval is None:
473+
min_scheduling_interval = config["min_scheduling_interval"]
474+
475+
# First check if we have any recent jobs
476+
if min_scheduling_interval > 0:
477+
recent_jobs = len(
478+
self.jobs
479+
& {"status": "scheduled"}
480+
& f"timestamp <= UTC_TIMESTAMP()" # Only consider jobs up to current UTC time
481+
& f"timestamp >= DATE_SUB(UTC_TIMESTAMP(), INTERVAL {min_scheduling_interval} SECOND)"
482+
)
483+
if recent_jobs > 0:
484+
logger.debug(
485+
f"Skipping job scheduling for `{to_camel_case(self.target.table_name)}` - "
486+
f"found {recent_jobs} jobs created within last {min_scheduling_interval} seconds"
487+
)
488+
return
489+
459490
try:
460491
with self.connection.transaction:
461492
schedule_count = 0

datajoint/settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@
5151
"add_hidden_timestamp": False,
5252
# file size limit for when to disable checksums
5353
"filepath_checksum_size_limit": None,
54+
# minimum time in seconds between job scheduling operations
55+
"min_scheduling_interval": 5,
5456
}
5557
)
5658

0 commit comments

Comments
 (0)