feat: prevent excessive scheduling with min_scheduling_interval

ttngu207 · ttngu207 · commit 57c724713cdf · 2025-05-28T18:02:00.000-05:00
diff --git a/datajoint/autopopulate.py b/datajoint/autopopulate.py
@@ -448,14 +448,45 @@ def _Jobs(self):
     def jobs(self):
         return self._Jobs & {"table_name": self.target.table_name}
 
-    def schedule_jobs(self, *restrictions, purge_invalid_jobs=True):
+    def schedule_jobs(self, *restrictions, purge_invalid_jobs=True, min_scheduling_interval=None):
         """
-        Schedule new jobs for this autopopulate table
-        :param restrictions: a list of restrictions each restrict
-            (table.key_source - target.proj())
-        :param purge_invalid_jobs: if True, remove invalid entry from the jobs table (potentially expensive operation)
-        :return:
+        Schedule new jobs for this autopopulate table by finding keys that need computation.
+        
+        This method implements an optimization strategy to avoid excessive scheduling:
+        1. First checks if any jobs were scheduled recently (within min_scheduling_interval)
+        2. If recent jobs exist, skips scheduling to prevent database load
+        3. Otherwise, finds keys that need computation and schedules them
+        
+        The method also optionally purges invalid jobs (jobs that no longer exist in key_source)
+        to maintain database cleanliness.
+        
+        Args:
+            restrictions: a list of restrictions, each of which restricts (table.key_source - target.proj())
+            purge_invalid_jobs: if True, remove invalid entries from the jobs table (potentially expensive operation)
+            min_scheduling_interval: minimum time in seconds that must have passed since the last job scheduling.
+                If None (the default), the value of dj.config["min_scheduling_interval"] is used; a value of 0 or less disables the check
+            
+        Returns:
+            None
         """
+        if min_scheduling_interval is None:
+            min_scheduling_interval = config["min_scheduling_interval"]
+
+        # First check if we have any recent jobs
+        if min_scheduling_interval > 0:
+            recent_jobs = len(
+                self.jobs
+                & {"status": "scheduled"}
+                & f"timestamp <= UTC_TIMESTAMP()"  # Only consider jobs up to current UTC time
+                & f"timestamp >= DATE_SUB(UTC_TIMESTAMP(), INTERVAL {min_scheduling_interval} SECOND)"
+            )
+            if recent_jobs > 0:
+                logger.debug(
+                    f"Skipping job scheduling for `{to_camel_case(self.target.table_name)}` - "
+                    f"found {recent_jobs} jobs created within last {min_scheduling_interval} seconds"
+                )
+                return
+
         try:
             with self.connection.transaction:
                 schedule_count = 0
diff --git a/datajoint/settings.py b/datajoint/settings.py
@@ -51,6 +51,8 @@
         "add_hidden_timestamp": False,
         # file size limit for when to disable checksums
         "filepath_checksum_size_limit": None,
+        # minimum time in seconds between job scheduling operations
+        "min_scheduling_interval": 5,
     }
 )
 

Original file line number	Diff line number	Diff line change
`@@ -51,6 +51,8 @@`
`51`	`51`	`"add_hidden_timestamp": False,`
`52`	`52`	`# file size limit for when to disable checksums`
`53`	`53`	`"filepath_checksum_size_limit": None,`
	`54`	`+ # minimum time in seconds between job scheduling operations`
	`55`	`+ "min_scheduling_interval": 5,`
`54`	`56`	`}`
`55`	`57`	`)`
`56`	`58`