Skip to content

Commit dfc0e15

Browse files
Merge pull request #1405 from hummuscience/fix/populate-antijoin-proj
Merge fix for populate antijoin with overlapping secondary attributes
2 parents 0bd6e02 + 73a53dd commit dfc0e15

File tree

3 files changed

+127
-3
lines changed

3 files changed

+127
-3
lines changed

src/datajoint/autopopulate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ def _populate_direct(
403403
"""
404404
from tqdm import tqdm
405405

406-
keys = (self._jobs_to_do(restrictions) - self).keys()
406+
keys = (self._jobs_to_do(restrictions) - self.proj()).keys()
407407

408408
logger.debug("Found %d keys to populate" % len(keys))
409409

@@ -701,7 +701,7 @@ def progress(self, *restrictions: Any, display: bool = False) -> tuple[int, int]
701701
if not common_attrs:
702702
# No common attributes - fall back to two-query method
703703
total = len(todo)
704-
remaining = len(todo - self)
704+
remaining = len(todo - self.proj())
705705
else:
706706
# Build a single query that computes both total and remaining
707707
# Using LEFT JOIN with COUNT(DISTINCT) to handle 1:many relationships

src/datajoint/jobs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,7 @@ def refresh(
370370

371371
# Keys that need jobs: in key_source, not in target, not in jobs
372372
# Disable semantic_check for Job table (self) because its attributes may not have matching lineage
373-
new_keys = (key_source - self._target).restrict(Not(self), semantic_check=False).proj()
373+
new_keys = (key_source - self._target.proj()).restrict(Not(self), semantic_check=False).proj()
374374
new_key_list = new_keys.keys()
375375

376376
if new_key_list:

tests/integration/test_autopopulate.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,130 @@ def test_allow_insert(clean_autopopulate, subject, experiment):
112112
experiment.insert1(key)
113113

114114

115+
def test_populate_antijoin_with_secondary_attrs(clean_autopopulate, subject, experiment):
    """Check that the antijoin against the target yields the right pending keys.

    Populates part of the table, then confirms that both the
    ``key_source - target`` antijoin and ``progress()`` report the remainder.
    Finishes the populate and confirms nothing is left pending.

    Note: each Experiment.make() call inserts fake_experiments_per_subject
    rows for its key, so the row count is a multiple of the populated keys.
    """
    assert subject, "root tables are empty"
    assert not experiment, "table already filled?"

    n_source_keys = len(experiment.key_source)
    assert n_source_keys > 0

    # Run make() for only two of the key_source entries.
    experiment.populate(max_calls=2)
    rows_per_key = experiment.fake_experiments_per_subject
    assert len(experiment) == 2 * rows_per_key

    # The antijoin must leave exactly the keys that were not populated yet.
    expected_pending = n_source_keys - 2
    pending = experiment.key_source - experiment
    assert len(pending) == expected_pending, (
        f"Antijoin returned {len(pending)} pending keys, expected {expected_pending}."
    )

    # progress() must agree with the antijoin.
    remaining, total = experiment.progress()
    assert total == n_source_keys
    assert remaining == expected_pending

    # After populating everything, the antijoin must be empty.
    experiment.populate()
    pending_after = experiment.key_source - experiment
    assert len(pending_after) == 0, (
        f"Antijoin returned {len(pending_after)} pending keys after full populate, expected 0."
    )
144+
145+
146+
def test_populate_antijoin_overlapping_attrs(prefix, connection_test):
    """Regression test for the antijoin when secondary attribute names overlap.

    Guards against the bug where `key_source - self` reports ALL keys as
    pending instead of only the unpopulated ones. The scenario is:

    1. key_source exposes secondary attributes (num_samples, quality).
    2. The target table declares secondary attributes with the SAME NAMES.
    3. After populate, the stored VALUES differ between source and target.

    When the target is not projected, the antijoin compares every shared
    column name, secondary attributes included. Differing values then mean
    no row matches, so every key still looks "pending" after populate.
    Subtracting the projected target (`.proj()`) avoids this.

    Real-world example: LightningPoseOutput (key_source) carries num_frames,
    quality and processing_datetime as secondary attributes, and
    InitialContainer (target) has same-named columns holding other values.
    """
    overlap_schema = dj.Schema(f"{prefix}_antijoin_overlap", connection=connection_test)

    @overlap_schema
    class Sensor(dj.Lookup):
        definition = """
        sensor_id : int32
        ---
        num_samples : int32
        quality : decimal(4,2)
        """
        contents = [
            (1, 100, 0.95),
            (2, 200, 0.87),
            (3, 150, 0.92),
            (4, 175, 0.89),
        ]

    @overlap_schema
    class ProcessedSensor(dj.Computed):
        definition = """
        -> Sensor
        ---
        num_samples : int32 # same name as Sensor's secondary attr
        quality : decimal(4,2) # same name as Sensor's secondary attr
        result : decimal(8,2)
        """

        @property
        def key_source(self):
            # Deliberately un-projected: yields sensor_id plus num_samples
            # and quality, which is the precondition for the bug.
            return Sensor()

        def make(self, key):
            # `key` holds only the primary key, so read the source row here.
            row = (Sensor() & key).fetch1()
            samples = row["num_samples"]
            quality = float(row["quality"])
            # Stored values are made to differ from the source on purpose:
            # an antijoin that also compares num_samples and quality finds
            # no match for these rows, which is what exposes the bug.
            self.insert1(
                {
                    "sensor_id": key["sensor_id"],
                    "num_samples": samples * 2,
                    "quality": quality + 0.05,
                    "result": float(samples) * quality,
                }
            )

    try:
        # Populate only 2 of the 4 sensors first.
        ProcessedSensor().populate(max_calls=2)
        assert len(ProcessedSensor()) == 2

        total_keys = len(ProcessedSensor().key_source)
        assert total_keys == 4

        # Core check: the second populate() must pick up only the remaining
        # keys. Before the fix it used `key_source - self`, which also
        # compared num_samples and quality and saw all 4 keys as "pending".
        ProcessedSensor().populate()
        assert len(ProcessedSensor()) == 4, (
            f"After full populate, expected 4 entries but got {len(ProcessedSensor())}. "
            f"populate() likely re-processed already-completed keys."
        )

        # progress() must report nothing left to do.
        remaining, total = ProcessedSensor().progress()
        assert remaining == 0, f"Expected 0 remaining, got {remaining}"
        assert total == 4

        # The antijoin against the projected target must be empty.
        pending = ProcessedSensor().key_source - ProcessedSensor().proj()
        assert len(pending) == 0
    finally:
        # Always remove the throwaway schema, even when an assertion fails.
        overlap_schema.drop(prompt=False)
237+
238+
115239
def test_load_dependencies(prefix, connection_test):
116240
schema = dj.Schema(f"{prefix}_load_dependencies_populate", connection=connection_test)
117241

0 commit comments

Comments
 (0)