@@ -136,8 +136,27 @@ def crawl(
136136 # iterations rather than constructing a fresh object on every BFS hop.
137137 enrichment = EnrichmentService ()
138138
139+ # One provenance Analysis per authority corpus, reused across the run.
140+ # Every section of a given authority bootstraps into ONE corpus — the
141+ # provider's ``title`` is a constant, so every ``usc-*`` section lands in
142+ # the single "United States Code" corpus — so a crawl that ingests N
143+ # sections of an authority calls apply() on the SAME corpus N times.
144+ # Letting each call mint its own Analysis (apply()'s default when
145+ # ``analysis=None``) would leave N provenance rows on that one corpus;
146+ # instead we capture the Analysis the first apply creates and feed it back
147+ # into the rest, capping it at one per corpus (issue #2027).
148+ from opencontractserver .analyzer .models import Analysis
149+
150+ apply_analyses : dict [int , Analysis ] = {}
151+
139152 while True :
140- # Hard cap checks before dequeue so the summary is honest.
153+ # Hard cap checks before dequeue so the summary is honest. On these
154+ # early stops we intentionally do NOT populate
155+ # blocked_by_bound["min_demand_or_depth"]: rows still queued when a cap
156+ # fires were simply not reached, and may be perfectly eligible (above
157+ # min_demand, within max_depth) — attributing them to a bound would be
158+ # a lie. The frontier_residual census (computed below for EVERY stop
159+ # reason) accounts for them, so the summary is still non-silent.
141160 if ingested >= max_authorities :
142161 stop_reason = "max_authorities"
143162 break
@@ -150,10 +169,13 @@ def crawl(
150169 limit = 1 , max_depth = max_depth , min_demand = min_demand
151170 )
152171 if not rows :
153- # Count how many queued rows remain so the summary is non-silent
154- # about what was left. This is the UNION of rows excluded by the
155- # min_demand floor and/or the max_depth bound — the single key
156- # does not attribute each row to one cause or the other.
172+ # frontier_drained: dequeue returned nothing, so EVERY remaining
173+ # queued row failed at least one of the two dequeue filters (below
174+ # min_demand and/or beyond max_depth). Only in this case is it
175+ # correct to attribute the residual queued count to those bounds (the
176+ # early max_authorities / token_budget breaks above leave their
177+ # unreached-but-eligible rows to frontier_residual instead). The single
178+ # key is the UNION of the two exclusions; it does not split rows by cause.
157179 blocked_by_bound ["min_demand_or_depth" ] = (
158180 AuthorityFrontier .objects .filter (
159181 discovery_state = C .DISCOVERY_STATE_QUEUED
@@ -208,14 +230,26 @@ def crawl(
208230 # Re-extract the authority's OWN outbound citations and seed the
209231 # frontier at depth+1 — only when we haven't reached max_depth.
210232 if row .depth < max_depth :
211- # Authority corpora hold one small document per statute section,
212- # so this apply scan is bounded (not a large-corpus scan).
233+ # Reuse this corpus's provenance Analysis across sections (see the
234+ # apply_analyses note above) so the BFS doesn't accumulate one
235+ # Analysis row per section on a shared authority corpus.
236+ apply_analysis = apply_analyses .get (authority_corpus_id )
213237 apply_res = enrichment .apply (
214238 corpus_id = authority_corpus_id ,
215239 creator_id = creator_id ,
216240 types = [C .REF_LAW ],
217241 extra_tiers = [C .DETECTION_TIER_GRAMMAR ],
242+ analysis = apply_analysis ,
218243 )
244+ if apply_analysis is None :
245+ # First apply on this corpus created the provenance Analysis;
246+ # cache it so the corpus's remaining sections reattach to it
247+ # instead of each minting a fresh one.
248+ new_analysis_id = apply_res .get ("analysis_id" )
249+ if new_analysis_id is not None :
250+ apply_analyses [authority_corpus_id ] = Analysis .objects .get (
251+ pk = new_analysis_id
252+ )
219253
220254 outbound = list (
221255 CorpusReferenceService .for_corpus (user , authority_corpus_id )
0 commit comments