2323DEFAULT_MAX_WORKERS = 4
2424DEFAULT_THRESHOLD = 64
2525
26+ # Keys we lift out of a worker's internal _stats and ship back to the parent.
27+ # These mirror the same string constants used by ``deepdiff/diff.py``; we keep
28+ # string literals here to avoid importing diff.py at module load (which would
29+ # create an import cycle under spawn).
30+ _WORKER_STATS_COUNTER_KEYS = ('DIFF COUNT' , 'PASSES COUNT' , 'DISTANCE CACHE HIT COUNT' )
31+ _WORKER_STATS_FLAG_KEYS = ('MAX PASS LIMIT REACHED' , 'MAX DIFF LIMIT REACHED' )
32+
33+
def _extract_worker_stats(diff_instance: Any) -> Dict[str, Any]:
    """Build a small stats snapshot from a worker-local diff object.

    Reads ``diff_instance._stats`` (treated as empty when the attribute is
    missing or falsy) and copies out only the keys listed in
    ``_WORKER_STATS_COUNTER_KEYS`` and ``_WORKER_STATS_FLAG_KEYS``.

    Args:
        diff_instance: Object expected to carry a ``_stats`` mapping.

    Returns:
        A dict holding every counter key coerced to ``int`` (absent or falsy
        values become ``0``) followed by every flag key coerced to ``bool``
        (absent values become ``False``).
    """
    raw = getattr(diff_instance, '_stats', None)
    if not raw:
        # No stats recorded (or no ``_stats`` attribute at all): fall back to
        # an empty mapping so every key below resolves to its default.
        raw = {}

    # Counters first, then flags, so the key order matches the constants.
    snapshot: Dict[str, Any] = {
        name: int(raw.get(name, 0) or 0)
        for name in _WORKER_STATS_COUNTER_KEYS
    }
    snapshot.update(
        (name, bool(raw.get(name, False)))
        for name in _WORKER_STATS_FLAG_KEYS
    )
    return snapshot
49+
50+
def _aggregate_worker_stats(deltas: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Combine per-worker stats snapshots into a single summary dict.

    Args:
        deltas: Snapshots to merge. Falsy entries (``None``, ``{}``) are
            skipped.

    Returns:
        A dict in which each key from ``_WORKER_STATS_COUNTER_KEYS`` holds the
        integer sum over all snapshots, and each key from
        ``_WORKER_STATS_FLAG_KEYS`` is ``True`` if any snapshot had a truthy
        value for it. An empty ``deltas`` yields all zeros and ``False``.
    """
    totals: Dict[str, Any] = dict.fromkeys(_WORKER_STATS_COUNTER_KEYS, 0)
    totals.update(dict.fromkeys(_WORKER_STATS_FLAG_KEYS, False))

    # ``filter(None, ...)`` drops empty/None snapshots before merging.
    for snapshot in filter(None, deltas):
        for name in _WORKER_STATS_COUNTER_KEYS:
            totals[name] += int(snapshot.get(name, 0) or 0)
        for name in _WORKER_STATS_FLAG_KEYS:
            # A flag stays set once any worker has reported it.
            totals[name] |= bool(snapshot.get(name))
    return totals
65+
2666
2767@dataclass (frozen = True )
2868class MPConfig :
@@ -114,7 +154,9 @@ def _sanitize_parameters_for_worker(parameters: Dict[str, Any]) -> Dict[str, Any
114154 return sanitized
115155
116156
117- def _distance_worker (job : Tuple [int , Dict [str , Any ], Any , Any , Any , Any ]) -> Tuple [int , float ]:
157+ def _distance_worker (
158+ job : Tuple [int , Dict [str , Any ], Any , Any , Any , Any ],
159+ ) -> Tuple [int , float , Dict [str , Any ]]:
118160 """Compute the rough distance between two items in a worker process.
119161
120162 ``job`` layout matches what ``compute_distances_parallel`` ships:
@@ -123,7 +165,9 @@ def _distance_worker(job: Tuple[int, Dict[str, Any], Any, Any, Any, Any]) -> Tup
123165
124166 The worker constructs a fresh root ``DeepDiff`` (no shared parent state),
125167 requests the DELTA_VIEW so we hit the same code path as the serial call in
126- ``_get_rough_distance_of_hashed_objs``, and returns the resulting float.
168+ ``_get_rough_distance_of_hashed_objs``, and returns the resulting float
169+ plus a ``_extract_worker_stats`` snapshot so the parent can aggregate
170+ diff/pass/cache-hit counts into its WORKER_* stats keys.
127171 """
128172 # Imported here to keep module import cheap and to dodge any circular
129173 # import surprises under spawn.
@@ -144,7 +188,7 @@ def _distance_worker(job: Tuple[int, Dict[str, Any], Any, Any, Any, Any]) -> Tup
144188 # call below, hence cache_purge_level=0.
145189 cache_purge_level = 0 ,
146190 )
147- return job_index , cast (float , diff ._get_rough_distance ())
191+ return job_index , cast (float , diff ._get_rough_distance ()), _extract_worker_stats ( diff )
148192
149193
150194def compute_distances_parallel (
@@ -153,25 +197,28 @@ def compute_distances_parallel(
153197 original_type : Any ,
154198 iterable_compare_func : Optional [Callable ],
155199 config : MPConfig ,
156- ) -> Optional [Dict [Tuple [Any , Any ], float ]]:
200+ ) -> Optional [Tuple [ Dict [Tuple [Any , Any ], float ], Dict [ str , Any ] ]]:
157201 """Run ``_distance_worker`` over ``jobs`` and return distances by pair.
158202
159203 ``jobs`` is a list of ``(added_hash, removed_hash, added_item, removed_item)``
160204 tuples in the exact order the serial nested loop visits them. The parent
161205 is responsible for that ordering; this helper does not reorder anything.
162206
163207 Returns:
164- A dict ``{(added_hash, removed_hash): distance}``, or ``None`` if the
165- section is unsafe to parallelize (unpickleable inputs/parameters,
166- worker import error, etc.). On ``None`` the caller MUST fall back to
167- the serial path so correctness is preserved.
208+ ``(distances_by_pair, aggregated_worker_stats)`` where the first item
209+ is a dict ``{(added_hash, removed_hash): distance}`` and the second is
210+ the aggregated ``_extract_worker_stats`` snapshot summed across all
211+ workers (counter keys summed, limit flags OR-merged). Returns
212+ ``None`` if the section is unsafe to parallelize (unpickleable
213+ inputs/parameters, worker import error, etc.). On ``None`` the caller
214+ MUST fall back to the serial path so correctness is preserved.
168215
169216 Workers may finish out of order; we collect results into a dict keyed by
170217 the original job index, so callers see the same result regardless of
171218 completion order.
172219 """
173220 if not jobs :
174- return {}
221+ return {}, _aggregate_worker_stats ([])
175222
176223 sanitized_params = _sanitize_parameters_for_worker (parameters )
177224
@@ -200,14 +247,16 @@ def compute_distances_parallel(
200247 )
201248
202249 results_by_index : Dict [int , float ] = {}
250+ stats_deltas : List [Dict [str , Any ]] = []
203251 try :
204252 with ProcessPoolExecutor (max_workers = config .workers ) as executor :
205253 futures = [executor .submit (_distance_worker , payload ) for payload in payloads ]
206254 for future in as_completed (futures ):
207255 # Re-raise worker exceptions in the parent so they surface as
208256 # normal DeepDiff exceptions instead of being swallowed.
209- idx , distance = future .result ()
257+ idx , distance , stats_delta = future .result ()
210258 results_by_index [idx ] = distance
259+ stats_deltas .append (stats_delta )
211260 except (pickle .PicklingError , AttributeError , TypeError ):
212261 # Pickling/spawn-related failures: surface as a serial fallback rather
213262 # than crashing the diff. Other exceptions (worker logic bugs, user
@@ -217,7 +266,7 @@ def compute_distances_parallel(
217266 out : Dict [Tuple [Any , Any ], float ] = {}
218267 for i , job in enumerate (jobs ):
219268 out [(job [0 ], job [1 ])] = results_by_index [i ]
220- return out
269+ return out , _aggregate_worker_stats ( stats_deltas )
221270
222271
223272def _hash_worker (job : Tuple [int , Any , str , Dict [str , Any ]]) -> Tuple [int , Optional [str ]]:
@@ -256,7 +305,7 @@ def _hash_worker(job: Tuple[int, Any, str, Dict[str, Any]]) -> Tuple[int, Option
256305
257306def _subtree_diff_worker (
258307 job : Tuple [int , Dict [str , Any ], Any , Any , Any ],
259- ) -> Tuple [int , List [Tuple [str , Any ]]]:
308+ ) -> Tuple [int , List [Tuple [str , Any ]], Dict [ str , Any ] ]:
260309 """Run one paired-item subtree diff in a worker process.
261310
262311 ``job`` layout: ``(job_index, sanitized_parameters, t1, t2, _original_type)``.
@@ -290,30 +339,33 @@ def _subtree_diff_worker(
290339 continue
291340 for leaf in levels :
292341 entries .append ((report_type , leaf ))
293- return job_index , entries
342+ return job_index , entries , _extract_worker_stats ( diff )
294343
295344
296345def compute_subtree_diffs_parallel (
297346 jobs : List [Tuple [Any , Any ]],
298347 parameters : Dict [str , Any ],
299348 original_type : Any ,
300349 config : MPConfig ,
301- ) -> Optional [List [List [Tuple [str , Any ]]]]:
350+ ) -> Optional [Tuple [ List [List [Tuple [str , Any ]]], Dict [ str , Any ]]]:
302351 """Run ``_subtree_diff_worker`` over ``jobs`` and return per-job entries.
303352
304353 ``jobs`` is a list of ``(t1_item, t2_item)`` tuples in the exact order
305- the serial paired-iteration code visits them. Returns a list aligned to
306- that order; each element is ``[(report_type, leaf_difflevel), ...]``
307- suitable for the parent to rebase and merge into its tree. Returns
308- ``None`` when the section is unsafe to parallelize (unpickleable
354+ the serial paired-iteration code visits them. Returns
355+ ``(entries_by_job, aggregated_worker_stats)`` where ``entries_by_job`` is
356+ a list aligned to job order — each element is ``[(report_type,
357+ leaf_difflevel), ...]`` suitable for the parent to rebase and merge into
358+ its tree — and ``aggregated_worker_stats`` is the per-batch ``_stats``
359+ deltas summed across workers (counters summed, limit flags OR-merged).
360+ Returns ``None`` when the section is unsafe to parallelize (unpickleable
309361 parameters/items, worker import error). On ``None`` the caller MUST run
310362 the same jobs serially so correctness is preserved.
311363
312364 Workers may finish out of order; results are collected by their original
313365 job index so the merge order is identical regardless of completion order.
314366 """
315367 if not jobs :
316- return []
368+ return [], _aggregate_worker_stats ([])
317369
318370 sanitized_params = _sanitize_parameters_for_worker (parameters )
319371
@@ -332,16 +384,21 @@ def compute_subtree_diffs_parallel(
332384 ]
333385
334386 results_by_index : Dict [int , List [Tuple [str , Any ]]] = {}
387+ stats_deltas : List [Dict [str , Any ]] = []
335388 try :
336389 with ProcessPoolExecutor (max_workers = config .workers ) as executor :
337390 futures = [executor .submit (_subtree_diff_worker , payload ) for payload in payloads ]
338391 for future in as_completed (futures ):
339- idx , entries = future .result ()
392+ idx , entries , stats_delta = future .result ()
340393 results_by_index [idx ] = entries
394+ stats_deltas .append (stats_delta )
341395 except (pickle .PicklingError , AttributeError , TypeError ):
342396 return None
343397
344- return [results_by_index [i ] for i in range (len (jobs ))]
398+ return (
399+ [results_by_index [i ] for i in range (len (jobs ))],
400+ _aggregate_worker_stats (stats_deltas ),
401+ )
345402
346403
347404def compute_hashes_parallel (
0 commit comments