2222from openviking .storage import VikingDBManager , VikingDBManagerProxy
2323from openviking .storage .viking_fs import get_viking_fs
2424from openviking .telemetry import get_current_telemetry
25- from openviking .utils .tag_utils import expand_query_tags , merge_tags , parse_tags
2625from openviking .utils .time_utils import parse_iso_datetime
2726from openviking_cli .retrieve .types import (
2827 ContextType ,
@@ -51,9 +50,6 @@ class HierarchicalRetriever:
5150 DIRECTORY_DOMINANCE_RATIO = 1.2 # Directory score must exceed max child score
5251 GLOBAL_SEARCH_TOPK = 10 # Global retrieval count (more candidates = better rerank precision)
5352 HOTNESS_ALPHA = 0.2 # Weight for hotness score in final ranking (0 = disabled)
54- MAX_TAG_EXPANSION_TAGS = 8 # Upper bound on expansion tags collected per query.
55- TAG_EXPANSION_LIMIT = 8 # Upper bound on extra nodes discovered via tags.
56- TAG_EXPANSION_SCORE = 0.15 # Lower seed score for tag-expanded nodes.
5753 LEVEL_URI_SUFFIX = {0 : ".abstract.md" , 1 : ".overview.md" }
5854
5955 def __init__ (
@@ -117,7 +113,6 @@ async def retrieve(
117113 vector_proxy = VikingDBManagerProxy (self .vector_store , ctx )
118114
119115 target_dirs = [d for d in (query .target_directories or []) if d ]
120- query_tags = expand_query_tags (query .tags )
121116
122117 if not await vector_proxy .collection_exists_bound ():
123118 logger .warning (
@@ -151,7 +146,6 @@ async def retrieve(
151146 sparse_query_vector = sparse_query_vector ,
152147 context_type = query .context_type .value if query .context_type else None ,
153148 target_dirs = target_dirs ,
154- tags = query_tags ,
155149 scope_dsl = scope_dsl ,
156150 limit = max (limit , self .GLOBAL_SEARCH_TOPK ),
157151 )
@@ -173,22 +167,11 @@ async def retrieve(
173167 f" [{ i } ] URI: { uri } , score: { score :.4f} , level: { level } , account_id: { account_id } "
174168 )
175169
176- expanded_points , expanded_candidates = await self ._expand_starting_points_by_tags (
177- vector_proxy = vector_proxy ,
178- global_results = global_results ,
179- explicit_tags = query_tags ,
180- context_type = query .context_type .value if query .context_type else None ,
181- target_dirs = target_dirs ,
182- scope_dsl = scope_dsl ,
183- limit = self .TAG_EXPANSION_LIMIT ,
184- )
185-
186170 # Step 3: Merge starting points
187171 starting_points = self ._merge_starting_points (
188172 query .query ,
189173 root_uris ,
190174 global_results ,
191- extra_points = expanded_points ,
192175 mode = mode ,
193176 )
194177
@@ -200,10 +183,6 @@ async def retrieve(
200183 initial_candidates ,
201184 mode = mode ,
202185 )
203- initial_candidates = self ._merge_initial_candidates (
204- initial_candidates ,
205- expanded_candidates ,
206- )
207186
208187 # Step 4: Recursive search
209188 candidates = await self ._recursive_search (
@@ -250,7 +229,6 @@ async def _global_vector_search(
250229 sparse_query_vector : Optional [Dict [str , float ]],
251230 context_type : Optional [str ],
252231 target_dirs : List [str ],
253- tags : List [str ],
254232 scope_dsl : Optional [Dict [str , Any ]],
255233 limit : int ,
256234 ) -> List [Dict [str , Any ]]:
@@ -260,7 +238,6 @@ async def _global_vector_search(
260238 sparse_query_vector = sparse_query_vector ,
261239 context_type = context_type ,
262240 target_directories = target_dirs ,
263- tags = tags ,
264241 extra_filter = scope_dsl ,
265242 limit = limit ,
266243 )
@@ -307,7 +284,6 @@ def _merge_starting_points(
307284 query : str ,
308285 root_uris : List [str ],
309286 global_results : List [Dict [str , Any ]],
310- extra_points : Optional [List [Tuple [str , float ]]] = None ,
311287 mode : str = "thinking" ,
312288 ) -> List [Tuple [str , float ]]:
313289 """Merge starting points.
@@ -344,111 +320,8 @@ def _merge_starting_points(
344320 points .append ((uri , 0.0 ))
345321 seen .add (uri )
346322
347- for uri , score in extra_points or []:
348- if uri not in seen :
349- points .append ((uri , score ))
350- seen .add (uri )
351-
352323 return points
353324
354- def _merge_initial_candidates (
355- self ,
356- * candidate_groups : Optional [List [Dict [str , Any ]]],
357- ) -> List [Dict [str , Any ]]:
358- merged : Dict [str , Dict [str , Any ]] = {}
359- for group in candidate_groups :
360- for candidate in group or []:
361- uri = candidate .get ("uri" , "" )
362- if not uri :
363- continue
364- previous = merged .get (uri )
365- if previous is None or candidate .get ("_score" , 0.0 ) > previous .get ("_score" , 0.0 ):
366- merged [uri ] = candidate
367- return sorted (merged .values (), key = lambda item : item .get ("_score" , 0.0 ), reverse = True )
368-
369- async def _expand_starting_points_by_tags (
370- self ,
371- vector_proxy : VikingDBManagerProxy ,
372- global_results : List [Dict [str , Any ]],
373- explicit_tags : List [str ],
374- context_type : Optional [str ],
375- target_dirs : List [str ],
376- scope_dsl : Optional [Dict [str , Any ]],
377- limit : int ,
378- ) -> Tuple [List [Tuple [str , float ]], List [Dict [str , Any ]]]:
379- expansion_tags = self ._collect_expansion_tags (global_results , explicit_tags )
380- if not expansion_tags or limit <= 0 :
381- return [], []
382-
383- tag_matches = await vector_proxy .search_by_tags_in_tenant (
384- tags = expansion_tags ,
385- context_type = context_type ,
386- target_directories = target_dirs ,
387- extra_filter = scope_dsl ,
388- levels = [0 , 1 , 2 ],
389- limit = limit ,
390- )
391- telemetry = get_current_telemetry ()
392- telemetry .count ("retrieval.tag_expansion.tags" , len (expansion_tags ))
393- telemetry .count ("retrieval.tag_expansion.matches" , len (tag_matches ))
394-
395- seen_uris = {result .get ("uri" , "" ) for result in global_results }
396- expansion_points : Dict [str , float ] = {}
397- expansion_candidates : Dict [str , Dict [str , Any ]] = {}
398- expansion_tag_set = set (expansion_tags )
399-
400- for match in tag_matches :
401- uri = match .get ("uri" , "" )
402- if not uri or uri in seen_uris :
403- continue
404-
405- overlap = expansion_tag_set .intersection (parse_tags (match .get ("tags" )))
406- score = self ._score_tag_expansion (len (overlap ))
407-
408- if match .get ("level" , 2 ) == 2 :
409- candidate = dict (match )
410- candidate ["_score" ] = score
411- previous = expansion_candidates .get (uri )
412- if previous is None or score > previous .get ("_score" , 0.0 ):
413- expansion_candidates [uri ] = candidate
414-
415- start_uri = self ._start_uri_from_record (match )
416- if start_uri and score > expansion_points .get (start_uri , 0.0 ):
417- expansion_points [start_uri ] = score
418-
419- return list (expansion_points .items ()), list (expansion_candidates .values ())
420-
421- def _collect_expansion_tags (
422- self ,
423- global_results : List [Dict [str , Any ]],
424- explicit_tags : List [str ],
425- ) -> List [str ]:
426- collected = [explicit_tags ]
427- for result in global_results :
428- collected .append (parse_tags (result .get ("tags" )))
429- return merge_tags (* collected , max_tags = self .MAX_TAG_EXPANSION_TAGS )
430-
431- def _score_tag_expansion (self , overlap_count : int ) -> float :
432- if overlap_count <= 1 :
433- return self .TAG_EXPANSION_SCORE
434- return self .TAG_EXPANSION_SCORE * (1.0 + 0.2 * min (overlap_count - 1 , 3 ))
435-
436- def _start_uri_from_record (self , record : Dict [str , Any ]) -> str :
437- uri = record .get ("uri" , "" )
438- if not uri :
439- return ""
440- if record .get ("level" , 2 ) != 2 :
441- return uri
442-
443- parent_uri = record .get ("parent_uri" )
444- if parent_uri :
445- return parent_uri
446-
447- normalized = uri .rstrip ("/" )
448- if "/" not in normalized :
449- return ""
450- return normalized .rsplit ("/" , 1 )[0 ]
451-
452325 def _prepare_initial_candidates (
453326 self ,
454327 query : str ,
0 commit comments