55
66from knowcode .knowledge_store import KnowledgeStore
77from knowcode .models import Entity , EntityKind
8+ from knowcode .token_counter import TokenCounter
89
910
1011@dataclass
@@ -15,27 +16,31 @@ class ContextBundle:
1516 context_text : str
1617 included_entities : list [str ]
1718 total_chars : int
19+ total_tokens : int
1820 truncated : bool
1921
2022
2123class ContextSynthesizer :
2224 """Synthesizes context bundles for entities."""
2325
24- DEFAULT_MAX_CHARS = 8000 # Rough proxy for ~2K tokens
26+ DEFAULT_MAX_TOKENS = 2000
2527
2628 def __init__ (
2729 self ,
2830 store : KnowledgeStore ,
29- max_chars : int = DEFAULT_MAX_CHARS ,
31+ max_tokens : int = DEFAULT_MAX_TOKENS ,
32+ model : str = "gpt-4" ,
3033 ) -> None :
3134 """Initialize context synthesizer.
3235
3336 Args:
3437 store: Knowledge store to query.
35- max_chars: Maximum characters in context bundle.
38+ max_tokens: Maximum tokens in context bundle.
39+ model: Model name for token counting.
3640 """
3741 self .store = store
38- self .max_chars = max_chars
42+ self .max_tokens = max_tokens
43+ self .tokenizer = TokenCounter (model )
3944
4045 def synthesize (self , entity_id : str ) -> Optional [ContextBundle ]:
4146 """Synthesize context bundle for an entity.
@@ -52,80 +57,114 @@ def synthesize(self, entity_id: str) -> Optional[ContextBundle]:
5257
5358 sections : list [str ] = []
5459 included : list [str ] = [entity_id ]
55- truncated = False
56-
57- # Section 1: Entity header
58- sections .append (self ._format_entity_header (entity ))
59-
60- # Section 2: Docstring/description
60+
61+ # Sections are appended in priority order, which is also the order they appear in the output.
62+ # The header is always included; every later section is added only if token budget remains for it.
63+
64+ # Priority 1: Entity Core (Header, Signature, Description)
65+ header = self ._format_entity_header (entity )
66+ current_tokens = self .tokenizer .count_tokens (header )
67+ sections .append (header )
68+
69+ desc = ""
6170 if entity .docstring :
62- sections . append ( f"## Description\n \n { entity .docstring } " )
63-
64- # Section 3: Signature (for functions/methods)
71+ desc = f"## Description\n \n { entity .docstring } "
72+
73+ sig = ""
6574 if entity .signature :
66- sections .append (f"## Signature\n \n ```python\n { entity .signature } \n ```" )
67-
68- # Section 4: Source code (if available and fits)
75+ sig = f"## Signature\n \n ```python\n { entity .signature } \n ```"
76+
77+ # Add high priority sections if they fit
78+ if desc :
79+ t = self .tokenizer .count_tokens (desc )
80+ if current_tokens + t < self .max_tokens :
81+ sections .append (desc )
82+ current_tokens += t
83+
84+ if sig :
85+ t = self .tokenizer .count_tokens (sig )
86+ if current_tokens + t < self .max_tokens :
87+ sections .append (sig )
88+ current_tokens += t
89+
90+ # Priority 2: Source Code (Huge consumer, often truncated)
6991 if entity .source_code :
70- code_section = f"## Source Code\n \n ```python\n { entity .source_code } \n ```"
71- if self ._would_fit (sections , code_section ):
72- sections .append (code_section )
73-
74- # Section 5: Parent context
92+ code_header = "## Source Code\n \n ```python\n "
93+ code_footer = "\n ```"
94+ overhead = self .tokenizer .count_tokens (code_header + code_footer )
95+ remaining = self .max_tokens - current_tokens - overhead
96+
97+ if remaining > 100 : # Only add if we have decent space
98+ code_body = entity .source_code
99+ code_tokens = self .tokenizer .count_tokens (code_body )
100+
101+ if code_tokens > remaining :
102+ code_body = self .tokenizer .truncate (code_body , remaining ) + "\n # ... (truncated)"
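103+ # The source code was cut to fit, but no truncation flag is set at this point.
104+ # The bundle is reported as truncated only if a later check trips, e.g. current_tokens >= self.max_tokens at return.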
105+
106+ sections .append (f"{ code_header } { code_body } { code_footer } " )
107+ current_tokens += self .tokenizer .count_tokens (sections [- 1 ])
108+ else :
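109+ # Source code is left out entirely when 100 or fewer tokens remain for it.
110+ # The omission is reported as truncation by the "## Source Code" check made after the sections are joined.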
111+ pass
112+
113+ # Priority 3: Parent Context
75114 parent = self .store .get_parent (entity_id )
76115 if parent :
77116 parent_section = self ._format_parent_context (parent )
78- if self ._would_fit (sections , parent_section ):
117+ t = self .tokenizer .count_tokens (parent_section )
118+ if current_tokens + t < self .max_tokens :
79119 sections .append (parent_section )
80120 included .append (parent .id )
81-
82- # Section 6: Callers (who calls this?)
121+ current_tokens += t
122+
123+ # Priority 4: Relationships (Callers, Callees, Children)
124+ # Each one is added if it still fits the budget; one that does not fit is skipped and marks the bundle as truncated
125+
126+ # Unified list of potential sections
127+ rel_sections = []
128+
83129 callers = self .store .get_callers (entity_id )
84130 if callers :
85- callers_section = self ._format_callers (callers )
86- if self ._would_fit (sections , callers_section ):
87- sections .append (callers_section )
88- included .extend (c .id for c in callers )
131+ rel_sections .append ((self ._format_callers (callers ), [c .id for c in callers ]))
89132
90- # Section 7: Callees (what does this call?)
91133 callees = self .store .get_callees (entity_id )
92134 if callees :
93- callees_section = self ._format_callees (callees )
94- if self ._would_fit (sections , callees_section ):
95- sections .append (callees_section )
96- included .extend (c .id for c in callees )
97-
98- # Section 8: Children (for classes/modules)
135+ rel_sections .append ((self ._format_callees (callees ), [c .id for c in callees ]))
136+
99137 if entity .kind in {EntityKind .CLASS , EntityKind .MODULE , EntityKind .DOCUMENT }:
100138 children = self .store .get_children (entity_id )
101139 if children :
102- children_section = self ._format_children (children )
103- if self ._would_fit (sections , children_section ):
104- sections .append (children_section )
105- included .extend (c .id for c in children )
140+ rel_sections .append ((self ._format_children (children ), [c .id for c in children ]))
141+
142+ is_truncated = False
143+
144+ for text , ids in rel_sections :
145+ t = self .tokenizer .count_tokens (text )
146+ if current_tokens + t < self .max_tokens :
147+ sections .append (text )
148+ included .extend (ids )
149+ current_tokens += t
150+ else :
151+ is_truncated = True
106152
107- # Build final context
108153 context_text = "\n \n ---\n \n " .join (sections )
109-
110- # Final truncation if still too long
111- if len (context_text ) > self .max_chars :
112- context_text = context_text [: self .max_chars - 20 ] + "\n \n [TRUNCATED]"
113- truncated = True
154+
155+ # Check if we skipped source code but had it
156+ if entity .source_code and "## Source Code" not in context_text :
157+ is_truncated = True
114158
115159 return ContextBundle (
116160 target_entity = entity ,
117161 context_text = context_text ,
118162 included_entities = included ,
119163 total_chars = len (context_text ),
120- truncated = truncated ,
164+ total_tokens = current_tokens ,
165+ truncated = is_truncated or (current_tokens >= self .max_tokens ),
121166 )
122167
123- def _would_fit (self , current_sections : list [str ], new_section : str ) -> bool :
124- """Check if adding a section would stay within budget."""
125- current_len = sum (len (s ) for s in current_sections )
126- new_len = current_len + len (new_section ) + 10 # +10 for separators
127- return new_len < self .max_chars
128-
129168 def _format_entity_header (self , entity : Entity ) -> str :
130169 """Format entity header."""
131170 lines = [
0 commit comments