Skip to content

Commit 771c271

Browse files
authored
refactor: remove max_tokens parameter from LLM configurations and update related logic (#997)
- Removed max_tokens from LLM input and output schemas across multiple YAML files.
- Updated LLM service logic to dynamically calculate context length based on the model's max_tokens.
- Adjusted frontend components to reflect the removal of the max_tokens input.
- Updated test data to align with the new model configurations.
1 parent ae1efb6 commit 771c271

11 files changed

Lines changed: 70 additions & 82 deletions

File tree

aperag/flow/examples/rag_flow.yaml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -222,13 +222,6 @@ nodes:
222222
minimum: 0
223223
maximum: 1
224224
description: Sampling temperature
225-
max_tokens:
226-
value: 1000
227-
type: integer
228-
default: 1000
229-
minimum: 1
230-
maximum: 128000
231-
description: Max tokens for generation
232225
query:
233226
value: {{ .nodes.start.output.query }}
234227
type: string
@@ -244,7 +237,6 @@ nodes:
244237
- model_name
245238
- prompt_template
246239
- temperature
247-
- max_tokens
248240
- query
249241
- docs
250242
output:

aperag/flow/examples/rag_flow2.yaml

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -295,12 +295,6 @@ nodes:
295295
minimum: 0
296296
maximum: 1
297297
description: Sampling temperature
298-
max_tokens:
299-
type: integer
300-
default: 1000
301-
minimum: 1
302-
maximum: 128000
303-
description: Max tokens for generation
304298
query:
305299
type: string
306300
description: User's question or query
@@ -309,14 +303,13 @@ nodes:
309303
description: Docs for LLM context
310304
items:
311305
$ref: '#/schema/document_with_score'
312-
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query, docs]
306+
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query, docs]
313307
values:
314308
model_service_provider: openrouter
315309
model_name: deepseek/deepseek-v3-base:free
316310
custom_llm_provider: openrouter
317311
prompt_template: "{context}\n{query}"
318312
temperature: 0.7
319-
max_tokens: 1000
320313
query: "{{ nodes.start.output.query }}"
321314
docs: "{{ nodes.rerank_5c7e1b2a.output.docs }}"
322315
output:

aperag/flow/runners/llm.py

Lines changed: 63 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from litellm import BaseModel
2121
from pydantic import Field
2222

23+
from aperag.db.models import APIType
2324
from aperag.db.ops import async_db_ops
2425
from aperag.flow.base.models import BaseNodeRunner, SystemInput, register_node_runner
2526
from aperag.llm.completion.completion_service import CompletionService
@@ -29,7 +30,15 @@
2930
from aperag.utils.history import BaseChatMessageHistory
3031
from aperag.utils.utils import now_unix_milliseconds
3132

32-
MAX_CONTEXT_LENGTH = 100000
33+
# Character to token estimation ratio for Chinese/mixed content
34+
# Conservative estimate: 2 characters = 1 token
35+
CHAR_TO_TOKEN_RATIO = 2.0
36+
37+
# Reserve tokens for output generation (default 1000 tokens)
38+
DEFAULT_OUTPUT_TOKENS = 1000
39+
40+
# Fallback max context length if model max_tokens is not available
41+
FALLBACK_MAX_CONTEXT_LENGTH = 50000
3342

3443

3544
class Message(BaseModel):
@@ -81,14 +90,39 @@ class LLMInput(BaseModel):
8190
custom_llm_provider: str = Field(..., description="Custom LLM provider")
8291
prompt_template: str = Field(..., description="Prompt template")
8392
temperature: float = Field(..., description="Sampling temperature")
84-
max_tokens: int = Field(..., description="Max tokens for generation")
8593
docs: Optional[List[DocumentWithScore]] = Field(None, description="Documents")
8694

8795

8896
class LLMOutput(BaseModel):
8997
text: str
9098

9199

100+
def estimate_token_count(text: str) -> int:
    """
    Estimate the token count of ``text`` from its character count.

    The estimate divides the character count by the module-level
    CHAR_TO_TOKEN_RATIO (characters per token) and rounds UP, so any
    non-empty text counts as at least one token. Rounding down would
    report 0 tokens for a one-character text and under-count every text
    whose length is not a multiple of the ratio.

    Args:
        text: Text to measure.

    Returns:
        Estimated number of tokens; 0 for empty input.
    """
    import math

    # Empty input needs no estimation.
    if not text:
        return 0

    # Round up: a partial token still consumes a whole token of budget.
    return math.ceil(len(text) / CHAR_TO_TOKEN_RATIO)
106+
107+
108+
def calculate_max_context_length(model_max_tokens: Optional[int], output_tokens: int = DEFAULT_OUTPUT_TOKENS) -> int:
    """
    Calculate the maximum context length, in characters, for a model.

    Reserves ``output_tokens`` of the model's token limit for generation and
    converts the remaining token budget to a character count using the
    module-level CHAR_TO_TOKEN_RATIO.

    Args:
        model_max_tokens: The model's token limit. ``None``, zero, or a
            negative value is treated as "unknown".
        output_tokens: Number of tokens to reserve for the generated output.

    Returns:
        The context budget in characters. When the model limit is unknown,
        FALLBACK_MAX_CONTEXT_LENGTH is returned unchanged.
    """
    # Unknown or nonsensical limits cannot be budgeted against.
    if model_max_tokens is None or model_max_tokens <= 0:
        return FALLBACK_MAX_CONTEXT_LENGTH

    # Reserve tokens for output; the remainder is available for context.
    max_context_tokens = model_max_tokens - output_tokens
    if max_context_tokens <= 0:
        # The model limit is too small to reserve the full output budget:
        # use half the limit with a floor of 100 tokens, but never more than
        # the model can accept in total.
        max_context_tokens = min(max(model_max_tokens // 2, 100), model_max_tokens)

    # Convert the token budget to a character count.
    return int(max_context_tokens * CHAR_TO_TOKEN_RATIO)
124+
125+
92126
# Database operations interface
93127
class LLMRepository:
94128
"""Repository interface for LLM database operations"""
@@ -114,7 +148,6 @@ async def generate_response(
114148
custom_llm_provider: str,
115149
prompt_template: str,
116150
temperature: float,
117-
max_tokens: int,
118151
docs: Optional[List[DocumentWithScore]] = None,
119152
) -> Tuple[str, Dict]:
120153
"""Generate LLM response with given parameters"""
@@ -130,23 +163,43 @@ async def generate_response(
130163
except Exception:
131164
raise Exception(f"LLMProvider {model_service_provider} not found")
132165

166+
# Get model configuration to determine max_tokens
167+
try:
168+
model_config = await async_db_ops.query_llm_provider_model(
169+
provider_name=model_service_provider,
170+
api=APIType.COMPLETION.value,
171+
model=model_name
172+
)
173+
model_max_tokens = model_config.max_tokens if model_config else None
174+
except Exception:
175+
model_max_tokens = None
176+
177+
# Calculate dynamic context length based on model's max_tokens
178+
max_context_length = calculate_max_context_length(model_max_tokens)
179+
133180
# Build context and references from documents
134181
context = ""
135182
references = []
136183
if docs:
137184
for doc in docs:
138-
if len(context) + len(doc.text) > MAX_CONTEXT_LENGTH:
185+
if len(context) + len(doc.text) > max_context_length:
139186
break
140187
context += doc.text
141188
references.append({"text": doc.text, "metadata": doc.metadata, "score": doc.score})
142189

143190
prompt = prompt_template.format(query=query, context=context)
144-
output_max_tokens = max_tokens - len(prompt)
145-
146-
if output_max_tokens < 0:
147-
raise Exception(
148-
"max_tokens %d is too small to hold the prompt which size is %d" % (max_tokens, len(prompt))
149-
)
191+
192+
# Estimate prompt tokens and calculate output tokens
193+
prompt_tokens = estimate_token_count(prompt)
194+
if model_max_tokens:
195+
output_max_tokens = model_max_tokens - prompt_tokens
196+
if output_max_tokens < 100: # Ensure minimum output tokens
197+
raise Exception(
198+
f"Model max_tokens {model_max_tokens} is too small to hold the prompt which requires approximately {prompt_tokens} tokens"
199+
)
200+
else:
201+
# Use default output tokens if model max_tokens is unknown
202+
output_max_tokens = DEFAULT_OUTPUT_TOKENS
150203

151204
cs = CompletionService(custom_llm_provider, model_name, base_url, api_key, temperature, output_max_tokens)
152205

@@ -193,7 +246,6 @@ async def run(self, ui: LLMInput, si: SystemInput) -> Tuple[LLMOutput, dict]:
193246
custom_llm_provider=ui.custom_llm_provider,
194247
prompt_template=ui.prompt_template,
195248
temperature=ui.temperature,
196-
max_tokens=ui.max_tokens,
197249
docs=ui.docs,
198250
)
199251

aperag/llm/completion/completion_service.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ async def _acompletion_non_stream(self, history, prompt, memory=False) -> str:
8282
base_url=self.base_url,
8383
api_key=self.api_key,
8484
temperature=self.temperature,
85-
max_tokens=self.max_tokens,
85+
# max_tokens=self.max_tokens,
8686
messages=messages,
8787
stream=False,
8888
caching=self.caching,
@@ -109,7 +109,7 @@ async def _acompletion_stream_raw(self, history, prompt, memory=False) -> AsyncG
109109
base_url=self.base_url,
110110
api_key=self.api_key,
111111
temperature=self.temperature,
112-
max_tokens=self.max_tokens,
112+
# max_tokens=self.max_tokens,
113113
messages=messages,
114114
stream=True,
115115
caching=self.caching,
@@ -149,7 +149,7 @@ def _completion_core(self, history, prompt, memory=False) -> str:
149149
base_url=self.base_url,
150150
api_key=self.api_key,
151151
temperature=self.temperature,
152-
max_tokens=self.max_tokens,
152+
# max_tokens=self.max_tokens,
153153
messages=messages,
154154
stream=False,
155155
caching=self.caching,

frontend/src/locales/en-US.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,6 @@ export default {
278278
'flow.temperature': 'Temperature',
279279
'flow.temperature.tips':
280280
"This parameter controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses.",
281-
'flow.max_tokens': 'Max tokens',
282281
'flow.merge.merge_strategy': 'Strategy',
283282
'flow.merge.deduplicate': 'Deduplicate',
284283
'flow.merge.deduplicate.tips':

frontend/src/locales/zh-CN.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,6 @@ export default {
276276
'flow.temperature': '温度',
277277
'flow.temperature.tips':
278278
'该参数控制模型预测的随机性。 较低的温度使模型对其响应更有信心,而较高的温度则使其更具创造性和多样性。',
279-
'flow.max_tokens': '最大token数',
280279
'flow.merge.merge_strategy': '策略',
281280
'flow.merge.deduplicate': '是否去重',
282281
'flow.merge.deduplicate.tips':

frontend/src/pages/bots/$botId/flow/_nodes/_node_llm.tsx

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -114,23 +114,6 @@ export const ApeNodeLlm = ({ node }: { node: ApeNode }) => {
114114
}}
115115
/>
116116
</Form.Item>
117-
<Form.Item
118-
required
119-
label={formatMessage({ id: 'flow.max_tokens' })}
120-
>
121-
<InputNumber
122-
min={_.get(schema, 'properties.max_tokens.minimum')}
123-
max={_.get(schema, 'properties.max_tokens.maximum')}
124-
step={10}
125-
variant="filled"
126-
style={{ width: '100%' }}
127-
value={_.get(values, 'max_tokens')}
128-
onChange={(value) => {
129-
_.set(values, 'max_tokens', value);
130-
applyChanges();
131-
}}
132-
/>
133-
</Form.Item>
134117
<Form.Item label={formatMessage({ id: 'flow.input.source' })}>
135118
<NodeInput
136119
variant="filled"

frontend/src/pages/bots/$botId/flow/utils.ts

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -439,13 +439,6 @@ export const nodeLlmDefinition = (params?: {
439439
maximum: 1,
440440
description: 'Sampling temperature',
441441
},
442-
max_tokens: {
443-
type: 'integer',
444-
default: 1000,
445-
minimum: 1,
446-
maximum: 128000,
447-
description: 'Max tokens for generation',
448-
},
449442
query: {
450443
type: 'string',
451444
description: "User's question or query",
@@ -464,7 +457,6 @@ export const nodeLlmDefinition = (params?: {
464457
'custom_llm_provider',
465458
'prompt_template',
466459
'temperature',
467-
'max_tokens',
468460
'query',
469461
'docs',
470462
],
@@ -476,7 +468,6 @@ export const nodeLlmDefinition = (params?: {
476468
prompt_template:
477469
params?.botType === 'knowledge' ? '{context}\n{query}' : '{query}',
478470
temperature: 0.7,
479-
max_tokens: 1000,
480471
query: params?.startId
481472
? `{{ nodes.${params.startId}.output.query }}`
482473
: '',

tests/e2e_test/testdata/basic-flow.yaml

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -84,23 +84,16 @@ nodes:
8484
minimum: 0
8585
maximum: 1
8686
description: Sampling temperature
87-
max_tokens:
88-
type: integer
89-
default: 1000
90-
minimum: 1
91-
maximum: 128000
92-
description: Max tokens for generation
9387
query:
9488
type: string
9589
description: User's question or query
96-
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query]
90+
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query]
9791
values:
9892
model_service_provider: openrouter
9993
model_name: deepseek/deepseek-v3-base:free
10094
custom_llm_provider: openrouter
10195
prompt_template: "{query}"
10296
temperature: 0.7
103-
max_tokens: 1000
10497
query: "{{ nodes.start.output.query }}"
10598
output:
10699
schema:

tests/e2e_test/testdata/rag-flow.yaml

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -295,12 +295,6 @@ nodes:
295295
minimum: 0
296296
maximum: 1
297297
description: Sampling temperature
298-
max_tokens:
299-
type: integer
300-
default: 1000
301-
minimum: 1
302-
maximum: 128000
303-
description: Max tokens for generation
304298
query:
305299
type: string
306300
description: User's question or query
@@ -309,14 +303,13 @@ nodes:
309303
description: Docs for LLM context
310304
items:
311305
$ref: '#/schema/document_with_score'
312-
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query, docs]
306+
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query, docs]
313307
values:
314308
model_service_provider: openrouter
315309
model_name: deepseek/deepseek-v3-base:free
316310
custom_llm_provider: openrouter
317311
prompt_template: "{context}\n{query}"
318312
temperature: 0.7
319-
max_tokens: 1000
320313
query: "{{ nodes.start.output.query }}"
321314
docs: "{{ nodes.rerank_5c7e1b2a.output.docs }}"
322315
output:

0 commit comments

Comments
 (0)