refactor: remove max_tokens parameter from LLM configurations and update related logic (#997)

iziang · web-flow · commit 771c271621ea · 2025-06-27T14:33:37.000+08:00
- Removed max_tokens from the LLM node input schemas and default values across multiple YAML files.
- Updated LLM service logic to dynamically calculate the context length (in characters) from the model's configured max_tokens, with a fixed fallback when it is unavailable.
- Adjusted frontend components to reflect the removal of max_tokens input.
- Updated test data to align with the new model configurations.
diff --git a/aperag/flow/examples/rag_flow.yaml b/aperag/flow/examples/rag_flow.yaml
@@ -222,13 +222,6 @@ nodes:
             minimum: 0
             maximum: 1
             description: Sampling temperature
-          max_tokens:
-            value: 1000
-            type: integer
-            default: 1000
-            minimum: 1
-            maximum: 128000
-            description: Max tokens for generation
           query:
             value: {{ .nodes.start.output.query }}
             type: string
@@ -244,7 +237,6 @@ nodes:
           - model_name
           - prompt_template
           - temperature
-          - max_tokens
           - query
           - docs
       output:
diff --git a/aperag/flow/examples/rag_flow2.yaml b/aperag/flow/examples/rag_flow2.yaml
@@ -295,12 +295,6 @@ nodes:
               minimum: 0
               maximum: 1
               description: Sampling temperature
-            max_tokens:
-              type: integer
-              default: 1000
-              minimum: 1
-              maximum: 128000
-              description: Max tokens for generation
             query:
               type: string
               description: User's question or query
@@ -309,14 +303,13 @@ nodes:
               description: Docs for LLM context
               items:
                 $ref: '#/schema/document_with_score'
-          required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query, docs]
+          required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query, docs]
         values:
           model_service_provider: openrouter
           model_name: deepseek/deepseek-v3-base:free
           custom_llm_provider: openrouter
           prompt_template: "{context}\n{query}"
           temperature: 0.7
-          max_tokens: 1000
           query: "{{ nodes.start.output.query }}"
           docs: "{{ nodes.rerank_5c7e1b2a.output.docs }}"
       output:
diff --git a/aperag/flow/runners/llm.py b/aperag/flow/runners/llm.py
@@ -20,6 +20,7 @@
 from litellm import BaseModel
 from pydantic import Field
 
+from aperag.db.models import APIType
 from aperag.db.ops import async_db_ops
 from aperag.flow.base.models import BaseNodeRunner, SystemInput, register_node_runner
 from aperag.llm.completion.completion_service import CompletionService
@@ -29,7 +30,15 @@
 from aperag.utils.history import BaseChatMessageHistory
 from aperag.utils.utils import now_unix_milliseconds
 
-MAX_CONTEXT_LENGTH = 100000
+# Characters-per-token ratio used to estimate token counts for Chinese/mixed content.
+# Conservative estimate: 2 characters = 1 token.
+CHAR_TO_TOKEN_RATIO = 2.0
+
+# Tokens reserved for output generation when sizing the context (default 1000 tokens).
+DEFAULT_OUTPUT_TOKENS = 1000
+
+# Fallback max context length, in characters, if model max_tokens is not available.
+FALLBACK_MAX_CONTEXT_LENGTH = 50000
 
 
 class Message(BaseModel):
@@ -81,14 +90,39 @@ class LLMInput(BaseModel):
     custom_llm_provider: str = Field(..., description="Custom LLM provider")
     prompt_template: str = Field(..., description="Prompt template")
     temperature: float = Field(..., description="Sampling temperature")
-    max_tokens: int = Field(..., description="Max tokens for generation")
     docs: Optional[List[DocumentWithScore]] = Field(None, description="Documents")
 
 
 class LLMOutput(BaseModel):
     text: str
 
 
+def estimate_token_count(text: str) -> int:
+    """
+    Estimate the token count of text from its character count (Chinese/mixed content).
+    Divides len(text) by CHAR_TO_TOKEN_RATIO and truncates the result to an int.
+    """
+    return int(len(text) / CHAR_TO_TOKEN_RATIO)
+
+
+def calculate_max_context_length(model_max_tokens: Optional[int], output_tokens: int = DEFAULT_OUTPUT_TOKENS) -> int:
+    """
+    Calculate the maximum context length, in characters, from the model's max_tokens limit.
+    Reserves output_tokens for generation; returns FALLBACK_MAX_CONTEXT_LENGTH if the limit is None or 0.
+    """
+    if not model_max_tokens:
+        return FALLBACK_MAX_CONTEXT_LENGTH
+    
+    # Reserve tokens for output generation; the remainder is the context budget in tokens
+    max_context_tokens = model_max_tokens - output_tokens
+    if max_context_tokens <= 0:
+        # Limit does not exceed the reservation: use half the limit, but at least 100 tokens
+        max_context_tokens = max(model_max_tokens // 2, 100)
+    
+    # Convert the token budget to a character count using CHAR_TO_TOKEN_RATIO
+    return int(max_context_tokens * CHAR_TO_TOKEN_RATIO)
+
+
 # Database operations interface
 class LLMRepository:
     """Repository interface for LLM database operations"""
@@ -114,7 +148,6 @@ async def generate_response(
         custom_llm_provider: str,
         prompt_template: str,
         temperature: float,
-        max_tokens: int,
         docs: Optional[List[DocumentWithScore]] = None,
     ) -> Tuple[str, Dict]:
         """Generate LLM response with given parameters"""
@@ -130,23 +163,43 @@ async def generate_response(
         except Exception:
             raise Exception(f"LLMProvider {model_service_provider} not found")
 
+        # Get model configuration to determine max_tokens
+        try:
+            model_config = await async_db_ops.query_llm_provider_model(
+                provider_name=model_service_provider,
+                api=APIType.COMPLETION.value,
+                model=model_name
+            )
+            model_max_tokens = model_config.max_tokens if model_config else None
+        except Exception:
+            model_max_tokens = None
+
+        # Calculate dynamic context length based on model's max_tokens
+        max_context_length = calculate_max_context_length(model_max_tokens)
+
         # Build context and references from documents
         context = ""
         references = []
         if docs:
             for doc in docs:
-                if len(context) + len(doc.text) > MAX_CONTEXT_LENGTH:
+                if len(context) + len(doc.text) > max_context_length:
                     break
                 context += doc.text
                 references.append({"text": doc.text, "metadata": doc.metadata, "score": doc.score})
 
         prompt = prompt_template.format(query=query, context=context)
-        output_max_tokens = max_tokens - len(prompt)
-
-        if output_max_tokens < 0:
-            raise Exception(
-                "max_tokens %d is too small to hold the prompt which size is %d" % (max_tokens, len(prompt))
-            )
+        
+        # Estimate prompt tokens and calculate output tokens
+        prompt_tokens = estimate_token_count(prompt)
+        if model_max_tokens:
+            output_max_tokens = model_max_tokens - prompt_tokens
+            if output_max_tokens < 100:  # Ensure minimum output tokens
+                raise Exception(
+                    f"Model max_tokens {model_max_tokens} is too small to hold the prompt which requires approximately {prompt_tokens} tokens"
+                )
+        else:
+            # Use default output tokens if model max_tokens is unknown
+            output_max_tokens = DEFAULT_OUTPUT_TOKENS
 
         cs = CompletionService(custom_llm_provider, model_name, base_url, api_key, temperature, output_max_tokens)
 
@@ -193,7 +246,6 @@ async def run(self, ui: LLMInput, si: SystemInput) -> Tuple[LLMOutput, dict]:
             custom_llm_provider=ui.custom_llm_provider,
             prompt_template=ui.prompt_template,
             temperature=ui.temperature,
-            max_tokens=ui.max_tokens,
             docs=ui.docs,
         )
 
diff --git a/aperag/llm/completion/completion_service.py b/aperag/llm/completion/completion_service.py
@@ -82,7 +82,7 @@ async def _acompletion_non_stream(self, history, prompt, memory=False) -> str:
                 base_url=self.base_url,
                 api_key=self.api_key,
                 temperature=self.temperature,
-                max_tokens=self.max_tokens,
+                # max_tokens=self.max_tokens,
                 messages=messages,
                 stream=False,
                 caching=self.caching,
@@ -109,7 +109,7 @@ async def _acompletion_stream_raw(self, history, prompt, memory=False) -> AsyncG
                 base_url=self.base_url,
                 api_key=self.api_key,
                 temperature=self.temperature,
-                max_tokens=self.max_tokens,
+                # max_tokens=self.max_tokens,
                 messages=messages,
                 stream=True,
                 caching=self.caching,
@@ -149,7 +149,7 @@ def _completion_core(self, history, prompt, memory=False) -> str:
                 base_url=self.base_url,
                 api_key=self.api_key,
                 temperature=self.temperature,
-                max_tokens=self.max_tokens,
+                # max_tokens=self.max_tokens,
                 messages=messages,
                 stream=False,
                 caching=self.caching,
diff --git a/frontend/src/locales/en-US.ts b/frontend/src/locales/en-US.ts
@@ -278,7 +278,6 @@ export default {
   'flow.temperature': 'Temperature',
   'flow.temperature.tips':
     "This parameter controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses.",
-  'flow.max_tokens': 'Max tokens',
   'flow.merge.merge_strategy': 'Strategy',
   'flow.merge.deduplicate': 'Deduplicate',
   'flow.merge.deduplicate.tips':
diff --git a/frontend/src/locales/zh-CN.ts b/frontend/src/locales/zh-CN.ts
@@ -276,7 +276,6 @@ export default {
   'flow.temperature': '温度',
   'flow.temperature.tips':
     '该参数控制模型预测的随机性。 较低的温度使模型对其响应更有信心，而较高的温度则使其更具创造性和多样性。',
-  'flow.max_tokens': '最大token数',
   'flow.merge.merge_strategy': '策略',
   'flow.merge.deduplicate': '是否去重',
   'flow.merge.deduplicate.tips':
diff --git a/frontend/src/pages/bots/$botId/flow/_nodes/_node_llm.tsx b/frontend/src/pages/bots/$botId/flow/_nodes/_node_llm.tsx
@@ -114,23 +114,6 @@ export const ApeNodeLlm = ({ node }: { node: ApeNode }) => {
                       }}
                     />
                   </Form.Item>
-                  <Form.Item
-                    required
-                    label={formatMessage({ id: 'flow.max_tokens' })}
-                  >
-                    <InputNumber
-                      min={_.get(schema, 'properties.max_tokens.minimum')}
-                      max={_.get(schema, 'properties.max_tokens.maximum')}
-                      step={10}
-                      variant="filled"
-                      style={{ width: '100%' }}
-                      value={_.get(values, 'max_tokens')}
-                      onChange={(value) => {
-                        _.set(values, 'max_tokens', value);
-                        applyChanges();
-                      }}
-                    />
-                  </Form.Item>
                   <Form.Item label={formatMessage({ id: 'flow.input.source' })}>
                     <NodeInput
                       variant="filled"
diff --git a/frontend/src/pages/bots/$botId/flow/utils.ts b/frontend/src/pages/bots/$botId/flow/utils.ts
@@ -439,13 +439,6 @@ export const nodeLlmDefinition = (params?: {
           maximum: 1,
           description: 'Sampling temperature',
         },
-        max_tokens: {
-          type: 'integer',
-          default: 1000,
-          minimum: 1,
-          maximum: 128000,
-          description: 'Max tokens for generation',
-        },
         query: {
           type: 'string',
           description: "User's question or query",
@@ -464,7 +457,6 @@ export const nodeLlmDefinition = (params?: {
         'custom_llm_provider',
         'prompt_template',
         'temperature',
-        'max_tokens',
         'query',
         'docs',
       ],
@@ -476,7 +468,6 @@ export const nodeLlmDefinition = (params?: {
       prompt_template:
         params?.botType === 'knowledge' ? '{context}\n{query}' : '{query}',
       temperature: 0.7,
-      max_tokens: 1000,
       query: params?.startId
         ? `{{ nodes.${params.startId}.output.query }}`
         : '',
diff --git a/tests/e2e_test/testdata/basic-flow.yaml b/tests/e2e_test/testdata/basic-flow.yaml
@@ -84,23 +84,16 @@ nodes:
               minimum: 0
               maximum: 1
               description: Sampling temperature
-            max_tokens:
-              type: integer
-              default: 1000
-              minimum: 1
-              maximum: 128000
-              description: Max tokens for generation
             query:
               type: string
               description: User's question or query
-          required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query]
+          required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query]
         values:
           model_service_provider: openrouter
           model_name: deepseek/deepseek-v3-base:free
           custom_llm_provider: openrouter
           prompt_template: "{query}"
           temperature: 0.7
-          max_tokens: 1000
           query: "{{ nodes.start.output.query }}"
       output:
         schema:
diff --git a/tests/e2e_test/testdata/rag-flow.yaml b/tests/e2e_test/testdata/rag-flow.yaml
@@ -295,12 +295,6 @@ nodes:
               minimum: 0
               maximum: 1
               description: Sampling temperature
-            max_tokens:
-              type: integer
-              default: 1000
-              minimum: 1
-              maximum: 128000
-              description: Max tokens for generation
             query:
               type: string
               description: User's question or query
@@ -309,14 +303,13 @@ nodes:
               description: Docs for LLM context
               items:
                 $ref: '#/schema/document_with_score'
-          required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query, docs]
+          required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query, docs]
         values:
           model_service_provider: openrouter
           model_name: deepseek/deepseek-v3-base:free
           custom_llm_provider: openrouter
           prompt_template: "{context}\n{query}"
           temperature: 0.7
-          max_tokens: 1000
           query: "{{ nodes.start.output.query }}"
           docs: "{{ nodes.rerank_5c7e1b2a.output.docs }}"
       output:
diff --git a/tests/model_test/basic-flow.yaml b/tests/model_test/basic-flow.yaml
@@ -84,23 +84,16 @@ nodes:
               minimum: 0
               maximum: 1
               description: Sampling temperature
-            max_tokens:
-              type: integer
-              default: 1000
-              minimum: 1
-              maximum: 128000
-              description: Max tokens for generation
             query:
               type: string
               description: User's question or query
-          required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query]
+          required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query]
         values:
           model_service_provider: openrouter
           model_name: deepseek/deepseek-v3-base:free
           custom_llm_provider: openrouter
           prompt_template: "{query}"
           temperature: 0.7
-          max_tokens: 1000
           query: "{{ nodes.start.output.query }}"
       output:
         schema: