diff --git a/aperag/flow/examples/rag_flow.yaml b/aperag/flow/examples/rag_flow.yaml index e265bb3ef..3011284f0 100644 --- a/aperag/flow/examples/rag_flow.yaml +++ b/aperag/flow/examples/rag_flow.yaml @@ -222,13 +222,6 @@ nodes: minimum: 0 maximum: 1 description: Sampling temperature - max_tokens: - value: 1000 - type: integer - default: 1000 - minimum: 1 - maximum: 128000 - description: Max tokens for generation query: value: {{ .nodes.start.output.query }} type: string @@ -244,7 +237,6 @@ nodes: - model_name - prompt_template - temperature - - max_tokens - query - docs output: diff --git a/aperag/flow/examples/rag_flow2.yaml b/aperag/flow/examples/rag_flow2.yaml index 2560d4120..bceb42e2e 100644 --- a/aperag/flow/examples/rag_flow2.yaml +++ b/aperag/flow/examples/rag_flow2.yaml @@ -295,12 +295,6 @@ nodes: minimum: 0 maximum: 1 description: Sampling temperature - max_tokens: - type: integer - default: 1000 - minimum: 1 - maximum: 128000 - description: Max tokens for generation query: type: string description: User's question or query @@ -309,14 +303,13 @@ nodes: description: Docs for LLM context items: $ref: '#/schema/document_with_score' - required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query, docs] + required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query, docs] values: model_service_provider: openrouter model_name: deepseek/deepseek-v3-base:free custom_llm_provider: openrouter prompt_template: "{context}\n{query}" temperature: 0.7 - max_tokens: 1000 query: "{{ nodes.start.output.query }}" docs: "{{ nodes.rerank_5c7e1b2a.output.docs }}" output: diff --git a/aperag/flow/runners/llm.py b/aperag/flow/runners/llm.py index ea7420df1..a7eb0dc4d 100644 --- a/aperag/flow/runners/llm.py +++ b/aperag/flow/runners/llm.py @@ -20,6 +20,7 @@ from litellm import BaseModel from pydantic import Field +from aperag.db.models import APIType from aperag.db.ops import async_db_ops from aperag.flow.base.models import BaseNodeRunner, SystemInput, register_node_runner from aperag.llm.completion.completion_service import CompletionService @@ -29,7 +30,15 @@ from aperag.utils.history import BaseChatMessageHistory from aperag.utils.utils import now_unix_milliseconds -MAX_CONTEXT_LENGTH = 100000 +# Character to token estimation ratio for Chinese/mixed content +# Conservative estimate: 2 characters = 1 token +CHAR_TO_TOKEN_RATIO = 2.0 + +# Reserve tokens for output generation (default 1000 tokens) +DEFAULT_OUTPUT_TOKENS = 1000 + +# Fallback max context length if model max_tokens is not available +FALLBACK_MAX_CONTEXT_LENGTH = 50000 class Message(BaseModel): @@ -81,7 +90,6 @@ class LLMInput(BaseModel): custom_llm_provider: str = Field(..., description="Custom LLM provider") prompt_template: str = Field(..., description="Prompt template") temperature: float = Field(..., description="Sampling temperature") - max_tokens: int = Field(..., description="Max tokens for generation") docs: Optional[List[DocumentWithScore]] = Field(None, description="Documents") @@ -89,6 +97,32 @@ class LLMOutput(BaseModel): text: str +def estimate_token_count(text: str) -> int: + """ + Estimate token count from character count for Chinese/mixed content. + Using conservative ratio: 2 characters = 1 token + """ + return int(len(text) / CHAR_TO_TOKEN_RATIO) + + +def calculate_max_context_length(model_max_tokens: Optional[int], output_tokens: int = DEFAULT_OUTPUT_TOKENS) -> int: + """ + Calculate maximum context length based on model's max_tokens limit. + Reserve tokens for output generation. + """ + if not model_max_tokens: + return FALLBACK_MAX_CONTEXT_LENGTH + + # Reserve tokens for output, convert to character count + max_context_tokens = model_max_tokens - output_tokens + if max_context_tokens <= 0: + # If model max_tokens is too small, use a minimal context + max_context_tokens = max(model_max_tokens // 2, 100) + + # Convert tokens to character count + return int(max_context_tokens * CHAR_TO_TOKEN_RATIO) + + # Database operations interface class LLMRepository: """Repository interface for LLM database operations""" @@ -114,7 +148,6 @@ async def generate_response( custom_llm_provider: str, prompt_template: str, temperature: float, - max_tokens: int, docs: Optional[List[DocumentWithScore]] = None, ) -> Tuple[str, Dict]: """Generate LLM response with given parameters""" @@ -130,23 +163,43 @@ async def generate_response( except Exception: raise Exception(f"LLMProvider {model_service_provider} not found") + # Get model configuration to determine max_tokens + try: + model_config = await async_db_ops.query_llm_provider_model( + provider_name=model_service_provider, + api=APIType.COMPLETION.value, + model=model_name + ) + model_max_tokens = model_config.max_tokens if model_config else None + except Exception: + model_max_tokens = None + + # Calculate dynamic context length based on model's max_tokens + max_context_length = calculate_max_context_length(model_max_tokens) + # Build context and references from documents context = "" references = [] if docs: for doc in docs: - if len(context) + len(doc.text) > MAX_CONTEXT_LENGTH: + if len(context) + len(doc.text) > max_context_length: break context += doc.text references.append({"text": doc.text, "metadata": doc.metadata, "score": doc.score}) prompt = prompt_template.format(query=query, context=context) - output_max_tokens = max_tokens - len(prompt) - - if output_max_tokens < 0: - raise Exception( - "max_tokens %d is too small to hold the prompt which size is %d" % (max_tokens, len(prompt)) - ) + + # Estimate prompt tokens and calculate output tokens + prompt_tokens = estimate_token_count(prompt) + if model_max_tokens: + output_max_tokens = model_max_tokens - prompt_tokens + if output_max_tokens < 100: # Ensure minimum output tokens + raise Exception( + f"Model max_tokens {model_max_tokens} is too small to hold the prompt which requires approximately {prompt_tokens} tokens" + ) + else: + # Use default output tokens if model max_tokens is unknown + output_max_tokens = DEFAULT_OUTPUT_TOKENS cs = CompletionService(custom_llm_provider, model_name, base_url, api_key, temperature, output_max_tokens) @@ -193,7 +246,6 @@ async def run(self, ui: LLMInput, si: SystemInput) -> Tuple[LLMOutput, dict]: custom_llm_provider=ui.custom_llm_provider, prompt_template=ui.prompt_template, temperature=ui.temperature, - max_tokens=ui.max_tokens, docs=ui.docs, ) diff --git a/aperag/llm/completion/completion_service.py b/aperag/llm/completion/completion_service.py index 7ab9a26b0..0003a6b3a 100644 --- a/aperag/llm/completion/completion_service.py +++ b/aperag/llm/completion/completion_service.py @@ -82,7 +82,7 @@ async def _acompletion_non_stream(self, history, prompt, memory=False) -> str: base_url=self.base_url, api_key=self.api_key, temperature=self.temperature, - max_tokens=self.max_tokens, + # max_tokens=self.max_tokens, messages=messages, stream=False, caching=self.caching, @@ -109,7 +109,7 @@ async def _acompletion_stream_raw(self, history, prompt, memory=False) -> AsyncG base_url=self.base_url, api_key=self.api_key, temperature=self.temperature, - max_tokens=self.max_tokens, + # max_tokens=self.max_tokens, messages=messages, stream=True, caching=self.caching, @@ -149,7 +149,7 @@ def _completion_core(self, history, prompt, memory=False) -> str: base_url=self.base_url, api_key=self.api_key, temperature=self.temperature, - max_tokens=self.max_tokens, + # max_tokens=self.max_tokens, messages=messages, stream=False, caching=self.caching, diff --git a/frontend/src/locales/en-US.ts b/frontend/src/locales/en-US.ts index b59b33db6..49a6fd9ac 100644 --- a/frontend/src/locales/en-US.ts +++ b/frontend/src/locales/en-US.ts @@ -278,7 +278,6 @@ export default { 'flow.temperature': 'Temperature', 'flow.temperature.tips': "This parameter controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses.", - 'flow.max_tokens': 'Max tokens', 'flow.merge.merge_strategy': 'Strategy', 'flow.merge.deduplicate': 'Deduplicate', 'flow.merge.deduplicate.tips': diff --git a/frontend/src/locales/zh-CN.ts b/frontend/src/locales/zh-CN.ts index 324c923b1..263b9b4c3 100644 --- a/frontend/src/locales/zh-CN.ts +++ b/frontend/src/locales/zh-CN.ts @@ -276,7 +276,6 @@ export default { 'flow.temperature': '温度', 'flow.temperature.tips': '该参数控制模型预测的随机性。 较低的温度使模型对其响应更有信心,而较高的温度则使其更具创造性和多样性。', - 'flow.max_tokens': '最大token数', 'flow.merge.merge_strategy': '策略', 'flow.merge.deduplicate': '是否去重', 'flow.merge.deduplicate.tips': diff --git a/frontend/src/pages/bots/$botId/flow/_nodes/_node_llm.tsx b/frontend/src/pages/bots/$botId/flow/_nodes/_node_llm.tsx index 380277fac..493ad313f 100644 --- a/frontend/src/pages/bots/$botId/flow/_nodes/_node_llm.tsx +++ b/frontend/src/pages/bots/$botId/flow/_nodes/_node_llm.tsx @@ -114,23 +114,6 @@ export const ApeNodeLlm = ({ node }: { node: ApeNode }) => { }} /> -