Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions aperag/flow/examples/rag_flow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,6 @@ nodes:
minimum: 0
maximum: 1
description: Sampling temperature
max_tokens:
value: 1000
type: integer
default: 1000
minimum: 1
maximum: 128000
description: Max tokens for generation
query:
value: {{ .nodes.start.output.query }}
type: string
Expand All @@ -244,7 +237,6 @@ nodes:
- model_name
- prompt_template
- temperature
- max_tokens
- query
- docs
output:
Expand Down
9 changes: 1 addition & 8 deletions aperag/flow/examples/rag_flow2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -295,12 +295,6 @@ nodes:
minimum: 0
maximum: 1
description: Sampling temperature
max_tokens:
type: integer
default: 1000
minimum: 1
maximum: 128000
description: Max tokens for generation
query:
type: string
description: User's question or query
Expand All @@ -309,14 +303,13 @@ nodes:
description: Docs for LLM context
items:
$ref: '#/schema/document_with_score'
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query, docs]
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query, docs]
values:
model_service_provider: openrouter
model_name: deepseek/deepseek-v3-base:free
custom_llm_provider: openrouter
prompt_template: "{context}\n{query}"
temperature: 0.7
max_tokens: 1000
query: "{{ nodes.start.output.query }}"
docs: "{{ nodes.rerank_5c7e1b2a.output.docs }}"
output:
Expand Down
74 changes: 63 additions & 11 deletions aperag/flow/runners/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from litellm import BaseModel
from pydantic import Field

from aperag.db.models import APIType
from aperag.db.ops import async_db_ops
from aperag.flow.base.models import BaseNodeRunner, SystemInput, register_node_runner
from aperag.llm.completion.completion_service import CompletionService
Expand All @@ -29,7 +30,15 @@
from aperag.utils.history import BaseChatMessageHistory
from aperag.utils.utils import now_unix_milliseconds

# NOTE(review): fixed cap compared against character counts (len(context) + len(doc.text)).
# This page also shows a dynamically computed `max_context_length` used in the same check —
# presumably this line is the removed side of the diff; confirm before relying on it.
MAX_CONTEXT_LENGTH = 100000
# Character-to-token estimation ratio for Chinese/mixed content.
# Conservative estimate: 2 characters = 1 token.
CHAR_TO_TOKEN_RATIO = 2.0

# Tokens reserved for output generation (default 1000 tokens); also used directly as the
# output budget when the model's max_tokens is unknown.
DEFAULT_OUTPUT_TOKENS = 1000

# Fallback max context length, in characters, used when the model's max_tokens is not available.
FALLBACK_MAX_CONTEXT_LENGTH = 50000


class Message(BaseModel):
Expand Down Expand Up @@ -81,14 +90,39 @@ class LLMInput(BaseModel):
custom_llm_provider: str = Field(..., description="Custom LLM provider")
prompt_template: str = Field(..., description="Prompt template")
temperature: float = Field(..., description="Sampling temperature")
max_tokens: int = Field(..., description="Max tokens for generation")
docs: Optional[List[DocumentWithScore]] = Field(None, description="Documents")


# Output model for the LLM node: a single `text` field holding the generated string.
class LLMOutput(BaseModel):
text: str


def estimate_token_count(text: str) -> int:
    """Approximate the number of tokens occupied by ``text``.

    The estimate is purely length-based: the character count is divided by
    ``CHAR_TO_TOKEN_RATIO`` (a conservative 2 characters per token, intended
    for Chinese/mixed content) and the result is truncated to an integer.
    """
    char_count = len(text)
    estimated_tokens = char_count / CHAR_TO_TOKEN_RATIO
    return int(estimated_tokens)


def calculate_max_context_length(model_max_tokens: Optional[int], output_tokens: int = DEFAULT_OUTPUT_TOKENS) -> int:
    """Derive the context budget, in characters, from a model's token limit.

    ``output_tokens`` are set aside for generation and the remainder is
    converted from tokens to characters via ``CHAR_TO_TOKEN_RATIO``.

    Args:
        model_max_tokens: The model's max_tokens limit; a falsy value
            (``None`` or ``0``) means the limit is unknown.
        output_tokens: Tokens to reserve for the generated output.

    Returns:
        The maximum context length as a character count, or
        ``FALLBACK_MAX_CONTEXT_LENGTH`` when the model limit is unknown.
    """
    if model_max_tokens:
        remaining_tokens = model_max_tokens - output_tokens
        # When the reservation leaves nothing for context, fall back to a
        # minimal budget: half of the model limit, but never below 100 tokens.
        context_tokens = remaining_tokens if remaining_tokens > 0 else max(model_max_tokens // 2, 100)
        # Token budget -> character budget.
        return int(context_tokens * CHAR_TO_TOKEN_RATIO)

    # Model limit unknown: use the fixed fallback budget.
    return FALLBACK_MAX_CONTEXT_LENGTH


# Database operations interface
class LLMRepository:
"""Repository interface for LLM database operations"""
Expand All @@ -114,7 +148,6 @@ async def generate_response(
custom_llm_provider: str,
prompt_template: str,
temperature: float,
max_tokens: int,
docs: Optional[List[DocumentWithScore]] = None,
) -> Tuple[str, Dict]:
"""Generate LLM response with given parameters"""
Expand All @@ -130,23 +163,43 @@ async def generate_response(
except Exception:
raise Exception(f"LLMProvider {model_service_provider} not found")

# Get model configuration to determine max_tokens
try:
model_config = await async_db_ops.query_llm_provider_model(
provider_name=model_service_provider,
api=APIType.COMPLETION.value,
model=model_name
)
model_max_tokens = model_config.max_tokens if model_config else None
except Exception:
model_max_tokens = None

# Calculate dynamic context length based on model's max_tokens
max_context_length = calculate_max_context_length(model_max_tokens)

# Build context and references from documents
context = ""
references = []
if docs:
for doc in docs:
if len(context) + len(doc.text) > MAX_CONTEXT_LENGTH:
if len(context) + len(doc.text) > max_context_length:
break
context += doc.text
references.append({"text": doc.text, "metadata": doc.metadata, "score": doc.score})

prompt = prompt_template.format(query=query, context=context)
output_max_tokens = max_tokens - len(prompt)

if output_max_tokens < 0:
raise Exception(
"max_tokens %d is too small to hold the prompt which size is %d" % (max_tokens, len(prompt))
)

# Estimate prompt tokens and calculate output tokens
prompt_tokens = estimate_token_count(prompt)
if model_max_tokens:
output_max_tokens = model_max_tokens - prompt_tokens
if output_max_tokens < 100: # Ensure minimum output tokens
raise Exception(
f"Model max_tokens {model_max_tokens} is too small to hold the prompt which requires approximately {prompt_tokens} tokens"
)
else:
# Use default output tokens if model max_tokens is unknown
output_max_tokens = DEFAULT_OUTPUT_TOKENS

cs = CompletionService(custom_llm_provider, model_name, base_url, api_key, temperature, output_max_tokens)

Expand Down Expand Up @@ -193,7 +246,6 @@ async def run(self, ui: LLMInput, si: SystemInput) -> Tuple[LLMOutput, dict]:
custom_llm_provider=ui.custom_llm_provider,
prompt_template=ui.prompt_template,
temperature=ui.temperature,
max_tokens=ui.max_tokens,
docs=ui.docs,
)

Expand Down
6 changes: 3 additions & 3 deletions aperag/llm/completion/completion_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ async def _acompletion_non_stream(self, history, prompt, memory=False) -> str:
base_url=self.base_url,
api_key=self.api_key,
temperature=self.temperature,
max_tokens=self.max_tokens,
# max_tokens=self.max_tokens,
messages=messages,
stream=False,
caching=self.caching,
Expand All @@ -109,7 +109,7 @@ async def _acompletion_stream_raw(self, history, prompt, memory=False) -> AsyncG
base_url=self.base_url,
api_key=self.api_key,
temperature=self.temperature,
max_tokens=self.max_tokens,
# max_tokens=self.max_tokens,
messages=messages,
stream=True,
caching=self.caching,
Expand Down Expand Up @@ -149,7 +149,7 @@ def _completion_core(self, history, prompt, memory=False) -> str:
base_url=self.base_url,
api_key=self.api_key,
temperature=self.temperature,
max_tokens=self.max_tokens,
# max_tokens=self.max_tokens,
messages=messages,
stream=False,
caching=self.caching,
Expand Down
1 change: 0 additions & 1 deletion frontend/src/locales/en-US.ts
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,6 @@ export default {
'flow.temperature': 'Temperature',
'flow.temperature.tips':
"This parameter controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses.",
'flow.max_tokens': 'Max tokens',
'flow.merge.merge_strategy': 'Strategy',
'flow.merge.deduplicate': 'Deduplicate',
'flow.merge.deduplicate.tips':
Expand Down
1 change: 0 additions & 1 deletion frontend/src/locales/zh-CN.ts
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,6 @@ export default {
'flow.temperature': '温度',
'flow.temperature.tips':
'该参数控制模型预测的随机性。 较低的温度使模型对其响应更有信心,而较高的温度则使其更具创造性和多样性。',
'flow.max_tokens': '最大token数',
'flow.merge.merge_strategy': '策略',
'flow.merge.deduplicate': '是否去重',
'flow.merge.deduplicate.tips':
Expand Down
17 changes: 0 additions & 17 deletions frontend/src/pages/bots/$botId/flow/_nodes/_node_llm.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -114,23 +114,6 @@ export const ApeNodeLlm = ({ node }: { node: ApeNode }) => {
}}
/>
</Form.Item>
<Form.Item
required
label={formatMessage({ id: 'flow.max_tokens' })}
>
<InputNumber
min={_.get(schema, 'properties.max_tokens.minimum')}
max={_.get(schema, 'properties.max_tokens.maximum')}
step={10}
variant="filled"
style={{ width: '100%' }}
value={_.get(values, 'max_tokens')}
onChange={(value) => {
_.set(values, 'max_tokens', value);
applyChanges();
}}
/>
</Form.Item>
<Form.Item label={formatMessage({ id: 'flow.input.source' })}>
<NodeInput
variant="filled"
Expand Down
9 changes: 0 additions & 9 deletions frontend/src/pages/bots/$botId/flow/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -439,13 +439,6 @@ export const nodeLlmDefinition = (params?: {
maximum: 1,
description: 'Sampling temperature',
},
max_tokens: {
type: 'integer',
default: 1000,
minimum: 1,
maximum: 128000,
description: 'Max tokens for generation',
},
query: {
type: 'string',
description: "User's question or query",
Expand All @@ -464,7 +457,6 @@ export const nodeLlmDefinition = (params?: {
'custom_llm_provider',
'prompt_template',
'temperature',
'max_tokens',
'query',
'docs',
],
Expand All @@ -476,7 +468,6 @@ export const nodeLlmDefinition = (params?: {
prompt_template:
params?.botType === 'knowledge' ? '{context}\n{query}' : '{query}',
temperature: 0.7,
max_tokens: 1000,
query: params?.startId
? `{{ nodes.${params.startId}.output.query }}`
: '',
Expand Down
9 changes: 1 addition & 8 deletions tests/e2e_test/testdata/basic-flow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,23 +84,16 @@ nodes:
minimum: 0
maximum: 1
description: Sampling temperature
max_tokens:
type: integer
default: 1000
minimum: 1
maximum: 128000
description: Max tokens for generation
query:
type: string
description: User's question or query
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query]
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query]
values:
model_service_provider: openrouter
model_name: deepseek/deepseek-v3-base:free
custom_llm_provider: openrouter
prompt_template: "{query}"
temperature: 0.7
max_tokens: 1000
query: "{{ nodes.start.output.query }}"
output:
schema:
Expand Down
9 changes: 1 addition & 8 deletions tests/e2e_test/testdata/rag-flow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -295,12 +295,6 @@ nodes:
minimum: 0
maximum: 1
description: Sampling temperature
max_tokens:
type: integer
default: 1000
minimum: 1
maximum: 128000
description: Max tokens for generation
query:
type: string
description: User's question or query
Expand All @@ -309,14 +303,13 @@ nodes:
description: Docs for LLM context
items:
$ref: '#/schema/document_with_score'
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query, docs]
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query, docs]
values:
model_service_provider: openrouter
model_name: deepseek/deepseek-v3-base:free
custom_llm_provider: openrouter
prompt_template: "{context}\n{query}"
temperature: 0.7
max_tokens: 1000
query: "{{ nodes.start.output.query }}"
docs: "{{ nodes.rerank_5c7e1b2a.output.docs }}"
output:
Expand Down
9 changes: 1 addition & 8 deletions tests/model_test/basic-flow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,23 +84,16 @@ nodes:
minimum: 0
maximum: 1
description: Sampling temperature
max_tokens:
type: integer
default: 1000
minimum: 1
maximum: 128000
description: Max tokens for generation
query:
type: string
description: User's question or query
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, max_tokens, query]
required: [model_service_provider, model_name, custom_llm_provider, prompt_template, temperature, query]
values:
model_service_provider: openrouter
model_name: deepseek/deepseek-v3-base:free
custom_llm_provider: openrouter
prompt_template: "{query}"
temperature: 0.7
max_tokens: 1000
query: "{{ nodes.start.output.query }}"
output:
schema:
Expand Down
Loading