|
79 | 79 | TopLogprob, |
80 | 80 | UpdateParamsRequest, |
81 | 81 | UsageInfo, |
82 | | - build_usage_info, |
83 | 82 | ) |
84 | 83 | from lmdeploy.serve.openai.responses import create_responses_router |
85 | 84 | from lmdeploy.serve.openai.utils import maybe_filter_parallel_tool_calls |
@@ -585,7 +584,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: |
585 | 584 | if request.return_logprob: |
586 | 585 | output_token_logprobs = _create_output_token_logprobs(res.token_ids, res.logprobs) |
587 | 586 | if res.finish_reason and include_usage: |
588 | | - final_usage = build_usage_info( |
| 587 | + final_usage = UsageInfo.build( |
589 | 588 | prompt_tokens=res.input_token_len, |
590 | 589 | completion_tokens=res.generate_token_len, |
591 | 590 | cached_tokens=res.cached_tokens, |
@@ -722,7 +721,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: |
722 | 721 | cache_block_ids = cache_block_ids[0] |
723 | 722 | remote_token_ids = [remote_token_ids[0][-1]] |
724 | 723 |
|
725 | | - usage = build_usage_info( |
| 724 | + usage = UsageInfo.build( |
726 | 725 | prompt_tokens=final_res.input_token_len, |
727 | 726 | completion_tokens=final_res.generate_token_len, |
728 | 727 | cached_tokens=final_res.cached_tokens, |
@@ -916,7 +915,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: |
916 | 915 | response_json['remote_token_ids'] = res.token_ids |
917 | 916 | yield f'data: {json.dumps(response_json)}\n\n' |
918 | 917 | if include_usage: |
919 | | - final_usage = build_usage_info( |
| 918 | + final_usage = UsageInfo.build( |
920 | 919 | prompt_tokens=prompt_tokens_acc, |
921 | 920 | completion_tokens=completion_tokens_acc, |
922 | 921 | cached_tokens=cached_tokens_acc, |
@@ -975,7 +974,7 @@ async def _inner_call(i, generator, session): |
975 | 974 | cached_tokens_acc += final_res.cached_tokens |
976 | 975 |
|
977 | 976 | await asyncio.gather(*[_inner_call(i, generators[i], sessions[i]) for i in range(len(generators))]) |
978 | | - usage = build_usage_info( |
| 977 | + usage = UsageInfo.build( |
979 | 978 | prompt_tokens=prompt_tokens_acc, |
980 | 979 | completion_tokens=completion_tokens_acc, |
981 | 980 | cached_tokens=cached_tokens_acc, |
@@ -1196,7 +1195,7 @@ async def pooling(request: PoolingRequest, raw_request: Request = None): |
1196 | 1195 |
|
1197 | 1196 | batch_scores = await async_engine.async_get_reward_score(input_ids) |
1198 | 1197 | prompt_tokens = sum(len(ids) for ids in input_ids) |
1199 | | - usage = build_usage_info(prompt_tokens=prompt_tokens, completion_tokens=0, cached_tokens=0) |
| 1198 | + usage = UsageInfo.build(prompt_tokens=prompt_tokens, completion_tokens=0, cached_tokens=0) |
1200 | 1199 |
|
1201 | 1200 | data = [] |
1202 | 1201 | for i, score in enumerate(batch_scores): |
|
0 commit comments