|
@@ -44,6 +44,18 @@
     stream_chat_response,
     stream_chat_response_fanout,
 )
+from .serving_anthropic import (
+    AnthropicMessagesRequest,
+    anthropic_to_openai_messages,
+    anthropic_to_openai_tools,
+    build_anthropic_response,
+    stream_content_block_delta,
+    stream_content_block_start,
+    stream_content_block_stop,
+    stream_message_delta,
+    stream_message_start,
+    stream_message_stop,
+)
 from .serving_completion import (
     build_completion_response,
     build_completion_response_multi,
@@ -773,6 +785,146 @@ async def completions(request: CompletionRequest): |
         raise HTTPException(status_code=500, detail=str(e))
 
 
+@app.post("/v1/messages")
+async def anthropic_messages(request: AnthropicMessagesRequest, raw_request: Request):
+    """Handle Anthropic Messages API requests.
+
+    Translates Anthropic format to OpenAI format internally, runs inference,
+    and returns Anthropic-formatted responses. Enables Claude Code and other
+    Anthropic-compatible tools to use ATOM as a backend.
+    """
+    global engine, tokenizer, model_name
+
+    try:
+        # Convert Anthropic messages to OpenAI format
+        openai_messages = anthropic_to_openai_messages(request.messages, request.system)
+
+        # Apply chat template
+        from .protocol import ChatMessage
+
+        messages = [ChatMessage(**m) for m in openai_messages]
+
+        merged_kwargs = dict(default_chat_template_kwargs)
+        prompt = apply_chat_template(
+            tokenizer,
+            custom_message_encoder,
+            [msg.to_template_dict() for msg in messages],
+            tools=None,  # Anthropic tool definitions are not translated yet
+            **merged_kwargs,
+        )
+
+        sampling_params = _build_sampling_params(
+            # "or" would clobber an explicit temperature of 0.0
+            temperature=request.temperature if request.temperature is not None else 1.0,
+            max_tokens=request.max_tokens,
+            stop_strings=request.stop_sequences,
+            top_k=request.top_k,
+            top_p=request.top_p,
+        )
+
+        request_id = uuid.uuid4().hex[:24]
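+        # Prompt token count, reported back in the response usage field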
+        input_tokens = len(tokenizer.encode(prompt))
+
+        if request.stream:
+            # Streaming response
+            seq_id, stream_queue = await setup_streaming_request(
+                prompt, sampling_params, request_id
+            )
+
+            async def generate_anthropic_stream():
+                from .reasoning import ReasoningFilter
+
+                reasoning_filter = ReasoningFilter()
+                block_index = 0
+                started_text = False
+                started_thinking = False
+                output_tokens = 0
+
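+                # Anthropic SSE event order: message_start, then
+                # content_block_start/delta/stop for each block, then
+                # message_delta (stop reason + usage) and message_stop.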
+                yield stream_message_start(request_id, model_name, input_tokens)
+
+                try:
+                    while True:
+                        chunk_data = await stream_queue.get()
+                        new_text = chunk_data["text"]
+                        output_tokens += len(chunk_data.get("token_ids", []))
+                        finished = chunk_data.get("finished", False)
+
+                        segments = reasoning_filter.process(new_text)
+                        if finished:
+                            segments.extend(reasoning_filter.flush())
+
+                        for field, text in segments:
+                            if not text:
+                                continue
+
+                            if field == "reasoning_content":
+                                if not started_thinking:
+                                    yield stream_content_block_start(block_index, "thinking")
+                                    started_thinking = True
+                                yield stream_content_block_delta(block_index, text, "thinking")
+                            else:
+                                if started_thinking and not started_text:
+                                    yield stream_content_block_stop(block_index)
+                                    block_index += 1
+                                if not started_text:
+                                    yield stream_content_block_start(block_index, "text")
+                                    started_text = True
+                                yield stream_content_block_delta(block_index, text, "text")
+
+                        if finished:
+                            if started_thinking and not started_text:
+                                yield stream_content_block_stop(block_index)
+                                block_index += 1
+                            if not started_text:
+                                # Guarantee at least one content block, even
+                                # when nothing was generated
+                                yield stream_content_block_start(block_index, "text")
+                                yield stream_content_block_delta(block_index, "", "text")
+                            yield stream_content_block_stop(block_index)
+                            yield stream_message_delta("end_turn", output_tokens)
+                            yield stream_message_stop()
+                            break
+                finally:
+                    cleanup_streaming_request(seq_id)
+
+            return StreamingResponse(
+                generate_anthropic_stream(),
+                media_type="text/event-stream",
+                headers={
+                    "anthropic-version": "2023-06-01",
+                    "x-request-id": request_id,
+                },
+            )
+
+        # Non-streaming response
+        from .reasoning import separate_reasoning
+
+        final_output = None
+        async for output in generate_async(prompt, sampling_params, request_id):
+            final_output = output
+        if final_output is None:
+            raise RuntimeError("No output generated")
+
+        raw_text = final_output["text"]
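+        # Split any reasoning ("thinking") text from the user-visible answer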
+        reasoning_content, content = separate_reasoning(raw_text)
+        output_tokens = len(tokenizer.encode(raw_text))
+
+        return build_anthropic_response(
+            request_id=request_id,
+            model=model_name,
+            content_text=content,
+            reasoning_content=reasoning_content,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+        )
+
+    except Exception as e:
+        logger.error(f"Error in anthropic_messages: {e}", exc_info=True)
+        return JSONResponse(
+            status_code=500,
+            content={
+                "type": "error",
+                "error": {"type": "api_error", "message": str(e)},
+            },
+        )
+
+
 @app.get("/v1/models")
 async def list_models():
     """List available models."""
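
For reference, a minimal sketch of calling the new endpoint once this lands. The host, port, and model name below are illustrative assumptions, not part of the change; the request shape follows the Anthropic Messages API that the handler above implements.

import requests

# Non-streaming request against the new /v1/messages route
resp = requests.post(
    "http://localhost:8000/v1/messages",
    json={
        "model": "atom",  # illustrative; the server serves its own model name
        "max_tokens": 128,
        "messages": [{"role": "user", "content": "Hello!"}],
    },
)
data = resp.json()
print(data["content"])  # Anthropic-style list of content blocks
print(data["usage"])    # input/output token counts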
|