diff --git a/.claude/skills/migrate-groovy-to-java/SKILL.md b/.claude/skills/migrate-groovy-to-java/SKILL.md index 052e9fff0e3..4ae8ac292cf 100644 --- a/.claude/skills/migrate-groovy-to-java/SKILL.md +++ b/.claude/skills/migrate-groovy-to-java/SKILL.md @@ -21,3 +21,4 @@ When converting Groovy code to Java code, make sure that: - Do not wrap checked exceptions and throw a Runtime exception; prefer adding a throws clause at method declaration - Do not mark local variables `final` - Ensure variables are human-readable; avoid single-letter names and pre-define variables that are referenced multiple times +- When translating Spock `Mock(...)` usage, use `libs.bundles.mockito` instead of writing manual recording/stub implementations diff --git a/dd-trace-api/build.gradle.kts b/dd-trace-api/build.gradle.kts index af67a1b00fb..bc05a8753a4 100644 --- a/dd-trace-api/build.gradle.kts +++ b/dd-trace-api/build.gradle.kts @@ -74,5 +74,6 @@ description = "dd-trace-api" dependencies { api(libs.slf4j) testImplementation(libs.guava) + testImplementation(libs.bundles.mockito) testImplementation(project(":utils:test-utils")) } diff --git a/dd-trace-api/src/test/groovy/datadog/trace/api/aiguard/AIGuardTest.groovy b/dd-trace-api/src/test/groovy/datadog/trace/api/aiguard/AIGuardTest.groovy deleted file mode 100644 index 5ac83712278..00000000000 --- a/dd-trace-api/src/test/groovy/datadog/trace/api/aiguard/AIGuardTest.groovy +++ /dev/null @@ -1,151 +0,0 @@ -package datadog.trace.api.aiguard - -import spock.lang.Specification - -import static datadog.trace.api.aiguard.AIGuard.Action.ALLOW - - -class AIGuardTest extends Specification { - - void 'test text message'() { - when: - final message = AIGuard.Message.message('user', 'What day is today?') - - then: - message.role == 'user' - message.content == 'What day is today?' - message.toolCallId == null - message.toolCalls == null - } - - void 'test assistant tool call'() { - when: - final message = AIGuard.Message.assistant( - AIGuard.ToolCall.toolCall('1', 'execute_http_request', '{ "url": "http://localhost" }'), - AIGuard.ToolCall.toolCall('2', 'random_number', '{ "min": 0, "max": 10 }') - ) - - then: - message.role == 'assistant' - message.content == null - message.toolCallId == null - message.toolCalls.size() == 2 - - final http = message.toolCalls[0] - http.id == '1' - http.function.name == 'execute_http_request' - http.function.arguments == '{ "url": "http://localhost" }' - - final random = message.toolCalls[1] - random.id == '2' - random.function.name == 'random_number' - random.function.arguments == '{ "min": 0, "max": 10 }' - } - - void 'test tool'() { - when: - final message = AIGuard.Message.tool('2', '5') - - then: - message.role == 'tool' - message.content == '5' - message.toolCallId == '2' - message.toolCalls == null - } - - void 'test noop implementation'() { - when: - final eval = AIGuard.evaluate([ - AIGuard.Message.message('system', 'You are a beautiful AI assistant'), - AIGuard.Message.message('user', 'What day is today?'), - AIGuard.Message.message('assistant', 'Today is monday'), - AIGuard.Message.message('user', 'Give me a random number'), - AIGuard.Message.assistant(AIGuard.ToolCall.toolCall('1', 'generate_random_number', '{ "min": 0, "max": 10 }')), - AIGuard.Message.tool('1', '5'), - AIGuard.Message.message('assistant', 'Your number is 5') - ]) - - then: - eval.action == ALLOW - eval.reason == 'AI Guard is not enabled' - } - - void 'test ContentPart.text factory'() { - when: - final part = AIGuard.ContentPart.text('Hello world') - - then: - part.type == AIGuard.ContentPart.Type.TEXT - part.text == 'Hello world' - part.imageUrl == null - } - - void 'test ContentPart.imageUrl from String factory'() { - when: - final part = AIGuard.ContentPart.imageUrl('https://example.com/image.jpg') - - then: - part.type == AIGuard.ContentPart.Type.IMAGE_URL - part.text == null - part.imageUrl != null - part.imageUrl.url == 'https://example.com/image.jpg' - } - - void 'test Message with contentParts'() { - when: - final message = AIGuard.Message.message('user', [ - AIGuard.ContentPart.text('Describe this image:'), - AIGuard.ContentPart.imageUrl('https://example.com/image.jpg') - ]) - - then: - message.role == 'user' - message.content == null - message.contentParts != null - message.contentParts.size() == 2 - message.contentParts[0].type == AIGuard.ContentPart.Type.TEXT - message.contentParts[0].text == 'Describe this image:' - message.contentParts[1].type == AIGuard.ContentPart.Type.IMAGE_URL - message.contentParts[1].imageUrl.url == 'https://example.com/image.jpg' - } - - void 'test Message with plain content returns null contentParts'() { - when: - final message = AIGuard.Message.message('user', 'Hello') - - then: - message.content == 'Hello' - message.contentParts == null - } - - void 'test Message with contentParts returns null content'() { - when: - final message = AIGuard.Message.message('user', [AIGuard.ContentPart.text('Hello')]) - - then: - message.content == null - message.contentParts != null - } - - void 'test Message validation allows null content for assistant with tool calls'() { - when: - final message = AIGuard.Message.assistant( - AIGuard.ToolCall.toolCall('1', 'test', '{}') - ) - - then: - message.role == 'assistant' - message.content == null - message.contentParts == null - message.toolCalls != null - } - - void 'test Message allows empty contentParts list'() { - when: - def message = new AIGuard.Message('user', [], null, null) - - then: - message.contentParts != null - message.contentParts.isEmpty() - } -} diff --git a/dd-trace-api/src/test/groovy/datadog/trace/api/llmobs/LLMObsTest.groovy b/dd-trace-api/src/test/groovy/datadog/trace/api/llmobs/LLMObsTest.groovy deleted file mode 100644 index ab3f96ff2ae..00000000000 --- a/dd-trace-api/src/test/groovy/datadog/trace/api/llmobs/LLMObsTest.groovy +++ /dev/null @@ -1,294 +0,0 @@ -package datadog.trace.api.llmobs - -import datadog.trace.api.llmobs.noop.NoOpLLMObsSpan -import datadog.trace.api.llmobs.noop.NoOpLLMObsSpanFactory -import datadog.trace.api.llmobs.noop.NoOpLLMObsEvalProcessor -import datadog.trace.test.util.DDSpecification -import spock.lang.Shared -import java.lang.reflect.Field - -class LLMObsTest extends DDSpecification { - - @Shared - def originalSpanFactory - @Shared - def originalEvalProcessor - - def setupSpec() { - // Store original values - originalSpanFactory = getStaticField("SPAN_FACTORY") - originalEvalProcessor = getStaticField("EVAL_PROCESSOR") - } - - def cleanupSpec() { - // Restore original values - setStaticField("SPAN_FACTORY", originalSpanFactory) - setStaticField("EVAL_PROCESSOR", originalEvalProcessor) - } - - def cleanup() { - // Reset to defaults after each test - setStaticField("SPAN_FACTORY", NoOpLLMObsSpanFactory.INSTANCE) - setStaticField("EVAL_PROCESSOR", NoOpLLMObsEvalProcessor.INSTANCE) - } - - private static void setStaticField(String fieldName, Object value) { - Field field = LLMObs.getDeclaredField(fieldName) - field.setAccessible(true) - field.set(null, value) - } - - private static Object getStaticField(String fieldName) { - Field field = LLMObs.getDeclaredField(fieldName) - field.setAccessible(true) - return field.get(null) - } - - def "test ToolCall creation and getters"() { - given: - def arguments = [location: "New York", unit: "celsius"] - - when: - def toolCall = LLMObs.ToolCall.from("get_weather", "function", "tool-123", arguments) - - then: - toolCall.name == "get_weather" - toolCall.type == "function" - toolCall.toolId == "tool-123" - toolCall.arguments == arguments - } - - def "test ToolCall with null arguments"() { - when: - def toolCall = LLMObs.ToolCall.from("get_weather", "function", "tool-123", null) - - then: - toolCall.name == "get_weather" - toolCall.type == "function" - toolCall.toolId == "tool-123" - toolCall.arguments == null - } - - def "test LLMMessage creation with toolCalls"() { - given: - def toolCall = LLMObs.ToolCall.from("get_weather", "function", "tool-123", [location: "Paris"]) - def toolCalls = [toolCall] - - when: - def message = LLMObs.LLMMessage.from("assistant", "Let me check the weather", toolCalls) - - then: - message.role == "assistant" - message.content == "Let me check the weather" - message.toolCalls == toolCalls - message.toolCalls.size() == 1 - message.toolCalls[0].name == "get_weather" - message.toolCalls[0].type == "function" - message.toolCalls[0].toolId == "tool-123" - message.toolCalls[0].arguments == [location: "Paris"] - } - - def "test LLMMessage creation without toolCalls"() { - when: - def message = LLMObs.LLMMessage.from("user", "What's the weather like?") - - then: - message.role == "user" - message.content == "What's the weather like?" - message.toolCalls == null - } - - def "test LLMMessage with multiple toolCalls"() { - given: - def toolCall1 = LLMObs.ToolCall.from("get_weather", "function", "tool-1", [location: "New York"]) - def toolCall2 = LLMObs.ToolCall.from("get_stock_price", "function", "tool-2", [symbol: "AAPL"]) - def toolCalls = [toolCall1, toolCall2] - - when: - def message = LLMObs.LLMMessage.from("assistant", "I'll help you with both requests", toolCalls) - - then: - message.role == "assistant" - message.content == "I'll help you with both requests" - message.toolCalls == toolCalls - message.toolCalls.size() == 2 - message.toolCalls[0].name == "get_weather" - message.toolCalls[1].name == "get_stock_price" - } - - def "test default NoOp span factory behavior"() { - when: - def llmSpan = LLMObs.startLLMSpan("test", "gpt-4", "openai", "app", "session") - def agentSpan = LLMObs.startAgentSpan("test", "app", "session") - def toolSpan = LLMObs.startToolSpan("test", "app", "session") - def taskSpan = LLMObs.startTaskSpan("test", "app", "session") - def workflowSpan = LLMObs.startWorkflowSpan("test", "app", "session") - def embeddingSpan = LLMObs.startEmbeddingSpan("test", "app", "openai", "model", "session") - def retrievalSpan = LLMObs.startRetrievalSpan("test", "app", "session") - - then: - llmSpan == NoOpLLMObsSpan.INSTANCE - agentSpan == NoOpLLMObsSpan.INSTANCE - toolSpan == NoOpLLMObsSpan.INSTANCE - taskSpan == NoOpLLMObsSpan.INSTANCE - workflowSpan == NoOpLLMObsSpan.INSTANCE - embeddingSpan == NoOpLLMObsSpan.INSTANCE - retrievalSpan == NoOpLLMObsSpan.INSTANCE - } - - def "test span creation with null optional parameters"() { - when: - def llmSpan = LLMObs.startLLMSpan("test", "gpt-4", "openai", null, null) - def agentSpan = LLMObs.startAgentSpan("test", null, null) - def toolSpan = LLMObs.startToolSpan("test", null, null) - def taskSpan = LLMObs.startTaskSpan("test", null, null) - def workflowSpan = LLMObs.startWorkflowSpan("test", null, null) - def embeddingSpan = LLMObs.startEmbeddingSpan("test", null, null, null, null) - def retrievalSpan = LLMObs.startRetrievalSpan("test", null, null) - - then: - llmSpan == NoOpLLMObsSpan.INSTANCE - agentSpan == NoOpLLMObsSpan.INSTANCE - toolSpan == NoOpLLMObsSpan.INSTANCE - taskSpan == NoOpLLMObsSpan.INSTANCE - workflowSpan == NoOpLLMObsSpan.INSTANCE - embeddingSpan == NoOpLLMObsSpan.INSTANCE - retrievalSpan == NoOpLLMObsSpan.INSTANCE - } - - def "test default NoOp evaluation processor behavior"() { - when: - // These should not throw exceptions - LLMObs.SubmitEvaluation(NoOpLLMObsSpan.INSTANCE, "label", 0.5, [:]) - LLMObs.SubmitEvaluation(NoOpLLMObsSpan.INSTANCE, "label", 0.5, "app", [:]) - LLMObs.SubmitEvaluation(NoOpLLMObsSpan.INSTANCE, "label", "value", [:]) - LLMObs.SubmitEvaluation(NoOpLLMObsSpan.INSTANCE, "label", "value", "app", [:]) - - then: - noExceptionThrown() - } - - def "test evaluation submission with various score values"() { - given: - def span = NoOpLLMObsSpan.INSTANCE - def tags = [category: "test", version: "1.0"] - - when: - LLMObs.SubmitEvaluation(span, "accuracy", 0.0, tags) - LLMObs.SubmitEvaluation(span, "precision", 1.0, tags) - LLMObs.SubmitEvaluation(span, "recall", 0.85, tags) - LLMObs.SubmitEvaluation(span, "f1_score", 0.92, "myapp", tags) - - then: - noExceptionThrown() - } - - def "test evaluation submission with categorical values"() { - given: - def span = NoOpLLMObsSpan.INSTANCE - def tags = [evaluator: "human", context: "production"] - - when: - LLMObs.SubmitEvaluation(span, "quality", "excellent", tags) - LLMObs.SubmitEvaluation(span, "relevance", "poor", tags) - LLMObs.SubmitEvaluation(span, "toxicity", "safe", "content-app", tags) - - then: - noExceptionThrown() - } - - def "test evaluation submission with empty tags"() { - given: - def span = NoOpLLMObsSpan.INSTANCE - def emptyTags = [:] - - when: - LLMObs.SubmitEvaluation(span, "score", 0.75, emptyTags) - LLMObs.SubmitEvaluation(span, "category", "good", emptyTags) - - then: - noExceptionThrown() - } - - def "test span creation with custom factory returns actual spans"() { - given: - def mockSpanFactory = Mock(LLMObs.LLMObsSpanFactory) - def mockEvalProcessor = Mock(LLMObs.LLMObsEvalProcessor) - - def mockLLMSpan = Mock(LLMObsSpan) - def mockAgentSpan = Mock(LLMObsSpan) - def mockToolSpan = Mock(LLMObsSpan) - def mockTaskSpan = Mock(LLMObsSpan) - def mockWorkflowSpan = Mock(LLMObsSpan) - def mockEmbeddingSpan = Mock(LLMObsSpan) - def mockRetrievalSpan = Mock(LLMObsSpan) - - // Set up the custom factory - setStaticField("SPAN_FACTORY", mockSpanFactory) - setStaticField("EVAL_PROCESSOR", mockEvalProcessor) - - when: - def llmSpan = LLMObs.startLLMSpan("chat-completion", "gpt-4", "openai", "my-app", "session-1") - def agentSpan = LLMObs.startAgentSpan("agent-task", "my-app", "session-1") - def toolSpan = LLMObs.startToolSpan("weather-tool", "my-app", "session-1") - def taskSpan = LLMObs.startTaskSpan("summarize-task", "my-app", "session-1") - def workflowSpan = LLMObs.startWorkflowSpan("data-workflow", "my-app", "session-1") - def embeddingSpan = LLMObs.startEmbeddingSpan("text-embed", "my-app", "openai", "text-embedding-ada-002", "session-1") - def retrievalSpan = LLMObs.startRetrievalSpan("document-retrieval", "my-app", "session-1") - - // Test evaluation submission - LLMObs.SubmitEvaluation(mockLLMSpan, "accuracy", 0.95, [test: "value"]) - LLMObs.SubmitEvaluation(mockAgentSpan, "quality", "excellent", "eval-app", [reviewer: "human"]) - - then: - // Verify all span factory methods were called with correct parameters - 1 * mockSpanFactory.startLLMSpan("chat-completion", "gpt-4", "openai", "my-app", "session-1") >> mockLLMSpan - 1 * mockSpanFactory.startAgentSpan("agent-task", "my-app", "session-1") >> mockAgentSpan - 1 * mockSpanFactory.startToolSpan("weather-tool", "my-app", "session-1") >> mockToolSpan - 1 * mockSpanFactory.startTaskSpan("summarize-task", "my-app", "session-1") >> mockTaskSpan - 1 * mockSpanFactory.startWorkflowSpan("data-workflow", "my-app", "session-1") >> mockWorkflowSpan - 1 * mockSpanFactory.startEmbeddingSpan("text-embed", "my-app", "openai", "text-embedding-ada-002", "session-1") >> mockEmbeddingSpan - 1 * mockSpanFactory.startRetrievalSpan("document-retrieval", "my-app", "session-1") >> mockRetrievalSpan - - // Verify evaluation processor methods were called - 1 * mockEvalProcessor.SubmitEvaluation(mockLLMSpan, "accuracy", 0.95, [test: "value"]) - 1 * mockEvalProcessor.SubmitEvaluation(mockAgentSpan, "quality", "excellent", "eval-app", [reviewer: "human"]) - - // Verify the correct spans were returned - llmSpan == mockLLMSpan - agentSpan == mockAgentSpan - toolSpan == mockToolSpan - taskSpan == mockTaskSpan - workflowSpan == mockWorkflowSpan - embeddingSpan == mockEmbeddingSpan - retrievalSpan == mockRetrievalSpan - - // Verify spans are not the NoOp instances - llmSpan != NoOpLLMObsSpan.INSTANCE - agentSpan != NoOpLLMObsSpan.INSTANCE - toolSpan != NoOpLLMObsSpan.INSTANCE - taskSpan != NoOpLLMObsSpan.INSTANCE - workflowSpan != NoOpLLMObsSpan.INSTANCE - embeddingSpan != NoOpLLMObsSpan.INSTANCE - retrievalSpan != NoOpLLMObsSpan.INSTANCE - } - - def "test span creation with null parameters using custom factory"() { - given: - def mockSpanFactory = Mock(LLMObs.LLMObsSpanFactory) - def mockSpan = Mock(LLMObsSpan) - - setStaticField("SPAN_FACTORY", mockSpanFactory) - - when: - def llmSpan = LLMObs.startLLMSpan("test-span", "gpt-4", "openai", null, null) - def embeddingSpan = LLMObs.startEmbeddingSpan("embed-span", null, null, null, null) - - then: - 1 * mockSpanFactory.startLLMSpan("test-span", "gpt-4", "openai", null, null) >> mockSpan - 1 * mockSpanFactory.startEmbeddingSpan("embed-span", null, null, null, null) >> mockSpan - - llmSpan == mockSpan - embeddingSpan == mockSpan - } -} diff --git a/dd-trace-api/src/test/java/datadog/trace/api/aiguard/AIGuardTest.java b/dd-trace-api/src/test/java/datadog/trace/api/aiguard/AIGuardTest.java new file mode 100644 index 00000000000..0d44d81a975 --- /dev/null +++ b/dd-trace-api/src/test/java/datadog/trace/api/aiguard/AIGuardTest.java @@ -0,0 +1,157 @@ +package datadog.trace.api.aiguard; + +import static datadog.trace.api.aiguard.AIGuard.Action.ALLOW; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.junit.jupiter.api.Test; + +class AIGuardTest { + + @Test + void testTextMessage() { + AIGuard.Message message = AIGuard.Message.message("user", "What day is today?"); + + assertEquals("user", message.getRole()); + assertEquals("What day is today?", message.getContent()); + assertNull(message.getToolCallId()); + assertNull(message.getToolCalls()); + } + + @Test + void testAssistantToolCall() { + AIGuard.Message message = + AIGuard.Message.assistant( + AIGuard.ToolCall.toolCall( + "1", "execute_http_request", "{ \"url\": \"http://localhost\" }"), + AIGuard.ToolCall.toolCall("2", "random_number", "{ \"min\": 0, \"max\": 10 }")); + + assertEquals("assistant", message.getRole()); + assertNull(message.getContent()); + assertNull(message.getToolCallId()); + assertNotNull(message.getToolCalls()); + assertEquals(2, message.getToolCalls().size()); + + AIGuard.ToolCall http = message.getToolCalls().get(0); + assertEquals("1", http.getId()); + assertEquals("execute_http_request", http.getFunction().getName()); + assertEquals("{ \"url\": \"http://localhost\" }", http.getFunction().getArguments()); + + AIGuard.ToolCall random = message.getToolCalls().get(1); + assertEquals("2", random.getId()); + assertEquals("random_number", random.getFunction().getName()); + assertEquals("{ \"min\": 0, \"max\": 10 }", random.getFunction().getArguments()); + } + + @Test + void testTool() { + AIGuard.Message message = AIGuard.Message.tool("2", "5"); + + assertEquals("tool", message.getRole()); + assertEquals("5", message.getContent()); + assertEquals("2", message.getToolCallId()); + assertNull(message.getToolCalls()); + } + + @Test + void testNoopImplementation() { + List messages = + Arrays.asList( + AIGuard.Message.message("system", "You are a beautiful AI assistant"), + AIGuard.Message.message("user", "What day is today?"), + AIGuard.Message.message("assistant", "Today is monday"), + AIGuard.Message.message("user", "Give me a random number"), + AIGuard.Message.assistant( + AIGuard.ToolCall.toolCall( + "1", "generate_random_number", "{ \"min\": 0, \"max\": 10 }")), + AIGuard.Message.tool("1", "5"), + AIGuard.Message.message("assistant", "Your number is 5")); + + AIGuard.Evaluation evaluation = AIGuard.evaluate(messages); + + assertEquals(ALLOW, evaluation.getAction()); + assertEquals("AI Guard is not enabled", evaluation.getReason()); + } + + @Test + void testContentPartTextFactory() { + AIGuard.ContentPart part = AIGuard.ContentPart.text("Hello world"); + + assertEquals(AIGuard.ContentPart.Type.TEXT, part.getType()); + assertEquals("Hello world", part.getText()); + assertNull(part.getImageUrl()); + } + + @Test + void testContentPartImageUrlFromStringFactory() { + AIGuard.ContentPart part = AIGuard.ContentPart.imageUrl("https://example.com/image.jpg"); + + assertEquals(AIGuard.ContentPart.Type.IMAGE_URL, part.getType()); + assertNull(part.getText()); + assertNotNull(part.getImageUrl()); + assertEquals("https://example.com/image.jpg", part.getImageUrl().getUrl()); + } + + @Test + void testMessageWithContentParts() { + AIGuard.Message message = + AIGuard.Message.message( + "user", + Arrays.asList( + AIGuard.ContentPart.text("Describe this image:"), + AIGuard.ContentPart.imageUrl("https://example.com/image.jpg"))); + + assertEquals("user", message.getRole()); + assertNull(message.getContent()); + assertNotNull(message.getContentParts()); + assertEquals(2, message.getContentParts().size()); + assertEquals(AIGuard.ContentPart.Type.TEXT, message.getContentParts().get(0).getType()); + assertEquals("Describe this image:", message.getContentParts().get(0).getText()); + assertEquals(AIGuard.ContentPart.Type.IMAGE_URL, message.getContentParts().get(1).getType()); + assertEquals( + "https://example.com/image.jpg", message.getContentParts().get(1).getImageUrl().getUrl()); + } + + @Test + void testMessageWithPlainContentReturnsNullContentParts() { + AIGuard.Message message = AIGuard.Message.message("user", "Hello"); + + assertEquals("Hello", message.getContent()); + assertNull(message.getContentParts()); + } + + @Test + void testMessageWithContentPartsReturnsNullContent() { + AIGuard.Message message = + AIGuard.Message.message( + "user", Collections.singletonList(AIGuard.ContentPart.text("Hello"))); + + assertNull(message.getContent()); + assertNotNull(message.getContentParts()); + } + + @Test + void testMessageValidationAllowsNullContentForAssistantWithToolCalls() { + AIGuard.Message message = + AIGuard.Message.assistant(AIGuard.ToolCall.toolCall("1", "test", "{}")); + + assertEquals("assistant", message.getRole()); + assertNull(message.getContent()); + assertNull(message.getContentParts()); + assertNotNull(message.getToolCalls()); + } + + @Test + void testMessageAllowsEmptyContentPartsList() { + AIGuard.Message message = + new AIGuard.Message("user", Collections.emptyList(), null, null); + + assertNotNull(message.getContentParts()); + assertTrue(message.getContentParts().isEmpty()); + } +} diff --git a/dd-trace-api/src/test/java/datadog/trace/api/llmobs/LLMObsTest.java b/dd-trace-api/src/test/java/datadog/trace/api/llmobs/LLMObsTest.java new file mode 100644 index 00000000000..71f29740165 --- /dev/null +++ b/dd-trace-api/src/test/java/datadog/trace/api/llmobs/LLMObsTest.java @@ -0,0 +1,323 @@ +package datadog.trace.api.llmobs; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import datadog.trace.api.llmobs.noop.NoOpLLMObsEvalProcessor; +import datadog.trace.api.llmobs.noop.NoOpLLMObsSpan; +import datadog.trace.api.llmobs.noop.NoOpLLMObsSpanFactory; +import java.lang.reflect.Field; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +class LLMObsTest { + + private static Object originalSpanFactory; + private static Object originalEvalProcessor; + + @BeforeAll + static void setupSpec() throws Exception { + originalSpanFactory = getStaticField("SPAN_FACTORY"); + originalEvalProcessor = getStaticField("EVAL_PROCESSOR"); + } + + @AfterAll + static void cleanupSpec() throws Exception { + setStaticField("SPAN_FACTORY", originalSpanFactory); + setStaticField("EVAL_PROCESSOR", originalEvalProcessor); + } + + @AfterEach + void cleanup() throws Exception { + setStaticField("SPAN_FACTORY", NoOpLLMObsSpanFactory.INSTANCE); + setStaticField("EVAL_PROCESSOR", NoOpLLMObsEvalProcessor.INSTANCE); + } + + @Test + void testToolCallCreationAndGetters() { + Map arguments = new HashMap<>(); + arguments.put("location", "New York"); + arguments.put("unit", "celsius"); + + LLMObs.ToolCall toolCall = + LLMObs.ToolCall.from("get_weather", "function", "tool-123", arguments); + + assertEquals("get_weather", toolCall.getName()); + assertEquals("function", toolCall.getType()); + assertEquals("tool-123", toolCall.getToolId()); + assertEquals(arguments, toolCall.getArguments()); + } + + @Test + void testToolCallWithNullArguments() { + LLMObs.ToolCall toolCall = LLMObs.ToolCall.from("get_weather", "function", "tool-123", null); + + assertEquals("get_weather", toolCall.getName()); + assertEquals("function", toolCall.getType()); + assertEquals("tool-123", toolCall.getToolId()); + assertNull(toolCall.getArguments()); + } + + @Test + void testLLMMessageCreationWithToolCalls() { + Map args = new HashMap<>(); + args.put("location", "Paris"); + LLMObs.ToolCall toolCall = LLMObs.ToolCall.from("get_weather", "function", "tool-123", args); + List toolCalls = Collections.singletonList(toolCall); + + LLMObs.LLMMessage message = + LLMObs.LLMMessage.from("assistant", "Let me check the weather", toolCalls); + + assertEquals("assistant", message.getRole()); + assertEquals("Let me check the weather", message.getContent()); + assertEquals(toolCalls, message.getToolCalls()); + assertEquals(1, message.getToolCalls().size()); + assertEquals("get_weather", message.getToolCalls().get(0).getName()); + assertEquals("function", message.getToolCalls().get(0).getType()); + assertEquals("tool-123", message.getToolCalls().get(0).getToolId()); + assertEquals(args, message.getToolCalls().get(0).getArguments()); + } + + @Test + void testLLMMessageCreationWithoutToolCalls() { + LLMObs.LLMMessage message = LLMObs.LLMMessage.from("user", "What's the weather like?"); + + assertEquals("user", message.getRole()); + assertEquals("What's the weather like?", message.getContent()); + assertNull(message.getToolCalls()); + } + + @Test + void testLLMMessageWithMultipleToolCalls() { + Map weatherArgs = new HashMap<>(); + weatherArgs.put("location", "New York"); + LLMObs.ToolCall toolCall1 = + LLMObs.ToolCall.from("get_weather", "function", "tool-1", weatherArgs); + + Map stockArgs = new HashMap<>(); + stockArgs.put("symbol", "AAPL"); + LLMObs.ToolCall toolCall2 = + LLMObs.ToolCall.from("get_stock_price", "function", "tool-2", stockArgs); + + List toolCalls = Arrays.asList(toolCall1, toolCall2); + + LLMObs.LLMMessage message = + LLMObs.LLMMessage.from("assistant", "I'll help you with both requests", toolCalls); + + assertEquals("assistant", message.getRole()); + assertEquals("I'll help you with both requests", message.getContent()); + assertEquals(toolCalls, message.getToolCalls()); + assertEquals(2, message.getToolCalls().size()); + assertEquals("get_weather", message.getToolCalls().get(0).getName()); + assertEquals("get_stock_price", message.getToolCalls().get(1).getName()); + } + + @Test + void testDefaultNoOpSpanFactoryBehavior() { + LLMObsSpan llmSpan = LLMObs.startLLMSpan("test", "gpt-4", "openai", "app", "session"); + LLMObsSpan agentSpan = LLMObs.startAgentSpan("test", "app", "session"); + LLMObsSpan toolSpan = LLMObs.startToolSpan("test", "app", "session"); + LLMObsSpan taskSpan = LLMObs.startTaskSpan("test", "app", "session"); + LLMObsSpan workflowSpan = LLMObs.startWorkflowSpan("test", "app", "session"); + LLMObsSpan embeddingSpan = + LLMObs.startEmbeddingSpan("test", "app", "openai", "model", "session"); + LLMObsSpan retrievalSpan = LLMObs.startRetrievalSpan("test", "app", "session"); + + assertSame(NoOpLLMObsSpan.INSTANCE, llmSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, agentSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, toolSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, taskSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, workflowSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, embeddingSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, retrievalSpan); + } + + @Test + void testSpanCreationWithNullOptionalParameters() { + LLMObsSpan llmSpan = LLMObs.startLLMSpan("test", "gpt-4", "openai", null, null); + LLMObsSpan agentSpan = LLMObs.startAgentSpan("test", null, null); + LLMObsSpan toolSpan = LLMObs.startToolSpan("test", null, null); + LLMObsSpan taskSpan = LLMObs.startTaskSpan("test", null, null); + LLMObsSpan workflowSpan = LLMObs.startWorkflowSpan("test", null, null); + LLMObsSpan embeddingSpan = LLMObs.startEmbeddingSpan("test", null, null, null, null); + LLMObsSpan retrievalSpan = LLMObs.startRetrievalSpan("test", null, null); + + assertSame(NoOpLLMObsSpan.INSTANCE, llmSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, agentSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, toolSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, taskSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, workflowSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, embeddingSpan); + assertSame(NoOpLLMObsSpan.INSTANCE, retrievalSpan); + } + + @Test + void testDefaultNoOpEvaluationProcessorBehavior() { + assertDoesNotThrow( + () -> { + Map emptyTags = new HashMap<>(); + LLMObs.SubmitEvaluation(NoOpLLMObsSpan.INSTANCE, "label", 0.5, emptyTags); + LLMObs.SubmitEvaluation(NoOpLLMObsSpan.INSTANCE, "label", 0.5, "app", emptyTags); + LLMObs.SubmitEvaluation(NoOpLLMObsSpan.INSTANCE, "label", "value", emptyTags); + LLMObs.SubmitEvaluation(NoOpLLMObsSpan.INSTANCE, "label", "value", "app", emptyTags); + }); + } + + @Test + void testEvaluationSubmissionWithVariousScoreValues() { + LLMObsSpan span = NoOpLLMObsSpan.INSTANCE; + Map tags = new HashMap<>(); + tags.put("category", "test"); + tags.put("version", "1.0"); + + assertDoesNotThrow( + () -> { + LLMObs.SubmitEvaluation(span, "accuracy", 0.0, tags); + LLMObs.SubmitEvaluation(span, "precision", 1.0, tags); + LLMObs.SubmitEvaluation(span, "recall", 0.85, tags); + LLMObs.SubmitEvaluation(span, "f1_score", 0.92, "myapp", tags); + }); + } + + @Test + void testEvaluationSubmissionWithCategoricalValues() { + LLMObsSpan span = NoOpLLMObsSpan.INSTANCE; + Map tags = new HashMap<>(); + tags.put("evaluator", "human"); + tags.put("context", "production"); + + assertDoesNotThrow( + () -> { + LLMObs.SubmitEvaluation(span, "quality", "excellent", tags); + LLMObs.SubmitEvaluation(span, "relevance", "poor", tags); + LLMObs.SubmitEvaluation(span, "toxicity", "safe", "content-app", tags); + }); + } + + @Test + void testEvaluationSubmissionWithEmptyTags() { + LLMObsSpan span = NoOpLLMObsSpan.INSTANCE; + Map emptyTags = new HashMap<>(); + + assertDoesNotThrow( + () -> { + LLMObs.SubmitEvaluation(span, "score", 0.75, emptyTags); + LLMObs.SubmitEvaluation(span, "category", "good", emptyTags); + }); + } + + @Test + void testSpanCreationWithCustomFactoryReturnsActualSpans() throws Exception { + LLMObs.LLMObsSpanFactory mockFactory = mock(LLMObs.LLMObsSpanFactory.class); + LLMObs.LLMObsEvalProcessor mockEvalProcessor = mock(LLMObs.LLMObsEvalProcessor.class); + LLMObsSpan mockLLMSpan = mock(LLMObsSpan.class); + LLMObsSpan mockAgentSpan = mock(LLMObsSpan.class); + LLMObsSpan mockToolSpan = mock(LLMObsSpan.class); + LLMObsSpan mockTaskSpan = mock(LLMObsSpan.class); + LLMObsSpan mockWorkflowSpan = mock(LLMObsSpan.class); + LLMObsSpan mockEmbeddingSpan = mock(LLMObsSpan.class); + LLMObsSpan mockRetrievalSpan = mock(LLMObsSpan.class); + + when(mockFactory.startLLMSpan("chat-completion", "gpt-4", "openai", "my-app", "session-1")) + .thenReturn(mockLLMSpan); + when(mockFactory.startAgentSpan("agent-task", "my-app", "session-1")).thenReturn(mockAgentSpan); + when(mockFactory.startToolSpan("weather-tool", "my-app", "session-1")).thenReturn(mockToolSpan); + when(mockFactory.startTaskSpan("summarize-task", "my-app", "session-1")) + .thenReturn(mockTaskSpan); + when(mockFactory.startWorkflowSpan("data-workflow", "my-app", "session-1")) + .thenReturn(mockWorkflowSpan); + when(mockFactory.startEmbeddingSpan( + "text-embed", "my-app", "openai", "text-embedding-ada-002", "session-1")) + .thenReturn(mockEmbeddingSpan); + when(mockFactory.startRetrievalSpan("document-retrieval", "my-app", "session-1")) + .thenReturn(mockRetrievalSpan); + + setStaticField("SPAN_FACTORY", mockFactory); + setStaticField("EVAL_PROCESSOR", mockEvalProcessor); + + LLMObsSpan llmSpan = + LLMObs.startLLMSpan("chat-completion", "gpt-4", "openai", "my-app", "session-1"); + LLMObsSpan agentSpan = LLMObs.startAgentSpan("agent-task", "my-app", "session-1"); + LLMObsSpan toolSpan = LLMObs.startToolSpan("weather-tool", "my-app", "session-1"); + LLMObsSpan taskSpan = LLMObs.startTaskSpan("summarize-task", "my-app", "session-1"); + LLMObsSpan workflowSpan = LLMObs.startWorkflowSpan("data-workflow", "my-app", "session-1"); + LLMObsSpan embeddingSpan = + LLMObs.startEmbeddingSpan( + "text-embed", "my-app", "openai", "text-embedding-ada-002", "session-1"); + LLMObsSpan retrievalSpan = + LLMObs.startRetrievalSpan("document-retrieval", "my-app", "session-1"); + + Map scoreTags = new HashMap<>(); + scoreTags.put("test", "value"); + LLMObs.SubmitEvaluation(llmSpan, "accuracy", 0.95, scoreTags); + + Map categoricalTags = new HashMap<>(); + categoricalTags.put("reviewer", "human"); + LLMObs.SubmitEvaluation(agentSpan, "quality", "excellent", "eval-app", categoricalTags); + + assertSame(mockLLMSpan, llmSpan); + assertSame(mockAgentSpan, agentSpan); + assertSame(mockToolSpan, toolSpan); + assertSame(mockTaskSpan, taskSpan); + assertSame(mockWorkflowSpan, workflowSpan); + assertSame(mockEmbeddingSpan, embeddingSpan); + assertSame(mockRetrievalSpan, retrievalSpan); + + assertNotSame(NoOpLLMObsSpan.INSTANCE, llmSpan); + assertNotSame(NoOpLLMObsSpan.INSTANCE, agentSpan); + assertNotSame(NoOpLLMObsSpan.INSTANCE, toolSpan); + assertNotSame(NoOpLLMObsSpan.INSTANCE, taskSpan); + assertNotSame(NoOpLLMObsSpan.INSTANCE, workflowSpan); + assertNotSame(NoOpLLMObsSpan.INSTANCE, embeddingSpan); + assertNotSame(NoOpLLMObsSpan.INSTANCE, retrievalSpan); + + verify(mockEvalProcessor).SubmitEvaluation(mockLLMSpan, "accuracy", 0.95, scoreTags); + verify(mockEvalProcessor) + .SubmitEvaluation(mockAgentSpan, "quality", "excellent", "eval-app", categoricalTags); + } + + @Test + void testSpanCreationWithNullParametersUsingCustomFactory() throws Exception { + LLMObs.LLMObsSpanFactory mockFactory = mock(LLMObs.LLMObsSpanFactory.class); + LLMObsSpan mockSpan = mock(LLMObsSpan.class); + + when(mockFactory.startLLMSpan("test-span", "gpt-4", "openai", null, null)).thenReturn(mockSpan); + when(mockFactory.startEmbeddingSpan("embed-span", null, null, null, null)).thenReturn(mockSpan); + + setStaticField("SPAN_FACTORY", mockFactory); + + LLMObsSpan llmSpan = LLMObs.startLLMSpan("test-span", "gpt-4", "openai", null, null); + LLMObsSpan embeddingSpan = LLMObs.startEmbeddingSpan("embed-span", null, null, null, null); + + assertSame(mockSpan, llmSpan); + assertSame(mockSpan, embeddingSpan); + + verify(mockFactory).startLLMSpan("test-span", "gpt-4", "openai", null, null); + verify(mockFactory).startEmbeddingSpan("embed-span", null, null, null, null); + } + + private static void setStaticField(String fieldName, Object value) throws Exception { + Field field = LLMObs.class.getDeclaredField(fieldName); + field.setAccessible(true); + field.set(null, value); + } + + private static Object getStaticField(String fieldName) throws Exception { + Field field = LLMObs.class.getDeclaredField(fieldName); + field.setAccessible(true); + return field.get(null); + } +}