Fix moderation text extraction after format transforms

root · root · commit 6efb82b4775a · 2026-04-23T19:05:28.000+08:00
diff --git a/src/format.rs b/src/format.rs
@@ -34,6 +34,7 @@ impl RequestFormat {
 pub struct RequestPlan {
     pub source_format: Option<RequestFormat>,
     pub target_format: Option<RequestFormat>,
+    pub moderation_text: Option<String>,
     pub body: Value,
     pub path: String,
     pub stream: bool,
@@ -96,6 +97,7 @@ pub fn process_request(
     let mut plan = RequestPlan {
         source_format: None,
         target_format: None,
+        moderation_text: None,
         stream: body.get("stream").and_then(Value::as_bool).unwrap_or(false),
         body,
         path: path.to_string(),
@@ -187,6 +189,7 @@ pub fn process_request(
     plan.stream = internal.stream;
     plan.source_format = Some(source);
     plan.target_format = Some(target);
+    plan.moderation_text = Some(moderation_text_from_internal_request(&internal));
 
     if target != source || disable_tools {
         plan.body = emit_request(target, &internal)
@@ -199,6 +202,40 @@ pub fn process_request(
     Ok(plan)
 }
 
+fn moderation_text_from_internal_request(req: &InternalRequest) -> String { // builds the string stored in `plan.moderation_text`
+    let mut texts = Vec::new(); // non-empty fragments, kept in message/block order
+    for message in &req.messages { // NOTE(review): only `messages` is read — confirm no prompt text lives elsewhere on InternalRequest
+        for block in &message.content {
+            match block {
+                InternalContentBlock::Text(text) => push_non_empty_text(text, &mut texts),
+                InternalContentBlock::ToolResult { output, .. } => {
+                    collect_moderation_value_text(output, &mut texts); // `output` is a `Value`; only its strings are kept
+                }
+                InternalContentBlock::ToolCall { .. } | InternalContentBlock::ImageUrl { .. } => {} // these contribute no text
+            }
+        }
+    }
+    texts.join("\n") // one fragment per line; no fragments yields ""
+}
+
+fn collect_moderation_value_text(value: &Value, texts: &mut Vec<String>) { // recursive walk that appends strings to `texts`
+    match value {
+        Value::String(text) => push_non_empty_text(text, texts),
+        Value::Array(items) => {
+            for item in items {
+                collect_moderation_value_text(item, texts); // recurse so nested arrays are flattened in order
+            }
+        }
+        _ => {} // NOTE(review): objects are skipped too, so strings nested in an object (e.g. a `text` field) are not collected — confirm intended
+    }
+}
+
+fn push_non_empty_text(text: &str, texts: &mut Vec<String>) { // appends an owned copy of `text` unless it is empty
+    if !text.is_empty() { // whitespace-only text is not empty and is still pushed
+        texts.push(text.to_string());
+    }
+}
+
 #[cfg_attr(not(test), allow(dead_code))]
 fn detect_format(from_cfg: Option<&Value>, path: &str, headers: &[(String, String)], body: &Value) -> Option<RequestFormat> {
     detect_formats_from_candidates(&configured_candidates(from_cfg), path, headers, body).into_iter().next()
diff --git a/src/proxy.rs b/src/proxy.rs
@@ -134,8 +134,9 @@ async fn proxy_entry_with_cfg(
             .or_else(|| detect_source_format(&path, request_json.as_ref()))
             .unwrap_or("openai_chat")
             .to_string();
-        let moderation_text =
-            extract::extract_text_for_moderation(&request_plan.body, moderation_format.as_str());
+        let moderation_text = request_plan.moderation_text.clone().unwrap_or_else(|| {
+            extract::extract_text_for_moderation(&request_plan.body, moderation_format.as_str())
+        });
         Some((moderation_format, moderation_text))
     } else {
         None
diff --git a/tests/format_process_tests.rs b/tests/format_process_tests.rs
@@ -34,11 +34,60 @@ fn detects_openai_chat_and_rewrites_path_for_responses_target() {
     assert_eq!(plan.target_format, Some(RequestFormat::OpenAiResponses));
     assert!(plan.stream);
     assert_eq!(plan.path, "/proxy/openai/v1/responses");
+    assert_eq!(plan.moderation_text.as_deref(), Some("Be terse.\nPing"));
     assert_eq!(plan.body["instructions"], "Be terse.");
     assert_eq!(plan.body["input"][0]["role"], "user");
     assert_eq!(plan.body["input"][0]["content"][0]["text"], "Ping");
 }
 
+#[test] // chat -> responses: the system prompt must still appear in the moderation text
+fn preserves_moderation_text_when_chat_transforms_into_responses_instructions() {
+    let plan = process_request(
+        &transform_config(true, "openai_responses"), // helper defined outside this diff; presumably selects the target format asserted below
+        "/v1/chat/completions",
+        &[],
+        json!({
+            "model": "gpt-4.1-mini",
+            "stream": false,
+            "messages": [
+                {"role": "system", "content": "forbidden system text"},
+                {"role": "user", "content": "safe user text"}
+            ]
+        }),
+    )
+    .expect("openai chat request should transform");
+
+    assert_eq!(plan.source_format, Some(RequestFormat::OpenAiChat));
+    assert_eq!(plan.target_format, Some(RequestFormat::OpenAiResponses));
+    assert_eq!( // system text first, then user text, separated by a newline
+        plan.moderation_text.as_deref(),
+        Some("forbidden system text\nsafe user text")
+    );
+    assert_eq!(plan.body["instructions"], "forbidden system text"); // the emitted body carries the system text under `instructions`
+}
+
+#[test] // responses -> claude: top-level `instructions` and string `input` must both be moderated
+fn preserves_moderation_text_for_native_openai_responses_requests() {
+    let plan = process_request(
+        &transform_config(true, "claude_chat"), // helper defined outside this diff; presumably selects the target format asserted below
+        "/v1/responses",
+        &[],
+        json!({
+            "model": "gpt-4.1-mini",
+            "instructions": "forbidden instruction text",
+            "input": "safe user text"
+        }),
+    )
+    .expect("responses request should transform");
+
+    assert_eq!(plan.source_format, Some(RequestFormat::OpenAiResponses));
+    assert_eq!(plan.target_format, Some(RequestFormat::ClaudeChat));
+    assert_eq!( // instruction text first, then the input text, separated by a newline
+        plan.moderation_text.as_deref(),
+        Some("forbidden instruction text\nsafe user text")
+    );
+}
+
 #[test]
 fn detects_claude_chat_from_headers_and_rewrites_path_for_openai_chat_target() {
     let plan = process_request(
diff --git a/tests/format_runtime.rs b/tests/format_runtime.rs
@@ -36,6 +36,7 @@ impl RequestFormat {
 pub struct RequestPlan {
     pub source_format: Option<RequestFormat>,
     pub target_format: Option<RequestFormat>,
+    pub moderation_text: Option<String>,
     pub body: Value,
     pub path: String,
     pub stream: bool,
@@ -98,6 +99,7 @@ pub fn process_request(
     let mut plan = RequestPlan {
         source_format: None,
         target_format: None,
+        moderation_text: None,
         stream: body.get("stream").and_then(Value::as_bool).unwrap_or(false),
         body,
         path: path.to_string(),
@@ -190,6 +192,7 @@ pub fn process_request(
     plan.stream = internal.stream;
     plan.source_format = Some(source);
     plan.target_format = Some(target);
+    plan.moderation_text = Some(moderation_text_from_internal_request(&internal));
 
     if target != source || disable_tools {
         plan.body = emit_request(target, &internal)
@@ -202,6 +205,40 @@ pub fn process_request(
     Ok(plan)
 }
 
+fn moderation_text_from_internal_request(req: &InternalRequest) -> String { // builds the string stored in `plan.moderation_text`
+    let mut texts = Vec::new(); // non-empty fragments, kept in message/block order
+    for message in &req.messages { // NOTE(review): only `messages` is read — confirm no prompt text lives elsewhere on InternalRequest
+        for block in &message.content {
+            match block {
+                InternalContentBlock::Text(text) => push_non_empty_text(text, &mut texts),
+                InternalContentBlock::ToolResult { output, .. } => {
+                    collect_moderation_value_text(output, &mut texts); // `output` is a `Value`; only its strings are kept
+                }
+                InternalContentBlock::ToolCall { .. } | InternalContentBlock::ImageUrl { .. } => {} // these contribute no text
+            }
+        }
+    }
+    texts.join("\n") // one fragment per line; no fragments yields ""
+}
+
+fn collect_moderation_value_text(value: &Value, texts: &mut Vec<String>) { // recursive walk that appends strings to `texts`
+    match value {
+        Value::String(text) => push_non_empty_text(text, texts),
+        Value::Array(items) => {
+            for item in items {
+                collect_moderation_value_text(item, texts); // recurse so nested arrays are flattened in order
+            }
+        }
+        _ => {} // NOTE(review): objects are skipped too, so strings nested in an object (e.g. a `text` field) are not collected — confirm intended
+    }
+}
+
+fn push_non_empty_text(text: &str, texts: &mut Vec<String>) { // appends an owned copy of `text` unless it is empty
+    if !text.is_empty() { // whitespace-only text is not empty and is still pushed
+        texts.push(text.to_string());
+    }
+}
+
 fn detect_format(from_cfg: Option<&Value>, path: &str, headers: &[(String, String)], body: &Value) -> Option<RequestFormat> {
     detect_formats_from_candidates(&configured_candidates(from_cfg), path, headers, body)
         .into_iter()