Be more resilient to Gemini outputting thoughts in API responses

aaronvg · aaronvg · commit 23314b9e458c · 2025-05-06T23:21:15.000-07:00
diff --git a/.github/workflows/build-ruby-release.reusable.yaml b/.github/workflows/build-ruby-release.reusable.yaml
@@ -3,7 +3,7 @@ name: BAML Release - Build Ruby
 on:
   workflow_call: {}
   push:
-    branches: [bump-ruby]
+    branches: [gemini-fix]
 
 permissions:
   contents: read
diff --git a/engine/Cargo.lock b/engine/Cargo.lock
diff --git a/engine/baml-runtime/Cargo.toml b/engine/baml-runtime/Cargo.toml
@@ -172,7 +172,6 @@ notify-debouncer-full = "0.3.1"
 ring = { version = "0.17.4", features = ["std"] }
 tokio = { version = "1", features = ["full"] }
 reqwest.workspace = true
-rustls = "0.23.26"
 walkdir = "2.5.0"
 which = "6.0.3"
 indicatif = "0.17"
diff --git a/engine/baml-runtime/src/internal/llm_client/primitive/google/googleai_client.rs b/engine/baml-runtime/src/internal/llm_client/primitive/google/googleai_client.rs
@@ -359,26 +359,6 @@ impl ToProviderMessage for GoogleAIClient {
     }
 }
 
-/// The Google Gemini 2 model has an experimental feature
-/// called Flash Thinking Mode, which is turned on in a particular
-/// named model: gemini-2.0-flash-thinking-exp-1219
-///
-/// When run in this mode, Gemini returns `candidates` with 2 parts each.
-/// Part 0 is the chain of thought, part 1 is the actual output.
-/// Other Gemini models put the output data in part 0.
-///
-/// TODO: Explicitly represent Flash Thinking Mode response and
-/// do more thorough checking for the content part.
-/// For examples of how to introspect the response more safely, see:
-/// https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_2_0_flash_thinking_mode.ipynb
-fn content_part(model_name: &str) -> usize {
-    if model_name.contains("gemini-2.0-flash-thinking-exp-1219") {
-        1
-    } else {
-        0
-    }
-}
-
 impl CompletionToProviderBody for GoogleAIClient {
     fn completion_to_provider_body(
         &self,
diff --git a/engine/baml-runtime/src/internal/llm_client/primitive/google/response_handler.rs b/engine/baml-runtime/src/internal/llm_client/primitive/google/response_handler.rs
@@ -1,6 +1,6 @@
 use anyhow::Result;
 
-use super::types::GoogleResponse;
+use super::types::{GoogleResponse, Part};
 use crate::internal::llm_client::{
     primitive::request::RequestBuilder, traits::WithClient, ErrorCode, LLMCompleteResponse,
     LLMCompleteResponseMetadata, LLMErrorResponse, LLMResponse,
@@ -26,6 +26,7 @@ pub fn parse_google_response<C: WithClient + RequestBuilder>(
     instant_now: web_time::Instant,
     model_name: Option<String>,
 ) -> LLMResponse {
+    // baml_log::info!("Parsing Google response: {:#?}", response_body);
     let response = match GoogleResponse::deserialize(&response_body)
         .context(format!(
             "Failed to parse into a response accepted by {}: {}",
@@ -78,11 +79,11 @@ pub fn parse_google_response<C: WithClient + RequestBuilder>(
     };
 
     let model_name = model_name.unwrap_or("<unknown>".to_string());
-    let part_index = content_part(&model_name);
+    let text_content = text_content_part(&content.parts);
     LLMResponse::Success(LLMCompleteResponse {
         client: client.context().name.to_string(),
         prompt: to_prompt(prompt),
-        content: content.parts[part_index].text.clone(),
+        content: text_content.unwrap_or_default(),
         start_time: system_now,
         latency: instant_now.elapsed(),
         request_options: client.request_options().clone(),
@@ -101,12 +102,11 @@ pub fn parse_google_response<C: WithClient + RequestBuilder>(
     })
 }
 
-fn content_part(model_name: &str) -> usize {
-    if model_name.contains("gemini-2.0-flash-thinking-exp-1219") {
-        1
-    } else {
-        0
-    }
+/// Returns the text of the first part that is not flagged as a thought and
+/// has non-empty text; a `thought` of `None` is treated as "not a thought".
+fn text_content_part(parts: &[Part]) -> Option<String> {
+    let mut answers = parts.iter().filter(|part| !part.thought.unwrap_or(false));
+    answers.find(|part| !part.text.is_empty()).map(|part| part.text.clone())
 }
 
 pub fn scan_google_response_stream(
@@ -145,18 +145,13 @@ pub fn scan_google_response_stream(
         Err(e) => return Err(e),
     };
     if let Some(choice) = event.candidates.get(0) {
-        let part_index = content_part(
-            model_name
-                .as_ref()
-                .map(|s| s.as_str())
-                .unwrap_or("<unknown>"),
-        );
-        if let Some(content) = choice
+        let text_content = &choice
             .content
             .as_ref()
-            .and_then(|c| c.parts.get(part_index))
-        {
-            inner.content += &content.text;
+            .and_then(|c| text_content_part(&c.parts));
+
+        if let Some(text_content) = text_content {
+            inner.content += &text_content;
         }
         inner.metadata.finish_reason = choice.finish_reason.as_ref().map(|r| r.to_string());
         if choice
@@ -235,6 +230,7 @@ mod tests {
                         function_call: None,
                         function_response: None,
                         video_metadata: None,
+                        thought: None,
                     }],
                     role: Some("model".to_string()),
                 }),
diff --git a/engine/baml-runtime/src/internal/llm_client/primitive/google/types.rs b/engine/baml-runtime/src/internal/llm_client/primitive/google/types.rs
@@ -245,6 +245,7 @@ pub struct Part {
     pub function_call: Option<FunctionCall>,
     pub function_response: Option<FunctionResponse>,
     pub video_metadata: Option<VideoMetadata>,
+    pub thought: Option<bool>,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
diff --git a/engine/baml-runtime/src/lib.rs b/engine/baml-runtime/src/lib.rs
@@ -115,16 +115,16 @@ static TOKIO_SINGLETON: OnceLock<std::io::Result<Arc<tokio::runtime::Runtime>>>
 
 static INIT: std::sync::Once = std::sync::Once::new();
 
-fn setup_crypto_provider() {
-    #[cfg(not(target_arch = "wasm32"))]
-    {
-        use rustls::crypto::CryptoProvider;
-        INIT.call_once(|| {
-            let provider = rustls::crypto::ring::default_provider();
-            CryptoProvider::install_default(provider).expect("failed to install CryptoProvider");
-        });
-    }
-}
+// fn setup_crypto_provider() {
+//     #[cfg(not(target_arch = "wasm32"))]
+//     {
+//         use rustls::crypto::CryptoProvider;
+//         INIT.call_once(|| {
+//             let provider = rustls::crypto::ring::default_provider();
+//             CryptoProvider::install_default(provider).expect("failed to install CryptoProvider");
+//         });
+//     }
+// }
 
 #[derive(Clone)]
 pub struct BamlRuntime {
@@ -187,7 +187,7 @@ impl BamlRuntime {
         path: &std::path::Path,
         env_vars: HashMap<T, T>,
     ) -> Result<Self> {
-        setup_crypto_provider();
+        // setup_crypto_provider();
         let path = Self::parse_baml_src_path(path)?;
 
         let copy = env_vars
@@ -210,7 +210,7 @@ impl BamlRuntime {
         files: &HashMap<T, T>,
         env_vars: HashMap<U, U>,
     ) -> Result<Self> {
-        setup_crypto_provider();
+        // setup_crypto_provider();
         let copy = env_vars
             .iter()
             .map(|(k, v)| (k.as_ref().to_string(), v.as_ref().to_string()))
diff --git a/integ-tests/python/tests/test_functions.py b/integ-tests/python/tests/test_functions.py
@@ -792,6 +792,56 @@ async def test_streaming_gemini():
     assert msgs[-1] == final, "Expected last stream message to match final response."
 
 
+@pytest.mark.asyncio
+async def test_gemini_models():
+    client_registry = baml_py.ClientRegistry()
+    # # Test with gemini-1.5-flash-thinking-exp-1219
+    # client_registry.add_llm_client(
+    #     "MyCustomGeminiClient",
+    #     "google-ai",
+    #     {"model": "gemini-1.5-flash-thinking-exp-1219"},
+    # )
+    # client_registry.set_primary("MyCustomGeminiClient")
+    # res = await b.TestGemini(
+    #     input="Dr.Pepper", baml_options={"client_registry": client_registry}
+    # )
+    # assert len(res) > 0, "Expected non-empty result but got empty."
+
+    # Test with gemini-2.5-pro-preview-05-06
+    # client_registry.add_llm_client(
+    #     "Gemini25ProMay", "google-ai", {"model": "gemini-2.5-pro-preview-05-06"}
+    # )
+    # client_registry.set_primary("Gemini25ProMay")
+    # res = await b.TestGemini(
+    #     input="sea. Actually output the multiplication of 23*12/12+3 and take square root of 10.",
+    #     baml_options={"client_registry": client_registry},
+    # )
+    # assert len(res) > 0, "Expected non-empty result but got empty."
+
+    # Test with gemini-2.5-pro-preview-03-25
+    # client_registry.add_llm_client(
+    #     "Gemini25ProMarch", "google-ai", {"model": "gemini-2.5-pro-preview-03-25"}
+    # )
+    # client_registry.set_primary("Gemini25ProMarch")
+    # res = await b.TestGemini(
+    #     input="sea. Actually just output a json object with the keys 'name' and 'age'.",
+    #     baml_options={"client_registry": client_registry},
+    # )
+    # assert len(res) > 0, "Expected non-empty result but got empty."
+
+    # Test with gemini-2.0-flash-thinking-exp-1219
+    client_registry.add_llm_client(
+        "GeminiFlashThinking",
+        "google-ai",
+        {"model": "gemini-2.0-flash-thinking-exp-1219"},
+    )
+    client_registry.set_primary("GeminiFlashThinking")
+    res = await b.TestGemini(
+        input="sea", baml_options={"client_registry": client_registry}
+    )
+    assert len(res) > 0, "Expected non-empty result but got empty."
+
+
 @pytest.mark.asyncio
 async def test_tracing_async_only():
     @trace

Original file line number	Diff line number	Diff line change
`@@ -245,6 +245,7 @@ pub struct Part {`
`245`	`245`	`pub function_call: Option<FunctionCall>,`
`246`	`246`	`pub function_response: Option<FunctionResponse>,`
`247`	`247`	`pub video_metadata: Option<VideoMetadata>,`
	`248`	`+ pub thought: Option<bool>,`
`248`	`249`	`}`
`249`	`250`
`250`	`251`	`#[derive(Serialize, Deserialize, Debug)]`