Unstructured-IO · fzowl · Jan 18, 2025 · Jan 28, 2025 · May 21, 2025 · Oct 16, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -200,6 +200,7 @@
 - Speed up function _assign_hash_ids by 34% (codeflash)
 
 ### Features
+- Add support for VoyageAI voyage-4 family models: `voyage-4`, `voyage-4-lite`, `voyage-4-large`
 
 ### Fixes
 - Bumped dependencies via pip-compile to address the following CVEs:
@@ -244,6 +245,7 @@
 ## 0.18.13
 
 ### Enhancements
+- **Refactored the VoyageAI integration** to use the `voyageai` package directly, enabling additional features.
 
 ### Features
 

diff --git a/test_unstructured/embed/test_voyageai.py b/test_unstructured/embed/test_voyageai.py
@@ -143,6 +143,16 @@ def test_get_token_limit(mocker):
     config_v2 = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-2")
     assert config_v2.get_token_limit() == 320_000
 
+    # Test voyage-4 family
+    config_v4 = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-4")
+    assert config_v4.get_token_limit() == 320_000
+
+    config_v4_lite = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-4-lite")
+    assert config_v4_lite.get_token_limit() == 1_000_000
+
+    config_v4_large = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-4-large")
+    assert config_v4_large.get_token_limit() == 120_000
+
     # Test unknown model (should use default)
     config_unknown = VoyageAIEmbeddingConfig(api_key="api_key", model_name="unknown-model")
     assert config_unknown.get_token_limit() == 120_000
@@ -158,11 +168,12 @@ def test_is_context_model(mocker):
     )
     assert encoder_context._is_context_model() is True
 
-    # Test with regular model
-    encoder_regular = VoyageAIEmbeddingEncoder(
-        config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5")
-    )
-    assert encoder_regular._is_context_model() is False
+    # Test with regular models
+    for model_name in ["voyage-3.5", "voyage-4", "voyage-4-lite", "voyage-4-large"]:
+        encoder_regular = VoyageAIEmbeddingEncoder(
+            config=VoyageAIEmbeddingConfig(api_key="api_key", model_name=model_name)
+        )
+        assert encoder_regular._is_context_model() is False
 
 
 def test_build_batches_with_token_limits(mocker):

diff --git a/unstructured/embed/voyageai.py b/unstructured/embed/voyageai.py
@@ -13,6 +13,9 @@
 
 # Token limits for different VoyageAI models
 VOYAGE_TOTAL_TOKEN_LIMITS = {
+    "voyage-4-lite": 1_000_000,
+    "voyage-4": 320_000,
+    "voyage-4-large": 120_000,
     "voyage-context-3": 32_000,
     "voyage-3.5-lite": 1_000_000,
     "voyage-3.5": 320_000,