diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ecf33d9f0..92e74d90c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -200,6 +200,7 @@ - Speed up function _assign_hash_ids by 34% (codeflash) ### Features +- Add support for VoyageAI voyage-4 family models: `voyage-4`, `voyage-4-lite`, `voyage-4-large` ### Fixes - Bumped dependencies via pip-compile to address the following CVEs: @@ -244,6 +245,7 @@ ## 0.18.13 ### Enhancements +- **Refactor the VoyageAI integration** to use the `voyageai` package directly, enabling additional features. ### Features diff --git a/test_unstructured/embed/test_voyageai.py b/test_unstructured/embed/test_voyageai.py index 1273d9b283..16e0f9332d 100644 --- a/test_unstructured/embed/test_voyageai.py +++ b/test_unstructured/embed/test_voyageai.py @@ -143,6 +143,16 @@ def test_get_token_limit(mocker): config_v2 = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-2") assert config_v2.get_token_limit() == 320_000 + # Test voyage-4 family + config_v4 = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-4") + assert config_v4.get_token_limit() == 320_000 + + config_v4_lite = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-4-lite") + assert config_v4_lite.get_token_limit() == 1_000_000 + + config_v4_large = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-4-large") + assert config_v4_large.get_token_limit() == 120_000 + # Test unknown model (should use default) config_unknown = VoyageAIEmbeddingConfig(api_key="api_key", model_name="unknown-model") assert config_unknown.get_token_limit() == 120_000 @@ -158,11 +168,12 @@ def test_is_context_model(mocker): ) assert encoder_context._is_context_model() is True - # Test with regular model - encoder_regular = VoyageAIEmbeddingEncoder( - config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5") - ) - assert encoder_regular._is_context_model() is False + # Test with regular models + for model_name in ["voyage-3.5", "voyage-4", "voyage-4-lite", 
"voyage-4-large"]: + encoder_regular = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name=model_name) + ) + assert encoder_regular._is_context_model() is False def test_build_batches_with_token_limits(mocker): diff --git a/unstructured/embed/voyageai.py b/unstructured/embed/voyageai.py index 8a496a68a3..56c6023198 100644 --- a/unstructured/embed/voyageai.py +++ b/unstructured/embed/voyageai.py @@ -13,6 +13,9 @@ # Token limits for different VoyageAI models VOYAGE_TOTAL_TOKEN_LIMITS = { + "voyage-4-lite": 1_000_000, + "voyage-4": 320_000, + "voyage-4-large": 120_000, "voyage-context-3": 32_000, "voyage-3.5-lite": 1_000_000, "voyage-3.5": 320_000,