Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@
- Speed up function _assign_hash_ids by 34% (codeflash)

### Features
- Add support for VoyageAI voyage-4 family models: `voyage-4`, `voyage-4-lite`, `voyage-4-large`

### Fixes
- Bumped dependencies via pip-compile to address the following CVEs:
Expand Down Expand Up @@ -244,6 +245,7 @@
## 0.18.13

### Enhancements
- **Refactoring the VoyageAI integration** to use the `voyageai` package directly, allowing extra features.

### Features

Expand Down
21 changes: 16 additions & 5 deletions test_unstructured/embed/test_voyageai.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,16 @@ def test_get_token_limit(mocker):
config_v2 = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-2")
assert config_v2.get_token_limit() == 320_000

# Test voyage-4 family
config_v4 = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-4")
assert config_v4.get_token_limit() == 320_000

config_v4_lite = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-4-lite")
assert config_v4_lite.get_token_limit() == 1_000_000

config_v4_large = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-4-large")
assert config_v4_large.get_token_limit() == 120_000

# Test unknown model (should use default)
config_unknown = VoyageAIEmbeddingConfig(api_key="api_key", model_name="unknown-model")
assert config_unknown.get_token_limit() == 120_000
Expand All @@ -158,11 +168,12 @@ def test_is_context_model(mocker):
)
assert encoder_context._is_context_model() is True

# Test with regular model
encoder_regular = VoyageAIEmbeddingEncoder(
config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5")
)
assert encoder_regular._is_context_model() is False
# Test with regular models
for model_name in ["voyage-3.5", "voyage-4", "voyage-4-lite", "voyage-4-large"]:
encoder_regular = VoyageAIEmbeddingEncoder(
config=VoyageAIEmbeddingConfig(api_key="api_key", model_name=model_name)
)
assert encoder_regular._is_context_model() is False


def test_build_batches_with_token_limits(mocker):
Expand Down
3 changes: 3 additions & 0 deletions unstructured/embed/voyageai.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@

# Token limits for different VoyageAI models
VOYAGE_TOTAL_TOKEN_LIMITS = {
"voyage-4-lite": 1_000_000,
"voyage-4": 320_000,
"voyage-4-large": 120_000,
"voyage-context-3": 32_000,
"voyage-3.5-lite": 1_000_000,
"voyage-3.5": 320_000,
Expand Down
Loading