11import os
2+ import warnings
23from typing import TYPE_CHECKING , Any
34
45from pydantic import ConfigDict
1314# ignore that voyageai isn't imported
1415# mypy: disable-error-code="name-defined"
1516
17+ # Sentinel used to detect when model is not explicitly passed to __init__
18+ _MODEL_NOT_SET = object ()
19+
1620# Token limits for VoyageAI models (used for token-aware batching)
1721VOYAGE_TOTAL_TOKEN_LIMITS = {
1822 "voyage-context-3" : 32_000 ,
@@ -128,7 +132,7 @@ class VoyageAIVectorizer(BaseVectorizer):
128132
129133 def __init__ (
130134 self ,
131- model : str = "voyage-3-large" ,
135+ model : str = _MODEL_NOT_SET , # type: ignore[assignment]
132136 api_config : dict [str , Any ] | None = None ,
133137 dtype : str = "float32" ,
134138 cache : "EmbeddingsCache | None" = None ,
@@ -140,6 +144,8 @@ def __init__(
140144
141145 Args:
142146 model (str): Model to use for embedding. Defaults to "voyage-3-large".
147+ The default will be removed in the next major version; please specify
148+ the model explicitly.
143149 api_config (Optional[Dict], optional): Dictionary containing the API key.
144150 Defaults to None.
145151 dtype (str): the default datatype to use when embedding content as byte arrays.
@@ -157,6 +163,16 @@ def __init__(
157163 ffmpeg installed on the system. Image embeddings require pillow to be installed.
158164
159165 """
166+ if model is _MODEL_NOT_SET :
167+ warnings .warn (
168+ "Instantiating VoyageAIVectorizer without an explicit 'model' "
169+ "parameter is deprecated. The default ('voyage-3-large') will be "
170+ "removed in the next major version. Please pass model='voyage-3-large' "
171+ "(or your preferred model) explicitly." ,
172+ DeprecationWarning ,
173+ stacklevel = 2 ,
174+ )
175+ model = "voyage-3-large"
160176 super ().__init__ (model = model , dtype = dtype , cache = cache )
161177 # Initialize client and set up the model
162178 self ._setup (api_config , ** kwargs )
@@ -353,7 +369,9 @@ def _embed_many(
353369 Args:
354370 contents: List of items to embed - each item must be one of str, PIL.Image.Image, or
355371 voyageai.video_utils.Video. Images and video require a multimodal model to be configured.
356- batch_size: Number of items to process in each API call
372+ batch_size: Deprecated. Number of items to process in each API call.
373+ Batch size is now determined automatically based on the model.
374+ This parameter will be removed in the next major version.
357375 **kwargs: Additional parameters to pass to the VoyageAI API
358376
359377 Returns:
@@ -371,8 +389,17 @@ def _embed_many(
371389 # Validate inputs
372390 self ._validate_input (contents , input_type , truncation )
373391
374- # Determine batch size if not provided
375- if batch_size is None :
392+ # Determine batch size - auto-determined based on model; explicit
393+ # batch_size is deprecated.
394+ if batch_size is not None :
395+ warnings .warn (
396+ "The 'batch_size' parameter is deprecated for VoyageAIVectorizer. "
397+ "Batch size is now automatically determined based on the model's "
398+ "token limits. This parameter will be removed in the next major version." ,
399+ DeprecationWarning ,
400+ stacklevel = 2 ,
401+ )
402+ else :
376403 batch_size = self ._get_batch_size ()
377404
378405 try :
@@ -428,7 +455,9 @@ async def _aembed_many(
428455 Args:
429456 contents: List of items to embed - each item must be one of str, PIL.Image.Image, or
430457 voyageai.video_utils.Video. Images and video require a multimodal model to be configured.
431- batch_size: Number of texts to process in each API call
458+ batch_size: Deprecated. Number of items to process in each API call.
459+ Batch size is now determined automatically based on the model.
460+ This parameter will be removed in the next major version.
432461 **kwargs: Additional parameters to pass to the VoyageAI API
433462
434463 Returns:
@@ -446,8 +475,17 @@ async def _aembed_many(
446475 # Validate inputs
447476 self ._validate_input (contents , input_type , truncation )
448477
449- # Determine batch size if not provided
450- if batch_size is None :
478+ # Determine batch size - auto-determined based on model; explicit
479+ # batch_size is deprecated.
480+ if batch_size is not None :
481+ warnings .warn (
482+ "The 'batch_size' parameter is deprecated for VoyageAIVectorizer. "
483+ "Batch size is now automatically determined based on the model's "
484+ "token limits. This parameter will be removed in the next major version." ,
485+ DeprecationWarning ,
486+ stacklevel = 2 ,
487+ )
488+ else :
451489 batch_size = self ._get_batch_size ()
452490
453491 try :
@@ -495,17 +533,20 @@ def _is_context_model(self) -> bool:
495533 """
496534 return "context" in self .model
497535
498- def count_tokens (self , texts : List [str ]) -> List [int ]:
536+ def count_tokens (self , texts : list [str ]) -> list [int ]:
499537 """
500- Count tokens for the given texts using VoyageAI's tokenization API .
538+ Count tokens for the given texts using VoyageAI's local tokenizer .
501539
502- This is useful for managing API usage and optimizing batching strategies.
540+ This method runs entirely on the CPU using the HuggingFace ``tokenizers``
541+ library — it does NOT make any network/API calls. It is safe to call
542+ frequently (e.g., for token-aware batching) without incurring API costs
543+ or latency.
503544
504545 Args:
505546 texts: List of texts to count tokens for.
506547
507548 Returns:
508- List [int]: List of token counts for each text.
549+ list [int]: List of token counts for each text.
509550
510551 Raises:
511552 ValueError: If tokenization fails.
@@ -519,25 +560,27 @@ def count_tokens(self, texts: List[str]) -> List[int]:
519560 return []
520561
521562 try :
563+ # tokenize() is a local CPU operation using HuggingFace tokenizers,
564+ # not a remote API call.
522565 token_lists = self ._client .tokenize (texts , model = self .model )
523566 return [len (token_list ) for token_list in token_lists ]
524567 except Exception as e :
525568 raise ValueError (f"Token counting failed: { e } " )
526569
527- async def acount_tokens (self , texts : List [str ]) -> List [int ]:
570+ async def acount_tokens (self , texts : list [str ]) -> list [int ]:
528571 """
529- Asynchronously count tokens for the given texts using VoyageAI's tokenization API.
530-
531- This is useful for managing API usage and optimizing batching strategies.
572+ Asynchronously count tokens for the given texts using VoyageAI's local tokenizer.
532573
533- Note: The underlying VoyageAI tokenize API is synchronous, so this method
534- provides async compatibility but doesn't offer true async performance benefits.
574+ This method runs entirely on the CPU using the HuggingFace ``tokenizers``
575+ library — it does NOT make any network/API calls. The underlying
576+ tokenize operation is synchronous (CPU-bound), so this async wrapper
577+ provides interface compatibility but does not yield to the event loop.
535578
536579 Args:
537580 texts: List of texts to count tokens for.
538581
539582 Returns:
540- List [int]: List of token counts for each text.
583+ list [int]: List of token counts for each text.
541584
542585 Raises:
543586 ValueError: If tokenization fails.
@@ -551,7 +594,8 @@ async def acount_tokens(self, texts: List[str]) -> List[int]:
551594 return []
552595
553596 try :
554- # Note: VoyageAI's tokenize is synchronous even on AsyncClient
597+ # tokenize() is a local CPU operation (HuggingFace tokenizers),
598+ # not a remote API call. Synchronous even on AsyncClient.
555599 token_lists = self ._aclient .tokenize (texts , model = self .model )
556600 return [len (token_list ) for token_list in token_lists ]
557601 except Exception as e :
0 commit comments