@@ -98,7 +98,7 @@ pip install crossvector[all]
9898## Quick Start
9999
100100``` python
101- from crossvector import VectorEngine
101+ from crossvector import VectorEngine, VectorDocument
102102from crossvector.embeddings.openai import OpenAIEmbeddingAdapter
103103from crossvector.dbs.astradb import AstraDBAdapter
104104
@@ -110,28 +110,32 @@ engine = VectorEngine(
110110 store_text = True # Optional: Set to False to not store original text
111111)
112112
113- # Create documents from texts with automatic embedding
114- docs = engine.create_from_texts (
113+ # Create documents from texts with automatic embedding (recommended)
114+ docs = engine.upsert_from_texts (
115115 texts = [" The quick brown fox" , " Artificial intelligence" ],
116116 metadatas = [{" category" : " animals" }, {" category" : " tech" }],
117- pks = [None , " doc2" ] # None = auto-generated, or specify custom pk
117+ pks = [" doc1 " , " doc2" ] # Optional: auto-generated if not provided
118118)
119119print (f " Inserted { len (docs)} documents " )
120120
121121# Alternative: Upsert with VectorDocument (if you have embeddings already)
122- from crossvector import VectorDocument
123122vector_docs = [
124- VectorDocument(pk = " doc3" , text = " Python programming" , vector = [0.1 ]* 1536 , metadata = {" category" : " tech" })
123+ VectorDocument(
124+ id = " doc3" ,
125+ text = " Python programming" ,
126+ vector = [0.1 ]* 1536 ,
127+ metadata = {" category" : " tech" }
128+ )
125129]
126130result = engine.upsert(vector_docs)
127131
128132# Search with automatic query embedding
129- results = engine.search(" AI and machine learning" , limit = 5 )
133+ results = engine.search(query = " AI and machine learning" , limit = 5 )
130134for doc in results:
131- print (f " ID: { doc.pk } , Text: { doc.text} " )
135+ print (f " ID: { doc.id } , Text: { doc.text} " )
132136
133137# Search with filters
134- results = engine.search(" python" , limit = 5 , where = {" category" : " tech" })
138+ results = engine.search(query = " python" , limit = 5 , where = {" category" : " tech" })
135139
136140# Get document by ID
137141doc = engine.get(" doc2" )
@@ -260,39 +264,157 @@ adapter.initialize(
260264
261265``` python
262266from crossvector.abc import VectorDBAdapter
263- from typing import Any, Dict, List, Set
267+ from crossvector.schema import VectorDocument
268+ from typing import Any, Dict, List, Set, Optional, Union, Sequence, Tuple
264269
265270class MyCustomDBAdapter (VectorDBAdapter ):
266- def initialize (self , collection_name : str , embedding_dimension : int , metric : str = " cosine" , store_text : bool = True ):
271+ """ Custom vector database adapter implementation."""
272+
273+ # Optional: Set to True if your database uses '$vector' instead of 'vector'
274+ use_dollar_vector: bool = False
275+
276+ def initialize (
277+ self ,
278+ collection_name : str ,
279+ embedding_dimension : int ,
280+ metric : str = " cosine" ,
281+ ** kwargs : Any
282+ ) -> None :
283+ """ Initialize database and ensure collection is ready."""
284+ # Your implementation
285+ pass
286+
287+ def add_collection (
288+ self ,
289+ collection_name : str ,
290+ embedding_dimension : int ,
291+ metric : str = " cosine"
292+ ) -> Any:
293+ """ Create a new collection."""
267294 # Your implementation
268295 pass
269296
270- def get_collection (self , collection_name : str , embedding_dimension : int , metric : str = " cosine" ):
297+ def get_collection (self , collection_name : str ) -> Any:
298+ """ Retrieve an existing collection."""
271299 # Your implementation
272300 pass
273301
274- def upsert (self , documents : List[Dict[str , Any]]):
302+ def get_or_create_collection (
303+ self ,
304+ collection_name : str ,
305+ embedding_dimension : int ,
306+ metric : str = " cosine"
307+ ) -> Any:
308+ """ Get the existing collection, or create it if it doesn't exist."""
275309 # Your implementation
276310 pass
277311
278- def search (self , vector : List[float ], limit : int , fields : Set[str ]) -> List[Dict[str , Any]]:
312+ def drop_collection (self , collection_name : str ) -> bool :
313+ """ Delete a collection and all its documents."""
279314 # Your implementation
280315 pass
281316
282- def get (self , id : str ) -> Dict[str , Any] | None :
317+ def clear_collection (self ) -> int :
318+ """ Delete all documents from current collection."""
283319 # Your implementation
284320 pass
285321
286322 def count (self ) -> int :
323+ """ Count total documents in current collection."""
324+ # Your implementation
325+ pass
326+
327+ def search (
328+ self ,
329+ vector : List[float ],
330+ limit : int ,
331+ offset : int = 0 ,
332+ where : Dict[str , Any] | None = None ,
333+ fields : Set[str ] | None = None ,
334+ ) -> List[VectorDocument]:
335+ """ Perform vector similarity search."""
336+ # Your implementation
337+ # Should return List[VectorDocument]
338+ pass
339+
340+ def get (self , * args , ** kwargs ) -> VectorDocument:
341+ """ Retrieve a single document by primary key."""
287342 # Your implementation
343+ # Should return VectorDocument instance
288344 pass
289345
290- def delete_one (self , id : str ) -> int :
346+ def get_or_create (
347+ self ,
348+ defaults : Optional[Dict[str , Any]] = None ,
349+ ** kwargs
350+ ) -> Tuple[VectorDocument, bool ]:
351+ """ Get document by pk or create if not found."""
291352 # Your implementation
353+ # Should return (VectorDocument, created: bool)
292354 pass
293355
294- def delete_many (self , ids : List[str ]) -> int :
356+ def create (self , ** kwargs : Any) -> VectorDocument:
357+ """ Create and persist a single document."""
295358 # Your implementation
359+ # Should return VectorDocument instance
360+ pass
361+
362+ def bulk_create (
363+ self ,
364+ documents : List[VectorDocument],
365+ batch_size : int = None ,
366+ ignore_conflicts : bool = False ,
367+ update_conflicts : bool = False ,
368+ update_fields : List[str ] = None ,
369+ ) -> List[VectorDocument]:
370+ """ Create multiple documents in batch."""
371+ # Your implementation
372+ # Should return List[VectorDocument]
373+ pass
374+
375+ def delete (self , ids : Union[str , Sequence[str ]]) -> int :
376+ """ Delete document(s) by primary key."""
377+ # Your implementation
378+ # Should return count of deleted documents
379+ pass
380+
381+ def update (self , ** kwargs ) -> VectorDocument:
382+ """ Update existing document by pk."""
383+ # Your implementation
384+ # Should return updated VectorDocument instance
385+ pass
386+
387+ def update_or_create (
388+ self ,
389+ defaults : Optional[Dict[str , Any]] = None ,
390+ create_defaults : Optional[Dict[str , Any]] = None ,
391+ ** kwargs
392+ ) -> Tuple[VectorDocument, bool ]:
393+ """ Update document if exists, otherwise create."""
394+ # Your implementation
395+ # Should return (VectorDocument, created: bool)
396+ pass
397+
398+ def bulk_update (
399+ self ,
400+ documents : List[VectorDocument],
401+ batch_size : int = None ,
402+ ignore_conflicts : bool = False ,
403+ update_fields : List[str ] = None ,
404+ ) -> List[VectorDocument]:
405+ """ Update multiple existing documents by pk in batch."""
406+ # Your implementation
407+ # Should return List[VectorDocument]
408+ pass
409+
410+ def upsert (
411+ self ,
412+ documents : List[VectorDocument],
413+ batch_size : int = None
414+ ) -> List[VectorDocument]:
415+ """ Insert new documents or update existing ones."""
416+ # Your implementation
417+ # Should return List[VectorDocument]
296418 pass
297419```
298420
@@ -322,13 +444,13 @@ CrossVector uses a standardized JSON format across all vector databases. Here's
322444
323445### 1. User Level (Creating Documents)
324446
325- When you create documents, use the ` Document ` class:
447+ When you create documents, use the ` VectorDocument ` class:
326448
327449``` python
328- from crossvector import Document
450+ from crossvector import VectorDocument
329451
330- # Option 1: With explicit ID
331- doc = Document (
452+ # Option 1: With explicit ID (string)
453+ doc = VectorDocument (
332454 id = " my-custom-id" ,
333455 text = " The content of my document" ,
334456 metadata = {
@@ -338,12 +460,18 @@ doc = Document(
338460 }
339461)
340462
341- # Option 2: Auto-generated ID (SHA256 hash of text)
342- doc = Document(
463+ # Option 2: Auto-generated ID (based on PRIMARY_KEY_MODE setting)
464+ # Default mode: 'uuid' - Random UUID
465+ doc = VectorDocument(
343466 text = " Another document without ID" ,
344467 metadata = {" category" : " auto" }
345468)
346- # doc.id will be a 64-character SHA256 hash
469+ # doc.id will be auto-generated based on your PRIMARY_KEY_MODE:
470+ # - 'uuid': Random UUID (32-char hex string)
471+ # - 'hash_text': SHA256 hash of text (64-char hex string)
472+ # - 'hash_vector': SHA256 hash of vector (64-char hex string)
473+ # - 'int64': Sequential integer (as string: "1", "2", "3", ...)
474+ # - 'auto': Hash text if available, else hash vector, else UUID
347475
348476# Timestamps are automatically generated
349477print (doc.created_timestamp) # Unix timestamp: 1732349789.123456
@@ -355,7 +483,7 @@ created_dt = datetime.fromtimestamp(doc.created_timestamp, tz=timezone.utc)
355483print (created_dt) # 2024-11-23 11:16:29.123456+00:00
356484
357485# You can safely use your own created_at/updated_at in metadata!
358- doc_with_article_timestamps = Document (
486+ doc_with_article_timestamps = VectorDocument (
359487 text = " My article content" ,
360488 metadata = {
361489 " title" : " My Article" ,
@@ -371,7 +499,13 @@ doc_with_article_timestamps = Document(
371499
372500** Auto-Generated Fields:**
373501
374- - ` id ` : SHA256 hash of text if not provided
502+ - ` id ` : Auto-generated if not provided, based on ` PRIMARY_KEY_MODE ` setting:
503+ - ` uuid ` (default): Random UUID hex string
504+ - ` hash_text ` : SHA256 hash of text content
505+ - ` hash_vector ` : SHA256 hash of vector
506+ - ` int64 ` : Sequential integer (returned as string)
507+ - ` auto ` : Smart mode - hash text if available, else hash vector, else UUID
508+ - Custom: set ` PRIMARY_KEY_FACTORY ` to supply your own ID generation function
375509- ` created_timestamp ` : Unix timestamp (float) when document was created
376510- ` updated_timestamp ` : Unix timestamp (float), updated on every modification
377511
@@ -509,29 +643,27 @@ AstraDB stores everything at the document root level:
509643
510644### 4. Search Results Format
511645
512- When you call ` search() ` or ` get() ` , results are returned in a unified format :
646+ ` search() ` returns a list of ` VectorDocument ` instances, and ` get() ` returns a single ` VectorDocument ` :
513647
514648``` python
515649# Search results
516- results = engine.search(SearchRequest( query = " example" , limit = 5 ) )
650+ results = engine.search(query = " example" , limit = 5 )
517651
518- # Each result:
519- {
520- " id" : " unique-doc-id" , # Document ID
521- " score" : 0.92 , # Similarity score (lower = more similar for some metrics)
522- " text" : " original text" , # If requested in fields
523- " metadata" : { # Original metadata structure
524- " category" : " example" ,
525- " source" : " manual" ,
526- " tags" : [" important" ]
527- }
528- }
652+ # Each result is a VectorDocument instance with:
653+ for doc in results:
654+ doc.id # Document ID (string)
655+ doc.score # Similarity score (added by search; whether lower or higher means more similar depends on the metric)
656+ doc.text # Original text (if store_text=True and requested in fields)
657+ doc.vector # Embedding vector (if requested in fields)
658+ doc.metadata # Metadata dictionary
659+ doc.created_timestamp # Creation timestamp (float)
660+ doc.updated_timestamp # Last update timestamp (float)
529661```
530662
531663### 5. Example: Complete Flow
532664
533665``` python
534- from crossvector import VectorEngine, Document, UpsertRequest, SearchRequest
666+ from crossvector import VectorEngine, VectorDocument
535667from crossvector.embeddings.openai import OpenAIEmbeddingAdapter
536668from crossvector.dbs.pgvector import PGVectorAdapter
537669
@@ -542,37 +674,42 @@ engine = VectorEngine(
542674 store_text = True
543675)
544676
545- # 1. Create documents (User Level)
677+ # 1. Create documents from texts (User Level - Recommended)
678+ result = engine.upsert_from_texts(
679+ texts = [" Python is a programming language" ],
680+ metadatas = [{" lang" : " en" , " category" : " tech" }]
681+ )
682+
683+ # Alternative: Create VectorDocument directly (if you have embeddings)
546684docs = [
547- Document (
685+ VectorDocument (
548686 text = " Python is a programming language" ,
687+ vector = [0.1 ]* 1536 , # Pre-computed embedding
549688 metadata = {" lang" : " en" , " category" : " tech" }
550689 )
551690]
691+ engine.upsert(docs)
552692
553- # 2. Upsert (Engine Level conversion happens automatically)
554- engine.upsert(UpsertRequest(documents = docs))
555-
556- # 3. Search (Results in unified format)
557- results = engine.search(SearchRequest(
693+ # 2. Search (Results in unified format)
694+ results = engine.search(
558695 query = " programming languages" ,
559696 limit = 5 ,
560697 fields = {" text" , " metadata" } # Specify what to return
561- ))
562-
563- # 4 . Use results
564- for result in results:
565- print (f " ID: { result[ ' id ' ] } " )
566- print (f " Score: { result[ ' score' ] } " )
567- print (f " Text: { result.get( ' text' , ' N/A ' ) } " )
568- print (f " Metadata: { result.get( ' metadata' , {}) } " )
698+ )
699+
700+ # 3 . Use results (VectorDocument instances)
701+ for doc in results:
702+ print (f " ID: { doc.id } " )
703+ print (f " Score: { getattr (doc, ' score' , ' N/A ' ) } " )
704+ print (f " Text: { doc. text} " )
705+ print (f " Metadata: { doc. metadata} " )
569706```
570707
571708### Summary Table
572709
573710| Level | Format | Key Fields | Notes |
574711| -------| --------| -----------| -------|
575- | ** User** | ` Document ` object | ` id ` , ` text ` , ` metadata ` | Pydantic validation, auto-generated ID |
712+ | ** User** | ` VectorDocument ` object | ` id ` , ` text ` , ` vector ` , ` metadata ` | Pydantic validation, auto-generated ID |
576713| ** Engine** | Python dict | ` _id ` , ` vector ` , ` text ` , metadata fields | Standardized across all DBs |
577714| ** PGVector** | SQL row | ` doc_id ` , ` vector ` , ` text ` , ` metadata ` (JSONB) | Text in separate column |
578715| ** Milvus** | JSON document | ` doc_id ` , ` vector ` , ` text ` , ` metadata ` (JSON) | Text in VARCHAR field |
0 commit comments