Skip to content

Commit f821d51

Browse files
committed
refactor: rename VectorDocument, update API, reorganize utils
- Rename VectorDocument class (backward compat alias maintained)
- Remove SearchRequest/UpsertRequest wrappers - use direct method calls
- Add private _vector attribute with emb property
- Move generate_pk and helpers from schema to utils
- Reorganize utils.py into logical sections
- Update all docs to reflect new API and PK generation modes
- Fix integration tests to use new engine methods
- Delete obsolete test_schema.py
1 parent b2a5789 commit f821d51

16 files changed

Lines changed: 523 additions & 702 deletions

File tree

README.md

Lines changed: 193 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ pip install crossvector[all]
9898
## Quick Start
9999

100100
```python
101-
from crossvector import VectorEngine
101+
from crossvector import VectorEngine, VectorDocument
102102
from crossvector.embeddings.openai import OpenAIEmbeddingAdapter
103103
from crossvector.dbs.astradb import AstraDBAdapter
104104

@@ -110,28 +110,32 @@ engine = VectorEngine(
110110
store_text=True # Optional: Set to False to not store original text
111111
)
112112

113-
# Create documents from texts with automatic embedding
114-
docs = engine.create_from_texts(
113+
# Create documents from texts with automatic embedding (recommended)
114+
docs = engine.upsert_from_texts(
115115
texts=["The quick brown fox", "Artificial intelligence"],
116116
metadatas=[{"category": "animals"}, {"category": "tech"}],
117-
pks=[None, "doc2"] # None = auto-generated, or specify custom pk
117+
pks=["doc1", "doc2"] # Optional: auto-generated if not provided
118118
)
119119
print(f"Inserted {len(docs)} documents")
120120

121121
# Alternative: Upsert with VectorDocument (if you have embeddings already)
122-
from crossvector import VectorDocument
123122
vector_docs = [
124-
VectorDocument(pk="doc3", text="Python programming", vector=[0.1]*1536, metadata={"category": "tech"})
123+
VectorDocument(
124+
id="doc3",
125+
text="Python programming",
126+
vector=[0.1]*1536,
127+
metadata={"category": "tech"}
128+
)
125129
]
126130
result = engine.upsert(vector_docs)
127131

128132
# Search with automatic query embedding
129-
results = engine.search("AI and machine learning", limit=5)
133+
results = engine.search(query="AI and machine learning", limit=5)
130134
for doc in results:
131-
print(f"ID: {doc.pk}, Text: {doc.text}")
135+
print(f"ID: {doc.id}, Text: {doc.text}")
132136

133137
# Search with filters
134-
results = engine.search("python", limit=5, where={"category": "tech"})
138+
results = engine.search(query="python", limit=5, where={"category": "tech"})
135139

136140
# Get document by ID
137141
doc = engine.get("doc2")
@@ -260,39 +264,157 @@ adapter.initialize(
260264

261265
```python
262266
from crossvector.abc import VectorDBAdapter
263-
from typing import Any, Dict, List, Set
267+
from crossvector.schema import VectorDocument
268+
from typing import Any, Dict, List, Set, Optional, Union, Sequence, Tuple
264269

265270
class MyCustomDBAdapter(VectorDBAdapter):
266-
def initialize(self, collection_name: str, embedding_dimension: int, metric: str = "cosine", store_text: bool = True):
271+
"""Custom vector database adapter implementation."""
272+
273+
# Optional: Set to True if your database uses '$vector' instead of 'vector'
274+
use_dollar_vector: bool = False
275+
276+
def initialize(
277+
self,
278+
collection_name: str,
279+
embedding_dimension: int,
280+
metric: str = "cosine",
281+
**kwargs: Any
282+
) -> None:
283+
"""Initialize database and ensure collection is ready."""
284+
# Your implementation
285+
pass
286+
287+
def add_collection(
288+
self,
289+
collection_name: str,
290+
embedding_dimension: int,
291+
metric: str = "cosine"
292+
) -> Any:
293+
"""Create a new collection."""
267294
# Your implementation
268295
pass
269296

270-
def get_collection(self, collection_name: str, embedding_dimension: int, metric: str = "cosine"):
297+
def get_collection(self, collection_name: str) -> Any:
298+
"""Retrieve an existing collection."""
271299
# Your implementation
272300
pass
273301

274-
def upsert(self, documents: List[Dict[str, Any]]):
302+
def get_or_create_collection(
303+
self,
304+
collection_name: str,
305+
embedding_dimension: int,
306+
metric: str = "cosine"
307+
) -> Any:
308+
"""Get the existing collection, or create it if it doesn't exist."""
275309
# Your implementation
276310
pass
277311

278-
def search(self, vector: List[float], limit: int, fields: Set[str]) -> List[Dict[str, Any]]:
312+
def drop_collection(self, collection_name: str) -> bool:
313+
"""Delete a collection and all its documents."""
279314
# Your implementation
280315
pass
281316

282-
def get(self, id: str) -> Dict[str, Any] | None:
317+
def clear_collection(self) -> int:
318+
"""Delete all documents from current collection."""
283319
# Your implementation
284320
pass
285321

286322
def count(self) -> int:
323+
"""Count total documents in current collection."""
324+
# Your implementation
325+
pass
326+
327+
def search(
328+
self,
329+
vector: List[float],
330+
limit: int,
331+
offset: int = 0,
332+
where: Dict[str, Any] | None = None,
333+
fields: Set[str] | None = None,
334+
) -> List[VectorDocument]:
335+
"""Perform vector similarity search."""
336+
# Your implementation
337+
# Should return List[VectorDocument]
338+
pass
339+
340+
def get(self, *args, **kwargs) -> VectorDocument:
341+
"""Retrieve a single document by primary key."""
287342
# Your implementation
343+
# Should return VectorDocument instance
288344
pass
289345

290-
def delete_one(self, id: str) -> int:
346+
def get_or_create(
347+
self,
348+
defaults: Optional[Dict[str, Any]] = None,
349+
**kwargs
350+
) -> Tuple[VectorDocument, bool]:
351+
"""Get document by pk or create if not found."""
291352
# Your implementation
353+
# Should return (VectorDocument, created: bool)
292354
pass
293355

294-
def delete_many(self, ids: List[str]) -> int:
356+
def create(self, **kwargs: Any) -> VectorDocument:
357+
"""Create and persist a single document."""
295358
# Your implementation
359+
# Should return VectorDocument instance
360+
pass
361+
362+
def bulk_create(
363+
self,
364+
documents: List[VectorDocument],
365+
batch_size: int = None,
366+
ignore_conflicts: bool = False,
367+
update_conflicts: bool = False,
368+
update_fields: List[str] = None,
369+
) -> List[VectorDocument]:
370+
"""Create multiple documents in batch."""
371+
# Your implementation
372+
# Should return List[VectorDocument]
373+
pass
374+
375+
def delete(self, ids: Union[str, Sequence[str]]) -> int:
376+
"""Delete document(s) by primary key."""
377+
# Your implementation
378+
# Should return count of deleted documents
379+
pass
380+
381+
def update(self, **kwargs) -> VectorDocument:
382+
"""Update existing document by pk."""
383+
# Your implementation
384+
# Should return updated VectorDocument instance
385+
pass
386+
387+
def update_or_create(
388+
self,
389+
defaults: Optional[Dict[str, Any]] = None,
390+
create_defaults: Optional[Dict[str, Any]] = None,
391+
**kwargs
392+
) -> Tuple[VectorDocument, bool]:
393+
"""Update document if exists, otherwise create."""
394+
# Your implementation
395+
# Should return (VectorDocument, created: bool)
396+
pass
397+
398+
def bulk_update(
399+
self,
400+
documents: List[VectorDocument],
401+
batch_size: int = None,
402+
ignore_conflicts: bool = False,
403+
update_fields: List[str] = None,
404+
) -> List[VectorDocument]:
405+
"""Update multiple existing documents by pk in batch."""
406+
# Your implementation
407+
# Should return List[VectorDocument]
408+
pass
409+
410+
def upsert(
411+
self,
412+
documents: List[VectorDocument],
413+
batch_size: int = None
414+
) -> List[VectorDocument]:
415+
"""Insert new documents or update existing ones."""
416+
# Your implementation
417+
# Should return List[VectorDocument]
296418
pass
297419
```
298420

@@ -322,13 +444,13 @@ CrossVector uses a standardized JSON format across all vector databases. Here's
322444

323445
### 1. User Level (Creating Documents)
324446

325-
When you create documents, use the `Document` class:
447+
When you create documents, use the `VectorDocument` class:
326448

327449
```python
328-
from crossvector import Document
450+
from crossvector import VectorDocument
329451

330-
# Option 1: With explicit ID
331-
doc = Document(
452+
# Option 1: With explicit ID (string)
453+
doc = VectorDocument(
332454
id="my-custom-id",
333455
text="The content of my document",
334456
metadata={
@@ -338,12 +460,18 @@ doc = Document(
338460
}
339461
)
340462

341-
# Option 2: Auto-generated ID (SHA256 hash of text)
342-
doc = Document(
463+
# Option 2: Auto-generated ID (based on PRIMARY_KEY_MODE setting)
464+
# Default mode: 'uuid' - Random UUID
465+
doc = VectorDocument(
343466
text="Another document without ID",
344467
metadata={"category": "auto"}
345468
)
346-
# doc.id will be a 64-character SHA256 hash
469+
# doc.id will be auto-generated based on your PRIMARY_KEY_MODE:
470+
# - 'uuid': Random UUID (32-char hex string)
471+
# - 'hash_text': SHA256 hash of text (64-char hex string)
472+
# - 'hash_vector': SHA256 hash of vector (64-char hex string)
473+
# - 'int64': Sequential integer (as string: "1", "2", "3", ...)
474+
# - 'auto': Hash text if available, else hash vector, else UUID
347475

348476
# Timestamps are automatically generated
349477
print(doc.created_timestamp) # Unix timestamp: 1732349789.123456
@@ -355,7 +483,7 @@ created_dt = datetime.fromtimestamp(doc.created_timestamp, tz=timezone.utc)
355483
print(created_dt) # 2024-11-23 11:16:29.123456+00:00
356484

357485
# You can safely use your own created_at/updated_at in metadata!
358-
doc_with_article_timestamps = Document(
486+
doc_with_article_timestamps = VectorDocument(
359487
text="My article content",
360488
metadata={
361489
"title": "My Article",
@@ -371,7 +499,13 @@ doc_with_article_timestamps = Document(
371499

372500
**Auto-Generated Fields:**
373501

374-
- `id`: SHA256 hash of text if not provided
502+
- `id`: Auto-generated if not provided, based on `PRIMARY_KEY_MODE` setting:
503+
- `uuid` (default): Random UUID hex string
504+
- `hash_text`: SHA256 hash of text content
505+
- `hash_vector`: SHA256 hash of vector
506+
- `int64`: Sequential integer (returned as string)
507+
- `auto`: Smart mode - hash text if available, else hash vector, else UUID
508+
- Custom: Set `PRIMARY_KEY_FACTORY` to supply your own ID generation function
375509
- `created_timestamp`: Unix timestamp (float) when document was created
376510
- `updated_timestamp`: Unix timestamp (float), updated on every modification
377511

@@ -509,29 +643,27 @@ AstraDB stores everything at the document root level:
509643

510644
### 4. Search Results Format
511645

512-
When you call `search()` or `get()`, results are returned in a unified format:
646+
When you call `search()` or `get()`, results are returned as `VectorDocument` instances:
513647

514648
```python
515649
# Search results
516-
results = engine.search(SearchRequest(query="example", limit=5))
650+
results = engine.search(query="example", limit=5)
517651

518-
# Each result:
519-
{
520-
"id": "unique-doc-id", # Document ID
521-
"score": 0.92, # Similarity score (lower = more similar for some metrics)
522-
"text": "original text", # If requested in fields
523-
"metadata": { # Original metadata structure
524-
"category": "example",
525-
"source": "manual",
526-
"tags": ["important"]
527-
}
528-
}
652+
# Each result is a VectorDocument instance with:
653+
for doc in results:
654+
doc.id # Document ID (string)
655+
doc.score # Similarity score (set by search; for some metrics, lower = more similar)
656+
doc.text # Original text (if store_text=True and requested in fields)
657+
doc.vector # Embedding vector (if requested in fields)
658+
doc.metadata # Metadata dictionary
659+
doc.created_timestamp # Creation timestamp (float)
660+
doc.updated_timestamp # Last update timestamp (float)
529661
```
530662

531663
### 5. Example: Complete Flow
532664

533665
```python
534-
from crossvector import VectorEngine, Document, UpsertRequest, SearchRequest
666+
from crossvector import VectorEngine, VectorDocument
535667
from crossvector.embeddings.openai import OpenAIEmbeddingAdapter
536668
from crossvector.dbs.pgvector import PGVectorAdapter
537669

@@ -542,37 +674,42 @@ engine = VectorEngine(
542674
store_text=True
543675
)
544676

545-
# 1. Create documents (User Level)
677+
# 1. Create documents from texts (User Level - Recommended)
678+
result = engine.upsert_from_texts(
679+
texts=["Python is a programming language"],
680+
metadatas=[{"lang": "en", "category": "tech"}]
681+
)
682+
683+
# Alternative: Create VectorDocument directly (if you have embeddings)
546684
docs = [
547-
Document(
685+
VectorDocument(
548686
text="Python is a programming language",
687+
vector=[0.1]*1536, # Pre-computed embedding
549688
metadata={"lang": "en", "category": "tech"}
550689
)
551690
]
691+
engine.upsert(docs)
552692

553-
# 2. Upsert (Engine Level conversion happens automatically)
554-
engine.upsert(UpsertRequest(documents=docs))
555-
556-
# 3. Search (Results in unified format)
557-
results = engine.search(SearchRequest(
693+
# 2. Search (Results in unified format)
694+
results = engine.search(
558695
query="programming languages",
559696
limit=5,
560697
fields={"text", "metadata"} # Specify what to return
561-
))
562-
563-
# 4. Use results
564-
for result in results:
565-
print(f"ID: {result['id']}")
566-
print(f"Score: {result['score']}")
567-
print(f"Text: {result.get('text', 'N/A')}")
568-
print(f"Metadata: {result.get('metadata', {})}")
698+
)
699+
700+
# 3. Use results (VectorDocument instances)
701+
for doc in results:
702+
print(f"ID: {doc.id}")
703+
print(f"Score: {getattr(doc, 'score', 'N/A')}")
704+
print(f"Text: {doc.text}")
705+
print(f"Metadata: {doc.metadata}")
569706
```
570707

571708
### Summary Table
572709

573710
| Level | Format | Key Fields | Notes |
574711
|-------|--------|-----------|-------|
575-
| **User** | `Document` object | `id`, `text`, `metadata` | Pydantic validation, auto-generated ID |
712+
| **User** | `VectorDocument` object | `id`, `text`, `vector`, `metadata` | Pydantic validation, auto-generated ID |
576713
| **Engine** | Python dict | `_id`, `vector`, `text`, metadata fields | Standardized across all DBs |
577714
| **PGVector** | SQL row | `doc_id`, `vector`, `text`, `metadata` (JSONB) | Text in separate column |
578715
| **Milvus** | JSON document | `doc_id`, `vector`, `text`, `metadata` (JSON) | Text in VARCHAR field |

0 commit comments

Comments
 (0)