-
Notifications
You must be signed in to change notification settings - Fork 33
Expand file tree
/
Copy pathembeddings.py
More file actions
80 lines (63 loc) · 2.48 KB
/
embeddings.py
File metadata and controls
80 lines (63 loc) · 2.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import logging
from typing import List, Optional
import numpy as np
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_exponential
from coderag.config import OPENAI_API_KEY, OPENAI_EMBEDDING_MODEL
logger = logging.getLogger(__name__)

# Build the shared OpenAI client once, at import time. Any failure (a missing
# API key or an error raised while constructing the client) is logged and
# leaves `client` set to None instead of aborting the import.
client: Optional[OpenAI] = None
try:
    if OPENAI_API_KEY:
        client = OpenAI(api_key=OPENAI_API_KEY)
        logger.info(f"OpenAI client initialized with model: {OPENAI_EMBEDDING_MODEL}")
    else:
        raise ValueError("OpenAI API key not found in environment variables")
except Exception as e:
    logger.error(f"Failed to initialize OpenAI client: {e}")
    # Reset explicitly: the failure may have happened after `client` was bound.
    client = None
def _chunk_text(text: str, max_chars: int = 4000) -> List[str]:
"""Naive chunking by characters to avoid overly long inputs."""
text = text.strip()
if len(text) <= max_chars:
return [text]
return [text[i : i + max_chars] for i in range(0, len(text), max_chars)]
def _embed_batch(inputs: List[str]) -> np.ndarray:
    """Embed a batch of strings with the OpenAI embeddings API.

    The API request is attempted up to three times with exponential backoff,
    and the last error is re-raised if every attempt fails. A missing client is
    reported immediately rather than retried, because retrying cannot fix it
    and would only add backoff delay.

    Args:
        inputs: Strings to embed in a single request.

    Returns:
        A float32 array with one row per returned embedding, shape (n, d).

    Raises:
        RuntimeError: If the module-level client was not initialized.
        Exception: Whatever the API call raised on the final attempt.
    """
    if client is None:
        raise RuntimeError("OpenAI client not initialized")
    # Bind the validated client locally so the retried closure uses it.
    api_client = client

    # Only the network call is wrapped in the retry policy.
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=0.5, max=8),
        reraise=True,
    )
    def _request() -> np.ndarray:
        response = api_client.embeddings.create(
            model=OPENAI_EMBEDDING_MODEL,
            input=inputs,
            timeout=30,
        )
        return np.array([d.embedding for d in response.data], dtype="float32")

    return _request()
def generate_embeddings(text: str) -> Optional[np.ndarray]:
    """Embed *text* into a single vector using OpenAI's embedding API.

    The text is split into chunks of at most 4000 characters, the chunks are
    embedded together, and the per-chunk vectors are averaged.

    Args:
        text: The input text to embed.

    Returns:
        A float32 array reshaped to a single row, or None when the client is
        unavailable, the text is empty or whitespace-only, or embedding fails.
    """
    if not client:
        logger.error("OpenAI client not initialized")
        return None

    if not (text and text.strip()):
        logger.warning("Empty text provided for embedding generation")
        return None

    try:
        logger.debug(f"Generating embeddings for text of length: {len(text)}")
        chunk_vectors = _embed_batch(_chunk_text(text, max_chars=4000))
        # Collapse the per-chunk rows into one mean vector, kept as a row.
        mean_vector = np.mean(chunk_vectors, axis=0, dtype=np.float32)
        avg = mean_vector.reshape(1, -1)
        logger.debug(f"Successfully generated embeddings with shape: {avg.shape}")
    except Exception as e:
        logger.error(f"Failed to generate embeddings: {e}")
        return None
    return avg