Skip to content

Commit bf4e059

Browse files
committed
fix: improve log visibility when image_bytes discarded on caption failure (#590)
1 parent ffc80e2 commit bf4e059

1 file changed

Lines changed: 19 additions & 6 deletions

File tree

backend/app/rag/vision.py

Lines changed: 19 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -182,8 +182,13 @@ def _openai_caption(image_bytes: bytes) -> str:
182182
)
183183
return response.choices[0].message.content.strip()
184184

185-
except Exception as exc:
186-
logger.debug("OpenAI vision caption failed: %s", exc)
185+
except Exception as exc:
186+
logger.warning(
187+
"OpenAI vision caption failed — falling back to OCR/placeholder. "
188+
"This may be a transient API error (rate-limit, timeout). Error: %s",
189+
exc,
190+
exc_info=True,
191+
)
187192
return ""
188193

189194

@@ -227,7 +232,6 @@ def caption_image(
227232
dims = "unknown size"
228233

229234
return f"Figure on page {page} ({dims})." if page else f"Figure ({dims})."
230-
231235
# Placeholder for provider-based captioning (e.g., OpenAI / LLaVA hooks)
232236
provider = getattr(settings, "VISION_PROVIDER", None)
233237

@@ -273,11 +277,20 @@ def generate_captions_for_chunks(chunks: List[Dict[str, Any]]) -> None:
273277
chunk["is_image"] = True
274278
chunk["image_caption"] = caption
275279
except Exception as exc:
276-
logger.debug("Failed to caption image chunk: %s", exc)
280+
page = chunk.get("page", "?")
281+
logger.warning(
282+
"Caption generation failed for image on page %s — image_bytes will be "
283+
"permanently discarded. This may indicate a transient network error "
284+
"(e.g. API rate-limit or timeout). If this repeats, check your VLM "
285+
"provider configuration. Error: %s",
286+
page,
287+
exc,
288+
exc_info=True,
289+
)
277290
chunk["is_image"] = True
278-
fallback = f"Image on page {chunk.get('page', '?')}"
291+
fallback = f"Image on page {page}"
279292
chunk.setdefault("text", fallback)
280293
chunk["image_caption"] = chunk["text"]
281294
finally:
282295
# Always strip raw bytes — never serialise them into ChromaDB
283-
chunk.pop("image_bytes", None)
296+
chunk.pop("image_bytes", None)

0 commit comments

Comments
 (0)