Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .cursor/skills/component-check/SKILL.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
name: component-check
name: component-check2
description: Enforces component reuse and design system compliance for React + TypeScript + Tailwind + shadcn/ui. Use when creating or modifying components, when the user asks about UI components, mobile layouts, extending a feature under components/features, or when reviewing component architecture.
---

Expand Down
18 changes: 9 additions & 9 deletions .env.local.example
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@
# 200 [{"translation_text":"..."}]
# 2. HF_INFERENCE_ENDPOINT_URL — chat-model inference endpoint (also
# used by /api/ask and /api/safety). Prompted to translate.
# 3. HF_TOKEN only — falls back to the HF router at HF_ASK_BASE_URL with
# HF_ASK_MODEL (same OpenAI-compatible stack as /api/summarize).
HF_TRANSLATE_ENDPOINT_URL=

# Chat-model HF Inference Endpoint — required for `/api/ask` and
# `/api/safety`; secondary for /api/translate. Custom-handler endpoint
# that accepts {"inputs":{"messages":[...]}} and returns
# {"generated_text":"..."}.
# Chat-model HF Inference Endpoint — optional; when set it is preferred for
# `/api/ask` and `/api/safety`, and is the second choice for /api/translate.
# Custom handler: {"inputs":{"messages":[...]}} → {"generated_text":"..."}.
HF_INFERENCE_ENDPOINT_URL=

# Hugging Face Space — Read Aloud / TTS (Coqui).
Expand All @@ -24,12 +25,11 @@ HF_TTS_SPACE_URL=https://resilient-coders-aidoc-tts.hf.space
# the app calls the endpoint directly instead of the Space.
# HF_TTS_ENDPOINT_EN=https://kqb8pjk2dlp2yay8.eu-west-1.aws.endpoints.huggingface.cloud

# Hugging Face Inference Providers — Ask (`/api/ask`) legacy router fallback +
# Summarize (`/api/summarize`). `HF_TOKEN` is required for the legacy Ask path
# AND for Summarize. Model/base overrides are optional (routes have defaults).
# Hugging Face Inference Providers — `HF_TOKEN` powers Summarize (`/api/summarize`),
# Ask router fallback, Translate router fallback (when no translate/inference URL),
# and Safety router fallback (when HF_INFERENCE_ENDPOINT_URL is unset).
# When HF_INFERENCE_ENDPOINT_URL is set, HF_ASK_MODEL / HF_ASK_BASE_URL are ignored
# by /api/ask. HF_TOKEN is still forwarded as Bearer auth to the inference
# endpoint when set.
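# by /api/ask on its primary path only. If the endpoint call fails and HF_TOKEN
# is set, /api/ask falls back to the HF router, which presumably reads them
# again (confirm against getAskLanguageModel). They still apply to the
# summarize / translate / safety router paths.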
HF_TOKEN=
HF_ASK_MODEL=meta-llama/Llama-3.1-8B-Instruct:cheapest
HF_ASK_BASE_URL=https://router.huggingface.co/v1
Expand Down
124 changes: 124 additions & 0 deletions DEMO_SCRIPT.md

Large diffs are not rendered by default.

632 changes: 632 additions & 0 deletions WORKING_VERSION.md

Large diffs are not rendered by default.

163 changes: 93 additions & 70 deletions app/api/ask/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -180,12 +180,18 @@ export async function POST(request: Request) {

const chunkCount = Array.isArray(chunks) ? chunks.length : 0;

// Resolve which upstream to use. The dedicated inference endpoint
// wins when configured; otherwise we fall back to the legacy router
// path which requires HF_TOKEN.
// Resolve which upstream(s) to use. Priority:
// 1. HF_INFERENCE_ENDPOINT_URL (preferred when configured).
// 2. HF_TOKEN-backed router via @ai-sdk/openai (`getAskLanguageModel`).
//
// The router path is also resolved up-front when HF_TOKEN is set so
// that a transient failure of the dedicated endpoint (cold-start
// timeout past the load balancer's 60s budget, 5xx, network error) can fall
// back to the working router path instead of breaking the user's
// chat session.
const inferenceEndpointUrl =
process.env.HF_INFERENCE_ENDPOINT_URL?.trim() || undefined;
const model = inferenceEndpointUrl ? null : getAskLanguageModel();
const model = getAskLanguageModel();
if (!inferenceEndpointUrl && !model) {
const durationMs = Date.now() - started;
/* eslint-disable no-console */
Expand Down Expand Up @@ -273,7 +279,7 @@ export async function POST(request: Request) {
// here too so observability shape stays identical to the streamText path.
if (inferenceEndpointUrl) {
const inferenceModelId = "hf-inference-endpoint";
let answerText: string;
let answerText: string | null = null;
try {
answerText = await callHfInferenceAskProvider({
url: inferenceEndpointUrl,
Expand All @@ -283,90 +289,107 @@ export async function POST(request: Request) {
timeoutMs: HF_INFERENCE_ASK_TIMEOUT_MS,
});
} catch (err) {
/* eslint-disable no-console */
const message = err instanceof Error ? err.message : String(err);
const durationMs = Date.now() - started;
console.error(
"[ask] inference error:",
if (!model) {
/* eslint-disable no-console */
console.error(
"[ask] inference error:",
message,
"— latency:",
durationMs,
"ms",
"— provider:",
inferenceModelId,
);
/* eslint-enable no-console */
return NextResponse.json(
{ error: "Question answering failed" },
{ status: 502 },
);
}
/* eslint-disable no-console -- one-line ops note when we fall back */
console.warn(
"[ask] HF Inference Endpoint failed, falling back to HF router:",
message,
"— latency:",
durationMs,
"ms",
"— provider:",
inferenceModelId,
);
/* eslint-enable no-console */
return NextResponse.json(
{ error: "Question answering failed" },
{ status: 502 },
);
}

// Fire-and-forget observability — same shape as the streamText path.
void postAskTurnToLangSmith({
questionHash,
questionLen,
chunkCount,
answerLanguage: answerLanguage ?? "unset",
contextTextLen: safeContext.length,
modelId: inferenceModelId,
finishReason: "stop",
totalUsage: undefined,
answerText,
confidenceBandsVersion: ASK_CONFIDENCE_BANDS_VERSION,
}).catch((langsmithErr) => {
/* eslint-disable no-console */
console.error(
"[ask] LangSmith export error:",
langsmithErr instanceof Error ? langsmithErr.message : String(langsmithErr),
);
/* eslint-enable no-console */
});
evaluateAsync({
input: safeQuestion,
output: answerText,
model: inferenceModelId,
feature: "ask",
metadata: {
// When the dedicated endpoint succeeded, return its answer wrapped
// in a UI message stream so AskTab keeps consuming the same protocol
// as the streamText branch below. When it threw and a router model
// is available, fall through to the streamText branch instead.
if (answerText !== null) {
const finalAnswerText = answerText;
void postAskTurnToLangSmith({
questionHash,
questionLen,
chunkCount,
answerLanguage: answerLanguage ?? "unset",
confidenceBandsVersion: ASK_CONFIDENCE_BANDS_VERSION,
contextTextLen: safeContext.length,
modelId: inferenceModelId,
finishReason: "stop",
provider: inferenceModelId,
},
});
totalUsage: undefined,
answerText: finalAnswerText,
confidenceBandsVersion: ASK_CONFIDENCE_BANDS_VERSION,
}).catch((langsmithErr) => {
/* eslint-disable no-console */
console.error(
"[ask] LangSmith export error:",
langsmithErr instanceof Error ? langsmithErr.message : String(langsmithErr),
);
/* eslint-enable no-console */
});
evaluateAsync({
input: safeQuestion,
output: finalAnswerText,
model: inferenceModelId,
feature: "ask",
metadata: {
questionHash,
chunkCount,
answerLanguage: answerLanguage ?? "unset",
confidenceBandsVersion: ASK_CONFIDENCE_BANDS_VERSION,
finishReason: "stop",
provider: inferenceModelId,
},
});

const durationMs = Date.now() - started;
const successLogPayload = { questionHash, questionLen };
assertAskLogHasNoRawTextPayload(
"inference_endpoint_complete",
successLogPayload,
);
/* eslint-disable no-console */
console.log(
"[ask] inference endpoint complete — latency:",
durationMs,
"ms",
successLogPayload,
);
/* eslint-enable no-console */
const durationMs = Date.now() - started;
const successLogPayload = { questionHash, questionLen };
assertAskLogHasNoRawTextPayload(
"inference_endpoint_complete",
successLogPayload,
);
/* eslint-disable no-console */
console.log(
"[ask] inference endpoint complete — latency:",
durationMs,
"ms",
successLogPayload,
);
/* eslint-enable no-console */

// Emit a UI message stream with one text-delta carrying the whole
// answer. AskTab streams it through `readUIMessageStream` exactly
// like the streamText path, so the chat bubble renders the same way.
const textId = randomUUID();
const stream = createUIMessageStream({
execute: ({ writer }) => {
writer.write({ type: "text-start", id: textId });
writer.write({ type: "text-delta", id: textId, delta: answerText });
writer.write({ type: "text-end", id: textId });
},
});
return createUIMessageStreamResponse({ stream });
const textId = randomUUID();
const stream = createUIMessageStream({
execute: ({ writer }) => {
writer.write({ type: "text-start", id: textId });
writer.write({ type: "text-delta", id: textId, delta: finalAnswerText });
writer.write({ type: "text-end", id: textId });
},
});
return createUIMessageStreamResponse({ stream });
}
}

// ── Legacy streaming branch (HF Inference Providers router) ──────────────
// Reached when (a) `HF_INFERENCE_ENDPOINT_URL` is unset and `model`
// is the only available provider, or (b) the dedicated endpoint
// threw above and we are falling back to the router.
// [Karlee] — V1 uses baseline Llama 3.1 8B + RAG + prompts (no fine-tuning per team decision, Apr 2026).
// HF/LLM errors often surface when the client consumes the stream (after this handler returns),
// not here — so logs below mean "stream object ready", not "model finished successfully".
Expand Down
Loading
Loading