Resilient-Labs · branic18 · May 11, 2026
diff --git a/.cursor/skills/component-check/SKILL.md b/.cursor/skills/component-check/SKILL.md
@@ -1,5 +1,5 @@
 ---
-name: component-check
+name: component-check2
 description: Enforces component reuse and design system compliance for React + TypeScript + Tailwind + shadcn/ui. Use when creating or modifying components, when the user asks about UI components, mobile layouts, extending a feature under components/features, or when reviewing component architecture.
 ---
 

diff --git a/.env.local.example b/.env.local.example
@@ -9,12 +9,13 @@
 #         200  [{"translation_text":"..."}]
 #   2. HF_INFERENCE_ENDPOINT_URL — chat-model inference endpoint (also
 #      used by /api/ask and /api/safety). Prompted to translate.
+#   3. HF_TOKEN only — falls back to the HF router at HF_ASK_BASE_URL with
+#      HF_ASK_MODEL (same OpenAI-compatible stack as /api/summarize).
 HF_TRANSLATE_ENDPOINT_URL=
 
-# Chat-model HF Inference Endpoint — required for `/api/ask` and
-# `/api/safety`; secondary for /api/translate. Custom-handler endpoint
-# that accepts {"inputs":{"messages":[...]}} and returns
-# {"generated_text":"..."}.
+# Chat-model HF Inference Endpoint — optional; when set it is preferred for
+# `/api/ask` and `/api/safety`, and is the second choice for /api/translate.
+# Custom handler: {"inputs":{"messages":[...]}} → {"generated_text":"..."}.
 HF_INFERENCE_ENDPOINT_URL=
 
 # Hugging Face Space — Read Aloud / TTS (Coqui).
@@ -24,12 +25,11 @@ HF_TTS_SPACE_URL=https://resilient-coders-aidoc-tts.hf.space
 # the app calls the endpoint directly instead of the Space.
 # HF_TTS_ENDPOINT_EN=https://kqb8pjk2dlp2yay8.eu-west-1.aws.endpoints.huggingface.cloud
 
-# Hugging Face Inference Providers — Ask (`/api/ask`) legacy router fallback +
-# Summarize (`/api/summarize`). `HF_TOKEN` is required for the legacy Ask path
-# AND for Summarize. Model/base overrides are optional (routes have defaults).
+# Hugging Face Inference Providers — `HF_TOKEN` powers Summarize (`/api/summarize`),
+# Ask router fallback, Translate router fallback (when no translate/inference URL),
+# and Safety router fallback (when HF_INFERENCE_ENDPOINT_URL is unset).
 # When HF_INFERENCE_ENDPOINT_URL is set, HF_ASK_MODEL / HF_ASK_BASE_URL are ignored
-# by /api/ask. HF_TOKEN is still forwarded as Bearer auth to the inference
-# endpoint when set.
+# by /api/ask unless that endpoint call fails and Ask falls back to the router (requires HF_TOKEN); they still apply to summarize / translate / safety router paths.
 HF_TOKEN=
 HF_ASK_MODEL=meta-llama/Llama-3.1-8B-Instruct:cheapest
 HF_ASK_BASE_URL=https://router.huggingface.co/v1

diff --git a/DEMO_SCRIPT.md b/DEMO_SCRIPT.md
diff --git a/WORKING_VERSION.md b/WORKING_VERSION.md
diff --git a/app/api/ask/route.ts b/app/api/ask/route.ts
@@ -180,12 +180,18 @@ export async function POST(request: Request) {
 
     const chunkCount = Array.isArray(chunks) ? chunks.length : 0;
 
-    // Resolve which upstream to use. The dedicated inference endpoint
-    // wins when configured; otherwise we fall back to the legacy router
-    // path which requires HF_TOKEN.
+    // Resolve which upstream(s) to use. Priority:
+    //   1. HF_INFERENCE_ENDPOINT_URL (preferred when configured).
+    //   2. HF_TOKEN-backed router via @ai-sdk/openai (`getAskLanguageModel`).
+    //
+    // The router path is also resolved up-front when HF_TOKEN is set so
+    // that a transient failure of the dedicated endpoint (cold-start
+    // timeout past the LB's 60s budget, 5xx, network error) can fall
+    // back to the working router path instead of breaking the user's
+    // chat session.
     const inferenceEndpointUrl =
       process.env.HF_INFERENCE_ENDPOINT_URL?.trim() || undefined;
-    const model = inferenceEndpointUrl ? null : getAskLanguageModel();
+    const model = getAskLanguageModel();
     if (!inferenceEndpointUrl && !model) {
       const durationMs = Date.now() - started;
       /* eslint-disable no-console */
@@ -273,7 +279,7 @@ export async function POST(request: Request) {
     // here too so observability shape stays identical to the streamText path.
     if (inferenceEndpointUrl) {
       const inferenceModelId = "hf-inference-endpoint";
-      let answerText: string;
+      let answerText: string | null = null;
       try {
         answerText = await callHfInferenceAskProvider({
           url: inferenceEndpointUrl,
@@ -283,90 +289,107 @@ export async function POST(request: Request) {
           timeoutMs: HF_INFERENCE_ASK_TIMEOUT_MS,
         });
       } catch (err) {
-        /* eslint-disable no-console */
         const message = err instanceof Error ? err.message : String(err);
         const durationMs = Date.now() - started;
-        console.error(
-          "[ask] inference error:",
+        if (!model) {
+          /* eslint-disable no-console */
+          console.error(
+            "[ask] inference error:",
+            message,
+            "— latency:",
+            durationMs,
+            "ms",
+            "— provider:",
+            inferenceModelId,
+          );
+          /* eslint-enable no-console */
+          return NextResponse.json(
+            { error: "Question answering failed" },
+            { status: 502 },
+          );
+        }
+        /* eslint-disable no-console -- one-line ops note when we fall back */
+        console.warn(
+          "[ask] HF Inference Endpoint failed, falling back to HF router:",
           message,
           "— latency:",
           durationMs,
           "ms",
-          "— provider:",
-          inferenceModelId,
         );
         /* eslint-enable no-console */
-        return NextResponse.json(
-          { error: "Question answering failed" },
-          { status: 502 },
-        );
       }
 
-      // Fire-and-forget observability — same shape as the streamText path.
-      void postAskTurnToLangSmith({
-        questionHash,
-        questionLen,
-        chunkCount,
-        answerLanguage: answerLanguage ?? "unset",
-        contextTextLen: safeContext.length,
-        modelId: inferenceModelId,
-        finishReason: "stop",
-        totalUsage: undefined,
-        answerText,
-        confidenceBandsVersion: ASK_CONFIDENCE_BANDS_VERSION,
-      }).catch((langsmithErr) => {
-        /* eslint-disable no-console */
-        console.error(
-          "[ask] LangSmith export error:",
-          langsmithErr instanceof Error ? langsmithErr.message : String(langsmithErr),
-        );
-        /* eslint-enable no-console */
-      });
-      evaluateAsync({
-        input: safeQuestion,
-        output: answerText,
-        model: inferenceModelId,
-        feature: "ask",
-        metadata: {
+      // When the dedicated endpoint succeeded, return its answer wrapped
+      // in a UI message stream so AskTab keeps consuming the same protocol
+      // as the streamText branch below. When it threw and a router model
+      // is available, fall through to the streamText branch instead.
+      if (answerText !== null) {
+        const finalAnswerText = answerText;
+        void postAskTurnToLangSmith({
           questionHash,
+          questionLen,
           chunkCount,
           answerLanguage: answerLanguage ?? "unset",
-          confidenceBandsVersion: ASK_CONFIDENCE_BANDS_VERSION,
+          contextTextLen: safeContext.length,
+          modelId: inferenceModelId,
           finishReason: "stop",
-          provider: inferenceModelId,
-        },
-      });
+          totalUsage: undefined,
+          answerText: finalAnswerText,
+          confidenceBandsVersion: ASK_CONFIDENCE_BANDS_VERSION,
+        }).catch((langsmithErr) => {
+          /* eslint-disable no-console */
+          console.error(
+            "[ask] LangSmith export error:",
+            langsmithErr instanceof Error ? langsmithErr.message : String(langsmithErr),
+          );
+          /* eslint-enable no-console */
+        });
+        evaluateAsync({
+          input: safeQuestion,
+          output: finalAnswerText,
+          model: inferenceModelId,
+          feature: "ask",
+          metadata: {
+            questionHash,
+            chunkCount,
+            answerLanguage: answerLanguage ?? "unset",
+            confidenceBandsVersion: ASK_CONFIDENCE_BANDS_VERSION,
+            finishReason: "stop",
+            provider: inferenceModelId,
+          },
+        });
 
-      const durationMs = Date.now() - started;
-      const successLogPayload = { questionHash, questionLen };
-      assertAskLogHasNoRawTextPayload(
-        "inference_endpoint_complete",
-        successLogPayload,
-      );
-      /* eslint-disable no-console */
-      console.log(
-        "[ask] inference endpoint complete — latency:",
-        durationMs,
-        "ms",
-        successLogPayload,
-      );
-      /* eslint-enable no-console */
+        const durationMs = Date.now() - started;
+        const successLogPayload = { questionHash, questionLen };
+        assertAskLogHasNoRawTextPayload(
+          "inference_endpoint_complete",
+          successLogPayload,
+        );
+        /* eslint-disable no-console */
+        console.log(
+          "[ask] inference endpoint complete — latency:",
+          durationMs,
+          "ms",
+          successLogPayload,
+        );
+        /* eslint-enable no-console */
 
-      // Emit a UI message stream with one text-delta carrying the whole
-      // answer. AskTab streams it through `readUIMessageStream` exactly
-      // like the streamText path, so the chat bubble renders the same way.
-      const textId = randomUUID();
-      const stream = createUIMessageStream({
-        execute: ({ writer }) => {
-          writer.write({ type: "text-start", id: textId });
-          writer.write({ type: "text-delta", id: textId, delta: answerText });
-          writer.write({ type: "text-end", id: textId });
-        },
-      });
-      return createUIMessageStreamResponse({ stream });
+        const textId = randomUUID();
+        const stream = createUIMessageStream({
+          execute: ({ writer }) => {
+            writer.write({ type: "text-start", id: textId });
+            writer.write({ type: "text-delta", id: textId, delta: finalAnswerText });
+            writer.write({ type: "text-end", id: textId });
+          },
+        });
+        return createUIMessageStreamResponse({ stream });
+      }
     }
 
     // ── Legacy streaming branch (HF Inference Providers router) ──────────────
+    // Reached when (a) `HF_INFERENCE_ENDPOINT_URL` is unset and `model`
+    // is the only available provider, or (b) the dedicated endpoint
+    // threw above and we are falling back to the router.
     // [Karlee] — V1 uses baseline Llama 3.1 8B + RAG + prompts (no fine-tuning per team decision, Apr 2026).
     // HF/LLM errors often surface when the client consumes the stream (after this handler returns),
     // not here — so logs below mean "stream object ready", not "model finished successfully".