Python Inference Refactor and Frontend API Layer (#8)

Eamon2009 · web-flow · commit e3b48203186d · 2026-05-01T10:37:06.000+05:30
# Description

This PR synchronizes the model interaction logic across both the Python backend utilities and the web frontend. It establishes a consistent way to interface with the model weights and the C++ engine.

## Python Backend (inference.py)

- Goal: Refactor the standalone inference script to support modern weight loading.
- Weight Mapping: Updated to load and map .pt files directly using the refactored architecture.
- Chat Mode: Implemented a robust interactive loop for rapid model testing and verification.

## Frontend Layer (frontend/src/api)

- Goal: Establish the bridge between the UI and the Quadtrix engine.
- Service Definition: Created the base API client to handle requests to the C++ backend.
- Dual-Path Logic: Added handlers for both Training control and Inference/Chat endpoints.
- Stream Support: Prepared the API layer to handle "generation" data chunks for real-time UI updates.

## Other merged PRs

#7, #6, #5, #4, #3
diff --git a/engine/inference.py b/engine/inference.py
@@ -0,0 +1,286 @@
+import argparse
+from pathlib import Path
+import time
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+import tiktoken
+
+
+# Console layout: W is the width in characters of the banner and the rules
+# built from it; ARROW separates the speaker label from the text in chat mode.
+W = 78
+DOUBLE = "=" * W
+SINGLE = "-" * W
+ARROW = "->"
+
+# Model hyperparameters, read as globals by the module classes below when
+# they are constructed.
+# NOTE(review): presumably these must equal the values used when the
+# checkpoint was trained, otherwise load_state_dict will reject it — confirm
+# against engine/main.py.
+block_size = 32
+n_embd = 64
+n_head = 4
+n_layer = 4
+dropout = 0.1
+
+# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def header(title, subtitle=""):
+    print(f"\n{DOUBLE}")
+    print(f"  {title}")
+    if subtitle:
+        print(f"  {subtitle}")
+    print(DOUBLE)
+
+
+def row(label, value="", unit="", note=""):
+    label_col = f"  {label:<28}"
+    value_col = f"{str(value):<20}"
+    unit_col = f"{unit:<8}"
+    note_col = f"  {note}" if note else ""
+    print(f"{label_col}{value_col}{unit_col}{note_col}")
+
+
+def rule():
+    print(f"  {SINGLE}")
+
+
+def blank():
+    print()
+
+
+def get_tokenizer(encoding_name="gpt2"):
+    tokenizer = tiktoken.get_encoding(encoding_name)
+    return tokenizer, tokenizer.n_vocab
+
+
+def encode(text, tokenizer):
+    return tokenizer.encode(text)
+
+
+def decode(tokens, tokenizer):
+    return tokenizer.decode(tokens)
+
+
+# Module-level tokenizer and vocabulary size, created at import time.
+# GPTLanguageModel sizes its token embedding and output layer from vocab_size,
+# and generate_response encodes/decodes with this tokenizer.
+tokenizer, vocab_size = get_tokenizer("gpt2")
+
+
+class Head(nn.Module):
+    def __init__(self, head_size):
+        super().__init__()
+        self.key = nn.Linear(n_embd, head_size, bias=False)
+        self.query = nn.Linear(n_embd, head_size, bias=False)
+        self.value = nn.Linear(n_embd, head_size, bias=False)
+        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        _, T, _ = x.shape
+        k = self.key(x)
+        q = self.query(x)
+        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
+        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
+        wei = F.softmax(wei, dim=-1)
+        wei = self.dropout(wei)
+        return wei @ self.value(x)
+
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, num_heads, head_size):
+        super().__init__()
+        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
+        self.proj = nn.Linear(head_size * num_heads, n_embd)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        out = torch.cat([h(x) for h in self.heads], dim=-1)
+        return self.dropout(self.proj(out))
+
+
+class FeedForward(nn.Module):
+    def __init__(self, n_embd):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(n_embd, 4 * n_embd),
+            nn.ReLU(),
+            nn.Linear(4 * n_embd, n_embd),
+            nn.Dropout(dropout),
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class Block(nn.Module):
+    def __init__(self, n_embd, n_head):
+        super().__init__()
+        head_size = n_embd // n_head
+        self.sa = MultiHeadAttention(n_head, head_size)
+        self.ffwd = FeedForward(n_embd)
+        self.ln1 = nn.LayerNorm(n_embd)
+        self.ln2 = nn.LayerNorm(n_embd)
+
+    def forward(self, x):
+        x = x + self.sa(self.ln1(x))
+        x = x + self.ffwd(self.ln2(x))
+        return x
+
+
+class GPTLanguageModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+        self.position_embedding_table = nn.Embedding(block_size, n_embd)
+        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
+        self.ln_f = nn.LayerNorm(n_embd)
+        self.lm_head = nn.Linear(n_embd, vocab_size)
+
+    def forward(self, idx, targets=None):
+        B, T = idx.shape
+        tok_emb = self.token_embedding_table(idx)
+        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
+        x = tok_emb + pos_emb
+        x = self.blocks(x)
+        x = self.ln_f(x)
+        logits = self.lm_head(x)
+
+        if targets is None:
+            loss = None
+        else:
+            B, T, C = logits.shape
+            logits = logits.view(B * T, C)
+            targets = targets.view(B * T)
+            loss = F.cross_entropy(logits, targets)
+        return logits, loss
+
+    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+        for _ in range(max_new_tokens):
+            idx_cond = idx[:, -block_size:]
+            logits, _ = self(idx_cond)
+            logits = logits[:, -1, :] / max(temperature, 1e-6)
+
+            if top_k is not None:
+                values, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < values[:, [-1]]] = float("-inf")
+
+            probs = F.softmax(logits, dim=-1)
+            idx_next = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat((idx, idx_next), dim=1)
+        return idx
+
+
+def default_checkpoint_path():
+    script_dir = Path(__file__).resolve().parent
+    candidates = [
+        script_dir / "best_model.pt",
+        Path.cwd() / "best_model.pt",
+        Path.cwd() / "engine" / "best_model.pt",
+    ]
+    for candidate in candidates:
+        if candidate.exists():
+            return candidate
+    return script_dir / "best_model.pt"
+
+
+def load_model(checkpoint_path):
+    checkpoint_path = Path(checkpoint_path)
+    if not checkpoint_path.exists():
+        raise FileNotFoundError(
+            f"Checkpoint not found: {checkpoint_path}\n"
+            "Train first with engine/main.py, or pass --checkpoint path/to/best_model.pt"
+        )
+
+    model = GPTLanguageModel().to(device)
+    state_dict = torch.load(checkpoint_path, map_location=device)
+    model.load_state_dict(state_dict)
+    model.eval()
+    return model
+
+
+def generate_response(model, prompt, max_new_tokens, temperature, top_k):
+    encoded_prompt = encode(prompt, tokenizer)
+    context = torch.tensor([encoded_prompt], dtype=torch.long, device=device)
+
+    with torch.no_grad():
+        output_ids = model.generate(
+            context,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+        )
+
+    new_tokens = output_ids[0][len(encoded_prompt):].tolist()
+    return decode(new_tokens, tokenizer).strip()
+
+
+def chat(model, args):
+    header("INFERENCE", "quit / exit / q -> end session")
+    blank()
+
+    while True:
+        prompt = input(f"  user  {ARROW} ").strip()
+        if prompt.lower() in ("quit", "exit", "q"):
+            blank()
+            print("  Session ended.")
+            break
+        if not prompt:
+            continue
+
+        response = generate_response(
+            model,
+            prompt,
+            args.max_new_tokens,
+            args.temperature,
+            args.top_k,
+        )
+        blank()
+        print(f"  Model {ARROW} {response}")
+        blank()
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Run inference from an engine trained .pt checkpoint.")
+    parser.add_argument(
+        "--checkpoint",
+        type=Path,
+        default=default_checkpoint_path(),
+        help="Path to the .pt file generated by engine/main.py.",
+    )
+    parser.add_argument("--prompt", type=str, default=None, help="Generate once from this prompt.")
+    parser.add_argument("--max-new-tokens", type=int, default=200)
+    parser.add_argument("--temperature", type=float, default=1.0)
+    parser.add_argument("--top-k", type=int, default=None)
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    start = time.time()
+
+    print(f"{'Quadtrix-v1.0':^{W}}")
+    blank()
+    row("Started", time.strftime("%Y-%m-%d  %H:%M:%S"))
+    row("Device", str(device))
+    row("PyTorch", torch.__version__)
+    row("Checkpoint", args.checkpoint)
+    rule()
+
+    model = load_model(args.checkpoint)
+
+    if args.prompt:
+        response = generate_response(
+            model,
+            args.prompt,
+            args.max_new_tokens,
+            args.temperature,
+            args.top_k,
+        )
+        blank()
+        print(response)
+    else:
+        chat(model, args)
+
+    blank()
+    row("Total", f"{time.time() - start:.2f}s")
+    print(DOUBLE)
+
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/frontend/src/api/chat.ts b/frontend/src/api/chat.ts
@@ -0,0 +1,24 @@
+import { useMutation, useQueryClient } from "@tanstack/react-query";
+
+import { apiFetch } from "./client";
+import type { ChatRequest, ChatResponse } from "../types";
+
+/**
+ * Mutation hook that POSTs a chat request to `/api/chat`.
+ *
+ * On success it invalidates the `["sessions"]` query and the
+ * `["messages", session_id]` query for the session named in the response.
+ * Failures reject with whatever `apiFetch` throws.
+ */
+export function useSendMessage() {
+  const queryClient = useQueryClient();
+  return useMutation({
+    // No try/catch here: a catch that only rethrows adds nothing, and
+    // apiFetch already normalises failures.
+    mutationFn: (payload: ChatRequest): Promise<ChatResponse> =>
+      apiFetch<ChatResponse>("/api/chat", {
+        method: "POST",
+        body: JSON.stringify(payload),
+      }),
+    onSuccess: async (response) => {
+      // The two invalidations are independent, so run them concurrently.
+      await Promise.all([
+        queryClient.invalidateQueries({ queryKey: ["sessions"] }),
+        queryClient.invalidateQueries({ queryKey: ["messages", response.session_id] }),
+      ]);
+    },
+  });
+}
diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts
@@ -0,0 +1,37 @@
+import { useSettingsStore } from "../store/settingsStore";
+import type { ApiError } from "../types";
+
+/**
+ * Error thrown by the API client. `message` mirrors `apiError.message`; the
+ * full payload is kept on `apiError`.
+ */
+export class ApiClientError extends Error {
+  apiError: ApiError;
+
+  constructor(apiError: ApiError) {
+    super(apiError.message);
+    // When compiled to ES5, subclasses of Error lose their prototype chain,
+    // which breaks `instanceof ApiClientError`. Restoring it is a no-op on
+    // ES2015+ targets.
+    Object.setPrototypeOf(this, new.target.prototype);
+    // Make logs and stack traces show the subclass name instead of "Error".
+    this.name = "ApiClientError";
+    this.apiError = apiError;
+  }
+}
+
+/**
+ * Fetch `path` relative to the configured API base URL and parse the JSON body.
+ *
+ * - A trailing slash on the base URL is removed before `path` is appended.
+ * - `Content-Type: application/json` is applied unless the caller supplied
+ *   their own Content-Type in `init.headers`.
+ * - A 204 response resolves to `undefined`.
+ *
+ * @throws ApiClientError with the server's JSON error body for non-2xx
+ *   responses (or a `request_failed` fallback when that body is not JSON),
+ *   `network_error` when the request itself fails, and `invalid_response`
+ *   when a successful response body is not valid JSON.
+ */
+export async function apiFetch<T>(path: string, init?: RequestInit): Promise<T> {
+  const baseUrl = useSettingsStore.getState().apiBaseUrl.replace(/\/$/, "");
+  const headers = new Headers(init?.headers);
+  // Default to JSON, but do not clobber a Content-Type the caller chose.
+  if (!headers.has("Content-Type")) {
+    headers.set("Content-Type", "application/json");
+  }
+
+  // Only the fetch call itself can fail with a transport error, so only it
+  // is mapped to `network_error`.
+  let response: Response;
+  try {
+    response = await fetch(`${baseUrl}${path}`, {
+      ...init,
+      headers,
+    });
+  } catch {
+    throw new ApiClientError({ error: "network_error", message: "Could not reach the API server", code: 0 });
+  }
+
+  if (!response.ok) {
+    const fallback: ApiError = { error: "request_failed", message: response.statusText, code: response.status };
+    const error = (await response.json().catch(() => fallback)) as ApiError;
+    throw new ApiClientError(error);
+  }
+  if (response.status === 204) {
+    return undefined as T;
+  }
+
+  try {
+    return (await response.json()) as T;
+  } catch {
+    // The server was reached and answered 2xx; the body just is not JSON.
+    // Reporting this as "could not reach the server" would be misleading.
+    throw new ApiClientError({
+      error: "invalid_response",
+      message: "The API server returned a response that is not valid JSON",
+      code: response.status,
+    });
+  }
+}
diff --git a/frontend/src/api/health.ts b/frontend/src/api/health.ts
@@ -0,0 +1,34 @@
+import { useQuery } from "@tanstack/react-query";
+
+import { apiFetch } from "./client";
+import type { HealthResponse, ModelStats } from "../types";
+
+/**
+ * Query hook for `/api/health`, cached under `["health"]`.
+ * Refetches every 30 000 ms and retries a failed request once.
+ */
+export function useHealth() {
+  return useQuery({
+    queryKey: ["health"],
+    // A catch that only rethrows adds nothing; let apiFetch's rejection through.
+    queryFn: (): Promise<HealthResponse> => apiFetch<HealthResponse>("/api/health"),
+    refetchInterval: 30000,
+    retry: 1,
+  });
+}
+
+/**
+ * Query hook for `/api/stats`, cached under `["stats"]`.
+ * Refetches every 30 000 ms and retries a failed request once.
+ */
+export function useStats() {
+  return useQuery({
+    queryKey: ["stats"],
+    // A catch that only rethrows adds nothing; let apiFetch's rejection through.
+    queryFn: (): Promise<ModelStats> => apiFetch<ModelStats>("/api/stats"),
+    refetchInterval: 30000,
+    retry: 1,
+  });
+}
diff --git a/frontend/src/api/sessions.ts b/frontend/src/api/sessions.ts