|
12 | 12 |
|
13 | 13 | from .config import Settings, get_settings |
14 | 14 | from .ollama_client import OllamaClient, OllamaError |
| 15 | +from .runner import ( |
| 16 | + DEFAULT_TIMEOUT_SEC, |
| 17 | + MAX_CODE_BYTES, |
| 18 | + RunnerError, |
| 19 | + run_python, |
| 20 | +) |
15 | 21 | from .schemas import ( |
16 | 22 | ChatMessage, |
17 | 23 | ChatRequest, |
18 | 24 | ChatResponse, |
19 | 25 | ConfigResponse, |
| 26 | + EvaluateRequest, |
| 27 | + EvaluateResponse, |
20 | 28 | HealthResponse, |
| 29 | + RunRequest, |
| 30 | + RunResponse, |
21 | 31 | ) |
22 | 32 |
|
23 | 33 |
|
@@ -76,6 +86,142 @@ async def config() -> ConfigResponse: |
76 | 86 | ollama_url=settings.ollama_url, |
77 | 87 | default_model=settings.model, |
78 | 88 | request_timeout=settings.request_timeout, |
| 89 | + run_timeout_default=DEFAULT_TIMEOUT_SEC, |
| 90 | + run_max_code_bytes=MAX_CODE_BYTES, |
| 91 | + ) |
| 92 | + |
def _result_to_response(result) -> RunResponse:
    """Copy the runner result's fields into a ``RunResponse``.

    Args:
        result: Object returned by ``run_python``; it must expose the
            attributes listed in ``copied`` below.

    Returns:
        A ``RunResponse`` carrying the same six values.
    """
    copied = (
        "stdout",
        "stderr",
        "exit_code",
        "duration_ms",
        "timed_out",
        "truncated",
    )
    return RunResponse(**{name: getattr(result, name) for name in copied})
| 102 | + |
@app.post("/api/run", response_model=RunResponse)
async def run(req: RunRequest) -> RunResponse:
    """Execute the submitted code and return what the runner reported.

    Args:
        req: Request carrying ``code``, ``stdin`` and ``timeout``, which are
            forwarded to ``run_python``.

    Returns:
        The runner result converted by ``_result_to_response``.

    Raises:
        HTTPException: Status 400 when ``run_python`` raises ``RunnerError``;
            the detail is the error's message.
    """
    try:
        outcome = await run_python(
            req.code,
            stdin=req.stdin,
            timeout=req.timeout,
        )
    except RunnerError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    else:
        return _result_to_response(outcome)
| 112 | + |
| 113 | + def _build_evaluation_prompt( |
| 114 | + code: str, |
| 115 | + run_resp: RunResponse, |
| 116 | + section: str | None, |
| 117 | + question: str | None, |
| 118 | + ) -> str: |
| 119 | + # Build a compact, factual evidence packet. The LLM is told to act |
| 120 | + # on these facts and not to invent runtime behaviour. |
| 121 | + lines: list[str] = [] |
| 122 | + lines.append( |
| 123 | + "You are reviewing a student's Python attempt. Use only the runtime" |
| 124 | + " evidence below — do not claim outputs or behaviour you can't see." |
| 125 | + " Reply in three short parts:" |
| 126 | + ) |
| 127 | + lines.append(" 1. Assessment — one line: passed | needs_work | error.") |
| 128 | + lines.append( |
| 129 | + " 2. Feedback — 2-4 sentences, hint-first. If the code errored," |
| 130 | + " explain the error in beginner terms. If it ran cleanly, judge" |
| 131 | + " whether the approach is right; otherwise give a hint, not a fix." |
| 132 | + ) |
| 133 | + lines.append( |
| 134 | + " 3. Next step — one short concrete suggestion (a small change to" |
| 135 | + " try, or a follow-up exercise)." |
| 136 | + ) |
| 137 | + lines.append("") |
| 138 | + if section: |
| 139 | + lines.append(f'Section context: "{section}".') |
| 140 | + if question: |
| 141 | + lines.append(f"Student question: {question}") |
| 142 | + lines.append("") |
| 143 | + lines.append("Student code:") |
| 144 | + lines.append("```python") |
| 145 | + lines.append(code) |
| 146 | + lines.append("```") |
| 147 | + lines.append("") |
| 148 | + lines.append(f"Exit code: {run_resp.exit_code}") |
| 149 | + lines.append(f"Duration: {run_resp.duration_ms} ms") |
| 150 | + if run_resp.timed_out: |
| 151 | + lines.append("NOTE: execution hit the runner's timeout.") |
| 152 | + lines.append("Stdout:") |
| 153 | + lines.append("```") |
| 154 | + lines.append(run_resp.stdout or "(empty)") |
| 155 | + lines.append("```") |
| 156 | + lines.append("Stderr:") |
| 157 | + lines.append("```") |
| 158 | + lines.append(run_resp.stderr or "(empty)") |
| 159 | + lines.append("```") |
| 160 | + return "\n".join(lines) |
| 161 | + |
| 162 | + def _classify_assessment(text: str, run_resp: RunResponse) -> str: |
| 163 | + """Best-effort parse of the model's first line; fall back to evidence.""" |
| 164 | + first = (text or "").strip().splitlines()[0].lower() if text else "" |
| 165 | + for label in ("passed", "needs_work", "needs work", "error"): |
| 166 | + if label in first: |
| 167 | + return "needs_work" if label == "needs work" else label |
| 168 | + if run_resp.timed_out or run_resp.exit_code != 0: |
| 169 | + return "error" if run_resp.stderr else "needs_work" |
| 170 | + return "needs_work" |
| 171 | + |
| 172 | + def _extract_next_step(text: str) -> str | None: |
| 173 | + if not text: |
| 174 | + return None |
| 175 | + for line in text.splitlines(): |
| 176 | + stripped = line.strip().lstrip("-*0123456789. ").strip() |
| 177 | + low = stripped.lower() |
| 178 | + if low.startswith("next step"): |
| 179 | + # "Next step: ..." or "Next step — ..." |
| 180 | + for sep in (":", "—", "-"): |
| 181 | + if sep in stripped: |
| 182 | + return stripped.split(sep, 1)[1].strip() or None |
| 183 | + return stripped |
| 184 | + return None |
| 185 | + |
@app.post("/api/evaluate", response_model=EvaluateResponse)
async def evaluate(req: EvaluateRequest) -> EvaluateResponse:
    """Review a code attempt: gather run evidence, then ask the model.

    If the request already carries ``run_output`` it is used as the
    evidence; otherwise the code is executed via ``run_python`` first.

    Args:
        req: Evaluation request; ``code``, ``stdin``, ``run_output``,
            ``section``, ``question``, ``model`` and ``temperature`` are read.

    Returns:
        An ``EvaluateResponse`` with the parsed assessment, the model's raw
        feedback, the extracted next step, the run evidence and the model
        name.

    Raises:
        HTTPException: Status 400 when ``run_python`` raises ``RunnerError``;
            status 502 when the chat call raises ``OllamaError``.
    """
    run_resp = req.run_output
    if run_resp is None:
        # No evidence supplied by the caller: execute the code here.
        try:
            result = await run_python(req.code, stdin=req.stdin, timeout=None)
        except RunnerError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc
        run_resp = _result_to_response(result)

    user_prompt = _build_evaluation_prompt(
        req.code,
        run_resp,
        req.section,
        req.question,
    )
    # Fall back to the configured model when the request names none.
    model = req.model or settings.model
    messages = [
        ChatMessage(role="system", content=settings.system_prompt),
        ChatMessage(role="user", content=user_prompt),
    ]

    client = make_client()
    try:
        raw = await client.chat(
            model=model,
            messages=messages,
            temperature=req.temperature,
        )
    except OllamaError as exc:
        raise HTTPException(status_code=502, detail=str(exc)) from exc

    # A missing or empty message/content is treated as empty feedback.
    feedback = (raw.get("message") or {}).get("content") or ""
    return EvaluateResponse(
        assessment=_classify_assessment(feedback, run_resp),
        feedback=feedback,
        next_step=_extract_next_step(feedback),
        run=run_resp,
        model=raw.get("model", model),
    )
80 | 226 |
|
81 | 227 | @app.post("/api/chat", response_model=ChatResponse) |
|
0 commit comments