diff --git a/tests/tasks/uipath-agents/deploy_my_workspace/check_deploy_my_workspace.py b/tests/tasks/uipath-agents/deploy_my_workspace/check_deploy_my_workspace.py
new file mode 100644
index 000000000..ce6db5312
--- /dev/null
+++ b/tests/tasks/uipath-agents/deploy_my_workspace/check_deploy_my_workspace.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""Deploy-lifecycle artifact + metadata check.
+
+Asserts the artifacts `uip codedagent pack` / `deploy` produce in
+`.uipath/`, and that `pyproject.toml` carries the four fields the
+deployment guide flags as required (`name`, `version`, `description`,
+`authors`). Without `authors`, packaging fails with `Project authors
+cannot be empty`.
+
+Checks:
+  1. `deploy-smoke/pyproject.toml` has `name`, `version`,
+     `description`, and `authors`. No `[build-system]`.
+  2. `deploy-smoke/.uipath/` exists and contains a `*.nupkg` file
+     (proof that `pack` ran successfully).
+  3. `deploy-smoke/invoke-output.txt` exists, is non-empty, and
+     contains an `https://` substring — the monitoring URL that
+     `uip codedagent invoke` prints. The task YAML does not check the
+     URL separately, so this is the only guard for it.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from _shared.project_root import find_project_root  # noqa: E402
+
+ROOT = find_project_root("deploy-smoke")
+
+
+def _read_text(path: Path) -> str:
+    if not path.is_file():
+        sys.exit(f"FAIL: Missing {path}")
+    return path.read_text(encoding="utf-8")
+
+
+def check_pyproject() -> None:
+    text = _read_text(ROOT / "pyproject.toml")
+    if "[build-system]" in text:
+        sys.exit("FAIL: pyproject.toml contains a [build-system] section")
+    for needle in ("name", "version", "description", "authors"):
+        if needle not in text:
+            sys.exit(
+                f"FAIL: pyproject.toml is missing `{needle}` — "
+                "deployment guide requires all four fields."
+            )
+    print("OK: pyproject.toml has name, version, description, authors")
+
+
+def check_pack_artifacts() -> None:
+    uipath_dir = ROOT / ".uipath"
+    if not uipath_dir.is_dir():
+        sys.exit(
+            f"FAIL: {uipath_dir} does not exist — `uip codedagent pack` "
+            "did not run."
+        )
+    nupkgs = sorted(uipath_dir.glob("*.nupkg"))
+    if not nupkgs:
+        sys.exit(
+            f"FAIL: no .nupkg file in {uipath_dir} — pack did not produce "
+            "the expected package artifact."
+        )
+    print(f"OK: {uipath_dir.name}/{nupkgs[0].name} exists ({len(nupkgs)} package(s) total)")
+
+
+def check_invoke_output() -> None:
+    path = ROOT / "invoke-output.txt"
+    text = _read_text(path)
+    if not text.strip():
+        sys.exit(f"FAIL: {path.name} is empty — `uip codedagent invoke` produced no output")
+    if "https://" not in text:
+        sys.exit(f"FAIL: {path.name} does not contain a monitoring URL (no `https://` substring)")
+    print(f"OK: {path.name} captured {len(text)} bytes of invoke stdout (with monitoring URL)")
+
+
+def main() -> None:
+    if not ROOT.is_dir():
+        sys.exit(f"FAIL: project directory {ROOT} does not exist")
+    check_pyproject()
+    check_pack_artifacts()
+    check_invoke_output()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/tasks/uipath-agents/deploy_my_workspace/deploy_my_workspace.yaml b/tests/tasks/uipath-agents/deploy_my_workspace/deploy_my_workspace.yaml
new file mode 100644
index 000000000..258c94249
--- /dev/null
+++ b/tests/tasks/uipath-agents/deploy_my_workspace/deploy_my_workspace.yaml
@@ -0,0 +1,65 @@
+task_id: skill-agent-coded-deploy-my-workspace
+description: >
+  Coded-agent deploy lifecycle. Verifies the skill guides the agent
+  through `uip codedagent pack` → `uip codedagent publish
+  --my-workspace` (or the combined `uip codedagent deploy
+  --my-workspace`), then `uip codedagent invoke` to start a cloud
+  job and surface the monitoring URL. Exercises the entire production
+  packaging path the existing test suite never reaches.
+tags: [uipath-agents, e2e, coded, lifecycle:deploy, feature:deploy]
+max_iterations: 1
+
+agent:
+  type: claude-code
+  permission_mode: acceptEdits
+  allowed_tools: ["Skill", "Bash", "Read", "Write", "Edit", "Glob", "Grep"]
+  turn_timeout: 1200
+
+sandbox:
+  driver: tempdir
+  python: {}
+
+initial_prompt: |
+  Build a minimal Simple Function UiPath coded agent named
+  `deploy-smoke` whose `main(input)` returns `{"echoed":
+  input.message}`. No LLM. The point is the deployment lifecycle,
+  not the agent logic.
+
+  Take the agent through scaffold → init → run, then deploy it to
+  the user's personal workspace and invoke the published version
+  with `{"message": "deployed"}`.
+
+  After invoke succeeds, write its full stdout (which includes the
+  monitoring URL) to `deploy-smoke/invoke-output.txt` (inside the
+  scaffolded project directory, alongside `pyproject.toml`) so the
+  test harness can verify the URL was surfaced.
+
+  The test harness has UiPath auth pre-configured.
+
+  Do NOT pause between planning and implementation. Complete
+  end-to-end in a single pass.
+
+success_criteria:
+  - type: command_executed
+    description: "Agent packed and published to my-workspace (deploy or pack+publish)"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+(deploy|publish)\s+.*--my-workspace'
+    min_count: 1
+    weight: 2.5
+    pass_threshold: 1.0
+
+  - type: command_executed
+    description: "Agent invoked the published agent"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+invoke\s+main'
+    min_count: 1
+    weight: 2.5
+    pass_threshold: 1.0
+
+  - type: run_command
+    description: "Deploy artifacts present and pyproject.toml is well-formed for packaging"
+    command: "python3 $TASK_DIR/check_deploy_my_workspace.py"
+    timeout: 30
+    expected_exit_code: 0
+    weight: 4.0
+    pass_threshold: 1.0
diff --git a/tests/tasks/uipath-agents/eval_exact_match/check_eval_exact_match.py b/tests/tasks/uipath-agents/eval_exact_match/check_eval_exact_match.py
new file mode 100644
index 000000000..200bd859d
--- /dev/null
+++ b/tests/tasks/uipath-agents/eval_exact_match/check_eval_exact_match.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""Eval-lifecycle check for the deterministic ExactMatch path.
+
+Validates that the agent authored both halves of the evaluation
+harness — the evaluator config under `evaluations/evaluators/` AND
+the evaluation set under `evaluations/eval-sets/` whose `evaluatorRefs`
+match the evaluator `id` — and that `uip codedagent eval --no-report`
+produced an output file in which every test case has a run for that
+evaluator scoring exactly 1.0 (deterministic agent + deterministic
+evaluator means anything else is a bug).
+
+Checks:
+  1. `adder/evaluations/evaluators/<file>.json` has `evaluatorTypeId`
+     == "uipath-exact-match" and a non-empty `id`.
+  2. `adder/evaluations/eval-sets/<file>.json` has version "1.0",
+     `evaluatorRefs` referencing the evaluator id, at least 2 test
+     cases, and each test case's `evaluationCriterias` keys the
+     evaluator id.
+  3. `eval-results.json` exists with the documented top-level shape
+     (`evaluationSetName`, `evaluationSetResults: [...]`), every
+     test case in `evaluationSetResults` has at least one matching
+     `evaluationRunResults[]` entry for the configured evaluator,
+     and every such entry scored exactly 1.0 (deterministic agent +
+     deterministic evaluator: anything below 1.0 is a bug).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from _shared.project_root import find_project_root  # noqa: E402
+
+ROOT = find_project_root("adder")
+
+
+def _load_json(path: Path) -> dict:
+    if not path.is_file():
+        sys.exit(f"FAIL: Missing {path}")
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as e:
+        sys.exit(f"FAIL: {path} is not valid JSON: {e}")
+
+
+def find_single_json(directory: Path) -> Path:
+    if not directory.is_dir():
+        sys.exit(f"FAIL: {directory} does not exist")
+    files = sorted(p for p in directory.glob("*.json") if p.is_file())
+    if not files:
+        sys.exit(f"FAIL: {directory} contains no .json files")
+    if len(files) > 1:
+        sys.exit(f"FAIL: {directory} should contain exactly one .json file, got {len(files)}")
+    return files[0]
+
+
+def check_evaluator() -> str:
+    path = find_single_json(ROOT / "evaluations" / "evaluators")
+    doc = _load_json(path)
+    type_id = doc.get("evaluatorTypeId")
+    if type_id != "uipath-exact-match":
+        sys.exit(
+            f'FAIL: {path.name} evaluatorTypeId should be "uipath-exact-match", '
+            f'got {type_id!r}'
+        )
+    eval_id = doc.get("id")
+    if not eval_id:
+        sys.exit(f"FAIL: {path.name} is missing required `id` field")
+    print(f'OK: evaluator config {path.name} has evaluatorTypeId={type_id!r} id={eval_id!r}')
+    return eval_id
+
+
+def check_eval_set(evaluator_id: str) -> int:
+    path = find_single_json(ROOT / "evaluations" / "eval-sets")
+    doc = _load_json(path)
+    if doc.get("version") != "1.0":
+        sys.exit(f'FAIL: eval set version should be "1.0", got {doc.get("version")!r}')
+    refs = doc.get("evaluatorRefs") or []
+    if evaluator_id not in refs:
+        sys.exit(
+            f'FAIL: eval set `evaluatorRefs` does not include the evaluator '
+            f'id {evaluator_id!r}. Got: {refs}'
+        )
+    cases = doc.get("evaluations") or []
+    if len(cases) < 2:
+        sys.exit(f"FAIL: eval set must have at least 2 test cases, got {len(cases)}")
+    for i, case in enumerate(cases):
+        crit = case.get("evaluationCriterias") or {}
+        if evaluator_id not in crit:
+            sys.exit(
+                f'FAIL: eval set test case {i} (`{case.get("id", "?")}`) does '
+                f'not key its evaluationCriterias on the evaluator id '
+                f'{evaluator_id!r}. Got keys: {list(crit.keys())}'
+            )
+    print(f'OK: eval set {path.name} references {evaluator_id!r} across {len(cases)} test cases')
+    return len(cases)
+
+
+def check_results(evaluator_id: str, expected_case_count: int) -> None:
+    path = ROOT / "eval-results.json"
+    doc = _load_json(path)
+    if not isinstance(doc, dict):
+        sys.exit(f"FAIL: {path.name} top-level should be an object, got {type(doc).__name__}")
+    cases = doc.get("evaluationSetResults")
+    if not isinstance(cases, list) or not cases:
+        sys.exit(
+            f"FAIL: {path.name} is missing a non-empty `evaluationSetResults` "
+            f"list. Top-level keys: {list(doc.keys())}"
+        )
+    if len(cases) != expected_case_count:
+        sys.exit(
+            f"FAIL: expected {expected_case_count} entries in "
+            f"`evaluationSetResults` (one per eval-set test case), got {len(cases)}"
+        )
+    print(f'OK: eval-results.json carries {len(cases)} entries in evaluationSetResults')
+    bad_cases = []
+    matching_run_count = 0
+    for case in cases:
+        if not isinstance(case, dict):
+            continue
+        case_name = case.get("evaluationName") or "?"
+        runs = case.get("evaluationRunResults") or []
+        matching = [
+            r for r in runs
+            if isinstance(r, dict) and r.get("evaluatorId") == evaluator_id
+        ]
+        if not matching:
+            bad_cases.append(
+                f'{case_name!r}: no evaluationRunResults entry references '
+                f'evaluatorId={evaluator_id!r}'
+            )
+            continue
+        for r in matching:
+            score = (r.get("result") or {}).get("score")
+            if score != 1.0:
+                bad_cases.append(
+                    f'{case_name!r}: evaluator {evaluator_id!r} scored '
+                    f'{score!r}, expected 1.0 (deterministic agent + '
+                    f'deterministic evaluator)'
+                )
+        matching_run_count += len(matching)
+    if bad_cases:
+        sys.exit("FAIL: " + " | ".join(bad_cases))
+    print(
+        f'OK: every test case has an ExactMatchEvaluator run scoring 1.0 '
+        f'({matching_run_count} run(s) across {len(cases)} case(s))'
+    )
+
+
+def main() -> None:
+    if not ROOT.is_dir():
+        sys.exit(f"FAIL: project directory {ROOT} does not exist")
+    evaluator_id = check_evaluator()
+    case_count = check_eval_set(evaluator_id)
+    check_results(evaluator_id, case_count)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/tasks/uipath-agents/eval_exact_match/eval_exact_match.yaml b/tests/tasks/uipath-agents/eval_exact_match/eval_exact_match.yaml
new file mode 100644
index 000000000..e63a7d144
--- /dev/null
+++ b/tests/tasks/uipath-agents/eval_exact_match/eval_exact_match.yaml
@@ -0,0 +1,66 @@
+task_id: skill-agent-coded-eval-exact-match
+description: >
+  Coded-agent eval lifecycle, deterministic path. Verifies the skill
+  guides the agent to author an evaluator config under
+  `evaluations/evaluators/`, an evaluation set under
+  `evaluations/eval-sets/` whose `evaluatorRefs` match the evaluator
+  `id`, and run `uip codedagent eval --no-report` against a
+  deterministic Simple Function agent. Every test case must score
+  exactly 1.0 in the saved `eval-results.json`.
+tags: [uipath-agents, e2e, coded, lifecycle:validate, feature:eval]
+max_iterations: 1
+
+agent:
+  type: claude-code
+  permission_mode: acceptEdits
+  allowed_tools: ["Skill", "Bash", "Read", "Write", "Edit", "Glob", "Grep"]
+  turn_timeout: 1200
+
+sandbox:
+  driver: tempdir
+  python: {}
+
+initial_prompt: |
+  Build a Simple Function UiPath coded agent named `adder` whose
+  `main(input)` returns the sum of two integers. No LLM — purely
+  deterministic.
+
+  Input fields: `a` (int), `b` (int). Output field: `result` (int).
+
+  Take the agent through scaffold → init → run, then add an
+  evaluation harness using the **ExactMatch evaluator** with three
+  test cases (e.g. 2+3=5, 0+0=0, 7+1=8). Run the eval set locally
+  (no Studio Web reporting) and save the output to
+  `eval-results.json` in the project root.
+
+  Every test case must score 1.0 — deterministic agent +
+  deterministic evaluator means anything below 1.0 is a regression.
+
+  Do NOT publish, upload, or deploy. Do NOT call `uip login`. Do NOT
+  pause between planning and implementation. Complete end-to-end in
+  a single pass.
+
+success_criteria:
+  - type: command_executed
+    description: "Agent scaffolded the project with uip codedagent new"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+new'
+    min_count: 1
+    weight: 1.0
+    pass_threshold: 1.0
+
+  - type: command_executed
+    description: "Agent ran the eval suite with --no-report"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+eval\s+.*--no-report'
+    min_count: 1
+    weight: 2.0
+    pass_threshold: 1.0
+
+  - type: run_command
+    description: "Evaluator + eval-set + results shape, every test case scored 1.0"
+    command: "python3 $TASK_DIR/check_eval_exact_match.py"
+    timeout: 30
+    expected_exit_code: 0
+    weight: 5.0
+    pass_threshold: 1.0
diff --git a/tests/tasks/uipath-agents/eval_llm_judges/check_eval_llm_judges.py b/tests/tasks/uipath-agents/eval_llm_judges/check_eval_llm_judges.py
new file mode 100644
index 000000000..5c90e6206
--- /dev/null
+++ b/tests/tasks/uipath-agents/eval_llm_judges/check_eval_llm_judges.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""Eval-lifecycle check for the LLM-judge path (two evaluators).
+
+Validates the dual-evaluator harness:
+  - `LLMJudgeOutputEvaluator` config with the
+    `uipath-llm-judge-output-semantic-similarity` typeId.
+  - `LLMJudgeTrajectoryEvaluator` config with the
+    `uipath-llm-judge-trajectory-similarity` typeId.
+  - One eval set whose `evaluatorRefs` lists BOTH ids and whose test
+    cases key `evaluationCriterias` on BOTH ids — the output judge
+    gets an `expectedOutput` block, the trajectory judge gets an
+    `expectedAgentBehavior` string.
+  - `eval-results.json` exists and is an object whose
+    `evaluationSetResults` is a non-empty list. LLM-judge scores are
+    continuous (0.0-1.0) so we don't assert an exact score — only that
+    the file is well-formed and references the expected evaluator ids.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from _shared.project_root import find_project_root  # noqa: E402
+
+ROOT = find_project_root("intent-classifier")
+
+EXPECTED_EVALUATORS = {
+    "LLMJudgeOutputEvaluator": "uipath-llm-judge-output-semantic-similarity",
+    "LLMJudgeTrajectoryEvaluator": "uipath-llm-judge-trajectory-similarity",
+}
+
+
+def _load_json(path: Path) -> dict:
+    if not path.is_file():
+        sys.exit(f"FAIL: Missing {path}")
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as e:
+        sys.exit(f"FAIL: {path} is not valid JSON: {e}")
+
+
+def check_evaluator_configs() -> None:
+    evaluators_dir = ROOT / "evaluations" / "evaluators"
+    if not evaluators_dir.is_dir():
+        sys.exit(f"FAIL: {evaluators_dir} does not exist")
+    found_by_id: dict[str, Path] = {}
+    for json_file in sorted(evaluators_dir.glob("*.json")):
+        doc = _load_json(json_file)
+        eval_id = doc.get("id")
+        type_id = doc.get("evaluatorTypeId")
+        if eval_id in EXPECTED_EVALUATORS:
+            expected_type = EXPECTED_EVALUATORS[eval_id]
+            if type_id != expected_type:
+                sys.exit(
+                    f'FAIL: evaluator {eval_id!r} should have evaluatorTypeId='
+                    f'{expected_type!r}, got {type_id!r}'
+                )
+            found_by_id[eval_id] = json_file
+            print(f'OK: evaluator config {json_file.name} has id={eval_id!r} typeId={type_id!r}')
+    missing = set(EXPECTED_EVALUATORS) - set(found_by_id)
+    if missing:
+        sys.exit(
+            f'FAIL: missing evaluator configs for ids {sorted(missing)}. '
+            f'Found ids: {sorted(found_by_id)}'
+        )
+
+
+def check_eval_set() -> None:
+    eval_sets_dir = ROOT / "evaluations" / "eval-sets"
+    if not eval_sets_dir.is_dir():
+        sys.exit(f"FAIL: {eval_sets_dir} does not exist")
+    files = sorted(eval_sets_dir.glob("*.json"))
+    if not files:
+        sys.exit(f"FAIL: no eval set files in {eval_sets_dir}")
+    if len(files) > 1:
+        sys.exit(f"FAIL: expected exactly one eval set file, got {len(files)}")
+    path = files[0]
+    doc = _load_json(path)
+    if doc.get("version") != "1.0":
+        sys.exit(f'FAIL: eval set version should be "1.0", got {doc.get("version")!r}')
+    refs = doc.get("evaluatorRefs") or []
+    missing_refs = set(EXPECTED_EVALUATORS) - set(refs)
+    if missing_refs:
+        sys.exit(
+            f'FAIL: eval set `evaluatorRefs` is missing {sorted(missing_refs)}. '
+            f'Got: {refs}'
+        )
+    cases = doc.get("evaluations") or []
+    if len(cases) < 2:
+        sys.exit(f"FAIL: eval set must have at least 2 test cases, got {len(cases)}")
+    for i, case in enumerate(cases):
+        crit = case.get("evaluationCriterias") or {}
+        for evaluator_id in EXPECTED_EVALUATORS:
+            if evaluator_id not in crit:
+                sys.exit(
+                    f'FAIL: test case {i} (`{case.get("id", "?")}`) does not '
+                    f'key evaluationCriterias on {evaluator_id!r}. Got keys: '
+                    f'{list(crit.keys())}'
+                )
+        # Trajectory judge requires `expectedAgentBehavior`.
+        traj = crit.get("LLMJudgeTrajectoryEvaluator") or {}
+        if not traj.get("expectedAgentBehavior"):
+            sys.exit(
+                f'FAIL: test case {i} LLMJudgeTrajectoryEvaluator entry is '
+                f'missing the required `expectedAgentBehavior` field. Got: {traj}'
+            )
+        # Output judge requires `expectedOutput`.
+        out = crit.get("LLMJudgeOutputEvaluator") or {}
+        if "expectedOutput" not in out:
+            sys.exit(
+                f'FAIL: test case {i} LLMJudgeOutputEvaluator entry is '
+                f'missing the required `expectedOutput` field. Got: {out}'
+            )
+    print(
+        f"OK: eval set {path.name} references both judges across {len(cases)} "
+        "test cases with the right per-judge criteria"
+    )
+
+
+def check_results() -> None:
+    path = ROOT / "eval-results.json"
+    doc = _load_json(path)
+    if not isinstance(doc, dict):
+        sys.exit(f"FAIL: {path.name} top-level should be an object, got {type(doc).__name__}")
+    cases = doc.get("evaluationSetResults")
+    if not isinstance(cases, list) or not cases:
+        sys.exit(
+            f"FAIL: {path.name} is missing a non-empty `evaluationSetResults` "
+            f"list. Top-level keys: {list(doc.keys())}"
+        )
+    seen_ids: set[str] = set()
+    for c in cases:
+        if not isinstance(c, dict):
+            continue
+        for r in c.get("evaluationRunResults") or []:
+            if isinstance(r, dict):
+                eid = r.get("evaluatorId")
+                if eid:
+                    seen_ids.add(eid)
+    missing = set(EXPECTED_EVALUATORS) - seen_ids
+    if missing:
+        sys.exit(
+            f'FAIL: results file does not surface evaluatorId entries for '
+            f'{sorted(missing)} (seen: {sorted(seen_ids)}). Both judges '
+            f'should run on every test case.'
+        )
+    print(
+        f"OK: results file references both evaluator ids ({sorted(seen_ids)}) "
+        f"across {len(cases)} test case(s)"
+    )
+
+
+def main() -> None:
+    if not ROOT.is_dir():
+        sys.exit(f"FAIL: project directory {ROOT} does not exist")
+    check_evaluator_configs()
+    check_eval_set()
+    check_results()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/tasks/uipath-agents/eval_llm_judges/eval_llm_judges.yaml b/tests/tasks/uipath-agents/eval_llm_judges/eval_llm_judges.yaml
new file mode 100644
index 000000000..f94c9803e
--- /dev/null
+++ b/tests/tasks/uipath-agents/eval_llm_judges/eval_llm_judges.yaml
@@ -0,0 +1,67 @@
+task_id: skill-agent-coded-eval-llm-judges
+description: >
+  Coded-agent eval lifecycle, LLM-judge path. Verifies the agent
+  authors two LLM-judge evaluator configs in one eval set —
+  `LLMJudgeOutputEvaluator`
+  (`uipath-llm-judge-output-semantic-similarity`) and
+  `LLMJudgeTrajectoryEvaluator`
+  (`uipath-llm-judge-trajectory-similarity`) — and runs them against
+  a LangGraph classifier with `--no-report` and `--mocker-cache` so
+  the judges are reproducible.
+tags: [uipath-agents, e2e, coded, lifecycle:validate, feature:eval]
+max_iterations: 1
+
+agent:
+  type: claude-code
+  permission_mode: acceptEdits
+  allowed_tools: ["Skill", "Bash", "Read", "Write", "Edit", "Glob", "Grep"]
+  turn_timeout: 1200
+
+sandbox:
+  driver: tempdir
+  python: {}
+
+initial_prompt: |
+  Build a LangGraph UiPath coded agent named `intent-classifier` that
+  classifies a free-text user request into one of: `weather`,
+  `news`, `joke`.
+
+  Input: `text` (str). Output: `category` (str), `text` (str).
+
+  For deterministic test runs, pin the LLM to a low-cost gateway
+  model (e.g. `gpt-4o-mini-2024-07-18`) with `temperature=0`.
+
+  Take the agent through scaffold → init → run, then add an
+  evaluation harness with **two LLM judges in one eval set**:
+
+  - The semantic-output LLM judge (id `LLMJudgeOutputEvaluator`).
+    Each test case's criteria block carries an `expectedOutput`.
+  - The trajectory LLM judge (id `LLMJudgeTrajectoryEvaluator`).
+    Each test case's criteria block carries an
+    `expectedAgentBehavior` string — e.g. "Agent classifies the
+    input into exactly one of weather/news/joke and returns that
+    label."
+
+  Three test cases. Run the eval set locally (no Studio Web
+  reporting) and cache LLM responses for reproducibility. Save the
+  results to `eval-results.json` in the project root.
+
+  Do NOT publish, upload, or deploy. Do NOT pause between planning
+  and implementation. Complete end-to-end in a single pass.
+
+success_criteria:
+  - type: command_executed
+    description: "Agent ran the eval suite with --no-report"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+eval\s+.*--no-report'
+    min_count: 1
+    weight: 2.0
+    pass_threshold: 1.0
+
+  - type: run_command
+    description: "Both LLM-judge evaluator configs + eval-set + results shape"
+    command: "python3 $TASK_DIR/check_eval_llm_judges.py"
+    timeout: 30
+    expected_exit_code: 0
+    weight: 5.0
+    pass_threshold: 1.0