UiPath · radugheo · May 5, 2026 · Apr 30, 2026
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""Deploy-lifecycle artifact + metadata check.
+
+Asserts the artifacts `uip codedagent pack` / `deploy` produce in
+`.uipath/`, and that `pyproject.toml` carries the four fields the
+deployment guide flags as required (`name`, `version`, `description`,
+`authors`). Without `authors`, packaging fails with `Project authors
+cannot be empty`.
+
+Checks:
+  1. `deploy-smoke/pyproject.toml` has `name`, `version`,
+     `description`, and `authors`. No `[build-system]`.
+  2. `deploy-smoke/.uipath/` exists and contains a `*.nupkg` file
+     (proof that `pack` ran successfully).
+  3. `deploy-smoke/invoke-output.txt` exists, is non-empty, and
+     contains an `https://` substring — the monitoring URL that
+     `uip codedagent invoke` is expected to surface.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+# Add the parent of this script's directory to the import path so the
+# `_shared` package imported on the next line can be found.
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from _shared.project_root import find_project_root  # noqa: E402
+
+# Project directory for the `deploy-smoke` agent, as located by the shared
+# helper; every check in this script resolves its paths relative to it.
+ROOT = find_project_root("deploy-smoke")
+
+
+def _read_text(path: Path) -> str:
+    if not path.is_file():
+        sys.exit(f"FAIL: Missing {path}")
+    return path.read_text(encoding="utf-8")
+
+
+def check_pyproject() -> None:
+    text = _read_text(ROOT / "pyproject.toml")
+    if "[build-system]" in text:
+        sys.exit("FAIL: pyproject.toml contains a [build-system] section")
+    for needle in ("name", "version", "description", "authors"):
+        if needle not in text:
+            sys.exit(
+                f"FAIL: pyproject.toml is missing `{needle}` — "
+                "deployment guide requires all four fields."
+            )
+    print("OK: pyproject.toml has name, version, description, authors")
+
+
+def check_pack_artifacts() -> None:
+    uipath_dir = ROOT / ".uipath"
+    if not uipath_dir.is_dir():
+        sys.exit(
+            f"FAIL: {uipath_dir} does not exist — `uip codedagent pack` "
+            "did not run."
+        )
+    nupkgs = sorted(uipath_dir.glob("*.nupkg"))
+    if not nupkgs:
+        sys.exit(
+            f"FAIL: no .nupkg file in {uipath_dir} — pack did not produce "
+            "the expected package artifact."
+        )
+    print(f"OK: {uipath_dir.name}/{nupkgs[0].name} exists ({len(nupkgs)} package(s) total)")
+
+
+def check_invoke_output() -> None:
+    path = ROOT / "invoke-output.txt"
+    text = _read_text(path)
+    if not text.strip():
+        sys.exit(f"FAIL: {path.name} is empty — `uip codedagent invoke` produced no output")
+    if "https://" not in text:
+        sys.exit(f"FAIL: {path.name} does not contain a monitoring URL (no `https://` substring)")
+    print(f"OK: {path.name} captured {len(text)} bytes of invoke stdout (with monitoring URL)")
+
+
+def main() -> None:
+    if not ROOT.is_dir():
+        sys.exit(f"FAIL: project directory {ROOT} does not exist")
+    check_pyproject()
+    check_pack_artifacts()
+    check_invoke_output()
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,65 @@
+# Eval task: deploy a coded agent to the personal workspace, then invoke it.
+task_id: skill-agent-coded-deploy-my-workspace
+description: >
+  Coded-agent deploy lifecycle. Verifies the skill guides the agent
+  through `uip codedagent pack` → `uip codedagent publish
+  --my-workspace` (or the combined `uip codedagent deploy
+  --my-workspace`), then `uip codedagent invoke` to start a cloud
+  job and surface the monitoring URL. Exercises the entire production
+  packaging path the existing test suite never reaches.
+tags: [uipath-agents, e2e, coded, lifecycle:deploy, feature:deploy]
+max_iterations: 1
+
+agent:
+  type: claude-code
+  permission_mode: acceptEdits
+  allowed_tools: ["Skill", "Bash", "Read", "Write", "Edit", "Glob", "Grep"]
+  turn_timeout: 1200
+
+sandbox:
+  driver: tempdir
+  python: {}
+
+initial_prompt: |
+  Build a minimal Simple Function UiPath coded agent named
+  `deploy-smoke` whose `main(input)` returns `{"echoed":
+  input.message}`. No LLM. The point is the deployment lifecycle,
+  not the agent logic.
+
+  Take the agent through scaffold → init → run, then deploy it to
+  the user's personal workspace and invoke the published version
+  with `{"message": "deployed"}`.
+
+  After invoke succeeds, write its full stdout (which includes the
+  monitoring URL) to `deploy-smoke/invoke-output.txt` (inside the
+  scaffolded project directory, alongside `pyproject.toml`) so the
+  test harness can verify the URL was surfaced.
+
+  The test harness has UiPath auth pre-configured.
+
+  Do NOT pause between planning and implementation. Complete
+  end-to-end in a single pass.
+
+# Three weighted criteria (2.5 + 2.5 + 4.0 = 9.0); each one declares
+# pass_threshold 1.0.
+success_criteria:
+  # Satisfied by either `deploy` or `publish`, provided `--my-workspace`
+  # appears later in the matched command text.
+  - type: command_executed
+    description: "Agent packed and published to my-workspace (deploy or pack+publish)"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+(deploy|publish)\s+.*--my-workspace'
+    min_count: 1
+    weight: 2.5
+    pass_threshold: 1.0
+
+  # The pattern requires the literal entry point name `main` right after `invoke`.
+  - type: command_executed
+    description: "Agent invoked the published agent"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+invoke\s+main'
+    min_count: 1
+    weight: 2.5
+    pass_threshold: 1.0
+
+  # Runs the checker script from $TASK_DIR; only exit code 0 counts as a pass.
+  - type: run_command
+    description: "Deploy artifacts present and pyproject.toml is well-formed for packaging"
+    command: "python3 $TASK_DIR/check_deploy_my_workspace.py"
+    timeout: 30
+    expected_exit_code: 0
+    weight: 4.0
+    pass_threshold: 1.0
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""Eval-lifecycle check for the deterministic ExactMatch path.
+
+Validates that the agent authored both halves of the evaluation
+harness — the evaluator config under `evaluations/evaluators/` AND
+the evaluation set under `evaluations/eval-sets/` whose `evaluatorRefs`
+match the evaluator `id` — and that `uip codedagent eval --no-report`
+produced an output file in which every test case has a run for that
+evaluator scoring exactly 1.0 (deterministic agent + deterministic
+evaluator means anything else is a bug).
+
+Checks:
+  1. `adder/evaluations/evaluators/<file>.json` has `evaluatorTypeId`
+     == "uipath-exact-match" and a non-empty `id`.
+  2. `adder/evaluations/eval-sets/<file>.json` has version "1.0",
+     `evaluatorRefs` referencing the evaluator id, at least 2 test
+     cases, and each test case's `evaluationCriterias` keys the
+     evaluator id.
+  3. `eval-results.json` exists with the documented top-level shape
+     (`evaluationSetName`, `evaluationSetResults: [...]`), every
+     test case in `evaluationSetResults` has at least one matching
+     `evaluationRunResults[]` entry for the configured evaluator,
+     and every such entry scored exactly 1.0 (deterministic agent +
+     deterministic evaluator: anything below 1.0 is a bug).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+
+# Add the parent of this script's directory to the import path so the
+# `_shared` package imported on the next line can be found.
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from _shared.project_root import find_project_root  # noqa: E402
+
+# Project directory for the `adder` agent, as located by the shared helper;
+# every check in this script resolves its paths relative to it.
+ROOT = find_project_root("adder")
+
+
+def _load_json(path: Path) -> dict:
+    if not path.is_file():
+        sys.exit(f"FAIL: Missing {path}")
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as e:
+        sys.exit(f"FAIL: {path} is not valid JSON: {e}")
+
+
+def find_single_json(directory: Path) -> Path:
+    if not directory.is_dir():
+        sys.exit(f"FAIL: {directory} does not exist")
+    files = sorted(p for p in directory.glob("*.json") if p.is_file())
+    if not files:
+        sys.exit(f"FAIL: {directory} contains no .json files")
+    if len(files) > 1:
+        sys.exit(f"FAIL: {directory} should contain exactly one .json file, got {len(files)}")
+    return files[0]
+
+
+def check_evaluator() -> str:
+    path = find_single_json(ROOT / "evaluations" / "evaluators")
+    doc = _load_json(path)
+    type_id = doc.get("evaluatorTypeId")
+    if type_id != "uipath-exact-match":
+        sys.exit(
+            f'FAIL: {path.name} evaluatorTypeId should be "uipath-exact-match", '
+            f'got {type_id!r}'
+        )
+    eval_id = doc.get("id")
+    if not eval_id:
+        sys.exit(f"FAIL: {path.name} is missing required `id` field")
+    print(f'OK: evaluator config {path.name} has evaluatorTypeId={type_id!r} id={eval_id!r}')
+    return eval_id
+
+
+def check_eval_set(evaluator_id: str) -> int:
+    path = find_single_json(ROOT / "evaluations" / "eval-sets")
+    doc = _load_json(path)
+    if doc.get("version") != "1.0":
+        sys.exit(f'FAIL: eval set version should be "1.0", got {doc.get("version")!r}')
+    refs = doc.get("evaluatorRefs") or []
+    if evaluator_id not in refs:
+        sys.exit(
+            f'FAIL: eval set `evaluatorRefs` does not include the evaluator '
+            f'id {evaluator_id!r}. Got: {refs}'
+        )
+    cases = doc.get("evaluations") or []
+    if len(cases) < 2:
+        sys.exit(f"FAIL: eval set must have at least 2 test cases, got {len(cases)}")
+    for i, case in enumerate(cases):
+        crit = case.get("evaluationCriterias") or {}
+        if evaluator_id not in crit:
+            sys.exit(
+                f'FAIL: eval set test case {i} (`{case.get("id", "?")}`) does '
+                f'not key its evaluationCriterias on the evaluator id '
+                f'{evaluator_id!r}. Got keys: {list(crit.keys())}'
+            )
+    print(f'OK: eval set {path.name} references {evaluator_id!r} across {len(cases)} test cases')
+    return len(cases)
+
+
+def check_results(evaluator_id: str, expected_case_count: int) -> None:
+    path = ROOT / "eval-results.json"
+    doc = _load_json(path)
+    if not isinstance(doc, dict):
+        sys.exit(f"FAIL: {path.name} top-level should be an object, got {type(doc).__name__}")
+    cases = doc.get("evaluationSetResults")
+    if not isinstance(cases, list) or not cases:
+        sys.exit(
+            f"FAIL: {path.name} is missing a non-empty `evaluationSetResults` "
+            f"list. Top-level keys: {list(doc.keys())}"
+        )
+    if len(cases) != expected_case_count:
+        sys.exit(
+            f"FAIL: expected {expected_case_count} entries in "
+            f"`evaluationSetResults` (one per eval-set test case), got {len(cases)}"
+        )
+    print(f'OK: eval-results.json carries {len(cases)} entries in evaluationSetResults')
+    bad_cases = []
+    matching_run_count = 0
+    for case in cases:
+        if not isinstance(case, dict):
+            continue
+        case_name = case.get("evaluationName") or "?"
+        runs = case.get("evaluationRunResults") or []
+        matching = [
+            r for r in runs
+            if isinstance(r, dict) and r.get("evaluatorId") == evaluator_id
+        ]
+        if not matching:
+            bad_cases.append(
+                f'{case_name!r}: no evaluationRunResults entry references '
+                f'evaluatorId={evaluator_id!r}'
+            )
+            continue
+        for r in matching:
+            score = (r.get("result") or {}).get("score")
+            if score != 1.0:
+                bad_cases.append(
+                    f'{case_name!r}: evaluator {evaluator_id!r} scored '
+                    f'{score!r}, expected 1.0 (deterministic agent + '
+                    f'deterministic evaluator)'
+                )
+        matching_run_count += len(matching)
+    if bad_cases:
+        sys.exit("FAIL: " + " | ".join(bad_cases))
+    print(
+        f'OK: every test case has an ExactMatchEvaluator run scoring 1.0 '
+        f'({matching_run_count} run(s) across {len(cases)} case(s))'
+    )
+
+
+def main() -> None:
+    if not ROOT.is_dir():
+        sys.exit(f"FAIL: project directory {ROOT} does not exist")
+    evaluator_id = check_evaluator()
+    case_count = check_eval_set(evaluator_id)
+    check_results(evaluator_id, case_count)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,66 @@
+# Eval task: author an ExactMatch evaluation harness for a deterministic
+# coded agent and run it locally.
+task_id: skill-agent-coded-eval-exact-match
+description: >
+  Coded-agent eval lifecycle, deterministic path. Verifies the skill
+  guides the agent to author an evaluator config under
+  `evaluations/evaluators/`, an evaluation set under
+  `evaluations/eval-sets/` whose `evaluatorRefs` match the evaluator
+  `id`, and run `uip codedagent eval --no-report` against a
+  deterministic Simple Function agent. All test cases must score 1.0
+  and report PASSED in the output file.
+tags: [uipath-agents, e2e, coded, lifecycle:validate, feature:eval]
+max_iterations: 1
+
+agent:
+  type: claude-code
+  permission_mode: acceptEdits
+  allowed_tools: ["Skill", "Bash", "Read", "Write", "Edit", "Glob", "Grep"]
+  turn_timeout: 1200
+
+sandbox:
+  driver: tempdir
+  python: {}
+
+initial_prompt: |
+  Build a Simple Function UiPath coded agent named `adder` whose
+  `main(input)` returns the sum of two integers. No LLM — purely
+  deterministic.
+
+  Input fields: `a` (int), `b` (int). Output field: `result` (int).
+
+  Take the agent through scaffold → init → run, then add an
+  evaluation harness using the **ExactMatch evaluator** with three
+  test cases (e.g. 2+3=5, 0+0=0, 7+1=8). Run the eval set locally
+  (no Studio Web reporting) and save the output to
+  `eval-results.json` in the project root.
+
+  Every test case must score 1.0 — deterministic agent +
+  deterministic evaluator means anything below 1.0 is a regression.
+
+  Do NOT publish, upload, or deploy. Do NOT call `uip login`. Do NOT
+  pause between planning and implementation. Complete end-to-end in
+  a single pass.
+
+# Three weighted criteria (1.0 + 2.0 + 5.0 = 8.0); each one declares
+# pass_threshold 1.0.
+success_criteria:
+  - type: command_executed
+    description: "Agent scaffolded the project with uip codedagent new"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+new'
+    min_count: 1
+    weight: 1.0
+    pass_threshold: 1.0
+
+  # The pattern expects `--no-report` to appear after `eval` and at least
+  # one whitespace character.
+  - type: command_executed
+    description: "Agent ran the eval suite with --no-report"
+    tool_name: "Bash"
+    command_pattern: 'uip\s+codedagent\s+eval\s+.*--no-report'
+    min_count: 1
+    weight: 2.0
+    pass_threshold: 1.0
+
+  # Runs the checker script from $TASK_DIR; only exit code 0 counts as a pass.
+  # NOTE(review): the descriptions mention PASSED — confirm whether the
+  # checker script verifies a status field or only the 1.0 score.
+  - type: run_command
+    description: "Evaluator + eval-set + results shape, all test cases PASSED"
+    command: "python3 $TASK_DIR/check_eval_exact_match.py"
+    timeout: 30
+    expected_exit_code: 0
+    weight: 5.0
+    pass_threshold: 1.0