diff --git a/docs/checklists/dogfood-evidence-adoption.md b/docs/checklists/dogfood-evidence-adoption.md new file mode 100644 index 0000000..da640a5 --- /dev/null +++ b/docs/checklists/dogfood-evidence-adoption.md @@ -0,0 +1,94 @@ +# Dogfood Evidence Adoption Checklist + +Use this checklist before adding a target repository as a dogfood report, +README badge, lifecycle result, validation note, or effectiveness example in +this kit. + +Dogfood evidence should make the kit easier to evaluate. It should not turn a +single target run into an unsupported effectiveness claim. + +## Required Before Adoption + +- Source tracking exists and names the kit source, commit, applied profile, and + adoption or setup context, usually in `.harness/source.json`. +- The target repository commit or PR being cited is stable and linkable. +- The report separates non-comparable setup work from comparable product-task + outcomes. +- Each counted product task has a task outcome record with repository ref, + prompt ref or prompt hash, expected boundary, known failure mode, files + changed, first-pass verification, final verification, and inclusion flags. +- The target normal completion gate is named from the target's real workflow. +- Deterministic, local, non-network, reasonably fast behavior checks are either + included in that normal gate or have a recorded reason for focused/manual + placement. +- Live API, credential, provider-uptime, visual, device, slow, watcher, or + otherwise fragile checks are kept outside the normal gate unless the target + intentionally expects them in normal verification. +- Failure records exist for non-transient failed setup checks, failed harness + checks, recurring agent mistakes, cross-environment mismatches, or high-risk + bug paths that should not recur. 
+- Each failure record names a regression test, fixture, smoke check, lint rule, + drift check, CI gate, or manual review point that detects or prevents + recurrence, or explains why no check is practical. +- Aggregate reports state clearly whether the evidence is baseline-vs-harnessed + or harnessed-only tracking. +- Harnessed-only reports explicitly say they do not prove effectiveness + improvement without a later comparison point. + +## Required Checks + +Run the target's normal gate and this kit's report validators before adopting +the evidence: + +```bash +python scripts/check_harness.py +python /path/to/harness-starter-kit/scripts/check_effectiveness_plan.py +python /path/to/harness-starter-kit/scripts/check_failure_memory.py +``` + +Use the target's real normal gate if it is not `python scripts/check_harness.py`. +For JavaScript targets, this might be `npm run check:harness`; for framework +targets, it might be `make test`, `just check`, Maven, Gradle, Django, or +another local command. + +## Reject Or Defer Adoption When + +- The evidence relies on local-only paths without stable repository refs or + prompt hashes. +- Setup failures are excluded from metrics but not evaluated for failure + memory. +- A template or placeholder task outcome is included in the effectiveness report + or comparable product-task count. +- The aggregate report says product tasks are complete while also saying no + product-task records are complete. +- The report uses Harness Doctor, passing checks, or fixture tests as proof of + agent effectiveness. +- The target adopted starter-kit defaults blindly instead of preserving its own + architecture, package manager, docs, commands, and conventions. +- The example would require copying target-specific architecture into generic + templates. + +## Report Placement + +Use the smallest durable placement that fits the evidence: + +- `docs/examples/effectiveness-report-<target>-dogfood.md` for an aggregate + dogfood report. 
+- `docs/examples/lifecycle-pilot-results.md` for a short lifecycle or dogfood + summary. +- `docs/evaluation.md` for the example index. +- `docs/validation.md` when the target is used as validation or dogfood + evidence. +- README badges only when the target repository is public and intentionally + maintained as dogfood evidence. + +## Review Questions + +- Does the report preserve the target repository as the source of truth? +- Does it count only comparable product-task outcomes? +- Does it name the target's real normal gate and gate-placement decisions? +- Does it record misses honestly, including wrong-file edits and failed first + verification? +- Does it link failure memory to detection or prevention? +- Does it avoid claiming improvement unless there is a comparable baseline or + later comparison window? diff --git a/docs/component-map.md b/docs/component-map.md index 3542957..a69c709 100644 --- a/docs/component-map.md +++ b/docs/component-map.md @@ -26,6 +26,7 @@ This map connects harness engineering concepts to files in a target repository. 
| External API work recipe | server-only API boundary, redaction, live/mock fallback, and smoke checks | `docs/checklists/external-api-work.md` | | Decision and failure memory guidance | examples for when to record ADRs, failure notes, domain docs, or final-report notes | `docs/checklists/decision-failure-memory.md` | | Verification script patterns | custom smoke checks and transparent `check:harness` composition | `docs/checklists/verification-scripts.md` | +| Dogfood evidence adoption | source tracking, task outcome, failure memory, gate placement, and claim-boundary review | `docs/checklists/dogfood-evidence-adoption.md` | | Stack-specific rules | lint/type/pre-commit/framework snippets | `templates/profiles/*` | | Stack profile guide | available profiles and how to treat snippets as reference material | `docs/profiles.md` | | Profile absorption | checklist for turning profile snippets into project rules | `docs/checklists/profile-absorption.md` | diff --git a/docs/decisions/0005-validate-dogfood-evidence-consistency.md b/docs/decisions/0005-validate-dogfood-evidence-consistency.md new file mode 100644 index 0000000..f2a2096 --- /dev/null +++ b/docs/decisions/0005-validate-dogfood-evidence-consistency.md @@ -0,0 +1,91 @@ +# 0005. Validate Dogfood Evidence Consistency Before Adoption + +## Status + +Accepted + +## Date + +2026-06-06 + +## Context + +Dogfood evidence is useful only when it preserves the difference between +harness health, setup evidence, comparable product-task outcomes, and actual +agent effectiveness. 
+ +During Harness ERP dogfood review, the evidence was directionally strong but +initially exposed two adoption-quality gaps: + +- an aggregate effectiveness report could say product-task runs were complete + while later text still said no product-task records were complete +- a task outcome template could accidentally keep inclusion flags enabled and + contaminate future mechanical counts + +The target repository remained the source of truth, and the right response was +not to make adoption automatic. The kit needed a small validation and checklist +layer so future dogfood evidence can be accepted or deferred using repeatable +criteria. + +## Decision + +Extend `scripts/check_effectiveness_plan.py` to validate dogfood evidence +consistency: + +- effectiveness reports that claim completed product-task outcomes must not + also contain stale "no completed records yet" language or "record outcomes as + they run" follow-up language +- task outcome templates or placeholder task outcomes must not be included in + effectiveness reports or comparable product-task counts + +Ship the same checker behavior in +`templates/generic/scripts/check_effectiveness_plan.py` so target repositories +receive the guard during adoption. + +Add `docs/checklists/dogfood-evidence-adoption.md` as a prompt-first review +checklist for deciding whether a dogfood target should become a report, +lifecycle note, validation note, or README badge in this kit. + +## Rationale + +- These checks catch concrete evidence-quality gaps without inferring + effectiveness improvement from passing tests or Harness Doctor scores. +- The validation remains lightweight and local, using the same standard-library + checker style as the rest of the kit. +- The checklist keeps dogfood adoption prompt-first and reviewable instead of + making the installer or checker copy target-specific architecture into + generic templates. 
+- Template inclusion flags are high-risk for future aggregation because a + parser can count them even when a human reader understands they are + placeholders. + +## Alternatives Considered + +- Manual review only: rejected because the Harness ERP review showed the same + stale aggregate text and template inclusion risk can survive until a later + reviewer notices it. +- Parse every task outcome as full YAML: rejected for now because the kit avoids + external dependencies and only needs a few scalar fields for this guard. +- Require baseline-vs-harnessed evidence before dogfood adoption: rejected + because harnessed-only dogfood is still useful operational evidence when it + is labeled correctly and does not claim improvement. + +## Consequences + +- `scripts/check_effectiveness_plan.py` now checks selected task outcome YAML + records in addition to adoption and effectiveness Markdown reports. +- Dogfood reports that contain stale aggregate completion language fail local + validation. +- Target-local template task outcome files must set inclusion flags to false, + unknown, TODO, or another non-truthy value. +- Future dogfood adoption should cite or run the dogfood evidence checklist + before adding README badges or validation examples. + +## Agent Guidance + +When adding dogfood evidence to this kit, run the target's normal gate and this +kit's effectiveness and failure-memory validators. Do not adopt the evidence +when template task outcomes are countable, stale aggregate language contradicts +the completed records, setup failures have not been evaluated for failure +memory, or the report implies effectiveness improvement without a comparison +point. diff --git a/docs/evaluation.md b/docs/evaluation.md index 048d2fe..3dd3135 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -111,3 +111,8 @@ cheaper to correct after the harness becomes part of the repository. 
- [Small harness outcome evidence report](examples/effectiveness-report-small-evidence.md) records three harnessed task outcomes and summarizes a narrow operational evidence pass without treating Harness Doctor scores or passing checks as proof of agent effectiveness. - [TodayBus harnessed-only dogfood benchmark](examples/effectiveness-report-todaybus-dogfood.md) records three product-task outcomes, excludes a non-comparable setup run, and treats the result as an initial benchmark rather than proof of effectiveness improvement. - [Harness ERP Spring/Maven dogfood benchmark](examples/effectiveness-report-harness-erp-dogfood.md) records five backend product-task outcomes, one honest boundary miss, prompt hashes, failure-memory linkage, and source tracking as initial benchmark evidence rather than proof of effectiveness improvement. + +Before adding a new dogfood report to this kit, use +[`docs/checklists/dogfood-evidence-adoption.md`](checklists/dogfood-evidence-adoption.md) +to verify source tracking, task outcomes, failure memory, gate placement, and +claim boundaries. diff --git a/docs/examples/task-outcomes/001-recipe-api-harness-adoption.yaml b/docs/examples/task-outcomes/001-recipe-api-harness-adoption.yaml index 0e070a4..3e5a1be 100644 --- a/docs/examples/task-outcomes/001-recipe-api-harness-adoption.yaml +++ b/docs/examples/task-outcomes/001-recipe-api-harness-adoption.yaml @@ -69,4 +69,5 @@ outcome: follow_up: harness_change_needed: false decision_or_failure_record: No failure record added; review findings were one-time adoption cleanup. 
- include_in_effectiveness_report: true \ No newline at end of file + include_in_effectiveness_report: true + include_in_comparable_product_task_count: false diff --git a/docs/examples/task-outcomes/002-recipe-api-category-feature.yaml b/docs/examples/task-outcomes/002-recipe-api-category-feature.yaml index 35223a8..e8100bc 100644 --- a/docs/examples/task-outcomes/002-recipe-api-category-feature.yaml +++ b/docs/examples/task-outcomes/002-recipe-api-category-feature.yaml @@ -69,4 +69,5 @@ outcome: follow_up: harness_change_needed: false decision_or_failure_record: docs/decisions/003-add-recipe-categories.md - include_in_effectiveness_report: true \ No newline at end of file + include_in_effectiveness_report: true + include_in_comparable_product_task_count: true diff --git a/docs/examples/task-outcomes/003-recipe-api-category-update-tests.yaml b/docs/examples/task-outcomes/003-recipe-api-category-update-tests.yaml index 17082e6..01ff750 100644 --- a/docs/examples/task-outcomes/003-recipe-api-category-update-tests.yaml +++ b/docs/examples/task-outcomes/003-recipe-api-category-update-tests.yaml @@ -55,4 +55,5 @@ outcome: follow_up: harness_change_needed: false decision_or_failure_record: Existing ADR updated. - include_in_effectiveness_report: true \ No newline at end of file + include_in_effectiveness_report: true + include_in_comparable_product_task_count: false diff --git a/docs/failures/0006-dogfood-evidence-consistency-gaps-were-not-checked.md b/docs/failures/0006-dogfood-evidence-consistency-gaps-were-not-checked.md new file mode 100644 index 0000000..017fe02 --- /dev/null +++ b/docs/failures/0006-dogfood-evidence-consistency-gaps-were-not-checked.md @@ -0,0 +1,69 @@ +# 0006. Dogfood Evidence Consistency Gaps Were Not Checked + +## Date Observed + +2026-06-06 + +## Failure Type + +Harness maintenance gap and repeated agent mistake risk. 
+ +## Goal + +Dogfood evidence adopted into this kit should not contain stale aggregate +effectiveness language or count placeholder task outcome templates as real +evidence. + +## What Happened Or Was Tried + +Harness ERP was used as Spring/Maven dogfood evidence. A review found the +evidence was useful but initially not adoptable as-is: + +- the aggregate effectiveness report said five comparable product-task runs + were complete while the interpretation still said no completed product-task + records existed yet +- the target-local task outcome template had inclusion flags set to true, which + could contaminate future mechanical aggregation + +The evidence was corrected in the target repository before adoption, but the +starter kit did not yet have a local check that would catch those two gaps for +future dogfood targets. + +## Why It Failed + +- `scripts/check_effectiveness_plan.py` validated required report sections and + TODO markers, but did not inspect consistency between completed-outcome claims + and stale no-records language. +- The checker did not inspect task outcome YAML records, so a placeholder or + template record could keep inclusion flags enabled without failing + validation. +- Dogfood adoption criteria were implicit in review judgment instead of written + as a reusable checklist. + +## Current Replacement + +`scripts/check_effectiveness_plan.py` now validates: + +- aggregate effectiveness reports that claim completed product-task outcomes do + not also use stale no-completed-records language +- task outcome templates and placeholder task outcomes are not included in + effectiveness reports or comparable product-task counts + +`templates/generic/scripts/check_effectiveness_plan.py` carries the same guard +for target repositories. `docs/checklists/dogfood-evidence-adoption.md` +documents the source tracking, task outcome, failure memory, gate placement, +and claim-boundary criteria for adding dogfood evidence to this kit. 
+ +## Detection Or Prevention Check + +`tests/test_check_effectiveness_plan.py` covers aggregate completion-language +contradictions, task outcome templates with truthy inclusion flags, and +placeholder task outcomes with truthy inclusion flags. `scripts/check_effectiveness_plan.py` +is the local checker that prevents those evidence-quality gaps from passing. + +## Agent Guidance + +Before adopting dogfood evidence, run `scripts/check_effectiveness_plan.py` and +review `docs/checklists/dogfood-evidence-adoption.md`. Do not count setup-only +runs as comparable product tasks, do not leave template task outcomes countable, +and do not claim effectiveness improvement from harnessed-only evidence. diff --git a/docs/validation.md b/docs/validation.md index 631eb6c..89b4b6e 100644 --- a/docs/validation.md +++ b/docs/validation.md @@ -99,6 +99,10 @@ claiming effectiveness improvement: - [Harness ERP Spring/Maven dogfood benchmark](examples/effectiveness-report-harness-erp-dogfood.md) for a Spring Boot backend target +Use the +[dogfood evidence adoption checklist](checklists/dogfood-evidence-adoption.md) +before adding another target as validation or effectiveness evidence. 
+ ## Example Reports Use these examples when checking whether a target adoption report is complete: diff --git a/scripts/check_effectiveness_plan.py b/scripts/check_effectiveness_plan.py index fdafe44..8f7f00e 100644 --- a/scripts/check_effectiveness_plan.py +++ b/scripts/check_effectiveness_plan.py @@ -54,6 +54,11 @@ "Skipped", ) +TASK_OUTCOME_INCLUDE_FIELDS = ( + "include_in_effectiveness_report", + "include_in_comparable_product_task_count", +) + NO_FAILURE_RECORD_PHRASES = ( "no failure record", "no failure note", @@ -154,6 +159,38 @@ TODO_RE = re.compile(r"\bTODO\b", flags=re.IGNORECASE) SECTION_RE = re.compile(r"^##\s+", flags=re.MULTILINE) +COMPLETED_OUTCOME_PATTERNS = ( + re.compile( + r"\b(?:one|two|three|four|five|six|seven|eight|nine|ten|\d+)\s+" + r"comparable\s+product-task\s+runs?\s+have\s+been\s+completed\b", + flags=re.IGNORECASE, + ), + re.compile( + r"\b(?:all\s+)?(?:one|two|three|four|five|six|seven|eight|nine|ten|\d+)\s+" + r"planned\s+product-task\s+records?\s+are\s+complete\b", + flags=re.IGNORECASE, + ), + re.compile( + r"\bproduct-task\s+outcomes\s+counted\s*\|\s*" + r"(?:not available|unknown|n/a)\s*\|\s*" + r"(?:[1-9]\d*|one|two|three|four|five|six|seven|eight|nine|ten)\b", + flags=re.IGNORECASE, + ), +) +STALE_NO_COMPLETED_PATTERNS = ( + re.compile( + r"\bno\s+completed\s+(?:product[- ]task\s+)?records?\s+yet\b", + flags=re.IGNORECASE, + ), + re.compile( + r"\bno\s+completed\s+product[- ]task\s+runs?\s+yet\b", + flags=re.IGNORECASE, + ), + re.compile( + r"\brecord\s+.{1,160}\btask\s+outcomes\s+as\s+they\s+run\b", + flags=re.IGNORECASE, + ), +) @dataclass(frozen=True) @@ -204,6 +241,18 @@ def iter_reports(root: Path) -> list[Path]: ] +def iter_task_outcomes(root: Path) -> list[Path]: + paths: list[Path] = [] + for pattern in ("*.yaml", "*.yml"): + for path in root.rglob(pattern): + relative = path.relative_to(root) + if is_ignored(relative) or is_template(relative): + continue + if "task-outcomes" in relative.parts or 
path.name.startswith("task-outcome"): + paths.append(path) + return sorted(set(paths)) + + def field_value(text: str, field: str) -> str | None: pattern = re.compile(rf"^(\s*)-\s*{re.escape(field)}:\s*(.*)$") lines = text.splitlines() @@ -250,6 +299,33 @@ def is_placeholder(value: str | None) -> bool: return value is None or not value or bool(TODO_RE.search(value)) +def yaml_field_value(text: str, field: str) -> str | None: + pattern = re.compile( + rf"^[ \t]*{re.escape(field)}:[ \t]*(.*?)[ \t]*$", + flags=re.MULTILINE, + ) + match = pattern.search(text) + if match is None: + return None + return match.group(1).split("#", 1)[0].strip() + + +def is_truthy_yaml_value(value: str | None) -> bool: + if value is None: + return False + normalized = value.strip().strip("\"'`").lower() + return normalized in {"true", "yes", "1"} + + +def is_missing_or_placeholder_yaml_value(value: str | None) -> bool: + if value is None: + return True + normalized = value.strip().strip("\"'`").lower() + return not normalized or normalized in {"todo", "unknown"} or bool( + TODO_RE.search(value) + ) + + def recorded_failure_exists(value: str | None) -> bool: if value is None: return False @@ -495,6 +571,85 @@ def validate_effectiveness_report(path: Path, text: str) -> list[Finding]: findings.append(Finding(path, f"missing required section: {section}")) if TODO_RE.search(text): findings.append(Finding(path, "effectiveness report still contains TODO")) + if any(pattern.search(text) for pattern in COMPLETED_OUTCOME_PATTERNS) and any( + pattern.search(text) for pattern in STALE_NO_COMPLETED_PATTERNS + ): + findings.append( + Finding(path, "contradictory effectiveness-report completion language") + ) + return findings + + +def validate_task_outcome(path: Path, text: str) -> list[Finding]: + report_include_value = yaml_field_value(text, "include_in_effectiveness_report") + comparable_count_value = yaml_field_value( + text, "include_in_comparable_product_task_count" + ) + findings: list[Finding] = 
[] + + if is_truthy_yaml_value(report_include_value) and is_missing_or_placeholder_yaml_value( + comparable_count_value + ): + findings.append( + Finding( + path, + ( + "task outcome included in effectiveness report must declare " + "include_in_comparable_product_task_count" + ), + ) + ) + + if is_truthy_yaml_value(comparable_count_value) and not is_truthy_yaml_value( + report_include_value + ): + findings.append( + Finding( + path, + ( + "task outcome included in comparable product-task count must set " + "include_in_effectiveness_report to true" + ), + ) + ) + + truthy_include_fields = [ + field + for field in TASK_OUTCOME_INCLUDE_FIELDS + if is_truthy_yaml_value(yaml_field_value(text, field)) + ] + if not truthy_include_fields: + return findings + + name = path.name.lower() + if "template" in name: + findings.append( + Finding( + path, + ( + "task outcome template must not be included in effectiveness " + "or comparable product-task counts" + ), + ) + ) + return findings + + placeholder_fields = [ + field + for field in ("id", "run_id", "prompt_summary") + if is_missing_or_placeholder_yaml_value(yaml_field_value(text, field)) + ] + if placeholder_fields: + findings.append( + Finding( + path, + ( + "placeholder task outcome must not be included in " + "effectiveness or comparable product-task counts" + ), + ) + ) + return findings @@ -514,6 +669,9 @@ def check_effectiveness_plan(root: Path, require_report: bool) -> int: if "effectiveness-report" in name: findings.extend(validate_effectiveness_report(path, text)) + for path in iter_task_outcomes(root): + findings.extend(validate_task_outcome(path, path.read_text(encoding="utf-8"))) + for finding in findings: print(f"{finding.path.relative_to(root)}: {finding.message}") diff --git a/templates/generic/scripts/check_effectiveness_plan.py b/templates/generic/scripts/check_effectiveness_plan.py index fdafe44..8f7f00e 100644 --- a/templates/generic/scripts/check_effectiveness_plan.py +++ 
b/templates/generic/scripts/check_effectiveness_plan.py @@ -54,6 +54,11 @@ "Skipped", ) +TASK_OUTCOME_INCLUDE_FIELDS = ( + "include_in_effectiveness_report", + "include_in_comparable_product_task_count", +) + NO_FAILURE_RECORD_PHRASES = ( "no failure record", "no failure note", @@ -154,6 +159,38 @@ TODO_RE = re.compile(r"\bTODO\b", flags=re.IGNORECASE) SECTION_RE = re.compile(r"^##\s+", flags=re.MULTILINE) +COMPLETED_OUTCOME_PATTERNS = ( + re.compile( + r"\b(?:one|two|three|four|five|six|seven|eight|nine|ten|\d+)\s+" + r"comparable\s+product-task\s+runs?\s+have\s+been\s+completed\b", + flags=re.IGNORECASE, + ), + re.compile( + r"\b(?:all\s+)?(?:one|two|three|four|five|six|seven|eight|nine|ten|\d+)\s+" + r"planned\s+product-task\s+records?\s+are\s+complete\b", + flags=re.IGNORECASE, + ), + re.compile( + r"\bproduct-task\s+outcomes\s+counted\s*\|\s*" + r"(?:not available|unknown|n/a)\s*\|\s*" + r"(?:[1-9]\d*|one|two|three|four|five|six|seven|eight|nine|ten)\b", + flags=re.IGNORECASE, + ), +) +STALE_NO_COMPLETED_PATTERNS = ( + re.compile( + r"\bno\s+completed\s+(?:product[- ]task\s+)?records?\s+yet\b", + flags=re.IGNORECASE, + ), + re.compile( + r"\bno\s+completed\s+product[- ]task\s+runs?\s+yet\b", + flags=re.IGNORECASE, + ), + re.compile( + r"\brecord\s+.{1,160}\btask\s+outcomes\s+as\s+they\s+run\b", + flags=re.IGNORECASE, + ), +) @dataclass(frozen=True) @@ -204,6 +241,18 @@ def iter_reports(root: Path) -> list[Path]: ] +def iter_task_outcomes(root: Path) -> list[Path]: + paths: list[Path] = [] + for pattern in ("*.yaml", "*.yml"): + for path in root.rglob(pattern): + relative = path.relative_to(root) + if is_ignored(relative) or is_template(relative): + continue + if "task-outcomes" in relative.parts or path.name.startswith("task-outcome"): + paths.append(path) + return sorted(set(paths)) + + def field_value(text: str, field: str) -> str | None: pattern = re.compile(rf"^(\s*)-\s*{re.escape(field)}:\s*(.*)$") lines = text.splitlines() @@ -250,6 +299,33 @@ def 
is_placeholder(value: str | None) -> bool: return value is None or not value or bool(TODO_RE.search(value)) +def yaml_field_value(text: str, field: str) -> str | None: + pattern = re.compile( + rf"^[ \t]*{re.escape(field)}:[ \t]*(.*?)[ \t]*$", + flags=re.MULTILINE, + ) + match = pattern.search(text) + if match is None: + return None + return match.group(1).split("#", 1)[0].strip() + + +def is_truthy_yaml_value(value: str | None) -> bool: + if value is None: + return False + normalized = value.strip().strip("\"'`").lower() + return normalized in {"true", "yes", "1"} + + +def is_missing_or_placeholder_yaml_value(value: str | None) -> bool: + if value is None: + return True + normalized = value.strip().strip("\"'`").lower() + return not normalized or normalized in {"todo", "unknown"} or bool( + TODO_RE.search(value) + ) + + def recorded_failure_exists(value: str | None) -> bool: if value is None: return False @@ -495,6 +571,85 @@ def validate_effectiveness_report(path: Path, text: str) -> list[Finding]: findings.append(Finding(path, f"missing required section: {section}")) if TODO_RE.search(text): findings.append(Finding(path, "effectiveness report still contains TODO")) + if any(pattern.search(text) for pattern in COMPLETED_OUTCOME_PATTERNS) and any( + pattern.search(text) for pattern in STALE_NO_COMPLETED_PATTERNS + ): + findings.append( + Finding(path, "contradictory effectiveness-report completion language") + ) + return findings + + +def validate_task_outcome(path: Path, text: str) -> list[Finding]: + report_include_value = yaml_field_value(text, "include_in_effectiveness_report") + comparable_count_value = yaml_field_value( + text, "include_in_comparable_product_task_count" + ) + findings: list[Finding] = [] + + if is_truthy_yaml_value(report_include_value) and is_missing_or_placeholder_yaml_value( + comparable_count_value + ): + findings.append( + Finding( + path, + ( + "task outcome included in effectiveness report must declare " + 
"include_in_comparable_product_task_count" + ), + ) + ) + + if is_truthy_yaml_value(comparable_count_value) and not is_truthy_yaml_value( + report_include_value + ): + findings.append( + Finding( + path, + ( + "task outcome included in comparable product-task count must set " + "include_in_effectiveness_report to true" + ), + ) + ) + + truthy_include_fields = [ + field + for field in TASK_OUTCOME_INCLUDE_FIELDS + if is_truthy_yaml_value(yaml_field_value(text, field)) + ] + if not truthy_include_fields: + return findings + + name = path.name.lower() + if "template" in name: + findings.append( + Finding( + path, + ( + "task outcome template must not be included in effectiveness " + "or comparable product-task counts" + ), + ) + ) + return findings + + placeholder_fields = [ + field + for field in ("id", "run_id", "prompt_summary") + if is_missing_or_placeholder_yaml_value(yaml_field_value(text, field)) + ] + if placeholder_fields: + findings.append( + Finding( + path, + ( + "placeholder task outcome must not be included in " + "effectiveness or comparable product-task counts" + ), + ) + ) + return findings @@ -514,6 +669,9 @@ def check_effectiveness_plan(root: Path, require_report: bool) -> int: if "effectiveness-report" in name: findings.extend(validate_effectiveness_report(path, text)) + for path in iter_task_outcomes(root): + findings.extend(validate_task_outcome(path, path.read_text(encoding="utf-8"))) + for finding in findings: print(f"{finding.path.relative_to(root)}: {finding.message}") diff --git a/tests/test_check_effectiveness_plan.py b/tests/test_check_effectiveness_plan.py index 5c5dbcc..f4a0e57 100644 --- a/tests/test_check_effectiveness_plan.py +++ b/tests/test_check_effectiveness_plan.py @@ -90,6 +90,39 @@ def touch_local_path(self, root: Path, relative: str) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text("", encoding="utf-8") + def write_task_outcome( + self, + root: Path, + relative: str, + include_in_report: str = "true", + 
include_in_count: str = "true", + task_id: str = "T1", + run_id: str = "T1-001", + prompt_summary: str = "Add route", + start_ref: str = "abc123", + ) -> None: + path = root / relative + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + "\n".join( + ( + "schema_version: 1", + "", + "task:", + f" id: {task_id}", + f" run_id: {run_id}", + f" prompt_summary: {prompt_summary}", + f" start_ref: {start_ref}", + "", + "follow_up:", + f" include_in_effectiveness_report: {include_in_report}", + f" include_in_comparable_product_task_count: {include_in_count}", + "", + ) + ), + encoding="utf-8", + ) + def write_package_json(self, root: Path, scripts: dict[str, str]) -> None: (root / "package.json").write_text( json.dumps({"scripts": scripts}), @@ -781,6 +814,42 @@ def test_complete_effectiveness_report_passes(self) -> None: self.assertEqual("", result.stdout) self.assertEqual(0, result.returncode) + def test_effectiveness_report_completion_contradiction_fails(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + (root / "node-effectiveness-report.md").write_text( + COMPLETE_EFFECTIVENESS_REPORT + + "\nFive comparable product-task runs have been completed.\n" + + "\n- Confounders or limitations: no baseline and no completed " + "product-task records yet.\n" + + "- Harness changes to make next: record T1 through T5 task " + "outcomes as they run.\n", + encoding="utf-8", + ) + + result = self.run_checker(root) + + self.assertIn( + "contradictory effectiveness-report completion language", + result.stdout, + ) + self.assertEqual(1, result.returncode) + + def test_no_completed_language_without_completion_claim_passes(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + (root / "node-effectiveness-report.md").write_text( + COMPLETE_EFFECTIVENESS_REPORT + + "\n- Confounders or limitations: no completed product-task " + "records yet.\n", + encoding="utf-8", + ) + + result = self.run_checker(root) + + 
self.assertEqual("", result.stdout) + self.assertEqual(0, result.returncode) + def test_effectiveness_report_with_todo_fails(self) -> None: with tempfile.TemporaryDirectory() as tmp: root = Path(tmp) @@ -794,6 +863,146 @@ def test_effectiveness_report_with_todo_fails(self) -> None: self.assertIn("effectiveness report still contains TODO", result.stdout) self.assertEqual(1, result.returncode) + def test_task_outcome_template_with_true_inclusion_fails(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + self.write_task_outcome( + root, + "docs/effectiveness/task-outcomes/task-outcome-template.yaml", + task_id="unknown", + run_id="unknown", + prompt_summary="unknown", + start_ref="unknown", + ) + + result = self.run_checker(root) + + self.assertIn("task outcome template must not be included", result.stdout) + self.assertEqual(1, result.returncode) + + def test_included_task_outcome_requires_comparable_count_flag(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + self.write_task_outcome( + root, + "docs/effectiveness/task-outcomes/001-route.yaml", + include_in_count="", + ) + + result = self.run_checker(root) + + self.assertIn( + "must declare include_in_comparable_product_task_count", + result.stdout, + ) + self.assertEqual(1, result.returncode) + + def test_comparable_task_outcome_requires_report_inclusion_flag(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + self.write_task_outcome( + root, + "docs/effectiveness/task-outcomes/001-route.yaml", + include_in_report="", + ) + + result = self.run_checker(root) + + self.assertIn( + "must set include_in_effectiveness_report to true", + result.stdout, + ) + self.assertEqual(1, result.returncode) + + def test_comparable_task_outcome_rejects_false_report_inclusion(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + self.write_task_outcome( + root, + "docs/effectiveness/task-outcomes/001-route.yaml", + 
include_in_report="false", + include_in_count="true", + ) + + result = self.run_checker(root) + + self.assertIn( + "must set include_in_effectiveness_report to true", + result.stdout, + ) + self.assertEqual(1, result.returncode) + + def test_report_included_non_product_task_outcome_passes(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + self.write_task_outcome( + root, + "docs/effectiveness/task-outcomes/001-adoption-cleanup.yaml", + include_in_report="true", + include_in_count="false", + ) + + result = self.run_checker(root) + + self.assertEqual("", result.stdout) + self.assertEqual(0, result.returncode) + + def test_task_outcome_template_with_false_inclusion_passes(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + self.write_task_outcome( + root, + "docs/effectiveness/task-outcomes/task-outcome-template.yaml", + include_in_report="false", + include_in_count="false", + task_id="unknown", + run_id="unknown", + prompt_summary="unknown", + start_ref="unknown", + ) + + result = self.run_checker(root) + + self.assertEqual("", result.stdout) + self.assertEqual(0, result.returncode) + + def test_placeholder_task_outcome_with_true_inclusion_fails(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + self.write_task_outcome( + root, + "docs/effectiveness/task-outcomes/001-placeholder.yaml", + task_id="unknown", + ) + + result = self.run_checker(root) + + self.assertIn( + "placeholder task outcome must not be included", + result.stdout, + ) + self.assertEqual(1, result.returncode) + + def test_todo_task_outcome_with_true_inclusion_fails(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + self.write_task_outcome( + root, + "docs/effectiveness/task-outcomes/001-todo.yaml", + task_id="TODO", + run_id="TODO", + prompt_summary="TODO", + start_ref="TODO", + ) + + result = self.run_checker(root) + + self.assertIn( + "placeholder task outcome must not be included", 
+ result.stdout, + ) + self.assertEqual(1, result.returncode) + if __name__ == "__main__": unittest.main()