diff --git a/.gitignore b/.gitignore index 92481d5..9e33315 100644 --- a/.gitignore +++ b/.gitignore @@ -65,4 +65,5 @@ target/ pyosmo/config.py.bak .pytest_cache -.hypothesis \ No newline at end of file +.hypothesis +models \ No newline at end of file diff --git a/doc/prd_tla_plus_verification.md b/doc/prd_tla_plus_verification.md new file mode 100644 index 0000000..4278485 --- /dev/null +++ b/doc/prd_tla_plus_verification.md @@ -0,0 +1,474 @@ +# PRD: Formal Verification of PyOsmo Engine with TLA+ + +| Field | Value | +|-------|-------| +| **Status** | Draft | +| **Author** | PyOsmo Team | +| **Created** | 2026-01-24 | +| **Last Updated** | 2026-01-24 | +| **Priority** | Medium | +| **Category** | Quality / Reliability | + +--- + +## 1. Problem Statement + +PyOsmo's test generation engine is a state machine with nested execution loops, two-level error handling, pluggable algorithms, and composable end conditions. The correctness of this engine is critical because: + +- **Users trust PyOsmo to generate valid test sequences.** If the engine has subtle state management bugs, generated tests may not faithfully represent the model, leading to false confidence in the system under test. +- **Error handling is complex and multi-layered.** The two-level strategy cascade (test-level + suite-level) has subtle interactions that are difficult to verify through unit testing alone. +- **Algorithm fairness claims are unproven.** The balancing algorithms claim to distribute step execution fairly, but no formal proof exists that they achieve this under all step availability patterns. +- **Existing bugs have been found in end conditions** during code review (inverted range checks, per-test vs. cumulative coverage semantics), suggesting more may exist in unexplored paths. + +Traditional testing cannot exhaustively verify these properties because the combinatorial explosion of states (error timing, guard availability changes, strategy combinations) makes complete coverage impractical. + +--- + +## 2. Goals + +### Primary Goals + +1. **Prove engine execution correctness**: Formally verify that lifecycle hooks execute in the guaranteed order under all error paths +2. **Verify error strategy behavior**: Prove that no error is silently dropped when `AlwaysRaise` is configured, and that `AllowCount(n)` has correct threshold semantics +3. **Prove algorithm fairness**: Mathematically verify that `BalancingAlgorithm` achieves `max_count - min_count <= 1` for all step availability patterns +4. **Validate end condition composition**: Verify that `And`/`Or` combinators behave correctly and that no non-`Endless` configuration causes infinite execution + +### Secondary Goals + +5. **Document engine invariants**: The TLA+ specifications serve as executable, machine-checkable documentation of intended behavior +6. **Establish formal methods practice**: Build team capability in formal verification for future use +7. **Identify and fix existing bugs**: Surface edge-case bugs that code review and testing have missed + +### Non-Goals + +- Formal verification of the entire codebase (model discovery, CLI, history data structures) +- Replacing existing pytest-based test suite +- Verifying Python implementation correctness (TLA+ verifies the *design*, not the code) +- Performance optimization + +--- + +## 3. Background + +### 3.1 What is TLA+ + +TLA+ (Temporal Logic of Actions) is a formal specification language created by Leslie Lamport. It models systems as state machines and uses the TLC model checker to exhaustively explore all reachable states, verifying: + +- **Safety properties** (invariants): Conditions that must hold in every reachable state ("bad things never happen") +- **Liveness properties**: Conditions that must eventually become true ("good things eventually happen") +- **Fairness**: Guarantees that enabled actions are eventually taken +- **Action properties**: Constraints on state transitions + +### 3.2 Industry Precedent + +- **Amazon Web Services**: Seven teams use TLA+ for critical distributed systems. TLC found a data-loss bug requiring 35 high-level steps to reproduce—impossible to find through testing. +- **MongoDB**: Uses TLA+ for replication protocol verification and conformance testing (trace checking). +- **Tendermint/Cosmos**: Model-based testing with TLA+ and Apalache generated 500+ lines of integration tests from 6 lines of specification. + +### 3.3 PyOsmo Engine Architecture + +The engine (`pyosmo/osmo.py`) implements a nested state machine: + +``` +IDLE → SUITE_RUNNING → TEST_RUNNING → STEP_EXECUTING → TEST_RUNNING → ... +``` + +With lifecycle hooks at each transition: +``` +before_suite → (before_test → (before → pre_X → step → post_X → after)* → after_test)* → after_suite +``` + +Key complexity comes from: +- Error strategies that may swallow or propagate exceptions at two levels +- End conditions evaluated at different loop points +- Guards that dynamically change step availability between iterations +- Algorithm state that depends on execution history + +--- + +## 4. Requirements + +### 4.1 Functional Requirements + +#### FR-1: Engine State Machine Specification +**Priority: P0** + +Write a TLA+ specification that models the PyOsmo execution engine with the following state variables: +- Execution phase (`idle`, `suite`, `test`, `step`) +- Test and step counters +- Error counters (per-test and per-suite) +- Hook execution trace +- Error strategy configuration + +The specification must model: +- All valid phase transitions +- Lifecycle hook execution at each transition +- Error occurrence and propagation through two-level strategy +- End condition evaluation and loop termination + +#### FR-2: Safety Properties +**Priority: P0** + +Define and verify these invariants: +- **Phase transition validity**: Only legal transitions occur (e.g., `step` can only transition to `test` or `suite`, never to `idle`) +- **Hook ordering**: `after_*` hooks are never called without corresponding `before_*` hooks having executed +- **Monotonic counters**: `total_steps` and `error_count` never decrease +- **Single active test**: At most one test case is in "running" state at any time +- **Steps added to active test only**: `add_step()` is never called when no test case is running + +#### FR-3: Liveness Properties +**Priority: P1** + +Define and verify these temporal properties: +- **Suite termination**: For any non-`Endless` end condition, the suite eventually reaches `idle` state +- **Step fairness**: Under `BalancingAlgorithm` with all steps always available, every step is eventually executed +- **Error propagation**: Under `AlwaysRaise`, every error eventually causes an exception to propagate + +#### FR-4: Error Strategy Cascade Specification +**Priority: P0** + +Model all four error strategies and verify: +- `AlwaysRaise`: No error is ever silently dropped +- `AlwaysIgnore`: Suite always terminates normally regardless of errors +- `IgnoreAsserts`: Only `AssertionError` is swallowed; all other errors propagate +- `AllowCount(n)`: Exactly `n` errors are tolerated; the `(n+1)`th error propagates + +Verify behavior for all combinations of test-level and suite-level strategies (4x4 = 16 combinations). + +#### FR-5: Algorithm Fairness Specification +**Priority: P1** + +Model each step-selection algorithm and verify: + +**BalancingAlgorithm:** +- After `k * |Steps|` iterations (k >= 1), `max_count - min_count <= 1` for all available steps +- Every continuously-available step is eventually selected + +**WeightedBalancingAlgorithm:** +- Computed weights are always positive (never negative or zero) +- The "rescue negative totals" normalization produces valid probability distributions +- With equal weights, behavior converges to `BalancingAlgorithm` properties + +**RandomAlgorithm / WeightedAlgorithm:** +- `choices` list is never empty when `choose()` is called (guard against `IndexError`) + +#### FR-6: End Condition Composition Specification +**Priority: P2** + +Model composable end conditions and verify: +- `And(c1, c2)` terminates iff both `c1` and `c2` are satisfied +- `Or(c1, c2)` terminates iff either `c1` or `c2` is satisfied +- `Length(n)` terminates after exactly `n` steps/tests +- `StepCoverage(p)` terminates when coverage percentage is reached +- No non-`Endless` composition can produce infinite execution (with finite step space) + +### 4.2 Non-Functional Requirements + +#### NFR-1: Specification Readability +Specifications must include comments explaining the correspondence between TLA+ variables/actions and Python code locations (file:line references). + +#### NFR-2: Model Checking Performance +TLC model checking must complete within 10 minutes on a standard development machine for the primary engine specification with small constants (MaxSteps=5, MaxTests=3, |Steps|=4). + +#### NFR-3: CI Integration (Optional) +TLC checks should be runnable as a CI step. Failures produce counterexample traces that map back to engine code. + +#### NFR-4: Documentation Value +Each specification file serves as executable documentation of the intended behavior. Non-TLA+ developers should be able to read the invariant definitions and understand what properties are guaranteed. + +--- + +## 5. User Stories + +### US-1: Engine Developer +> As a PyOsmo engine developer, I want mathematical proof that my lifecycle hook ordering is never violated under any error path, so that users can rely on cleanup hooks (`after_test`, `after_suite`) always executing. + +### US-2: Algorithm Author +> As a developer adding a new step-selection algorithm, I want a TLA+ specification template that defines required fairness properties, so that I can verify my algorithm meets the same guarantees as existing ones. + +### US-3: Bug Reporter +> As a user who encountered unexpected behavior with `AllowCount(3)` + `AlwaysIgnore` strategy combination, I want the framework maintainers to have formally verified all 16 strategy combinations, so that edge cases are caught before release. + +### US-4: PyOsmo User +> As a PyOsmo user, I want confidence that when I configure `StepCoverage(80)` with `And(Length(100))`, the suite will terminate and achieve the coverage I specified, not run forever due to a logic bug. + +### US-5: New Contributor +> As a new contributor to PyOsmo, I want to read the TLA+ specifications to understand the engine's intended state machine behavior without having to reverse-engineer it from Python code. + +--- + +## 6. Technical Design + +### 6.1 Specification Structure + +``` +pyosmo/ + specs/ + tla/ + PyOsmoEngine.tla # Core state machine (FR-1, FR-2, FR-3) + ErrorStrategyCascade.tla # Error handling (FR-4) + BalancingAlgorithm.tla # Balancing fairness (FR-5) + WeightedBalancing.tla # Weighted balancing (FR-5) + EndConditions.tla # Condition composition (FR-6) + PyOsmoEngine.cfg # TLC model config + ErrorStrategyCascade.cfg + BalancingAlgorithm.cfg + WeightedBalancing.cfg + EndConditions.cfg +``` + +### 6.2 Engine State Machine Model + +**State Variables:** +``` +phase ∈ {"idle", "suite", "test", "step"} +test_count ∈ Nat +step_count ∈ Nat (current test) +total_steps ∈ Nat (all tests) +error_count ∈ Nat (current test) +suite_errors ∈ Nat (all tests) +available ⊆ Steps +hook_trace ∈ Seq(HookNames) +test_strategy ∈ {"raise", "ignore", "ignore_asserts", "allow_count"} +suite_strategy ∈ {"raise", "ignore", "ignore_asserts", "allow_count"} +``` + +**Actions:** +- `StartSuite`: idle → suite, execute before_suite +- `StartTest`: suite → test, reset per-test counters, execute before_test +- `ChooseStep`: test → step, algorithm selects from available +- `CompleteStep`: step → test, increment counters, execute after +- `StepError`: step → test|suite (depends on strategy), increment error counters +- `EndTest`: test → suite (when end condition met), execute after_test +- `EndSuite`: suite → idle (when suite condition met), execute after_suite +- `UpdateAvailability`: test → test, change guard results (models dynamic availability) + +**Key Properties to Check:** +``` +INVARIANT TypeInvariant +INVARIANT ValidPhaseTransitions +INVARIANT HookOrderingInvariant +INVARIANT MonotonicCounters +INVARIANT SingleActiveTestCase +PROPERTY SuiteEventuallyTerminates +PROPERTY AllStepsEventuallyExecuted (under Balancing) +``` + +### 6.3 Error Strategy Model + +Models all 16 combinations of test × suite strategies. State tracks: +- Whether current error should propagate or be swallowed +- Running error count vs. allow threshold +- Whether suite continues or halts after test-level propagation + +### 6.4 Algorithm Model + +For `BalancingAlgorithm`: deterministic selection of minimum-count step with tie-breaking. Verify `max - min <= 1` after sufficient iterations. + +For `WeightedBalancingAlgorithm`: models weight normalization arithmetic. Uses finite precision (integers scaled by 100) to approximate floating-point behavior. Verifies all intermediate weights remain positive. + +### 6.5 Correspondence to Implementation + +| TLA+ Action | Python Code | +|-------------|-------------| +| `StartSuite` | `osmo.py:97-103` (before_suite, algorithm init) | +| `StartTest` | `osmo.py:105-108` (start_new_test, before_test) | +| `ChooseStep` | `osmo.py:110-111` (before, algorithm.choose) | +| `CompleteStep` | `osmo.py:112-115` (_run_step success path) | +| `StepError` | `osmo.py:116-119` (_run_step error + strategy) | +| `EndTest` | `osmo.py:121-123` (end_test check, after_test) | +| `EndSuite` | `osmo.py:129-130` (end_suite check, after_suite) | + +--- + +## 7. Discovered Issues + +The investigation surfaced these existing bugs that TLA+ verification would formally confirm: + +| # | Issue | Location | Severity | Status | +|---|-------|----------|----------|--------| +| 1 | `StepCoverage` range validation inverted: `if coverage_percent in range(1, 100)` should be `not in range(1, 101)` | `step_coverage.py:12` | Critical | Open | +| 2 | `StepCoverage.end_suite()` checks current test coverage, not cumulative across all tests | `step_coverage.py:25-32` | Major | Open | +| 3 | `OsmoTestCaseRecord.is_running()` returns True when stopped (inverted logic) | `test_case.py:19-20` | Moderate | Open | +| 4 | Empty `available_steps` causes `IndexError` in all algorithms (no guard against all-guards-False) | `osmo.py:110` | Major | Open | +| 5 | `WeightedBalancingAlgorithm` rescue logic may produce negative individual weights | `weighted.py:50-53` | Moderate | Open | +| 6 | `TestStepLog.name` uses `function_name` not semantic step name, causing history mismatches for decorator-based steps | `test_step_log.py:22-23` | Minor | Open | + +--- + +## 8. Success Criteria + +### 8.1 Verification Milestones + +| Milestone | Criteria | Priority | +|-----------|----------|----------| +| M1: Engine spec passes TLC | All safety invariants hold for MaxSteps=5, MaxTests=3, Steps=4 | P0 | +| M2: Error strategies verified | All 16 strategy combinations verified for correct propagation behavior | P0 | +| M3: Balancing fairness proven | `BalancedCounts` invariant holds for Steps=5, MaxIterations=25 | P1 | +| M4: End conditions verified | And/Or composition correct; no infinite loops for finite configurations | P2 | +| M5: CI integration | TLC runs on PR checks, blocks merge on invariant violations | P3 | + +### 8.2 Acceptance Criteria + +- [ ] All P0 invariants pass TLC model checking with zero violations +- [ ] Each TLA+ specification has comments mapping actions to Python source locations +- [ ] Counterexample traces (if any found) are documented with corresponding Python reproduction steps +- [ ] Discovered bugs (Section 7) are fixed and covered by both TLA+ invariants and pytest regression tests +- [ ] Specifications are reviewed by at least one team member not involved in writing them + +--- + +## 9. Risks and Mitigations + +| Risk | Likelihood | Impact | Mitigation | +|------|-----------|--------|------------| +| **Learning curve**: Team has no TLA+ experience | High | High | Start with simplest spec (error cascade). Use Learn TLA+ resources. Consider pairing with formal methods consultant for first spec. | +| **Specification drift**: TLA+ specs diverge from Python implementation over time | Medium | High | Add CI check that runs TLC. Include spec update in PR checklist for engine changes. | +| **State space explosion**: TLC runs too long with realistic constants | Medium | Medium | Use small constants for verification (MaxSteps=5). Rely on symmetry reduction. Parameterize specs. | +| **False confidence**: Team assumes TLA+ verification means implementation is correct | Low | High | Document clearly: TLA+ verifies the *design*, not the Python code. Maintain pytest suite for implementation testing. | +| **Over-specification**: Specs become too detailed, matching implementation rather than design intent | Medium | Low | Focus on *properties* (what should hold), not *how* (implementation details). Review specs for unnecessary specificity. | + +--- + +## 10. Alternatives Considered + +### 10.1 Property-Based Testing with Hypothesis (Recommended as Phase 0) + +Python's `hypothesis` library can verify many of the same properties through randomized testing: + +```python +@given(strategies=st.sampled_from(["raise", "ignore", "allow_count"]), + error_positions=st.lists(st.booleans())) +def test_error_cascade_never_drops_errors(strategies, error_positions): + ... +``` + +**Pros**: No new language to learn; integrates with existing pytest; fast iteration +**Cons**: Probabilistic (not exhaustive); may miss edge cases; no liveness proofs + +**Recommendation**: Implement Hypothesis property tests as Phase 0 (immediate value), then pursue TLA+ for mathematically rigorous verification of critical properties. + +### 10.2 Alloy (Lightweight Formal Methods) + +Alloy is a simpler formal specification language with automatic analysis. + +**Pros**: Easier learning curve than TLA+; good for structural properties +**Cons**: Less suited for temporal/liveness properties; smaller community; less industry adoption + +### 10.3 Python-based Model Checking (e.g., Strix, mCRL2) + +Use Python bindings for model checking directly. + +**Pros**: Same language as implementation; lower adoption barrier +**Cons**: Less mature tooling; TLA+ has stronger community and industry validation + +### 10.4 Do Nothing (Status Quo) + +Rely on existing pytest suite and code review. + +**Pros**: No investment needed +**Cons**: Leaves subtle state machine bugs undetected; no formal guarantees about algorithm fairness or hook ordering + +--- + +## 11. Implementation Phases + +### Phase 0: Hypothesis Property Tests (Prerequisite) + +Add property-based tests covering the same properties targeted by TLA+ specs. This provides immediate value and serves as a validation baseline for the TLA+ work. + +Scope: +- Hook ordering property test +- Error strategy combination test (all 16 pairs) +- Balancing algorithm fairness test +- End condition composition test +- Empty available_steps edge case test + +### Phase 1: Core Engine Specification (P0) + +Scope: +- `PyOsmoEngine.tla` with phase transitions, hook ordering, error cascade +- TLC configuration with small constants +- Verify all safety invariants +- Document any counterexamples found + +### Phase 2: Algorithm Specifications (P1) + +Scope: +- `BalancingAlgorithm.tla` with fairness proof +- `WeightedBalancing.tla` with weight positivity proof +- Verify with varying step availability patterns + +### Phase 3: End Condition Specification (P2) + +Scope: +- `EndConditions.tla` with And/Or composition +- Verify termination for all non-Endless configurations +- Model StepCoverage with test vs. suite semantics + +### Phase 4: CI Integration (P3) + +Scope: +- Add TLC to GitHub Actions workflow +- Run on PR checks (blocking) +- Report counterexample traces in PR comments + +--- + +## 12. Dependencies + +| Dependency | Type | Notes | +|------------|------|-------| +| TLA+ Toolbox / VS Code extension | Tool | Free, open source | +| TLC model checker | Tool | Included with TLA+ toolbox | +| Java runtime (for TLC) | Runtime | TLC requires JVM | +| Team TLA+ training | Knowledge | 1-2 weeks ramp-up for basics | +| Bug fixes from Section 7 | Code | Fix before spec matches intent, or spec against intended behavior | + +--- + +## 13. Relationship Between TLA+ and PyOsmo (Meta-Level) + +### 13.1 Conceptual Overlap + +Both TLA+ and PyOsmo are tools for systematic state space exploration: + +| Aspect | PyOsmo (MBT) | TLA+ (Formal Verification) | +|--------|--------------|----------------------------| +| **Model language** | Python classes | TLA+ mathematical notation | +| **Exploration** | Random/weighted sampling | Exhaustive breadth-first search | +| **Guarantees** | Probabilistic coverage | Mathematical proof | +| **Bug finding** | Heuristic (may miss) | Complete (for finite models) | +| **Learning curve** | Low (Python developers) | High (formal methods expertise) | +| **Runtime** | Fast (random sampling) | Slow (exponential state space) | +| **Target** | System under test | System design/algorithm | + +### 13.2 Complementary Roles + +1. **TLA+ verifies PyOsmo's engine** — proves the test generator itself is correct +2. **PyOsmo tests implementations** — runs generated tests against real systems +3. **TLA+ specs could become PyOsmo models** — future integration where formal specs compile to runtime test models + +### 13.3 Future Opportunity: TLA+ to PyOsmo Model Compiler + +A potential future project: translating TLA+ specifications into PyOsmo models automatically. This would combine: +- TLA+'s rigorous design verification (prove the design is correct) +- PyOsmo's runtime testing (verify the implementation matches the design) + +Prior art: Kuprianov & Konnov's work with TLA+ and Apalache generated integration tests from TLA+ specs for Tendermint. MongoDB's "eXtreme Modelling" uses trace checking for conformance. + +--- + +## 14. References + +- [TLA+ Wikipedia](https://en.wikipedia.org/wiki/TLA+) +- [A Primer on Formal Verification and TLA+](https://jack-vanlightly.com/blog/2023/10/10/a-primer-on-formal-verification-and-tla) — Jack Vanlightly +- [Safety and Liveness Properties](https://www.hillelwayne.com/post/safety-and-liveness/) — Hillel Wayne +- [Use of Formal Methods at Amazon Web Services](https://lamport.azurewebsites.net/tla/formal-methods-amazon.pdf) — Newcombe et al. +- [Model-Based Testing with TLA+ and Apalache](http://conf.tlapl.us/2020/09-Kuprianov_and_Konnov-Model-based_testing_with_TLA_+_and_Apalache.pdf) — Kuprianov & Konnov +- [Fully-Tested Code Generation from TLA+ Specifications](https://dl.acm.org/doi/fullHtml/10.1145/3559744.3559747) +- [eXtreme Modelling in Practice (MongoDB)](https://arxiv.org/abs/2006.00915) — Davis et al. +- [Formal Verification of Concurrent Scheduling Strategies using TLA](https://www.researchgate.net/publication/4319663_Formal_verification_of_concurrent_scheduling_strategies_using_TLA) +- [Learn TLA+](https://learntla.com/) +- [Safety, Liveness, and Fairness](https://lamport.azurewebsites.net/tla/safety-liveness.pdf) — Leslie Lamport +- [The Coming Need for Formal Specification (Dec 2025)](https://benjamincongdon.me/blog/2025/12/12/The-Coming-Need-for-Formal-Specification/) — Ben Congdon diff --git a/doc/prompt_tla_verification_agent.md b/doc/prompt_tla_verification_agent.md new file mode 100644 index 0000000..42bb3aa --- /dev/null +++ b/doc/prompt_tla_verification_agent.md @@ -0,0 +1,262 @@ +# Agent Prompt: Implement TLA+ Formal Verification for PyOsmo + +## Context + +You are working on the PyOsmo project (`/Users/oopee/git/pyosmo`), a Model-Based Testing framework for Python. Your task is to implement formal verification of the PyOsmo engine using TLA+ specifications, following the PRD at `doc/prd_tla_plus_verification.md` and the technical investigation at `doc/tla_plus_investigation.md`. + +## Your Mission + +Implement the TLA+ formal verification work in phases. Complete each phase fully before moving to the next. After each phase, run verification to confirm correctness before proceeding. + +--- + +## Phase 0: Fix Discovered Bugs + +Before writing specifications, fix the bugs discovered during investigation. These are documented in Section 7 of the PRD. + +1. **Fix `StepCoverage` range validation** (`pyosmo/end_conditions/step_coverage.py:12`): + - Current: `if coverage_percent in range(1, 100)` raises exception (inverted logic) + - Fix: Should reject values NOT in valid range. Valid range is 1-100 inclusive. + +2. **Fix `StepCoverage.end_suite()`** (`pyosmo/end_conditions/step_coverage.py:25-32`): + - Current: Checks `current_test_case` coverage (per-test) + - Fix: Should check cumulative coverage across ALL test cases in the suite + +3. **Fix `OsmoTestCaseRecord.is_running()`** (`pyosmo/history/test_case.py:19-20`): + - Current: Returns True when stopped (inverted) + - Fix: Rename to `is_stopped()` or invert the logic. Check all callers. + +4. **Fix empty `available_steps` crash** (`pyosmo/osmo.py`): + - Current: If all guards return False, `algorithm.choose()` gets empty list → `IndexError` + - Fix: Check for empty available steps before calling algorithm. Raise a clear `OsmoError` explaining no steps are available. + +5. **Fix `TestStepLog.name`** (`pyosmo/history/test_step_log.py:22-23`): + - Current: Uses `_step.function_name` instead of `_step.name` + - Fix: Use `_step.name` for semantic step name in history/statistics + +After each fix: +- Run `pytest pyosmo/tests/` to verify no regressions +- Run `ruff check pyosmo/ --fix && ruff format pyosmo/` +- Add a regression test if one doesn't exist for the fixed behavior + +--- + +## Phase 1: Hypothesis Property Tests + +Before writing TLA+ specs, implement property-based tests using Hypothesis that verify the same properties. These serve as immediate value and validation baseline. + +Create file: `pyosmo/tests/test_properties.py` + +Implement these property tests: + +### 1.1 Hook Ordering Property +```python +# Test that lifecycle hooks always execute in correct order regardless of: +# - Number of steps per test +# - Number of tests per suite +# - Which steps are available +# - Whether errors occur +# Capture hook calls via a tracking model, verify ordering invariant holds. +``` + +### 1.2 Error Strategy Combinations (all 16 pairs) +```python +# For each combination of test_strategy × suite_strategy: +# ("raise", "ignore", "ignore_asserts", "allow_count") × same +# Run with models that produce errors at random positions. +# Verify: +# - AlwaysRaise at test level: error always propagates to suite level +# - AlwaysIgnore: suite always completes normally +# - AllowCount(n): exactly n errors tolerated, (n+1)th propagates +``` + +### 1.3 Balancing Algorithm Fairness +```python +# Given N steps all always available, after k*N iterations: +# - max_count - min_count <= 1 +# Given steps with changing availability: +# - No available step is starved (count stays within bounds) +``` + +### 1.4 End Condition Composition +```python +# And(Length(a), Length(b)) terminates after max(a, b) steps +# Or(Length(a), Length(b)) terminates after min(a, b) steps +# StepCoverage works correctly at both test and suite level +``` + +### 1.5 Empty Available Steps +```python +# When all guards return False, engine raises clear error +# When guards alternate (some steps available, then none), behavior is correct +``` + +After implementing: +- Run `pytest pyosmo/tests/test_properties.py -v` +- All tests must pass +- Run `ruff check pyosmo/ --fix && ruff format pyosmo/` + +--- + +## Phase 2: TLA+ Engine State Machine Specification + +Create directory: `pyosmo/specs/tla/` + +### 2.1 Write `PyOsmoEngine.tla` + +Model the core execution engine state machine. Reference the specification sketch in Section 4.1 of `doc/tla_plus_investigation.md` but expand it to include: + +- All phase transitions (idle, suite, test, step) +- Lifecycle hook execution tracking +- Two-level error handling with configurable strategies +- End condition evaluation +- Dynamic step availability (guards changing between iterations) +- History state (test count, step count, error count) + +The specification should be approximately 150-250 lines. + +### 2.2 Write `PyOsmoEngine.cfg` + +TLC configuration with small constants: +``` +CONSTANTS + Steps = {s1, s2, s3, s4} + MaxSteps = 5 + MaxTests = 3 + ErrorStrategies = {"raise", "ignore", "allow_count"} + +INVARIANT TypeInvariant +INVARIANT ValidPhaseTransitions +INVARIANT HookOrderingInvariant +INVARIANT MonotonicCounters +INVARIANT SingleActiveTestCase + +PROPERTY SuiteTerminates +``` + +### 2.3 Write `ErrorStrategyCascade.tla` + +Focused specification for error strategy interactions: +- Model all 4 strategies at test level and suite level +- Track error occurrence, propagation, and swallowing +- Verify no silent error loss under AlwaysRaise +- Verify AllowCount threshold semantics + +### 2.4 Write `ErrorStrategyCascade.cfg` + +Configuration testing all 16 strategy combinations. + +### 2.5 Verify with TLC + +Run TLC on each specification: +```bash +java -jar tla2tools.jar -config PyOsmoEngine.cfg PyOsmoEngine.tla +java -jar tla2tools.jar -config ErrorStrategyCascade.cfg ErrorStrategyCascade.tla +``` + +If TLC is not available locally, document the expected verification results based on the specification analysis. Note any properties that cannot be verified without TLC and explain why. + +--- + +## Phase 3: Algorithm Fairness Specifications + +### 3.1 Write `BalancingAlgorithm.tla` + +Model the deterministic balancing algorithm: +- State: step_counts function, available set, chosen step +- Action: Choose minimum-count step, increment its count +- Invariant: After |Steps| iterations, max - min <= 1 +- Liveness: All available steps eventually executed + +### 3.2 Write `WeightedBalancing.tla` + +Model the weighted balancing algorithm: +- State: step_counts, weights (fixed), available set +- Action: Compute normalized weights, subtract history norm, rescue negatives, choose +- Invariant: All computed weights > 0 after normalization +- Safety: No division by zero in normalization + +Use integer arithmetic scaled by 100 to model the floating-point behavior. + +### 3.3 Write configurations and verify + +--- + +## Phase 4: End Condition Specifications + +### 4.1 Write `EndConditions.tla` + +Model composable end conditions: +- Length, StepCoverage, Time (abstracted), Endless +- And/Or combinators +- Test-level vs suite-level semantics +- Verify termination properties + +### 4.2 Verify no infinite loops + +For all non-Endless configurations with finite step sets, prove suite eventually terminates. + +--- + +## Phase 5: Documentation and Integration + +### 5.1 Add README for specs +Create `pyosmo/specs/tla/README.md` explaining: +- What each spec verifies +- How to run TLC +- How to interpret counterexamples +- Correspondence between TLA+ and Python code + +### 5.2 Update CLAUDE.md +Add a section about the TLA+ specifications to the project's CLAUDE.md. + +### 5.3 Final verification +- Run full test suite: `pytest pyosmo/tests/` +- Run linter: `ruff check pyosmo/ --fix && ruff format pyosmo/` +- Verify all property tests pass +- Verify TLA+ specs are syntactically valid (if tooling available) + +--- + +## Constraints and Guidelines + +1. **Read before writing**: Always read the relevant source files before modifying them. Understand the existing code patterns. + +2. **Test after each change**: Run `pytest pyosmo/tests/` after every code modification. Do not proceed if tests fail. + +3. **Lint after each change**: Run `ruff check pyosmo/ --fix && ruff format pyosmo/` after code changes. + +4. **Follow existing patterns**: Match the project's code style (double quotes, 120 char lines, type hints encouraged). + +5. **Minimal changes for bug fixes**: Fix only what's broken. Don't refactor surrounding code. + +6. **TLA+ specs should be self-documenting**: Include comments mapping each action to the Python source location (file:line). + +7. **If TLC is not installed**: Write the specifications correctly anyway. Document that verification requires `tla2tools.jar` (available from https://github.com/tlaplus/tlaplus/releases). The specs themselves have value as executable documentation. + +8. **If stuck on TLA+ syntax**: Use standard TLA+ patterns from Leslie Lamport's examples. The key operators are: + - `/\` (and), `\/` (or), `=>` (implies) + - `'` (next state), `UNCHANGED` (stuttering) + - `[]` (always), `<>` (eventually), `~>` (leads-to) + - `WF_vars(Action)` (weak fairness) + +9. **Commit messages**: If committing, use descriptive messages like "Add TLA+ engine state machine specification" or "Fix StepCoverage range validation bug". + +10. **Do not skip phases**: Complete Phase 0 (bug fixes) before Phase 1 (Hypothesis tests) before Phase 2 (TLA+ specs). Each phase validates the previous. + +--- + +## Definition of Done + +- [ ] All 5 bugs from Section 7 of PRD are fixed with regression tests +- [ ] Property-based tests in `test_properties.py` pass (5+ properties) +- [ ] `PyOsmoEngine.tla` specification written with safety and liveness properties +- [ ] `ErrorStrategyCascade.tla` covers all 16 strategy combinations +- [ ] `BalancingAlgorithm.tla` proves fairness property +- [ ] `WeightedBalancing.tla` proves weight positivity +- [ ] `EndConditions.tla` proves termination for non-Endless configs +- [ ] All TLC configs written +- [ ] README for specs directory exists +- [ ] `pytest pyosmo/tests/` passes with zero failures +- [ ] `ruff check pyosmo/` passes with zero errors +- [ ] No regressions in existing test suite diff --git a/doc/tla_plus_investigation.md b/doc/tla_plus_investigation.md new file mode 100644 index 0000000..c38a081 --- /dev/null +++ b/doc/tla_plus_investigation.md @@ -0,0 +1,583 @@ +# TLA+ Investigation for PyOsmo Engine Verification + +## Executive Summary + +This document investigates whether TLA+ (Temporal Logic of Actions) can help verify the correctness of the PyOsmo model-based testing engine. The investigation covers three dimensions: + +1. **Engine Verification**: Using TLA+ to formally verify PyOsmo's internal execution logic +2. **Algorithm Correctness**: Proving properties of step-selection algorithms +3. **Meta-level Relationship**: The conceptual overlap between TLA+ formal methods and PyOsmo's MBT approach + +**Verdict**: TLA+ is well-suited for verifying specific critical aspects of PyOsmo's engine, particularly the execution loop state machine, error strategy cascade, and algorithm fairness properties. However, the effort is only justified for the core state machine and algorithm correctness proofs—not for the entire codebase. + +--- + +## 1. Background: What TLA+ Offers + +TLA+ is a formal specification language for modeling and verifying systems as state machines. Its model checker (TLC) exhaustively explores all reachable states and checks: + +- **Safety Properties (Invariants)**: "Bad things never happen" — a condition that must hold in every reachable state +- **Liveness Properties**: "Good things eventually happen" — something that must occur in every possible execution +- **Fairness**: Guarantees that enabled actions are eventually taken (weak/strong fairness) +- **Action Properties**: Constraints on state *transitions* (pairs of states) + +TLA+ excels at finding subtle bugs in state machines, scheduling algorithms, and concurrent systems where the interleaving of operations creates complex emergent behavior. + +--- + +## 2. PyOsmo Engine as a State Machine + +### 2.1 The Core Execution State Machine + +PyOsmo's `generate()` method (`pyosmo/osmo.py:93-130`) implements a nested state machine: + +``` +STATES: + IDLE → SUITE_RUNNING → TEST_RUNNING → STEP_EXECUTING → TEST_RUNNING → ... + +TRANSITIONS: + IDLE --[generate()]-→ SUITE_RUNNING + SUITE_RUNNING --[start_new_test()]-→ TEST_RUNNING + TEST_RUNNING --[algorithm.choose()]-→ STEP_EXECUTING + STEP_EXECUTING --[step completes]-→ TEST_RUNNING + TEST_RUNNING --[end_test = true]-→ SUITE_RUNNING + SUITE_RUNNING --[end_suite = true]-→ IDLE + ANY --[error + raise]-→ ERROR_HANDLING → SUITE_RUNNING or IDLE +``` + +### 2.2 Key State Variables + +| Variable | Type | Transitions | +|----------|------|-------------| +| `history.test_cases` | List | Append-only growth | +| `current_test_case` | Record | Created/stopped per test | +| `current_test_case.steps_log` | List | Append during test | +| `algorithm state` | Varies | Updated per choose() | +| `error count` | Counter | Monotonically increasing | + +### 2.3 Why This Matters + +The engine has complex control flow with: +- Two-level error handling (test-level and suite-level strategies) +- Lifecycle hooks that can themselves raise exceptions +- End conditions checked at different points in the loop +- History state that must remain consistent across all paths + +--- + +## 3. Specific TLA+ Application Areas + +### 3.1 Lifecycle Hook Ordering Invariant (HIGH VALUE) + +**Problem**: PyOsmo guarantees a strict execution order for lifecycle hooks: +``` +before_suite → (before_test → (before → pre_X → step → post_X → after)* → after_test)* → after_suite +``` + +**Safety Property to Verify**: +```tla +TypeInvariant == + /\ phase \in {"idle", "suite", "test", "step"} + /\ hook_stack is well-formed (no after without before) + +HookOrdering == + /\ phase = "step" => last_hook \in {"before", "pre_step"} + /\ phase = "test" => last_hook \in {"before_test", "after"} + /\ after_suite_called => before_suite_called + /\ after_test_called => before_test_called +``` + +**What TLA+ Would Find**: Whether any error path (exception in a hook, error strategy decision) can violate the hook ordering—e.g., calling `after_test` without `before_test` having completed, or skipping `after_suite` on error. + +**Current Risk in Code**: The `try/except BaseException` at `osmo.py:125` catches errors from the inner loop and passes them to `test_suite_error_strategy`. If the strategy doesn't re-raise, execution continues to the next test. But `after_test` is called *before* this catch block (`osmo.py:123`), meaning an error in `after_test` itself would skip to the suite-level handler. This is correct but subtle—TLA+ would verify all such paths. + +### 3.2 Error Strategy Cascade (HIGH VALUE) + +**Problem**: PyOsmo has a two-level error handling system: +1. Step error → `test_error_strategy.failure_in_test()` (may raise or swallow) +2. If raised → `test_suite_error_strategy.failure_in_suite()` (may raise or continue) + +**Properties to Verify**: +```tla +\* Safety: No error is silently lost when AlwaysRaise is configured +ErrorNeverLost == + /\ error_strategy = "AlwaysRaise" => + (error_occurred => eventually_raised) + +\* Safety: AllowCount correctly tracks error counts +AllowCountCorrect == + /\ allow_count_strategy => + (error_count <= allow_count => test_continues) + /\ (error_count > allow_count => error_raised) + +\* Liveness: Under AlwaysIgnore, suite eventually terminates +IgnoreTerminates == + /\ error_strategy = "AlwaysIgnore" => + <>(phase = "idle") \* Suite eventually completes +``` + +**Current Risk in Code**: The `AllowCount` strategy (`allow_count.py:14-16`) uses `>` comparison: +```python +if history.current_test_case.error_count > self.allow_count: + raise error +``` +This means `allow_count=3` allows exactly 3 errors and raises on the 4th. TLA+ could verify this matches the intended semantics across all orderings of errors and steps. + +### 3.3 Algorithm Fairness Properties (HIGH VALUE) + +**Problem**: The `BalancingAlgorithm` claims to balance step execution, and `WeightedBalancingAlgorithm` combines weights with history-based balancing. + +**Properties to Verify**: + +```tla +\* Liveness/Fairness: Every available step is eventually executed +StepFairness == + \A s \in AvailableSteps: + <>(step_count[s] > 0) \* Every step is eventually taken + +\* Safety: BalancingAlgorithm always selects minimum-count step +BalancingCorrect == + /\ chosen_step = ArgMin(available_steps, step_count) + +\* Safety: Weights are always positive when passed to random.choices +WeightsPositive == + /\ \A w \in computed_weights: w > 0 + +\* Liveness: Under WeightedBalancing, all steps get "fair" share +WeightedFairness == + /\ \A s \in Steps: + <>(step_count[s] / total_steps >= weight[s] / total_weight - epsilon) +``` + +**Current Risk in Code**: The `WeightedBalancingAlgorithm` (`weighted.py:39-56`) has complex weight normalization: +```python +total_weights = [a - b if a - b != 0 else 0.1 for (a, b) in zip(norm_weights, hist_norm)] +if sum(total_weights) < 0: + temp_add = (abs(sum(total_weights)) + 0.2) / len(total_weights) + total_weights = [temp_add + x for x in total_weights] +``` +This "rescue negative totals" logic could produce pathological weight distributions. TLA+ could enumerate all possible weight/history combinations to verify weights remain sensible. + +### 3.4 End Condition Composition (MEDIUM VALUE) + +**Problem**: `And` and `Or` end conditions compose multiple conditions. Their interaction with test vs. suite semantics needs verification. + +**Properties to Verify**: +```tla +\* And(Length(5), StepCoverage(100)) terminates iff BOTH are met +AndCorrect == + /\ And_terminates <=> (cond1_met /\ cond2_met) + +\* Or(Length(5), Time(10s)) terminates when EITHER is met +OrCorrect == + /\ Or_terminates <=> (cond1_met \/ cond2_met) + +\* Safety: End conditions can never cause infinite loops +\* (except Endless, which is intentional) +NoUnintendedInfiniteLoop == + /\ end_condition /= Endless => + <>(phase = "idle") +``` + +**Current Risk in Code**: If `StepCoverage` is used as a suite end condition, it checks `current_test_case` coverage (not cumulative)—meaning a suite could run forever if individual tests never reach the coverage threshold but the test end condition stops them first. + +### 3.5 History State Consistency (MEDIUM VALUE) + +**Properties to Verify**: +```tla +\* Invariant: At most one test case is "running" at any time +SingleActiveTest == + /\ Cardinality({t \in test_cases : t.is_running}) <= 1 + +\* Invariant: Steps can only be added to running test cases +StepAdditionValid == + /\ add_step_called => current_test_case.is_running + +\* Monotonicity: Step counts never decrease +MonotonicSteps == + /\ total_steps' >= total_steps + /\ error_count' >= error_count +``` + +### 3.6 Step Discovery and Guard Resolution (LOW VALUE) + +The priority-based guard resolution (`model.py:80-105`) follows a fixed priority: +1. `_osmo_enabled` attribute +2. Inline `_osmo_guard_inline` +3. Decorator `@guard("name")` +4. Naming convention `guard_name()` + +**Property**: No two mechanisms can conflict (the first match wins). This is simple enough that TLA+ verification adds limited value—a unit test suffices. + +--- + +## 4. Proposed TLA+ Specifications + +### 4.1 Engine State Machine Specification + +```tla +---------------------------- MODULE PyOsmoEngine ---------------------------- +EXTENDS Naturals, Sequences, FiniteSets + +CONSTANTS Steps, MaxSteps, MaxTests, ErrorStrategies + +VARIABLES + phase, \* "idle" | "suite" | "test" | "step" + test_count, \* Number of completed tests + step_count, \* Steps in current test + total_steps, \* Total steps across all tests + error_count, \* Errors in current test + suite_errors, \* Total errors across suite + available, \* Set of currently available steps + chosen, \* Last chosen step + hook_trace, \* Sequence of executed hooks + test_strategy, \* Current test error strategy + suite_strategy \* Current suite error strategy + +TypeInvariant == + /\ phase \in {"idle", "suite", "test", "step"} + /\ test_count \in Nat + /\ step_count \in Nat + /\ error_count \in Nat + +Init == + /\ phase = "idle" + /\ test_count = 0 + /\ step_count = 0 + /\ total_steps = 0 + /\ error_count = 0 + /\ suite_errors = 0 + /\ available = Steps + /\ chosen = "none" + /\ hook_trace = <<>> + /\ test_strategy = "raise" + /\ suite_strategy = "raise" + +StartSuite == + /\ phase = "idle" + /\ phase' = "suite" + /\ hook_trace' = Append(hook_trace, "before_suite") + /\ UNCHANGED <> + +StartTest == + /\ phase = "suite" + /\ phase' = "test" + /\ step_count' = 0 + /\ error_count' = 0 + /\ hook_trace' = Append(hook_trace, "before_test") + /\ UNCHANGED <> + +ChooseStep == + /\ phase = "test" + /\ available /= {} + /\ phase' = "step" + /\ \E s \in available: + chosen' = s + /\ hook_trace' = Append(hook_trace, "before") + /\ UNCHANGED <> + +CompleteStep == + /\ phase = "step" + /\ phase' = "test" + /\ step_count' = step_count + 1 + /\ total_steps' = total_steps + 1 + /\ hook_trace' = Append(hook_trace, "after") + /\ UNCHANGED <> + +StepError == + /\ phase = "step" + /\ error_count' = error_count + 1 + /\ suite_errors' = suite_errors + 1 + /\ IF test_strategy = "raise" + THEN phase' = "suite" \* Error propagates up + ELSE phase' = "test" \* Error swallowed, continue + /\ step_count' = step_count + 1 + /\ total_steps' = total_steps + 1 + /\ UNCHANGED <> + +EndTest == + /\ phase = "test" + /\ step_count >= MaxSteps \* Simplified end condition + /\ phase' = "suite" + /\ test_count' = test_count + 1 + /\ hook_trace' = Append(hook_trace, "after_test") + /\ UNCHANGED <> + +EndSuite == + /\ phase = "suite" + /\ test_count >= MaxTests + /\ phase' = "idle" + /\ hook_trace' = Append(hook_trace, "after_suite") + /\ UNCHANGED <> + +Next == + \/ StartSuite + \/ StartTest + \/ ChooseStep + \/ CompleteStep + \/ StepError + \/ EndTest + \/ EndSuite + +\* --- PROPERTIES TO CHECK --- + +\* Safety: Phase transitions are valid +ValidTransitions == + /\ (phase = "idle") => (phase' \in {"idle", "suite"}) + /\ (phase = "suite") => (phase' \in {"suite", "test", "idle"}) + /\ (phase = "test") => (phase' \in {"test", "step", "suite"}) + /\ (phase = "step") => (phase' \in {"test", "suite"}) + +\* Safety: Steps only increase +MonotonicSteps == + /\ total_steps' >= total_steps + +\* Liveness: Suite eventually terminates +SuiteTerminates == + <>(phase = "idle" /\ test_count >= MaxTests) + +\* Safety: Hook ordering +HookOrderingInvariant == + /\ Len(hook_trace) > 0 => + (Last(hook_trace) = "after_suite" => "before_suite" \in Range(hook_trace)) + +Spec == Init /\ [][Next]_<> + /\ WF_<>(Next) + +============================================================================= +``` + +### 4.2 Balancing Algorithm Specification + +```tla +------------------------ MODULE BalancingAlgorithm -------------------------- +EXTENDS Naturals, Sequences, FiniteSets + +CONSTANTS Steps, MaxIterations + +VARIABLES + step_counts, \* Function: Step -> Nat (execution counts) + available, \* Set of currently available steps + chosen, \* Last chosen step + iteration \* Current iteration count + +TypeInvariant == + /\ \A s \in Steps: step_counts[s] \in Nat + /\ available \subseteq Steps + /\ iteration \in Nat + +Init == + /\ step_counts = [s \in Steps |-> 0] + /\ available = Steps + /\ chosen = CHOOSE s \in Steps : TRUE + /\ iteration = 0 + +\* Balancing: Choose the step with minimum count +Choose == + /\ iteration < MaxIterations + /\ available /= {} + /\ LET min_count == CHOOSE c \in {step_counts[s] : s \in available} : + \A c2 \in {step_counts[s] : s \in available} : c <= c2 + IN \E s \in available: + /\ step_counts[s] = min_count + /\ chosen' = s + /\ step_counts' = [step_counts EXCEPT ![s] = @ + 1] + /\ iteration' = iteration + 1 + /\ UNCHANGED available + +\* --- PROPERTIES --- + +\* Safety: Chosen step always has minimum count (before increment) +BalancingCorrect == + /\ available /= {} => + \A s \in available: + step_counts[chosen] - 1 <= step_counts[s] + +\* Liveness: All steps eventually get executed +AllStepsExecuted == + \A s \in Steps: + <>(step_counts[s] > 0) + +\* Safety: Counts are balanced (max - min <= 1) +BalancedCounts == + /\ iteration > Cardinality(Steps) => + LET max_c == CHOOSE c \in {step_counts[s] : s \in Steps} : + \A c2 \in {step_counts[s] : s \in Steps} : c >= c2 + min_c == CHOOSE c \in {step_counts[s] : s \in Steps} : + \A c2 \in {step_counts[s] : s \in Steps} : c <= c2 + IN max_c - min_c <= 1 + +Spec == Init /\ [][Choose]_<> + /\ WF_<>(Choose) + +============================================================================= +``` + +--- + +## 5. What TLA+ Would Likely Reveal + +Based on the code analysis, TLA+ model checking would likely surface these issues: + +### 5.1 Confirmed Bugs (Already Found During Investigation) + +| # | Issue | Location | Severity | +|---|-------|----------|----------| +| 1 | `StepCoverage` range check inverted | `step_coverage.py:12` | Critical | +| 2 | `StepCoverage.end_suite()` uses per-test coverage, not cumulative | `step_coverage.py:25-32` | Major | +| 3 | `OsmoTestCaseRecord.is_running()` returns True when stopped | `test_case.py:19-20` | Moderate | + +### 5.2 Properties TLA+ Would Verify/Refute + +| Property | Expected Result | Value | +|----------|----------------|-------| +| Hook ordering never violated | PASS (but worth proving) | High | +| AlwaysRaise never silently drops errors | PASS | High | +| AllowCount(n) raises on exactly the (n+1)th error | PASS (current `>` is correct) | Medium | +| BalancingAlgorithm achieves max-min <= 1 | PASS (when all steps always available) | High | +| BalancingAlgorithm fair with changing availability | NEEDS VERIFICATION (may fail) | High | +| WeightedBalancing weights always positive | NEEDS VERIFICATION (rescue logic) | High | +| Suite terminates under all non-Endless conditions | PASS (assuming finite steps) | Medium | +| StepCoverage + Length composition works correctly | LIKELY FAIL (due to bug #2) | High | + +### 5.3 Subtle Issues TLA+ Could Expose + +1. **Error in `after_test` hook**: If `after_test` raises, it's caught by the suite-level handler. This means `after_test` may execute partially. TLA+ could verify whether this leads to inconsistent model state. + +2. **Algorithm with empty available steps**: If all guards return False simultaneously, `algorithm.choose()` receives an empty list. Current code doesn't handle this—`random.choice([])` raises `IndexError`. TLA+ with the invariant `available /= {} => choose succeeds` would catch this. + +3. **WeightedBalancingAlgorithm negative weights**: The normalization `a - b` can produce negative values. The rescue adds `(abs(sum) + 0.2) / len` to each weight, but if a single weight is very negative while others are positive, the result could still have a negative individual weight after rescue. TLA+ could enumerate all cases. + +4. **Seed determinism across runs**: Setting the same seed should produce identical test sequences. TLA+ can't verify this directly (it's about implementation, not logic), but it could verify that the algorithm's *decision function* is deterministic given the same inputs. + +--- + +## 6. The Meta-Level: TLA+ and MBT Relationship + +### 6.1 PyOsmo IS an MBT Tool; TLA+ CAN DO MBT + +There's a fascinating recursive relationship: + +- **PyOsmo** generates tests from *Python models* (classes with steps/guards) +- **TLA+** can generate tests from *TLA+ specifications* (formal state machines) + +Both approaches share the same goal: systematically exploring a system's state space to find bugs. The difference is in formality and exhaustiveness: + +| Aspect | PyOsmo (MBT) | TLA+ (Formal) | +|--------|--------------|----------------| +| Model language | Python | TLA+ math notation | +| Exploration | Random/weighted sampling | Exhaustive BFS | +| Guarantees | Probabilistic coverage | Mathematical proof | +| Bug finding | Heuristic | Complete (for finite models) | +| Learning curve | Low (Python developers) | High (formal methods) | +| Runtime | Fast (random sampling) | Slow (exponential state space) | + +### 6.2 Complementary Usage + +Rather than TLA+ *replacing* PyOsmo, they serve complementary roles: + +1. **TLA+ verifies PyOsmo's engine** (this document's main topic) +2. **TLA+ specifications could inform PyOsmo models**: A TLA+ spec could be "compiled down" to a PyOsmo model for runtime testing +3. **PyOsmo tests what TLA+ proves**: TLA+ proves algorithmic correctness; PyOsmo tests the actual implementation against real systems + +### 6.3 Prior Art: Model-Based Testing with TLA+ and Apalache + +Research by Kuprianov and Konnov demonstrates using TLA+ specifications directly for integration test generation. In their Tendermint case study, 6 lines of TLA+ generated 500+ lines of integration tests. This approach "plays the role of the missing link between specifications and implementation." + +MongoDB's "eXtreme Modelling" approach uses TLA+ for both verification and conformance testing—verifying that implementations match their formal specifications through trace checking. + +--- + +## 7. Practical Recommendations + +### 7.1 High-Priority TLA+ Specifications to Write + +| Specification | Effort | Value | Rationale | +|---------------|--------|-------|-----------| +| Engine state machine (Section 4.1) | Medium | High | Verifies core execution correctness | +| Error cascade behavior | Low | High | Simple model, critical correctness | +| Balancing algorithm fairness | Low | High | Mathematical property, small state space | +| End condition composition | Low | Medium | Verifies And/Or/coverage interactions | + +### 7.2 What NOT to Specify in TLA+ + +- **Model discovery logic** (parsing decorators/naming conventions): This is pure sequential code without state machine complexity. Unit tests suffice. +- **History data structures**: Simple append-only collections. No complex invariants beyond what Python's type system guarantees. +- **CLI/configuration**: No concurrency or state machine behavior. + +### 7.3 Implementation Approach + +1. **Start with the engine state machine** (Section 4.1): This is the highest-value specification. Model the phase transitions, hook ordering, and error handling cascade. Run TLC to verify safety and liveness properties. + +2. **Add algorithm specifications**: Model each algorithm (especially `WeightedBalancingAlgorithm`) and verify fairness properties under various step availability patterns. + +3. **Verify end condition composition**: Small TLA+ model verifying `And`/`Or` behavior with various condition combinations. + +4. **Integrate with CI (optional)**: TLC can run as a CI check. Specifications serve as executable documentation of intended behavior. + +### 7.4 Estimated Complexity + +- **Engine state machine TLA+ spec**: ~150-200 lines of TLA+ +- **Algorithm specs**: ~50-80 lines each +- **End condition spec**: ~60-80 lines +- **Learning curve**: Significant if team has no TLA+ experience (the main barrier to adoption) + +### 7.5 Alternative: Property-Based Testing with Hypothesis + +For teams that find TLA+ too heavyweight, Python's `hypothesis` library (already available in the project) can verify many of the same properties through randomized testing: + +```python +from hypothesis import given, strategies as st + +@given(st.lists(st.sampled_from(["step_a", "step_b", "step_c"]), min_size=1)) +def test_balancing_fairness(steps): + """After N*len(steps) iterations, all counts within 1 of each other.""" + ... +``` + +This provides probabilistic guarantees rather than TLA+'s mathematical proofs, but with much lower adoption cost. + +--- + +## 8. Conclusion + +TLA+ is a strong fit for verifying PyOsmo's core engine correctness. The main value propositions are: + +1. **Proving hook ordering invariants** across all error paths (including error-in-hook scenarios) +2. **Verifying error strategy cascade** correctness for all strategy combinations +3. **Proving algorithm fairness** properties mathematically +4. **Catching edge cases** in end condition composition (especially the existing StepCoverage bugs) + +The main barrier is the learning curve. For a small team, the alternative of enhanced property-based testing with Hypothesis may provide 80% of the value at 20% of the adoption cost. However, for the core state machine (the engine loop), TLA+ provides guarantees that no amount of testing can match. + +The meta-level relationship between TLA+ and PyOsmo is also worth noting: both are tools for systematic state space exploration, operating at different levels of formality. A future integration where TLA+ specifications could be translated into PyOsmo models would be a powerful combination of formal verification and practical runtime testing. + +--- + +## References + +- [TLA+ Wikipedia](https://en.wikipedia.org/wiki/TLA+) +- [A Primer on Formal Verification and TLA+](https://jack-vanlightly.com/blog/2023/10/10/a-primer-on-formal-verification-and-tla) — Jack Vanlightly +- [Safety and Liveness Properties](https://www.hillelwayne.com/post/safety-and-liveness/) — Hillel Wayne +- [Use of Formal Methods at Amazon Web Services](https://lamport.azurewebsites.net/tla/formal-methods-amazon.pdf) — Newcombe et al. +- [Model-Based Testing with TLA+ and Apalache](http://conf.tlapl.us/2020/09-Kuprianov_and_Konnov-Model-based_testing_with_TLA_+_and_Apalache.pdf) — Kuprianov & Konnov +- [Fully-Tested Code Generation from TLA+ Specifications](https://dl.acm.org/doi/fullHtml/10.1145/3559744.3559747) +- [eXtreme Modelling in Practice (MongoDB)](https://arxiv.org/abs/2006.00915) — Davis et al. +- [Formal Verification of Concurrent Scheduling Strategies using TLA](https://www.researchgate.net/publication/4319663_Formal_verification_of_concurrent_scheduling_strategies_using_TLA) +- [Learn TLA+](https://learntla.com/) +- [Temporal Properties in TLA+](https://learntla.com/core/temporal-logic.html) +- [Safety, Liveness, and Fairness](https://lamport.azurewebsites.net/tla/safety-liveness.pdf) — Leslie Lamport +- [The Coming Need for Formal Specification (Dec 2025)](https://benjamincongdon.me/blog/2025/12/12/The-Coming-Need-for-Formal-Specification/) — Ben Congdon diff --git a/examples/page_object_model.py b/examples/page_object_model.py new file mode 100644 index 0000000..e3a1223 --- /dev/null +++ b/examples/page_object_model.py @@ -0,0 +1,106 @@ +"""Page Object Model (POM) example using guard_all. + +Demonstrates how to use guard_all to model page-based navigation +where each page is a separate model class. Only the steps belonging +to the current page are available at any time. + +Run: python examples/page_object_model.py +""" + +from pyosmo import Osmo, guard, step +from pyosmo.end_conditions import Length + + +class AppState: + """Shared state across all page models.""" + + def __init__(self): + self.current_page = 'home' + self.logged_in = False + self.actions: list[str] = [] + + +class HomePage: + def __init__(self, state: AppState): + self.state = state + + def guard_all(self): + """All steps in this model require being on the home page.""" + return self.state.current_page == 'home' + + @step + def click_login(self): + self.state.actions.append('home -> login') + self.state.current_page = 'login' + + @step + def click_about(self): + self.state.actions.append('home -> about') + self.state.current_page = 'about' + + +class LoginPage: + def __init__(self, state: AppState): + self.state = state + + def guard_all(self): + return self.state.current_page == 'login' + + @step + @guard(lambda self: not self.state.logged_in) + def submit_login(self): + """Can only log in when not already logged in.""" + self.state.actions.append('login: submit') + self.state.logged_in = True + self.state.current_page = 'dashboard' + + @step + def click_back(self): + self.state.actions.append('login -> home') + self.state.current_page = 'home' + + +class AboutPage: + def __init__(self, state: AppState): + self.state = state + + def guard_all(self): + return self.state.current_page == 'about' + + @step + def click_home(self): + self.state.actions.append('about -> home') + self.state.current_page = 'home' + + +class DashboardPage: + def __init__(self, state: AppState): + self.state = state + + def guard_all(self): + return self.state.current_page == 'dashboard' and self.state.logged_in + + @step + def logout(self): + self.state.actions.append('dashboard: logout') + self.state.logged_in = False + self.state.current_page = 'home' + + @step + def view_profile(self): + self.state.actions.append('dashboard: view profile') + + +if __name__ == '__main__': + state = AppState() + osmo = Osmo([HomePage(state), LoginPage(state), AboutPage(state), DashboardPage(state)]) + osmo.test_end_condition = Length(20) + osmo.seed = 42 + + osmo.run() + + print('Actions taken:') + for action in state.actions: + print(f' {action}') + print(f'\nFinal page: {state.current_page}') + print(f'Logged in: {state.logged_in}') diff --git a/license_to_commit.sh b/license_to_commit.sh index 78c12a3..16ae6ca 100755 --- a/license_to_commit.sh +++ b/license_to_commit.sh @@ -7,4 +7,4 @@ uv run ruff format uv run mypy pyosmo/ uv run mypy examples/ uv run pytest pyosmo/tests/ -pytest pytest_pyosmo/test_models_example.py \ No newline at end of file +uv run pytest pytest_pyosmo/test_models_example.py \ No newline at end of file diff --git a/model-creator/README.md b/model-creator/README.md index d944ee1..e46e547 100644 --- a/model-creator/README.md +++ b/model-creator/README.md @@ -116,9 +116,13 @@ def after(self): ### Step Methods (Actions) +The generated models use PyOsmo's decorator syntax for steps and guards: + Form submission: ```python -def step_submit_login(self): +@step +@guard(lambda self: not self.logged_in) +def submit_login(self): """Submit the login form.""" data = { "username": "testuser", @@ -136,31 +140,73 @@ def step_submit_login(self): Navigation: ```python -def step_navigate_to_about(self): +@step +def navigate_to_about(self): """Navigate to about page.""" self.response = self.session.get("https://example.com/about") self.current_page = "about" print("Navigated to: about") ``` -### Guard Methods (Preconditions) +Steps with authentication requirements: +```python +@step +@guard(lambda self: self.logged_in) +def navigate_to_dashboard(self): + """Navigate to dashboard (requires login).""" + self.response = self.session.get("https://example.com/dashboard") + self.current_page = "dashboard" + print("Navigated to: dashboard") +``` + +### Main Function +Generated models include a `main()` function for easy execution: ```python -def guard_submit_login(self): - """Guard for login - can only login when not logged in.""" - return not self.logged_in +def main(): + """Run the model with PyOsmo.""" + model = WebsiteModel() + + osmo = ( + Osmo(model) + .weighted_algorithm() + .stop_after_steps(100) + .run_tests(1) + ) + + print(f"Starting test generation for {model.base_url}") + osmo.generate() + + # Print summary + stats = osmo.history.statistics() + print("\n" + "=" * 50) + print("Test generation complete!") + print(f"Steps executed: {stats.total_steps}") + print(f"Unique steps: {len(stats.step_coverage)}") + -def guard_submit_logout(self): - """Guard for logout - can only logout when logged in.""" - return self.logged_in +if __name__ == "__main__": + main() +``` + +You can run the generated model directly: +```bash +python models/example_model.py ``` ## Running Generated Models -Once you have a generated model, you can run it with PyOsmo: +Generated models include a `main()` function that can be run directly: + +```bash +# Run the model directly (uses built-in defaults) +python models/example_model.py +``` + +Or use PyOsmo's CLI for more control: ```bash -# Run model exploration +# Run model exploration with custom settings python -m osmo.explorer -m models/example_model.py:WebsiteModel # Run with specific seed for reproducibility @@ -268,10 +314,12 @@ Generated models can be customized after creation: 1. **Add custom verification logic** in the `after()` method 2. **Customize default values** in form submission steps -3. **Add weights** to control action frequency: +3. **Add weights** to control action frequency using the `weight_value` parameter: ```python - def weight_submit_login(self): - return 5 # Higher weight = more frequent + @step(weight_value=5) # Higher weight = more frequent + @guard(lambda self: not self.logged_in) + def submit_login(self): + ... ``` 4. **Add invariants** for state checking 5. **Customize guards** for complex preconditions diff --git a/model-creator/examples/example_generated_model.py b/model-creator/examples/example_generated_model.py index 730487b..c8f2c8e 100644 --- a/model-creator/examples/example_generated_model.py +++ b/model-creator/examples/example_generated_model.py @@ -11,6 +11,9 @@ except ImportError: raise ImportError('Model requires requests library. Install with: pip install requests') +from pyosmo import Osmo +from pyosmo.decorators import guard, step + class ExampleWebsiteModel: """ @@ -50,14 +53,15 @@ def after(self): # Verify response is valid assert self.response.status_code < 500, f'Server error: {self.response.status_code}' - # Check for error messages if we expect success + # Could add more sophisticated error checking here if self.response.status_code == 200: - # Could add more sophisticated error checking here pass # --- Form Submission Actions --- - def step_submit_login(self): + @step + @guard(lambda self: not self.logged_in) + def submit_login(self): """Submit the login form.""" data = { 'username': 'testuser', @@ -77,12 +81,9 @@ def step_submit_login(self): self.current_page = 'login' print('Executed: submit_login') - def guard_submit_login(self): - """Guard for submit_login.""" - # Can only login when not logged in - return not self.logged_in - - def step_submit_register(self): + @step + @guard(lambda self: not self.logged_in) + def submit_register(self): """Submit the registration form.""" data = { 'username': 'testuser', @@ -99,12 +100,9 @@ def step_submit_register(self): self.current_page = 'register' print('Executed: submit_register') - def guard_submit_register(self): - """Guard for submit_register.""" - # Can only register when not logged in - return not self.logged_in - - def step_submit_logout(self): + @step + @guard(lambda self: self.logged_in) + def submit_logout(self): """Submit the logout form.""" data = {} @@ -119,12 +117,8 @@ def step_submit_logout(self): self.current_page = 'logout' print('Executed: submit_logout') - def guard_submit_logout(self): - """Guard for submit_logout.""" - # Can only logout when logged in - return self.logged_in - - def step_submit_contact(self): + @step + def submit_contact(self): """Submit the contact form.""" data = { 'name': 'Test User', @@ -140,11 +134,8 @@ def step_submit_contact(self): self.current_page = 'contact' print('Executed: submit_contact') - def guard_submit_contact(self): - """Guard for submit_contact.""" - return True - - def step_submit_search(self): + @step + def submit_search(self): """Submit the search form.""" data = { 'q': 'test_q', @@ -158,70 +149,62 @@ def step_submit_search(self): self.current_page = 'search' print('Executed: submit_search') - def guard_submit_search(self): - """Guard for submit_search.""" - return True - # --- Navigation Actions --- - def step_navigate_to_home(self): + @step + def navigate_to_home(self): """Navigate to home.""" self.response = self.session.get('https://example.com') self.current_page = 'home' print('Navigated to: home') - def guard_navigate_to_home(self): - """Guard for navigate_to_home.""" - return True - - def step_navigate_to_about(self): + @step + def navigate_to_about(self): """Navigate to about.""" self.response = self.session.get('https://example.com/about') self.current_page = 'about' print('Navigated to: about') - def guard_navigate_to_about(self): - """Guard for navigate_to_about.""" - return True - - def step_navigate_to_contact(self): + @step + def navigate_to_contact(self): """Navigate to contact.""" self.response = self.session.get('https://example.com/contact') self.current_page = 'contact' print('Navigated to: contact') - def guard_navigate_to_contact(self): - """Guard for navigate_to_contact.""" - return True - - def step_navigate_to_login(self): + @step + @guard(lambda self: not self.logged_in) + def navigate_to_login(self): """Navigate to login.""" self.response = self.session.get('https://example.com/login') self.current_page = 'login' print('Navigated to: login') - def guard_navigate_to_login(self): - """Guard for navigate_to_login.""" - # Only show login page when not logged in - return not self.logged_in - - def step_navigate_to_dashboard(self): + @step + @guard(lambda self: self.logged_in) + def navigate_to_dashboard(self): """Navigate to dashboard.""" self.response = self.session.get('https://example.com/dashboard') self.current_page = 'dashboard' print('Navigated to: dashboard') - def guard_navigate_to_dashboard(self): - """Guard for navigate_to_dashboard.""" - # Dashboard requires authentication - return self.logged_in - # --- Optional: Custom Methods --- +def main(): + """Run the model with PyOsmo.""" + model = ExampleWebsiteModel() + + osmo = Osmo(model).weighted_algorithm().stop_after_steps(100).run_tests(1) + + print(f'Starting test generation for {model.base_url}') + osmo.generate() + + # Print summary + stats = osmo.history.statistics() + print('\n' + '=' * 50) + print('Test generation complete!') + print(f'Steps executed: {stats.total_steps}') + print(f'Unique steps: {len(stats.step_coverage)}') - def weight_submit_login(self): - """Make login more likely when not logged in.""" - return 10 if not self.logged_in else 1 - def weight_navigate_to_dashboard(self): - """Dashboard is important - visit it more often when logged in.""" - return 15 if self.logged_in else 0 +if __name__ == '__main__': + main() diff --git a/model-creator/generator.py b/model-creator/generator.py index cf92660..a74ddad 100644 --- a/model-creator/generator.py +++ b/model-creator/generator.py @@ -203,6 +203,9 @@ def generate_model_class(self, class_name: str = 'WebsiteModel') -> str: ' "Model requires requests library. Install with: pip install requests"', ' )', '', + 'from pyosmo import Osmo', + 'from pyosmo.decorators import step, guard', + '', '', ] ) @@ -268,6 +271,9 @@ def generate_model_class(self, class_name: str = 'WebsiteModel') -> str: lines.extend(self._generate_step_method(action)) lines.append('') + # Generate main function + lines.extend(self._generate_main_function(class_name)) + return '\n'.join(line for line in lines if line is not None) def _generate_step_method(self, action: dict) -> list[str]: @@ -282,13 +288,27 @@ def _generate_step_method(self, action: dict) -> list[str]: return lines def _generate_form_step(self, action: dict) -> list[str]: - """Generate a form submission step.""" + """Generate a form submission step with decorators.""" lines = [] method_name = action['name'] form = action['form'] - # Method signature - lines.append(f' def step_{method_name}(self):') + # Determine guard condition + if action['is_login']: + guard_condition = 'not self.logged_in' + elif action['is_logout']: + guard_condition = 'self.logged_in' + else: + page = self.pages.get(action['page_url']) + guard_condition = 'self.logged_in' if page and page.requires_auth else None + + # Add decorators + lines.append(' @step') + if guard_condition: + lines.append(f' @guard(lambda self: {guard_condition})') + + # Method signature (no step_ prefix) + lines.append(f' def {method_name}(self):') lines.append(f' """Submit the {method_name.replace("_", " ")} form."""') # Prepare form data @@ -326,33 +346,23 @@ def _generate_form_step(self, action: dict) -> list[str]: lines.append(f' print(f"Executed: {method_name}")') - # Guard method - lines.append('') - lines.append(f' def guard_{method_name}(self):') - lines.append(f' """Guard for {method_name}."""') - - if action['is_login']: - lines.append(' # Can only login when not logged in') - lines.append(' return not self.logged_in') - elif action['is_logout']: - lines.append(' # Can only logout when logged in') - lines.append(' return self.logged_in') - else: - # Check if requires login - page = self.pages.get(action['page_url']) - if page and page.requires_auth: - lines.append(' return self.logged_in') - else: - lines.append(' return True') - return lines def _generate_navigation_step(self, action: dict) -> list[str]: - """Generate a navigation step.""" + """Generate a navigation step with decorators.""" lines = [] method_name = action['name'] - lines.append(f' def step_{method_name}(self):') + # Determine guard condition + guard_condition = 'self.logged_in' if action['requires_auth'] else None + + # Add decorators + lines.append(' @step') + if guard_condition: + lines.append(f' @guard(lambda self: {guard_condition})') + + # Method signature (no step_ prefix) + lines.append(f' def {method_name}(self):') lines.append(f' """Navigate to {action["target_page"]}."""') lines.append(f' self.response = self.session.get("{action["target_url"]}")') @@ -361,16 +371,6 @@ def _generate_navigation_step(self, action: dict) -> list[str]: lines.append(f' print(f"Navigated to: {action["target_page"]}")') - # Guard - lines.append('') - lines.append(f' def guard_{method_name}(self):') - lines.append(f' """Guard for {method_name}."""') - - if action['requires_auth']: - lines.append(' return self.logged_in') - else: - lines.append(' return True') - return lines def _get_default_value(self, field: FormField) -> str: @@ -399,6 +399,36 @@ def _get_default_value(self, field: FormField) -> str: return f'"{field.options[0]}"' return f'"test_{field.name}"' + def _generate_main_function(self, class_name: str) -> list[str]: + """Generate main function for running the model with PyOsmo.""" + return [ + '', + '', + 'def main():', + ' """Run the model with PyOsmo."""', + f' model = {class_name}()', + '', + ' osmo = (', + ' Osmo(model)', + ' .weighted_algorithm()', + ' .stop_after_steps(100)', + ' .run_tests(1)', + ' )', + '', + ' print(f"Starting test generation for {model.base_url}")', + ' osmo.generate()', + '', + ' # Print summary', + ' stats = osmo.history.statistics()', + ' print("\\n" + "=" * 50)', + ' print("Test generation complete!")', + ' print(stats)', + '', + '', + 'if __name__ == "__main__":', + ' main()', + ] + def save_model(self, output_path: Path, class_name: str = 'WebsiteModel'): """ Generate and save the model to a file. diff --git a/model-creator/models/radio.py b/model-creator/models/radio.py new file mode 100644 index 0000000..2d9621f --- /dev/null +++ b/model-creator/models/radio.py @@ -0,0 +1,159 @@ +""" +Auto-generated PyOsmo model for website testing. +Base URL: http://127.0.0.1:8000/radio/ +""" + +import random +from typing import Optional + +try: + import requests +except ImportError: + raise ImportError( + "Model requires requests library. Install with: pip install requests" + ) + +from pyosmo import Osmo +from pyosmo.decorators import step, guard + + +class WebsiteModel: + """ + Model-based testing for http://127.0.0.1:8000/radio/ + """ + + def __init__(self): + """Initialize the model state.""" + self.base_url = "http://127.0.0.1:8000/radio/" + self.session = requests.Session() + self.logged_in = False + self.current_user: Optional[str] = None + self.current_page = "home" + self.response = None + self.last_error: Optional[str] = None + + def before_test(self): + """Called before each test run.""" + self.session = requests.Session() + self.logged_in = False + self.current_page = "home" + print("Starting new test run") + + def after_test(self): + """Called after each test run.""" + self.session.close() + print("Test run completed") + + def after(self): + """Called after each step - verification point.""" + if self.response is not None: + # Verify response is valid + assert self.response.status_code < 500, \ + f"Server error: {self.response.status_code}" + + @step + @guard(lambda self: not self.logged_in) + def submit_action(self): + """Submit the submit action form.""" + data = { + "csrfmiddlewaretoken": "HK8TvhajxGsMd6jiIGGihCLWyLUFHUecntoyitbhL6CBN7j0MDvHD7m4IHMbgLMa", + "username": "testuser", + "password": "testpassword123", + } + + self.response = self.session.post( + "http://127.0.0.1:8000/", + data=data, + ) + + # Update login state on success + if self.response.status_code == 200: + self.logged_in = True + self.current_user = data.get("username") or data.get("email") + self.current_page = "http__1270018000_radio" + print(f"Executed: submit_action") + + @step + def submit_send_password(self): + """Submit the submit send password form.""" + data = { + "csrfmiddlewaretoken": "9vzl8L3Hrmai1snQ5U9WIEFyYkvnVHNtPeP0VX4FFMk7Btny9RYl49gG8gnTuylr", + "email": "test@example.com", + } + + self.response = self.session.post( + "http://127.0.0.1:8000/send_password/", + data=data, + ) + + self.current_page = "http__1270018000_send_password" + print(f"Executed: submit_send_password") + + @step + @guard(lambda self: not self.logged_in) + def submit_login(self): + """Submit the submit login form.""" + data = { + "csrfmiddlewaretoken": "R5SiN04jB7GaJtA9UM2ocP088Ts8i2gNxO8XAc5hPxQZjuARYJRNykBgiPkERTOL", + "username": "testuser", + "password": "testpassword123", + } + + self.response = self.session.post( + "http://127.0.0.1:8000/login/", + data=data, + ) + + # Update login state on success + if self.response.status_code == 200: + self.logged_in = True + self.current_user = data.get("username") or data.get("email") + self.current_page = "http__1270018000_login" + print(f"Executed: submit_login") + + @step + def navigate_to_http__1270018000_send_password(self): + """Navigate to http__1270018000_send_password.""" + self.response = self.session.get("http://127.0.0.1:8000/send_password/") + self.current_page = "http__1270018000_send_password" + print(f"Navigated to: http__1270018000_send_password") + + @step + def navigate_to_http__1270018000_static_files_tietosuojaseloste_srk_kesaseuraviestinta_22082025pdf(self): + """Navigate to http__1270018000_static_files_tietosuojaseloste_srk_kesaseuraviestinta_22082025pdf.""" + self.response = self.session.get("http://127.0.0.1:8000/static/files/Tietosuojaseloste_SRK_Kesaseuraviestinta_22082025.pdf") + self.current_page = "http__1270018000_static_files_tietosuojaseloste_srk_kesaseuraviestinta_22082025pdf" + print(f"Navigated to: http__1270018000_static_files_tietosuojaseloste_srk_kesaseuraviestinta_22082025pdf") + + @step + def navigate_to_http__1270018000_login(self): + """Navigate to http__1270018000_login.""" + self.response = self.session.get("http://127.0.0.1:8000/login/") + self.current_page = "http__1270018000_login" + print(f"Navigated to: http__1270018000_login") + + + +def main(): + """Run the model with PyOsmo.""" + model = WebsiteModel() + + osmo = ( + Osmo(model) + .weighted_algorithm() + .stop_after_steps(100) + .run_tests(1) + ) + + print(f"Starting test generation for {model.base_url}") + osmo.generate() + + # Print summary + stats = osmo.history.statistics() + print("\n" + "=" * 50) + print("Test generation complete!") + print(stats) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/model-creator/tests/test_generator.py b/model-creator/tests/test_generator.py index ad23c67..eeb273f 100644 --- a/model-creator/tests/test_generator.py +++ b/model-creator/tests/test_generator.py @@ -197,9 +197,16 @@ def test_generate_model_class(self): # Check for imports self.assertIn('import requests', code) self.assertIn('import random', code) + self.assertIn('from pyosmo import Osmo', code) + self.assertIn('from pyosmo.decorators import step, guard', code) + + # Check for main function + self.assertIn('def main():', code) + self.assertIn('if __name__ == "__main__":', code) + self.assertIn('Osmo(model)', code) def test_generate_form_step(self): - """Test form step generation.""" + """Test form step generation with decorators.""" action = { 'type': 'form_submit', 'name': 'submit_login', @@ -222,9 +229,12 @@ def test_generate_form_step(self): lines = self.generator._generate_form_step(action) code = '\n'.join(lines) - # Check method definition - self.assertIn('def step_submit_login(self):', code) - self.assertIn('def guard_submit_login(self):', code) + # Check decorators + self.assertIn('@step', code) + self.assertIn('@guard(lambda self: not self.logged_in)', code) + + # Check method definition (no step_ prefix) + self.assertIn('def submit_login(self):', code) # Check form data preparation self.assertIn('data = {', code) @@ -237,11 +247,8 @@ def test_generate_form_step(self): # Check login state update self.assertIn('self.logged_in = True', code) - # Check guard for login - self.assertIn('return not self.logged_in', code) - def test_generate_navigation_step(self): - """Test navigation step generation.""" + """Test navigation step generation with decorators.""" action = { 'type': 'navigation', 'name': 'navigate_to_about', @@ -254,16 +261,37 @@ def test_generate_navigation_step(self): lines = self.generator._generate_navigation_step(action) code = '\n'.join(lines) - # Check method definition - self.assertIn('def step_navigate_to_about(self):', code) - self.assertIn('def guard_navigate_to_about(self):', code) + # Check decorator (no guard since requires_auth is False) + self.assertIn('@step', code) + self.assertNotIn('@guard', code) + + # Check method definition (no step_ prefix) + self.assertIn('def navigate_to_about(self):', code) # Check navigation self.assertIn('self.response = self.session.get(', code) self.assertIn('https://example.com/about', code) - # Check guard - self.assertIn('return True', code) + def test_generate_navigation_step_with_auth(self): + """Test navigation step generation with auth requirement.""" + action = { + 'type': 'navigation', + 'name': 'navigate_to_dashboard', + 'target_url': 'https://example.com/dashboard', + 'target_page': 'dashboard', + 'link_text': 'Dashboard', + 'requires_auth': True, + } + + lines = self.generator._generate_navigation_step(action) + code = '\n'.join(lines) + + # Check decorators + self.assertIn('@step', code) + self.assertIn('@guard(lambda self: self.logged_in)', code) + + # Check method definition (no step_ prefix) + self.assertIn('def navigate_to_dashboard(self):', code) def test_get_statistics(self): """Test statistics generation.""" diff --git a/pyosmo/__init__.py b/pyosmo/__init__.py index 8758894..32c362f 100644 --- a/pyosmo/__init__.py +++ b/pyosmo/__init__.py @@ -3,6 +3,7 @@ import pyosmo.models from pyosmo.decorators import ( guard, + model_guard, post, pre, requires, @@ -22,6 +23,7 @@ 'weight', 'step', 'guard', + 'model_guard', 'pre', 'post', 'requires', diff --git a/pyosmo/decorators.py b/pyosmo/decorators.py index 935e665..90f31f7 100644 --- a/pyosmo/decorators.py +++ b/pyosmo/decorators.py @@ -164,6 +164,25 @@ def decorator(func: F) -> F: return decorator +def model_guard(guard_func: Callable[..., Any]) -> Callable[[type], type]: + """Class decorator that adds a model-level guard to all steps. + + Usage: + @model_guard(lambda self: self.state.current_page == "home") + class HomePage: + ... + """ + + def decorator(cls: type) -> type: + def guard_all(self: Any) -> Any: + return guard_func(self) + + cls.guard_all = guard_all # type: ignore[attr-defined] + return cls + + return decorator + + def requires(*requirements: str) -> Callable[[F], F]: """Mark a step as satisfying one or more requirements. diff --git a/pyosmo/end_conditions/step_coverage.py b/pyosmo/end_conditions/step_coverage.py index f2e69e9..bf6423f 100644 --- a/pyosmo/end_conditions/step_coverage.py +++ b/pyosmo/end_conditions/step_coverage.py @@ -9,8 +9,8 @@ class StepCoverage(OsmoEndCondition): """ def __init__(self, coverage_percent: int): - if coverage_percent in range(1, 100): - raise Exception(f'Coverage is {coverage_percent} and it need to be > 0 and < 100') + if coverage_percent not in range(1, 101): + raise Exception(f'Coverage is {coverage_percent} and it need to be >= 1 and <= 100') self.coverage = coverage_percent / 100 def end_test(self, history: OsmoHistory, model: OsmoModelCollector) -> bool: @@ -23,10 +23,8 @@ def end_test(self, history: OsmoHistory, model: OsmoModelCollector) -> bool: return current_coverage >= self.coverage def end_suite(self, history: OsmoHistory, model: OsmoModelCollector) -> bool: - """Stops test suite when defined number of test cases are executed""" - if history.current_test_case is None: - return False + """Stops test suite when cumulative step coverage across all test cases reaches threshold""" all_steps = len(list(model.all_steps)) - used_steps = sum(1 if history.current_test_case.is_used(s) else 0 for s in model.all_steps) + used_steps = sum(1 if history.is_used(s) else 0 for s in model.all_steps) current_coverage = used_steps / all_steps return current_coverage >= self.coverage diff --git a/pyosmo/history/test_case.py b/pyosmo/history/test_case.py index 8f0ede9..e49f280 100644 --- a/pyosmo/history/test_case.py +++ b/pyosmo/history/test_case.py @@ -17,10 +17,10 @@ def is_started(self) -> bool: return self._start_time is not None def is_running(self) -> bool: - return self._stop_time is not None + return self._stop_time is None def add_step(self, step_log: TestStepLog) -> None: - if self.is_running(): + if not self.is_running(): raise Exception('Test case is not running, cannot add more test steps!') self.steps_log.append(step_log) diff --git a/pyosmo/history/test_step_log.py b/pyosmo/history/test_step_log.py index 3b2932c..57bc983 100644 --- a/pyosmo/history/test_step_log.py +++ b/pyosmo/history/test_step_log.py @@ -20,7 +20,7 @@ def error(self) -> Exception | None: @property def name(self) -> str: - return self._step.function_name + return self._step.name @property def timestamp(self) -> datetime: diff --git a/pyosmo/model.py b/pyosmo/model.py index d60a7d6..952938c 100644 --- a/pyosmo/model.py +++ b/pyosmo/model.py @@ -18,7 +18,7 @@ def default_weight(self) -> float: try: return float(self.object_instance.weight) # type: ignore[attr-defined] except AttributeError: - return 0.0 + return 1.0 # Default weight of 1.0 for all steps @property def func(self) -> Callable[[], Any]: @@ -79,6 +79,11 @@ def weight(self) -> float: @property def is_available(self) -> bool: """Check if step is available right now""" + # Check model-level guard first (guard_all) + guard_all_func = getattr(self.object_instance, 'guard_all', None) + if guard_all_func is not None and callable(guard_all_func) and not guard_all_func(): + return False + # Check if step is disabled by decorator if hasattr(self.func, '_osmo_enabled') and not self.func._osmo_enabled: return False diff --git a/pyosmo/osmo.py b/pyosmo/osmo.py index eedda7c..afb91d5 100644 --- a/pyosmo/osmo.py +++ b/pyosmo/osmo.py @@ -109,7 +109,10 @@ def generate(self) -> None: while True: # Use algorithm to select the step self.model.execute_optional('before') - step = self.algorithm.choose(self.history, self.model.available_steps) + available = self.model.available_steps + if not available: + raise Exception('No steps available! All guards returned False.') + step = self.algorithm.choose(self.history, available) self.model.execute_optional(f'pre_{step}') try: self._run_step(step) diff --git a/pyosmo/tests/test_core.py b/pyosmo/tests/test_core.py index 981b9ab..d87c5a8 100644 --- a/pyosmo/tests/test_core.py +++ b/pyosmo/tests/test_core.py @@ -1,4 +1,7 @@ +import pytest + from pyosmo import Osmo +from pyosmo.end_conditions import Length def test_empty_model(): @@ -84,3 +87,49 @@ def step_first(self): osmo.generate() assert tm1.step_execute, 'Osmo did not execute step in first model' assert tm2.step_execute, 'Osmo did not execute step in second model' + + +def test_no_available_steps_raises_clear_error(): + """When all guards return False, engine should raise a clear error""" + + class AllGuardedModel: + @staticmethod + def step_blocked(): + pass + + @staticmethod + def guard_blocked(): + return False + + osmo = Osmo(AllGuardedModel()) + osmo.test_end_condition = Length(10) + with pytest.raises(Exception, match='No steps available'): + osmo.generate() + + +def test_step_name_in_history_uses_semantic_name(): + """TestStepLog.name should return the semantic step name, not the function name""" + + class NameModel: + @staticmethod + def step_login(): + pass + + @staticmethod + def step_logout(): + pass + + osmo = Osmo(NameModel()) + osmo.test_end_condition = Length(5) + osmo.generate() + + # History should use semantic names (without 'step_' prefix) + for tc in osmo.history.test_cases: + for step_log in tc.steps_log: + assert not step_log.name.startswith('step_'), f'Expected semantic name, got function name: {step_log.name}' + assert step_log.name in ('login', 'logout') + + # step_frequency should also use semantic names + frequency = osmo.history.step_frequency() + for name in frequency: + assert not name.startswith('step_') diff --git a/pyosmo/tests/test_end_conditions.py b/pyosmo/tests/test_end_conditions.py index 45535de..d1d3ab8 100644 --- a/pyosmo/tests/test_end_conditions.py +++ b/pyosmo/tests/test_end_conditions.py @@ -1,5 +1,6 @@ from datetime import datetime, timedelta +import pytest from hypothesis import given from hypothesis.strategies import integers @@ -86,3 +87,76 @@ def test_step_coverage(): assert osmo.history.get_step_count(step_first) > 0 assert osmo.history.get_step_count(step_second) > 0 + + +def test_step_coverage_valid_values(): + """StepCoverage should accept values 1-100 inclusive""" + StepCoverage(1) + StepCoverage(50) + StepCoverage(99) + StepCoverage(100) + + +def test_step_coverage_invalid_values(): + """StepCoverage should reject values outside 1-100""" + with pytest.raises(Exception, match='Coverage is'): + StepCoverage(0) + with pytest.raises(Exception, match='Coverage is'): + StepCoverage(-1) + with pytest.raises(Exception, match='Coverage is'): + StepCoverage(101) + + +def test_step_coverage_mid_range(): + """StepCoverage with 50% should stop when half the steps are covered""" + model = TempModel() + osmo = Osmo(model) + osmo.test_end_condition = StepCoverage(50) + osmo.test_suite_end_condition = Length(1) + osmo.generate() + # With 2 steps and 50% coverage, at least 1 step must have been used + step_first = osmo.model.get_step_by_name('step_first') + step_second = osmo.model.get_step_by_name('step_second') + assert step_first is not None + assert step_second is not None + used_first = osmo.history.get_step_count(step_first) > 0 + used_second = osmo.history.get_step_count(step_second) > 0 + assert used_first or used_second + + +def test_step_coverage_end_suite_cumulative(): + """StepCoverage as suite end condition should check coverage across ALL test cases""" + + class SingleStepModel: + """Model where only one step is available at a time, alternating per test""" + + def __init__(self): + self.test_number = 0 + + def before_test(self): + self.test_number += 1 + + def step_first(self): + pass + + def guard_first(self): + return self.test_number % 2 == 1 + + def step_second(self): + pass + + def guard_second(self): + return self.test_number % 2 == 0 + + model = SingleStepModel() + osmo = Osmo(model) + osmo.test_end_condition = Length(1) + osmo.test_suite_end_condition = StepCoverage(100) + osmo.generate() + # Suite should stop once both steps have been used across multiple tests + step_first = osmo.model.get_step_by_name('step_first') + step_second = osmo.model.get_step_by_name('step_second') + assert step_first is not None + assert step_second is not None + assert osmo.history.get_step_count(step_first) > 0 + assert osmo.history.get_step_count(step_second) > 0 diff --git a/pyosmo/tests/test_guard_all.py b/pyosmo/tests/test_guard_all.py new file mode 100644 index 0000000..0f35be9 --- /dev/null +++ b/pyosmo/tests/test_guard_all.py @@ -0,0 +1,348 @@ +"""Test guard_all model-level guard and @model_guard decorator""" + +from pyosmo import Osmo, guard, model_guard, step +from pyosmo.end_conditions import Length + + +class SimpleGuardAllModel: + """Model with guard_all that controls all steps.""" + + def __init__(self): + self.active = True + self.step_a_count = 0 + self.step_b_count = 0 + + def guard_all(self): + return self.active + + @step + def step_a(self): + self.step_a_count += 1 + + @step + def step_b(self): + self.step_b_count += 1 + + +def test_guard_all_allows_steps(): + """Steps are available when guard_all returns True.""" + model = SimpleGuardAllModel() + osmo = Osmo(model) + + available = osmo.model.available_steps + names = [s.name for s in available] + assert 'step_a' in names + assert 'step_b' in names + + +def test_guard_all_blocks_steps(): + """All steps are blocked when guard_all returns False.""" + model = SimpleGuardAllModel() + model.active = False + osmo = Osmo(model) + + available = osmo.model.available_steps + assert len(available) == 0 + + +def test_guard_all_dynamic_state(): + """guard_all responds to state changes during execution.""" + model = SimpleGuardAllModel() + osmo = Osmo(model) + + # Initially available + assert len(osmo.model.available_steps) == 2 + + # Disable + model.active = False + assert len(osmo.model.available_steps) == 0 + + # Re-enable + model.active = True + assert len(osmo.model.available_steps) == 2 + + +def test_guard_all_with_per_step_guard(): + """guard_all and per-step guards both must pass.""" + + class CombinedGuardModel: + def __init__(self): + self.model_active = True + self.step_ready = True + self.always_count = 0 + self.guarded_count = 0 + + def guard_all(self): + return self.model_active + + @step + @guard(lambda self: self.step_ready) + def guarded_step(self): + self.guarded_count += 1 + + @step + def always_step(self): + self.always_count += 1 + + model = CombinedGuardModel() + osmo = Osmo(model) + + # Both active: both steps available + names = [s.name for s in osmo.model.available_steps] + assert 'guarded_step' in names + assert 'always_step' in names + + # guard_all True, per-step guard False: only always_step + model.step_ready = False + names = [s.name for s in osmo.model.available_steps] + assert 'guarded_step' not in names + assert 'always_step' in names + + # guard_all False: nothing available regardless of per-step guard + model.model_active = False + model.step_ready = True + names = [s.name for s in osmo.model.available_steps] + assert len(names) == 0 + + +def test_guard_all_with_naming_convention_guard(): + """guard_all works with naming convention guards.""" + + class NamingConventionModel: + def __init__(self): + self.model_active = True + self.step_ready = True + + def guard_all(self): + return self.model_active + + def step_action(self): + pass + + def guard_action(self): + return self.step_ready + + model = NamingConventionModel() + osmo = Osmo(model) + + # Both pass + names = [s.name for s in osmo.model.available_steps] + assert 'action' in names + + # guard_all False + model.model_active = False + names = [s.name for s in osmo.model.available_steps] + assert 'action' not in names + + # guard_all True, per-step False + model.model_active = True + model.step_ready = False + names = [s.name for s in osmo.model.available_steps] + assert 'action' not in names + + +def test_multi_model_guard_all(): + """Only the active model's steps are available in multi-model setup.""" + + class ModelA: + def __init__(self, state): + self.state = state + + def guard_all(self): + return self.state['active'] == 'A' + + @step + def action_a(self): + self.state['active'] = 'B' + + class ModelB: + def __init__(self, state): + self.state = state + + def guard_all(self): + return self.state['active'] == 'B' + + @step + def action_b(self): + self.state['active'] = 'A' + + state = {'active': 'A'} + model_a = ModelA(state) + model_b = ModelB(state) + osmo = Osmo([model_a, model_b]) + + # Only model A steps available + names = [s.name for s in osmo.model.available_steps] + assert 'action_a' in names + assert 'action_b' not in names + + # Switch state + state['active'] = 'B' + names = [s.name for s in osmo.model.available_steps] + assert 'action_a' not in names + assert 'action_b' in names + + +def test_multi_model_page_transitions(): + """Full page transition scenario with guard_all.""" + + class PageHome: + def __init__(self, state): + self.state = state + + def guard_all(self): + return self.state['page'] == 'home' + + @step + def go_to_login(self): + self.state['page'] = 'login' + + class PageLogin: + def __init__(self, state): + self.state = state + + def guard_all(self): + return self.state['page'] == 'login' + + @step + def do_login(self): + self.state['page'] = 'dashboard' + + @step + def go_back(self): + self.state['page'] = 'home' + + class PageDashboard: + def __init__(self, state): + self.state = state + + def guard_all(self): + return self.state['page'] == 'dashboard' + + @step + def do_logout(self): + self.state['page'] = 'home' + + state = {'page': 'home'} + osmo = Osmo([PageHome(state), PageLogin(state), PageDashboard(state)]) + osmo.test_end_condition = Length(10) + osmo.seed = 42 + + osmo.run() + + # Test completed without error, transitions worked + history = osmo.history + assert len(history.test_cases) == 1 + assert len(history.test_cases[0].steps_log) == 10 + + +def test_backward_compatibility_no_guard_all(): + """Models without guard_all work exactly as before.""" + + class PlainModel: + def __init__(self): + self.count = 0 + + @step + def action(self): + self.count += 1 + + model = PlainModel() + osmo = Osmo(model) + osmo.test_end_condition = Length(5) + + osmo.run() + + assert model.count == 5 + + +def test_model_guard_decorator(): + """@model_guard class decorator works like guard_all method.""" + + @model_guard(lambda self: self.active) + class DecoratedModel: + def __init__(self): + self.active = True + self.count = 0 + + @step + def action(self): + self.count += 1 + + model = DecoratedModel() + osmo = Osmo(model) + + # Active: step available + names = [s.name for s in osmo.model.available_steps] + assert 'action' in names + + # Inactive: step blocked + model.active = False + names = [s.name for s in osmo.model.available_steps] + assert len(names) == 0 + + +def test_model_guard_decorator_with_per_step_guard(): + """@model_guard combines with per-step guards.""" + + @model_guard(lambda self: self.model_ok) + class DecoratedGuardModel: + def __init__(self): + self.model_ok = True + self.step_ok = True + + @step + @guard(lambda self: self.step_ok) + def guarded(self): + pass + + @step + def unguarded(self): + pass + + model = DecoratedGuardModel() + osmo = Osmo(model) + + # Both pass + names = [s.name for s in osmo.model.available_steps] + assert 'guarded' in names + assert 'unguarded' in names + + # model_guard True, per-step False + model.step_ok = False + names = [s.name for s in osmo.model.available_steps] + assert 'guarded' not in names + assert 'unguarded' in names + + # model_guard False: nothing + model.model_ok = False + model.step_ok = True + assert len(osmo.model.available_steps) == 0 + + +def test_guard_all_naming_convention_steps(): + """guard_all works with naming convention steps (step_* methods).""" + + class NamingModel: + def __init__(self): + self.active = True + self.count = 0 + + def guard_all(self): + return self.active + + def step_do_thing(self): + self.count += 1 + + def step_do_other(self): + self.count += 1 + + model = NamingModel() + osmo = Osmo(model) + + names = [s.name for s in osmo.model.available_steps] + assert 'do_thing' in names + assert 'do_other' in names + + model.active = False + assert len(osmo.model.available_steps) == 0 diff --git a/pyosmo/tests/test_history_statistics.py b/pyosmo/tests/test_history_statistics.py index a216eab..ead6856 100644 --- a/pyosmo/tests/test_history_statistics.py +++ b/pyosmo/tests/test_history_statistics.py @@ -179,3 +179,38 @@ def test_statistics_str_representation(): assert 'Total Tests:' in str_repr assert 'Total Steps:' in str_repr assert 'Duration:' in str_repr + + +def test_test_case_is_running(): + """is_running() should return True when test is active, False when stopped""" + from pyosmo.history.test_case import OsmoTestCaseRecord + + tc = OsmoTestCaseRecord() + assert tc.is_running() is True, 'New test case should be running' + + tc.stop() + assert tc.is_running() is False, 'Stopped test case should not be running' + + +def test_cannot_add_step_to_stopped_test(): + """Adding a step to a stopped test case should raise""" + import pytest + + from pyosmo.history.test_case import OsmoTestCaseRecord + from pyosmo.history.test_step_log import TestStepLog + from pyosmo.model import TestStep + + class DummyModel: + def step_dummy(self): + pass + + tc = OsmoTestCaseRecord() + tc.stop() + + dummy_step = TestStep('step_dummy', DummyModel()) + from datetime import timedelta + + step_log = TestStepLog(dummy_step, timedelta(seconds=0)) + + with pytest.raises(Exception, match='not running'): + tc.add_step(step_log) diff --git a/pytest_pyosmo/pytest_pyosmo_plugin.py b/pytest_pyosmo/pytest_pyosmo_plugin.py index f2ab75a..9591ece 100644 --- a/pytest_pyosmo/pytest_pyosmo_plugin.py +++ b/pytest_pyosmo/pytest_pyosmo_plugin.py @@ -214,7 +214,7 @@ def collect(self): # Extract sequences from history sequences = [] for test_case in osmo.history.test_cases: - sequence = [step_log.name for step_log in test_case.steps_log] + sequence = [step_log.step.function_name for step_log in test_case.steps_log] sequences.append(sequence) except Exception as e: yield ModelError.from_parent(