Skip to content

Commit c835a3e

Browse files
committed
Disallow formula-time randomness
1 parent 6876829 commit c835a3e

4 files changed

Lines changed: 16 additions & 189 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Disallowed formula-time randomness: `random()` now raises a `RuntimeError`, directing model authors to create seeds or random draws during data construction and read them through input variables instead.

policyengine_core/commons/formulas.py

Lines changed: 8 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -293,48 +293,18 @@ def amount_between(
293293

294294
def random(population):
295295
"""
296-
Generate random values for each entity in the population.
296+
Raise an error for formula-time randomness.
297297
298-
Args:
299-
population: The population object containing simulation data.
300-
301-
Returns:
302-
np.ndarray: Array of random values for each entity.
298+
Random values should be created during data construction and exposed to
299+
formulas as ordinary input variables, so simulations remain reproducible and
300+
calibration outputs stay tied to the records that were calibrated.
303301
"""
304-
# Initialize count of random calls if not already present
305-
if not hasattr(population.simulation, "count_random_calls"):
306-
population.simulation.count_random_calls = 0
307-
population.simulation.count_random_calls += 1
308-
309-
# Get known periods or use default calculation period
310-
known_periods = population.simulation.get_holder(
311-
f"{population.entity.key}_id"
312-
).get_known_periods()
313-
period = (
314-
known_periods[0]
315-
if known_periods
316-
else population.simulation.default_calculation_period
302+
raise RuntimeError(
303+
"Formula-time randomness is not allowed. Create random seeds or draws "
304+
"during microdata construction and read them through input variables "
305+
"inside formulas."
317306
)
318307

319-
# Get entity IDs for the period
320-
entity_ids = population(f"{population.entity.key}_id", period)
321-
322-
# Generate deterministic random values using vectorised hash
323-
seeds = np.abs(entity_ids * 100 + population.simulation.count_random_calls).astype(
324-
np.uint64
325-
)
326-
327-
# PCG-style mixing function for high-quality pseudo-random generation
328-
x = seeds * np.uint64(0x5851F42D4C957F2D)
329-
x = x ^ (x >> np.uint64(33))
330-
x = x * np.uint64(0xC4CEB9FE1A85EC53)
331-
x = x ^ (x >> np.uint64(33))
332-
333-
# Convert to float in [0, 1) using upper 53 bits for full double precision
334-
values = (x >> np.uint64(11)).astype(np.float64) / (2**53)
335-
336-
return values
337-
338308

339309
def is_in(values: ArrayLike, *targets: list) -> ArrayLike:
340310
"""Returns true if the value is in the list of targets.
Lines changed: 4 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -1,152 +1,8 @@
1-
"""Test the random function with large entity IDs to ensure no overflow."""
2-
3-
import numpy as np
41
import pytest
5-
from unittest.mock import Mock
6-
from policyengine_core.commons.formulas import random
7-
8-
9-
class TestRandomSeed:
10-
"""Test random seed handling to prevent NumPy overflow errors."""
11-
12-
def test_random_with_large_entity_ids(self):
13-
"""Test that random() handles large entity IDs without overflow."""
14-
# Create a mock population with simulation
15-
population = Mock()
16-
population.simulation = Mock()
17-
population.simulation.count_random_calls = 0
18-
population.entity = Mock()
19-
population.entity.key = "person"
20-
21-
# Mock the get_holder and get_known_periods
22-
holder = Mock()
23-
holder.get_known_periods.return_value = []
24-
population.simulation.get_holder.return_value = holder
25-
population.simulation.default_calculation_period = Mock()
26-
27-
# Test with very large entity IDs that would cause overflow
28-
# if not handled properly
29-
large_ids = np.array(
30-
[
31-
np.iinfo(np.int64).max - 1000, # Very large positive ID
32-
np.iinfo(np.int64).max // 2, # Large positive ID
33-
1234567890123456789, # Another large ID
34-
]
35-
)
36-
37-
# Mock the population call to return large IDs
38-
population.side_effect = lambda key, period: large_ids
39-
40-
# This should not raise a ValueError about negative seeds
41-
result = random(population)
42-
43-
# Check that we got valid random values
44-
assert isinstance(result, np.ndarray)
45-
assert len(result) == len(large_ids)
46-
assert all(0 <= val <= 1 for val in result)
47-
48-
def test_random_seed_consistency(self):
49-
"""Test that random() produces consistent results for same inputs."""
50-
# Create mock population
51-
population = Mock()
52-
population.simulation = Mock()
53-
population.simulation.count_random_calls = 0
54-
population.entity = Mock()
55-
population.entity.key = "household"
56-
57-
holder = Mock()
58-
holder.get_known_periods.return_value = []
59-
population.simulation.get_holder.return_value = holder
60-
population.simulation.default_calculation_period = Mock()
61-
62-
# Use same IDs
63-
ids = np.array([1, 2, 3])
64-
population.side_effect = lambda key, period: ids
65-
66-
# First call
67-
result1 = random(population)
68-
69-
# Reset count to simulate same conditions
70-
population.simulation.count_random_calls = 0
712

72-
# Second call with same conditions
73-
result2 = random(population)
74-
75-
# Results should be identical
76-
np.testing.assert_array_equal(result1, result2)
77-
78-
def test_random_increments_call_count(self):
79-
"""Test that random() increments the call counter."""
80-
population = Mock()
81-
population.simulation = Mock()
82-
population.simulation.count_random_calls = 0
83-
population.entity = Mock()
84-
population.entity.key = "person"
85-
86-
holder = Mock()
87-
holder.get_known_periods.return_value = []
88-
population.simulation.get_holder.return_value = holder
89-
population.simulation.default_calculation_period = Mock()
90-
91-
ids = np.array([1, 2, 3])
92-
population.side_effect = lambda key, period: ids
93-
94-
# First call
95-
random(population)
96-
assert population.simulation.count_random_calls == 1
97-
98-
# Second call
99-
random(population)
100-
assert population.simulation.count_random_calls == 2
101-
102-
def test_random_handles_negative_ids(self):
103-
"""Test that random() handles negative IDs properly."""
104-
population = Mock()
105-
population.simulation = Mock()
106-
population.simulation.count_random_calls = 0
107-
population.entity = Mock()
108-
population.entity.key = "person"
109-
110-
holder = Mock()
111-
holder.get_known_periods.return_value = []
112-
population.simulation.get_holder.return_value = holder
113-
population.simulation.default_calculation_period = Mock()
114-
115-
# Include negative IDs
116-
ids = np.array([-100, -1, 0, 1, 100])
117-
population.side_effect = lambda key, period: ids
118-
119-
# Should handle negative IDs without errors
120-
result = random(population)
121-
122-
assert isinstance(result, np.ndarray)
123-
assert len(result) == len(ids)
124-
assert all(0 <= val <= 1 for val in result)
125-
126-
def test_no_negative_seed_error_with_overflow(self):
127-
"""Test that seed calculation overflow doesn't cause negative seed error."""
128-
population = Mock()
129-
population.simulation = Mock()
130-
population.simulation.count_random_calls = 999999999 # Large count
131-
population.entity = Mock()
132-
population.entity.key = "person"
133-
134-
holder = Mock()
135-
holder.get_known_periods.return_value = []
136-
population.simulation.get_holder.return_value = holder
137-
population.simulation.default_calculation_period = Mock()
138-
139-
# Use the exact ID that would cause overflow in old implementation
140-
# This ID when multiplied by 100 and added to count_random_calls
141-
# would overflow int64 and become negative
142-
overflow_id = np.array([np.iinfo(np.int64).max // 100])
143-
population.side_effect = lambda key, period: overflow_id
3+
from policyengine_core.commons.formulas import random
1444

145-
# In the old implementation, this would raise:
146-
# ValueError: Seed must be between 0 and 2**32 - 1
147-
# With the fix using abs(), it should work fine
148-
result = random(population)
1495

150-
assert isinstance(result, np.ndarray)
151-
assert len(result) == 1
152-
assert 0 <= result[0] <= 1
6+
def test_random_raises_for_formula_time_randomness():
7+
with pytest.raises(RuntimeError, match="Formula-time randomness is not allowed"):
8+
random(None)

tests/core/test_stable_hash_seed.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
33
Python's built-in ``hash()`` is randomized per process for strings, so any seed
44
derived from it changes from one ``python`` invocation to the next. This module
5-
ensures ``Simulation`` uses a stable hash so results involving ``random()`` are
6-
reproducible across runs (issue C6 in the 2026-04 bug hunt, related to #412).
5+
ensures ``Simulation`` uses a stable hash when it seeds NumPy for existing
6+
deterministic simulation paths.
77
"""
88

99
from __future__ import annotations
@@ -54,7 +54,7 @@ def test_stable_hash_to_seed_covers_seed_range():
5454

5555
def test_sort_keys_makes_equivalent_inputs_share_a_seed():
5656
# Two equivalent situations constructed with different dict insertion order
57-
# must produce the same hash / seed so calls to ``random()`` are stable.
57+
# must produce the same hash / seed.
5858
a = {"person": {"you": {"employment_income": 1000, "age": 30}}}
5959
b = {"person": {"you": {"age": 30, "employment_income": 1000}}}
6060
seed_a = _stable_hash_to_seed(json.dumps(a, sort_keys=True))

0 commit comments

Comments
 (0)