Skip to content

Commit ff3a021

Browse files
committed
Add tests for new nationwide changesets.
1 parent 4a61fea commit ff3a021

5 files changed

Lines changed: 411 additions & 11 deletions

File tree

tests/test_format_observations.py

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
# -------------------------------------------------------------
2+
# Copyright (c) Henry Spatial Analysis. All rights reserved.
3+
# Licensed under the MIT License. See LICENSE in project root for information.
4+
# -------------------------------------------------------------
5+
6+
"""
7+
Unit tests for openpois.osm.format_observations.
8+
"""
9+
from __future__ import annotations
10+
11+
import pandas as pd
12+
import pyarrow as pa
13+
import pyarrow.parquet as pq
14+
15+
from openpois.osm.format_observations import format_observations_duckdb
16+
17+
18+
# Column layout of the synthetic "versions" parquet written by the tests:
# one row per (id, version) of an element.
VERSIONS_SCHEMA = pa.schema([
    pa.field("id", pa.int64()),
    pa.field("version", pa.int64()),
    pa.field("changeset", pa.int64()),
    pa.field("timestamp", pa.string()),
    pa.field("user", pa.string()),
    pa.field("uid", pa.int64()),
    pa.field("type", pa.string()),
])
27+
28+
# Column layout of the synthetic "changes" parquet written by the tests:
# one row per tag change recorded on an (id, version).
CHANGES_SCHEMA = pa.schema([
    pa.field("key", pa.string()),
    pa.field("value", pa.string()),
    pa.field("change", pa.string()),
    pa.field("id", pa.int64()),
    pa.field("version", pa.int64()),
    pa.field("type", pa.string()),
])
36+
37+
38+
def _make_versions(rows):
    """Build a pyarrow table laid out as ``VERSIONS_SCHEMA`` from row dicts.

    ``rows`` is a list of dicts keyed by column name; a key missing from a
    row is filled with ``None`` for that column.
    """
    columns = {}
    for field in VERSIONS_SCHEMA:
        columns[field.name] = [row.get(field.name) for row in rows]
    return pa.table(columns, schema = VERSIONS_SCHEMA)
43+
44+
45+
def _make_changes(rows):
    """Build a pyarrow table laid out as ``CHANGES_SCHEMA`` from row dicts.

    ``rows`` is a list of dicts keyed by column name; a key missing from a
    row is filled with ``None`` for that column.
    """
    columns = {}
    for field in CHANGES_SCHEMA:
        columns[field.name] = [row.get(field.name) for row in rows]
    return pa.table(columns, schema = CHANGES_SCHEMA)
50+
51+
52+
def _synthetic_inputs():
53+
"""
54+
Three POIs with name tags. Each has an Added, then a Changed on the
55+
tag_key, so each should yield 2 observations (the second with changed=1).
56+
"""
57+
versions = []
58+
changes = []
59+
for elem_id in [100, 200, 300]:
60+
for ver in [1, 2]:
61+
versions.append({
62+
"id": elem_id, "version": ver, "changeset": 1000 + ver,
63+
"timestamp": f"2024-01-{ver:02d}T00:00:00+00:00",
64+
"user": "u", "uid": 1, "type": "node",
65+
})
66+
changes.append({
67+
"key": "name", "value": f"n{elem_id}.v{ver}",
68+
"change": "Added" if ver == 1 else "Changed",
69+
"id": elem_id, "version": ver, "type": "node",
70+
})
71+
changes.append({
72+
"key": "amenity", "value": "cafe",
73+
"change": "Added" if ver == 1 else "Changed",
74+
"id": elem_id, "version": ver, "type": "node",
75+
})
76+
return versions, changes
77+
78+
79+
def _write_parquets(tmp_path, versions, changes):
    """Write the version and change rows as parquet files under ``tmp_path``.

    Returns ``(versions_path, changes_path)`` pointing at
    ``versions.parquet`` and ``changes.parquet`` respectively.
    """
    versions_file = tmp_path / "versions.parquet"
    changes_file = tmp_path / "changes.parquet"
    versions_table = _make_versions(versions)
    pq.write_table(versions_table, versions_file)
    changes_table = _make_changes(changes)
    pq.write_table(changes_table, changes_file)
    return versions_file, changes_file
85+
86+
87+
class TestFormatObservationsDuckdb:
    """Tests for ``format_observations_duckdb``.

    The function is the production entry point for the OSM history →
    observations pipeline; these tests pin its row count, column set, and
    state-machine semantics.
    """

    @staticmethod
    def _run(tmp_path, versions, changes, keep_keys):
        """Write the inputs to parquet and run the formatter on them.

        Returns ``(total, csv_path)``; the CSV is left unread so each test
        can check ``total`` before loading the output.
        """
        versions_file, changes_file = _write_parquets(tmp_path, versions, changes)
        csv_file = tmp_path / "obs.csv"
        total = format_observations_duckdb(
            changes_path = changes_file,
            versions_path = versions_file,
            output_path = csv_file,
            tag_key = "name",
            keep_keys = keep_keys,
            verbose = False,
        )
        return total, csv_file

    def test_synthetic_inputs_produce_expected_rows(self, tmp_path):
        versions, changes = _synthetic_inputs()
        total, csv_file = self._run(tmp_path, versions, changes, ["amenity"])
        # Two observations per POI (Added + Changed), three POIs
        assert total == 6
        frame = pd.read_csv(csv_file)
        assert len(frame) == 6
        assert set(frame["id"]) == {100, 200, 300}
        assert set(frame["version"]) == {1, 2}
        assert frame["changed"].sum() == 6
        # Expected output columns
        expected_columns = {
            "id", "version", "changeset", "obs_timestamp", "last_obs_timestamp",
            "last_tag_timestamp", "user", "last_tag_user",
            "tag_value", "last_tag_value", "changed", "deleted",
            "amenity", "amenity_last_value", "tag_key",
        }
        assert set(frame.columns) == expected_columns
        # last_tag_value reflects the PRE-update state: None on v1, v1's value on v2.
        frame = frame.sort_values(["id", "version"]).reset_index(drop = True)
        for poi_id in [100, 200, 300]:
            per_poi = frame[frame["id"] == poi_id].reset_index(drop = True)
            assert pd.isna(per_poi.at[0, "last_tag_value"])
            assert per_poi.at[1, "last_tag_value"] == f"n{poi_id}.v1"
            assert per_poi.at[0, "tag_value"] == f"n{poi_id}.v1"
            assert per_poi.at[1, "tag_value"] == f"n{poi_id}.v2"

    def test_tag_state_machine(self, tmp_path):
        """Added → Changed → visible=false → visible=true sequence.

        The re-added version should restore ``tag_value`` to the last SET
        value (``"bar"``), matching the original state-machine semantics.
        """
        # Each step: (name change, name value, visible value, visible change).
        steps = [
            ("Added", "foo", None, None),
            ("Changed", "bar", None, None),
            (None, None, "false", "Added"),
            (None, None, "true", "Changed"),
        ]
        versions, changes = [], []
        for ver, (tag_ch, tag_val, vis_val, vis_ch) in enumerate(steps, start = 1):
            versions.append({
                "id": 42, "version": ver, "changeset": 1000 + ver,
                "timestamp": f"2024-01-{ver:02d}T00:00:00+00:00",
                "user": "u", "uid": 1, "type": "node",
            })
            # Name row (if any) goes before the visible row (if any).
            candidates = (("name", tag_val, tag_ch), ("visible", vis_val, vis_ch))
            for key, value, change in candidates:
                if change is None:
                    continue
                changes.append({
                    "key": key, "value": value, "change": change,
                    "id": 42, "version": ver, "type": "node",
                })
        _, csv_file = self._run(tmp_path, versions, changes, [])
        frame = pd.read_csv(csv_file)
        frame = frame.sort_values("version").reset_index(drop = True)
        assert list(frame["version"]) == [1, 2, 3, 4]
        assert list(frame["tag_value"].fillna("")) == ["foo", "bar", "", "bar"]
        assert list(frame["changed"]) == [1, 1, 1, 1]

    def test_keep_key_stickiness(self, tmp_path):
        """Keep-key state must carry across versions that don't touch the key.

        Version 2 has no ``amenity`` change: ``amenity`` should still read
        "restaurant" and ``amenity_last_value`` should stay empty.
        """
        # Each step: (name change, name value, amenity change, amenity value).
        steps = [
            ("Added", "foo", "Added", "restaurant"),
            ("Changed", "foo2", None, None),
            ("Changed", "foo3", "Changed", "bar"),
        ]
        versions, changes = [], []
        for ver, (tag_ch, tag_val, kk_ch, kk_val) in enumerate(steps, start = 1):
            versions.append({
                "id": 7, "version": ver, "changeset": 1000 + ver,
                "timestamp": f"2024-02-{ver:02d}T00:00:00+00:00",
                "user": "u", "uid": 1, "type": "node",
            })
            # The tracked tag changes on every version; the keep-key only sometimes.
            changes.append({
                "key": "name", "value": tag_val, "change": tag_ch,
                "id": 7, "version": ver, "type": "node",
            })
            if kk_ch is None:
                continue
            changes.append({
                "key": "amenity", "value": kk_val, "change": kk_ch,
                "id": 7, "version": ver, "type": "node",
            })
        _, csv_file = self._run(tmp_path, versions, changes, ["amenity"])
        frame = pd.read_csv(csv_file)
        frame = frame.sort_values("version").reset_index(drop = True)
        current_values = list(frame["amenity"].fillna(""))
        previous_values = list(frame["amenity_last_value"].fillna(""))
        assert current_values == ["restaurant", "restaurant", "bar"]
        # v1: pre-change was None; v2: no change → last stays empty; v3: last = "restaurant".
        assert previous_values == ["", "", "restaurant"]

    def test_left_join_null_inheritance(self, tmp_path):
        """Versions with no relevant changes (LEFT-JOIN produces NULLs) should
        inherit prior state without crashing."""
        versions = [
            {
                "id": 99, "version": ver, "changeset": 2000 + ver,
                "timestamp": f"2024-03-{ver:02d}T00:00:00+00:00",
                "user": "u", "uid": 1, "type": "node",
            }
            for ver in (1, 2, 3)
        ]
        # Only v1 has tag/keep-key changes; v2 and v3 have no rows at all.
        changes = [
            {
                "key": key, "value": "cafe", "change": "Added",
                "id": 99, "version": 1, "type": "node",
            }
            for key in ("name", "amenity")
        ]
        total, csv_file = self._run(tmp_path, versions, changes, ["amenity"])
        assert total == 3
        frame = pd.read_csv(csv_file)
        frame = frame.sort_values("version").reset_index(drop = True)
        assert list(frame["tag_value"].fillna("")) == ["cafe", "cafe", "cafe"]
        assert list(frame["amenity"].fillna("")) == ["cafe", "cafe", "cafe"]
        assert list(frame["changed"]) == [1, 0, 0]

tests/test_match.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@ def test_exact_shared_label(self):
188188
scores = compute_type_scores(
189189
osm_shared_labels = np.array(["Restaurant"]),
190190
overture_shared_labels = np.array(["Restaurant"]),
191-
osm_l0_bits = np.array([3], dtype = np.uint8),
192-
overture_l0_bits = np.array([2], dtype = np.uint8),
191+
osm_l0_bits = np.array([3], dtype = np.uint16),
192+
overture_l0_bits = np.array([2], dtype = np.uint16),
193193
osm_idx = np.array([0]),
194194
overture_idx = np.array([0]),
195195
)
@@ -200,8 +200,8 @@ def test_l0_overlap(self):
200200
scores = compute_type_scores(
201201
osm_shared_labels = np.array(["Restaurant"]),
202202
overture_shared_labels = np.array(["Cafe"]),
203-
osm_l0_bits = np.array([3], dtype = np.uint8),
204-
overture_l0_bits = np.array([2], dtype = np.uint8),
203+
osm_l0_bits = np.array([3], dtype = np.uint16),
204+
overture_l0_bits = np.array([2], dtype = np.uint16),
205205
osm_idx = np.array([0]),
206206
overture_idx = np.array([0]),
207207
)
@@ -212,8 +212,8 @@ def test_no_l0_overlap(self):
212212
scores = compute_type_scores(
213213
osm_shared_labels = np.array(["Restaurant"]),
214214
overture_shared_labels = np.array(["Park"]),
215-
osm_l0_bits = np.array([3], dtype = np.uint8),
216-
overture_l0_bits = np.array([16], dtype = np.uint8),
215+
osm_l0_bits = np.array([3], dtype = np.uint16),
216+
overture_l0_bits = np.array([16], dtype = np.uint16),
217217
osm_idx = np.array([0]),
218218
overture_idx = np.array([0]),
219219
)
@@ -224,8 +224,8 @@ def test_unmapped_zero(self):
224224
scores = compute_type_scores(
225225
osm_shared_labels = np.array([""]),
226226
overture_shared_labels = np.array(["Restaurant"]),
227-
osm_l0_bits = np.array([0], dtype = np.uint8),
228-
overture_l0_bits = np.array([2], dtype = np.uint8),
227+
osm_l0_bits = np.array([0], dtype = np.uint16),
228+
overture_l0_bits = np.array([2], dtype = np.uint16),
229229
osm_idx = np.array([0]),
230230
overture_idx = np.array([0]),
231231
)

0 commit comments

Comments
 (0)