Skip to content

Commit e1959e6

Browse files
author
1
committed
test: add edge-case hardening (97→134 tests)
1 parent e063ff8 commit e1959e6

1 file changed

Lines changed: 308 additions & 0 deletions

File tree

tests/test_edge_cases.py

Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
"""Edge-case and hardened tests for modeldiff modules."""
2+
3+
import json
4+
import math
5+
6+
import pytest
7+
from modeldiff._types import (
8+
ChangeType,
9+
DiffEntry,
10+
DiffReport,
11+
FingerprintResult,
12+
Prompt,
13+
Response,
14+
Severity,
15+
Snapshot,
16+
)
17+
from modeldiff.capture import capture, capture_from_file
18+
from modeldiff.diff import diff_snapshots, diff_text, _text_similarity
19+
from modeldiff.drift import (
20+
full_drift_report,
21+
latency_drift,
22+
length_drift,
23+
refusal_drift,
24+
vocabulary_drift,
25+
)
26+
from modeldiff.fingerprint import compare_fingerprints, fingerprint
27+
from modeldiff.report import (
28+
format_markdown,
29+
format_report_rich,
30+
format_report_text,
31+
report_to_dict,
32+
save_json,
33+
load_json,
34+
)
35+
36+
37+
def _snap(model, outputs, latencies=None, errors=None):
    """Build a Snapshot for ``model`` holding one Response per item of ``outputs``.

    Each response gets a synthetic prompt named ``prompt_<index>``. When
    ``latencies`` is falsy (``None`` or empty) every response uses 100.0 as
    its latency; when ``errors`` is falsy every response has ``error=None``.
    Otherwise the value at the matching index is used.
    """

    def _response_for(index, text):
        prompt = Prompt(text=f"prompt_{index}")
        latency = latencies[index] if latencies else 100.0
        error = errors[index] if errors else None
        return Response(
            prompt=prompt,
            output=text,
            model_name=model,
            latency_ms=latency,
            error=error,
        )

    built = [_response_for(index, text) for index, text in enumerate(outputs)]
    return Snapshot(model_name=model, responses=built)
48+
49+
50+
# ---- diff edge cases ----
51+
52+
53+
class TestDiffEdgeCases:
    """Edge cases for diff_snapshots, diff_text, _text_similarity and empty DiffReports."""

    def test_both_empty_outputs(self):
        """A pair of empty outputs is counted as one identical entry."""
        outcome = diff_snapshots(_snap("a", [""]), _snap("b", [""]))
        assert outcome.n_identical == 1

    def test_both_errors(self):
        """When both sides carry an error, no entry is typed ChangeType.ERROR."""
        left = _snap("a", [""], errors=["err1"])
        right = _snap("b", [""], errors=["err2"])
        outcome = diff_snapshots(left, right)
        # Both errored -- not an error-state-change
        assert not any(
            entry.change_type == ChangeType.ERROR for entry in outcome.entries
        )

    def test_whitespace_difference_is_format(self):
        """Outputs differing only by a trailing space count as identical.

        NOTE(review): despite the method name, the assertion expects the pair
        to be reported as identical, not as a format change. Presumably
        diff_snapshots strips outputs before comparing -- confirm.
        """
        left = _snap("a", ["Hello world."])
        right = _snap("b", ["Hello world. "])  # trailing space
        outcome = diff_snapshots(left, right)
        assert outcome.n_identical == 1

    def test_custom_length_threshold(self):
        """With length_threshold=0.1 a much longer output is reported as one change."""
        left = _snap("a", ["short"])
        right = _snap("b", ["short " * 20])
        outcome = diff_snapshots(left, right, length_threshold=0.1)
        assert outcome.n_changes == 1

    def test_large_batch(self):
        """Fifty matching output pairs are all reported as identical."""
        n = 50
        left = _snap("a", [f"answer {i}" for i in range(n)])
        right = _snap("b", [f"answer {i}" for i in range(n)])
        outcome = diff_snapshots(left, right)
        assert outcome.n_identical == n

    def test_diff_text_multiline(self):
        """diff_text mentions the changed line of a three-line input."""
        rendered = diff_text("line1\nline2\nline3\n", "line1\nchanged\nline3\n")
        # Any one marker suffices; "changed" also matches wherever "-changed" does.
        accepted = ("-line2", "- line2", "-changed", "changed")
        assert any(marker in rendered for marker in accepted)

    def test_text_similarity_identical(self):
        """Identical strings have similarity exactly 1.0."""
        score = _text_similarity("hello", "hello")
        assert score == 1.0

    def test_text_similarity_empty(self):
        """Two empty strings have similarity exactly 1.0."""
        score = _text_similarity("", "")
        assert score == 1.0

    def test_regression_score_zero(self):
        """A DiffReport built without entries has regression_score 0.0."""
        assert DiffReport(model_a="a", model_b="b").regression_score == 0.0

    def test_by_type_empty(self):
        """A DiffReport built without entries has an empty by_type mapping."""
        assert DiffReport(model_a="a", model_b="b").by_type == {}

    def test_by_severity_empty(self):
        """A DiffReport built without entries has an empty by_severity mapping."""
        assert DiffReport(model_a="a", model_b="b").by_severity == {}
107+
108+
109+
# ---- drift edge cases ----
110+
111+
112+
class TestDriftEdgeCases:
    """Edge cases for the length, latency, vocabulary and refusal drift helpers."""

    def test_length_drift_single_response(self):
        """One response per side: mean_b exceeds mean_a and std_a is 0.0."""
        brief = _snap("a", ["one"])
        wordy = _snap("b", ["one two three four five six seven eight nine ten"])
        stats = length_drift(brief, wordy)
        assert stats["mean_b"] > stats["mean_a"]
        # A single-element sample has no spread, so std_a is expected to be 0.
        assert stats["std_a"] == 0.0

    def test_latency_drift_zero_latencies(self):
        """All-zero latencies on both sides give a drift of 0.0."""
        zeros = [0.0] * 5
        left = _snap("a", ["a"] * 5, zeros)
        right = _snap("b", ["a"] * 5, zeros)
        stats = latency_drift(left, right)
        # Presumably latency_drift filters out zero latencies -- verify against
        # the implementation; this test only pins the resulting drift value.
        assert stats["drift"] == 0.0

    def test_vocabulary_drift_empty_outputs(self):
        """Empty outputs on both sides give jaccard_similarity 1.0."""
        left = _snap("a", [""])
        right = _snap("b", [""])
        stats = vocabulary_drift(left, right)
        # Presumably both vocabularies are empty, so similarity is vacuously 1.0.
        assert stats["jaccard_similarity"] == 1.0

    def test_refusal_drift_all_refusals(self):
        """The same refusal text on both sides gives a delta of 0.0."""
        refusals = ["I can't help"] * 5
        left = _snap("a", refusals)
        right = _snap("b", refusals)
        stats = refusal_drift(left, right)
        assert stats["delta"] == 0.0

    def test_full_drift_report_all_keys(self):
        """full_drift_report returns exactly the four expected section keys."""
        left = _snap("a", ["hello world"] * 3)
        right = _snap("b", ["goodbye earth"] * 3)
        sections = full_drift_report(left, right)
        expected = {"length", "refusal", "latency", "vocabulary"}
        assert set(sections.keys()) == expected
146+
147+
148+
# ---- fingerprint edge cases ----
149+
150+
151+
class TestFingerprintEdgeCases:
    """Edge cases for fingerprint() and compare_fingerprints()."""

    def test_single_response(self):
        """A snapshot with one response has length_consistency 1.0."""
        dims = fingerprint(_snap("m", ["just one response"])).dimensions
        assert dims["length_consistency"] == 1.0

    def test_all_errors(self):
        """All-error responses give error_rate 1.0 and verbosity 0.0."""
        failing = _snap("m", ["", ""], errors=["e1", "e2"])
        dims = fingerprint(failing).dimensions
        assert dims["error_rate"] == 1.0
        assert dims["verbosity"] == 0.0

    def test_formality_casual(self):
        """Contraction-heavy slang text scores below 0.5 on formality."""
        casual = _snap("m", ["don't can't won't I'm gonna lol!!"] * 5)
        dims = fingerprint(casual).dimensions
        assert dims["formality"] < 0.5

    def test_formality_formal(self):
        """Text made of formal connectives scores above 0.5 on formality."""
        formal = _snap(
            "m",
            ["Therefore furthermore however additionally consequently."] * 5,
        )
        dims = fingerprint(formal).dimensions
        assert dims["formality"] > 0.5

    def test_compare_empty_dimensions(self):
        """Two fingerprints with no dimensions are at distance 0.0 and similar."""
        first = FingerprintResult(model_name="a", dimensions={})
        second = FingerprintResult(model_name="b", dimensions={})
        comparison = compare_fingerprints(first, second)
        assert comparison["euclidean_distance"] == 0.0
        assert comparison["similar"]

    def test_compare_partial_overlap(self):
        """Disjoint single dimensions of 1.0 are about sqrt(2) apart."""
        first = FingerprintResult(model_name="a", dimensions={"x": 1.0})
        second = FingerprintResult(model_name="b", dimensions={"y": 1.0})
        comparison = compare_fingerprints(first, second)
        # x=1,y=0 vs x=0,y=1 -> distance = sqrt(2)
        expected = pytest.approx(math.sqrt(2), abs=0.01)
        assert comparison["euclidean_distance"] == expected
186+
187+
188+
# ---- capture edge cases ----
189+
190+
191+
class TestCaptureEdgeCases:
    """Edge cases for capture() and capture_from_file()."""

    def test_model_fn_returns_unicode(self):
        """Non-ASCII model output is stored unchanged on the response."""
        prompts = [Prompt(text="q")]
        snapshot = capture(
            prompts,
            lambda t: "日本語の応答",
            model_name="unicode_model",
        )
        first = snapshot.responses[0]
        assert first.output == "日本語の応答"

    def test_prompt_with_expected(self):
        """The prompt's ``expected`` value is kept on the captured response."""
        question = Prompt(text="2+2?", expected="4")
        snapshot = capture([question], lambda t: "4", model_name="m")
        first = snapshot.responses[0]
        assert first.prompt.expected == "4"

    def test_single_json_object(self, tmp_path):
        """A file holding one JSON object (not a list) yields one response."""
        source = tmp_path / "single.json"
        payload = json.dumps({"text": "hello"})
        source.write_text(payload)
        snapshot = capture_from_file(
            str(source), lambda t: t.upper(), model_name="m"
        )
        assert snapshot.n_responses == 1
        assert snapshot.responses[0].output == "HELLO"
211+
212+
213+
# ---- report edge cases ----
214+
215+
216+
class TestReportEdgeCases:
    """Edge cases for the report formatters and JSON save/load helpers."""

    def test_empty_report_text(self):
        """The plain-text report of an empty DiffReport names both models.

        Distinctive model names are used on purpose: with single-letter names
        such as "a" and "b", the substring checks would pass for almost any
        rendered text and prove nothing about the names being included.
        """
        report = DiffReport(model_a="model-alpha", model_b="model-beta")
        text = format_report_text(report)
        assert "model-alpha" in text and "model-beta" in text
        assert "0" in text  # 0 changes

    def test_empty_report_markdown(self):
        """The markdown report of an empty DiffReport names the first model.

        A distinctive model name keeps the substring check meaningful.
        """
        report = DiffReport(model_a="model-alpha", model_b="model-beta")
        md = format_markdown(report)
        assert "model-alpha" in md

    def test_empty_report_rich(self):
        """format_report_rich returns a str for an empty DiffReport."""
        report = DiffReport(model_a="a", model_b="b")
        result = format_report_rich(report)
        assert isinstance(result, str)

    def test_save_load_empty(self, tmp_path):
        """An empty report saved with save_json loads back with n_changes 0."""
        report = DiffReport(model_a="a", model_b="b")
        path = tmp_path / "empty.json"
        save_json(report, str(path))
        loaded = load_json(str(path))
        assert loaded["n_changes"] == 0

    def test_report_unicode_prompts(self):
        """Non-ASCII prompt text survives report_to_dict and JSON encoding."""
        report = DiffReport(
            model_a="a", model_b="b",
            entries=[DiffEntry(
                prompt=Prompt(text="日本語のプロンプト"),
                output_a="応答A", output_b="応答B",
                change_type=ChangeType.CONTENT, severity=Severity.HIGH,
                description="content changed",
            )],
        )
        d = report_to_dict(report)
        # ensure_ascii=False keeps the characters literal instead of \u escapes,
        # so the substring check below can match them.
        s = json.dumps(d, ensure_ascii=False)
        assert "日本語" in s
253+
254+
255+
# ---- types edge cases ----
256+
257+
258+
class TestTypesEdgeCases:
    """Edge cases for Response, Snapshot and DiffReport in modeldiff._types."""

    def test_response_word_count(self):
        """A three-word output has word_count 3."""
        response = Response(
            prompt=Prompt(text="q"), output="one two three", model_name="m"
        )
        assert response.word_count == 3

    def test_response_word_count_empty(self):
        """An empty output has word_count 0."""
        response = Response(prompt=Prompt(text="q"), output="", model_name="m")
        assert response.word_count == 0

    def test_response_is_refusal_false(self):
        """A compliant answer is not flagged as a refusal."""
        response = Response(
            prompt=Prompt(text="q"), output="Sure, here you go!", model_name="m"
        )
        assert not response.is_refusal

    def test_snapshot_roundtrip(self, tmp_path):
        """save() then load() preserves model name, count and first output."""
        original = _snap("m", ["hello", "world"])
        target = tmp_path / "snap.json"
        original.save(str(target))
        restored = Snapshot.load(str(target))
        assert restored.model_name == "m"
        assert restored.n_responses == 2
        assert restored.responses[0].output == "hello"

    def test_snapshot_empty_roundtrip(self, tmp_path):
        """A Snapshot with no responses round-trips through save()/load()."""
        original = Snapshot(model_name="empty")
        target = tmp_path / "empty.json"
        original.save(str(target))
        restored = Snapshot.load(str(target))
        assert restored.model_name == "empty"
        assert restored.n_responses == 0

    def test_diff_report_regression_score_high(self):
        """Five CRITICAL entries give a regression_score of 1.0."""
        critical = DiffEntry(
            prompt=Prompt(text="q"), output_a="a", output_b="b",
            change_type=ChangeType.CONTENT, severity=Severity.CRITICAL,
            description="crit",
        )
        # List repetition: the same DiffEntry object appears five times.
        report = DiffReport(model_a="a", model_b="b", entries=[critical] * 5)
        assert report.regression_score == 1.0

    def test_change_rate_all_changed(self):
        """Three MEDIUM content changes give a change_rate of 1.0."""
        changed = DiffEntry(
            prompt=Prompt(text="q"), output_a="a", output_b="b",
            change_type=ChangeType.CONTENT, severity=Severity.MEDIUM,
            description="d",
        )
        # List repetition: the same DiffEntry object appears three times.
        report = DiffReport(model_a="a", model_b="b", entries=[changed] * 3)
        assert report.change_rate == 1.0

0 commit comments

Comments
 (0)