|
| 1 | +"""Edge-case and hardened tests for modeldiff modules.""" |
| 2 | + |
| 3 | +import json |
| 4 | +import math |
| 5 | + |
| 6 | +import pytest |
| 7 | +from modeldiff._types import ( |
| 8 | + ChangeType, |
| 9 | + DiffEntry, |
| 10 | + DiffReport, |
| 11 | + FingerprintResult, |
| 12 | + Prompt, |
| 13 | + Response, |
| 14 | + Severity, |
| 15 | + Snapshot, |
| 16 | +) |
| 17 | +from modeldiff.capture import capture, capture_from_file |
| 18 | +from modeldiff.diff import diff_snapshots, diff_text, _text_similarity |
| 19 | +from modeldiff.drift import ( |
| 20 | + full_drift_report, |
| 21 | + latency_drift, |
| 22 | + length_drift, |
| 23 | + refusal_drift, |
| 24 | + vocabulary_drift, |
| 25 | +) |
| 26 | +from modeldiff.fingerprint import compare_fingerprints, fingerprint |
| 27 | +from modeldiff.report import ( |
| 28 | + format_markdown, |
| 29 | + format_report_rich, |
| 30 | + format_report_text, |
| 31 | + report_to_dict, |
| 32 | + save_json, |
| 33 | + load_json, |
| 34 | +) |
| 35 | + |
| 36 | + |
def _snap(model, outputs, latencies=None, errors=None):
    """Build a ``Snapshot`` for ``model`` holding one ``Response`` per output.

    Each output is paired with a synthetic prompt whose text is
    ``prompt_<index>``. ``latencies`` and ``errors`` are read in parallel
    with ``outputs``; when either is omitted (or otherwise falsy) every
    response gets a latency of ``100.0`` or ``error=None`` respectively.
    """

    def _make_response(index, text):
        # Prompt first, then latency, then error: same evaluation order as
        # a plain sequential loop body would use.
        prompt = Prompt(text=f"prompt_{index}")
        latency = latencies[index] if latencies else 100.0
        failure = errors[index] if errors else None
        return Response(
            prompt=prompt,
            output=text,
            model_name=model,
            latency_ms=latency,
            error=failure,
        )

    built = [_make_response(index, text) for index, text in enumerate(outputs)]
    return Snapshot(model_name=model, responses=built)
| 48 | + |
| 49 | + |
| 50 | +# ---- diff edge cases ---- |
| 51 | + |
| 52 | + |
class TestDiffEdgeCases:
    """Boundary-condition tests for snapshot diffing and ``DiffReport``."""

    def test_both_empty_outputs(self):
        """A pair of empty outputs is counted as one identical pair."""
        report = diff_snapshots(_snap("a", [""]), _snap("b", [""]))
        assert report.n_identical == 1

    def test_both_errors(self):
        """When both sides errored, no entry is typed as an ERROR change."""
        left = _snap("a", [""], errors=["err1"])
        right = _snap("b", [""], errors=["err2"])
        report = diff_snapshots(left, right)
        # Both errored -- this is not an error-state *change*.
        for entry in report.entries:
            assert entry.change_type != ChangeType.ERROR

    def test_whitespace_difference_is_format(self):
        """Outputs differing only by a trailing space count as identical."""
        left = _snap("a", ["Hello world."])
        right = _snap("b", ["Hello world. "])  # trailing space only
        report = diff_snapshots(left, right)
        # The test expects surrounding whitespace to be ignored by the diff.
        assert report.n_identical == 1

    def test_custom_length_threshold(self):
        """A much longer output is reported as a change at threshold 0.1."""
        left = _snap("a", ["short"])
        right = _snap("b", ["short " * 20])
        report = diff_snapshots(left, right, length_threshold=0.1)
        assert report.n_changes == 1

    def test_large_batch(self):
        """Fifty matching outputs yield fifty identical pairs."""
        n = 50
        left = _snap("a", [f"answer {i}" for i in range(n)])
        right = _snap("b", [f"answer {i}" for i in range(n)])
        report = diff_snapshots(left, right)
        assert report.n_identical == n

    def test_diff_text_multiline(self):
        """The diff of multi-line text mentions the removed or added line."""
        before = "line1\nline2\nline3\n"
        after = "line1\nchanged\nline3\n"
        result = diff_text(before, after)
        # Accept any of several diff renderings of the changed line.
        markers = ("-line2", "- line2", "-changed", "changed")
        assert any(marker in result for marker in markers)

    def test_text_similarity_identical(self):
        """Identical strings have similarity exactly 1.0."""
        score = _text_similarity("hello", "hello")
        assert score == 1.0

    def test_text_similarity_empty(self):
        """Two empty strings have similarity exactly 1.0."""
        score = _text_similarity("", "")
        assert score == 1.0

    def test_regression_score_zero(self):
        """A report without entries has a regression score of 0.0."""
        empty_report = DiffReport(model_a="a", model_b="b")
        assert empty_report.regression_score == 0.0

    def test_by_type_empty(self):
        """A report without entries groups to an empty ``by_type`` mapping."""
        empty_report = DiffReport(model_a="a", model_b="b")
        assert empty_report.by_type == {}

    def test_by_severity_empty(self):
        """A report without entries groups to an empty ``by_severity`` mapping."""
        empty_report = DiffReport(model_a="a", model_b="b")
        assert empty_report.by_severity == {}
| 107 | + |
| 108 | + |
| 109 | +# ---- drift edge cases ---- |
| 110 | + |
| 111 | + |
class TestDriftEdgeCases:
    """Boundary-condition tests for the drift metrics."""

    def test_length_drift_single_response(self):
        """With one response per side, mean grows and ``std_a`` is 0."""
        short = _snap("a", ["one"])
        long = _snap("b", ["one two three four five six seven eight nine ten"])
        stats = length_drift(short, long)
        assert stats["mean_b"] > stats["mean_a"]
        # A single sample has no spread, so snapshot A's std must be 0.
        assert stats["std_a"] == 0.0

    def test_latency_drift_zero_latencies(self):
        """All-zero latencies on both sides produce zero drift."""
        zeros = [0.0] * 5
        left = _snap("a", ["a"] * 5, zeros)
        right = _snap("b", ["a"] * 5, [0.0] * 5)
        stats = latency_drift(left, right)
        # Presumably zero latencies are filtered out by latency_drift;
        # either way the test expects no drift.
        assert stats["drift"] == 0.0

    def test_vocabulary_drift_empty_outputs(self):
        """Empty outputs on both sides give a Jaccard similarity of 1.0."""
        left = _snap("a", [""])
        right = _snap("b", [""])
        stats = vocabulary_drift(left, right)
        # Two empty vocabularies are expected to be treated as fully similar.
        assert stats["jaccard_similarity"] == 1.0

    def test_refusal_drift_all_refusals(self):
        """Identical refusal outputs on both sides give a delta of 0.0."""
        refusals = ["I can't help"] * 5
        left = _snap("a", refusals)
        right = _snap("b", ["I can't help"] * 5)
        stats = refusal_drift(left, right)
        assert stats["delta"] == 0.0

    def test_full_drift_report_all_keys(self):
        """The full report exposes exactly the four metric sections."""
        left = _snap("a", ["hello world"] * 3)
        right = _snap("b", ["goodbye earth"] * 3)
        sections = full_drift_report(left, right)
        expected_keys = {"length", "refusal", "latency", "vocabulary"}
        assert set(sections.keys()) == expected_keys
| 146 | + |
| 147 | + |
| 148 | +# ---- fingerprint edge cases ---- |
| 149 | + |
| 150 | + |
class TestFingerprintEdgeCases:
    """Boundary-condition tests for fingerprinting and fingerprint comparison."""

    def test_single_response(self):
        """A single response yields a length consistency of 1.0."""
        dims = fingerprint(_snap("m", ["just one response"])).dimensions
        assert dims["length_consistency"] == 1.0

    def test_all_errors(self):
        """All-error snapshots report error rate 1.0 and verbosity 0.0."""
        failing = _snap("m", ["", ""], errors=["e1", "e2"])
        dims = fingerprint(failing).dimensions
        assert dims["error_rate"] == 1.0
        assert dims["verbosity"] == 0.0

    def test_formality_casual(self):
        """Contraction-heavy, slangy text scores below 0.5 on formality."""
        casual = _snap("m", ["don't can't won't I'm gonna lol!!"] * 5)
        dims = fingerprint(casual).dimensions
        assert dims["formality"] < 0.5

    def test_formality_formal(self):
        """Connective-heavy text scores above 0.5 on formality."""
        formal = _snap(
            "m",
            ["Therefore furthermore however additionally consequently."] * 5,
        )
        dims = fingerprint(formal).dimensions
        assert dims["formality"] > 0.5

    def test_compare_empty_dimensions(self):
        """Two fingerprints with no dimensions are at distance 0 and similar."""
        blank_a = FingerprintResult(model_name="a", dimensions={})
        blank_b = FingerprintResult(model_name="b", dimensions={})
        comparison = compare_fingerprints(blank_a, blank_b)
        assert comparison["euclidean_distance"] == 0.0
        assert comparison["similar"]

    def test_compare_partial_overlap(self):
        """Disjoint single dimensions of 1.0 are sqrt(2) apart."""
        only_x = FingerprintResult(model_name="a", dimensions={"x": 1.0})
        only_y = FingerprintResult(model_name="b", dimensions={"y": 1.0})
        comparison = compare_fingerprints(only_x, only_y)
        # The expected value corresponds to treating a missing dimension as 0:
        # (x=1, y=0) vs (x=0, y=1) -> sqrt(2).
        expected = pytest.approx(math.sqrt(2), abs=0.01)
        assert comparison["euclidean_distance"] == expected
| 186 | + |
| 187 | + |
| 188 | +# ---- capture edge cases ---- |
| 189 | + |
| 190 | + |
class TestCaptureEdgeCases:
    """Boundary-condition tests for ``capture`` and ``capture_from_file``."""

    def test_model_fn_returns_unicode(self):
        """Non-ASCII model output is stored on the response unchanged."""

        def reply(_text):
            return "日本語の応答"

        snap = capture([Prompt(text="q")], reply, model_name="unicode_model")
        first = snap.responses[0]
        assert first.output == "日本語の応答"

    def test_prompt_with_expected(self):
        """The prompt's ``expected`` value is preserved on the captured response."""

        def reply(_text):
            return "4"

        question = Prompt(text="2+2?", expected="4")
        snap = capture([question], reply, model_name="m")
        assert snap.responses[0].prompt.expected == "4"

    def test_single_json_object(self, tmp_path):
        """A file holding one JSON object is captured as a single prompt."""

        def shout(text):
            return text.upper()

        prompt_file = tmp_path / "single.json"
        prompt_file.write_text(json.dumps({"text": "hello"}))
        snap = capture_from_file(str(prompt_file), shout, model_name="m")
        assert snap.n_responses == 1
        assert snap.responses[0].output == "HELLO"
| 211 | + |
| 212 | + |
| 213 | +# ---- report edge cases ---- |
| 214 | + |
| 215 | + |
class TestReportEdgeCases:
    """Boundary-condition tests for report formatting and JSON persistence."""

    def test_empty_report_text(self):
        """Plain-text output of an empty report names both models and a zero."""
        rendered = format_report_text(DiffReport(model_a="a", model_b="b"))
        assert "a" in rendered
        assert "b" in rendered
        assert "0" in rendered  # 0 changes

    def test_empty_report_markdown(self):
        """Markdown output of an empty report mentions model ``a``."""
        rendered = format_markdown(DiffReport(model_a="a", model_b="b"))
        assert "a" in rendered

    def test_empty_report_rich(self):
        """Rich formatting of an empty report returns a string."""
        rendered = format_report_rich(DiffReport(model_a="a", model_b="b"))
        assert isinstance(rendered, str)

    def test_save_load_empty(self, tmp_path):
        """An empty report saved to JSON loads back with ``n_changes`` of 0."""
        empty_report = DiffReport(model_a="a", model_b="b")
        target = str(tmp_path / "empty.json")
        save_json(empty_report, target)
        payload = load_json(target)
        assert payload["n_changes"] == 0

    def test_report_unicode_prompts(self):
        """Non-ASCII prompt text survives conversion to a dict and JSON dump."""
        entry = DiffEntry(
            prompt=Prompt(text="日本語のプロンプト"),
            output_a="応答A",
            output_b="応答B",
            change_type=ChangeType.CONTENT,
            severity=Severity.HIGH,
            description="content changed",
        )
        report = DiffReport(model_a="a", model_b="b", entries=[entry])
        as_dict = report_to_dict(report)
        # ensure_ascii=False keeps the characters literal instead of \u escapes.
        serialized = json.dumps(as_dict, ensure_ascii=False)
        assert "日本語" in serialized
| 253 | + |
| 254 | + |
| 255 | +# ---- types edge cases ---- |
| 256 | + |
| 257 | + |
class TestTypesEdgeCases:
    """Boundary-condition tests for the core data types."""

    def test_response_word_count(self):
        """Three space-separated words give a word count of 3."""
        response = Response(
            prompt=Prompt(text="q"), output="one two three", model_name="m"
        )
        assert response.word_count == 3

    def test_response_word_count_empty(self):
        """An empty output gives a word count of 0."""
        response = Response(prompt=Prompt(text="q"), output="", model_name="m")
        assert response.word_count == 0

    def test_response_is_refusal_false(self):
        """A compliant answer is not flagged as a refusal."""
        response = Response(
            prompt=Prompt(text="q"), output="Sure, here you go!", model_name="m"
        )
        assert not response.is_refusal

    def test_snapshot_roundtrip(self, tmp_path):
        """Saving then loading a snapshot preserves name, count and output."""
        original = _snap("m", ["hello", "world"])
        target = str(tmp_path / "snap.json")
        original.save(target)
        restored = Snapshot.load(target)
        assert restored.model_name == "m"
        assert restored.n_responses == 2
        assert restored.responses[0].output == "hello"

    def test_snapshot_empty_roundtrip(self, tmp_path):
        """A snapshot with no responses round-trips through disk."""
        original = Snapshot(model_name="empty")
        target = str(tmp_path / "empty.json")
        original.save(target)
        restored = Snapshot.load(target)
        assert restored.model_name == "empty"
        assert restored.n_responses == 0

    def test_diff_report_regression_score_high(self):
        """Five CRITICAL content changes give a regression score of 1.0."""
        critical = DiffEntry(
            prompt=Prompt(text="q"),
            output_a="a",
            output_b="b",
            change_type=ChangeType.CONTENT,
            severity=Severity.CRITICAL,
            description="crit",
        )
        # The same entry object is intentionally repeated five times.
        entries = [critical for _ in range(5)]
        report = DiffReport(model_a="a", model_b="b", entries=entries)
        assert report.regression_score == 1.0

    def test_change_rate_all_changed(self):
        """When every entry is a content change, the change rate is 1.0."""
        changed = DiffEntry(
            prompt=Prompt(text="q"),
            output_a="a",
            output_b="b",
            change_type=ChangeType.CONTENT,
            severity=Severity.MEDIUM,
            description="d",
        )
        # The same entry object is intentionally repeated three times.
        entries = [changed for _ in range(3)]
        report = DiffReport(model_a="a", model_b="b", entries=entries)
        assert report.change_rate == 1.0
0 commit comments