|
| 1 | +"""Cross-source default value gate for ``graph_extraction_window_size`` (task #33 P3). |
| 2 | +
|
| 3 | +Background — task #30 B3 (PR #1925, merge ``43648f9``) locked |
| 4 | +``graph_extraction_window_size`` default to ``2`` (sweet spot) per |
| 5 | +``earayu2`` directive ``msg=adb0c366``. The lock landed in **four** |
| 6 | +independent sources that all need to agree: |
| 7 | +
|
| 8 | +1. **Python const** — ``aperag/indexing/graph_extractor.py`` |
| 9 | + ``_DEFAULT_GRAPH_EXTRACTION_WINDOW_SIZE``: the runtime fallback the |
| 10 | + extractor actually uses when a collection does not override the |
| 11 | + value. |
| 12 | +2. **Pydantic Field** — ``aperag/schema/common.py`` |
| 13 | + ``KnowledgeGraphConfig.graph_extraction_window_size``: the API |
| 14 | + schema's user-facing default (surfaced via the Field's ``examples`` |
| 15 | + list and the description text). |
| 16 | +3. **TS schema** — ``web/src/api-v2/schema.d.ts``: the auto-generated |
| 17 | + typed client schema's ``@example`` tag, consumed by frontend code. |
| 18 | +4. **Spec doc** — ``docs/zh-CN/architecture/task-30-graph-chunk-window-spec-v1.md`` |
| 19 | + § 3.1.1 + § 4.2 lock declaration: the architectural source of truth |
| 20 | + that PRs CR against. |
| 21 | +
|
| 22 | +PR #1925 itself surfaced the cross-source drift risk: the initial |
| 23 | +commit only updated source 1 + 2 + 4. ``Weston`` ``msg=1b7d9bef`` |
| 24 | +BLOCKER 1 caught that ``schema.d.ts`` still carried the old default |
| 25 | +``1``; ``huangheng`` ``msg=bf785b12`` NIT 1 caught § 3.1.1 line 85 |
| 26 | +still saying default ``1``. Both required a fix-forward commit. This |
| 27 | +test gate exists so that next time only ``cicd-push`` lint+unit needs |
| 28 | +to fail the PR — no manual cross-source inspection required, no |
| 29 | +fix-forward round trip with reviewers acting as drift detectors. |
| 30 | +
|
| 31 | +Why a unit test (not a boundary test): ``tests/boundaries/`` is not |
| 32 | +currently invoked by ``make test-unit`` / ``test-integration`` / |
| 33 | +``cicd-push.yml`` (audit task #33 Layer 1 finding). ``tests/unit_test/`` |
| 34 | +runs on every push. Per simple-stable directive (earayu2 |
| 35 | +``msg=1224bec8``), the cheapest reliable gate is a unit test that runs |
| 36 | +in the existing CI lane, not a new workflow file. |
| 37 | +
|
| 38 | +Scope discipline: this test pins the **default value parity** across |
| 39 | +four sources only. It does not pin description text, override-recommendation |
| 40 | +phrasing, or rationale wording — those evolve. If a future change |
| 41 | +moves the default away from 2, this test will fail; the fix is to |
| 42 | +update the Python const, Pydantic ``examples``, TS schema ``@example``, |
| 43 | +and spec doc § 3.1.1 + § 4.2 in the same PR. The failure message |
| 44 | +spells out which sources disagree so the operator does not have to |
| 45 | +guess. |
| 46 | +
|
| 47 | +Sediment alignment: Lesson #13 v3 (cross-source default value |
| 48 | +alignment, ``docs/zh-CN/architecture/task-17-cr-review-checklist.md`` |
| 49 | +§ 四) — this test is the codified gate Lesson #13 v3 was waiting for. |
| 50 | +""" |
| 51 | + |
| 52 | +from __future__ import annotations |
| 53 | + |
| 54 | +import re |
| 55 | +from pathlib import Path |
| 56 | + |
| 57 | +import pytest |
| 58 | + |
# Repository root, derived from this file's own location: three directory
# levels above the directory that contains this test module.
REPO_ROOT = Path(__file__).resolve().parents[3]

# Locations of the sources this gate refers to, all relative to REPO_ROOT.
GRAPH_EXTRACTOR_PATH = REPO_ROOT.joinpath("aperag", "indexing", "graph_extractor.py")
SCHEMA_COMMON_PATH = REPO_ROOT.joinpath("aperag", "schema", "common.py")
TS_SCHEMA_PATH = REPO_ROOT.joinpath("web", "src", "api-v2", "schema.d.ts")
SPEC_DOC_PATH = REPO_ROOT.joinpath(
    "docs",
    "zh-CN",
    "architecture",
    "task-30-graph-chunk-window-spec-v1.md",
)
| 64 | + |
| 65 | + |
def _python_const_default() -> int:
    """Return ``_DEFAULT_GRAPH_EXTRACTION_WINDOW_SIZE`` as bound at import time.

    The constant is imported from ``aperag.indexing.graph_extractor`` rather
    than parsed out of the file text, so the result is whatever the name is
    bound to once that module has finished importing (including any
    re-binding that happens during import).
    """
    # Imported lazily so that collecting this test module does not require
    # the extractor module to be importable.
    from aperag.indexing.graph_extractor import (
        _DEFAULT_GRAPH_EXTRACTION_WINDOW_SIZE as runtime_default,
    )

    return runtime_default
| 78 | + |
| 79 | + |
def _pydantic_field_default() -> int:
    """Return the first ``examples`` entry of the window-size schema field.

    Looks up ``graph_extraction_window_size`` in
    ``KnowledgeGraphConfig.model_fields``, takes the field's ``examples`` and
    converts its first element to ``int``.

    Raises:
        AssertionError: if the field's ``examples`` is missing or empty.
    """
    from aperag.schema.common import KnowledgeGraphConfig

    window_field = KnowledgeGraphConfig.model_fields["graph_extraction_window_size"]
    declared_examples = window_field.examples

    # Built up front so the assertion line itself stays short.
    missing_examples_hint = (
        "KnowledgeGraphConfig.graph_extraction_window_size Field is missing "
        "examples=[...]; the OpenAPI / TS schema default-value annotation "
        "depends on this. Add examples=[<canonical default>] in "
        "aperag/schema/common.py."
    )
    assert declared_examples, missing_examples_hint

    return int(declared_examples[0])
| 98 | + |
| 99 | + |
_TS_SCHEMA_BLOCK_RE = re.compile(
    # Anchors on the tail of a JSDoc block: an ``@example <digits>`` tag, the
    # closing ``*/``, then a line starting the ``graph_extraction_window_size?:``
    # declaration. Group 1 is the example value.
    r"@example\s+(\d+)\s*\*/\s*\n\s*graph_extraction_window_size\?:",
    re.MULTILINE,
)


def _ts_schema_default() -> int:
    """Return the ``@example`` integer documented for the field in ``schema.d.ts``.

    Reads ``TS_SCHEMA_PATH`` as UTF-8 and takes the first place where an
    ``@example N`` tag closes the JSDoc block directly in front of the
    ``graph_extraction_window_size?:`` declaration. Tying the match to the
    declaration keeps an ``@example`` belonging to some other field from
    being picked up.

    Raises:
        AssertionError: if no such JSDoc block / declaration pair is found.
    """
    schema_source = TS_SCHEMA_PATH.read_text(encoding="utf-8")
    found = _TS_SCHEMA_BLOCK_RE.search(schema_source)

    not_found_hint = (
        f"Could not locate JSDoc @example for graph_extraction_window_size in "
        f"{TS_SCHEMA_PATH}. Either the schema regen is missing the field, the "
        f"field name was renamed, or the JSDoc layout changed. Re-run the "
        f"OpenAPI -> TS schema regen and re-check this test."
    )
    assert found, not_found_hint

    return int(found.group(1))
| 126 | + |
| 127 | + |
_SPEC_LOCK_LINE_RE = re.compile(
    # Lock declaration form, e.g. **`graph_extraction_window_size = 2`**
    r"\*\*`graph_extraction_window_size\s*=\s*(\d+)`\*\*",
)
_SPEC_311_LINE_RE = re.compile(
    # Enumeration shorthand form, e.g. **B3 lock default `2`**
    r"\*\*B3 lock default\s*`(\d+)`\*\*",
)


def _spec_doc_defaults() -> dict[str, int]:
    """Return the default values declared at the two lock sites of the spec doc.

    Reads ``SPEC_DOC_PATH`` as UTF-8 and takes the first occurrence in the
    file of each of these bold markers:

    * ``**`graph_extraction_window_size = N`**`` -> key ``section_4_2_lock``
    * ``**B3 lock default `N`**`` -> key ``section_3_1_1_enumeration``

    The patterns are matched against the whole document; nothing here checks
    which section a match actually sits in.

    Raises:
        AssertionError: if either marker cannot be found.
    """
    spec_text = SPEC_DOC_PATH.read_text(encoding="utf-8")

    lock_hit = _SPEC_LOCK_LINE_RE.search(spec_text)
    enumeration_hit = _SPEC_311_LINE_RE.search(spec_text)

    lock_missing_hint = (
        f"Could not locate § 4.2 lock line `**`graph_extraction_window_size = N`**` "
        f"in {SPEC_DOC_PATH}. Either the lock chapter was renamed/removed or the "
        f"markdown emphasis style changed. Restore the canonical lock line or "
        f"update this regex."
    )
    enumeration_missing_hint = (
        f"Could not locate § 3.1.1 lock line `**B3 lock default `N`**` in "
        f"{SPEC_DOC_PATH}. Either § 3.1.1 was rewritten or the lock-shorthand "
        f"phrasing changed. Restore the canonical phrasing or update this regex."
    )

    # The lock-line check comes first, so it is the one reported when both
    # markers are absent.
    assert lock_hit, lock_missing_hint
    assert enumeration_hit, enumeration_missing_hint

    found = {}
    found["section_4_2_lock"] = int(lock_hit.group(1))
    found["section_3_1_1_enumeration"] = int(enumeration_hit.group(1))
    return found
| 172 | + |
| 173 | + |
def test_graph_extraction_window_size_default_consistent_across_sources():
    """Every source that declares the window-size default must give the same integer.

    Collects one value from the Python constant, the Pydantic ``examples``,
    the TS schema ``@example`` and each lock site of the spec doc, then
    requires that exactly one distinct value was observed. On failure the
    message lists the files to update and the value read from each source.

    The module docstring records the drift this guards against: the TS
    schema and one spec-doc line kept the old default after the Python and
    Pydantic sources had moved.
    """
    # Extractors run in the same order as the numbered list in the message.
    observed: dict[str, int] = {
        "python_const (aperag/indexing/graph_extractor.py)": _python_const_default(),
        "pydantic_examples (aperag/schema/common.py)": _pydantic_field_default(),
        "ts_schema_example (web/src/api-v2/schema.d.ts)": _ts_schema_default(),
    }
    for site_key, site_value in _spec_doc_defaults().items():
        # e.g. "section_4_2_lock" -> "4.2.lock"
        site_label = site_key.replace("_", ".").replace("section.", "")
        observed[
            f"spec_doc § {site_label} "
            f"(docs/zh-CN/architecture/task-30-graph-chunk-window-spec-v1.md)"
        ] = site_value

    per_source_report = "\n".join(f" - {label}: {seen}" for label, seen in observed.items())
    drift_message = (
        "graph_extraction_window_size default has drifted across sources.\n"
        "All four sources must declare the same integer (currently locked to 2 "
        "per task #30 B3 / earayu2 msg=adb0c366).\n"
        "Update ALL of the following in the same PR:\n"
        " 1. aperag/indexing/graph_extractor.py: _DEFAULT_GRAPH_EXTRACTION_WINDOW_SIZE\n"
        " 2. aperag/schema/common.py: KnowledgeGraphConfig.graph_extraction_window_size Field examples=[N]\n"
        " 3. web/src/api-v2/schema.d.ts: @example N (regenerate via OpenAPI -> TS pipeline)\n"
        " 4. docs/zh-CN/architecture/task-30-graph-chunk-window-spec-v1.md: § 3.1.1 + § 4.2 lock lines\n"
        "Future-default-change procedure (per spec § 4.2): ≥10 samples + ≥3 models "
        "no regression + PM + architect + earayu2 三方 confirm.\n"
        "\nObserved values per source:\n" + per_source_report
    )

    distinct_values = set(observed.values())
    assert len(distinct_values) == 1, drift_message
| 215 | + |
| 216 | + |
def test_graph_extraction_window_size_default_is_positive_integer():
    """Sanity check: the runtime default must be a genuine integer that is >= 1.

    The value comes from ``_python_const_default()``. It is rejected when it
    is not an ``int``, when it is a ``bool``, or when it is below 1. The
    failure message names the window assembler in
    ``aperag/indexing/graph_extractor.py`` as the code that non-positive
    values break; that code is not exercised here.
    """
    value = _python_const_default()

    # ``bool`` is a subclass of ``int``: without the explicit exclusion a
    # constant accidentally bound to ``True`` would pass as the integer 1.
    is_real_int = isinstance(value, int) and not isinstance(value, bool)

    # Short-circuiting keeps ``value >= 1`` from running on non-integers.
    assert is_real_int and value >= 1, (
        f"_DEFAULT_GRAPH_EXTRACTION_WINDOW_SIZE must be a positive integer, "
        f"got {value!r}. Negative or zero values break the window assembler "
        f"in aperag/indexing/graph_extractor.py."
    )
| 230 | + |
| 231 | + |
@pytest.mark.parametrize(
    "source_name, getter",
    [
        ("python_const", _python_const_default),
        ("pydantic_examples", _pydantic_field_default),
        ("ts_schema_example", _ts_schema_default),
        # The spec-doc extractor returns one value per lock site (a dict),
        # so it is normalised inside the test body.
        ("spec_doc", _spec_doc_defaults),
    ],
)
def test_individual_source_extractor_does_not_raise(source_name, getter):
    """Each source extractor must run on its own and yield integer values.

    Keeping this apart from the cross-source comparison separates
    "an extractor is broken" (missing file, regex no longer matches, schema
    field gone) from "the values disagree", so a red CI run points at either
    the test infrastructure or the sources.

    Args:
        source_name: Short label used in the failure message.
        getter: Zero-argument extractor returning an ``int``, or a
            ``dict`` mapping lock-site names to ``int``.
    """
    extracted = getter()

    # Scalar extractors keep their plain label; dict results get one entry
    # per key so the failure names the exact lock site.
    if isinstance(extracted, dict):
        labelled = {f"{source_name}.{site}": found for site, found in extracted.items()}
        assert labelled, f"{source_name} extractor returned no values; the source format may have changed."
    else:
        labelled = {source_name: extracted}

    for label, value in labelled.items():
        assert isinstance(value, int), (
            f"{label} extractor returned non-int {value!r}; the source format may have changed."
        )
0 commit comments