-
-
Notifications
You must be signed in to change notification settings - Fork 209
Expand file tree
/
Copy pathparsed-chunk-v1.schema.json
More file actions
106 lines (106 loc) · 6.9 KB
/
parsed-chunk-v1.schema.json
File metadata and controls
106 lines (106 loc) · 6.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://neomjs.com/schemas/parsed-chunk-v1.schema.json",
"title": "parsed-chunk-v1",
"description": "Client-side parsed knowledge chunk emitted by a tenant parser-runner and consumed by KnowledgeBaseIngestionService. Distinct from backup-record-v1 (restore-only, embedding-preserving). Records of this shape ALWAYS trigger server-side embedding via TextEmbeddingService.embedTexts(); records carrying an embedding field are routed away from this contract toward backup-record-v1 (restore-only) or rejected per spoof-rejection policy. See: ../source/Base.mjs (extract contract), ../../knowledge-base/VectorService.mjs (embedding pipeline at lines 243-274), Discussion #11623 §4 Q4 + §1 #3 + §7 Phase 0/1 + ADR 0003 Chroma Topology Unified Only.",
"type": "object",
"additionalProperties": false,
"required": [
"schemaVersion",
"tenantId",
"repoSlug",
"rootKind",
"sourcePath",
"content",
"hashInputs",
"parserId",
"parserVersion",
"kind",
"name"
],
"properties": {
"schemaVersion": {
"type": "string",
"const": "1.0.0",
"description": "Schema version this record conforms to. Server validates exact match; deprecation warning emitted on version drift (Phase 2 ingestion service behavior)."
},
"tenantId": {
"type": "string",
"minLength": 1,
"pattern": "^[a-z0-9][a-z0-9-]{0,62}[a-z0-9]$|^[a-z0-9]$",
"description": "Authoritative tenant identifier. Server-OVERWRITES or REJECTS client-supplied values during ingestion per Phase 0/1C spoof-rejection invariant; the wire-format MUST still carry this field as the client's claim, but the server-stamped value is authoritative. The shared-Neo-content team-namespace constant is 'neo-shared' (reserved). Lowercase kebab-case constraint matches AgentIdentity slug convention."
},
"repoSlug": {
"type": "string",
"minLength": 1,
"description": "Tenant-owned repo identifier within their workspace; e.g. 'client-org/main-app' or 'internal/docs'. Forms part of the path-identity tuple {tenantId, repoSlug, rootKind, sourcePath}. Disambiguates same sourcePath under different repos for the same tenant."
},
"rootKind": {
"type": "string",
"enum": [
"neo-workspace",
"bare-repo",
"external-source"
],
"description": "Repository topology hint: 'neo-workspace' = npx-neo-app-created workspace where neo is a node_module; 'bare-repo' = plain git repo (any language); 'external-source' = non-VCS source (e.g. live API mirror, generated docs). Hint for hydration mode selection (Phase 2D Q12)."
},
"sourcePath": {
"type": "string",
"minLength": 1,
"description": "Path relative to the repoSlug root. NOT resolved against KB server's neoRootDir — cross-tenant content paths cannot share that root assumption (see SearchService.mjs:118-120 single-root assumption being lifted in Phase 2D). Forward-slash normalized; no leading slash."
},
"content": {
"type": "string",
"description": "The chunk text payload that will be embedded server-side via TextEmbeddingService.embedTexts() in VectorService.mjs:243-274. Chunking semantics are parser-determined; this schema does not impose a length budget but Phase 2 ingestion service may enforce per-tenant size policies."
},
"hashInputs": {
"type": "array",
"items": { "type": "string" },
"minItems": 1,
"uniqueItems": true,
"description": "Field names that compose the chunkId hash input on the server side. Required so server-side rehashing is deterministic and tenant-aware (Phase 0/1C tenant-aware chunkId derivation). The server prepends tenantId+repoSlug into the hash function regardless of this list — that prepend is implicit and not duplicated here."
},
"parserId": {
"type": "string",
"minLength": 1,
"description": "Identifier of the parser that produced this chunk. Maps to a registered parser in the tenant's source/parser registry (Phase 0/1B). Required for provenance + future parser-protocol versioning."
},
"parserVersion": {
"type": "string",
"pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+(?:[-+].+)?$",
"description": "Semver-like version of the parser. Used by the Phase 2 ingestion service to emit deprecation warnings for stale parser versions."
},
"kind": {
"type": "string",
"description": "Semantic chunk category. Open enum (parser-specific); examples from Neo's own parsers: 'module-context', 'class-properties', 'class-config', 'method', 'doc-section', 'skill', 'test'. Custom parsers may emit additional kind values; Phase 2 ingestion service does not enforce closed-enum here to keep the parser-protocol extensible."
},
"name": {
"type": "string",
"minLength": 1,
"description": "Human-readable chunk name, e.g. 'Neo.foo.Bar' for a class or 'foo.bar.mjs - methodName()' for a method. Used in retrieval ranking and operator-facing surfaces."
},
"line_start": {
"type": "integer",
"minimum": 1,
"description": "Optional 1-based starting line of the chunk in its source file. Useful for source-link hydration (Phase 2D Q12)."
},
"line_end": {
"type": "integer",
"minimum": 1,
"description": "Optional 1-based ending line of the chunk in its source file."
},
"className": {
"type": "string",
"description": "Optional fully-qualified class name when the chunk is a class member (e.g. 'Neo.ai.services.knowledge-base.parser.SourceParser')."
},
"extends": {
"type": "string",
"description": "Optional superclass for class-context chunks. Hierarchy maps are computed downstream from this signal."
},
"customMeta": {
"type": "object",
"description": "Open extension slot for parser-specific metadata that does not fit any of the standard fields. Server-side validators MUST NOT promote keys from customMeta to top-level chunk metadata silently — explicit migration via schemaVersion bump is the only sanctioned path."
}
},
"$comment": "Forbidden field: 'embedding'. Records carrying an embedding field are restore-only and MUST flow through backup-record-v1 instead (see DatabaseService.importDatabase, ai/services/knowledge-base/DatabaseService.mjs:250-349). The Phase 2 KnowledgeBaseIngestionService boundary rejects parsed-chunk-v1 records with an embedding field per spoof-rejection invariant."
}