Skip to content

Commit 0fcf5ba

Browse files
authored
Merge pull request #1174 from aaron-tsar/fix/vault-doc-content-on-create
fix(vault): accept and persist content on POST/PUT vault/documents
2 parents 812adee + 9edcc00 commit 0fcf5ba

4 files changed

Lines changed: 739 additions & 0 deletions

internal/http/vault_handler_documents.go

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,149 @@
11
package http
22

33
import (
4+
"errors"
45
"log/slog"
56
"net/http"
7+
"os"
8+
"path/filepath"
69
"strings"
10+
"time"
711

12+
"github.com/google/uuid"
13+
"github.com/nextlevelbuilder/goclaw/internal/eventbus"
814
"github.com/nextlevelbuilder/goclaw/internal/store"
15+
"github.com/nextlevelbuilder/goclaw/internal/vault"
916
)
1017

18+
// writeDocumentContent writes the given content bytes to a tenant-workspace-
19+
// relative path, after validating the path stays inside the workspace and uses
20+
// an allowed extension. Returns the SHA-256 content hash on success.
21+
//
22+
// This is the shared writer used by handleCreateDocument and handleUpdateDocument
23+
// to materialise inline JSON `content` payloads on disk, so the enrichment
24+
// pipeline (which reads files by path) can later compute summaries, embeddings
25+
// and links.
26+
func (h *VaultHandler) writeDocumentContent(workspace, relPath string, content []byte) (string, error) {
27+
// Ensure the tenant workspace exists before resolving symlinks. Non-master
28+
// tenants live under workspace/tenants/{slug}/ which handleUpload creates on
29+
// demand (see vault_handler_upload.go) — without this, the very first JSON
30+
// content write into a fresh tenant would fail EvalSymlinks with ErrNotExist
31+
// and 500 the request.
32+
if err := os.MkdirAll(workspace, 0o755); err != nil {
33+
return "", err
34+
}
35+
36+
// Resolve the workspace root through any symlinks so all containment checks
37+
// compare canonical paths.
38+
realWS, err := filepath.EvalSymlinks(workspace)
39+
if err != nil {
40+
return "", err
41+
}
42+
target := filepath.Join(realWS, filepath.Clean(relPath))
43+
44+
// Lexical containment: blocks "../" and absolute-path escapes.
45+
if target != realWS && !strings.HasPrefix(target, realWS+string(os.PathSeparator)) {
46+
slog.Warn("security.vault_symlink_escape",
47+
"site", "path_traversal", "attempted", target, "workspace", realWS)
48+
return "", os.ErrInvalid
49+
}
50+
51+
// Symlink containment: a lexical prefix check is not enough — a symlink that
52+
// lives *inside* the workspace but points outside it (e.g. workspace/link ->
53+
// /etc) would otherwise let the write follow it and clobber a file beyond
54+
// the workspace. Resolve the deepest already-existing ancestor of the target
55+
// and confirm it still canonicalises to a path inside the workspace, before
56+
// any directory is created or byte written.
57+
for ancestor := filepath.Dir(target); ; {
58+
resolved, rerr := filepath.EvalSymlinks(ancestor)
59+
if rerr == nil {
60+
if resolved != realWS && !strings.HasPrefix(resolved, realWS+string(os.PathSeparator)) {
61+
slog.Warn("security.vault_symlink_escape",
62+
"site", "ancestor", "resolved", resolved, "workspace", realWS)
63+
return "", os.ErrInvalid
64+
}
65+
break
66+
}
67+
if !os.IsNotExist(rerr) {
68+
return "", rerr
69+
}
70+
parent := filepath.Dir(ancestor)
71+
if parent == ancestor {
72+
break // walked to the filesystem root without an existing ancestor
73+
}
74+
ancestor = parent
75+
}
76+
77+
if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
78+
return "", err
79+
}
80+
81+
// Fast-fail when the final component is already a symlink — gives a clean
82+
// rejection + security log without parsing OS-specific error codes. The
83+
// O_NOFOLLOW open below closes the TOCTOU window this Lstat leaves open
84+
// (an attacker could swap the file for a symlink between calls).
85+
if fi, lerr := os.Lstat(target); lerr == nil && fi.Mode()&os.ModeSymlink != 0 {
86+
slog.Warn("security.vault_symlink_escape",
87+
"site", "final_lstat", "target", target, "workspace", realWS)
88+
return "", os.ErrInvalid
89+
}
90+
91+
// O_NOFOLLOW makes the kernel refuse to follow a symlink at the final
92+
// component atomically with the open, removing the Lstat→Open race entirely
93+
// on Unix. On Windows oNoFollow == 0 and the Lstat above is best-effort.
94+
f, err := os.OpenFile(target, os.O_WRONLY|os.O_CREATE|os.O_TRUNC|oNoFollow, 0o644)
95+
if err != nil {
96+
if isSymlinkLoopErr(err) {
97+
slog.Warn("security.vault_symlink_escape",
98+
"site", "open_nofollow", "target", target, "workspace", realWS)
99+
return "", os.ErrInvalid
100+
}
101+
return "", err
102+
}
103+
if _, werr := f.Write(content); werr != nil {
104+
_ = f.Close() // best-effort cleanup; the write error is the real one
105+
return "", werr
106+
}
107+
// Explicit Close so a delayed FS error (e.g. disk-full at flush) surfaces
108+
// instead of being silently swallowed by a deferred close.
109+
if cerr := f.Close(); cerr != nil {
110+
return "", cerr
111+
}
112+
return vault.ContentHash(content), nil
113+
}
114+
115+
// publishDocUpserted enqueues the standard EventVaultDocUpserted so the
116+
// enrichment worker re-runs summary/embedding/link extraction for this doc.
117+
// No-op when the event bus is not wired (some test setups).
118+
func (h *VaultHandler) publishDocUpserted(tenantID uuid.UUID, agentID, docID, relPath, hash, workspace string) {
119+
if h.eventBus == nil {
120+
return
121+
}
122+
tenantIDStr := tenantID.String()
123+
// Start enrichment progress before publishing so the worker pool can't drain
124+
// the event and finish before the progress tracker registers it — same
125+
// ordering handleUpload relies on to avoid that race.
126+
if h.enrichProgress != nil {
127+
h.enrichProgress.Start(1, tenantID)
128+
}
129+
h.eventBus.Publish(eventbus.DomainEvent{
130+
ID: uuid.Must(uuid.NewV7()).String(),
131+
Type: eventbus.EventVaultDocUpserted,
132+
SourceID: docID + ":" + hash,
133+
TenantID: tenantIDStr,
134+
AgentID: agentID,
135+
Timestamp: time.Now(),
136+
Payload: eventbus.VaultDocUpsertedPayload{
137+
DocID: docID,
138+
TenantID: tenantIDStr,
139+
AgentID: agentID,
140+
Path: relPath,
141+
ContentHash: hash,
142+
Workspace: workspace,
143+
},
144+
})
145+
}
146+
11147
// handleListAllDocuments lists vault documents across all agents in tenant.
12148
// Optional query param agent_id to filter by specific agent.
13149
func (h *VaultHandler) handleListAllDocuments(w http.ResponseWriter, r *http.Request) {
@@ -99,6 +235,16 @@ func (h *VaultHandler) handleGetDocument(w http.ResponseWriter, r *http.Request)
99235
}
100236

101237
// handleCreateDocument creates a new vault document.
238+
//
239+
// `content` semantics — symmetric with handleUpdateDocument:
240+
// - field omitted (nil pointer): metadata-only stub, no file write, no event
241+
// (typically followed by a rescan that materialises the file from disk).
242+
// - field present and non-empty: bytes materialised at <tenant-workspace>/<path>,
243+
// SHA-256 hash stored on the row, EventVaultDocUpserted emitted so the
244+
// enrichment worker computes summary/embeddings/links — same code path the
245+
// multipart /v1/vault/upload endpoint and the filesystem rescan use.
246+
// - field present and empty (""): a 0-byte file is written and the event still
247+
// fires. Same behaviour as PUT — pick the right shape on the client side.
102248
func (h *VaultHandler) handleCreateDocument(w http.ResponseWriter, r *http.Request) {
103249
locale := extractLocale(r)
104250
tenantID := store.TenantIDFromContext(r.Context())
@@ -107,6 +253,7 @@ func (h *VaultHandler) handleCreateDocument(w http.ResponseWriter, r *http.Reque
107253
var body struct {
108254
Path string `json:"path"`
109255
Title string `json:"title"`
256+
Content *string `json:"content"` // nil=no write; ""=write empty file; "data"=write bytes (mirrors PUT)
110257
DocType string `json:"doc_type"`
111258
Scope string `json:"scope"`
112259
TeamID string `json:"team_id"`
@@ -137,6 +284,16 @@ func (h *VaultHandler) handleCreateDocument(w http.ResponseWriter, r *http.Reque
137284
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid scope"})
138285
return
139286
}
287+
// Validate extension upfront if caller wants to write content — matches the
288+
// whitelist enforced by /v1/vault/upload so the two endpoints accept the
289+
// same file types. nil pointer = "no content field", which skips the check.
290+
if body.Content != nil {
291+
ext := strings.ToLower(filepath.Ext(body.Path))
292+
if !allowedUploadExts[ext] {
293+
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "unsupported file type: " + ext})
294+
return
295+
}
296+
}
140297

141298
doc := &store.VaultDocument{
142299
TenantID: tenantID.String(),
@@ -160,11 +317,50 @@ func (h *VaultHandler) handleCreateDocument(w http.ResponseWriter, r *http.Reque
160317
doc.Scope = "team"
161318
}
162319
}
320+
321+
// Persist file content if provided, before the DB upsert so the
322+
// content_hash column is set in a single write.
323+
var (
324+
wsPath string
325+
writtenHash string
326+
contentWasSet bool
327+
)
328+
if body.Content != nil {
329+
wsPath = h.resolveTenantWorkspace(r.Context())
330+
if wsPath == "" {
331+
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "workspace not available"})
332+
return
333+
}
334+
hash, werr := h.writeDocumentContent(wsPath, body.Path, []byte(*body.Content))
335+
if werr != nil {
336+
slog.Warn("vault.create: write content failed", "path", body.Path, "error", werr)
337+
if errors.Is(werr, os.ErrInvalid) {
338+
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "path escapes workspace"})
339+
return
340+
}
341+
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "failed to write content"})
342+
return
343+
}
344+
doc.ContentHash = hash
345+
writtenHash = hash
346+
contentWasSet = true
347+
}
348+
163349
if err := h.store.UpsertDocument(r.Context(), doc); err != nil {
164350
slog.Warn("vault.create failed", "error", err)
165351
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": err.Error()})
166352
return
167353
}
354+
355+
// Fire enrichment after the DB row exists so the worker can locate the doc.
356+
if contentWasSet {
357+
evAgent := ""
358+
if doc.AgentID != nil {
359+
evAgent = *doc.AgentID
360+
}
361+
h.publishDocUpserted(tenantID, evAgent, doc.ID, body.Path, writtenHash, wsPath)
362+
}
363+
168364
// Re-fetch by ID (set via RETURNING) — unambiguous even when same path exists across teams.
169365
created, _ := h.store.GetDocumentByID(r.Context(), tenantID.String(), doc.ID)
170366
if created != nil {
@@ -189,6 +385,7 @@ func (h *VaultHandler) handleUpdateDocument(w http.ResponseWriter, r *http.Reque
189385

190386
var body struct {
191387
Title *string `json:"title"`
388+
Content *string `json:"content"` // nil=no change; "" clears the file; non-empty rewrites it
192389
DocType *string `json:"doc_type"`
193390
Scope *string `json:"scope"`
194391
TeamID *string `json:"team_id"` // nil=no change, ""=clear, "uuid"=set
@@ -225,11 +422,52 @@ func (h *VaultHandler) handleUpdateDocument(w http.ResponseWriter, r *http.Reque
225422
existing.Metadata = body.Metadata
226423
}
227424

425+
// Rewrite content on disk when caller supplied a `content` field.
426+
var (
427+
wsPath string
428+
writtenHash string
429+
contentWasSet bool
430+
)
431+
if body.Content != nil {
432+
ext := strings.ToLower(filepath.Ext(existing.Path))
433+
if !allowedUploadExts[ext] {
434+
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "unsupported file type: " + ext})
435+
return
436+
}
437+
wsPath = h.resolveTenantWorkspace(r.Context())
438+
if wsPath == "" {
439+
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "workspace not available"})
440+
return
441+
}
442+
hash, werr := h.writeDocumentContent(wsPath, existing.Path, []byte(*body.Content))
443+
if werr != nil {
444+
slog.Warn("vault.update: write content failed", "path", existing.Path, "error", werr)
445+
if errors.Is(werr, os.ErrInvalid) {
446+
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "path escapes workspace"})
447+
return
448+
}
449+
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": "failed to write content"})
450+
return
451+
}
452+
existing.ContentHash = hash
453+
writtenHash = hash
454+
contentWasSet = true
455+
}
456+
228457
if err := h.store.UpsertDocument(r.Context(), existing); err != nil {
229458
slog.Warn("vault.update failed", "error", err)
230459
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": err.Error()})
231460
return
232461
}
462+
463+
if contentWasSet {
464+
evAgent := ""
465+
if existing.AgentID != nil {
466+
evAgent = *existing.AgentID
467+
}
468+
h.publishDocUpserted(tenantID, evAgent, existing.ID, existing.Path, writtenHash, wsPath)
469+
}
470+
233471
updated, _ := h.store.GetDocumentByID(r.Context(), tenantID.String(), docID)
234472
if updated != nil {
235473
writeJSON(w, http.StatusOK, updated)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
//go:build !windows
2+
3+
package http
4+
5+
import (
6+
"errors"
7+
"syscall"
8+
)
9+
10+
// oNoFollow is OR'd into the open flags so the kernel refuses to follow a
11+
// symlink at the final path component — closes the Lstat→Open TOCTOU window
12+
// where a tenant could swap a regular file for a symlink between the two
13+
// syscalls. On Windows this is a no-op (see the _windows.go counterpart).
14+
const oNoFollow = syscall.O_NOFOLLOW
15+
16+
// isSymlinkLoopErr reports whether err is the kernel's "refused to follow
17+
// symlink" signal from an O_NOFOLLOW open (ELOOP on Linux/macOS).
18+
func isSymlinkLoopErr(err error) bool {
19+
return errors.Is(err, syscall.ELOOP)
20+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
//go:build windows
2+
3+
package http
4+
5+
// Windows has no O_NOFOLLOW concept and follows reparse points transparently;
6+
// the Lstat short-circuit in writeDocumentContent is the only protection
7+
// available there. Desktop edition is single-user with no untrusted tenants,
8+
// so this degradation is acceptable.
9+
const oNoFollow = 0
10+
11+
func isSymlinkLoopErr(err error) bool { return false }

0 commit comments

Comments
 (0)