diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6625265..0c40487 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,6 +52,8 @@ jobs: cache: true - name: Clone tok (workspace dep) run: git clone --depth=1 https://github.com/GrayCodeAI/tok.git ../tok + - name: Boundary guard + run: bash ./scripts/check-ecosystem-boundaries.sh - name: gofumpt diff run: | go install mvdan.cc/gofumpt@v0.10.0 @@ -78,6 +80,8 @@ jobs: cache: true - name: Clone tok (workspace dep) run: git clone --depth=1 https://github.com/GrayCodeAI/tok.git ../tok + - name: Boundary guard + run: bash ./scripts/check-ecosystem-boundaries.sh - uses: golangci/golangci-lint-action@4afd733a84b1f43292c63897423277bb7f4313a9 # v7.0.0 with: version: v2.1.0 @@ -99,6 +103,8 @@ jobs: cache: true - name: Clone tok (workspace dep) run: git clone --depth=1 https://github.com/GrayCodeAI/tok.git ../tok + - name: Boundary guard + run: bash ./scripts/check-ecosystem-boundaries.sh - name: Tidy check run: | go mod tidy diff --git a/Makefile b/Makefile index 328d421..8efe5a4 100644 --- a/Makefile +++ b/Makefile @@ -31,9 +31,12 @@ GOVULNCHECK := $(GOBIN_DIR)/govulncheck # --------------------------------------------------------------------------- # Phony declarations (alphabetical). # --------------------------------------------------------------------------- -.PHONY: all bench build ci clean cover fmt help lint lint-fix \ +.PHONY: all bench boundaries build ci clean cover fmt help lint lint-fix \ security sync-version test test-10x test-race tidy version vet +boundaries: ## Enforce support-repo import boundaries. + bash ./scripts/check-ecosystem-boundaries.sh + # --------------------------------------------------------------------------- # Default target. 
# --------------------------------------------------------------------------- @@ -101,7 +104,7 @@ sync-version: ## Copy root VERSION into internal/version/VERSION (kept in sync f # --------------------------------------------------------------------------- # Composite gate used by CI and pre-push. # --------------------------------------------------------------------------- -ci: tidy fmt vet lint test-race security ## Run everything CI runs. +ci: tidy fmt vet lint boundaries test-race security ## Run everything CI runs. @echo "All CI checks passed." # --------------------------------------------------------------------------- diff --git a/README.md b/README.md index 7d8b1a4..3830c8f 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,14 @@ sessions, across models, across projects — with no separate install or daemon > ) > ``` +## Ecosystem Boundaries + +Yaad is a Hawk support engine. Keep the dependency edge one-way: + +- depend on `hawk-core-contracts` when a stable cross-repo contract is needed +- do not import `hawk/internal/*` +- do not import removed legacy path `hawk/shared/types`; use `hawk-core-contracts/types` + --- ## What Happens Next diff --git a/engine/context_pack.go b/engine/context_pack.go index 2cf16db..e5b209e 100644 --- a/engine/context_pack.go +++ b/engine/context_pack.go @@ -5,7 +5,6 @@ import ( "sort" "strings" - "github.com/GrayCodeAI/tok" "github.com/GrayCodeAI/yaad/storage" ) @@ -84,7 +83,7 @@ func (cp *ContextPacker) Pack(nodes []*storage.Node) *PackedContext { // Format and check budget line := cp.formatNode(n) - lineTokens := tok.EstimateTokens(line) + lineTokens := estimateTokens(line) if tokensUsed+lineTokens > cp.budget { break diff --git a/engine/token_utils.go b/engine/token_utils.go new file mode 100644 index 0000000..c2f26a4 --- /dev/null +++ b/engine/token_utils.go @@ -0,0 +1,57 @@ +package engine + +import ( + "sync" + + tiktoken "github.com/tiktoken-go/tokenizer" +) + +var ( + tokenizerOnce sync.Once + tokenizerBPE 
// fallbackTokenCount estimates how many tokens content holds using only its
// byte length and whitespace density. It is the heuristic used when a BPE
// count is not available.
//
// The estimate is piecewise:
//   - fewer than 30 bytes: one token per 3 bytes, rounded up;
//   - fewer than 100 bytes: one token per 4 bytes, rounded up;
//   - otherwise: the whitespace ratio is measured on at most the first 200
//     bytes and extrapolated to the whole input; every whitespace byte is
//     costed as one token and the remaining bytes at 3.5 bytes per token,
//     with the sum truncated toward zero.
//
// Lengths are byte counts (len), not rune counts. An empty string yields 0.
func fallbackTokenCount(content string) int {
	const (
		shortLimit    = 30  // below this length: ~3 bytes per token
		mediumLimit   = 100 // below this length: ~4 bytes per token
		sampleLimit   = 200 // whitespace is sampled from at most this many leading bytes
		bytesPerToken = 3.5 // assumed density of non-whitespace bytes
	)

	length := len(content)
	switch {
	case length < shortLimit:
		return (length + 2) / 3
	case length < mediumLimit:
		return (length + 3) / 4
	}

	// length >= mediumLimit here, so sample is never zero and the ratio below
	// cannot divide by zero.
	sample := min(length, sampleLimit)
	spaces := 0
	for i := 0; i < sample; i++ {
		switch content[i] {
		case ' ', '\n', '\t', '\r':
			spaces++
		}
	}

	spaceRatio := float64(spaces) / float64(sample)
	nonSpaceChars := float64(length) * (1 - spaceRatio)
	spaceTokens := float64(length) * spaceRatio
	return int(nonSpaceChars/bytesPerToken + spaceTokens)
}
index b7f7ecf..4b21626 100644 --- a/go.work +++ b/go.work @@ -1,5 +1,3 @@ go 1.26.4 use . - -replace github.com/GrayCodeAI/tok => ../tok diff --git a/ingest/chunker.go b/ingest/chunker.go index c55c23c..b3e50e8 100644 --- a/ingest/chunker.go +++ b/ingest/chunker.go @@ -14,8 +14,6 @@ import ( "path/filepath" "regexp" "strings" - - "github.com/GrayCodeAI/tok" ) // Chunk represents a semantically meaningful unit of code. @@ -806,9 +804,10 @@ func (c *Chunker) AddOverlap(chunks []Chunk, overlapLines int) []Chunk { } // EstimateTokens returns the estimated token count for the given text. -// Delegates to tok.EstimateTokens for accurate heuristic-based estimation. +// Uses a local BPE-first estimator to avoid engine-to-engine coupling on tok +// while preserving the old chunking behavior closely. func EstimateTokens(content string) int { - return tok.EstimateTokens(content) + return estimateTokens(content) } // splitLargeChunk splits a chunk that exceeds MaxChunkTokens into smaller pieces. diff --git a/ingest/token_utils.go b/ingest/token_utils.go new file mode 100644 index 0000000..a0fab21 --- /dev/null +++ b/ingest/token_utils.go @@ -0,0 +1,57 @@ +package ingest + +import ( + "sync" + + tiktoken "github.com/tiktoken-go/tokenizer" +) + +var ( + tokenizerOnce sync.Once + tokenizerBPE tiktoken.Codec + tokenizerErr error +) + +func estimateTokens(content string) int { + if content == "" { + return 0 + } + + tokenizerOnce.Do(func() { + tokenizerBPE, tokenizerErr = tiktoken.Get(tiktoken.Cl100kBase) + }) + if tokenizerErr == nil && tokenizerBPE != nil { + if count, err := tokenizerBPE.Count(content); err == nil { + return count + } + } + + return fallbackTokenCount(content) +} + +func fallbackTokenCount(content string) int { + length := len(content) + if length < 30 { + return (length + 2) / 3 + } + if length < 100 { + return (length + 3) / 4 + } + + spaces := 0 + sample := length + if sample > 200 { + sample = 200 + } + for i := 0; i < sample; i++ { + switch content[i] { + 
#!/usr/bin/env bash
# Boundary guard: fails when any Go file under the repository root references
# github.com/GrayCodeAI/hawk/internal/* or the removed
# github.com/GrayCodeAI/hawk/shared/types package.
#
# Exit status:
#   0  no forbidden import found
#   1  at least one forbidden import found (matches are printed)
#   2  the search tool itself failed, so the result is unknown
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"

# One pattern for both tools so rg and grep can never drift apart.
readonly PATTERN='github\.com/GrayCodeAI/hawk/(internal/|shared/types)'

# rg and grep share an exit convention: 0 = match, 1 = no match, >1 = error.
# Record the status instead of discarding it with `|| true`; otherwise a scan
# that crashed would leave $violations empty and be reported as a pass.
status=0
if command -v rg >/dev/null 2>&1; then
  violations="$(rg -n "$PATTERN" --glob '*.go' .)" || status=$?
else
  violations="$(grep -rn --include='*.go' -E "$PATTERN" .)" || status=$?
fi

if (( status > 1 )); then
  echo "boundary guard: import scan failed with exit status ${status}" >&2
  exit 2
fi

if [[ -n "${violations}" ]]; then
  echo "forbidden Hawk imports found:"
  echo "${violations}"
  echo
  echo "support repos must use hawk-core-contracts or local contracts, not hawk/internal or removed hawk/shared/types"
  exit 1
fi

echo "ecosystem boundary guard passed"