Skip to content

Commit 59d05b6

Browse files
authored
ci(boundary): enforce Hawk ecosystem import boundaries (#24)
* ci(boundary): add ecosystem boundary guard

  yaad is a Hawk support engine with no cross-repo shared types of its own, so it stays contract-free. Add the one-way ecosystem boundary guard to keep it that way:
  - add scripts/check-ecosystem-boundaries.sh (forbids hawk/internal and hawk/shared/types imports)
  - wire the guard into the Makefile and CI
  - document the boundary rule in the README

  The build and the boundary guard pass.
* refactor: decouple yaad from tok helpers
* docs: remove legacy shared types references
* chore: strip Co-authored-by trailers in lefthook hooks
* fix(boundary): fall back to grep when rg is unavailable
* build(deps): go mod tidy after dropping tok dependency
1 parent be58656 commit 59d05b6

12 files changed

Lines changed: 180 additions & 50 deletions

File tree

.github/workflows/ci.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ jobs:
5252
cache: true
5353
- name: Clone tok (workspace dep)
5454
run: git clone --depth=1 https://github.com/GrayCodeAI/tok.git ../tok
55+
- name: Boundary guard
56+
run: bash ./scripts/check-ecosystem-boundaries.sh
5557
- name: gofumpt diff
5658
run: |
5759
go install mvdan.cc/gofumpt@v0.10.0
@@ -78,6 +80,8 @@ jobs:
7880
cache: true
7981
- name: Clone tok (workspace dep)
8082
run: git clone --depth=1 https://github.com/GrayCodeAI/tok.git ../tok
83+
- name: Boundary guard
84+
run: bash ./scripts/check-ecosystem-boundaries.sh
8185
- uses: golangci/golangci-lint-action@4afd733a84b1f43292c63897423277bb7f4313a9 # v7.0.0
8286
with:
8387
version: v2.1.0
@@ -99,6 +103,8 @@ jobs:
99103
cache: true
100104
- name: Clone tok (workspace dep)
101105
run: git clone --depth=1 https://github.com/GrayCodeAI/tok.git ../tok
106+
- name: Boundary guard
107+
run: bash ./scripts/check-ecosystem-boundaries.sh
102108
- name: Tidy check
103109
run: |
104110
go mod tidy

Makefile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,12 @@ GOVULNCHECK := $(GOBIN_DIR)/govulncheck
3131
# ---------------------------------------------------------------------------
3232
# Phony declarations (alphabetical).
3333
# ---------------------------------------------------------------------------
34-
.PHONY: all bench build ci clean cover fmt help lint lint-fix \
34+
.PHONY: all bench boundaries build ci clean cover fmt help lint lint-fix \
3535
security sync-version test test-10x test-race tidy version vet
3636

37+
boundaries: ## Enforce support-repo import boundaries.
38+
bash ./scripts/check-ecosystem-boundaries.sh
39+
3740
# ---------------------------------------------------------------------------
3841
# Default target.
3942
# ---------------------------------------------------------------------------
@@ -101,7 +104,7 @@ sync-version: ## Copy root VERSION into internal/version/VERSION (kept in sync f
101104
# ---------------------------------------------------------------------------
102105
# Composite gate used by CI and pre-push.
103106
# ---------------------------------------------------------------------------
104-
ci: tidy fmt vet lint test-race security ## Run everything CI runs.
107+
ci: tidy fmt vet lint boundaries test-race security ## Run everything CI runs.
105108
@echo "All CI checks passed."
106109

107110
# ---------------------------------------------------------------------------

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ sessions, across models, across projects — with no separate install or daemon
4242
> )
4343
> ```
4444
45+
## Ecosystem Boundaries
46+
47+
Yaad is a Hawk support engine. Keep the dependency edge one-way:
48+
49+
- depend on `hawk-core-contracts` when a stable cross-repo contract is needed
50+
- do not import `hawk/internal/*`
51+
- do not import removed legacy path `hawk/shared/types`; use `hawk-core-contracts/types`
52+
4553
---
4654
4755
## What Happens Next

engine/context_pack.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import (
55
"sort"
66
"strings"
77

8-
"github.com/GrayCodeAI/tok"
98
"github.com/GrayCodeAI/yaad/storage"
109
)
1110

@@ -84,7 +83,7 @@ func (cp *ContextPacker) Pack(nodes []*storage.Node) *PackedContext {
8483

8584
// Format and check budget
8685
line := cp.formatNode(n)
87-
lineTokens := tok.EstimateTokens(line)
86+
lineTokens := estimateTokens(line)
8887

8988
if tokensUsed+lineTokens > cp.budget {
9089
break

engine/token_utils.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
package engine
2+
3+
import (
4+
"sync"
5+
6+
tiktoken "github.com/tiktoken-go/tokenizer"
7+
)
8+
9+
var (
10+
tokenizerOnce sync.Once
11+
tokenizerBPE tiktoken.Codec
12+
tokenizerErr error
13+
)
14+
15+
func estimateTokens(content string) int {
16+
if content == "" {
17+
return 0
18+
}
19+
20+
tokenizerOnce.Do(func() {
21+
tokenizerBPE, tokenizerErr = tiktoken.Get(tiktoken.Cl100kBase)
22+
})
23+
if tokenizerErr == nil && tokenizerBPE != nil {
24+
if count, err := tokenizerBPE.Count(content); err == nil {
25+
return count
26+
}
27+
}
28+
29+
return fallbackTokenCount(content)
30+
}
31+
32+
// fallbackTokenCount approximates a token count from the byte length of
// content, without using a tokenizer.
//
// Inputs shorter than 30 bytes are costed at one token per 3 bytes and inputs
// shorter than 100 bytes at one token per 4 bytes, both rounded up. For longer
// inputs the share of whitespace bytes (space, newline, tab, carriage return)
// is measured over at most the first 200 bytes and extrapolated to the whole
// string: every whitespace byte counts as one token and every 3.5 remaining
// bytes count as one token. The sum is truncated to an int.
func fallbackTokenCount(content string) int {
	size := len(content)
	switch {
	case size < 30:
		return (size + 2) / 3
	case size < 100:
		return (size + 3) / 4
	}

	// Only a bounded prefix is inspected, so the cost stays constant for
	// arbitrarily long inputs.
	window := size
	if window > 200 {
		window = 200
	}

	whitespace := 0
	for _, b := range []byte(content[:window]) {
		if b == ' ' || b == '\n' || b == '\t' || b == '\r' {
			whitespace++
		}
	}

	ratio := float64(whitespace) / float64(window)
	denseBytes := float64(size) * (1 - ratio)
	whitespaceTokens := float64(size) * ratio
	return int(denseBytes/3.5 + whitespaceTokens)
}

go.mod

Lines changed: 1 addition & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go.sum

Lines changed: 6 additions & 29 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go.work

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
11
go 1.26.4
22

33
use .
4-
5-
replace github.com/GrayCodeAI/tok => ../tok

ingest/chunker.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ import (
1414
"path/filepath"
1515
"regexp"
1616
"strings"
17-
18-
"github.com/GrayCodeAI/tok"
1917
)
2018

2119
// Chunk represents a semantically meaningful unit of code.
@@ -806,9 +804,10 @@ func (c *Chunker) AddOverlap(chunks []Chunk, overlapLines int) []Chunk {
806804
}
807805

808806
// EstimateTokens returns the estimated token count for the given text.
809-
// Delegates to tok.EstimateTokens for accurate heuristic-based estimation.
807+
// Uses a local BPE-first estimator to avoid engine-to-engine coupling on tok
808+
// while preserving the old chunking behavior closely.
810809
func EstimateTokens(content string) int {
811-
return tok.EstimateTokens(content)
810+
return estimateTokens(content)
812811
}
813812

814813
// splitLargeChunk splits a chunk that exceeds MaxChunkTokens into smaller pieces.

ingest/token_utils.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
package ingest
2+
3+
import (
4+
"sync"
5+
6+
tiktoken "github.com/tiktoken-go/tokenizer"
7+
)
8+
9+
var (
10+
tokenizerOnce sync.Once
11+
tokenizerBPE tiktoken.Codec
12+
tokenizerErr error
13+
)
14+
15+
func estimateTokens(content string) int {
16+
if content == "" {
17+
return 0
18+
}
19+
20+
tokenizerOnce.Do(func() {
21+
tokenizerBPE, tokenizerErr = tiktoken.Get(tiktoken.Cl100kBase)
22+
})
23+
if tokenizerErr == nil && tokenizerBPE != nil {
24+
if count, err := tokenizerBPE.Count(content); err == nil {
25+
return count
26+
}
27+
}
28+
29+
return fallbackTokenCount(content)
30+
}
31+
32+
// fallbackTokenCount approximates a token count from the byte length of
// content, without using a tokenizer.
//
// Inputs shorter than 30 bytes are costed at one token per 3 bytes and inputs
// shorter than 100 bytes at one token per 4 bytes, both rounded up. For longer
// inputs the share of whitespace bytes (space, newline, tab, carriage return)
// is measured over at most the first 200 bytes and extrapolated to the whole
// string: every whitespace byte counts as one token and every 3.5 remaining
// bytes count as one token. The sum is truncated to an int.
func fallbackTokenCount(content string) int {
	size := len(content)
	switch {
	case size < 30:
		return (size + 2) / 3
	case size < 100:
		return (size + 3) / 4
	}

	// Only a bounded prefix is inspected, so the cost stays constant for
	// arbitrarily long inputs.
	window := size
	if window > 200 {
		window = 200
	}

	whitespace := 0
	for _, b := range []byte(content[:window]) {
		if b == ' ' || b == '\n' || b == '\t' || b == '\r' {
			whitespace++
		}
	}

	ratio := float64(whitespace) / float64(window)
	denseBytes := float64(size) * (1 - ratio)
	whitespaceTokens := float64(size) * ratio
	return int(denseBytes/3.5 + whitespaceTokens)
}

0 commit comments

Comments
 (0)