Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ jobs:
cache: true
- name: Clone tok (workspace dep)
run: git clone --depth=1 https://github.com/GrayCodeAI/tok.git ../tok
- name: Boundary guard
run: bash ./scripts/check-ecosystem-boundaries.sh
- name: gofumpt diff
run: |
go install mvdan.cc/gofumpt@v0.10.0
Expand All @@ -78,6 +80,8 @@ jobs:
cache: true
- name: Clone tok (workspace dep)
run: git clone --depth=1 https://github.com/GrayCodeAI/tok.git ../tok
- name: Boundary guard
run: bash ./scripts/check-ecosystem-boundaries.sh
- uses: golangci/golangci-lint-action@4afd733a84b1f43292c63897423277bb7f4313a9 # v7.0.0
with:
version: v2.1.0
Expand All @@ -99,6 +103,8 @@ jobs:
cache: true
- name: Clone tok (workspace dep)
run: git clone --depth=1 https://github.com/GrayCodeAI/tok.git ../tok
- name: Boundary guard
run: bash ./scripts/check-ecosystem-boundaries.sh
- name: Tidy check
run: |
go mod tidy
Expand Down
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,12 @@ GOVULNCHECK := $(GOBIN_DIR)/govulncheck
# ---------------------------------------------------------------------------
# Phony declarations (alphabetical).
# ---------------------------------------------------------------------------
.PHONY: all bench build ci clean cover fmt help lint lint-fix \
.PHONY: all bench boundaries build ci clean cover fmt help lint lint-fix \
security sync-version test test-10x test-race tidy version vet

boundaries: ## Enforce support-repo import boundaries.
bash ./scripts/check-ecosystem-boundaries.sh

# ---------------------------------------------------------------------------
# Default target.
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -101,7 +104,7 @@ sync-version: ## Copy root VERSION into internal/version/VERSION (kept in sync f
# ---------------------------------------------------------------------------
# Composite gate used by CI and pre-push.
# ---------------------------------------------------------------------------
ci: tidy fmt vet lint test-race security ## Run everything CI runs.
ci: tidy fmt vet lint boundaries test-race security ## Run everything CI runs.
@echo "All CI checks passed."

# ---------------------------------------------------------------------------
Expand Down
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ sessions, across models, across projects — with no separate install or daemon
> )
> ```

## Ecosystem Boundaries

Yaad is a Hawk support engine. Keep the dependency edge one-way:

- depend on `hawk-core-contracts` when a stable cross-repo contract is needed
- do not import `hawk/internal/*`
- do not import the removed legacy path `hawk/shared/types`; use `hawk-core-contracts/types` instead

---

## What Happens Next
Expand Down
3 changes: 1 addition & 2 deletions engine/context_pack.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"sort"
"strings"

"github.com/GrayCodeAI/tok"
"github.com/GrayCodeAI/yaad/storage"
)

Expand Down Expand Up @@ -84,7 +83,7 @@ func (cp *ContextPacker) Pack(nodes []*storage.Node) *PackedContext {

// Format and check budget
line := cp.formatNode(n)
lineTokens := tok.EstimateTokens(line)
lineTokens := estimateTokens(line)

if tokensUsed+lineTokens > cp.budget {
break
Expand Down
57 changes: 57 additions & 0 deletions engine/token_utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package engine

import (
"sync"

tiktoken "github.com/tiktoken-go/tokenizer"
)

// Lazily initialised tokenizer state shared by every estimateTokens call.
var (
// tokenizerOnce ensures the codec lookup in estimateTokens runs only once.
tokenizerOnce sync.Once
// tokenizerBPE holds the Cl100kBase codec returned by that lookup.
tokenizerBPE tiktoken.Codec
// tokenizerErr holds the error returned by that lookup, if any.
tokenizerErr error
)

// estimateTokens returns the number of tokens in content.
//
// An empty string is always 0. Otherwise the Cl100kBase codec is looked up
// once (guarded by tokenizerOnce) and asked to count the tokens. If the lookup
// failed, produced a nil codec, or the count itself returns an error, the
// result comes from fallbackTokenCount instead.
func estimateTokens(content string) int {
	if len(content) == 0 {
		return 0
	}

	// The lookup result (codec and error) is cached in package-level state so
	// later calls skip straight to counting.
	tokenizerOnce.Do(func() {
		codec, err := tiktoken.Get(tiktoken.Cl100kBase)
		tokenizerBPE, tokenizerErr = codec, err
	})

	if tokenizerErr != nil || tokenizerBPE == nil {
		return fallbackTokenCount(content)
	}

	n, err := tokenizerBPE.Count(content)
	if err != nil {
		return fallbackTokenCount(content)
	}
	return n
}

// fallbackTokenCount approximates the token count of content from its byte
// length, without consulting a tokenizer.
//
// Inputs shorter than 30 bytes cost one token per 3 bytes and inputs shorter
// than 100 bytes cost one token per 4 bytes, both rounded up. For longer
// inputs the whitespace ratio is measured over at most the first 200 bytes and
// applied to the whole length: the whitespace share costs one token per byte,
// the remainder one token per 3.5 bytes, and the sum is truncated to an int.
func fallbackTokenCount(content string) int {
	const (
		shortLimit  = 30
		mediumLimit = 100
		sampleLimit = 200
	)

	size := len(content)
	switch {
	case size < shortLimit:
		return (size + 2) / 3
	case size < mediumLimit:
		return (size + 3) / 4
	}

	// Only a bounded prefix is scanned; its ratio stands in for the whole input.
	window := size
	if window > sampleLimit {
		window = sampleLimit
	}

	blanks := 0
	for _, b := range []byte(content[:window]) {
		if b == ' ' || b == '\n' || b == '\t' || b == '\r' {
			blanks++
		}
	}

	blankShare := float64(blanks) / float64(window)
	denseBytes := float64(size) * (1 - blankShare)
	blankBytes := float64(size) * blankShare
	return int(denseBytes/3.5 + blankBytes)
}
1 change: 0 additions & 1 deletion go.mod

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions go.work
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
go 1.26.4

use .

replace github.com/GrayCodeAI/tok => ../tok
7 changes: 3 additions & 4 deletions ingest/chunker.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ import (
"path/filepath"
"regexp"
"strings"

"github.com/GrayCodeAI/tok"
)

// Chunk represents a semantically meaningful unit of code.
Expand Down Expand Up @@ -806,9 +804,10 @@ func (c *Chunker) AddOverlap(chunks []Chunk, overlapLines int) []Chunk {
}

// EstimateTokens returns the estimated token count for the given text.
// Delegates to tok.EstimateTokens for accurate heuristic-based estimation.
// Uses a local BPE-first estimator to avoid engine-to-engine coupling on tok
// while preserving the old chunking behavior closely.
func EstimateTokens(content string) int {
return tok.EstimateTokens(content)
return estimateTokens(content)
}

// splitLargeChunk splits a chunk that exceeds MaxChunkTokens into smaller pieces.
Expand Down
57 changes: 57 additions & 0 deletions ingest/token_utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package ingest

import (
"sync"

tiktoken "github.com/tiktoken-go/tokenizer"
)

// Lazily initialised tokenizer state shared by every estimateTokens call.
var (
// tokenizerOnce ensures the codec lookup in estimateTokens runs only once.
tokenizerOnce sync.Once
// tokenizerBPE holds the Cl100kBase codec returned by that lookup.
tokenizerBPE tiktoken.Codec
// tokenizerErr holds the error returned by that lookup, if any.
tokenizerErr error
)

// estimateTokens returns the number of tokens in content.
//
// An empty string is always 0. Otherwise the Cl100kBase codec is looked up
// once (guarded by tokenizerOnce) and asked to count the tokens. If the lookup
// failed, produced a nil codec, or the count itself returns an error, the
// result comes from fallbackTokenCount instead.
func estimateTokens(content string) int {
	if len(content) == 0 {
		return 0
	}

	// The lookup result (codec and error) is cached in package-level state so
	// later calls skip straight to counting.
	tokenizerOnce.Do(func() {
		codec, err := tiktoken.Get(tiktoken.Cl100kBase)
		tokenizerBPE, tokenizerErr = codec, err
	})

	if tokenizerErr != nil || tokenizerBPE == nil {
		return fallbackTokenCount(content)
	}

	n, err := tokenizerBPE.Count(content)
	if err != nil {
		return fallbackTokenCount(content)
	}
	return n
}

// fallbackTokenCount approximates the token count of content from its byte
// length, without consulting a tokenizer.
//
// Inputs shorter than 30 bytes cost one token per 3 bytes and inputs shorter
// than 100 bytes cost one token per 4 bytes, both rounded up. For longer
// inputs the whitespace ratio is measured over at most the first 200 bytes and
// applied to the whole length: the whitespace share costs one token per byte,
// the remainder one token per 3.5 bytes, and the sum is truncated to an int.
func fallbackTokenCount(content string) int {
	const (
		shortLimit  = 30
		mediumLimit = 100
		sampleLimit = 200
	)

	size := len(content)
	switch {
	case size < shortLimit:
		return (size + 2) / 3
	case size < mediumLimit:
		return (size + 3) / 4
	}

	// Only a bounded prefix is scanned; its ratio stands in for the whole input.
	window := size
	if window > sampleLimit {
		window = sampleLimit
	}

	blanks := 0
	for _, b := range []byte(content[:window]) {
		if b == ' ' || b == '\n' || b == '\t' || b == '\r' {
			blanks++
		}
	}

	blankShare := float64(blanks) / float64(window)
	denseBytes := float64(size) * (1 - blankShare)
	blankBytes := float64(size) * blankShare
	return int(denseBytes/3.5 + blankBytes)
}
15 changes: 15 additions & 0 deletions lefthook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,18 @@ commit-msg:
echo " full guide: https://www.conventionalcommits.org/"
exit 1
fi

strip-co-authored-by:
run: |
# Strip Co-authored-by: trailers that AI tools (Claude, Cursor, etc.) add.
# This enforces the rule that commits list only the human author.
sed '/^[Cc]o-[Aa]uthored-[Bb]y:/d' "{1}" > "{1}.tmp" && mv "{1}.tmp" "{1}"

# ---------------------------------------------------------------------------
# prepare-commit-msg — strip AI co-author trailers after tools inject them.
# ---------------------------------------------------------------------------
prepare-commit-msg:
commands:
strip-co-authored-by:
run: |
sed '/^[Cc]o-[Aa]uthored-[Bb]y:/d' "{1}" > "{1}.tmp" && mv "{1}.tmp" "{1}"
21 changes: 21 additions & 0 deletions scripts/check-ecosystem-boundaries.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Ecosystem boundary guard.
#
# Scans every *.go file under the repository root for imports of
# github.com/GrayCodeAI/hawk/internal/... or the removed
# github.com/GrayCodeAI/hawk/shared/types path.
#
# Exit status:
#   0   no forbidden imports found
#   1   at least one forbidden import found (matches are printed)
#   >1  the search tool itself failed; its status is propagated
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"

readonly PATTERN='github\.com/GrayCodeAI/hawk/(internal/|shared/types)'

# rg and grep both exit 0 on a match, 1 on no match, and >1 on a real error.
# Capture the status instead of discarding it with `|| true`, so that a broken
# search cannot be mistaken for a clean tree.
search_status=0
if command -v rg >/dev/null 2>&1; then
  violations="$(rg -n "${PATTERN}" --glob '*.go' .)" || search_status=$?
else
  violations="$(grep -rn --include='*.go' -E "${PATTERN}" .)" || search_status=$?
fi

if [[ -n "${violations}" ]]; then
  echo "forbidden Hawk imports found:"
  echo "${violations}"
  echo
  echo "support repos must use hawk-core-contracts or local contracts, not hawk/internal or removed hawk/shared/types"
  exit 1
fi

if (( search_status > 1 )); then
  echo "ecosystem boundary guard: import search failed (exit status ${search_status})" >&2
  exit "${search_status}"
fi

echo "ecosystem boundary guard passed"
Loading