Skip to content

Commit 39bbdab

Browse files
authored
Extraction/buffer sizing improvements (#887)
* Extraction/buffer sizing improvements Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> * Reorder scanSinglePath Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> * Re-add comments Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> * Bump max archive size again Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> * Fix comment Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> * Replace panic with return, fix ELF condition Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> * Replace error check for partial file reads Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> * Clean up error conditions Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> * Keep zip limit at GOMAXPROCS Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> * Speed up scans with native-code-serialization Signed-off-by: egibs <20933572+egibs@users.noreply.github.com> --------- Signed-off-by: egibs <20933572+egibs@users.noreply.github.com>
1 parent fd709a3 commit 39bbdab

16 files changed

Lines changed: 464 additions & 237 deletions

File tree

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ install-yara-x: out/$(YARAX_REPO)/.git/commit-$(YARAX_COMMIT)
124124
mkdir -p out/include
125125
cd out/$(YARAX_REPO) && \
126126
cargo install cargo-c --locked && \
127-
cargo cinstall -p yara-x-capi --release --prefix="$(LINT_ROOT)/out" --libdir="$(LINT_ROOT)/out/lib"
127+
cargo cinstall -p yara-x-capi --features=native-code-serialization --release --prefix="$(LINT_ROOT)/out" --libdir="$(LINT_ROOT)/out/lib"
128128

129129
# unit tests only
130130
.PHONY: test

go.mod

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ require (
1313
github.com/charmbracelet/bubbles v0.21.0
1414
github.com/charmbracelet/bubbletea v1.3.4
1515
github.com/charmbracelet/lipgloss v1.1.0
16+
github.com/cosnicolaou/pbzip2 v1.0.5
1617
github.com/egibs/go-debian v0.18.0
1718
github.com/fatih/color v1.18.0
1819
github.com/gabriel-vasile/mimetype v1.4.9
1920
github.com/google/go-cmp v0.7.0
2021
github.com/google/go-containerregistry v0.20.3
2122
github.com/klauspost/compress v1.18.0
23+
github.com/klauspost/pgzip v1.2.6
2224
github.com/olekukonko/tablewriter v0.0.5
2325
github.com/shirou/gopsutil/v4 v4.25.3
2426
github.com/ulikunitz/xz v0.5.12
@@ -45,6 +47,7 @@ require (
4547
github.com/ebitengine/purego v0.8.2 // indirect
4648
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
4749
github.com/go-ole/go-ole v1.3.0 // indirect
50+
github.com/kr/pretty v0.1.0 // indirect
4851
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
4952
github.com/lufia/plan9stats v0.0.0-20240909124753-873cd0166683 // indirect
5053
github.com/mailru/easyjson v0.9.0 // indirect
@@ -74,5 +77,6 @@ require (
7477
golang.org/x/sys v0.32.0 // indirect
7578
golang.org/x/text v0.24.0 // indirect
7679
google.golang.org/protobuf v1.36.3 // indirect
80+
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
7781
pault.ag/go/topsort v0.1.1 // indirect
7882
)

go.sum

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQ
3030
github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
3131
github.com/containerd/stargz-snapshotter/estargz v0.16.3 h1:7evrXtoh1mSbGj/pfRccTampEyKpjpOnS3CyiV1Ebr8=
3232
github.com/containerd/stargz-snapshotter/estargz v0.16.3/go.mod h1:uyr4BfYfOj3G9WBVE8cOlQmXAbPN9VEQpBBeJIuOipU=
33+
github.com/cosnicolaou/pbzip2 v1.0.5 h1:+PZ8yRBx6bRXncOJWQvEThyFm8XhF9Yb6WUMN6KsgrA=
34+
github.com/cosnicolaou/pbzip2 v1.0.5/go.mod h1:uCNfm0iE2wIKGRlLyq31M4toziFprNhEnvueGmh5u3M=
3335
github.com/cpuguy83/go-md2man/v2 v2.0.6 h1:XJtiaUW6dEEqVuZiMTn1ldk455QWwEIsMIJlo5vtkx0=
3436
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
3537
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -61,6 +63,13 @@ github.com/google/go-containerregistry v0.20.3 h1:oNx7IdTI936V8CQRveCjaxOiegWwvM
6163
github.com/google/go-containerregistry v0.20.3/go.mod h1:w00pIgBRDVUDFM6bq+Qx8lwNWK+cxgCuX1vd3PIBDNI=
6264
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
6365
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
66+
github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
67+
github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
68+
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
69+
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
70+
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
71+
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
72+
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
6473
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
6574
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
6675
github.com/lufia/plan9stats v0.0.0-20240909124753-873cd0166683 h1:7UMa6KCCMjZEMDtTVdcGu0B1GmmC7QJKiCCjyTAWQy0=
@@ -149,8 +158,9 @@ golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
149158
golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
150159
google.golang.org/protobuf v1.36.3 h1:82DV7MYdb8anAVi3qge1wSnMDrnKK7ebr+I0hHRN1BU=
151160
google.golang.org/protobuf v1.36.3/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
152-
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
153161
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
162+
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
163+
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
154164
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
155165
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
156166
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

pkg/action/scan.go

Lines changed: 87 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ var (
4848
)
4949

5050
// scanSinglePath YARA scans a single path and converts it to a fileReport.
51+
//
52+
//nolint:cyclop // ignore complexity of 38
5153
func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleFS []fs.FS, absPath string, archiveRoot string) (*malcontent.FileReport, error) {
5254
if ctx.Err() != nil {
5355
return &malcontent.FileReport{}, ctx.Err()
@@ -56,30 +58,22 @@ func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleF
5658
logger := clog.FromContext(ctx)
5759
logger = logger.With("path", path)
5860

59-
var yrs *yarax.Rules
60-
var err error
61-
if c.Rules == nil {
62-
yrs, err = CachedRules(ctx, ruleFS)
63-
if err != nil {
64-
return nil, fmt.Errorf("rules: %w", err)
65-
}
66-
} else {
67-
yrs = c.Rules
68-
}
61+
isArchive := archiveRoot != ""
6962

70-
initializeOnce.Do(func() {
71-
filePool = pool.NewBufferPool()
72-
scannerPool = pool.NewScannerPool(yrs, c.Concurrency)
73-
})
63+
fi, err := os.Stat(path)
64+
if err != nil {
65+
return nil, err
66+
}
7467

75-
scanner := scannerPool.Get()
76-
// Scanner should not be nil here, but guard against it anyway
77-
if scanner == nil {
78-
scanner = yarax.NewScanner(yrs)
68+
size := fi.Size()
69+
if size == 0 {
70+
fr := &malcontent.FileReport{Skipped: "zero-sized file", Path: path}
71+
if isArchive {
72+
defer os.RemoveAll(path)
73+
}
74+
return fr, nil
7975
}
80-
defer scannerPool.Put(scanner)
8176

82-
isArchive := archiveRoot != ""
8377
mime := "<unknown>"
8478
kind, err := programkind.File(path)
8579
if err != nil && !interactive(c) {
@@ -88,6 +82,7 @@ func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleF
8882
if kind != nil {
8983
mime = kind.MIME
9084
}
85+
9186
if !c.IncludeDataFiles && kind == nil {
9287
logger.Debugf("skipping %s [%s]: data file or empty", path, mime)
9388
fr := &malcontent.FileReport{Skipped: "data file or empty", Path: path}
@@ -99,33 +94,51 @@ func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleF
9994
}
10095
logger = logger.With("mime", mime)
10196

102-
f, err := os.Open(path)
103-
if err != nil {
104-
return nil, err
97+
var yrs *yarax.Rules
98+
if c.Rules == nil {
99+
yrs, err = CachedRules(ctx, ruleFS)
100+
if err != nil {
101+
return nil, fmt.Errorf("rules: %w", err)
102+
}
103+
} else {
104+
yrs = c.Rules
105105
}
106-
defer f.Close()
107106

108-
fi, err := f.Stat()
109-
if err != nil {
110-
return nil, err
107+
initializeOnce.Do(func() {
108+
filePool = pool.NewBufferPool(c.Concurrency + 1)
109+
scannerPool = pool.NewScannerPool(yrs, c.Concurrency+1)
110+
})
111+
112+
scanner := scannerPool.Get()
113+
if scanner == nil {
114+
scanner = yarax.NewScanner(yrs)
111115
}
112-
size := fi.Size()
116+
defer scannerPool.Put(scanner)
113117

114-
if size == 0 {
115-
fr := &malcontent.FileReport{Skipped: "zero-sized file", Path: path}
116-
defer os.RemoveAll(path)
117-
return fr, nil
118+
f, err := os.Open(path)
119+
if err != nil {
120+
return nil, err
118121
}
122+
defer f.Close()
119123

120124
fc := filePool.Get(size)
121125
defer filePool.Put(fc)
122-
if _, err := io.ReadFull(f, fc); err != nil {
123-
return nil, err
126+
127+
var bytesRead int
128+
var totalRead int64
129+
for totalRead < size {
130+
bytesRead, err = f.Read(fc[totalRead:])
131+
if errors.Is(err, io.EOF) {
132+
break
133+
}
134+
if err != nil {
135+
return nil, err
136+
}
137+
totalRead += int64(bytesRead)
124138
}
125139

126-
// Immediately remove archive files read into memory
127-
if isArchive {
128-
defer os.RemoveAll(path)
140+
if totalRead < size && err != nil {
141+
return nil, fmt.Errorf("incomplete read: got %d bytes, expected %d: %w", totalRead, size, err)
129142
}
130143

131144
mrs, err := scanner.Scan(fc)
@@ -137,9 +150,12 @@ func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleF
137150
// If running a scan, only generate reports for mrs that satisfy the risk threshold of 3
138151
// This is a short-circuit that avoids any report generation logic
139152
risk := report.HighestMatchRisk(mrs)
140-
if c.Scan && risk < 3 {
153+
threshold := max(3, c.MinFileRisk, c.MinRisk)
154+
if c.Scan && risk < threshold {
141155
fr := &malcontent.FileReport{Skipped: "overall risk too low for scan", Path: path}
142-
os.RemoveAll(path)
156+
if isArchive {
157+
os.RemoveAll(path)
158+
}
143159
return fr, nil
144160
}
145161

@@ -166,15 +182,13 @@ func scanSinglePath(ctx context.Context, c malcontent.Config, path string, ruleF
166182
fr.ArchiveRoot = archiveRootAbs
167183
fr.FullPath = pathAbs
168184
clean = formatPath(cleanPath(pathAbs, archiveRootAbs))
169-
}
170185

171-
// If absPath is provided, use it instead of the path if they are different.
172-
// This is useful when scanning images and archives.
173-
if absPath != "" && absPath != path && (isArchive || c.OCI) {
174-
if len(c.TrimPrefixes) > 0 {
175-
absPath = report.TrimPrefixes(absPath, c.TrimPrefixes)
186+
if absPath != "" && absPath != path && (isArchive || c.OCI) {
187+
if len(c.TrimPrefixes) > 0 {
188+
absPath = report.TrimPrefixes(absPath, c.TrimPrefixes)
189+
}
190+
fr.Path = fmt.Sprintf("%s ∴ %s", absPath, clean)
176191
}
177-
fr.Path = fmt.Sprintf("%s ∴ %s", absPath, clean)
178192
}
179193

180194
if len(fr.Behaviors) == 0 {
@@ -396,17 +410,17 @@ func processPaths(ctx context.Context, paths []string, scanInfo scanPathInfo, c
396410
cancel()
397411
}()
398412

399-
g := setupErrorGroup(maxConcurrency)
413+
g, gCtx := errgroup.WithContext(scanCtx)
414+
g.SetLimit(maxConcurrency)
400415

401-
setupMatchHandler(scanCtx, matchChan, c, cancel, logger)
416+
setupMatchHandler(gCtx, matchChan, c, cancel, logger)
402417

403418
pc := make(chan string, len(paths))
404419
go func() {
405420
defer close(pc)
406-
407421
for _, path := range paths {
408422
select {
409-
case <-scanCtx.Done():
423+
case <-gCtx.Done():
410424
return
411425
case pc <- path:
412426
}
@@ -415,10 +429,10 @@ func processPaths(ctx context.Context, paths []string, scanInfo scanPathInfo, c
415429

416430
for path := range pc {
417431
g.Go(func() error {
418-
if scanCtx.Err() != nil {
432+
if gCtx.Err() != nil {
419433
return scanCtx.Err()
420434
}
421-
return processPath(scanCtx, path, scanInfo, c, r, matchChan, matchOnce, logger)
435+
return processPath(gCtx, path, scanInfo, c, r, matchChan, matchOnce, logger)
422436
})
423437
}
424438

@@ -447,21 +461,6 @@ func getMaxConcurrency(configured int) int {
447461
return configured
448462
}
449463

450-
func createPathChannel(paths []string) chan string {
451-
pc := make(chan string, len(paths))
452-
for _, path := range paths {
453-
pc <- path
454-
}
455-
close(pc)
456-
return pc
457-
}
458-
459-
func setupErrorGroup(maxConcurrency int) *errgroup.Group {
460-
g := &errgroup.Group{}
461-
g.SetLimit(maxConcurrency)
462-
return g
463-
}
464-
465464
func setupMatchHandler(ctx context.Context, matchChan chan matchResult, c malcontent.Config, cancel context.CancelFunc, logger *clog.Logger) {
466465
if ctx.Err() != nil {
467466
return
@@ -634,13 +633,12 @@ func processArchive(ctx context.Context, c malcontent.Config, rfs []fs.FS, archi
634633
return nil, fmt.Errorf("extract to temp: %w", err)
635634
}
636635
// Ensure that tmpRoot is removed before returning if created successfully
637-
if tmpRoot != "" {
638-
defer func() {
639-
if err := os.RemoveAll(tmpRoot); err != nil {
640-
logger.Errorf("remove %s: %v", tmpRoot, err)
641-
}
642-
}()
643-
}
636+
defer func() {
637+
if err := os.RemoveAll(tmpRoot); err != nil {
638+
logger.Errorf("remove %s: %v", tmpRoot, err)
639+
}
640+
}()
641+
644642
// macOS will prefix temporary directories with `/private`
645643
// update tmpRoot with this prefix to allow strings.TrimPrefix to work
646644
if runtime.GOOS == "darwin" {
@@ -652,16 +650,28 @@ func processArchive(ctx context.Context, c malcontent.Config, rfs []fs.FS, archi
652650
return nil, fmt.Errorf("find: %w", err)
653651
}
654652

653+
ep := make(chan string, len(extractedPaths))
654+
go func() {
655+
defer close(ep)
656+
for _, path := range extractedPaths {
657+
select {
658+
case <-ctx.Done():
659+
return
660+
case ep <- path:
661+
}
662+
}
663+
}()
664+
655665
maxConcurrency := getMaxConcurrency(c.Concurrency)
656666
scanCtx, cancel := context.WithCancel(ctx)
657667
defer cancel()
658668

659-
g := setupErrorGroup(maxConcurrency)
669+
g, gCtx := errgroup.WithContext(scanCtx)
670+
g.SetLimit(maxConcurrency)
660671

661-
ep := createPathChannel(extractedPaths)
662672
for path := range ep {
663673
g.Go(func() error {
664-
fr, err := processFile(scanCtx, c, rfs, path, archivePath, tmpRoot, logger)
674+
fr, err := processFile(gCtx, c, rfs, path, archivePath, tmpRoot, logger)
665675
if err != nil {
666676
return err
667677
}

0 commit comments

Comments
 (0)