Skip to content

Commit fe2b5f5

Browse files
authored
Persist compiled rules locally rather than in-memory (#1122)
We currently persist compiled rules in memory, which only works for the duration of a single mal invocation. When we run successive mal scans (usually when looping over specific files or directories to produce per-scan result files), every run pays the rule compilation overhead of at least several seconds, which becomes extremely slow when repeated dozens of times. This PR instead stores the compiled rules locally in the user's cache directory, in a file named after the hash of the rule sources. That file is read each time mal is run and is only recreated if it does not exist or cannot be loaded. I also added tests and benchmarks so we can validate that this works and is faster (which it is, by a factor of 10-12x). Signed-off-by: egibs <20933572+egibs@users.noreply.github.com>
1 parent a81fc70 commit fe2b5f5

3 files changed

Lines changed: 558 additions & 1 deletion

File tree

pkg/action/scan.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@ func CachedRules(ctx context.Context, fss []fs.FS) (*yarax.Rules, error) {
307307
var err error
308308
compileOnce.Do(func() {
309309
var yrs *yarax.Rules
310-
yrs, err = compile.Recursive(ctx, fss)
310+
yrs, err = compile.RecursiveCached(ctx, fss)
311311
if err != nil {
312312
err = fmt.Errorf("compile: %w", err)
313313
return

pkg/compile/compile.go

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,14 @@ import (
77
"context"
88
"fmt"
99
"io/fs"
10+
"log/slog"
11+
"os"
1012
"path/filepath"
1113
"regexp"
1214
"strings"
1315

16+
"github.com/minio/sha256-simd"
17+
1418
"github.com/chainguard-dev/clog"
1519
"github.com/chainguard-dev/malcontent/rules"
1620

@@ -221,3 +225,129 @@ func Recursive(ctx context.Context, fss []fs.FS) (*yarax.Rules, error) {
221225

222226
return yxc.Build(), nil
223227
}
228+
229+
// getCacheDir resolves the directory that holds compiled rule caches and makes
// sure it exists.
//
// The directory is "malcontent" under os.UserCacheDir. When the user cache
// directory cannot be determined, "malcontent-cache" under os.TempDir is used
// instead. The directory (and any missing parents) is created with mode 0o755;
// a creation failure is returned wrapped with "create cache dir".
func getCacheDir() (string, error) {
	// Start from the temp-dir fallback and override it when a per-user cache
	// location is available.
	dir := filepath.Join(os.TempDir(), "malcontent-cache")
	if base, err := os.UserCacheDir(); err == nil {
		dir = filepath.Join(base, "malcontent")
	}

	if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil {
		return "", fmt.Errorf("create cache dir: %w", mkErr)
	}
	return dir, nil
}
245+
246+
// loadCachedRules attempts to load rules from the local, compiled rules.
247+
func loadCachedRules(cacheFile string) (*yarax.Rules, error) {
248+
file, err := os.Open(cacheFile)
249+
if err != nil {
250+
return nil, err
251+
}
252+
defer file.Close()
253+
254+
compiledRules, err := yarax.ReadFrom(file)
255+
if err != nil {
256+
return nil, fmt.Errorf("read cached rules: %w", err)
257+
}
258+
259+
return compiledRules, nil
260+
}
261+
262+
// saveCachedRules saves rules to a local file.
263+
func saveCachedRules(compiledRules *yarax.Rules, cacheFile string) error {
264+
tmpFile := cacheFile + ".tmp"
265+
file, err := os.OpenFile(tmpFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644)
266+
if err != nil {
267+
return fmt.Errorf("create cache file: %w", err)
268+
}
269+
defer file.Close()
270+
271+
if _, err := compiledRules.WriteTo(file); err != nil {
272+
os.Remove(tmpFile)
273+
return fmt.Errorf("write rules to cache: %w", err)
274+
}
275+
276+
if err := os.Rename(tmpFile, cacheFile); err != nil {
277+
os.Remove(tmpFile)
278+
return fmt.Errorf("rename cache file: %w", err)
279+
}
280+
281+
return nil
282+
}
283+
284+
// getRulesHash computes a hex-encoded SHA-256 digest of the rule sources in
// fss, for use as a cache key.
//
// Each filesystem is walked from "." in fs.WalkDir order. Every non-directory
// entry whose extension is ".yara" or ".yar" contributes its path and its
// content to the digest; all other files are ignored.
//
// Path and content are each framed with their byte length. Without framing,
// the hashed stream is a plain concatenation, so different rule trees (for
// example an empty "a.yara" followed by "b.yara", versus a single "a.yara"
// whose content starts with "b.yara") would yield the same digest and
// therefore share a cache entry.
//
// It returns ctx.Err() if ctx is already done or becomes done during the
// walk, and otherwise the first error reported while walking or reading a
// file.
func getRulesHash(ctx context.Context, fss []fs.FS) (string, error) {
	if err := ctx.Err(); err != nil {
		return "", err
	}

	hasher := sha256.New()

	for _, fsys := range fss {
		err := fs.WalkDir(fsys, ".", func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				return err
			}
			// Stop promptly on cancellation instead of reading the rest of the tree.
			if err := ctx.Err(); err != nil {
				return err
			}
			if d.IsDir() {
				return nil
			}

			switch filepath.Ext(path) {
			case ".yara", ".yar":
			default:
				return nil
			}

			content, err := fs.ReadFile(fsys, path)
			if err != nil {
				return err
			}

			// Length-prefix both fields so field boundaries are unambiguous.
			// Writes to a hash.Hash never return an error.
			fmt.Fprintf(hasher, "%d:%s:%d:", len(path), path, len(content))
			hasher.Write(content)
			return nil
		})
		if err != nil {
			return "", err
		}
	}

	return fmt.Sprintf("%x", hasher.Sum(nil)), nil
}
317+
318+
// RecursiveCached compiles rules with persistent disk caching to avoid penalizing successive executions with repeated rule compilations.
319+
func RecursiveCached(ctx context.Context, fss []fs.FS) (*yarax.Rules, error) {
320+
if ctx.Err() != nil {
321+
return nil, ctx.Err()
322+
}
323+
324+
cacheDir, cacheErr := getCacheDir()
325+
if cacheErr != nil {
326+
return Recursive(ctx, fss)
327+
}
328+
329+
hash, hashErr := getRulesHash(ctx, fss)
330+
if hashErr != nil {
331+
return Recursive(ctx, fss)
332+
}
333+
334+
cacheFile := filepath.Join(cacheDir, fmt.Sprintf("rules-%s.cache", hash))
335+
if cachedRules, loadErr := loadCachedRules(cacheFile); loadErr == nil {
336+
slog.Debug("Loaded rules from cache", "file", cacheFile)
337+
return cachedRules, nil
338+
}
339+
340+
slog.Debug("Cache miss, compiling rules", "file", cacheFile)
341+
compiledRules, err := Recursive(ctx, fss)
342+
if err != nil {
343+
return nil, fmt.Errorf("compile: %w", err)
344+
}
345+
346+
if saveErr := saveCachedRules(compiledRules, cacheFile); saveErr != nil {
347+
slog.Warn("Failed to save rules to cache", "error", saveErr)
348+
} else {
349+
slog.Debug("Saved rules to cache", "file", cacheFile)
350+
}
351+
352+
return compiledRules, nil
353+
}

0 commit comments

Comments
 (0)