Skip to content

Commit e1c2447

Browse files
authored
feat: support multi-part archive downloads (#193)
## Summary

- Adds a `parts` field to the archive source config as a mutually-exclusive alternative to `file`
- Each part (URL or local path) is downloaded and cached independently, then all parts are concatenated into a single cached archive before extraction
- Useful when the archive exceeds per-asset upload size limits (e.g. GitHub Releases)
- Per-part and combined files both use sha256-derived cache keys so re-runs skip redundant downloads

### Example config

```yaml
runner:
  benchmark:
    tests:
      source:
        archive:
          parts:
            - https://github.com/org/repo/releases/download/v1.0.0/tests.tar.gz.00.part
            - https://github.com/org/repo/releases/download/v1.0.0/tests.tar.gz.01.part
          steps:
            test:
              - "testing/*.txt"
```

## Test plan

- [x] New unit tests for parts download + caching (`TestArchiveSource_PrepareWithParts`)
- [x] New unit tests for local-path parts (`TestArchiveSource_PrepareWithLocalParts`)
- [x] New unit tests for source info (`TestArchiveSource_GetSourceInfo_Parts`)
- [x] Validation tests for mutual exclusivity
- [x] All existing archive tests still pass
- [x] Run with a real parts-based archive URL and verify extraction + test discovery
1 parent 6c58c51 commit e1c2447

10 files changed

Lines changed: 428 additions & 33 deletions

File tree

config.example.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,12 @@ runner:
134134
# # GitHub Actions artifact URLs are auto-converted to API download endpoints.
135135
# # archive:
136136
# # file: https://github.com/NethermindEth/gas-benchmarks/actions/runs/23847558369/artifacts/6222084759
137+
# # # Alternative to `file`: a list of parts to download and concatenate.
138+
# # # Useful when the archive is split across multiple files because of
139+
# # # per-asset size limits. Mutually exclusive with `file`.
140+
# # # parts:
141+
# # # - https://example.com/tests.tar.gz.00.part
142+
# # # - https://example.com/tests.tar.gz.01.part
137143
# # pre_run_steps:
138144
# # - "perf-devnet-3/gas-bump.txt"
139145
# # - "perf-devnet-3/funding.txt"

docs/configuration.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,12 +325,27 @@ tests:
325325

326326
| Option | Type | Required | Description |
327327
|--------|------|----------|-------------|
328-
| `file` | string | Yes | Local path or URL to a ZIP or tar.gz archive. GitHub Actions artifact URLs are auto-converted to API endpoints |
328+
| `file` | string | One of `file`/`parts` | Local path or URL to a ZIP or tar.gz archive. GitHub Actions artifact URLs are auto-converted to API endpoints |
329+
| `parts` | []string | One of `file`/`parts` | Ordered list of local paths or URLs to concatenate into the final archive. Useful when the archive is split because of per-asset size limits. Mutually exclusive with `file` |
329330
| `pre_run_steps` | []string | No | Glob patterns for steps executed once before all tests |
330331
| `steps.setup` | []string | No | Glob patterns for setup phase files |
331332
| `steps.test` | []string | No | Glob patterns for test phase files |
332333
| `steps.cleanup` | []string | No | Glob patterns for cleanup phase files |
333334

335+
**Multi-part archives:** when an archive is too large for a single asset upload, `parts` accepts an ordered list of URLs or local paths. All parts are downloaded (with caching) and concatenated into a single file before extraction:
336+
337+
```yaml
338+
tests:
339+
source:
340+
archive:
341+
parts:
342+
- https://github.com/org/repo/releases/download/v1.0.0/tests.tar.gz.00.part
343+
- https://github.com/org/repo/releases/download/v1.0.0/tests.tar.gz.01.part
344+
steps:
345+
test:
346+
- "testing/*.txt"
347+
```
348+
334349
##### Opcode Source
335350

336351
Optional external opcode metadata can be configured alongside the test source:

pkg/config/config.go

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -334,8 +334,13 @@ type LocalSourceV2 struct {
334334

335335
// ArchiveSourceConfig defines an archive file source for tests.
336336
// The file can be a local path or a URL (HTTP/HTTPS) to a ZIP or tar.gz archive.
337+
// Alternatively, `parts` can be a list of paths/URLs that are concatenated in
338+
// order into the final archive (useful when the archive is split across
339+
// multiple files because of per-asset size limits). `file` and `parts` are
340+
// mutually exclusive.
337341
type ArchiveSourceConfig struct {
338-
File string `yaml:"file" mapstructure:"file"`
342+
File string `yaml:"file,omitempty" mapstructure:"file"`
343+
Parts []string `yaml:"parts,omitempty" mapstructure:"parts"`
339344
PreRunSteps []string `yaml:"pre_run_steps,omitempty" mapstructure:"pre_run_steps"`
340345
Steps *StepsConfig `yaml:"steps,omitempty" mapstructure:"steps"`
341346
}
@@ -1145,8 +1150,15 @@ func (s *SourceConfig) Validate() error {
11451150
}
11461151

11471152
if s.Archive != nil {
1148-
if s.Archive.File == "" {
1149-
return fmt.Errorf("archive.file is required")
1153+
hasFile := s.Archive.File != ""
1154+
hasParts := len(s.Archive.Parts) > 0
1155+
1156+
if !hasFile && !hasParts {
1157+
return fmt.Errorf("archive.file or archive.parts is required")
1158+
}
1159+
1160+
if hasFile && hasParts {
1161+
return fmt.Errorf("archive.file and archive.parts are mutually exclusive")
11501162
}
11511163
}
11521164

pkg/config/config_test.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -628,12 +628,35 @@ func TestSourceConfig_Validate(t *testing.T) {
628628
wantErr: false,
629629
},
630630
{
631-
name: "archive missing file",
631+
name: "archive missing file and parts",
632632
source: SourceConfig{
633633
Archive: &ArchiveSourceConfig{},
634634
},
635635
wantErr: true,
636-
errSubstr: "archive.file is required",
636+
errSubstr: "archive.file or archive.parts is required",
637+
},
638+
{
639+
name: "valid archive source with parts",
640+
source: SourceConfig{
641+
Archive: &ArchiveSourceConfig{
642+
Parts: []string{
643+
"https://example.com/fixtures.tar.gz.00.part",
644+
"https://example.com/fixtures.tar.gz.01.part",
645+
},
646+
},
647+
},
648+
wantErr: false,
649+
},
650+
{
651+
name: "archive file and parts are mutually exclusive",
652+
source: SourceConfig{
653+
Archive: &ArchiveSourceConfig{
654+
File: "https://example.com/fixtures.tar.gz",
655+
Parts: []string{"https://example.com/fixtures.tar.gz.00.part"},
656+
},
657+
},
658+
wantErr: true,
659+
errSubstr: "mutually exclusive",
637660
},
638661
{
639662
name: "multiple sources not allowed - archive and git",

pkg/executor/archive_source.go

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"crypto/sha256"
66
"encoding/hex"
77
"fmt"
8+
"io"
89
"os"
910
"path/filepath"
1011
"regexp"
@@ -91,6 +92,7 @@ func (s *ArchiveSource) Cleanup() error {
9192
func (s *ArchiveSource) GetSourceInfo() (*SuiteSource, error) {
9293
info := &ArchiveSourceInfo{
9394
File: s.cfg.File,
95+
Parts: s.cfg.Parts,
9496
PreRunSteps: s.cfg.PreRunSteps,
9597
}
9698

@@ -107,7 +109,13 @@ func (s *ArchiveSource) GetSourceInfo() (*SuiteSource, error) {
107109

108110
// resolveFile returns the local path to the archive file. For URLs, it checks
109111
// the cache directory first and only downloads if the file is not already cached.
112+
// When the config uses `parts`, the parts are downloaded (with caching) and
113+
// concatenated into a single cached file.
110114
func (s *ArchiveSource) resolveFile(ctx context.Context) (string, error) {
115+
if len(s.cfg.Parts) > 0 {
116+
return s.resolvePartsFile(ctx)
117+
}
118+
111119
file := s.cfg.File
112120

113121
if strings.HasPrefix(file, "http://") || strings.HasPrefix(file, "https://") {
@@ -181,6 +189,149 @@ func (s *ArchiveSource) cachedArchivePath() string {
181189
return filepath.Join(cacheDir, name)
182190
}
183191

192+
// resolvePartsFile downloads (with caching) all configured parts and
193+
// concatenates them in order into a single cached file, returning its path.
194+
// Local paths and URLs can be mixed freely in the parts list.
195+
func (s *ArchiveSource) resolvePartsFile(ctx context.Context) (string, error) {
196+
cacheDir := s.cacheDir
197+
if cacheDir == "" {
198+
cacheDir = os.TempDir()
199+
}
200+
201+
// Combined cache key is derived from the full ordered parts list so any
202+
// change to the list produces a fresh combined file.
203+
combined := sha256.Sum256([]byte(strings.Join(s.cfg.Parts, "\n")))
204+
combinedPath := filepath.Join(cacheDir, "archive-parts-"+hex.EncodeToString(combined[:8]))
205+
206+
if _, err := os.Stat(combinedPath); err == nil {
207+
s.log.WithField("path", combinedPath).Info("Using cached combined archive")
208+
209+
return combinedPath, nil
210+
}
211+
212+
if err := os.MkdirAll(cacheDir, 0755); err != nil {
213+
return "", fmt.Errorf("creating cache directory: %w", err)
214+
}
215+
216+
// Resolve each part to a local file path (downloading URLs as needed).
217+
partPaths := make([]string, 0, len(s.cfg.Parts))
218+
219+
for i, part := range s.cfg.Parts {
220+
s.log.WithFields(logrus.Fields{
221+
"part": i + 1,
222+
"total": len(s.cfg.Parts),
223+
"ref": part,
224+
}).Info("Resolving archive part")
225+
226+
partPath, err := s.resolvePart(ctx, part, cacheDir)
227+
if err != nil {
228+
return "", fmt.Errorf("resolving part %d (%s): %w", i+1, part, err)
229+
}
230+
231+
partPaths = append(partPaths, partPath)
232+
}
233+
234+
// Concatenate all parts into a temporary file, then atomically rename.
235+
tmpPath := combinedPath + ".tmp"
236+
237+
if err := concatFiles(tmpPath, partPaths); err != nil {
238+
_ = os.Remove(tmpPath)
239+
240+
return "", fmt.Errorf("concatenating parts: %w", err)
241+
}
242+
243+
if err := os.Rename(tmpPath, combinedPath); err != nil {
244+
_ = os.Remove(tmpPath)
245+
246+
return "", fmt.Errorf("caching combined archive: %w", err)
247+
}
248+
249+
s.log.WithField("path", combinedPath).Info("Combined archive parts")
250+
251+
return combinedPath, nil
252+
}
253+
254+
// resolvePart resolves a single part reference (URL or local path) to a local
255+
// file path, downloading and caching remote parts in cacheDir.
256+
func (s *ArchiveSource) resolvePart(ctx context.Context, part, cacheDir string) (string, error) {
257+
if strings.HasPrefix(part, "http://") || strings.HasPrefix(part, "https://") {
258+
hash := sha256.Sum256([]byte(part))
259+
cachedPath := filepath.Join(cacheDir, "archive-part-"+hex.EncodeToString(hash[:8]))
260+
261+
if _, err := os.Stat(cachedPath); err == nil {
262+
s.log.WithFields(logrus.Fields{
263+
"url": part,
264+
"path": cachedPath,
265+
}).Info("Using cached archive part")
266+
267+
return cachedPath, nil
268+
}
269+
270+
downloadURL, token := s.resolveDownloadURL(part)
271+
272+
tmpPath := cachedPath + ".tmp"
273+
274+
if err := downloadToFile(ctx, downloadURL, tmpPath, token, s.log); err != nil {
275+
_ = os.Remove(tmpPath)
276+
277+
return "", err
278+
}
279+
280+
if err := os.Rename(tmpPath, cachedPath); err != nil {
281+
_ = os.Remove(tmpPath)
282+
283+
return "", fmt.Errorf("caching archive part: %w", err)
284+
}
285+
286+
return cachedPath, nil
287+
}
288+
289+
// Local file path — resolve relative paths.
290+
if !filepath.IsAbs(part) {
291+
absPath, err := filepath.Abs(part)
292+
if err != nil {
293+
return "", fmt.Errorf("resolving path %q: %w", part, err)
294+
}
295+
296+
part = absPath
297+
}
298+
299+
if _, err := os.Stat(part); os.IsNotExist(err) {
300+
return "", fmt.Errorf("archive part %q does not exist", part)
301+
}
302+
303+
return part, nil
304+
}
305+
306+
// concatFiles concatenates the src files, in order, into dst. dst is created
// (or truncated if it already exists) and each source is streamed into it, so
// memory usage does not grow with file size. An empty srcs list yields an
// empty dst.
//
// On failure the returned error names the file and operation that failed. dst
// may be left partially written; removing it is the caller's responsibility.
func concatFiles(dst string, srcs []string) error {
	out, err := os.Create(dst)
	if err != nil {
		return fmt.Errorf("creating %s: %w", dst, err)
	}

	for _, src := range srcs {
		if err := appendFileTo(out, src); err != nil {
			// The copy error is the one worth reporting; the close result
			// on an already-failed write adds nothing.
			_ = out.Close()

			return err
		}
	}

	// Close exactly once and check the result: for a written file this is
	// where deferred write errors surface.
	if err := out.Close(); err != nil {
		return fmt.Errorf("closing %s: %w", dst, err)
	}

	return nil
}

// appendFileTo streams the contents of the file at src into out.
func appendFileTo(out io.Writer, src string) error {
	in, err := os.Open(src) //nolint:gosec // trusted paths under our cache
	if err != nil {
		return fmt.Errorf("opening %s: %w", src, err)
	}

	if _, err := io.Copy(out, in); err != nil {
		_ = in.Close() // the copy error takes precedence

		return fmt.Errorf("copying %s: %w", src, err)
	}

	if err := in.Close(); err != nil {
		return fmt.Errorf("closing %s: %w", src, err)
	}

	return nil
}
334+
184335
// resolveDownloadURL converts browser URLs to API URLs where needed and returns
185336
// the appropriate auth token. For GitHub Actions artifact URLs, it converts to
186337
// the GitHub API download endpoint with bearer token auth.

0 commit comments

Comments
 (0)