Skip to content

Commit 7778d70

Browse files
authored
feat: lake: conditional cache overwrites (#14036)
This PR refines when and how Lake decides to overwrite data while caching, and has Lake prefer outputs in a local trace file over those stored in the cache. **Key details** * Introduces an overwrite toggle controlled by the new options `--no-overwrite` and `--force-overwrite`. The default for `lake cache add` is `--force-overwrite` and the default for `lake cache stage/unstage` is `--no-overwrite`. * With `--force-overwrite`, already existing mappings and artifacts are overwritten. With `--no-overwrite`, they are not. * Existing trace's outputs will not overwrite an existing cached input-to-output mapping (so they may diverge). This avoids writing to the cache on every trace read. * The `ltar` output is retained in the module's cached input-to-output mapping after unpacking from the cache. Closes #13997.
1 parent 4e28544 commit 7778d70

8 files changed

Lines changed: 136 additions & 30 deletions

File tree

src/lake/Lake/Build/Common.lean

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,9 @@ open ResolveOutputs in
538538
/--
539539
Retrieve artifacts from the Lake cache using only the outputs stored in the saved trace file.
540540
541+
If the cache is writable, saves the input-to-output mapping derived from the trace to the cache,
542+
unless a mapping for the `inputHash` already exists.
543+
541544
**For internal use only.**
542545
-/
543546
@[specialize] public def getArtifactsUsingTrace?
@@ -549,7 +552,7 @@ Retrieve artifacts from the Lake cache using only the outputs stored in the save
549552
try
550553
let arts ← resolveOutputs (.ofData out)
551554
if (← pkg.isArtifactCacheWritable) then
552-
let act := (← getLakeCache).writeOutputs pkg.cacheScope inputHash out
555+
let act := (← getLakeCache).writeOutputs pkg.cacheScope inputHash out (overwrite := false)
553556
if let .error e ← act.toBaseIO then
554557
logWarning s!"could not write outputs to cache: {e}"
555558
return some arts
@@ -560,17 +563,17 @@ Retrieve artifacts from the Lake cache using only the outputs stored in the save
560563
open ResolveOutputs in
561564
/--
562565
Retrieve artifacts from the Lake cache using the outputs stored in either
563-
the saved trace file or (unless `traceOnly` is `true`) in the cached input-to-content mapping.
566+
the saved trace file or in the cached input-to-content mapping.
564567
565568
**For internal use only.**
566569
-/
567570
@[inline] public nonrec def getArtifacts?
568571
[ResolveOutputs α] (inputHash : Hash) (savedTrace : SavedTrace) (pkg : Package)
569572
: JobM (Option α) := do
570-
if let some a ← getArtifactsUsingCache? inputHash pkg then
571-
return some a
572573
if let some a ← getArtifactsUsingTrace? inputHash savedTrace pkg then
573574
return some a
575+
if let some a ← getArtifactsUsingCache? inputHash pkg then
576+
return some a
574577
return none
575578

576579
/-- **For internal use only.** -/

src/lake/Lake/Build/Module.lean

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -871,6 +871,7 @@ def Module.recBuildLtar (self : Module) : FetchM (Job FilePath) := do
871871
modify ({· with wantsRebuild := true})
872872
error "archive does not exist and needs to be built"
873873
else
874+
updateAction .build
874875
self.packLtar arts
875876
newTrace s!"{self.name.toString}:ltar"
876877
addTrace art.trace
@@ -948,7 +949,11 @@ where
948949
-- end up in the build directory and, if writable, the cache
949950
let arts ← mod.restoreAllArtifacts {arts with ltar? := some ltar}
950951
if (← mod.pkg.isArtifactCacheWritable) then
951-
.inl <$> mod.cacheOutputArtifacts setup.isModule restoreAll
952+
let arts ← mod.cacheOutputArtifacts setup.isModule restoreAll
953+
-- Note: Cache service metadata is not preserved on an output update because it would
954+
-- result in downloading module outputs that are not available on the remote.
955+
(← getLakeCache).writeOutputs mod.pkg.cacheScope inputHash arts.descrs (overwrite := true)
956+
return .inl arts
952957
else
953958
return .inl arts
954959
else

src/lake/Lake/CLI/Help.lean

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,7 @@ artifacts. If no mappings are found, Lake will backtrack the Git history up to
455455
will search the repository's entire history (or as far as Git will allow).
456456
457457
By default, Lake will download both the input-to-output mappings and the
458-
output artifacts for a package. By using `--mappings-onlys`, Lake will only
458+
output artifacts for a package. By using `--mappings-only`, Lake will only
459459
download the mappings and delay downloading artifacts until they are needed.
460460
461461
If a download for an artifact fails or the download process for a whole
@@ -509,12 +509,16 @@ OPTIONS:
509509
--service=<name> cache service to fetch from on demand
510510
--scope=<remote-scope> the prefix of artifacts within the service
511511
--repo=<github-repo> for Reservoir, a GitHub repository scope
512+
--no-overwrite do not overwrite existing mappings
512513
513514
Reads a list of input-to-output mappings from the provided file and adds
514-
them to the local Lake cache. If `--service` is provided, the output artifacts
515-
can then be fetched lazily from that service during a Lake build. The service
516-
must either be `reservoir` or be configured through the Lake system
517-
configuration (see the help page of `lake cache services` for details).
515+
them to the local Lake cache. Mappings already in the cache are overwritten
516+
unless `--no-overwrite` is specified.
517+
518+
If `--service` is provided, the output artifacts can then be fetched lazily
519+
from that service during a Lake build. The service must either be `reservoir`
520+
or be configured through the Lake system configuration (see the help page of
521+
`lake cache services` for details).
518522
519523
Since Lake does not currently use cryptographically secure hashes for
520524
artifacts and outputs, artifacts in a cache service are prefixed with a scope
@@ -526,25 +530,28 @@ def helpCacheStage :=
526530
"Copy build outputs from the cache to a staging directory
527531
528532
USAGE:
529-
lake cache stage <mappings> <staging-directory>
533+
lake cache stage <mappings> <staging-directory> [--force-overwrite]
530534
531-
Creates the staging directory and copies the mappings file to it. Then, it
532-
copies all artifacts described within the mappings file from the cache to the
533-
staging directory. Errors if any of the artifacts described cannot be found in
534-
the cache."
535+
Creates the staging directory and copies the mappings file to it. Then,
536+
it copies all artifacts described within the mappings file from the cache to
537+
the staging directory. Artifacts in the staging directory are not overwritten
538+
unless `--force-overwrite` is specified. Errors if any of the artifacts
539+
described cannot be found in the cache."
535540

536541
def helpCacheUnstage :=
537542
"Cache build outputs from a staging directory
538543
539544
USAGE:
540-
lake cache unstage <staging-directory>
545+
lake cache unstage <staging-directory> [--force-overwrite]
541546
542547
Copies the mappings and artifacts stored in staging directory (e.g., via
543548
`lake cache stage`) back into the cache.
544549
545550
Reads the mappings file located at `outputs.jsonl` within the staging
546551
directory and writes the mappings to the Lake cache. Then, it copies the
547-
described artifacts from the staging directory into the cache."
552+
described artifacts from the staging directory into the cache. Mappings and
553+
artifacts already in the cache are not overwritten unless `--force-overwrite`
554+
is specified."
548555

549556
def helpCachePutStaged :=
550557
"Upload build outputs from a staging directory to a remote service

src/lake/Lake/CLI/Main.lean

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ public structure LakeOptions where
6565
outFormat : OutFormat := .text
6666
offline : Bool := false
6767
outputsFile? : Option FilePath := none
68+
overwrite? : Option Bool := none
6869
forceDownload : Bool := false
6970
mappingsOnly : Bool := false
7071
service? : Option String := none
@@ -285,6 +286,8 @@ def lakeLongOption : (opt : String) → CliM PUnit
285286
| "--offline" => modifyThe LakeOptions ({· with offline := true})
286287
| "--wfail" => modifyThe LakeOptions ({· with failLv := .warning})
287288
| "--iofail" => modifyThe LakeOptions ({· with failLv := .info})
289+
| "--no-overwrite" => modifyThe LakeOptions ({· with overwrite? := some false})
290+
| "--force-overwrite" => modifyThe LakeOptions ({· with overwrite? := some true})
288291
| "--force-download" => modifyThe LakeOptions ({· with forceDownload := true})
289292
| "--download-arts" => modifyThe LakeOptions ({· with mappingsOnly := false})
290293
| "--mappings-only" => modifyThe LakeOptions ({· with mappingsOnly := true})
@@ -467,6 +470,10 @@ protected def get : CliM PUnit := do
467470
let cfg ← mkLoadConfig opts
468471
let ws ← loadWorkspace cfg
469472
let cache := ws.lakeCache
473+
let overwrite := opts.overwrite?.getD true
474+
unless overwrite do
475+
-- artifacts of skipped mappings with `--no-overwrite` cannot be cleanly handled
476+
error "`--no-overwrite` is not supported for `cache get`"
470477
if let some file := mappings? then liftM (m := LoggerIO) do
471478
if opts.mappingsOnly then
472479
error "`--mappings-only` is not supported with a mappings file; use `lake cache add` instead"
@@ -487,7 +494,7 @@ protected def get : CliM PUnit := do
487494
else
488495
return ws.defaultCacheService
489496
let map ← CacheMap.load file
490-
cache.writeMap ws.root.cacheScope map service.name? (some remoteScope)
497+
cache.writeMap ws.root.cacheScope map service.name? (some remoteScope) overwrite
491498
let descrs ← map.collectOutputDescrs
492499
service.downloadArtifacts descrs cache remoteScope opts.forceDownload
493500
else
@@ -532,7 +539,7 @@ protected def get : CliM PUnit := do
532539
return map
533540
else
534541
findOutputs cache service pkg remoteScope opts platform toolchain
535-
cache.writeMap pkg.cacheScope map service.name? (some remoteScope)
542+
cache.writeMap pkg.cacheScope map service.name? (some remoteScope) overwrite
536543
unless opts.mappingsOnly do
537544
let descrs ← map.collectOutputDescrs
538545
service.downloadArtifacts descrs cache remoteScope opts.forceDownload
@@ -546,7 +553,7 @@ protected def get : CliM PUnit := do
546553
let toolchain := cacheToolchain pkg toolchain
547554
try
548555
let map ← findOutputs cache service pkg remoteScope opts platform toolchain
549-
cache.writeMap pkg.cacheScope map service.name? (some remoteScope)
556+
cache.writeMap pkg.cacheScope map service.name? (some remoteScope) overwrite
550557
unless opts.mappingsOnly do
551558
let descrs ← map.collectOutputDescrs
552559
service.downloadArtifacts descrs cache remoteScope opts.forceDownload
@@ -679,7 +686,8 @@ protected def add : CliM PUnit := do
679686
error (serviceNotFound service ws.lakeConfig.config.cache.services)
680687
return some (.ofString service)
681688
let map ← CacheMap.load file
682-
ws.lakeCache.writeMap localScope map service? opts.scope?
689+
let overwrite := opts.overwrite?.getD true
690+
ws.lakeCache.writeMap localScope map service? opts.scope? overwrite
683691

684692
private def stagingOutputsFile := "outputs.jsonl"
685693

@@ -701,9 +709,12 @@ protected def stage : CliM PUnit := do
701709
let descrs ← map.collectOutputDescrs
702710
IO.FS.createDirAll stagingDir
703711
copyFile mappingsFile (stagingDir / stagingOutputsFile)
712+
let overwrite := opts.overwrite?.getD false
704713
let ok ← descrs.foldlM (init := true) fun ok descr => do
705714
let cachePath := cache.artifactDir / descr.relPath
706715
let stagingPath := stagingDir / descr.relPath
716+
unless overwrite || !(← stagingPath.pathExists) do
717+
return ok
707718
match (← copyFile cachePath stagingPath |>.toBaseIO) with
708719
| .ok _ =>
709720
return ok
@@ -741,9 +752,16 @@ protected def unstage : CliM PUnit := do
741752
let descrs ← map.collectOutputDescrs
742753
let artDir := ws.lakeCache.artifactDir
743754
IO.FS.createDirAll artDir
755+
let overwrite := opts.overwrite?.getD false
744756
let ok ← descrs.foldlM (init := true) fun ok descr => do
745757
let cachePath := artDir/ descr.relPath
746758
let stagingPath := stagingDir / descr.relPath
759+
if (← cachePath.pathExists) then
760+
if overwrite then
761+
-- Cache artifacts are read-only, so the old artifact must be deleted first.
762+
IO.FS.removeFile cachePath
763+
else
764+
return ok
747765
match (← copyFile stagingPath cachePath |>.toBaseIO) with
748766
| .ok _ =>
749767
return ok
@@ -756,7 +774,7 @@ protected def unstage : CliM PUnit := do
756774
unless ok do
757775
logError "failed to copy all outputs to the staging directory"
758776
exit 1
759-
ws.lakeCache.writeMap localScope map service? opts.scope?
777+
ws.lakeCache.writeMap localScope map service? opts.scope? overwrite
760778

761779
protected def putStaged : CliM PUnit := do
762780
processOptions lakeOption

src/lake/Lake/Config/Cache.lean

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -434,22 +434,30 @@ public def getArtifact (cache : Cache) (descr : ArtifactDescr) : EIO String Arti
434434
def writeOutputsCore
435435
(cache : Cache) (scope : String) (inputHash : Hash) (out : Json)
436436
(service? : Option CacheServiceName) (remoteScope? : Option CacheServiceScope)
437+
(overwrite : Bool)
437438
: IO Unit := do
438439
let file := cache.outputsFile scope inputHash
439440
createParentDirs file
440441
let out := {service?, scope? := remoteScope?, data := out : CacheOutput}
441-
IO.FS.writeFile file (toJson out).pretty
442+
let contents := (toJson out).pretty
443+
if overwrite then
444+
IO.FS.writeFile file contents
445+
else
446+
writeFileIfNew file contents
442447

443448
/-- Cache the outputs corresponding to the given input for the package. -/
444449
@[inline] public def writeOutputs
445450
[ToJson α] (cache : Cache) (scope : String) (inputHash : Hash) (outputs : α)
446-
: IO Unit := cache.writeOutputsCore scope inputHash (toJson outputs) none none
451+
(service? : Option CacheServiceName := none) (remoteScope? : Option CacheServiceScope := none)
452+
(overwrite := true)
453+
: IO Unit := cache.writeOutputsCore scope inputHash (toJson outputs) service? remoteScope? overwrite
447454

448455
/-- Cache the input-to-outputs mappings from a `CacheMap`. -/
449456
public def writeMap
450457
(cache : Cache) (scope : String) (map : CacheMap)
451458
(service? : Option CacheServiceName := none) (remoteScope? : Option CacheServiceScope := none)
452-
: IO Unit := map.forM fun i e => cache.writeOutputsCore scope i e.out service? remoteScope?
459+
(overwrite := true)
460+
: IO Unit := map.forM fun i e => cache.writeOutputsCore scope i e.out service? remoteScope? overwrite
453461

454462
/-- Retrieve the cached outputs corresponding to the given input for the package (if any). -/
455463
public def readOutputs? (cache : Cache) (scope : String) (inputHash : Hash) : LogIO (Option CacheOutput) := do

tests/lake/tests/cache/online-test.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ test_err 'service `bogus` not found in system configuration'\
6969
cache put bogus.jsonl --scope='bogus' --service='bogus'
7070

7171
# Test `cache get` command errors for bad configurations
72+
test_err '`--no-overwrite` is not supported for `cache get`' \
73+
cache get --no-overwrite
7274
test_err 'the `--platform` and `--toolchain` options do nothing' \
7375
cache get bogus.jsonl --scope='bogus' --platform='bogus' --wfail
7476
test_err 'the `--platform` and `--toolchain` options do nothing' \

tests/lake/tests/cache/test.sh

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -233,11 +233,41 @@ test_run cache unstage .lake/staging
233233
# Verify that `cache unstage` fails if staging artifacts are missing
234234
test_cmd mkdir -p .lake/staging-empty
235235
test_cmd cp .lake/outputs.jsonl .lake/staging-empty/outputs.jsonl
236-
test_err 'artifact not found in staging directory' cache unstage .lake/staging-empty
236+
test_err 'artifact not found in staging directory' cache unstage .lake/staging-empty \
237+
--force-overwrite # needed since the artifact is in the cache
238+
239+
# Test stage/unstage behavior regarding `--no-overwrite` / `--force-overwrite`
240+
test_cmd rm -rf "$CACHE_DIR"
241+
test_run build Test:static -o .lake/outputs.jsonl
242+
cache_art=$(echo "$CACHE_DIR"/artifacts/*.a)
243+
test_exp -s $cache_art
244+
staging_art=$(echo .lake/staging/*.a)
245+
test_exp -s $staging_art
246+
# Verify stage overwritting
247+
test_cmd rm -rf .lake/staging
248+
test_run cache stage .lake/outputs.jsonl .lake/staging --no-overwrite
249+
test_exp -f $staging_art # verify copy can occur with `--no-overwrite`
250+
test_cmd rm $staging_art
251+
test_cmd touch $staging_art
252+
test_exp ! -s $staging_art
253+
test_run cache stage .lake/outputs.jsonl .lake/staging
254+
test_exp ! -s $staging_art
255+
test_run cache stage .lake/outputs.jsonl .lake/staging --force-overwrite
256+
test_exp -s $staging_art
257+
# Verify unstage overwritting
258+
test_cmd rm -rf "$CACHE_DIR"
259+
test_run cache unstage .lake/staging --no-overwrite
260+
test_exp -f $cache_art # verify copy can occur with `--no-overwrite`
261+
test_cmd rm $staging_art
262+
test_cmd touch $staging_art
263+
test_exp ! -s $staging_art
264+
test_run cache unstage .lake/staging
265+
test_exp -s $cache_art
266+
test_run cache unstage .lake/staging --force-overwrite
267+
test_exp ! -s $cache_art
237268

238269
# Verify that `lake cache clean` deletes the cache directory
239270
test_exp -d "$CACHE_DIR"
240-
test_cmd cp -r "$CACHE_DIR" .lake/cache-backup
241271
test_run cache clean
242272
test_exp ! -d "$CACHE_DIR"
243273

@@ -280,9 +310,11 @@ if command -v jq > /dev/null; then # skip if no jq found
280310
test_cmd rm -f $libPath
281311
inputHash=$(jq -r '.depHash' $libPath.trace)
282312
echo $inputHash
313+
test_cmd rm -f $libPath.trace
283314
echo bogus > "$CACHE_DIR/outputs/test/$inputHash.json"
284315
test_out 'invalid JSON' build Test:static
285-
test_cmd rm -f $libPath
316+
test_exp -f $libPath
317+
test_cmd rm -f $libPath $libPath.trace
286318
echo '"bogus"' > "$CACHE_DIR/outputs/test/$inputHash.json"
287319
test_out 'some output(s) have issues' build Test:static
288320
test_exp -f $libPath

tests/lake/tests/ltar/test.sh

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ source ../common.sh
77
export LAKE_CACHE_DIR=
88

99
#-------------------------------------------------------------------------------
10-
# The tests covers the (offline) use of `leantar` in Lake
10+
# This test suite covers the (offline) use of `leantar` in Lake
1111
#-------------------------------------------------------------------------------
1212

1313
# Test regular build does not produce an `ltar`
@@ -19,7 +19,7 @@ test_err "archive does not exist and needs to be built" \
1919
build +Test:ltar --no-build -v
2020

2121
# Test the build of the `ltar` facet
22-
test_run build +Test:ltar -v
22+
test_out "Built Test:ltar" build +Test:ltar -v
2323
test_exp -f .lake/build/ir/Test.ltar
2424

2525
# Test that Lake unpacks the archive if the module's artifacts are missing
@@ -54,6 +54,37 @@ test_run -f restoreAll.toml build +Test -v --wfail -o .lake/outputs.jsonl
5454
test_cmd ls .lake/cache/artifacts/*.ltar
5555
test_exp -f .lake/build/ir/Test.ltar
5656

57+
# Test that the cache input-to-output mapping is overwritten without
58+
# `--no-overwrite`. Since the build mapping includes module outputs and the
59+
# tracked mapping only includes the `ltar`, the tracking mapping should
60+
# require an unpack, whereas the build mapping should not.
61+
rm -rf .lake/build
62+
no_match_text ltar .lake/cache/outputs/test/*.json
63+
test_run cache add .lake/outputs.jsonl --no-overwrite
64+
no_match_text ltar .lake/cache/outputs/test/*.json
65+
LAKE_ARTIFACT_CACHE=true test_not_out "leantar" build +Test --no-build -v
66+
rm -rf .lake/build
67+
no_match_text ltar .lake/cache/outputs/test/*.json
68+
test_run cache add .lake/outputs.jsonl --service reservoir --repo 'foo/bar'
69+
match_text ltar .lake/cache/outputs/test/*.json
70+
match_text reservoir .lake/cache/outputs/test/*.json
71+
LAKE_ARTIFACT_CACHE=true test_out "leantar" build +Test --no-build -v
72+
# Unpack should have updated the cached mapping: added module outputs,
73+
# removed service metadata, and left the ltar.
74+
rm -rf .lake/build
75+
match_text ltar .lake/cache/outputs/test/*.json
76+
match_text olean .lake/cache/outputs/test/*.json
77+
no_match_text reservoir .lake/cache/outputs/test/*.json
78+
LAKE_ARTIFACT_CACHE=true test_not_out "leantar" build +Test --no-build -v
79+
80+
# Test that Lake prefers the local trace outputs over the cache
81+
rm -f .lake/build/lib/lean/*.[!t]*
82+
test_run cache add .lake/outputs.jsonl
83+
match_text ltar .lake/cache/outputs/test/*.json
84+
LAKE_ARTIFACT_CACHE=true test_not_out "leantar" build +Test --no-build -v
85+
# Test that the cache output is not overwritten without an unpack
86+
match_text ltar .lake/cache/outputs/test/*.json
87+
5788
# Test producing an `ltar` without already restored artifacts
5889
rm -rf .lake/cache .lake/build
5990
LAKE_ARTIFACT_CACHE=true test_run build +Test -v

0 commit comments

Comments
 (0)