diff --git a/src/lake/Lake/Build/Common.lean b/src/lake/Lake/Build/Common.lean index 513b864bfe91..3110e23e0481 100644 --- a/src/lake/Lake/Build/Common.lean +++ b/src/lake/Lake/Build/Common.lean @@ -538,6 +538,9 @@ open ResolveOutputs in /-- Retrieve artifacts from the Lake cache using only the outputs stored in the saved trace file. +If the cache is writable, saves the input-to-output mapping derived from the trace to the cache, +unless a mapping for the `inputHash` already exists. + **For internal use only.** -/ @[specialize] public def getArtifactsUsingTrace? @@ -549,7 +552,7 @@ Retrieve artifacts from the Lake cache using only the outputs stored in the save try let arts ← resolveOutputs (.ofData out) if (← pkg.isArtifactCacheWritable) then - let act := (← getLakeCache).writeOutputs pkg.cacheScope inputHash out + let act := (← getLakeCache).writeOutputs pkg.cacheScope inputHash out (overwrite := false) if let .error e ← act.toBaseIO then logWarning s!"could not write outputs to cache: {e}" return some arts @@ -560,17 +563,17 @@ Retrieve artifacts from the Lake cache using only the outputs stored in the save open ResolveOutputs in /-- Retrieve artifacts from the Lake cache using the outputs stored in either -the saved trace file or (unless `traceOnly` is `true`) in the cached input-to-content mapping. +the saved trace file or in the cached input-to-content mapping. **For internal use only.** -/ @[inline] public nonrec def getArtifacts? [ResolveOutputs α] (inputHash : Hash) (savedTrace : SavedTrace) (pkg : Package) : JobM (Option α) := do - if let some a ← getArtifactsUsingCache? inputHash pkg then - return some a if let some a ← getArtifactsUsingTrace? inputHash savedTrace pkg then return some a + if let some a ← getArtifactsUsingCache? inputHash pkg then + return some a return none /-- **For internal use only.** -/ diff --git a/src/lake/Lake/Build/Module.lean b/src/lake/Lake/Build/Module.lean index 3d889e4e4a8b..2321766aa69d 100644 --- a/src/lake/Lake/Build/Module.lean +++ b/src/lake/Lake/Build/Module.lean @@ -871,6 +871,7 @@ def Module.recBuildLtar (self : Module) : FetchM (Job FilePath) := do modify ({· with wantsRebuild := true}) error "archive does not exist and needs to be built" else + updateAction .build self.packLtar arts newTrace s!"{self.name.toString}:ltar" addTrace art.trace @@ -948,7 +949,11 @@ where -- end up in the build directory and, if writable, the cache let arts ← mod.restoreAllArtifacts {arts with ltar? := some ltar} if (← mod.pkg.isArtifactCacheWritable) then - .inl <$> mod.cacheOutputArtifacts setup.isModule restoreAll + let arts ← mod.cacheOutputArtifacts setup.isModule restoreAll + -- Note: Cache service metadata is not preserved on an output update because it would + -- result in downloading module outputs that are not available on the remote. + (← getLakeCache).writeOutputs mod.pkg.cacheScope inputHash arts.descrs (overwrite := true) + return .inl arts else return .inl arts else diff --git a/src/lake/Lake/CLI/Help.lean b/src/lake/Lake/CLI/Help.lean index c2a2aba4f6be..823f08ca4540 100644 --- a/src/lake/Lake/CLI/Help.lean +++ b/src/lake/Lake/CLI/Help.lean @@ -451,7 +451,7 @@ artifacts. If no mappings are found, Lake will backtrack the Git history up to will search the repository's entire history (or as far as Git will allow). By default, Lake will download both the input-to-output mappings and the -output artifacts for a package. By using `--mappings-onlys`, Lake will only +output artifacts for a package. By using `--mappings-only`, Lake will only download the mappings and delay downloading artifacts until they are needed. If a download for an artifact fails or the download process for a whole @@ -505,12 +505,16 @@ OPTIONS: --service= cache service to fetch from on demand --scope= the prefix of artifacts within the service --repo= for Reservoir, a GitHub repository scope + --no-overwrite do not overwrite existing mappings Reads a list of input-to-output mappings from the provided file and adds -them to the local Lake cache. If `--service` is provided, the output artifacts -can then be fetched lazily from that service during a Lake build. The service -must either be `reservoir` or be configured through the Lake system -configuration (see the help page of `lake cache services` for details). +them to the local Lake cache. Mappings already in the cache are overwritten +unless `--no-overwrite` is specified. + +If `--service` is provided, the output artifacts can then be fetched lazily +from that service during a Lake build. The service must either be `reservoir` +or be configured through the Lake system configuration (see the help page of +`lake cache services` for details). Since Lake does not currently use cryptographically secure hashes for artifacts and outputs, artifacts in a cache service are prefixed with a scope @@ -522,25 +526,28 @@ def helpCacheStage := "Copy build outputs from the cache to a staging directory USAGE: - lake cache stage + lake cache stage [--force-overwrite] -Creates the staging directory and copies the mappings file to it. Then, it -copies all artifacts described within the mappings file from the cache to the -staging directory. Errors if any of the artifacts described cannot be found in -the cache." +Creates the staging directory and copies the mappings file to it. Then, +it copies all artifacts described within the mappings file from the cache to +the staging directory. Artifacts in the staging directory are not overwritten +unless `--force-overwrite` is specified. Errors if any of the artifacts +described cannot be found in the cache." def helpCacheUnstage := "Cache build outputs from a staging directory USAGE: - lake cache unstage + lake cache unstage [--force-overwrite] Copies the mappings and artifacts stored in staging directory (e.g., via `lake cache stage`) back into the cache. Reads the mappings file located at `outputs.jsonl` within the staging directory and writes the mappings to the Lake cache. Then, it copies the -described artifacts from the staging directory into the cache." +described artifacts from the staging directory into the cache. Mappings and +artifacts already in the cache are not overwritten unless `--force-overwrite` +is specified." def helpCachePutStaged := "Upload build outputs from a staging directory to a remote service diff --git a/src/lake/Lake/CLI/Main.lean b/src/lake/Lake/CLI/Main.lean index 4a3bb14528e4..25d7df83876c 100644 --- a/src/lake/Lake/CLI/Main.lean +++ b/src/lake/Lake/CLI/Main.lean @@ -65,6 +65,7 @@ public structure LakeOptions where outFormat : OutFormat := .text offline : Bool := false outputsFile? : Option FilePath := none + overwrite? : Option Bool := none forceDownload : Bool := false mappingsOnly : Bool := false service? : Option String := none @@ -285,6 +286,8 @@ def lakeLongOption : (opt : String) → CliM PUnit | "--offline" => modifyThe LakeOptions ({· with offline := true}) | "--wfail" => modifyThe LakeOptions ({· with failLv := .warning}) | "--iofail" => modifyThe LakeOptions ({· with failLv := .info}) +| "--no-overwrite" => modifyThe LakeOptions ({· with overwrite? := some false}) +| "--force-overwrite" => modifyThe LakeOptions ({· with overwrite? := some true}) | "--force-download" => modifyThe LakeOptions ({· with forceDownload := true}) | "--download-arts" => modifyThe LakeOptions ({· with mappingsOnly := false}) | "--mappings-only" => modifyThe LakeOptions ({· with mappingsOnly := true}) @@ -465,6 +468,10 @@ protected def get : CliM PUnit := do let cfg ← mkLoadConfig opts let ws ← loadWorkspace cfg let cache := ws.lakeCache + let overwrite := opts.overwrite?.getD true + unless overwrite do + -- artifacts of skipped mappings with `--no-overwrite` cannot be cleanly handled + error "`--no-overwrite` is not supported for `cache get`" if let some file := mappings? then liftM (m := LoggerIO) do if opts.mappingsOnly then error "`--mappings-only` is not supported with a mappings file; use `lake cache add` instead" @@ -485,7 +492,7 @@ protected def get : CliM PUnit := do else return ws.defaultCacheService let map ← CacheMap.load file - cache.writeMap ws.root.cacheScope map service.name? (some remoteScope) + cache.writeMap ws.root.cacheScope map service.name? (some remoteScope) overwrite let descrs ← map.collectOutputDescrs service.downloadArtifacts descrs cache remoteScope opts.forceDownload else @@ -530,7 +537,7 @@ protected def get : CliM PUnit := do return map else findOutputs cache service pkg remoteScope opts platform toolchain - cache.writeMap pkg.cacheScope map service.name? (some remoteScope) + cache.writeMap pkg.cacheScope map service.name? (some remoteScope) overwrite unless opts.mappingsOnly do let descrs ← map.collectOutputDescrs service.downloadArtifacts descrs cache remoteScope opts.forceDownload @@ -544,7 +551,7 @@ protected def get : CliM PUnit := do let toolchain := cacheToolchain pkg toolchain try let map ← findOutputs cache service pkg remoteScope opts platform toolchain - cache.writeMap pkg.cacheScope map service.name? (some remoteScope) + cache.writeMap pkg.cacheScope map service.name? (some remoteScope) overwrite unless opts.mappingsOnly do let descrs ← map.collectOutputDescrs service.downloadArtifacts descrs cache remoteScope opts.forceDownload @@ -677,7 +684,8 @@ protected def add : CliM PUnit := do error (serviceNotFound service ws.lakeConfig.config.cache.services) return some (.ofString service) let map ← CacheMap.load file - ws.lakeCache.writeMap localScope map service? opts.scope? + let overwrite := opts.overwrite?.getD true + ws.lakeCache.writeMap localScope map service? opts.scope? overwrite private def stagingOutputsFile := "outputs.jsonl" @@ -699,9 +707,12 @@ protected def stage : CliM PUnit := do let descrs ← map.collectOutputDescrs IO.FS.createDirAll stagingDir copyFile mappingsFile (stagingDir / stagingOutputsFile) + let overwrite := opts.overwrite?.getD false let ok ← descrs.foldlM (init := true) fun ok descr => do let cachePath := cache.artifactDir / descr.relPath let stagingPath := stagingDir / descr.relPath + unless overwrite || !(← stagingPath.pathExists) do + return ok match (← copyFile cachePath stagingPath |>.toBaseIO) with | .ok _ => return ok @@ -739,9 +750,16 @@ protected def unstage : CliM PUnit := do let descrs ← map.collectOutputDescrs let artDir := ws.lakeCache.artifactDir IO.FS.createDirAll artDir + let overwrite := opts.overwrite?.getD false let ok ← descrs.foldlM (init := true) fun ok descr => do let cachePath := artDir/ descr.relPath let stagingPath := stagingDir / descr.relPath + if (← cachePath.pathExists) then + if overwrite then + -- Cache artifacts are read-only, so the old artifact must be deleted first. + IO.FS.removeFile cachePath + else + return ok match (← copyFile stagingPath cachePath |>.toBaseIO) with | .ok _ => return ok @@ -754,7 +772,7 @@ protected def unstage : CliM PUnit := do unless ok do logError "failed to copy all outputs to the staging directory" exit 1 - ws.lakeCache.writeMap localScope map service? opts.scope? + ws.lakeCache.writeMap localScope map service? opts.scope? overwrite protected def putStaged : CliM PUnit := do processOptions lakeOption diff --git a/src/lake/Lake/Config/Cache.lean b/src/lake/Lake/Config/Cache.lean index 1fe4afa5bedc..6e1138d085a5 100644 --- a/src/lake/Lake/Config/Cache.lean +++ b/src/lake/Lake/Config/Cache.lean @@ -434,22 +434,30 @@ public def getArtifact (cache : Cache) (descr : ArtifactDescr) : EIO String Arti def writeOutputsCore (cache : Cache) (scope : String) (inputHash : Hash) (out : Json) (service? : Option CacheServiceName) (remoteScope? : Option CacheServiceScope) + (overwrite : Bool) : IO Unit := do let file := cache.outputsFile scope inputHash createParentDirs file let out := {service?, scope? := remoteScope?, data := out : CacheOutput} - IO.FS.writeFile file (toJson out).pretty + let contents := (toJson out).pretty + if overwrite then + IO.FS.writeFile file contents + else + writeFileIfNew file contents /-- Cache the outputs corresponding to the given input for the package. -/ @[inline] public def writeOutputs [ToJson α] (cache : Cache) (scope : String) (inputHash : Hash) (outputs : α) -: IO Unit := cache.writeOutputsCore scope inputHash (toJson outputs) none none + (service? : Option CacheServiceName := none) (remoteScope? : Option CacheServiceScope := none) + (overwrite := true) +: IO Unit := cache.writeOutputsCore scope inputHash (toJson outputs) service? remoteScope? overwrite /-- Cache the input-to-outputs mappings from a `CacheMap`. -/ public def writeMap (cache : Cache) (scope : String) (map : CacheMap) (service? : Option CacheServiceName := none) (remoteScope? : Option CacheServiceScope := none) -: IO Unit := map.forM fun i e => cache.writeOutputsCore scope i e.out service? remoteScope? + (overwrite := true) +: IO Unit := map.forM fun i e => cache.writeOutputsCore scope i e.out service? remoteScope? overwrite /-- Retrieve the cached outputs corresponding to the given input for the package (if any). -/ public def readOutputs? (cache : Cache) (scope : String) (inputHash : Hash) : LogIO (Option CacheOutput) := do diff --git a/tests/lake/tests/cache/online-test.sh b/tests/lake/tests/cache/online-test.sh index 94399966263f..1a39f88b00b3 100755 --- a/tests/lake/tests/cache/online-test.sh +++ b/tests/lake/tests/cache/online-test.sh @@ -69,6 +69,8 @@ test_err 'service `bogus` not found in system configuration'\ cache put bogus.jsonl --scope='bogus' --service='bogus' # Test `cache get` command errors for bad configurations +test_err '`--no-overwrite` is not supported for `cache get`' \ + cache get --no-overwrite test_err 'the `--platform` and `--toolchain` options do nothing' \ cache get bogus.jsonl --scope='bogus' --platform='bogus' --wfail test_err 'the `--platform` and `--toolchain` options do nothing' \ diff --git a/tests/lake/tests/cache/test.sh b/tests/lake/tests/cache/test.sh index 4cfe8580480b..b1151c2e7e91 100755 --- a/tests/lake/tests/cache/test.sh +++ b/tests/lake/tests/cache/test.sh @@ -233,11 +233,41 @@ test_run cache unstage .lake/staging # Verify that `cache unstage` fails if staging artifacts are missing test_cmd mkdir -p .lake/staging-empty test_cmd cp .lake/outputs.jsonl .lake/staging-empty/outputs.jsonl -test_err 'artifact not found in staging directory' cache unstage .lake/staging-empty +test_err 'artifact not found in staging directory' cache unstage .lake/staging-empty \ + --force-overwrite # needed since the artifact is in the cache + +# Test stage/unstage behavior regarding `--no-overwrite` / `--force-overwrite` +test_cmd rm -rf "$CACHE_DIR" +test_run build Test:static -o .lake/outputs.jsonl +cache_art=$(echo "$CACHE_DIR"/artifacts/*.a) +test_exp -s $cache_art +staging_art=$(echo .lake/staging/*.a) +test_exp -s $staging_art +# Verify stage overwritting +test_cmd rm -rf .lake/staging +test_run cache stage .lake/outputs.jsonl .lake/staging --no-overwrite +test_exp -f $staging_art # verify copy can occur with `--no-overwrite` +test_cmd rm $staging_art +test_cmd touch $staging_art +test_exp ! -s $staging_art +test_run cache stage .lake/outputs.jsonl .lake/staging +test_exp ! -s $staging_art +test_run cache stage .lake/outputs.jsonl .lake/staging --force-overwrite +test_exp -s $staging_art +# Verify unstage overwritting +test_cmd rm -rf "$CACHE_DIR" +test_run cache unstage .lake/staging --no-overwrite +test_exp -f $cache_art # verify copy can occur with `--no-overwrite` +test_cmd rm $staging_art +test_cmd touch $staging_art +test_exp ! -s $staging_art +test_run cache unstage .lake/staging +test_exp -s $cache_art +test_run cache unstage .lake/staging --force-overwrite +test_exp ! -s $cache_art # Verify that `lake cache clean` deletes the cache directory test_exp -d "$CACHE_DIR" -test_cmd cp -r "$CACHE_DIR" .lake/cache-backup test_run cache clean test_exp ! -d "$CACHE_DIR" @@ -280,9 +310,11 @@ if command -v jq > /dev/null; then # skip if no jq found test_cmd rm -f $libPath inputHash=$(jq -r '.depHash' $libPath.trace) echo $inputHash + test_cmd rm -f $libPath.trace echo bogus > "$CACHE_DIR/outputs/test/$inputHash.json" test_out 'invalid JSON' build Test:static - test_cmd rm -f $libPath + test_exp -f $libPath + test_cmd rm -f $libPath $libPath.trace echo '"bogus"' > "$CACHE_DIR/outputs/test/$inputHash.json" test_out 'some output(s) have issues' build Test:static test_exp -f $libPath diff --git a/tests/lake/tests/ltar/test.sh b/tests/lake/tests/ltar/test.sh index 37d1a0609f9d..7a975ae62220 100755 --- a/tests/lake/tests/ltar/test.sh +++ b/tests/lake/tests/ltar/test.sh @@ -7,7 +7,7 @@ source ../common.sh export LAKE_CACHE_DIR= #------------------------------------------------------------------------------- -# The tests covers the (offline) use of `leantar` in Lake +# This test suite covers the (offline) use of `leantar` in Lake #------------------------------------------------------------------------------- # Test regular build does not produce an `ltar` @@ -19,7 +19,7 @@ test_err "archive does not exist and needs to be built" \ build +Test:ltar --no-build -v # Test the build of the `ltar` facet -test_run build +Test:ltar -v +test_out "Built Test:ltar" build +Test:ltar -v test_exp -f .lake/build/ir/Test.ltar # Test that Lake unpacks the archive if the module's artifacts are missing @@ -54,6 +54,37 @@ test_run -f restoreAll.toml build +Test -v --wfail -o .lake/outputs.jsonl test_cmd ls .lake/cache/artifacts/*.ltar test_exp -f .lake/build/ir/Test.ltar +# Test that the cache input-to-output mapping is overwritten without +# `--no-overwrite`. Since the build mapping includes module outputs and the +# tracked mapping only includes the `ltar`, the tracking mapping should +# require an unpack, whereas the build mapping should not. +rm -rf .lake/build +no_match_text ltar .lake/cache/outputs/test/*.json +test_run cache add .lake/outputs.jsonl --no-overwrite +no_match_text ltar .lake/cache/outputs/test/*.json +LAKE_ARTIFACT_CACHE=true test_not_out "leantar" build +Test --no-build -v +rm -rf .lake/build +no_match_text ltar .lake/cache/outputs/test/*.json +test_run cache add .lake/outputs.jsonl --service reservoir --repo 'foo/bar' +match_text ltar .lake/cache/outputs/test/*.json +match_text reservoir .lake/cache/outputs/test/*.json +LAKE_ARTIFACT_CACHE=true test_out "leantar" build +Test --no-build -v +# Unpack should have updated the cached mapping: added module outputs, +# removed service metadata, and left the ltar. +rm -rf .lake/build +match_text ltar .lake/cache/outputs/test/*.json +match_text olean .lake/cache/outputs/test/*.json +no_match_text reservoir .lake/cache/outputs/test/*.json +LAKE_ARTIFACT_CACHE=true test_not_out "leantar" build +Test --no-build -v + +# Test that Lake prefers the local trace outputs over the cache +rm -f .lake/build/lib/lean/*.[!t]* +test_run cache add .lake/outputs.jsonl +match_text ltar .lake/cache/outputs/test/*.json +LAKE_ARTIFACT_CACHE=true test_not_out "leantar" build +Test --no-build -v +# Test that the cache output is not overwritten without an unpack +match_text ltar .lake/cache/outputs/test/*.json + # Test producing an `ltar` without already restored artifacts rm -rf .lake/cache .lake/build LAKE_ARTIFACT_CACHE=true test_run build +Test -v