From 738b5eae1ebae000550fce112f1a61360eb1b2a8 Mon Sep 17 00:00:00 2001 From: Felipe Cotti Date: Thu, 28 May 2026 13:39:36 -0300 Subject: [PATCH] changelog: make generated entry YAMLs self-describing for the scrubber MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The changelog-upload Lambda (`docs-lambda-changelog-scrubber`) reads each `{product}/changelogs/*.yaml` from the private S3 bucket and applies the allowlist sanitizer before mirroring to the public bucket. Because each per-entry YAML carries no embedded repo context, it calls `LinkAllowlistSanitizer.TryApplyChangelogEntry(..., defaultOwner: "elastic", defaultRepo: null, ...)`. For an entry produced by the upload action's fork-PR re-derivation step — docs-builder changelog add --concise --use-pr-number \ --owner elastic --repo cloud --prs 155500 — `BuildChangelogData` stored the bare PR number verbatim in the entry's `Prs` list, so the written YAML contained `prs: ['155500']`. When the Lambda later tried to scrub it, `ChangelogTextUtilities.TryGetGitHubRepo` couldn't resolve a repo for the bare number (no `defaultRepo`), the sanitizer emitted an error and aborted, and SQS retried the message indefinitely with the same outcome: System.InvalidOperationException: Failed to apply allowlist to changelog entry; errors: 1 Two fixes here, with tests: 1. `ChangelogFileWriter.NormalizeReferences` — when `--owner` and `--repo` are supplied and a `--prs` / `--issues` value is just a number, expand it to `https://github.com/{owner}/{repo}/{pull|issues}/N` before writing the YAML. Bundle-style multi-repo strings (`a+b`) and pre-qualified `org/repo` values are left alone to avoid producing wrong URLs. Full URLs and `owner/repo#N` short-forms pass through unchanged. The validator already requires `--owner` and `--repo` when `--prs` is bare, so this is a strict superset of accepted input. 2. `LinkAllowlistSanitizer.FilterReferenceList` (defense in depth) — if we encounter a bare numeric reference and `defaultRepo` is unknown, keep the reference as-is and emit a warning instead of failing the entry. A bare number carries no repo identity, so it cannot leak a private link on its own, and downstream rendering supplies owner/repo from runtime context. Genuinely unparseable references (e.g. `not-a-pr-ref`) still hit the original fail-closed error branch so schema regressions remain visible. Together these unblock the already-uploaded entries that the Lambda is currently dead-lettering, and prevent the same shape of bad YAML from ever being produced again. Verified `dotnet format`, `./build.sh build --skip-dirty-check`, AOT publish (`dotnet publish src/tooling/docs-builder -c Release`), the full `Elastic.Changelog.Tests` suite (689 passing), and `docs/cli-schema.json` regeneration shows no change (the CLI surface is unchanged). Refs: elastic/cloud changelog-upload SQS DLQ for `cloud-hosted/changelogs/155500.yaml` Co-authored-by: Cursor --- .../Bundling/LinkAllowlistSanitizer.cs | 15 ++ .../Creation/ChangelogFileWriter.cs | 35 ++++- .../Changelogs/Create/PrIntegrationTests.cs | 65 +++++++++ .../Changelogs/LinkAllowlistSanitizerTests.cs | 70 ++++++++++ .../Creation/NormalizeReferencesTests.cs | 130 ++++++++++++++++++ 5 files changed, 313 insertions(+), 2 deletions(-) create mode 100644 tests/Elastic.Changelog.Tests/Creation/NormalizeReferencesTests.cs diff --git a/src/services/Elastic.Changelog/Bundling/LinkAllowlistSanitizer.cs b/src/services/Elastic.Changelog/Bundling/LinkAllowlistSanitizer.cs index 611b04c75f..e93ddeb1c0 100644 --- a/src/services/Elastic.Changelog/Bundling/LinkAllowlistSanitizer.cs +++ b/src/services/Elastic.Changelog/Bundling/LinkAllowlistSanitizer.cs @@ -366,6 +366,21 @@ public static void ValidateNoPrivateReferences(string serializedYaml, IReadOnlyL if (!ChangelogTextUtilities.TryGetGitHubRepo(r, defaultOwner, defaultBundleRepo ?? string.Empty, out var owner, out var repo)) { + // A bare numeric reference with no repo context (e.g. `prs: ['155500']` loaded by the + // scrubber Lambda, which doesn't know which repo a per-entry YAML belongs to) carries no + // repository identity, so it cannot leak a private reference on its own. Keep it as-is — + // downstream rendering supplies the owner/repo from runtime context — and emit a warning so + // the diagnostic is still visible without failing the whole entry. Anything that still + // encodes a repo (URL / owner/repo#N short-form) hits the explicit error branch below. + if (!string.IsNullOrWhiteSpace(r) && uint.TryParse(r.Trim(), out _)) + { + list.Add(r); + collector.EmitWarning( + string.Empty, + $"Bare {referenceKind} reference '{r}' has no embedded owner/repo and no default repo was supplied; keeping as-is for downstream rendering to resolve."); + continue; + } + collector.EmitError( string.Empty, $"Link allowlist filtering could not parse {referenceKind} reference '{r}'. " + diff --git a/src/services/Elastic.Changelog/Creation/ChangelogFileWriter.cs b/src/services/Elastic.Changelog/Creation/ChangelogFileWriter.cs index 8c40eac84a..ce3ba610d2 100644 --- a/src/services/Elastic.Changelog/Creation/ChangelogFileWriter.cs +++ b/src/services/Elastic.Changelog/Creation/ChangelogFileWriter.cs @@ -143,13 +143,44 @@ private static ChangelogEntry BuildChangelogData(CreateChangelogArguments input) Action = input.Action, FeatureId = input.FeatureId, Highlight = input.Highlight, - Prs = input.Prs is { Length: > 0 } ? input.Prs.ToList() : null, + Prs = NormalizeReferences(input.Prs, input.Owner, input.Repo, "pull"), Products = input.Products.Select(p => p.ToProductReference()).ToList(), Areas = input.Areas is { Length: > 0 } ? input.Areas.ToList() : null, - Issues = input.Issues is { Length: > 0 } ? input.Issues.ToList() : null + Issues = NormalizeReferences(input.Issues, input.Owner, input.Repo, "issues") }; } + /// + /// Expands bare numeric PR/issue references to full https://github.com/owner/repo/{pull|issues}/N + /// URLs when owner and repo are available, so the written YAML is self-describing — readers (including the + /// changelog scrubber Lambda, which has no per-entry repo context) can resolve the target repository without + /// extra arguments. References that are already full URLs or owner/repo#N short-forms are returned + /// unchanged. Returns null when there is nothing to write, preserving the previous omit-on-empty + /// serialization behaviour. + /// + internal static List? NormalizeReferences(string[]? refs, string? owner, string? repo, string pathSegment) + { + if (refs is not { Length: > 0 }) + return null; + + // Only expand when we have an unambiguous single-repo context. Bundle-style multi-repo strings + // (e.g. "elasticsearch+kibana") or pre-qualified "org/repo" values would expand to the wrong URL. + var hasContext = !string.IsNullOrWhiteSpace(owner) + && !string.IsNullOrWhiteSpace(repo) + && !repo.Contains('/', StringComparison.Ordinal) + && !repo.Contains('+', StringComparison.Ordinal); + + var list = new List(refs.Length); + foreach (var r in refs) + { + if (hasContext && !string.IsNullOrWhiteSpace(r) && uint.TryParse(r.Trim(), out _)) + list.Add($"https://github.com/{owner}/{repo}/{pathSegment}/{r.Trim()}"); + else + list.Add(r); + } + return list; + } + private static string GenerateYaml(ChangelogEntry data, ChangelogConfiguration config, bool titleMissing, bool typeMissing) { // Create a mutable copy for serialization adjustments diff --git a/tests/Elastic.Changelog.Tests/Changelogs/Create/PrIntegrationTests.cs b/tests/Elastic.Changelog.Tests/Changelogs/Create/PrIntegrationTests.cs index 0bc0eb04d3..2c1760c00d 100644 --- a/tests/Elastic.Changelog.Tests/Changelogs/Create/PrIntegrationTests.cs +++ b/tests/Elastic.Changelog.Tests/Changelogs/Create/PrIntegrationTests.cs @@ -597,4 +597,69 @@ public async Task CreateChangelog_WithMixedPrsFromFileAndCommaSeparated_Processe yamlContents.Should().Contain(c => c.Contains("prs:") && c.Contains("https://github.com/elastic/elasticsearch/pull/1111")); yamlContents.Should().Contain(c => c.Contains("prs:") && c.Contains("https://github.com/elastic/elasticsearch/pull/2222")); } + + [Fact] + public async Task CreateChangelog_WithBareNumberPrAndOwnerRepo_WritesFullUrlIntoYaml() + { + // Mirrors the upload action's fork-PR re-derivation invocation: + // docs-builder changelog add --concise --use-pr-number --owner elastic --repo cloud --prs 155500 + // The previous bare-number-in YAML was unparseable by the scrubber Lambda (which has no per-entry + // repo context), so the writer must normalize bare numbers to full URLs when owner+repo are known. + var prInfo = new GitHubPrInfo + { + Title = "Fix upload failures for extensions with >5GiB", + Labels = ["type:bug"] + }; + + A.CallTo(() => MockGitHubService.FetchPrInfoAsync( + "155500", + "elastic", + "cloud", + A._)) + .Returns(prInfo); + + // language=yaml + var configContent = + """ + pivot: + types: + feature: + bug-fix: "type:bug" + breaking-change: + lifecycles: + - preview + - beta + - ga + """; + var configPath = await CreateConfigDirectory(configContent); + + var service = CreateService(); + + var input = new CreateChangelogArguments + { + Prs = ["155500"], + Owner = "elastic", + Repo = "cloud", + Products = [new ProductArgument { Product = "elasticsearch", Target = "9.2.0", Lifecycle = "ga" }], + Config = configPath, + Output = CreateOutputDirectory(), + UsePrNumber = true, + Concise = true + }; + + var result = await service.CreateChangelog(Collector, input, TestContext.Current.CancellationToken); + + result.Should().BeTrue(); + Collector.Errors.Should().Be(0); + + var outputDir = input.Output ?? FileSystem.Directory.GetCurrentDirectory(); + var files = FileSystem.Directory.GetFiles(outputDir, "*.yaml"); + files.Should().HaveCount(1); + + var yamlContent = await FileSystem.File.ReadAllTextAsync(files[0], TestContext.Current.CancellationToken); + yamlContent.Should().Contain("prs:"); + yamlContent.Should().Contain("https://github.com/elastic/cloud/pull/155500"); + yamlContent.Should().NotMatch("*prs:*\n- '155500'*"); + yamlContent.Should().NotMatch("*prs:*\n- 155500*"); + } } diff --git a/tests/Elastic.Changelog.Tests/Changelogs/LinkAllowlistSanitizerTests.cs b/tests/Elastic.Changelog.Tests/Changelogs/LinkAllowlistSanitizerTests.cs index 851a05472e..b3cb5cc2a8 100644 --- a/tests/Elastic.Changelog.Tests/Changelogs/LinkAllowlistSanitizerTests.cs +++ b/tests/Elastic.Changelog.Tests/Changelogs/LinkAllowlistSanitizerTests.cs @@ -530,6 +530,76 @@ public void TryApplyChangelogEntry_NullFields_PreservesNulls() sanitized.Description.Should().BeNull(); } + [Fact] + public void TryApplyChangelogEntry_BarePrNumberWithoutDefaultRepo_KeptWithWarning() + { + // The scrubber Lambda calls TryApplyChangelogEntry with defaultRepo=null because per-entry + // YAMLs uploaded under {product}/changelogs/*.yaml carry no embedded repo context. A bare + // numeric PR ref ("155500") must be tolerated rather than failing the whole entry — the + // reference carries no repo identity so it cannot leak a private link, and downstream + // rendering supplies the owner/repo from runtime context. + var entry = new BundledEntry + { + Title = "Fork PR entry", + Prs = ["155500"], + Issues = null + }; + + var allow = new[] { "elastic/elasticsearch" }; + var ok = LinkAllowlistSanitizer.TryApplyChangelogEntry( + Collector, entry, allow, "elastic", null, + out var sanitized, out var changed); + + ok.Should().BeTrue(); + changed.Should().BeFalse(); + sanitized.Prs.Should().BeEquivalentTo(["155500"]); + Collector.Errors.Should().Be(0); + Collector.Warnings.Should().BeGreaterThan(0); + } + + [Fact] + public void TryApplyChangelogEntry_BareIssueNumberWithoutDefaultRepo_KeptWithWarning() + { + var entry = new BundledEntry + { + Title = "Entry with bare issue", + Prs = null, + Issues = ["4274"] + }; + + var allow = new[] { "elastic/elasticsearch" }; + var ok = LinkAllowlistSanitizer.TryApplyChangelogEntry( + Collector, entry, allow, "elastic", null, + out var sanitized, out var changed); + + ok.Should().BeTrue(); + changed.Should().BeFalse(); + sanitized.Issues.Should().BeEquivalentTo(["4274"]); + Collector.Errors.Should().Be(0); + Collector.Warnings.Should().BeGreaterThan(0); + } + + [Fact] + public void TryApplyChangelogEntry_UnparseableNonNumericRef_StillErrors() + { + // Genuinely unparseable references (not bare numbers and not URL/short-form) should still + // fail-closed so we surface schema regressions instead of silently dropping data. + var entry = new BundledEntry + { + Title = "Malformed entry", + Prs = ["not-a-pr-ref"], + Issues = null + }; + + var allow = new[] { "elastic/elasticsearch" }; + var ok = LinkAllowlistSanitizer.TryApplyChangelogEntry( + Collector, entry, allow, "elastic", "elasticsearch", + out _, out _); + + ok.Should().BeFalse(); + Collector.Errors.Should().BeGreaterThan(0); + } + // --- ScrubText --- [Fact] diff --git a/tests/Elastic.Changelog.Tests/Creation/NormalizeReferencesTests.cs b/tests/Elastic.Changelog.Tests/Creation/NormalizeReferencesTests.cs new file mode 100644 index 0000000000..b049f79a32 --- /dev/null +++ b/tests/Elastic.Changelog.Tests/Creation/NormalizeReferencesTests.cs @@ -0,0 +1,130 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using AwesomeAssertions; +using Elastic.Changelog.Creation; + +namespace Elastic.Changelog.Tests.Creation; + +/// +/// Covers , which is the choke point for making +/// generated entry YAMLs self-describing. Bare PR/issue numbers must be expanded to full GitHub URLs +/// when owner+repo are supplied so downstream consumers without per-entry repo context — most +/// importantly the changelog-scrubber Lambda — can resolve the target repository. +/// +public class NormalizeReferencesTests +{ + [Fact] + public void Null_ReturnsNull() => + ChangelogFileWriter.NormalizeReferences(null, "elastic", "cloud", "pull") + .Should().BeNull(); + + [Fact] + public void Empty_ReturnsNull() => + ChangelogFileWriter.NormalizeReferences([], "elastic", "cloud", "pull") + .Should().BeNull(); + + [Fact] + public void BareNumber_WithOwnerAndRepo_ExpandsToFullUrl() + { + var result = ChangelogFileWriter.NormalizeReferences( + ["155500"], "elastic", "cloud", "pull"); + + result.Should().BeEquivalentTo(["https://github.com/elastic/cloud/pull/155500"]); + } + + [Fact] + public void BareIssueNumber_WithOwnerAndRepo_ExpandsToIssuesUrl() + { + var result = ChangelogFileWriter.NormalizeReferences( + ["4274"], "elastic", "cloud", "issues"); + + result.Should().BeEquivalentTo(["https://github.com/elastic/cloud/issues/4274"]); + } + + [Fact] + public void BareNumber_WithoutOwner_LeftAsIs() + { + var result = ChangelogFileWriter.NormalizeReferences( + ["155500"], null, "cloud", "pull"); + + result.Should().BeEquivalentTo(["155500"]); + } + + [Fact] + public void BareNumber_WithoutRepo_LeftAsIs() + { + var result = ChangelogFileWriter.NormalizeReferences( + ["155500"], "elastic", null, "pull"); + + result.Should().BeEquivalentTo(["155500"]); + } + + [Fact] + public void BareNumber_WithBundleStyleRepo_LeftAsIs() + { + // `elasticsearch+kibana` is a multi-repo bundle string; we can't pick which one a bare + // number targets, so we conservatively leave it alone. + var result = ChangelogFileWriter.NormalizeReferences( + ["100"], "elastic", "elasticsearch+kibana", "pull"); + + result.Should().BeEquivalentTo(["100"]); + } + + [Fact] + public void BareNumber_WithSlashInRepo_LeftAsIs() + { + // A pre-qualified `org/repo` value supplied through `--repo` shouldn't be re-combined. + var result = ChangelogFileWriter.NormalizeReferences( + ["100"], "elastic", "elastic/cloud", "pull"); + + result.Should().BeEquivalentTo(["100"]); + } + + [Fact] + public void FullUrl_LeftAsIs() + { + var result = ChangelogFileWriter.NormalizeReferences( + ["https://github.com/elastic/cloud/pull/155500"], "elastic", "cloud", "pull"); + + result.Should().BeEquivalentTo(["https://github.com/elastic/cloud/pull/155500"]); + } + + [Fact] + public void ShortFormReference_LeftAsIs() + { + var result = ChangelogFileWriter.NormalizeReferences( + ["elastic/cloud#155500"], "elastic", "cloud", "pull"); + + result.Should().BeEquivalentTo(["elastic/cloud#155500"]); + } + + [Fact] + public void MixedReferences_OnlyBareNumbersExpand() + { + var result = ChangelogFileWriter.NormalizeReferences( + [ + "155500", + "https://github.com/elastic/cloud/pull/155501", + "elastic/cloud#155502" + ], + "elastic", "cloud", "pull"); + + result.Should().BeEquivalentTo( + [ + "https://github.com/elastic/cloud/pull/155500", + "https://github.com/elastic/cloud/pull/155501", + "elastic/cloud#155502" + ]); + } + + [Fact] + public void NumberWithWhitespace_TrimmedAndExpanded() + { + var result = ChangelogFileWriter.NormalizeReferences( + [" 155500 "], "elastic", "cloud", "pull"); + + result.Should().BeEquivalentTo(["https://github.com/elastic/cloud/pull/155500"]); + } +}