diff --git a/docs/instance-discovery-rules.md b/docs/instance-discovery-rules.md new file mode 100644 index 0000000000..92f956d55a --- /dev/null +++ b/docs/instance-discovery-rules.md @@ -0,0 +1,324 @@ +# Instance Discovery Rules for MSAL + +## Purpose + +Instance discovery resolves an authority host (e.g., `login.microsoftonline.com`) to its **metadata**: preferred network host, preferred cache host, and a list of aliases. This metadata enables SSO across aliased environments and ensures tokens are sent to the correct endpoint. + +This document describes the rules implemented in MSAL.NET to aid reimplementation in other MSAL libraries. + +--- + +## 1. Core Data Model + +Instance discovery produces a single entry per authority environment: + +``` +InstanceDiscoveryMetadataEntry: + preferred_network: string # host to use for token requests + preferred_cache: string # host to use as cache key + aliases: string[] # all equivalent hosts (used for SSO, cache lookups) +``` + +A successful network response returns an array of these entries: + +```json +{ + "tenant_discovery_endpoint": "https://login.microsoftonline.com/{tenant}/.well-known/openid-configuration", + "metadata": [ + { + "preferred_network": "login.microsoftonline.com", + "preferred_cache": "login.windows.net", + "aliases": ["login.microsoftonline.com", "login.windows.net", "login.microsoft.com", "sts.windows.net"] + } + ] +} +``` + +--- + +## 2. Applicability + +| Authority Type | Instance Discovery Supported | +|---|---| +| AAD (Entra ID) | ✅ Yes | +| B2C | ❌ No — use self-entry | +| ADFS | ❌ No — use self-entry | +| CIAM / dSTS | ❌ No — use self-entry | + +A **self-entry** means: `preferred_network = preferred_cache = aliases = [configured_authority_host]`. + +**Rule:** If the authority type does not support instance discovery, skip all providers and immediately return a self-entry. + +--- + +## 3. Known Environments (Hardcoded) + +MSAL maintains a static, hardcoded list of known cloud environments and their metadata. This avoids network calls for common clouds. + +| Cloud | Aliases | Preferred Network | Preferred Cache | +|---|---|---|---| +| **Public** | `login.microsoftonline.com`, `login.windows.net`, `login.microsoft.com`, `sts.windows.net` | `login.microsoftonline.com` | `login.windows.net` | +| **China** | `login.partner.microsoftonline.cn`, `login.chinacloudapi.cn` | `login.partner.microsoftonline.cn` | `login.partner.microsoftonline.cn` | +| **US Gov** | `login.microsoftonline.us`, `login.usgovcloudapi.net` | `login.microsoftonline.us` | `login.microsoftonline.us` | +| **Germany (legacy)** | `login.microsoftonline.de` | `login.microsoftonline.de` | `login.microsoftonline.de` | +| **US (login-us)** | `login-us.microsoftonline.com` | `login-us.microsoftonline.com` | `login-us.microsoftonline.com` | +| **PPE** | `login.windows-ppe.net`, `sts.windows-ppe.net`, `login.microsoft-ppe.com` | `login.windows-ppe.net` | `login.windows-ppe.net` | +| **Bleu (FR)** | `login.sovcloud-identity.fr` | `login.sovcloud-identity.fr` | `login.sovcloud-identity.fr` | +| **Delos (DE)** | `login.sovcloud-identity.de` | `login.sovcloud-identity.de` | `login.sovcloud-identity.de` | +| **GovSG** | `login.sovcloud-identity.sg` | `login.sovcloud-identity.sg` | `login.sovcloud-identity.sg` | + +**Known environment check for the known metadata provider**: The known metadata provider is only usable when **all** environments already present in the token cache are themselves known. If any cached environment is unknown, the known provider must be skipped (because the network may return richer alias data). + +--- + +## 4. Discovery Endpoint Selection + +When a network call is needed, MSAL must choose which host to call: + +| Authority Host | Discovery Endpoint Host | +|---|---| +| Known environment (e.g., `login.microsoftonline.com`) | Same host: `https://{authority_host}/common/discovery/instance` | +| Unknown environment (e.g., `login.microsoft.new`) | Fallback to default trusted host: `https://login.microsoftonline.com/common/discovery/instance` | + +The query parameters are: +``` +GET https://{discovery_host}/common/discovery/instance + ?api-version=1.1 + &authorization_endpoint=https://{authority_host}/{tenant}/oauth2/v2.0/authorize +``` + +--- + +## 5. Provider Resolution Order + +### 5.1 Full Flow (GetMetadataEntryAsync — used during token acquisition) + +Providers are consulted in strict priority order. The first non-null result wins. + +``` +1. [If instance discovery is disabled (WithInstanceDiscovery(false))] + → Check region discovery provider (regions are not affected by the disable flag) + → If still null, return self-entry. STOP. + +2. Region discovery provider + +3. Network call (FetchNetworkMetadataOrFallback) + → On success: cache all entries by alias in the network cache + → On "invalid_instance" error: see §6 + → On any other error (404, 502, network failure, etc.): see §7 + +4. [If still null] Log warning, create self-entry, cache it in network cache +``` + +### 5.2 Cache-Preferring Flow (GetMetadataEntryTryAvoidNetworkAsync — used during token acquisition with cache check) + +This flow tries to avoid network calls when possible: + +``` +1. Region discovery provider + +2. [If instance discovery is disabled] → return self-entry + +3. Network cache (static, populated from prior network calls) + +4. Known metadata provider (hardcoded, but only if all cached environments are known) + +5. Full flow (§5.1) + +6. [If still null] Return self-entry +``` + +### 5.3 Offline Flow (GetMetadataEntryAvoidNetwork — used for GetAccounts/AcquireTokenSilent) + +No network calls are ever made: + +``` +1. [If instance discovery is enabled]: + a. Network cache + b. Known metadata provider (with null existing environments → always usable) + +2. [If still null] Return self-entry +``` + +--- + +## 6. Error Handling: `invalid_instance` + +When the discovery endpoint returns an `invalid_instance` error (the AAD server-side error `AADSTS50049`), this means the authority genuinely does not exist. + +| `ValidateAuthority` setting | Behavior | +|---|---| +| `true` (default) | **Throw** `MsalServiceException` with error code `invalid_instance`. Do NOT cache. | +| `false` | Return a self-entry. Continue without validation. | + +--- + +## 7. Error Handling: All Other Errors (404, 502, network failures, etc.) + +When instance discovery fails with any error other than `invalid_instance`: + +1. **Try the known metadata provider** for the authority host (with empty `existingEnvironmentsInCache` to force lookup). +2. If the known provider has no entry, **create a self-entry**. +3. **Cache the result** in the network cache so that subsequent calls do NOT retry the network call. +4. Return the entry. + +**Critical rule**: The fallback entry MUST be cached. Without caching, every token request retries the failing network call, causing performance degradation and unnecessary traffic. (See [issue #5804](https://github.com/AzureAD/microsoft-authentication-library-for-dotnet/issues/5804).) + +--- + +## 8. Caching Rules + +### 8.1 Network Cache + +- **Scope**: Static / process-wide (shared across all app instances in the same process). +- **Key**: Authority host string (case-sensitive). +- **Population**: + - On successful network response: cache each entry keyed by each of its aliases. + - On network failure (non-`invalid_instance`): cache the fallback entry keyed by the authority host. +- **Eviction**: None. Entries persist for the process lifetime. +- **Thread safety**: Must be thread-safe (concurrent dictionary or equivalent). + +### 8.2 Known Metadata Cache + +- **Scope**: Static / compiled into the library. +- **Immutable**: Never modified at runtime. +- **Guard**: Only usable when all environments in the token cache are themselves known environments. + +--- + +## 9. Interaction with Regions + +- Regional discovery (e.g., `centralus.login.microsoft.com`) runs independently of instance discovery. +- Even when instance discovery is disabled (`WithInstanceDiscovery(false)`), region discovery still runs. +- Regional metadata takes precedence when available (checked before the network call). + +--- + +## 10. Interaction with Authority Validation + +Authority validation is a separate step that runs **after** instance discovery: + +1. Instance discovery resolves metadata (preferred network, aliases). +2. If `ValidateAuthority` is true AND instance discovery is enabled: + - Validate the authority against the OIDC endpoint. + - Cache successful validations by environment. + +--- + +## 11. Validation Test Matrix + +The following test scenarios should be implemented in any MSAL that supports instance discovery. Tests use HTTP mocking (no real network calls). + +### T1: Known Cloud — Discovery Happens on Same Host + +**Setup**: Authority = `https://login.microsoftonline.com/tenant` (or any known cloud host). +**Expected**: +- Instance discovery GET request goes to `https://login.microsoftonline.com/common/discovery/instance`. +- Token request goes to `https://login.microsoftonline.com/tenant/oauth2/v2.0/token`. +- Token is acquired successfully. + +**Test hosts to cover**: `login.microsoftonline.com`, `login.microsoftonline.us`, `login.microsoftonline.de`, `login.partner.microsoftonline.cn`, `login.sovcloud-identity.fr`, `login.sovcloud-identity.de`, `login.sovcloud-identity.sg`. + +### T2: Instance Discovery Disabled — No Network Discovery Call + +**Setup**: Authority = any (known or unknown), `WithInstanceDiscovery(false)`. +**Expected**: +- No GET request to the discovery endpoint. +- Token request goes directly to the configured authority. +- Token is acquired successfully. + +### T3: Unknown Cloud — Discovery Falls Back to Default Trusted Host + +**Setup**: Authority = `https://unknown.host.example/tenant` (not in known list). +**Expected**: +- Instance discovery GET request goes to `https://login.microsoftonline.com/common/discovery/instance` (the default trusted host). +- If discovery succeeds: metadata is cached, token request uses the resolved preferred network. +- If discovery fails (e.g., 404): fallback entry is created and cached, token request goes to the original authority. + +### T4: Discovery Failure (404/502) — Fallback Is Cached, No Retry + +**Setup**: Authority = unknown host. Discovery endpoint returns 404 or 502. +**Expected**: +- First `AcquireTokenForClient`: discovery fails, fallback entry created and cached, token acquired from IdP. +- Second `AcquireTokenForClient` (same scope): token served from cache, no network calls. +- Third `AcquireTokenForClient` (different scope): token acquired from IdP, NO discovery call (fallback is cached). + +**HTTP mocks (in order)**: +1. GET discovery → 404 (or 502) +2. POST token → 200 (success) +3. POST token → 200 (success) — for the different-scope call + +If the SDK makes an unexpected discovery call, the mock framework should fail. + +### T5: Discovery Failure (Network Error / HttpRequestException) — Fallback, No Retry + +**Setup**: Authority = unknown host. Discovery endpoint throws a network-level exception. +**Expected**: Same as T4 — fallback is cached, subsequent calls don't retry. + +### T6: Discovery Failure with `invalid_instance` — Throw (ValidateAuthority=true) + +**Setup**: Authority = unknown host. Discovery endpoint returns `invalid_instance` error. `ValidateAuthority` = true (default). +**Expected**: +- `MsalServiceException` is thrown with error code `invalid_instance`. +- The known metadata provider is NOT consulted as a fallback. + +### T7: Discovery Failure with `invalid_instance` — Proceed (ValidateAuthority=false) + +**Setup**: Authority = unknown host. Discovery endpoint returns `invalid_instance`. `ValidateAuthority` = false. +**Expected**: +- No exception thrown. +- A self-entry is returned (authority used as-is). +- Token is acquired successfully. + +### T8: Known Metadata — Used When All Cache Environments Are Known + +**Setup**: Authority = `login.microsoftonline.com`. Token cache contains entries only for known environments. +**Expected**: +- Known metadata provider returns the hardcoded entry. +- No network discovery call. + +### T9: Known Metadata Bypassed — Unknown Environment in Cache + +**Setup**: Authority = `login.microsoftonline.com`. Token cache contains entries for both known and unknown environments. +**Expected**: +- Known metadata provider is bypassed (returns null). +- Network discovery call is made. + +### T10: B2C / ADFS — Instance Discovery Skipped + +**Setup**: Authority = B2C or ADFS authority. +**Expected**: +- No instance discovery call. +- Self-entry is returned. + +### T11: Airgapped Cloud with Regions — Discovery Disabled + +**Setup**: Authority = unknown host. `WithInstanceDiscovery(false)`. `WithAzureRegion("centralus")`. +**Expected**: +- No instance discovery call. +- Region discovery still runs. +- Token request goes to the regionalized endpoint. + +### T12: Airgapped Cloud with Regions — Discovery Enabled but Fails + +**Setup**: Authority = unknown host. `WithAzureRegion("centralus")`. Instance discovery call throws (e.g., `HttpRequestException`). +**Expected**: +- Instance discovery failure is swallowed. +- Token request still succeeds using the regionalized endpoint. +- No retry of instance discovery on subsequent calls. + +--- + +## 12. Implementation Checklist + +- [ ] Hardcode the known cloud metadata table (§3). +- [ ] Implement the three resolution flows (§5.1, §5.2, §5.3). +- [ ] Implement discovery endpoint host selection (§4). +- [ ] Handle `invalid_instance` separately from other errors (§6 vs §7). +- [ ] Cache fallback entries on non-`invalid_instance` failures (§7 — critical). +- [ ] Use a process-wide static cache for network results (§8.1). +- [ ] Guard known metadata usage by checking all cached environments are known (§8.2). +- [ ] Support `WithInstanceDiscovery(false)` to disable network discovery. +- [ ] Ensure region discovery is independent of instance discovery toggle (§9). +- [ ] Self-entry: `preferred_network = preferred_cache = aliases = [authority_host]` (§2). +- [ ] Implement all tests T1–T12 in §11. diff --git a/src/client/Microsoft.Identity.Client/Instance/Discovery/InstanceDiscoveryManager.cs b/src/client/Microsoft.Identity.Client/Instance/Discovery/InstanceDiscoveryManager.cs index 64ce65928e..07ff3cd65c 100644 --- a/src/client/Microsoft.Identity.Client/Instance/Discovery/InstanceDiscoveryManager.cs +++ b/src/client/Microsoft.Identity.Client/Instance/Discovery/InstanceDiscoveryManager.cs @@ -202,15 +202,24 @@ private async Task FetchNetworkMetadataOrFallbac requestContext.Logger.Error($"[Instance Discovery] Instance discovery failed - invalid instance! "); throw; } - catch (Exception e) + catch (OperationCanceledException) when (requestContext.UserCancellationToken.IsCancellationRequested) + { + requestContext.Logger.Info("[Instance Discovery] Instance discovery was canceled by the caller. "); + throw; + } + catch (Exception e) { requestContext.Logger.Warning( - $"[Instance Discovery] Instance Discovery failed. MSAL will continue without network instance metadata. \n\r" + + $"[Instance Discovery] Instance discovery failed. MSAL will continue using fallback (non network-discovered) instance metadata. \n\r" + $" Exception: {e} "); - - return + + var fallbackEntry = _knownMetadataProvider.GetMetadata(authorityUri.Host, Enumerable.Empty(), requestContext.Logger) - ?? CreateEntryForSingleAuthority(authorityUri); + ?? CreateEntryForSingleAuthority(authorityUri); + + _networkCacheMetadataProvider.AddMetadata(authorityUri.Host, fallbackEntry); + + return fallbackEntry; } } diff --git a/tests/Microsoft.Identity.Test.Unit/PublicApiTests/InstanceDiscoveryTests.cs b/tests/Microsoft.Identity.Test.Unit/PublicApiTests/InstanceDiscoveryTests.cs index 1123837ec6..2902bccacc 100644 --- a/tests/Microsoft.Identity.Test.Unit/PublicApiTests/InstanceDiscoveryTests.cs +++ b/tests/Microsoft.Identity.Test.Unit/PublicApiTests/InstanceDiscoveryTests.cs @@ -24,6 +24,7 @@ using Microsoft.VisualStudio.TestTools.UnitTesting; using NSubstitute; using Microsoft.Identity.Client.Extensibility; +using Microsoft.Identity.Test.Unit.Helpers; using System.Net; using System.Security.Cryptography; using System.Text; @@ -156,5 +157,67 @@ public async Task HttpErrorsInDiscoveryShouldBeIgnored_AirgappedCould_TestAsync( // Assert happens when httpManager disposes and checks for unconsumed handlers } } + + [DataTestMethod] + [DataRow(HttpStatusCode.NotFound, 1, DisplayName = "404 - no retry")] + [DataRow(HttpStatusCode.BadGateway, 2, DisplayName = "502 - 1 retry")] + [WorkItem(5804)] // https://github.com/AzureAD/microsoft-authentication-library-for-dotnet/issues/5804 + public async Task InstanceDiscoveryFailure_IsCached_NotRetriedOnSubsequentCalls_Async( + HttpStatusCode errorStatusCode, int expectedDiscoveryCalls) + { + using (var httpManager = new MockHttpManager()) + { + // Arrange - use an authority unknown to MSAL so instance discovery goes to the network + var app = ConfidentialClientApplicationBuilder.Create(TestConstants.ClientId) + .WithAuthority(TestConstants.AuthorityNotKnownTenanted) + .WithClientSecret(TestConstants.ClientSecret) + .WithHttpManager(httpManager) + .WithRetryPolicyFactory(new TestRetryPolicyFactory()) + .BuildConcrete(); + + // First call: instance discovery returns an HTTP error, then token endpoint succeeds. + // 404 is not retried; 502 (5xx) is retried once by the STS retry policy. + for (int i = 0; i < expectedDiscoveryCalls; i++) + { + httpManager.AddMockHandler(new MockHttpMessageHandler() + { + ExpectedMethod = HttpMethod.Get, + ResponseMessage = new HttpResponseMessage(errorStatusCode) + { + Content = new StringContent("error") + } + }); + } + httpManager.AddMockHandlerSuccessfulClientCredentialTokenResponseMessage(); + + var result1 = await app.AcquireTokenForClient(TestConstants.s_scope.ToArray()) + .ExecuteAsync(CancellationToken.None) + .ConfigureAwait(false); + + Assert.AreEqual(TokenSource.IdentityProvider, result1.AuthenticationResultMetadata.TokenSource); + + // Second call with the same scope - should come from cache, no network calls at all. + // No mock handlers added - if any network call happens, the test will fail. + var result2 = await app.AcquireTokenForClient(TestConstants.s_scope.ToArray()) + .ExecuteAsync(CancellationToken.None) + .ConfigureAwait(false); + + Assert.AreEqual(TokenSource.Cache, result2.AuthenticationResultMetadata.TokenSource); + + // Third call with a different scope to force a new token request from the STS. + // Only mock the token endpoint - NO instance discovery mock! + // If instance discovery were retried, the test would fail because + // MockHttpManager would receive an unexpected HTTP call. + httpManager.AddMockHandlerSuccessfulClientCredentialTokenResponseMessage(); + + var result3 = await app.AcquireTokenForClient(TestConstants.s_scopeForAnotherResource.ToArray()) + .ExecuteAsync(CancellationToken.None) + .ConfigureAwait(false); + + Assert.AreEqual(TokenSource.IdentityProvider, result3.AuthenticationResultMetadata.TokenSource); + + // MockHttpManager.Dispose() asserts all handlers were consumed and no extra calls were made + } + } } }