Skip to content

Commit fc2a960

Browse files
dunglas and claude committed
spec: conform to latest WHATWG URL Pattern WPT test data
Refresh testdata/urlpatterntestdata.json from the current WPT snapshot (365 cases, up from 313) and align the implementation with the new expectations:

- createComponentMatchResult now bounds iteration by the group name list length so user-provided regex groups (e.g. :foo((?<x>a))) no longer index past the name list and panic. Groups without a name are simply not exposed on result.Groups, matching WPT test 364.
- canonicalizeHostname: drop the broad reject-on-'/ ? # \\' guard. The URL parser now canonicalizes those boundary chars correctly; keep only the ':' check since without it "bad:hostname" would be silently split into host + port.
- hostnameParser: drop WithFailOnValidationError so tab/LF/CR in a hostname are stripped per the WHATWG URL spec instead of being rejected with an error.
- canonicalizePort: rewritten to
  1. skip leading ASCII tab / LF / CR and require the first significant byte to be a digit (to reject "invalid80"),
  2. always parse against a non-special scheme when no protocol is supplied so the library doesn't collapse default ports like 80/443 to empty,
  3. remove the former "portValue != canonicalized" rejection which also rejected valid canonicalizations like "8\t0" -> "80" and "80x" -> "80" (state-override truncation).

All 365 WPT cases pass (2 skipped for advanced unicode features Go's regexp engine does not support).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent b0b64c3 commit fc2a960

3 files changed

Lines changed: 347 additions & 37 deletions

File tree

parser.go

Lines changed: 32 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ var DefaultPorts = map[string]string{
2727
}
2828

2929
var urlParser = url.NewParser()
30-
var hostnameParser = canonicalizer.New(url.WithFailOnValidationError(), canonicalizer.WithDefaultScheme("http"))
30+
var hostnameParser = canonicalizer.New(canonicalizer.WithDefaultScheme("http"))
3131

3232
var (
3333
NonEmptySuffixError = errors.New("suffix must be the empty string")
@@ -420,10 +420,12 @@ func canonicalizeHostname(hostnameValue, protocolValue string) (string, error) {
420420
return hostnameValue, nil
421421
}
422422

423-
// Dirty workaround for https://github.com/whatwg/urlpattern/issues/206
424-
if hostnameValue[:1] != "[" {
423+
// Non-IPv6 hostnames must not contain ':': without this guard, the URL
424+
// parser would split "host:port" into host and port, silently accepting
425+
// patterns like "bad:hostname" as a plain hostname.
426+
if hostnameValue[0] != '[' {
425427
for _, c := range hostnameValue {
426-
if c == '/' || c == '?' || c == '#' || c == ':' || c == '\\' {
428+
if c == ':' {
427429
return "", errors.New("invalid hostname")
428430
}
429431
}
@@ -462,38 +464,41 @@ func canonicalizePort(portValue, protocolValue string) (string, error) {
462464
return portValue, nil
463465
}
464466

465-
var (
466-
u *url.Url
467-
err error
468-
)
469-
470-
if protocolValue == "" {
471-
u = hostnameParser.NewUrl()
472-
} else {
473-
u, err = hostnameParser.Parse(protocolValue + "://dummy.test")
474-
if err != nil {
475-
return "", err
467+
// The WHATWG basic URL parser strips ASCII tab / LF / CR before examining the
468+
// first code point, so reject inputs whose first significant byte is not
469+
// an ASCII digit (e.g. "invalid80"). Without this the URL library returns
470+
// an empty port instead of failing.
471+
firstDigit := false
472+
for i := 0; i < len(portValue); i++ {
473+
c := portValue[i]
474+
if c == '\t' || c == '\n' || c == '\r' {
475+
continue
476476
}
477+
firstDigit = c >= '0' && c <= '9'
478+
break
479+
}
480+
if !firstDigit {
481+
return "", InvalidPortError
477482
}
478483

479-
u, err = hostnameParser.BasicParser(portValue, nil, u, url.StatePort)
484+
scheme := protocolValue
485+
if scheme == "" {
486+
// Use a non-special scheme so the URL parser does not treat a
487+
// well-known default port (http/80, https/443, ...) as empty.
488+
scheme = "urlpattern-non-special"
489+
}
490+
491+
u, err := urlParser.Parse(scheme + "://dummy.test")
480492
if err != nil {
481493
return "", err
482494
}
483495

484-
p := u.Port()
485-
486-
// This looks like a bug in the spec ("80 " should be considered valid), but there is a test covering this
487-
// Another dirty workaround
488-
if p != portValue {
489-
if dp, ok := DefaultPorts[protocolValue]; ok && portValue == dp {
490-
return p, nil
491-
}
492-
493-
return "", InvalidPortError
496+
u, err = urlParser.BasicParser(portValue, nil, u, url.StatePort)
497+
if err != nil {
498+
return "", err
494499
}
495500

496-
return p, nil
501+
return u.Port(), nil
497502
}
498503

499504
// https://urlpattern.spec.whatwg.org/#canonicalize-a-pathname

0 commit comments

Comments
 (0)