Skip to content

Commit 0d244f4

Browse files
fix(code-hosting): recognize SSH repository hosts with userinfo
Use parsed hostnames instead of raw netloc values when matching configured code-hosting domains. This keeps ssh://git@host and explicit-port repository URLs on the supported code-hosting path while preserving git clone handling for SSH repository sources.

Add focused regression coverage for SSH URL userinfo and explicit-port repository URL helpers.

Signed-off-by: Asish Kumar <officialasishkumar@gmail.com>
1 parent f7cab2d commit 0d244f4

5 files changed

Lines changed: 84 additions & 20 deletions

File tree

openviking/parse/accessors/git_accessor.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from openviking.utils import is_github_url, is_gitlab_url, parse_code_hosting_url
2222
from openviking.utils.code_hosting_utils import (
23+
_domain_matches,
2324
is_code_hosting_url,
2425
is_git_repo_url,
2526
validate_git_ssh_uri,
@@ -275,10 +276,7 @@ def _normalize_repo_url(self, url: str) -> str:
275276
base_parts = path_parts[: git_index + 1]
276277

277278
config = get_openviking_config()
278-
if (
279-
parsed.netloc in config.code.github_domains + config.code.gitlab_domains
280-
and len(path_parts) >= 2
281-
):
279+
if _domain_matches(parsed, config.code.github_domains + config.code.gitlab_domains):
282280
base_parts = path_parts[:2]
283281
base_path = "/" + "/".join(base_parts)
284282
return parsed._replace(path=base_path, query="", fragment="").geturl()

openviking/parse/parsers/code/code.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
)
3939
from openviking.parse.parsers.upload_utils import upload_directory
4040
from openviking.utils import is_github_url, parse_code_hosting_url
41+
from openviking.utils.code_hosting_utils import _domain_matches
4142
from openviking_cli.utils.config import get_openviking_config
4243
from openviking_cli.utils.logger import get_logger
4344

@@ -291,10 +292,7 @@ def _normalize_repo_url(self, url: str) -> str:
291292
base_parts = path_parts[: git_index + 1]
292293

293294
config = get_openviking_config()
294-
if (
295-
parsed.netloc in config.code.github_domains + config.code.gitlab_domains
296-
and len(path_parts) >= 2
297-
):
295+
if _domain_matches(parsed, config.code.github_domains + config.code.gitlab_domains):
298296
base_parts = path_parts[:2]
299297
base_path = "/" + "/".join(base_parts)
300298
return parsed._replace(path=base_path, query="", fragment="").geturl()

openviking/utils/code_hosting_utils.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,37 @@
88
"""
99

1010
from typing import Optional
11-
from urllib.parse import urlparse
11+
from urllib.parse import ParseResult, urlparse
1212

1313
from openviking_cli.utils.config import get_openviking_config
1414

1515

16+
def _domain_matches(parsed: ParseResult, domains: list[str]) -> bool:
17+
"""Return True when parsed URL host matches configured domains.
18+
19+
``urlparse().netloc`` includes optional userinfo and port values. Repository
20+
clone URLs commonly use forms like ``ssh://git@github.com/org/repo.git``,
21+
where the netloc is ``git@github.com`` but the actual host is
22+
``github.com``.
23+
"""
24+
hostname = parsed.hostname
25+
if not hostname:
26+
return False
27+
28+
normalized_domains = {domain.lower() for domain in domains}
29+
host = hostname.lower()
30+
candidates = {host}
31+
32+
try:
33+
port = parsed.port
34+
except ValueError:
35+
port = None
36+
if port is not None:
37+
candidates.add(f"{host}:{port}")
38+
39+
return any(candidate in normalized_domains for candidate in candidates)
40+
41+
1642
def _extract_host(url: str) -> str:
1743
"""Extract normalized host for supported git/code-hosting URL forms."""
1844
if url.startswith("git@"):
@@ -44,7 +70,6 @@ def parse_code_hosting_url(url: str) -> Optional[str]:
4470
+ config.code.code_hosting_domains
4571
)
4672
)
47-
host = _extract_host(url)
4873

4974
# Handle git@ SSH URLs: git@host:org/repo.git
5075
if url.startswith("git@"):
@@ -73,7 +98,7 @@ def parse_code_hosting_url(url: str) -> Optional[str]:
7398

7499
# For GitHub/GitLab URLs with org/repo structure
75100
if (
76-
host in config.code.github_domains + config.code.gitlab_domains
101+
_domain_matches(parsed, config.code.github_domains + config.code.gitlab_domains)
77102
and len(path_parts) >= 2
78103
):
79104
# Take first two parts: org/repo
@@ -140,7 +165,7 @@ def is_code_hosting_url(url: str) -> bool:
140165
host_part = url[4:].split(":", 1)[0]
141166
return host_part in all_domains
142167

143-
return _extract_host(url) in all_domains
168+
return _domain_matches(urlparse(url), all_domains)
144169

145170

146171
def validate_git_ssh_uri(url: str) -> None:
@@ -186,7 +211,7 @@ def is_git_repo_url(url: str) -> bool:
186211
)
187212
)
188213
parsed = urlparse(url)
189-
if _extract_host(url) not in all_domains:
214+
if not _domain_matches(parsed, all_domains):
190215
return False
191216
path_parts = [p for p in parsed.path.split("/") if p]
192217
# Strip .git suffix from last part for counting

tests/test_code_hosting_utils.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ def _mock_config():
4343
_spec.loader.exec_module(_module)
4444

4545
parse_code_hosting_url = _module.parse_code_hosting_url
46+
is_github_url = _module.is_github_url
47+
is_gitlab_url = _module.is_gitlab_url
4648
is_code_hosting_url = _module.is_code_hosting_url
4749
is_git_repo_url = _module.is_git_repo_url
4850
validate_git_ssh_uri = _module.validate_git_ssh_uri
@@ -81,12 +83,16 @@ def test_parse_code_hosting_url_https_dotgit():
8183
assert parse_code_hosting_url("https://github.com/org/repo.git") == "org/repo"
8284

8385

84-
def test_parse_code_hosting_url_ssh_with_userinfo():
86+
def test_parse_code_hosting_url_ssh_url_with_userinfo():
8587
assert parse_code_hosting_url("ssh://git@github.com/org/repo.git") == "org/repo"
8688

8789

88-
def test_parse_code_hosting_url_https_with_explicit_port():
89-
assert parse_code_hosting_url("https://github.com:443/org/repo.git") == "org/repo"
90+
def test_parse_code_hosting_url_gitlab_ssh_url_with_userinfo():
91+
assert parse_code_hosting_url("ssh://git@gitlab.com/group/repo.git") == "group/repo"
92+
93+
94+
def test_parse_code_hosting_url_https_with_port():
95+
assert parse_code_hosting_url("https://github.com:443/org/repo") == "org/repo"
9096

9197

9298
# --- validate_git_ssh_uri ---
@@ -126,14 +132,25 @@ def test_is_code_hosting_url_https():
126132
assert is_code_hosting_url("https://github.com/org/repo") is True
127133

128134

129-
def test_is_code_hosting_url_ssh_with_userinfo():
135+
def test_is_code_hosting_url_ssh_url_with_userinfo():
130136
assert is_code_hosting_url("ssh://git@github.com/org/repo.git") is True
131137

132138

133-
def test_is_code_hosting_url_https_with_explicit_port():
139+
def test_is_code_hosting_url_https_with_port():
134140
assert is_code_hosting_url("https://github.com:443/org/repo") is True
135141

136142

143+
# --- is_github_url / is_gitlab_url ---
144+
145+
146+
def test_is_github_url_ssh_url_with_userinfo():
147+
assert is_github_url("ssh://git@github.com/org/repo.git") is True
148+
149+
150+
def test_is_gitlab_url_ssh_url_with_userinfo():
151+
assert is_gitlab_url("ssh://git@gitlab.com/group/repo.git") is True
152+
153+
137154
# --- is_git_repo_url ---
138155

139156

@@ -145,11 +162,11 @@ def test_is_git_repo_url_https_repo():
145162
assert is_git_repo_url("https://github.com/org/repo") is True
146163

147164

148-
def test_is_git_repo_url_ssh_with_userinfo():
165+
def test_is_git_repo_url_ssh_url_with_userinfo():
149166
assert is_git_repo_url("ssh://git@github.com/org/repo.git") is True
150167

151168

152-
def test_is_git_repo_url_https_with_explicit_port():
169+
def test_is_git_repo_url_https_with_port():
153170
assert is_git_repo_url("https://github.com:443/org/repo") is True
154171

155172

tests/unit/test_accessors_git.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,29 @@
33
"""Unit tests for GitAccessor."""
44

55
from pathlib import Path
6+
from types import SimpleNamespace
7+
from unittest.mock import patch
68

79
import pytest
810

911
from openviking.parse.accessors import GitAccessor
12+
from openviking.utils import code_hosting_utils
13+
14+
15+
def _mock_config():
16+
return SimpleNamespace(
17+
code=SimpleNamespace(
18+
github_domains=["github.com", "www.github.com"],
19+
gitlab_domains=["gitlab.com", "www.gitlab.com"],
20+
code_hosting_domains=["github.com", "gitlab.com"],
21+
)
22+
)
23+
24+
25+
@pytest.fixture(autouse=True)
26+
def _patch_config():
27+
with patch.object(code_hosting_utils, "get_openviking_config", side_effect=_mock_config):
28+
yield
1029

1130

1231
class TestGitAccessor:
@@ -60,6 +79,13 @@ def test_can_handle_git_protocol_url(self, accessor: GitAccessor) -> None:
6079
"""GitAccessor should handle git:// URLs."""
6180
assert accessor.can_handle("git://github.com/volcengine/OpenViking.git") is True
6281

82+
def test_normalize_repo_url_ssh_with_userinfo_and_ref(self, accessor: GitAccessor) -> None:
83+
"""GitAccessor should normalize ssh URLs with userinfo using the shared host matcher."""
84+
assert (
85+
accessor._normalize_repo_url("ssh://git@github.com:443/volcengine/OpenViking/tree/main")
86+
== "ssh://git@github.com:443/volcengine/OpenViking"
87+
)
88+
6389
@pytest.mark.parametrize(
6490
"source",
6591
[

0 commit comments

Comments
 (0)