Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions openviking/parse/accessors/git_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from openviking.utils import is_github_url, is_gitlab_url, parse_code_hosting_url
from openviking.utils.code_hosting_utils import (
_domain_matches,
is_code_hosting_url,
is_git_repo_url,
validate_git_ssh_uri,
Expand Down Expand Up @@ -275,10 +276,7 @@ def _normalize_repo_url(self, url: str) -> str:
base_parts = path_parts[: git_index + 1]

config = get_openviking_config()
if (
parsed.netloc in config.code.github_domains + config.code.gitlab_domains
and len(path_parts) >= 2
):
if _domain_matches(parsed, config.code.github_domains + config.code.gitlab_domains):
base_parts = path_parts[:2]
base_path = "/" + "/".join(base_parts)
return parsed._replace(path=base_path, query="", fragment="").geturl()
Expand Down
6 changes: 2 additions & 4 deletions openviking/parse/parsers/code/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
)
from openviking.parse.parsers.upload_utils import upload_directory
from openviking.utils import is_github_url, parse_code_hosting_url
from openviking.utils.code_hosting_utils import _domain_matches
from openviking_cli.utils.config import get_openviking_config
from openviking_cli.utils.logger import get_logger

Expand Down Expand Up @@ -291,10 +292,7 @@ def _normalize_repo_url(self, url: str) -> str:
base_parts = path_parts[: git_index + 1]

config = get_openviking_config()
if (
parsed.netloc in config.code.github_domains + config.code.gitlab_domains
and len(path_parts) >= 2
):
if _domain_matches(parsed, config.code.github_domains + config.code.gitlab_domains):
base_parts = path_parts[:2]
base_path = "/" + "/".join(base_parts)
return parsed._replace(path=base_path, query="", fragment="").geturl()
Expand Down
35 changes: 30 additions & 5 deletions openviking/utils/code_hosting_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,37 @@
"""

from typing import Optional
from urllib.parse import urlparse
from urllib.parse import ParseResult, urlparse

from openviking_cli.utils.config import get_openviking_config


def _domain_matches(parsed: ParseResult, domains: list[str]) -> bool:
    """Return True when the parsed URL's host matches one of ``domains``.

    ``urlparse().netloc`` includes optional userinfo and port values. Repository
    clone URLs commonly use forms like ``ssh://git@github.com/org/repo.git``,
    where the netloc is ``git@github.com`` but the actual host is
    ``github.com``, so the comparison is done on ``parsed.hostname``.

    Matching is case-insensitive and ignores a single trailing DNS root dot on
    either side (``github.com.`` names the same host as ``github.com``). When
    the URL carries a valid explicit port, ``host:port`` is tried as well so
    that configured entries written with a port still match.

    Args:
        parsed: Result of ``urllib.parse.urlparse`` for the URL under test.
        domains: Configured domain names, optionally in ``host:port`` form.

    Returns:
        True if the host (or ``host:port``) equals a configured domain; False
        otherwise, including when the URL has no host component.
    """

    def _normalize(name: str) -> str:
        # Lower-case and drop one trailing root dot of an absolute FQDN.
        lowered = name.lower()
        return lowered[:-1] if lowered.endswith(".") else lowered

    hostname = parsed.hostname
    if not hostname:
        return False

    host = _normalize(hostname)
    if not host:
        # The host consisted solely of the root dot.
        return False

    allowed = {_normalize(domain) for domain in domains}
    if host in allowed:
        return True

    try:
        port = parsed.port
    except ValueError:
        # Malformed or out-of-range port: only the bare host could have
        # matched, and it did not.
        return False
    return port is not None and f"{host}:{port}" in allowed


def _extract_host(url: str) -> str:
"""Extract normalized host for supported git/code-hosting URL forms."""
if url.startswith("git@"):
Expand Down Expand Up @@ -44,7 +70,6 @@ def parse_code_hosting_url(url: str) -> Optional[str]:
+ config.code.code_hosting_domains
)
)
host = _extract_host(url)

# Handle git@ SSH URLs: git@host:org/repo.git
if url.startswith("git@"):
Expand Down Expand Up @@ -73,7 +98,7 @@ def parse_code_hosting_url(url: str) -> Optional[str]:

# For GitHub/GitLab URLs with org/repo structure
if (
host in config.code.github_domains + config.code.gitlab_domains
_domain_matches(parsed, config.code.github_domains + config.code.gitlab_domains)
and len(path_parts) >= 2
):
# Take first two parts: org/repo
Expand Down Expand Up @@ -140,7 +165,7 @@ def is_code_hosting_url(url: str) -> bool:
host_part = url[4:].split(":", 1)[0]
return host_part in all_domains

return _extract_host(url) in all_domains
return _domain_matches(urlparse(url), all_domains)


def validate_git_ssh_uri(url: str) -> None:
Expand Down Expand Up @@ -186,7 +211,7 @@ def is_git_repo_url(url: str) -> bool:
)
)
parsed = urlparse(url)
if _extract_host(url) not in all_domains:
if not _domain_matches(parsed, all_domains):
return False
path_parts = [p for p in parsed.path.split("/") if p]
# Strip .git suffix from last part for counting
Expand Down
31 changes: 24 additions & 7 deletions tests/test_code_hosting_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def _mock_config():
_spec.loader.exec_module(_module)

parse_code_hosting_url = _module.parse_code_hosting_url
is_github_url = _module.is_github_url
is_gitlab_url = _module.is_gitlab_url
is_code_hosting_url = _module.is_code_hosting_url
is_git_repo_url = _module.is_git_repo_url
validate_git_ssh_uri = _module.validate_git_ssh_uri
Expand Down Expand Up @@ -81,12 +83,16 @@ def test_parse_code_hosting_url_https_dotgit():
assert parse_code_hosting_url("https://github.com/org/repo.git") == "org/repo"


def test_parse_code_hosting_url_ssh_with_userinfo():
def test_parse_code_hosting_url_ssh_url_with_userinfo():
    """An ssh:// GitHub URL with ``git@`` userinfo parses to ``org/repo``."""
    url = "ssh://git@github.com/org/repo.git"
    assert parse_code_hosting_url(url) == "org/repo"


def test_parse_code_hosting_url_https_with_explicit_port():
    """An https GitHub URL with port 443 and a ``.git`` suffix parses to ``org/repo``."""
    url = "https://github.com:443/org/repo.git"
    assert parse_code_hosting_url(url) == "org/repo"
def test_parse_code_hosting_url_gitlab_ssh_url_with_userinfo():
    """An ssh:// GitLab URL with ``git@`` userinfo parses to ``group/repo``."""
    expected = "group/repo"
    assert parse_code_hosting_url("ssh://git@gitlab.com/group/repo.git") == expected


def test_parse_code_hosting_url_https_with_port():
    """An https GitHub URL with an explicit port 443 parses to ``org/repo``."""
    url = "https://github.com:443/org/repo"
    assert parse_code_hosting_url(url) == "org/repo"


# --- validate_git_ssh_uri ---
Expand Down Expand Up @@ -126,14 +132,25 @@ def test_is_code_hosting_url_https():
assert is_code_hosting_url("https://github.com/org/repo") is True


def test_is_code_hosting_url_ssh_with_userinfo():
def test_is_code_hosting_url_ssh_url_with_userinfo():
    """An ssh:// GitHub URL with ``git@`` userinfo is a code-hosting URL."""
    url = "ssh://git@github.com/org/repo.git"
    assert is_code_hosting_url(url) is True


def test_is_code_hosting_url_https_with_explicit_port():
def test_is_code_hosting_url_https_with_port():
    """An https GitHub URL with an explicit port 443 is a code-hosting URL."""
    url = "https://github.com:443/org/repo"
    assert is_code_hosting_url(url) is True


# --- is_github_url / is_gitlab_url ---


def test_is_github_url_ssh_url_with_userinfo():
    """An ssh:// URL with ``git@`` userinfo on github.com is a GitHub URL."""
    url = "ssh://git@github.com/org/repo.git"
    assert is_github_url(url) is True


def test_is_gitlab_url_ssh_url_with_userinfo():
    """An ssh:// URL with ``git@`` userinfo on gitlab.com is a GitLab URL."""
    url = "ssh://git@gitlab.com/group/repo.git"
    assert is_gitlab_url(url) is True


# --- is_git_repo_url ---


Expand All @@ -145,11 +162,11 @@ def test_is_git_repo_url_https_repo():
assert is_git_repo_url("https://github.com/org/repo") is True


def test_is_git_repo_url_ssh_with_userinfo():
def test_is_git_repo_url_ssh_url_with_userinfo():
    """An ssh:// GitHub URL with ``git@`` userinfo is a git repository URL."""
    url = "ssh://git@github.com/org/repo.git"
    assert is_git_repo_url(url) is True


def test_is_git_repo_url_https_with_explicit_port():
def test_is_git_repo_url_https_with_port():
    """An https GitHub URL with an explicit port 443 is a git repository URL."""
    url = "https://github.com:443/org/repo"
    assert is_git_repo_url(url) is True


Expand Down
26 changes: 26 additions & 0 deletions tests/unit/test_accessors_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,29 @@
"""Unit tests for GitAccessor."""

from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch

import pytest

from openviking.parse.accessors import GitAccessor
from openviking.utils import code_hosting_utils


def _mock_config():
    """Build a minimal stand-in config object for these tests.

    Only a ``code`` section is populated, holding the GitHub, GitLab and
    generic code-hosting domain lists.
    """
    code_section = SimpleNamespace(
        github_domains=["github.com", "www.github.com"],
        gitlab_domains=["gitlab.com", "www.gitlab.com"],
        code_hosting_domains=["github.com", "gitlab.com"],
    )
    return SimpleNamespace(code=code_section)


@pytest.fixture(autouse=True)
def _patch_config():
    """Replace ``code_hosting_utils.get_openviking_config`` for every test.

    While the fixture is active, calls to the patched attribute are routed to
    ``_mock_config``; the patch is undone when the test finishes.
    """
    config_patch = patch.object(
        code_hosting_utils, "get_openviking_config", side_effect=_mock_config
    )
    with config_patch:
        yield


class TestGitAccessor:
Expand Down Expand Up @@ -60,6 +79,13 @@ def test_can_handle_git_protocol_url(self, accessor: GitAccessor) -> None:
"""GitAccessor should handle git:// URLs."""
assert accessor.can_handle("git://github.com/volcengine/OpenViking.git") is True

def test_normalize_repo_url_ssh_with_userinfo_and_ref(self, accessor: GitAccessor) -> None:
    """An ssh URL with userinfo, port and a ``tree/<ref>`` suffix is cut back to org/repo."""
    source = "ssh://git@github.com:443/volcengine/OpenViking/tree/main"
    expected = "ssh://git@github.com:443/volcengine/OpenViking"
    assert accessor._normalize_repo_url(source) == expected

@pytest.mark.parametrize(
"source",
[
Expand Down
Loading