Skip to content

Commit 9ad9515

Browse files
Fixes open-metadata#24348: Strip URL scheme from hostPort to prevent ValueError
1 parent 56dd723 commit 9ad9515

4 files changed

Lines changed: 187 additions & 10 deletions

File tree

ingestion/src/metadata/ingestion/models/custom_pydantic.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,17 +42,30 @@ class BaseModel(PydanticBaseModel):
4242

4343
def model_post_init(self, context: Any, /):
4444
"""
45-
This function is used to parse the FilterPattern fields for the Connection classes.
46-
This is needed because dict is defined in the JSON schema for the FilterPattern field,
47-
but a FilterPattern object is required in the generated code.
45+
Post-init hook for Connection classes:
46+
- Sanitises ``hostPort`` by stripping accidental URL scheme prefixes.
47+
- Converts raw ``dict`` values into ``FilterPattern`` objects.
4848
"""
4949
# pylint: disable=import-outside-toplevel
50+
if not self.__class__.__name__.endswith("Connection"):
51+
return
52+
if not hasattr(self, "__pydantic_fields__"):
53+
return
54+
55+
if "hostPort" in self.__pydantic_fields__:
56+
raw = getattr(self, "hostPort", None)
57+
if isinstance(raw, str) and "://" in raw:
58+
try:
59+
from metadata.utils.db_utils import clean_host_port
60+
61+
object.__setattr__(self, "hostPort", clean_host_port(raw))
62+
except Exception:
63+
logger.warning(
64+
"Failed to clean hostPort '%s'; leaving as-is",
65+
raw[:50],
66+
)
67+
5068
try:
51-
if not self.__class__.__name__.endswith("Connection"):
52-
# Only parse FilterPattern for Connection classes
53-
return
54-
if not hasattr(self, "__pydantic_fields__"):
55-
return
5669
for field in self.__pydantic_fields__:
5770
if field.endswith("FilterPattern"):
5871
from metadata.generated.schema.type.filterPattern import (

ingestion/src/metadata/utils/db_utils.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@
1212
"""
1313
Helpers module for db sources
1414
"""
15+
1516
import time
1617
import traceback
1718
from typing import Iterable, List, Union
19+
from urllib.parse import urlparse
1820

1921
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
2022
from metadata.generated.schema.entity.data.table import Table
@@ -43,12 +45,63 @@
4345
PUBLIC_SCHEMA = "public"
4446

4547

48+
def clean_host_port(host_port: str) -> str:
    """
    Strip URL scheme prefixes from a hostPort string.

    Users sometimes enter a full URL (e.g. 'http://localhost:3306')
    instead of just 'localhost:3306'. This strips the scheme to avoid
    ValueError when parsing host and port.

    Args:
        host_port: raw value, either 'hostname[:port]' or a URL-like string.

    Returns:
        'hostname[:port]' with any scheme, userinfo, path, query and
        fragment removed. IPv6 hosts are returned wrapped in brackets.
        Values without '://' are returned as-is, minus surrounding
        whitespace and trailing slashes.

    Raises:
        ValueError: if the value has a scheme but its authority part
            cannot be parsed (non-numeric or out-of-range port,
            unbalanced IPv6 brackets).
    """
    host_port = host_port.strip()
    if "://" not in host_port:
        return host_port.rstrip("/")

    format_hint = "Expected format is 'hostname[:port]' (e.g. 'localhost:3306')."

    # urlparse itself raises ValueError on malformed bracketed hosts, so it
    # must be guarded to surface the same actionable message as a bad port.
    try:
        parsed = urlparse(host_port)
    except ValueError as exc:
        raise ValueError(f"Invalid hostPort 'URL with scheme'. {format_hint}") from exc

    hostname = parsed.hostname or ""
    # Only scheme and hostname are ever logged or put in error messages, so
    # credentials embedded in the URL are not leaked.
    safe_label = (
        f"{parsed.scheme}://{hostname}"
        if parsed.scheme and hostname
        else "URL with scheme"
    )
    logger.warning(
        "The hostPort '%s' contains a URL scheme. "
        "Expected format is 'hostname[:port]' (e.g. 'localhost:3306'). "
        "Stripping the scheme prefix.",
        safe_label,
    )
    try:
        port = parsed.port
    except ValueError as exc:
        raise ValueError(f"Invalid hostPort '{safe_label}'. {format_hint}") from exc

    if not hostname:
        # urlparse couldn't extract hostname (e.g. jdbc:postgresql://host:5432).
        # The authority starts right after the FIRST '://'; a later '://' can
        # only belong to the path, query or fragment and must not be used as
        # the cut point.
        authority = host_port.split("://", 1)[1]
        for separator in "/?#":
            authority = authority.split(separator, 1)[0]
        # Drop 'user:password@' userinfo if present.
        return authority.rsplit("@", 1)[-1]

    # parsed.hostname drops the IPv6 brackets; restore them so the host can
    # still be told apart from the ':port' suffix.
    host = f"[{hostname}]" if ":" in hostname else hostname
    return f"{host}:{port}" if port is not None else host
94+
95+
4696
def get_host_from_host_port(uri: str) -> str:
    """
    Return only the host part of a hostPort value.

    The value is first normalised with clean_host_port, so
    "localhost:9000" and "http://localhost:9000" both give "localhost".
    A bracketed IPv6 host such as "[::1]:3306" is returned with its
    brackets, i.e. "[::1]".
    """
    host_port = clean_host_port(uri)
    if host_port.startswith("["):
        # Bracketed IPv6 literal: keep everything up to the closing bracket,
        # since the address itself contains ':' characters.
        bracketed, _, _ = host_port.partition("]")
        return bracketed + "]"
    host, _, _ = host_port.partition(":")
    return host
52105

53106

54107
# pylint: disable=too-many-locals

ingestion/tests/unit/test_build_connection_url.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,34 @@ def test_get_connection_url_mysql(self):
9797
"mysql+pymysql://openmetadata_user:mocked_token@localhost:3306/openmetadata_db",
9898
)
9999

100+
def test_get_connection_url_mysql_with_url_scheme(self):
    """A MySQL hostPort carrying an http:// prefix is cleaned before the URL is built"""
    expected_url = "mysql+pymysql://openmetadata_user:openmetadata_password@localhost:3306/openmetadata_db"
    config = MysqlConnectionConfig(
        username="openmetadata_user",
        authType=BasicAuth(password="openmetadata_password"),
        hostPort="http://localhost:3306",
        databaseSchema="openmetadata_db",
    )
    engine = MySQLConnection(config).client
    rendered_url = engine.url.render_as_string(hide_password=False)
    self.assertEqual(rendered_url, expected_url)
113+
114+
def test_get_connection_url_postgres_with_url_scheme(self):
    """A Postgres hostPort carrying an https:// prefix is cleaned before the URL is built"""
    expected_url = "postgresql+psycopg2://openmetadata_user:openmetadata_password@localhost:5432/openmetadata_db"
    config = PostgresConnectionConfig(
        username="openmetadata_user",
        authType=BasicAuth(password="openmetadata_password"),
        hostPort="https://localhost:5432",
        database="openmetadata_db",
    )
    engine = PostgresConnection(config).client
    rendered_url = engine.url.render_as_string(hide_password=False)
    self.assertEqual(rendered_url, expected_url)
127+
100128
def test_get_connection_url_postgres(self):
101129
connection = PostgresConnectionConfig(
102130
username="openmetadata_user",

ingestion/tests/unit/test_db_utils.py

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"""
1313
Unit tests for db_utils module
1414
"""
15+
1516
import uuid
1617
from copy import deepcopy
1718
from unittest import TestCase
@@ -36,7 +37,11 @@
3637
from metadata.ingestion.lineage.models import Dialect
3738
from metadata.ingestion.lineage.sql_lineage import search_cache
3839
from metadata.ingestion.source.models import TableView
39-
from metadata.utils.db_utils import get_host_from_host_port, get_view_lineage
40+
from metadata.utils.db_utils import (
41+
clean_host_port,
42+
get_host_from_host_port,
43+
get_view_lineage,
44+
)
4045

4146

4247
# Mock LineageTable class to simulate collate_sqllineage.core.models.Table
@@ -118,6 +123,84 @@ def test_get_host_from_host_port(self):
118123
self.assertEqual(get_host_from_host_port("localhost"), "localhost")
119124
self.assertEqual(get_host_from_host_port("example.com"), "example.com")
120125

126+
# Test with URL scheme prefixes
127+
self.assertEqual(get_host_from_host_port("http://localhost:3306"), "localhost")
128+
self.assertEqual(
129+
get_host_from_host_port("https://example.com:5432"), "example.com"
130+
)
131+
self.assertEqual(get_host_from_host_port("http://localhost"), "localhost")
132+
133+
# Test with IPv6 addresses
134+
self.assertEqual(get_host_from_host_port("http://[::1]:3306"), "[::1]")
135+
self.assertEqual(get_host_from_host_port("[::1]:3306"), "[::1]")
136+
137+
def test_clean_host_port(self):
    """clean_host_port returns a bare 'hostname[:port]' for every supported input shape"""
    # (raw input, expected cleaned value), checked in order.
    cases = [
        # Already-clean values pass through unchanged
        ("localhost:3306", "localhost:3306"),
        ("127.0.0.1:5432", "127.0.0.1:5432"),
        ("example.com", "example.com"),
        # HTTP prefix is stripped
        ("http://localhost:3306", "localhost:3306"),
        ("http://example.com:8080", "example.com:8080"),
        # HTTPS prefix is stripped
        ("https://localhost:5432", "localhost:5432"),
        ("https://mydb.example.com:3306", "mydb.example.com:3306"),
        # Trailing slash is stripped
        ("http://localhost:3306/", "localhost:3306"),
        # Host only with scheme
        ("http://localhost", "localhost"),
        ("https://example.com", "example.com"),
        # Path, query and fragment are discarded
        ("http://localhost:3306/db", "localhost:3306"),
        ("https://example.com:5432/mydb?ssl=true", "example.com:5432"),
        # Surrounding whitespace is stripped
        (" localhost:3306 ", "localhost:3306"),
        (" http://localhost:3306 ", "localhost:3306"),
        # JDBC-style URLs fall back to raw extraction
        ("jdbc:postgresql://host:5432", "host:5432"),
        ("jdbc:postgresql://host:5432/db", "host:5432"),
        ("jdbc:postgresql://host:5432?ssl=true", "host:5432"),
        ("jdbc:postgresql://host:5432/db?ssl=true#ref", "host:5432"),
        # IPv6 addresses keep their brackets
        ("http://[::1]:3306", "[::1]:3306"),
        ("https://[::1]:5432", "[::1]:5432"),
        ("http://[::1]", "[::1]"),
        ("http://[2001:db8::1]:3306", "[2001:db8::1]:3306"),
        # Plain IPv6 without scheme passes through unchanged
        ("[::1]:3306", "[::1]:3306"),
        # JDBC with userinfo: credentials are stripped
        ("jdbc:postgresql://user:pass@host:5432/db", "host:5432"),
    ]
    for raw_value, expected in cases:
        self.assertEqual(clean_host_port(raw_value), expected)

    # A non-numeric port must be rejected rather than passed through
    with self.assertRaises(ValueError):
        clean_host_port("http://localhost:abc")
203+
121204
@patch("metadata.utils.db_utils.ConnectionTypeDialectMapper")
122205
@patch("metadata.utils.db_utils.fqn")
123206
def test_get_view_lineage_success_with_lineage_parser(

0 commit comments

Comments
 (0)