Skip to content

Commit 8ed98aa

Browse files
Fixes #24348: Strip URL scheme from hostPort at model construction
Centralise hostPort sanitisation in BaseModel.model_post_init() so that every *Connection class automatically strips accidental http(s):// or other URL-scheme prefixes the moment the model is constructed. This replaces the previous approach of patching individual connector connection.py files, covers all 38+ connectors at once, and removes the need for per-connector call-site changes. clean_host_port() is kept in db_utils.py as a public utility and is also used inside get_host_from_host_port() as defence-in-depth.
1 parent 219490a commit 8ed98aa

4 files changed

Lines changed: 145 additions & 6 deletions

File tree

ingestion/src/metadata/ingestion/models/custom_pydantic.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -42,17 +42,24 @@ class BaseModel(PydanticBaseModel):
4242

4343
def model_post_init(self, context: Any, /):
4444
"""
45-
This function is used to parse the FilterPattern fields for the Connection classes.
46-
This is needed because dict is defined in the JSON schema for the FilterPattern field,
47-
but a FilterPattern object is required in the generated code.
45+
Post-init hook for Connection classes:
46+
- Sanitises ``hostPort`` by stripping accidental URL scheme prefixes.
47+
- Converts raw ``dict`` values into ``FilterPattern`` objects.
4848
"""
4949
# pylint: disable=import-outside-toplevel
5050
try:
5151
if not self.__class__.__name__.endswith("Connection"):
52-
# Only parse FilterPattern for Connection classes
5352
return
5453
if not hasattr(self, "__pydantic_fields__"):
5554
return
55+
56+
if "hostPort" in self.__pydantic_fields__:
57+
raw = getattr(self, "hostPort", None)
58+
if raw and "://" in raw:
59+
from metadata.utils.db_utils import clean_host_port
60+
61+
object.__setattr__(self, "hostPort", clean_host_port(raw))
62+
5663
for field in self.__pydantic_fields__:
5764
if field.endswith("FilterPattern"):
5865
from metadata.generated.schema.type.filterPattern import (

ingestion/src/metadata/utils/db_utils.py

Lines changed: 43 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,11 @@
1212
"""
1313
Helpers module for db sources
1414
"""
15+
1516
import time
1617
import traceback
1718
from typing import Iterable, List, Union
19+
from urllib.parse import urlparse
1820

1921
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
2022
from metadata.generated.schema.entity.data.table import Table
@@ -43,12 +45,52 @@
4345
PUBLIC_SCHEMA = "public"
4446

4547

48+
def clean_host_port(host_port: str) -> str:
    """
    Strip URL scheme prefixes from a hostPort string.

    Users sometimes enter a full URL (e.g. 'http://localhost:3306')
    instead of just 'localhost:3306'. This strips the scheme to avoid
    ValueError when parsing host and port.

    Anything that follows the authority part of the URL (path, query
    string, fragment) is discarded, and so is a leading
    ``user:password@`` section. IPv6 literals keep their square brackets
    so that the result can still be read as ``host[:port]``.

    Args:
        host_port: the value as entered by the user, with or without a
            URL scheme. Surrounding whitespace is ignored.

    Returns:
        The value reduced to ``hostname`` or ``hostname:port``. Values
        without ``://`` are returned as-is, minus surrounding whitespace
        and trailing slashes.

    Raises:
        ValueError: if the value contains a scheme but the remainder
            cannot be parsed (e.g. a non-numeric or out-of-range port,
            or unbalanced IPv6 brackets).
    """
    host_port = host_port.strip()
    if "://" not in host_port:
        return host_port.rstrip("/")

    logger.warning(
        "The hostPort '%s' contains a URL scheme. "
        "Expected format is 'hostname[:port]' (e.g. 'localhost:3306'). "
        "Stripping the scheme prefix.",
        host_port,
    )
    try:
        # urlparse() itself rejects malformed IPv6 brackets, and the
        # ``port`` property rejects non-numeric / out-of-range ports.
        # Both raise ValueError, so both get the same friendly message.
        parsed = urlparse(host_port)
        port = parsed.port
    except ValueError as exc:
        raise ValueError(
            f"Invalid hostPort '{host_port}'. Expected format is "
            "'hostname[:port]' (e.g. 'localhost:3306')."
        ) from exc

    hostname = parsed.hostname or ""
    if not hostname:
        # urlparse couldn't extract hostname (e.g. jdbc:postgresql://host:5432)
        # Fall back to stripping everything before the last ://
        raw = host_port.rsplit("://", 1)[-1]
        raw = raw.split("/", 1)[0]
        raw = raw.split("?", 1)[0]
        raw = raw.split("#", 1)[0]
        # Drop any 'user:password@' prefix, as the urlparse path does.
        return raw.rsplit("@", 1)[-1]

    if ":" in hostname:
        # ``parsed.hostname`` drops the brackets of an IPv6 literal; put
        # them back so the address is not confused with the port separator.
        hostname = f"[{hostname}]"

    # Compare against None so that an explicit port 0 is not dropped.
    return hostname if port is None else f"{hostname}:{port}"
86+
87+
4688
def get_host_from_host_port(uri: str) -> str:
    """
    Return only the host part of a ``host[:port]`` value.

    if uri is like "localhost:9000"
    then return the host "localhost"

    The value is first passed through ``clean_host_port`` so that an
    accidental URL scheme (e.g. "http://localhost:9000") does not end up
    being returned as the host. A bracketed IPv6 literal such as
    "[::1]:9000" yields the address without brackets ("::1").
    """
    cleaned = clean_host_port(uri)
    if cleaned.startswith("["):
        # Bracketed IPv6 literal: the colons inside the brackets belong to
        # the address, so splitting on ":" would return just "[".
        closing = cleaned.find("]")
        if closing != -1:
            return cleaned[1:closing]
    return cleaned.split(":")[0]
5294

5395

5496
# pylint: disable=too-many-locals

ingestion/tests/unit/test_build_connection_url.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,34 @@ def test_get_connection_url_mysql(self):
9797
"mysql+pymysql://openmetadata_user:mocked_token@localhost:3306/openmetadata_db",
9898
)
9999

100+
def test_get_connection_url_mysql_with_url_scheme(self):
    """A MySQL hostPort entered with an http:// prefix still yields a clean URL"""
    expected_url = "mysql+pymysql://openmetadata_user:openmetadata_password@localhost:3306/openmetadata_db"
    # hostPort deliberately carries a scheme the user should not have typed
    config = MysqlConnectionConfig(
        username="openmetadata_user",
        authType=BasicAuth(password="openmetadata_password"),
        hostPort="http://localhost:3306",
        databaseSchema="openmetadata_db",
    )
    engine = MySQLConnection(config).client
    rendered_url = engine.url.render_as_string(hide_password=False)
    self.assertEqual(rendered_url, expected_url)
113+
114+
def test_get_connection_url_postgres_with_url_scheme(self):
    """A Postgres hostPort entered with an https:// prefix still yields a clean URL"""
    expected_url = "postgresql+psycopg2://openmetadata_user:openmetadata_password@localhost:5432/openmetadata_db"
    # hostPort deliberately carries a scheme the user should not have typed
    config = PostgresConnectionConfig(
        username="openmetadata_user",
        authType=BasicAuth(password="openmetadata_password"),
        hostPort="https://localhost:5432",
        database="openmetadata_db",
    )
    engine = PostgresConnection(config).client
    rendered_url = engine.url.render_as_string(hide_password=False)
    self.assertEqual(rendered_url, expected_url)
127+
100128
def test_get_connection_url_postgres(self):
101129
connection = PostgresConnectionConfig(
102130
username="openmetadata_user",

ingestion/tests/unit/test_db_utils.py

Lines changed: 63 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
"""
1313
Unit tests for db_utils module
1414
"""
15+
1516
import uuid
1617
from copy import deepcopy
1718
from unittest import TestCase
@@ -36,7 +37,11 @@
3637
from metadata.ingestion.lineage.models import Dialect
3738
from metadata.ingestion.lineage.sql_lineage import search_cache
3839
from metadata.ingestion.source.models import TableView
39-
from metadata.utils.db_utils import get_host_from_host_port, get_view_lineage
40+
from metadata.utils.db_utils import (
41+
clean_host_port,
42+
get_host_from_host_port,
43+
get_view_lineage,
44+
)
4045

4146

4247
# Mock LineageTable class to simulate collate_sqllineage.core.models.Table
@@ -118,6 +123,63 @@ def test_get_host_from_host_port(self):
118123
self.assertEqual(get_host_from_host_port("localhost"), "localhost")
119124
self.assertEqual(get_host_from_host_port("example.com"), "example.com")
120125

126+
# Test with URL scheme prefixes
127+
self.assertEqual(get_host_from_host_port("http://localhost:3306"), "localhost")
128+
self.assertEqual(
129+
get_host_from_host_port("https://example.com:5432"), "example.com"
130+
)
131+
self.assertEqual(get_host_from_host_port("http://localhost"), "localhost")
132+
133+
def test_clean_host_port(self):
    """clean_host_port removes URL scheme prefixes and leaves plain values alone"""
    # (input, expected) pairs, checked in order
    expectations = [
        # Values that are already clean come back untouched
        ("localhost:3306", "localhost:3306"),
        ("127.0.0.1:5432", "127.0.0.1:5432"),
        ("example.com", "example.com"),
        # http:// is removed
        ("http://localhost:3306", "localhost:3306"),
        ("http://example.com:8080", "example.com:8080"),
        # https:// is removed
        ("https://localhost:5432", "localhost:5432"),
        ("https://mydb.example.com:3306", "mydb.example.com:3306"),
        # A trailing slash is removed
        ("http://localhost:3306/", "localhost:3306"),
        # Scheme with a bare host and no port
        ("http://localhost", "localhost"),
        ("https://example.com", "example.com"),
        # Path, query and fragment are thrown away
        ("http://localhost:3306/db", "localhost:3306"),
        ("https://example.com:5432/mydb?ssl=true", "example.com:5432"),
        # Surrounding whitespace is ignored
        (" localhost:3306 ", "localhost:3306"),
        (" http://localhost:3306 ", "localhost:3306"),
        # JDBC-style URLs go through the raw-extraction fallback
        ("jdbc:postgresql://host:5432", "host:5432"),
        ("jdbc:postgresql://host:5432/db", "host:5432"),
        ("jdbc:postgresql://host:5432?ssl=true", "host:5432"),
        ("jdbc:postgresql://host:5432/db?ssl=true#ref", "host:5432"),
    ]
    for raw_value, cleaned in expectations:
        self.assertEqual(clean_host_port(raw_value), cleaned)

    # A non-numeric port is rejected with ValueError
    with self.assertRaises(ValueError):
        clean_host_port("http://localhost:abc")
182+
121183
@patch("metadata.utils.db_utils.ConnectionTypeDialectMapper")
122184
@patch("metadata.utils.db_utils.fqn")
123185
def test_get_view_lineage_success_with_lineage_parser(

0 commit comments

Comments
 (0)