Skip to content

Commit 8bc7b40

Browse files
fix(ingestion): keep custom_pydantic dependency-free to avoid circular import
The model_post_init hook added in the previous commit imported `clean_host_port` from `metadata.utils.db_utils` at runtime. That module has a heavy dependency graph (OpenMetadata client, LineageParser, generated schemas), so every *Connection class construction would drag it in and cause circular-import failures during integration test bootstrap. The file docstring explicitly requires BaseModel to be 'self-sufficient with only pydantic at import time', so the hostPort sanitisation is now handled by a stdlib-only helper (_strip_hostport_scheme) colocated with BaseModel. `clean_host_port` remains the public API in db_utils and delegates to the same helper, preserving behaviour for all existing callers and unit tests.
1 parent 31c851f commit 8bc7b40

2 files changed

Lines changed: 62 additions & 45 deletions

File tree

ingestion/src/metadata/ingestion/models/custom_pydantic.py

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import json
1919
import logging
2020
from typing import Any, Callable, Dict, Literal, Optional, Union
21+
from urllib.parse import urlparse
2122

2223
from pydantic import BaseModel as PydanticBaseModel
2324
from pydantic import WrapSerializer, model_validator
@@ -34,6 +35,55 @@
3435
JSON_ENCODERS = "json_encoders"
3536

3637

38+
def _strip_hostport_scheme(raw: str) -> str:
39+
"""
40+
Strip an accidental URL scheme from a hostPort string.
41+
42+
Self-contained helper that depends only on the standard library so
43+
``model_post_init`` never drags heavy ``metadata.*`` imports into the
44+
bootstrap path of every generated Connection class.
45+
46+
Raises ValueError if the scheme carries a non-numeric port so the user
47+
gets a clear error instead of a silently broken hostPort.
48+
"""
49+
value = raw.strip()
50+
if "://" not in value:
51+
return value
52+
53+
parsed = urlparse(value)
54+
hostname = parsed.hostname or ""
55+
safe_label = (
56+
f"{parsed.scheme}://{hostname}"
57+
if parsed.scheme and hostname
58+
else "URL with scheme"
59+
)
60+
logger.warning(
61+
"The hostPort '%s' contains a URL scheme. Expected format is "
62+
"'hostname[:port]' (e.g. 'localhost:3306'). Stripping the scheme prefix.",
63+
safe_label,
64+
)
65+
try:
66+
port = parsed.port
67+
except ValueError as exc:
68+
raise ValueError(
69+
f"Invalid hostPort '{safe_label}'. Expected format is "
70+
"'hostname[:port]' (e.g. 'localhost:3306')."
71+
) from exc
72+
73+
if not hostname:
74+
# urlparse couldn't extract a hostname (e.g. 'jdbc:postgresql://host:5432/db')
75+
# Fall back to stripping scheme and any trailing path/query/fragment/userinfo.
76+
tail = value.rsplit("://", 1)[-1]
77+
for sep in ("/", "?", "#"):
78+
tail = tail.split(sep, 1)[0]
79+
if "@" in tail:
80+
tail = tail.rsplit("@", 1)[-1]
81+
return tail
82+
83+
host = f"[{hostname}]" if ":" in hostname else hostname
84+
return f"{host}:{port}" if port is not None else host
85+
86+
3787
class BaseModel(PydanticBaseModel):
3888
"""
3989
Base model for OpenMetadata generated models.
@@ -55,12 +105,10 @@ def model_post_init(self, context: Any, /):
55105
if "hostPort" in self.__pydantic_fields__:
56106
raw = getattr(self, "hostPort", None)
57107
if isinstance(raw, str) and "://" in raw:
58-
from metadata.utils.db_utils import clean_host_port
59-
60-
# Let ValueError propagate: if clean_host_port cannot parse
61-
# the input (e.g. non-numeric port), the user must fix their
62-
# config rather than silently getting a broken hostPort.
63-
object.__setattr__(self, "hostPort", clean_host_port(raw))
108+
# Let ValueError propagate: if the hostPort cannot be parsed
109+
# (e.g. non-numeric port), the user must fix their config
110+
# rather than silently getting a broken hostPort.
111+
object.__setattr__(self, "hostPort", _strip_hostport_scheme(raw))
64112

65113
try:
66114
for field in self.__pydantic_fields__:

ingestion/src/metadata/utils/db_utils.py

Lines changed: 8 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import time
1717
import traceback
1818
from typing import Iterable, List, Union
19-
from urllib.parse import urlparse
2019

2120
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
2221
from metadata.generated.schema.entity.data.table import Table
@@ -34,6 +33,7 @@
3433
get_lineage_by_query,
3534
get_lineage_via_table_entity,
3635
)
36+
from metadata.ingestion.models.custom_pydantic import _strip_hostport_scheme
3737
from metadata.ingestion.ometa.ometa_api import OpenMetadata
3838
from metadata.ingestion.source.models import TableView
3939
from metadata.utils import fqn
@@ -52,45 +52,14 @@ def clean_host_port(host_port: str) -> str:
5252
Users sometimes enter a full URL (e.g. 'http://localhost:3306')
5353
instead of just 'localhost:3306'. This strips the scheme to avoid
5454
ValueError when parsing host and port.
55+
56+
Delegates to the stdlib-only helper colocated with ``BaseModel`` so the
57+
behaviour stays in lockstep with Pydantic's ``model_post_init`` hook.
5558
"""
56-
host_port = host_port.strip()
57-
if "://" not in host_port:
58-
return host_port.rstrip("/")
59-
60-
parsed = urlparse(host_port)
61-
hostname = parsed.hostname or ""
62-
safe_label = (
63-
f"{parsed.scheme}://{hostname}"
64-
if parsed.scheme and hostname
65-
else "URL with scheme"
66-
)
67-
logger.warning(
68-
"The hostPort '%s' contains a URL scheme. "
69-
"Expected format is 'hostname[:port]' (e.g. 'localhost:3306'). "
70-
"Stripping the scheme prefix.",
71-
safe_label,
72-
)
73-
try:
74-
port = parsed.port
75-
except ValueError as exc:
76-
raise ValueError(
77-
f"Invalid hostPort '{safe_label}'. Expected format is "
78-
"'hostname[:port]' (e.g. 'localhost:3306')."
79-
) from exc
80-
81-
if not hostname:
82-
# urlparse couldn't extract hostname (e.g. jdbc:postgresql://host:5432)
83-
# Fall back to stripping everything before the last ://
84-
raw = host_port.rsplit("://", 1)[-1]
85-
raw = raw.split("/", 1)[0]
86-
raw = raw.split("?", 1)[0]
87-
raw = raw.split("#", 1)[0]
88-
if "@" in raw:
89-
raw = raw.rsplit("@", 1)[-1]
90-
return raw
91-
92-
host = f"[{hostname}]" if ":" in hostname else hostname
93-
return f"{host}:{port}" if port is not None else host
59+
value = host_port.strip()
60+
if "://" not in value:
61+
return value.rstrip("/")
62+
return _strip_hostport_scheme(value)
9463

9564

9665
def get_host_from_host_port(uri: str) -> str:

0 commit comments

Comments
 (0)