Skip to content

Commit d813bd6

Browse files
fix(ingestion): protect all connectors from scheme-prefixed hostPort by stripping in model_post_init
Addresses gitar-bot review feedback on PR open-metadata#27191: moving strip_hostport_scheme to get_connection_url_common left several connectors that parse hostPort manually still vulnerable to the original crash (open-metadata#24348): - cassandra/connection.py: host, port = connection.hostPort.split(':') - db2/connection.py: connection.hostPort.split(':')[0] - databricks/auth.py: connection.hostPort.split(':')[0] - redshift/connection.py: connection.hostPort.split(':')[0] - microsoftfabric/connection.py: connection.hostPort.split(':')[0] - builders.py (IAM auth): connection.hostPort.split(':') Restore strip_hostport_scheme call to model_post_init so scheme stripping happens at construction time for ALL *Connection classes, protecting every downstream split site automatically. Exclude AirflowConnection and OpenMetadataConnection via _HOSTPORT_URL_ALLOWED since their hostPort field legitimately stores a full URL (e.g. http://localhost:8080 and http://localhost:8585/api). The call in get_connection_url_common is kept for defence-in-depth; it is idempotent because strip_hostport_scheme short-circuits when '://' is absent. Add three new unit tests to test_connection_builders.py covering: - URL scheme stripping at model construction - ValueError raised for invalid port at construction time - URL-allowed connections are not stripped
1 parent 96428c1 commit d813bd6

2 files changed

Lines changed: 94 additions & 5 deletions

File tree

ingestion/src/metadata/ingestion/models/custom_pydantic.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,13 @@ def strip_hostport_scheme(raw: str) -> str:
122122
# pinned to the original underscored symbol while the helper was private.
123123
_strip_hostport_scheme = strip_hostport_scheme
124124

125+
# Connection classes whose hostPort field legitimately stores a full URL
126+
# (e.g. "http://airflow:8080" or "http://localhost:8585/api") and must NOT
127+
# have their scheme stripped by model_post_init.
128+
_HOSTPORT_URL_ALLOWED: frozenset = frozenset(
129+
{"AirflowConnection", "OpenMetadataConnection"}
130+
)
131+
125132

126133
class BaseModel(PydanticBaseModel):
127134
"""
@@ -134,14 +141,31 @@ def model_post_init(self, context: Any, /):
134141
This function is used to parse the FilterPattern fields for the Connection classes.
135142
This is needed because dict is defined in the JSON schema for the FilterPattern field,
136143
but a FilterPattern object is required in the generated code.
144+
145+
Additionally, for Connection classes that store a plain ``hostname[:port]``
146+
in ``hostPort``, any accidental URL scheme prefix (e.g. ``http://``) is
147+
stripped here so that all downstream connector code — including connectors
148+
that call ``connection.hostPort.split(":")`` directly — receives a clean value.
137149
"""
138150
# pylint: disable=import-outside-toplevel
151+
if not self.__class__.__name__.endswith("Connection"):
152+
# Only process Connection classes
153+
return
154+
if not hasattr(self, "__pydantic_fields__"):
155+
return
156+
157+
# Strip accidental URL schemes from hostPort at model construction time.
158+
# This protects connectors that split hostPort manually (e.g. Cassandra,
159+
# Databricks, Redshift, DB2, Microsoft Fabric) without going through
160+
# get_connection_url_common. ValueError for genuinely invalid inputs
161+
# (e.g. non-numeric port) propagates immediately with a clear message.
162+
if (
163+
hasattr(self, "hostPort")
164+
and self.__class__.__name__ not in _HOSTPORT_URL_ALLOWED
165+
):
166+
self.hostPort = strip_hostport_scheme(self.hostPort)
167+
139168
try:
140-
if not self.__class__.__name__.endswith("Connection"):
141-
# Only parse FilterPattern for Connection classes
142-
return
143-
if not hasattr(self, "__pydantic_fields__"):
144-
return
145169
for field in self.__pydantic_fields__:
146170
if field.endswith("FilterPattern"):
147171
from metadata.generated.schema.type.filterPattern import (

ingestion/tests/unit/test_connection_builders.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,15 @@
1919
from metadata.generated.schema.entity.services.connections.database.mysqlConnection import (
2020
MysqlConnection,
2121
)
22+
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
23+
OpenMetadataConnection,
24+
)
2225
from metadata.ingestion.connections.builders import (
2326
get_connection_args_common,
2427
get_connection_options_dict,
2528
init_empty_connection_arguments,
2629
)
30+
from metadata.ingestion.models.custom_pydantic import _HOSTPORT_URL_ALLOWED
2731

2832

2933
class ConnectionBuilderTest(TestCase):
@@ -79,3 +83,64 @@ def test_init_empty_connection_arguments(self):
7983

8084
self.assertEqual(new_args.root.get("hello"), "world")
8185
self.assertIsNone(new_args.root.get("not there"))
86+
87+
def test_model_post_init_strips_url_scheme_from_hostport(self):
88+
"""
89+
Verify that model_post_init strips URL schemes from hostPort at
90+
construction time so connectors that call hostPort.split(":") directly
91+
receive a clean hostname[:port] value.
92+
"""
93+
# http:// prefix should be stripped
94+
conn = MysqlConnection(
95+
username="user",
96+
authType=BasicAuth(password="pass"),
97+
hostPort="http://localhost:3306",
98+
)
99+
self.assertEqual(conn.hostPort, "localhost:3306")
100+
101+
# https:// prefix should be stripped
102+
conn2 = MysqlConnection(
103+
username="user",
104+
authType=BasicAuth(password="pass"),
105+
hostPort="https://myhost:5432",
106+
)
107+
self.assertEqual(conn2.hostPort, "myhost:5432")
108+
109+
# Already clean value should pass through unchanged
110+
conn3 = MysqlConnection(
111+
username="user",
112+
authType=BasicAuth(password="pass"),
113+
hostPort="localhost:3306",
114+
)
115+
self.assertEqual(conn3.hostPort, "localhost:3306")
116+
117+
def test_model_post_init_raises_for_invalid_port_in_hostport(self):
118+
"""
119+
Verify that a non-numeric port in a scheme-prefixed hostPort raises
120+
at model construction time (fail-fast) rather than producing a
121+
confusing error deep in connector code.
122+
"""
123+
with self.assertRaises(Exception):
124+
MysqlConnection(
125+
username="user",
126+
authType=BasicAuth(password="pass"),
127+
hostPort="http://localhost:abc",
128+
)
129+
130+
def test_url_allowed_connections_not_stripped(self):
131+
"""
132+
Verify that connection classes in _HOSTPORT_URL_ALLOWED (AirflowConnection
133+
and OpenMetadataConnection) are excluded from hostPort scheme stripping,
134+
since their hostPort field legitimately stores a full URL.
135+
"""
136+
# Confirm the exclusion set contains the expected class names
137+
self.assertIn("AirflowConnection", _HOSTPORT_URL_ALLOWED)
138+
self.assertIn("OpenMetadataConnection", _HOSTPORT_URL_ALLOWED)
139+
140+
# OpenMetadataConnection.hostPort is a plain str field that expects a
141+
# full URL like "http://localhost:8585/api" — it must NOT be stripped.
142+
ometa = OpenMetadataConnection(
143+
hostPort="http://localhost:8585/api",
144+
)
145+
self.assertIn("http", ometa.hostPort)
146+
self.assertIn("localhost", ometa.hostPort)

0 commit comments

Comments
 (0)