Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ingestion/src/metadata/ingestion/connections/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from metadata.ingestion.connections.headers import inject_query_header_by_conn
from metadata.ingestion.connections.query_logger import attach_query_tracker
from metadata.ingestion.connections.secrets import connection_with_options_secrets
from metadata.ingestion.models.custom_pydantic import strip_hostport_scheme
from metadata.utils.constants import BUILDER_PASSWORD_ATTR
from metadata.utils.logger import cli_logger

Expand Down Expand Up @@ -185,7 +186,7 @@
url = _add_password(url, connection)
url += "@"

url += connection.hostPort
url += strip_hostport_scheme(connection.hostPort)

Check warning on line 189 in ingestion/src/metadata/ingestion/connections/builders.py

View workflow job for this annotation

GitHub Actions / Unit Tests & Static Checks (3.10)

Argument type is unknown   Argument corresponds to parameter "raw" in function "strip_hostport_scheme" (reportUnknownArgumentType)

Check warning on line 189 in ingestion/src/metadata/ingestion/connections/builders.py

View workflow job for this annotation

GitHub Actions / Unit Tests & Static Checks (3.10)

Type of "hostPort" is unknown (reportUnknownMemberType)
Comment thread
gitar-bot[bot] marked this conversation as resolved.
if hasattr(connection, "database"):
url += f"/{connection.database}" if connection.database else ""

Expand Down
128 changes: 123 additions & 5 deletions ingestion/src/metadata/ingestion/models/custom_pydantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import json
import logging
from typing import Any, Callable, Dict, Literal, Optional, Union
from urllib.parse import urlparse

from pydantic import BaseModel as PydanticBaseModel
from pydantic import WrapSerializer, model_validator
Expand All @@ -35,6 +36,101 @@
JSON_ENCODERS = "json_encoders"


def strip_hostport_scheme(raw: str) -> str:

Check failure on line 39 in ingestion/src/metadata/ingestion/models/custom_pydantic.py

View check run for this annotation

SonarQubeCloud / [open-metadata-ingestion] SonarCloud Code Analysis

Refactor this function to reduce its Cognitive Complexity from 23 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=open-metadata-ingestion&issues=AZ2q0Xum43vI1cIAROC4&open=AZ2q0Xum43vI1cIAROC4&pullRequest=27191
"""
Strip an accidental URL scheme from a hostPort string.

Self-contained helper that depends only on the standard library so it
can be imported from ``builders.py`` and ``db_utils.py`` without circular
imports.

Raises ValueError if the value cannot be resolved to a valid
``hostname[:port]`` string — e.g. when the port is non-numeric or when
the hostname is empty after stripping the scheme. This applies to both
standard URLs handled by ``urlparse`` and to JDBC-style URLs handled by
the fallback branch (e.g. ``jdbc:postgresql://host:abc/db``).
"""
value = raw.strip()
if "://" not in value:
return value

parsed = urlparse(value)
hostname = parsed.hostname or ""

# Build a sanitised label for log/error messages that identifies the
# problematic input without leaking credentials or a full path.
if parsed.scheme and hostname:
safe_label = f"{parsed.scheme}://{hostname}"
else:
# urlparse couldn't extract a hostname (e.g. jdbc:postgresql://…).
# Construct the label from the raw tail so messages remain actionable.
tail_for_label = value.rsplit("://", 1)[-1].split("/", 1)[0]
if "@" in tail_for_label:
tail_for_label = tail_for_label.rsplit("@", 1)[-1]
safe_label = tail_for_label or value

logger.warning(
"The hostPort '%s' contains a URL scheme. Expected format is "

Check warning on line 73 in ingestion/src/metadata/ingestion/models/custom_pydantic.py

View workflow job for this annotation

GitHub Actions / Unit Tests & Static Checks (3.10)

Implicit string concatenation not allowed (reportImplicitStringConcatenation)
"'hostname[:port]' (e.g. 'localhost:3306'). Stripping the scheme prefix.",
safe_label,
)
try:
port = parsed.port
except ValueError as exc:
raise ValueError(
f"Invalid hostPort '{safe_label}'. Expected format is "

Check warning on line 81 in ingestion/src/metadata/ingestion/models/custom_pydantic.py

View workflow job for this annotation

GitHub Actions / Unit Tests & Static Checks (3.10)

Implicit string concatenation not allowed (reportImplicitStringConcatenation)
"'hostname[:port]' (e.g. 'localhost:3306')."
) from exc

if not hostname:
# urlparse couldn't extract a hostname (e.g. 'jdbc:postgresql://host:5432/db').
# Fall back to stripping the scheme and any trailing path/query/fragment/userinfo.
tail = value.rsplit("://", 1)[-1]
for sep in ("/", "?", "#"):
tail = tail.split(sep, 1)[0]
if "@" in tail:
tail = tail.rsplit("@", 1)[-1]
Copy link

Copilot AI Apr 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the not hostname fallback branch, the function returns the stripped tail without validating the port component. For inputs like jdbc:postgresql://host:abc/db, this will return host:abc and the original int(port) failure can still happen later, which undermines the goal of producing a clearer error for non-numeric ports (and also makes the docstring claim about raising on non-numeric ports incomplete). Consider parsing tail for a host:port pattern and raising a ValueError when the port is present but non-numeric.

Suggested change
tail = tail.rsplit("@", 1)[-1]
tail = tail.rsplit("@", 1)[-1]
fallback_port = None
if tail.startswith("["):
closing_bracket = tail.find("]")
if closing_bracket != -1 and len(tail) > closing_bracket + 1:
if tail[closing_bracket + 1] == ":":
fallback_port = tail[closing_bracket + 2 :]
elif ":" in tail:
_, fallback_port = tail.rsplit(":", 1)
if fallback_port and not fallback_port.isdigit():
raise ValueError(
f"Invalid hostPort '{safe_label}'. Expected format is "
"'hostname[:port]' (e.g. 'localhost:3306')."
)

Copilot uses AI. Check for mistakes.

if not tail:
raise ValueError(
f"Invalid hostPort '{safe_label}'. Expected format is "

Check warning on line 96 in ingestion/src/metadata/ingestion/models/custom_pydantic.py

View workflow job for this annotation

GitHub Actions / Unit Tests & Static Checks (3.10)

Implicit string concatenation not allowed (reportImplicitStringConcatenation)
"'hostname[:port]' (e.g. 'localhost:3306')."
)

# Validate the port in the fallback path so the same ValueError
# contract holds for JDBC-style URLs and other malformed inputs.
fallback_parsed = urlparse(f"//{tail}")
try:
fallback_hostname = fallback_parsed.hostname or ""
_ = fallback_parsed.port # raises ValueError for non-numeric ports
except ValueError as exc:
raise ValueError(
f"Invalid hostPort '{safe_label}'. Expected format is "

Check warning on line 108 in ingestion/src/metadata/ingestion/models/custom_pydantic.py

View workflow job for this annotation

GitHub Actions / Unit Tests & Static Checks (3.10)

Implicit string concatenation not allowed (reportImplicitStringConcatenation)
"'hostname[:port]' (e.g. 'localhost:3306')."
) from exc
if not fallback_hostname:
raise ValueError(
f"Invalid hostPort '{safe_label}'. Expected format is "

Check warning on line 113 in ingestion/src/metadata/ingestion/models/custom_pydantic.py

View workflow job for this annotation

GitHub Actions / Unit Tests & Static Checks (3.10)

Implicit string concatenation not allowed (reportImplicitStringConcatenation)
"'hostname[:port]' (e.g. 'localhost:3306')."
)
Comment on lines +100 to +115
Copy link

Copilot AI Apr 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the hostname-empty fallback branch, malformed inputs like http:// or http:///localhost:3306 will produce an empty tail and be returned as an empty hostPort, which can lead to confusing downstream errors. It would be safer to validate that the extracted tail is non-empty (and ideally contains a hostname) and raise the same ValueError if it can't be parsed into hostname[:port].

Suggested change
# Validate the port in the fallback path so the same ValueError
# contract holds for JDBC-style URLs (e.g. 'jdbc:postgresql://host:abc').
fallback_port: Optional[str] = None
if tail.startswith("["):
closing = tail.find("]")
if closing != -1 and len(tail) > closing + 1 and tail[closing + 1] == ":":
fallback_port = tail[closing + 2 :]
elif ":" in tail:
fallback_port = tail.rsplit(":", 1)[1]
if fallback_port and not fallback_port.isdigit():
raise ValueError(
f"Invalid hostPort '{safe_label}'. Expected format is "
"'hostname[:port]' (e.g. 'localhost:3306')."
)
if not tail:
raise ValueError(
f"Invalid hostPort '{safe_label}'. Expected format is "
"'hostname[:port]' (e.g. 'localhost:3306')."
)
# Validate the fallback path so the same ValueError contract holds for
# JDBC-style URLs and malformed inputs that urlparse could not map to
# a standard hostname.
fallback_parsed = urlparse(f"//{tail}")
try:
fallback_hostname = fallback_parsed.hostname or ""
_ = fallback_parsed.port
except ValueError as exc:
raise ValueError(
f"Invalid hostPort '{safe_label}'. Expected format is "
"'hostname[:port]' (e.g. 'localhost:3306')."
) from exc
if not fallback_hostname:
raise ValueError(
f"Invalid hostPort '{safe_label}'. Expected format is "
"'hostname[:port]' (e.g. 'localhost:3306')."
)

Copilot uses AI. Check for mistakes.
return tail

host = f"[{hostname}]" if ":" in hostname else hostname
return f"{host}:{port}" if port is not None else host


# Backwards-compatible private alias retained for any internal callers that
# pinned to the original underscored symbol while the helper was private.
_strip_hostport_scheme = strip_hostport_scheme

# Module sub-path that identifies database-connector classes.
# Only *Connection classes whose module contains this sub-path have a plain
# ``hostname[:port]`` hostPort — all other connection categories
# (dashboard, pipeline, search, metadata, …) legitimately store a full URL
# in hostPort and must NOT have their scheme stripped.
_DATABASE_CONNECTION_MODULE_MARKER = ".services.connections.database."


class BaseModel(PydanticBaseModel):
"""
Base model for OpenMetadata generated models.
Expand All @@ -46,14 +142,36 @@
This function is used to parse the FilterPattern fields for the Connection classes.
This is needed because dict is defined in the JSON schema for the FilterPattern field,
but a FilterPattern object is required in the generated code.

Additionally, for Connection classes that store a plain ``hostname[:port]``
in ``hostPort``, any accidental URL scheme prefix (e.g. ``http://``) is
stripped here so that all downstream connector code — including connectors
that call ``connection.hostPort.split(":")`` directly — receives a clean value.
"""
# pylint: disable=import-outside-toplevel
if not self.__class__.__name__.endswith("Connection"):
# Only process Connection classes
return
if not hasattr(self, "__pydantic_fields__"):
return

# Strip accidental URL schemes from hostPort at model construction time,
# but ONLY for database-connector classes whose hostPort is a plain
# ``hostname[:port]``. Dashboard, pipeline, search, and other connector
# categories legitimately use a full URL in hostPort (e.g.
# ``http://grafana:3000``), so we restrict stripping to classes whose
# module path contains the database-connection marker.
# The isinstance(str) guard prevents an AttributeError when hostPort is
# optional and left unset (None).
host_port = getattr(self, "hostPort", None)
if (
isinstance(host_port, str)
and "://" in host_port
and _DATABASE_CONNECTION_MODULE_MARKER in (self.__class__.__module__ or "")
):
self.hostPort = strip_hostport_scheme(host_port)

Check warning on line 172 in ingestion/src/metadata/ingestion/models/custom_pydantic.py

View workflow job for this annotation

GitHub Actions / Unit Tests & Static Checks (3.10)

Type annotation for attribute `hostPort` is required because this class is not decorated with `@final` (reportUnannotatedClassAttribute)

try:
if not self.__class__.__name__.endswith("Connection"):
# Only parse FilterPattern for Connection classes
return
if not hasattr(self, "__pydantic_fields__"):
return
for field in self.__pydantic_fields__:
if field.endswith("FilterPattern"):
from metadata.generated.schema.type.filterPattern import (
Expand Down
23 changes: 22 additions & 1 deletion ingestion/src/metadata/utils/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
get_lineage_by_query,
get_lineage_via_table_entity,
)
from metadata.ingestion.models.custom_pydantic import strip_hostport_scheme
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.models import TableView
from metadata.utils import fqn
Expand All @@ -44,12 +45,32 @@
PUBLIC_SCHEMA = "public"


def clean_host_port(host_port: str) -> str:
"""
Strip URL scheme prefixes from a hostPort string.

Users sometimes enter a full URL (e.g. 'http://localhost:3306')
instead of just 'localhost:3306'. This strips the scheme to avoid
ValueError when parsing host and port.

Delegates to the stdlib-only helper colocated with ``BaseModel`` so the
behaviour stays in lockstep with Pydantic's ``model_post_init`` hook.
"""
value = host_port.strip()
if "://" not in value:
return value.rstrip("/")
return strip_hostport_scheme(value)


def get_host_from_host_port(uri: str) -> str:
"""
if uri is like "localhost:9000"
then return the host "localhost"
"""
return uri.split(":")[0]
cleaned = clean_host_port(uri)
if cleaned.startswith("["):
return cleaned.split("]")[0] + "]"
return cleaned.split(":")[0]


# pylint: disable=too-many-locals
Expand Down
28 changes: 28 additions & 0 deletions ingestion/tests/unit/test_build_connection_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,34 @@ def test_get_connection_url_mysql(self):
"mysql+pymysql://openmetadata_user:mocked_token@localhost:3306/openmetadata_db",
)

def test_get_connection_url_mysql_with_url_scheme(self):
"""hostPort with http:// prefix should be cleaned automatically"""
connection = MysqlConnectionConfig(
username="openmetadata_user",
authType=BasicAuth(password="openmetadata_password"),
hostPort="http://localhost:3306",
databaseSchema="openmetadata_db",
)
engine_connection = MySQLConnection(connection).client
self.assertEqual(
engine_connection.url.render_as_string(hide_password=False),
"mysql+pymysql://openmetadata_user:openmetadata_password@localhost:3306/openmetadata_db",
)

def test_get_connection_url_postgres_with_url_scheme(self):
"""hostPort with https:// prefix should be cleaned automatically"""
connection = PostgresConnectionConfig(
username="openmetadata_user",
authType=BasicAuth(password="openmetadata_password"),
hostPort="https://localhost:5432",
database="openmetadata_db",
)
engine_connection = PostgresConnection(connection).client
self.assertEqual(
engine_connection.url.render_as_string(hide_password=False),
"postgresql+psycopg2://openmetadata_user:openmetadata_password@localhost:5432/openmetadata_db",
)

def test_get_connection_url_postgres(self):
connection = PostgresConnectionConfig(
username="openmetadata_user",
Expand Down
82 changes: 82 additions & 0 deletions ingestion/tests/unit/test_connection_builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,21 @@
from metadata.generated.schema.entity.services.connections.database.common.basicAuth import (
BasicAuth,
)
from metadata.generated.schema.entity.services.connections.database.cassandraConnection import (
CassandraConnection,
)
from metadata.generated.schema.entity.services.connections.database.mysqlConnection import (
MysqlConnection,
)
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
OpenMetadataConnection,
)
from metadata.ingestion.connections.builders import (
get_connection_args_common,
get_connection_options_dict,
init_empty_connection_arguments,
)
from metadata.ingestion.models.custom_pydantic import _DATABASE_CONNECTION_MODULE_MARKER


class ConnectionBuilderTest(TestCase):
Expand Down Expand Up @@ -78,3 +85,78 @@ def test_init_empty_connection_arguments(self):

self.assertEqual(new_args.root.get("hello"), "world")
self.assertIsNone(new_args.root.get("not there"))

def test_model_post_init_strips_url_scheme_from_hostport(self):
"""
Verify that model_post_init strips URL schemes from hostPort at
construction time so connectors that call hostPort.split(":") directly
receive a clean hostname[:port] value.
"""
# http:// prefix should be stripped
conn = MysqlConnection(
username="user",
authType=BasicAuth(password="pass"),
hostPort="http://localhost:3306",
)
self.assertEqual(conn.hostPort, "localhost:3306")

# https:// prefix should be stripped
conn2 = MysqlConnection(
username="user",
authType=BasicAuth(password="pass"),
hostPort="https://myhost:5432",
)
self.assertEqual(conn2.hostPort, "myhost:5432")

# Already clean value should pass through unchanged
conn3 = MysqlConnection(
username="user",
authType=BasicAuth(password="pass"),
hostPort="localhost:3306",
)
self.assertEqual(conn3.hostPort, "localhost:3306")

def test_model_post_init_raises_for_invalid_port_in_hostport(self):
"""
Verify that a non-numeric port in a scheme-prefixed hostPort raises
at model construction time (fail-fast) rather than producing a
confusing error deep in connector code.
"""
with self.assertRaises(Exception):
MysqlConnection(
username="user",
authType=BasicAuth(password="pass"),
hostPort="http://localhost:abc",
)

def test_non_database_connections_not_stripped(self):
"""
Verify that non-database connection classes (metadata, dashboard, pipeline, …)
are NOT subject to hostPort scheme stripping because their hostPort legitimately
stores a full URL (e.g. OpenMetadataConnection hostPort = http://localhost:8585/api).

The guard uses the module path: only classes whose __module__ contains
_DATABASE_CONNECTION_MODULE_MARKER ('.services.connections.database.')
are stripped. All others pass through unchanged.
"""
# Confirm the marker is correct
self.assertIn("database", _DATABASE_CONNECTION_MODULE_MARKER)

# OpenMetadataConnection.hostPort is a plain str that expects a full URL.
# Its module contains '.connections.metadata.' — NOT the database marker —
# so it must NOT be stripped.
ometa = OpenMetadataConnection(
hostPort="http://localhost:8585/api",
)
self.assertIn("http", ometa.hostPort)
self.assertIn("localhost", ometa.hostPort)

def test_none_hostport_does_not_crash(self):
"""
Regression test for gitar-bot bug report: constructing a database
connection where hostPort is Optional and left as None must NOT raise
AttributeError from strip_hostport_scheme(None).
"""
# CassandraConnection.hostPort is Optional[str] = None by default.
conn = CassandraConnection()
self.assertIsNone(conn.hostPort)
Loading
Loading