Skip to content

Commit d96a0b7

Browse files
Fix #22911 [1]: add support for more column datatypes (#26979)
* FIx dataype warnings * Update ingestion/tests/unit/topology/database/test_starrocks.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * checkstyle * checkstyle * Update ingestion/src/metadata/ingestion/source/database/vertica/metadata.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * address comments --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 06535e9 commit d96a0b7

12 files changed

Lines changed: 303 additions & 5 deletions

File tree

ingestion/src/metadata/ingestion/source/database/azuresql/metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343

4444
logger = ingestion_logger()
4545

46-
ischema_names = update_mssql_ischema_names(ischema_names)
46+
update_mssql_ischema_names(ischema_names)
4747

4848
MSDialect.get_table_comment = get_table_comment
4949
MSDialect.get_view_definition = get_view_definition

ingestion/src/metadata/ingestion/source/database/clickhouse/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,13 @@
6262
"UInt8": SMALLINT,
6363
"IPv4": create_sqlalchemy_type("IPv4"),
6464
"IPv6": create_sqlalchemy_type("IPv6"),
65+
# ClickHouse geo types (v21+)
66+
"Point": create_sqlalchemy_type("Point"),
67+
"Ring": create_sqlalchemy_type("Ring"),
68+
"Polygon": create_sqlalchemy_type("Polygon"),
69+
"MultiPolygon": create_sqlalchemy_type("MultiPolygon"),
70+
"LineString": create_sqlalchemy_type("LineString"),
71+
"MultiLineString": create_sqlalchemy_type("MultiLineString"),
6572
}
6673
)
6774

ingestion/src/metadata/ingestion/source/database/common_pg_mappings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
"pg_snapshot": create_sqlalchemy_type("PG_SNAPSHOT"),
4444
"tsquery": create_sqlalchemy_type("TSQUERY"),
4545
"txid_snapshot": create_sqlalchemy_type("TXID_SNAPSHOT"),
46+
"tid": SqlAlchemyString,
4647
"xid": SqlAlchemyString,
4748
"xml": create_sqlalchemy_type("XML"),
4849
# PostgreSQL range types (used by TimescaleDB for chunk boundaries)

ingestion/src/metadata/ingestion/source/database/mssql/metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
# Avoid using these data types in new development work, and plan to modify applications that currently use them.
7777
# Use nvarchar(max), varchar(max), and varbinary(max) instead.
7878
# ref: https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql?view=sql-server-ver16
79-
ischema_names = update_mssql_ischema_names(ischema_names)
79+
update_mssql_ischema_names(ischema_names)
8080

8181
MSDialect.get_table_comment = get_table_comment
8282
MSDialect.get_view_definition = get_view_definition

ingestion/src/metadata/ingestion/source/database/starrocks/metadata.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,12 @@ def _get_sqlalchemy_type(type_str):
9393
type_mapping = {
9494
"VARCHAR": sqltypes.VARCHAR,
9595
"CHAR": sqltypes.CHAR,
96+
"TINYINT": sqltypes.SMALLINT,
97+
"SMALLINT": sqltypes.SMALLINT,
9698
"INT": sqltypes.INT,
99+
"INTEGER": sqltypes.INTEGER,
97100
"BIGINT": sqltypes.BIGINT,
101+
"LARGEINT": sqltypes.BIGINT,
98102
"FLOAT": sqltypes.FLOAT,
99103
"DOUBLE": sqltypes.FLOAT,
100104
"DECIMAL": sqltypes.DECIMAL,
@@ -103,11 +107,17 @@ def _get_sqlalchemy_type(type_str):
103107
"TIMESTAMP": sqltypes.TIMESTAMP,
104108
"BOOLEAN": sqltypes.BOOLEAN,
105109
"ARRAY": sqltypes.ARRAY,
110+
"MAP": sqltypes.TEXT,
111+
"STRUCT": sqltypes.TEXT,
106112
"JSON": sqltypes.JSON,
107113
"STRING": sqltypes.TEXT,
108114
"BINARY": sqltypes.BINARY,
109115
"VARBINARY": sqltypes.VARBINARY,
110116
"TEXT": sqltypes.TEXT,
117+
# StarRocks specialised analytics types — no SQL equivalent; store as TEXT
118+
"BITMAP": sqltypes.TEXT,
119+
"HLL": sqltypes.TEXT,
120+
"PERCENTILE": sqltypes.TEXT,
111121
"UNKNOWN": sqltypes.NullType,
112122
}
113123

ingestion/src/metadata/ingestion/source/database/vertica/metadata.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,18 @@
5656
{
5757
"UUID": create_sqlalchemy_type("UUID"),
5858
"GEOGRAPHY": create_sqlalchemy_type("GEOGRAPHY"),
59+
"GEOMETRY": create_sqlalchemy_type("GEOMETRY"),
60+
# Binary types
61+
"BINARY": sqltypes.LargeBinary,
62+
"VARBINARY": sqltypes.LargeBinary,
63+
"LONG VARBINARY": sqltypes.LargeBinary,
64+
# Long string
65+
"LONG VARCHAR": sqltypes.Text,
66+
# Complex / semi-structured types (Vertica v11+)
67+
"ARRAY": create_sqlalchemy_type("ARRAY"),
68+
"NATIVE ARRAY": create_sqlalchemy_type("ARRAY"),
69+
"ROW": create_sqlalchemy_type("ROW"),
70+
"SET": create_sqlalchemy_type("SET"),
5971
}
6072
)
6173

ingestion/src/metadata/utils/sqa_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,8 +256,8 @@ def is_array(kwargs: Dict) -> bool:
256256
return False
257257

258258

259-
def update_mssql_ischema_names(ischema_names):
260-
return ischema_names.update(
259+
def update_mssql_ischema_names(ischema_names: dict) -> None:
260+
ischema_names.update(
261261
{
262262
"nvarchar": create_sqlalchemy_type("NVARCHAR"),
263263
"nchar": create_sqlalchemy_type("NCHAR"),

ingestion/tests/unit/topology/database/test_clickhouse_utils.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,69 @@ def test_nullable_unwraps_to_inner_type(self):
6868
def test_unknown_type_returns_null_type(self):
6969
result = self.dialect._get_column_type("col", "SomeUnknownType")
7070
assert result is sqltypes.NullType
71+
72+
73+
class TestClickhouseGeoTypes:
74+
"""Verify that ClickHouse geo types are registered in ischema_names
75+
and resolved correctly by _get_column_type."""
76+
77+
def setup_method(self):
78+
self.dialect = MockDialect()
79+
80+
# --- Registration checks ---
81+
82+
def test_geo_types_registered_in_ischema_names(self):
83+
for geo_type in (
84+
"Point",
85+
"Ring",
86+
"Polygon",
87+
"MultiPolygon",
88+
"LineString",
89+
"MultiLineString",
90+
):
91+
assert (
92+
geo_type in ch_ischema_names
93+
), f"{geo_type} not found in ischema_names"
94+
95+
# --- Resolution via _get_column_type ---
96+
97+
def test_point_type_resolves(self):
98+
result = self.dialect._get_column_type("col", "Point")
99+
assert result == ch_ischema_names["Point"]
100+
101+
def test_ring_type_resolves(self):
102+
result = self.dialect._get_column_type("col", "Ring")
103+
assert result == ch_ischema_names["Ring"]
104+
105+
def test_polygon_type_resolves(self):
106+
result = self.dialect._get_column_type("col", "Polygon")
107+
assert result == ch_ischema_names["Polygon"]
108+
109+
def test_multipolygon_type_resolves(self):
110+
result = self.dialect._get_column_type("col", "MultiPolygon")
111+
assert result == ch_ischema_names["MultiPolygon"]
112+
113+
def test_linestring_type_resolves(self):
114+
result = self.dialect._get_column_type("col", "LineString")
115+
assert result == ch_ischema_names["LineString"]
116+
117+
def test_multilinestring_type_resolves(self):
118+
result = self.dialect._get_column_type("col", "MultiLineString")
119+
assert result == ch_ischema_names["MultiLineString"]
120+
121+
def test_geo_types_are_distinct(self):
122+
"""Each geo type should resolve to a different object."""
123+
types = {
124+
name: ch_ischema_names[name]
125+
for name in (
126+
"Point",
127+
"Ring",
128+
"Polygon",
129+
"MultiPolygon",
130+
"LineString",
131+
"MultiLineString",
132+
)
133+
}
134+
# All values should be distinct from NullType
135+
for name, t in types.items():
136+
assert t is not sqltypes.NullType, f"{name} resolved to NullType"

ingestion/tests/unit/topology/database/test_mssql.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
MSSQL_GET_DATABASE,
5656
MSSQL_TEST_GET_QUERIES,
5757
)
58+
from metadata.utils.sqa_utils import update_mssql_ischema_names
5859

5960
mock_mssql_config = {
6061
"source": {
@@ -338,6 +339,55 @@ def test_get_stored_procedures(self):
338339
self.assertEqual(len(results), 1)
339340
self.assertEqual(results[0].name, "sp_include")
340341

342+
343+
class TestUpdateMssqlIschemaNames:
344+
"""Verify update_mssql_ischema_names mutates the dict in-place and returns None."""
345+
346+
EXPECTED_MSSQL_TYPES = [
347+
"nvarchar",
348+
"nchar",
349+
"ntext",
350+
"bit",
351+
"image",
352+
"binary",
353+
"smallmoney",
354+
"money",
355+
"real",
356+
"smalldatetime",
357+
"datetime2",
358+
"datetimeoffset",
359+
"sql_variant",
360+
"uniqueidentifier",
361+
"xml",
362+
"hierarchyid",
363+
"geography",
364+
"geometry",
365+
]
366+
367+
def test_returns_none(self):
368+
result = update_mssql_ischema_names({})
369+
assert result is None
370+
371+
def test_mutates_dict_in_place(self):
372+
target = {}
373+
update_mssql_ischema_names(target)
374+
for type_key in self.EXPECTED_MSSQL_TYPES:
375+
assert (
376+
type_key in target
377+
), f"'{type_key}' was not added by update_mssql_ischema_names"
378+
379+
def test_all_added_types_are_not_none(self):
380+
target = {}
381+
update_mssql_ischema_names(target)
382+
for type_key in self.EXPECTED_MSSQL_TYPES:
383+
assert target[type_key] is not None, f"'{type_key}' was mapped to None"
384+
385+
def test_does_not_overwrite_existing_entries(self):
386+
sentinel = object()
387+
target = {"existing_key": sentinel}
388+
update_mssql_ischema_names(target)
389+
assert target["existing_key"] is sentinel
390+
341391
@patch(
342392
"metadata.ingestion.source.database.mssql.connection.test_connection_db_common"
343393
)

ingestion/tests/unit/topology/database/test_postgres.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -923,3 +923,30 @@ def test_mark_deleted_databases_with_multiple_databases(self):
923923
self.assertEqual(
924924
call_args[1]["entity_source_state"], expected_source_state
925925
)
926+
927+
928+
class TestPostgresCommonMappings(TestCase):
929+
"""Verify extended type entries in the shared PostgreSQL ischema_names map."""
930+
931+
def test_tid_type_registered(self):
932+
"""'tid' must be present in the PostgreSQL ischema_names after common_pg_mappings is loaded."""
933+
# common_pg_mappings registers types as a side-effect of module import
934+
from sqlalchemy.dialects.postgresql.base import (
935+
ischema_names as pg_ischema_names,
936+
)
937+
938+
import metadata.ingestion.source.database.common_pg_mappings # noqa: F401
939+
940+
self.assertIn("tid", pg_ischema_names)
941+
942+
def test_tid_maps_to_string(self):
943+
"""'tid' must map to a String-compatible SQLAlchemy type."""
944+
from sqlalchemy import String as SqlAlchemyString
945+
from sqlalchemy.dialects.postgresql.base import (
946+
ischema_names as pg_ischema_names,
947+
)
948+
949+
import metadata.ingestion.source.database.common_pg_mappings # noqa: F401
950+
951+
tid_type = pg_ischema_names["tid"]
952+
self.assertIs(tid_type, SqlAlchemyString)

0 commit comments

Comments
 (0)