Skip to content

Commit 89c6d21

Browse files
edg956 and claude authored
fix(datalake): _resolve_col_type uses frequency-first majority vote (open-metadata#28093)
A single date-parseable token (e.g. the surname "May") was enough to flip an entire string column to DATETIME because _TYPE_PRECEDENCE puts datetime64[ns] above str.

The fix counts occurrences of each inferred type in the sample and picks the most frequent one, breaking ties with _TYPE_PRECEDENCE. A column with hundreds of plain strings and a handful of month-name values now correctly resolves to STRING.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 4cef6a6 commit 89c6d21

2 files changed

Lines changed: 42 additions & 4 deletions

File tree

ingestion/src/metadata/utils/datalake/datalake_utils.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import json
1919
import random
2020
import traceback
21+
from collections import Counter
2122
from typing import Any, Dict, List, Optional, Union, cast # noqa: UP035
2223

2324
from metadata.generated.schema.entity.data.table import Column, DataType
@@ -53,12 +54,22 @@
5354

5455

5556
def _resolve_col_type(type_list: List[str]) -> str: # noqa: UP006
56-
"""Pick the dominant type from type_list using _TYPE_PRECEDENCE instead of lexicographic max()."""
57-
type_set = set(type_list)
57+
"""Pick the dominant type from type_list.
58+
59+
Frequency-first: the most common type in the sample wins.
60+
Ties are broken by _TYPE_PRECEDENCE order.
61+
This prevents a small number of date-parseable tokens (e.g. the surname "May")
62+
from overriding a column that is overwhelmingly strings.
63+
"""
64+
if not type_list:
65+
return "str"
66+
counts = Counter(type_list)
67+
max_count = max(counts.values())
68+
top_types = {t for t, c in counts.items() if c == max_count}
5869
for t in _TYPE_PRECEDENCE:
59-
if t in type_set:
70+
if t in top_types:
6071
return t
61-
return type_list[0] if type_list else "str"
72+
return type_list[0]
6273

6374

6475
class _ArrayOfStruct:

ingestion/tests/unit/utils/test_datalake.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,33 @@ def test_create_column_object(self):
237237
column_obj = Column(**column)
238238
assert column_obj.children is not None and len(column_obj.children) == 3
239239

240+
def test_fetch_col_types_majority_wins(self):
241+
"""Majority type wins; a handful of date-parseable tokens must not flip a string column."""
242+
cases = [
243+
# Overwhelmingly strings with a few month-name values — must stay STRING.
244+
# This is the dvdrental last_name bug: "May" parses as a date via dateutil
245+
# but the column is a string column.
246+
(
247+
"last_name_with_month_surnames",
248+
["Smith", "Gonzalez", "Brown", "May", "Jones", "Williams", "Davis"],
249+
DataType.STRING,
250+
),
251+
# Minority of ambiguous month tokens mixed in a long list of plain strings.
252+
("mostly_strings_few_month_tokens", ["foo", "bar", "baz", "May", "qux", "quux", "March"], DataType.STRING),
253+
# All values are unambiguous ISO dates — must be DATETIME.
254+
("pure_iso_dates", ["2024-01-01", "2024-06-15", "2025-03-20"], DataType.DATETIME),
255+
# Natural-language date phrases — all parse as dates — must be DATETIME.
256+
("natural_language_dates", ["May 2025", "June 2026", "March 2024", "January 2023"], DataType.DATETIME),
257+
# Pure strings, no date-parseable values at all.
258+
("pure_strings", ["hello", "world", "foo", "bar"], DataType.STRING),
259+
# All plain integers stored as strings — must be INT.
260+
("integer_strings", ["1", "2", "3", "42"], DataType.INT),
261+
]
262+
for name, values, expected in cases:
263+
with self.subTest(name):
264+
df = pd.DataFrame({"col": values})
265+
self.assertEqual(GenericDataFrameColumnParser.fetch_col_types(df, "col"), expected)
266+
240267

241268
class TestParquetDataFrameColumnParser(TestCase):
242269
"""Test parquet dataframe column parser"""

0 commit comments

Comments
 (0)