fix(datalake): _resolve_col_type uses frequency-first majority vote (open-metadata#28093)

edg956 · claude · web-flow · commit 89c6d218f0c5 · 2026-05-13T16:01:19.000Z
A single date-parseable token (e.g. the surname "May") was enough to
flip an entire string column to DATETIME because _TYPE_PRECEDENCE puts
datetime64[ns] above str. The fix counts occurrences of each inferred
type in the sample and picks the most frequent one, breaking ties with
_TYPE_PRECEDENCE. A column with hundreds of plain strings and a handful
of month-name values now correctly resolves to STRING.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/ingestion/src/metadata/utils/datalake/datalake_utils.py b/ingestion/src/metadata/utils/datalake/datalake_utils.py
@@ -18,6 +18,7 @@
 import json
 import random
 import traceback
+from collections import Counter
 from typing import Any, Dict, List, Optional, Union, cast  # noqa: UP035
 
 from metadata.generated.schema.entity.data.table import Column, DataType
@@ -53,12 +54,22 @@
 
 
 def _resolve_col_type(type_list: List[str]) -> str:  # noqa: UP006
-    """Pick the dominant type from type_list using _TYPE_PRECEDENCE instead of lexicographic max()."""
-    type_set = set(type_list)
+    """Pick the dominant type from type_list.
+
+    Frequency-first: the most common type in the sample wins.
+    Ties are broken by _TYPE_PRECEDENCE order.
+    This prevents a small number of date-parseable tokens (e.g. the surname "May")
+    from overriding a column that is overwhelmingly strings.
+    """
+    if not type_list:
+        return "str"
+    counts = Counter(type_list)
+    max_count = max(counts.values())
+    top_types = {t for t, c in counts.items() if c == max_count}
     for t in _TYPE_PRECEDENCE:
-        if t in type_set:
+        if t in top_types:
             return t
-    return type_list[0] if type_list else "str"
+    return type_list[0]
 
 
 class _ArrayOfStruct:
diff --git a/ingestion/tests/unit/utils/test_datalake.py b/ingestion/tests/unit/utils/test_datalake.py
@@ -237,6 +237,33 @@ def test_create_column_object(self):
         column_obj = Column(**column)
         assert column_obj.children is not None and len(column_obj.children) == 3
 
+    def test_fetch_col_types_majority_wins(self):
+        """Majority type wins; a handful of date-parseable tokens must not flip a string column."""
+        cases = [
+            # Overwhelmingly strings with a few month-name values — must stay STRING.
+            # This is the dvdrental last_name bug: "May" parses as a date via dateutil
+            # but the column is a string column.
+            (
+                "last_name_with_month_surnames",
+                ["Smith", "Gonzalez", "Brown", "May", "Jones", "Williams", "Davis"],
+                DataType.STRING,
+            ),
+            # Minority of ambiguous month tokens mixed in a long list of plain strings.
+            ("mostly_strings_few_month_tokens", ["foo", "bar", "baz", "May", "qux", "quux", "March"], DataType.STRING),
+            # All values are unambiguous ISO dates — must be DATETIME.
+            ("pure_iso_dates", ["2024-01-01", "2024-06-15", "2025-03-20"], DataType.DATETIME),
+            # Natural-language date phrases — all parse as dates — must be DATETIME.
+            ("natural_language_dates", ["May 2025", "June 2026", "March 2024", "January 2023"], DataType.DATETIME),
+            # Pure strings, no date-parseable values at all.
+            ("pure_strings", ["hello", "world", "foo", "bar"], DataType.STRING),
+            # All plain integers stored as strings — must be INT.
+            ("integer_strings", ["1", "2", "3", "42"], DataType.INT),
+        ]
+        for name, values, expected in cases:
+            with self.subTest(name):
+                df = pd.DataFrame({"col": values})
+                self.assertEqual(GenericDataFrameColumnParser.fetch_col_types(df, "col"), expected)
+
 
 class TestParquetDataFrameColumnParser(TestCase):
     """Test parquet dataframe column parser"""