Skip to content

Commit 8b2d276

Browse files
committed
reimplement _check_schema_compatible
1 parent 3126608 commit 8b2d276

File tree

2 files changed

+31
-7
lines changed

2 files changed

+31
-7
lines changed

pyiceberg/table/__init__.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,9 @@ def _check_schema_compatible(table_schema: Schema, other_schema: "pa.Schema") ->
161161
"""
162162
Check if the `table_schema` is compatible with `other_schema`.
163163
164-
Two schemas are considered compatible when they are equal in terms of the Iceberg Schema type.
164+
The schemas are compatible if:
165+
- All fields in `other_schema` are present in `table_schema`. (other_schema <= table_schema)
166+
- All required fields in `table_schema` are present in `other_schema`.
165167
166168
Raises:
167169
ValueError: If the schemas are not compatible.
@@ -170,15 +172,18 @@ def _check_schema_compatible(table_schema: Schema, other_schema: "pa.Schema") ->
170172

171173
name_mapping = table_schema.name_mapping
172174
try:
173-
task_schema = pyarrow_to_schema(other_schema, name_mapping=name_mapping)
175+
other_schema = pyarrow_to_schema(other_schema, name_mapping=name_mapping)
174176
except ValueError as e:
175177
other_schema = _pyarrow_to_schema_without_ids(other_schema)
176178
additional_names = set(other_schema.column_names) - set(table_schema.column_names)
177179
raise ValueError(
178180
f"PyArrow table contains more columns: {', '.join(sorted(additional_names))}. Update the schema first (hint, use union_by_name)."
179181
) from e
180182

181-
if table_schema.as_struct() != task_schema.as_struct():
183+
missing_table_schema_fields = {field for field in other_schema.fields if field not in table_schema.fields}
184+
required_table_schema_fields = {field for field in table_schema.fields if field.required}
185+
missing_required_fields = {field for field in required_table_schema_fields if field not in other_schema.fields}
186+
if missing_table_schema_fields or missing_required_fields:
182187
from rich.console import Console
183188
from rich.table import Table as RichTable
184189

@@ -191,7 +196,7 @@ def _check_schema_compatible(table_schema: Schema, other_schema: "pa.Schema") ->
191196

192197
for lhs in table_schema.fields:
193198
try:
194-
rhs = task_schema.find_field(lhs.field_id)
199+
rhs = other_schema.find_field(lhs.field_id)
195200
rich_table.add_row("✅" if lhs == rhs else "❌", str(lhs), str(rhs))
196201
except ValueError:
197202
rich_table.add_row("❌", str(lhs), "Missing")
@@ -483,7 +488,7 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
483488
f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}."
484489
)
485490

486-
# _check_schema_compatible(self._table.schema(), other_schema=df.schema)
491+
_check_schema_compatible(self._table.schema(), other_schema=df.schema)
487492

488493
with self.update_snapshot(snapshot_properties=snapshot_properties).fast_append() as update_snapshot:
489494
# skip writing data files if the dataframe is empty
@@ -520,7 +525,7 @@ def overwrite(
520525
if len(self._table.spec().fields) > 0:
521526
raise ValueError("Cannot write to partitioned tables")
522527

523-
# _check_schema_compatible(self._table.schema(), other_schema=df.schema)
528+
_check_schema_compatible(self._table.schema(), other_schema=df.schema)
524529

525530
with self.update_snapshot(snapshot_properties=snapshot_properties).overwrite() as update_snapshot:
526531
# skip writing data files if the dataframe is empty

tests/table/test_init.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1200,6 +1200,25 @@ def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None:
12001200
_check_schema_compatible(table_schema_simple, other_schema)
12011201

12021202

1203+
def test_schema_compatible(table_schema_simple: Schema) -> None:
1204+
try:
1205+
_check_schema_compatible(table_schema_simple, table_schema_simple.as_arrow())
1206+
except Exception:
1207+
pytest.fail("Unexpected Exception raised when calling `_check_schema_compatible`")
1208+
1209+
1210+
def test_schema_projection(table_schema_simple: Schema) -> None:
1211+
# remove optional `baz` field from `table_schema_simple`
1212+
other_schema = pa.schema((
1213+
pa.field("foo", pa.string(), nullable=True),
1214+
pa.field("bar", pa.int32(), nullable=False),
1215+
))
1216+
try:
1217+
_check_schema_compatible(table_schema_simple, other_schema)
1218+
except Exception:
1219+
pytest.fail("Unexpected Exception raised when calling `_check_schema_compatible`")
1220+
1221+
12031222
def test_schema_downcast(table_schema_simple: Schema) -> None:
12041223
# large_string type is compatible with string type
12051224
other_schema = pa.schema((
@@ -1211,7 +1230,7 @@ def test_schema_downcast(table_schema_simple: Schema) -> None:
12111230
try:
12121231
_check_schema_compatible(table_schema_simple, other_schema)
12131232
except Exception:
1214-
pytest.fail("Unexpected Exception raised when calling `_check_schema`")
1233+
pytest.fail("Unexpected Exception raised when calling `_check_schema_compatible`")
12151234

12161235

12171236
def test_table_properties(example_table_metadata_v2: Dict[str, Any]) -> None:

0 commit comments

Comments (0)