[GH-2007] Geopandas.Dataframe: Fix constructor for pandas-on-pyspark and Sedona Geopandas input types (#2008)

petern48 · web-flow · commit 70967cb963e2 · 2025-06-26T12:12:04.000-07:00
* Fix small constructor bug

* Fix condition for converting to wkb

* Fix constructor to not error on sgpd and pspd inputs

* Add constructor tests for all input types, including non-geometry

* pre-commit reformat

* Change to BaseGeometry for shapely compatibility

* pre-commit fmt

* Remove empty list and dict test cases since different Spark versions handle them differently
diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py
@@ -141,15 +141,12 @@ def __init__(
         if isinstance(data, (GeoDataFrame, GeoSeries)):
             assert dtype is None
             assert not copy
-            self._anchor = data
-            self._col_label = index
+            super().__init__(data, index=index, dtype=dtype, copy=copy)
         elif isinstance(data, (PandasOnSparkSeries, PandasOnSparkDataFrame)):
             assert columns is None
             assert dtype is None
             assert not copy
-            if index is None:
-                internal = InternalFrame(spark_frame=data._internal.spark_frame)
-                object.__setattr__(self, "_internal_frame", internal)
+            super().__init__(data, index=index, dtype=dtype)
         elif isinstance(data, SparkDataFrame):
             assert columns is None
             assert dtype is None
@@ -173,8 +170,13 @@ def __init__(
                 )
             gdf = gpd.GeoDataFrame(df)
             # convert each geometry column to wkb type
+            import shapely
+
             for col in gdf.columns:
-                if isinstance(gdf[col], gpd.GeoSeries):
+                # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc of shapely.Geometry objects.
+                if len(gdf[col]) > 0 and isinstance(
+                    gdf[col].iloc[0], shapely.geometry.base.BaseGeometry
+                ):
                     gdf[col] = gdf[col].apply(lambda geom: geom.wkb)
             pdf = pd.DataFrame(gdf)
             # initialize the parent class pyspark Dataframe with the pandas Series
diff --git a/python/tests/geopandas/test_geodataframe.py b/python/tests/geopandas/test_geodataframe.py
@@ -21,9 +21,14 @@
     Point,
 )
 
-from sedona.geopandas import GeoDataFrame
+from sedona.geopandas import GeoDataFrame, GeoSeries
 from tests.test_base import TestBase
 import pyspark.pandas as ps
+import pandas as pd
+import geopandas as gpd
+import sedona.geopandas as sgpd
+import pytest
+from pandas.testing import assert_frame_equal
 
 
 class TestDataframe(TestBase):
@@ -41,10 +46,52 @@ class TestDataframe(TestBase):
     #
     # def teardown_method(self):
     #     shutil.rmtree(self.tempdir)
-
-    def test_constructor(self):
-        df = GeoDataFrame([Point(x, x) for x in range(3)])
-        check_geodataframe(df)
+    @pytest.mark.parametrize(
+        "obj",
+        [
+            [Point(x, x) for x in range(3)],
+            {"geometry": [Point(x, x) for x in range(3)]},
+            pd.DataFrame([Point(x, x) for x in range(3)]),
+            gpd.GeoDataFrame([Point(x, x) for x in range(3)]),
+            pd.Series([Point(x, x) for x in range(3)]),
+            gpd.GeoSeries([Point(x, x) for x in range(3)]),
+            GeoSeries([Point(x, x) for x in range(3)]),
+            GeoDataFrame([Point(x, x) for x in range(3)]),
+        ],
+    )
+    def test_constructor(self, obj):
+        sgpd_df = GeoDataFrame(obj)
+        check_geodataframe(sgpd_df)
+
+    def test_constructor_pandas_on_spark(self):
+        for obj in [
+            ps.DataFrame([Point(x, x) for x in range(3)]),
+            ps.Series([Point(x, x) for x in range(3)]),
+        ]:
+            sgpd_df = GeoDataFrame(obj)
+            check_geodataframe(sgpd_df)
+
+    @pytest.mark.parametrize(
+        "obj",
+        [
+            [0, 1, 2],
+            ["x", "y", "z"],
+            {"a": [0, 1, 2], 1: [4, 5, 6]},
+            {"a": ["x", "y", "z"], 1: ["a", "b", "c"]},
+            pd.Series([0, 1, 2]),
+            pd.Series(["x", "y", "z"]),
+            pd.DataFrame({"x": ["x", "y", "z"]}),
+            gpd.GeoDataFrame({"x": [0, 1, 2]}),
+            ps.DataFrame({"x": ["x", "y", "z"]}),
+        ],
+    )
+    def test_non_geometry(self, obj):
+        pd_df = pd.DataFrame(obj)
+        # pd.DataFrame(obj) doesn't work correctly for pandas on spark DataFrame type, so we use to_pandas() method instead.
+        if isinstance(obj, ps.DataFrame):
+            pd_df = obj.to_pandas()
+        sgpd_df = sgpd.GeoDataFrame(obj)
+        assert_frame_equal(pd_df, sgpd_df.to_pandas())
 
     def test_psdf(self):
         # this is to make sure the spark session works with pandas on spark api
@@ -73,7 +120,10 @@ def test_type_single_geometry_column(self):
 
         # Assert the geometry column has the correct type and is not nullable
         geometry_field = schema["geometry1"]
-        assert geometry_field.dataType.typeName() == "geometrytype"
+        assert (
+            geometry_field.dataType.typeName() == "geometrytype"
+            or geometry_field.dataType.typeName() == "binary"
+        )
         assert not geometry_field.nullable
 
         # Assert non-geometry columns are present with correct types
@@ -97,16 +147,25 @@ def test_type_multiple_geometry_columns(self):
         schema = df._internal.spark_frame.schema
         # Assert both geometry columns have the correct type
         geometry_field1 = schema["geometry1"]
-        assert geometry_field1.dataType.typeName() == "geometrytype"
+        assert (
+            geometry_field1.dataType.typeName() == "geometrytype"
+            or geometry_field1.dataType.typeName() == "binary"
+        )
         assert not geometry_field1.nullable
 
         geometry_field2 = schema["geometry2"]
-        assert geometry_field2.dataType.typeName() == "geometrytype"
+        assert (
+            geometry_field2.dataType.typeName() == "geometrytype"
+            or geometry_field2.dataType.typeName() == "binary"
+        )
         assert not geometry_field2.nullable
 
         # Check non-geometry column
         attribute_field = schema["attribute"]
-        assert attribute_field.dataType.typeName() != "geometrytype"
+        assert (
+            attribute_field.dataType.typeName() != "geometrytype"
+            and attribute_field.dataType.typeName() != "binary"
+        )
 
     def test_copy(self):
         df = GeoDataFrame([Point(x, x) for x in range(3)], name="test_df")