From a89cb0132ded6ccc476e6077adc475c998f8d9f9 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 25 Jun 2025 10:45:51 -0700 Subject: [PATCH 1/8] Fix small constructor bug --- python/sedona/geopandas/geodataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py index a3b1db624c0..e5ac7605bfa 100644 --- a/python/sedona/geopandas/geodataframe.py +++ b/python/sedona/geopandas/geodataframe.py @@ -148,7 +148,9 @@ def __init__( assert dtype is None assert not copy if index is None: - internal = InternalFrame(spark_frame=data._internal.spark_frame) + internal = InternalFrame( + spark_frame=data._internal.spark_frame, index_spark_columns=None + ) object.__setattr__(self, "_internal_frame", internal) elif isinstance(data, SparkDataFrame): assert columns is None From b971e740097e12f55f166a48d336c34f9b919033 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 25 Jun 2025 14:59:11 -0700 Subject: [PATCH 2/8] Fix condition for converting to wkb --- python/sedona/geopandas/geodataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py index e5ac7605bfa..9e59c96e9e3 100644 --- a/python/sedona/geopandas/geodataframe.py +++ b/python/sedona/geopandas/geodataframe.py @@ -175,8 +175,10 @@ def __init__( ) gdf = gpd.GeoDataFrame(df) # convert each geometry column to wkb type + import shapely for col in gdf.columns: - if isinstance(gdf[col], gpd.GeoSeries): + # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc of shapely.Geometry objects. 
+ if len(gdf[col]) > 0 and isinstance(gdf[col].iloc[0], shapely.Geometry): gdf[col] = gdf[col].apply(lambda geom: geom.wkb) pdf = pd.DataFrame(gdf) # initialize the parent class pyspark Dataframe with the pandas Series From a75476d93ddf39512b1f3506e243f1805a960b56 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 25 Jun 2025 15:00:33 -0700 Subject: [PATCH 3/8] Fix constructor to not error on sgpd and pspd inputs --- python/sedona/geopandas/geodataframe.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py index 9e59c96e9e3..d5f05ba9e20 100644 --- a/python/sedona/geopandas/geodataframe.py +++ b/python/sedona/geopandas/geodataframe.py @@ -141,17 +141,12 @@ def __init__( if isinstance(data, (GeoDataFrame, GeoSeries)): assert dtype is None assert not copy - self._anchor = data - self._col_label = index + super().__init__(data, index=index, dtype=dtype, copy=copy) elif isinstance(data, (PandasOnSparkSeries, PandasOnSparkDataFrame)): assert columns is None assert dtype is None assert not copy - if index is None: - internal = InternalFrame( - spark_frame=data._internal.spark_frame, index_spark_columns=None - ) - object.__setattr__(self, "_internal_frame", internal) + super().__init__(data, index=index, dtype=dtype) elif isinstance(data, SparkDataFrame): assert columns is None assert dtype is None From 8c0b15d2a8ee1863b201b270fcd415a95ba93be1 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 25 Jun 2025 15:02:24 -0700 Subject: [PATCH 4/8] Add constructor tests for all input types, including non-geometry --- python/tests/geopandas/test_geodataframe.py | 67 ++++++++++++++++++--- 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/python/tests/geopandas/test_geodataframe.py b/python/tests/geopandas/test_geodataframe.py index b236581255c..76151499247 100644 --- a/python/tests/geopandas/test_geodataframe.py +++ 
b/python/tests/geopandas/test_geodataframe.py @@ -21,9 +21,14 @@ Point, ) -from sedona.geopandas import GeoDataFrame +from sedona.geopandas import GeoDataFrame, GeoSeries from tests.test_base import TestBase import pyspark.pandas as ps +import pandas as pd +import geopandas as gpd +import sedona.geopandas as sgpd +import pytest +from pandas.testing import assert_frame_equal class TestDataframe(TestBase): @@ -41,10 +46,54 @@ class TestDataframe(TestBase): # # def teardown_method(self): # shutil.rmtree(self.tempdir) - - def test_constructor(self): - df = GeoDataFrame([Point(x, x) for x in range(3)]) - check_geodataframe(df) + @pytest.mark.parametrize( + "obj", + [ + [Point(x, x) for x in range(3)], + {"geometry": [Point(x, x) for x in range(3)]}, + pd.DataFrame([Point(x, x) for x in range(3)]), + gpd.GeoDataFrame([Point(x, x) for x in range(3)]), + pd.Series([Point(x, x) for x in range(3)]), + gpd.GeoSeries([Point(x, x) for x in range(3)]), + GeoSeries([Point(x, x) for x in range(3)]), + GeoDataFrame([Point(x, x) for x in range(3)]), + ], + ) + def test_constructor(self, obj): + sgpd_df = GeoDataFrame(obj) + check_geodataframe(sgpd_df) + + def test_constructor_pandas_on_spark(self): + for obj in [ + ps.DataFrame([Point(x, x) for x in range(3)]), + ps.Series([Point(x, x) for x in range(3)]), + ]: + sgpd_df = GeoDataFrame(obj) + check_geodataframe(sgpd_df) + + @pytest.mark.parametrize( + "obj", + [ + [], + [0, 1, 2], + ["x", "y", "z"], + {}, + {"a": [0, 1, 2], 1: [4, 5, 6]}, + {"a": ["x", "y", "z"], 1: ["a", "b", "c"]}, + pd.Series([0, 1, 2]), + pd.Series(["x", "y", "z"]), + pd.DataFrame({"x": ["x", "y", "z"]}), + gpd.GeoDataFrame({"x": [0, 1, 2]}), + ps.DataFrame({"x": ["x", "y", "z"]}), + ], + ) + def test_non_geometry(self, obj): + pd_df = pd.DataFrame(obj) + # pd.DataFrame(obj) doesn't work correctly for pandas on spark DataFrame type, so we use to_pandas() method instead. 
+ if isinstance(obj, ps.DataFrame): + pd_df = obj.to_pandas() + sgpd_df = sgpd.GeoDataFrame(obj) + assert_frame_equal(pd_df, sgpd_df.to_pandas()) def test_psdf(self): # this is to make sure the spark session works with pandas on spark api @@ -73,7 +122,7 @@ def test_type_single_geometry_column(self): # Assert the geometry column has the correct type and is not nullable geometry_field = schema["geometry1"] - assert geometry_field.dataType.typeName() == "geometrytype" + assert geometry_field.dataType.typeName() == "geometrytype" or geometry_field.dataType.typeName() == "binary" assert not geometry_field.nullable # Assert non-geometry columns are present with correct types @@ -97,16 +146,16 @@ def test_type_multiple_geometry_columns(self): schema = df._internal.spark_frame.schema # Assert both geometry columns have the correct type geometry_field1 = schema["geometry1"] - assert geometry_field1.dataType.typeName() == "geometrytype" + assert geometry_field1.dataType.typeName() == "geometrytype" or geometry_field1.dataType.typeName() == "binary" assert not geometry_field1.nullable geometry_field2 = schema["geometry2"] - assert geometry_field2.dataType.typeName() == "geometrytype" + assert geometry_field2.dataType.typeName() == "geometrytype" or geometry_field2.dataType.typeName() == "binary" assert not geometry_field2.nullable # Check non-geometry column attribute_field = schema["attribute"] - assert attribute_field.dataType.typeName() != "geometrytype" + assert attribute_field.dataType.typeName() != "geometrytype" and attribute_field.dataType.typeName() != "binary" def test_copy(self): df = GeoDataFrame([Point(x, x) for x in range(3)], name="test_df") From 72b4967c32ce97714eafb0858555f754c328f9c5 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 25 Jun 2025 15:03:42 -0700 Subject: [PATCH 5/8] pre-commit reformat --- python/sedona/geopandas/geodataframe.py | 1 + python/tests/geopandas/test_geodataframe.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 
4 deletions(-) diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py index d5f05ba9e20..02a519cd5db 100644 --- a/python/sedona/geopandas/geodataframe.py +++ b/python/sedona/geopandas/geodataframe.py @@ -171,6 +171,7 @@ def __init__( gdf = gpd.GeoDataFrame(df) # convert each geometry column to wkb type import shapely + for col in gdf.columns: # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc of shapely.Geometry objects. if len(gdf[col]) > 0 and isinstance(gdf[col].iloc[0], shapely.Geometry): diff --git a/python/tests/geopandas/test_geodataframe.py b/python/tests/geopandas/test_geodataframe.py index 76151499247..3988ac4355a 100644 --- a/python/tests/geopandas/test_geodataframe.py +++ b/python/tests/geopandas/test_geodataframe.py @@ -122,7 +122,10 @@ def test_type_single_geometry_column(self): # Assert the geometry column has the correct type and is not nullable geometry_field = schema["geometry1"] - assert geometry_field.dataType.typeName() == "geometrytype" or geometry_field.dataType.typeName() == "binary" + assert ( + geometry_field.dataType.typeName() == "geometrytype" + or geometry_field.dataType.typeName() == "binary" + ) assert not geometry_field.nullable # Assert non-geometry columns are present with correct types @@ -146,16 +149,25 @@ def test_type_multiple_geometry_columns(self): schema = df._internal.spark_frame.schema # Assert both geometry columns have the correct type geometry_field1 = schema["geometry1"] - assert geometry_field1.dataType.typeName() == "geometrytype" or geometry_field1.dataType.typeName() == "binary" + assert ( + geometry_field1.dataType.typeName() == "geometrytype" + or geometry_field1.dataType.typeName() == "binary" + ) assert not geometry_field1.nullable geometry_field2 = schema["geometry2"] - assert geometry_field2.dataType.typeName() == "geometrytype" or geometry_field2.dataType.typeName() == "binary" + assert ( + geometry_field2.dataType.typeName() == "geometrytype" + or 
geometry_field2.dataType.typeName() == "binary" + ) assert not geometry_field2.nullable # Check non-geometry column attribute_field = schema["attribute"] - assert attribute_field.dataType.typeName() != "geometrytype" and attribute_field.dataType.typeName() != "binary" + assert ( + attribute_field.dataType.typeName() != "geometrytype" + and attribute_field.dataType.typeName() != "binary" + ) def test_copy(self): df = GeoDataFrame([Point(x, x) for x in range(3)], name="test_df") From 3e243f36da9dce452d7948ab63e455522bf06c8c Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 25 Jun 2025 15:24:38 -0700 Subject: [PATCH 6/8] Change to BaseGeometry for shapely compatibility --- python/sedona/geopandas/geodataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py index 02a519cd5db..28e30278ee8 100644 --- a/python/sedona/geopandas/geodataframe.py +++ b/python/sedona/geopandas/geodataframe.py @@ -174,7 +174,7 @@ def __init__( for col in gdf.columns: # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc of shapely.Geometry objects.
- if len(gdf[col]) > 0 and isinstance(gdf[col].iloc[0], shapely.Geometry): + if len(gdf[col]) > 0 and isinstance(gdf[col].iloc[0], shapely.geometry.base.BaseGeometry): gdf[col] = gdf[col].apply(lambda geom: geom.wkb) pdf = pd.DataFrame(gdf) # initialize the parent class pyspark Dataframe with the pandas Series From aed31262b48f79024a66b1584799eb42de6e5c52 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 25 Jun 2025 15:33:28 -0700 Subject: [PATCH 7/8] pre-commit fmt --- python/sedona/geopandas/geodataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py index 28e30278ee8..a2f90dff4b1 100644 --- a/python/sedona/geopandas/geodataframe.py +++ b/python/sedona/geopandas/geodataframe.py @@ -174,7 +174,9 @@ def __init__( for col in gdf.columns: # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc of shapely.Geometry objects. - if len(gdf[col]) > 0 and isinstance(gdf[col].iloc[0], shapely.geometry.base.BaseGeometry): + if len(gdf[col]) > 0 and isinstance( + gdf[col].iloc[0], shapely.geometry.base.BaseGeometry + ): gdf[col] = gdf[col].apply(lambda geom: geom.wkb) pdf = pd.DataFrame(gdf) # initialize the parent class pyspark Dataframe with the pandas Series From ceb9a2aa2f688c0d8471484eac505ca77cb4d7c2 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Wed, 25 Jun 2025 16:07:56 -0700 Subject: [PATCH 8/8] Remove empty list and dict test cases since different Spark versions handle them differently --- python/tests/geopandas/test_geodataframe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/tests/geopandas/test_geodataframe.py b/python/tests/geopandas/test_geodataframe.py index 3988ac4355a..33e0041dc20 100644 --- a/python/tests/geopandas/test_geodataframe.py +++ b/python/tests/geopandas/test_geodataframe.py @@ -74,10 +74,8 @@ def test_constructor_pandas_on_spark(self): @pytest.mark.parametrize( "obj", [ - [], [0, 1, 2], ["x", "y", "z"], - {}, {"a": [0, 1,
2], 1: [4, 5, 6]}, {"a": ["x", "y", "z"], 1: ["a", "b", "c"]}, pd.Series([0, 1, 2]),