fix: repair unit tests for Spark 4

kx79wq · kx79wq · commit 5b1b4241b460 · 2025-09-01T22:21:17.000+02:00
- add Python 3.12 support for Spark 4
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -35,7 +35,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           if [ "${{ matrix.numpy_version }}" = "numpy<2" ]; then
-            pip install ".[test,pandas,spark,test_numpy_pre2]"
+            pip install ".[test,pandas,test_spark_pre2,test_numpy_pre2]"
           else
             pip install ".[test,pandas,spark]"          
           fi
diff --git a/histogrammar/dfinterface/spark_histogrammar.py b/histogrammar/dfinterface/spark_histogrammar.py
@@ -225,7 +225,7 @@ def construct_empty_hist(self, df, features):
         for idx, col in enumerate(revcols):
             # histogram type depends on the data type
             dt = self.var_dtype[col]
-            quant = df[col]
+            quant = f.col(col)
             hist = self.get_hist_bin(hist, features, quant, col, dt)
 
         return hist
diff --git a/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb b/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb
@@ -3,6 +3,7 @@
   {
    "cell_type": "markdown",
    "metadata": {
+    "collapsed": false,
     "jupyter": {
      "outputs_hidden": false
     },
@@ -118,9 +119,9 @@
     "# for spark 2.X, in the jars string, for both jar files change \"_2.12\" into \"_2.11\".\n",
     "\n",
     "if pyspark_installed:\n",
-    "    scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n",
-    "    hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n",
-    "    hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n",
+    "    scala = '2.12' if int(pyspark_version[0]) == 3 else '2.13'\n",
+    "    hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.30'\n",
+    "    hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.30'\n",
     "\n",
     "    spark = SparkSession.builder.config(\n",
     "        \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n",
@@ -521,7 +522,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.11.11"
   },
   "nteract": {
    "version": "0.15.0"
diff --git a/histogrammar/util.py b/histogrammar/util.py
@@ -247,7 +247,7 @@ def __init__(self, expr, name=None):
                 ok = False
             else:
                 if isinstance(expr, Column) and self.name is None:
-                    self.name = str(expr)[7:-1]
+                    self.name = str(expr)[8:-2]
                 ok = True
         if not ok:
             raise TypeError(f"quantity ({expr}) must be a string, function, or SparkSQL Column")
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,7 +40,7 @@ pandas = [
     "pandas"
 ]
 spark = [
-    "pyspark; python_version <= '3.11'",
+    "pyspark",
 ]
 test = [
     "ipykernel>=5.1.3",
@@ -55,6 +55,9 @@ test_numpy_pre2 = [
     "numpy<2",
     "pandas<2",
 ]
+test_spark_pre2 = [
+    "pyspark<4; python_version <= '3.11'",
+]
 
 # files to be shipped with the installation, under: histogrammar/test_data and histogrammar/notebooks
 # after installation, these can be found with the functions in resources.py
diff --git a/tests/test_spark_histogrammar.py b/tests/test_spark_histogrammar.py
@@ -53,16 +53,16 @@ def spark_co():
 @pytest.mark.skipif(not spark_found, reason="spark not found")
 @pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
 def test_get_histograms(spark_co):
-    pytest.age["data"]["name"] = "'age'"
-    pytest.company["data"]["name"] = "'company'"
-    pytest.eyesColor["data"]["name"] = "'eyeColor'"
-    pytest.gender["data"]["name"] = "'gender'"
-    pytest.isActive["data"]["name"] = "'isActive'"
-    pytest.latitude["data"]["name"] = "'latitude'"
-    pytest.longitude["data"]["name"] = "'longitude'"
-    pytest.transaction["data"]["name"] = "'transaction'"
-
-    pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
+    pytest.age["data"]["name"] = "age"
+    pytest.company["data"]["name"] = "company"
+    pytest.eyesColor["data"]["name"] = "eyeColor"
+    pytest.gender["data"]["name"] = "gender"
+    pytest.isActive["data"]["name"] = "isActive"
+    pytest.latitude["data"]["name"] = "latitude"
+    pytest.longitude["data"]["name"] = "longitude"
+    pytest.transaction["data"]["name"] = "transaction"
+
+    pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
     pytest.latitude_longitude["data"]["bins:name"] = "unit_func"
 
     spark = spark_co
@@ -113,15 +113,15 @@ def test_get_histograms(spark_co):
 @pytest.mark.skipif(not spark_found, reason="spark not found")
 @pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
 def test_get_histograms_module(spark_co):
-    pytest.age["data"]["name"] = "'age'"
-    pytest.company["data"]["name"] = "'company'"
-    pytest.eyesColor["data"]["name"] = "'eyeColor'"
-    pytest.gender["data"]["name"] = "'gender'"
-    pytest.isActive["data"]["name"] = "'isActive'"
-    pytest.latitude["data"]["name"] = "'latitude'"
-    pytest.longitude["data"]["name"] = "'longitude'"
-
-    pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
+    pytest.age["data"]["name"] = "age"
+    pytest.company["data"]["name"] = "company"
+    pytest.eyesColor["data"]["name"] = "eyeColor"
+    pytest.gender["data"]["name"] = "gender"
+    pytest.isActive["data"]["name"] = "isActive"
+    pytest.latitude["data"]["name"] = "latitude"
+    pytest.longitude["data"]["name"] = "longitude"
+
+    pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
     pytest.latitude_longitude["data"]["bins:name"] = "unit_func"
 
     spark = spark_co
@@ -196,7 +196,7 @@ def test_get_histograms_timestamp(spark_co):
             "bins": {"108": 9.0, "109": 1.0},
             "bins:type": "Count",
             "entries": 10.0,
-            "name": "'dt'",
+            "name": "dt",
             "nanflow": 0.0,
             "nanflow:type": "Count",
             "origin": 1.2625632e18,
@@ -238,7 +238,7 @@ def test_get_histograms_date(spark_co):
             "bins": {"108": 9.0, "109": 1.0},
             "bins:type": "Count",
             "entries": 10.0,
-            "name": "'dt'",
+            "name": "dt",
             "nanflow": 0.0,
             "nanflow:type": "Count",
             "origin": 1.2625632e18,

Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@ pandas = [`
`40`	`40`	`"pandas"`
`41`	`41`	`]`
`42`	`42`	`spark = [`
`43`		`- "pyspark; python_version <= '3.11'",`
	`43`	`+ "pyspark",`
`44`	`44`	`]`
`45`	`45`	`test = [`
`46`	`46`	`"ipykernel>=5.1.3",`
`@@ -55,6 +55,9 @@ test_numpy_pre2 = [`
`55`	`55`	`"numpy<2",`
`56`	`56`	`"pandas<2",`
`57`	`57`	`]`
	`58`	`+test_spark_pre2 = [`
	`59`	`+ "pyspark<4; python_version <= '3.11'",`
	`60`	`+]`
`58`	`61`
`59`	`62`	`# files to be shipped with the installation, under: histogrammar/test_data and histogrammar/notebooks`
`60`	`63`	`# after installation, these can be found with the functions in resources.py`