Skip to content

Commit 5b1b424

Browse files
author
kx79wq
committed
fix: fix spark4 unit tests
- add py3.12 support for spark4
1 parent 6863972 commit 5b1b424

6 files changed

Lines changed: 33 additions & 29 deletions

File tree

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ jobs:
3535
run: |
3636
python -m pip install --upgrade pip
3737
if [ "${{ matrix.numpy_version }}" = "numpy<2" ]; then
38-
pip install ".[test,pandas,spark,test_numpy_pre2]"
38+
pip install ".[test,pandas,test_spark_pre2,test_numpy_pre2]"
3939
else
4040
pip install ".[test,pandas,spark]"
4141
fi

histogrammar/dfinterface/spark_histogrammar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ def construct_empty_hist(self, df, features):
225225
for idx, col in enumerate(revcols):
226226
# histogram type depends on the data type
227227
dt = self.var_dtype[col]
228-
quant = df[col]
228+
quant = f.col(col)
229229
hist = self.get_hist_bin(hist, features, quant, col, dt)
230230

231231
return hist

histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
{
44
"cell_type": "markdown",
55
"metadata": {
6+
"collapsed": false,
67
"jupyter": {
78
"outputs_hidden": false
89
},
@@ -118,9 +119,9 @@
118119
"# for spark 2.X, in the jars string, for both jar files change \"_2.12\" into \"_2.11\".\n",
119120
"\n",
120121
"if pyspark_installed:\n",
121-
" scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n",
122-
" hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n",
123-
" hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n",
122+
" scala = '2.12' if int(pyspark_version[0]) == 3 else '2.13'\n",
123+
" hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.30'\n",
124+
" hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.30'\n",
124125
"\n",
125126
" spark = SparkSession.builder.config(\n",
126127
" \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n",
@@ -521,7 +522,7 @@
521522
"name": "python",
522523
"nbconvert_exporter": "python",
523524
"pygments_lexer": "ipython3",
524-
"version": "3.7.6"
525+
"version": "3.11.11"
525526
},
526527
"nteract": {
527528
"version": "0.15.0"

histogrammar/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ def __init__(self, expr, name=None):
247247
ok = False
248248
else:
249249
if isinstance(expr, Column) and self.name is None:
250-
self.name = str(expr)[7:-1]
250+
self.name = str(expr)[8:-2]
251251
ok = True
252252
if not ok:
253253
raise TypeError(f"quantity ({expr}) must be a string, function, or SparkSQL Column")

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ pandas = [
4040
"pandas"
4141
]
4242
spark = [
43-
"pyspark; python_version <= '3.11'",
43+
"pyspark",
4444
]
4545
test = [
4646
"ipykernel>=5.1.3",
@@ -55,6 +55,9 @@ test_numpy_pre2 = [
5555
"numpy<2",
5656
"pandas<2",
5757
]
58+
test_spark_pre2 = [
59+
"pyspark<4; python_version <= '3.11'",
60+
]
5861

5962
# files to be shipped with the installation, under: histogrammar/test_data and histogrammar/notebooks
6063
# after installation, these can be found with the functions in resources.py

tests/test_spark_histogrammar.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -53,16 +53,16 @@ def spark_co():
5353
@pytest.mark.skipif(not spark_found, reason="spark not found")
5454
@pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
5555
def test_get_histograms(spark_co):
56-
pytest.age["data"]["name"] = "'age'"
57-
pytest.company["data"]["name"] = "'company'"
58-
pytest.eyesColor["data"]["name"] = "'eyeColor'"
59-
pytest.gender["data"]["name"] = "'gender'"
60-
pytest.isActive["data"]["name"] = "'isActive'"
61-
pytest.latitude["data"]["name"] = "'latitude'"
62-
pytest.longitude["data"]["name"] = "'longitude'"
63-
pytest.transaction["data"]["name"] = "'transaction'"
64-
65-
pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
56+
pytest.age["data"]["name"] = "age"
57+
pytest.company["data"]["name"] = "company"
58+
pytest.eyesColor["data"]["name"] = "eyeColor"
59+
pytest.gender["data"]["name"] = "gender"
60+
pytest.isActive["data"]["name"] = "isActive"
61+
pytest.latitude["data"]["name"] = "latitude"
62+
pytest.longitude["data"]["name"] = "longitude"
63+
pytest.transaction["data"]["name"] = "transaction"
64+
65+
pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
6666
pytest.latitude_longitude["data"]["bins:name"] = "unit_func"
6767

6868
spark = spark_co
@@ -113,15 +113,15 @@ def test_get_histograms(spark_co):
113113
@pytest.mark.skipif(not spark_found, reason="spark not found")
114114
@pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
115115
def test_get_histograms_module(spark_co):
116-
pytest.age["data"]["name"] = "'age'"
117-
pytest.company["data"]["name"] = "'company'"
118-
pytest.eyesColor["data"]["name"] = "'eyeColor'"
119-
pytest.gender["data"]["name"] = "'gender'"
120-
pytest.isActive["data"]["name"] = "'isActive'"
121-
pytest.latitude["data"]["name"] = "'latitude'"
122-
pytest.longitude["data"]["name"] = "'longitude'"
123-
124-
pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
116+
pytest.age["data"]["name"] = "age"
117+
pytest.company["data"]["name"] = "company"
118+
pytest.eyesColor["data"]["name"] = "eyeColor"
119+
pytest.gender["data"]["name"] = "gender"
120+
pytest.isActive["data"]["name"] = "isActive"
121+
pytest.latitude["data"]["name"] = "latitude"
122+
pytest.longitude["data"]["name"] = "longitude"
123+
124+
pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
125125
pytest.latitude_longitude["data"]["bins:name"] = "unit_func"
126126

127127
spark = spark_co
@@ -196,7 +196,7 @@ def test_get_histograms_timestamp(spark_co):
196196
"bins": {"108": 9.0, "109": 1.0},
197197
"bins:type": "Count",
198198
"entries": 10.0,
199-
"name": "'dt'",
199+
"name": "dt",
200200
"nanflow": 0.0,
201201
"nanflow:type": "Count",
202202
"origin": 1.2625632e18,
@@ -238,7 +238,7 @@ def test_get_histograms_date(spark_co):
238238
"bins": {"108": 9.0, "109": 1.0},
239239
"bins:type": "Count",
240240
"entries": 10.0,
241-
"name": "'dt'",
241+
"name": "dt",
242242
"nanflow": 0.0,
243243
"nanflow:type": "Count",
244244
"origin": 1.2625632e18,

0 commit comments

Comments
 (0)