Skip to content

Commit 38c1fed

Browse files
thodson-usgsclaude
andcommitted
Fix _arrange_cols: stop mutating the caller's properties list
`_arrange_cols` did `properties.append("geometry")` and `properties[properties.index("id")] = output_id` in place. Calling any getter twice with the same `properties=[...]` therefore caused that list to grow unboundedly and have `id` rewritten across calls — a real action-at-a-distance defect for any code that re-uses a `properties` list across requests. Take a local copy of the list and mutate that. Caller's list is untouched. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 51ac674 commit 38c1fed

2 files changed

Lines changed: 53 additions & 5 deletions

File tree

dataretrieval/waterdata/utils.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -687,15 +687,17 @@ def _arrange_cols(
687687
# If properties are provided, filter to only those columns
688688
# plus geometry if skip_geometry is False
689689
if properties and not all(pd.isna(properties)):
690+
# Work on a local copy so we don't mutate the caller's list across calls.
691+
local_properties = list(properties)
690692
# Make sure geometry stays in the dataframe if skip_geometry is False
691-
if "geometry" in df.columns and "geometry" not in properties:
692-
properties.append("geometry")
693+
if "geometry" in df.columns and "geometry" not in local_properties:
694+
local_properties.append("geometry")
693695
# id is technically a valid column from the service, but these
694696
# functions make the name more specific. So, if someone requests
695697
# 'id', give them the output_id column
696-
if "id" in properties:
697-
properties[properties.index("id")] = output_id
698-
df = df.loc[:, [col for col in properties if col in df.columns]]
698+
if "id" in local_properties:
699+
local_properties[local_properties.index("id")] = output_id
700+
df = df.loc[:, [col for col in local_properties if col in df.columns]]
699701

700702
# Move meaningless-to-user, extra id columns to the end
701703
# of the dataframe, if they exist

tests/waterdata_utils_test.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from unittest import mock
22

3+
import pandas as pd
34
import requests
45

56
from dataretrieval.waterdata.utils import (
7+
_arrange_cols,
68
_get_args,
79
_walk_pages,
810
)
@@ -80,3 +82,47 @@ def test_walk_pages_multiple_mocked():
8082
assert mock_client.send.called
8183
assert mock_client.request.called
8284
assert mock_client.request.call_args[0][1] == "https://example.com/page2"
85+
86+
87+
# --- _arrange_cols ----------------------------------------------------------
88+
89+
90+
def test_arrange_cols_does_not_mutate_caller_properties():
91+
"""`_arrange_cols` must not mutate the caller's `properties` list.
92+
93+
Regression: previously the function did
94+
``properties.append("geometry")`` and
95+
``properties[properties.index("id")] = output_id`` in place, so the
96+
caller's list grew and was rewritten across successive calls.
97+
"""
98+
df = pd.DataFrame(
99+
{
100+
"id": ["a", "b"],
101+
"value": [1.0, 2.0],
102+
"geometry": ["p1", "p2"],
103+
}
104+
)
105+
properties = ["id", "value"]
106+
snapshot = list(properties)
107+
108+
_arrange_cols(df, properties, output_id="daily_id")
109+
_arrange_cols(df, properties, output_id="daily_id")
110+
111+
assert properties == snapshot, (
112+
f"caller's properties list was mutated: {properties!r} != {snapshot!r}"
113+
)
114+
115+
116+
def test_arrange_cols_swaps_id_in_returned_columns():
117+
"""`'id'` in `properties` should still resolve to the output_id column."""
118+
df = pd.DataFrame({"id": ["a"], "value": [1.0]})
119+
result = _arrange_cols(df, ["id", "value"], output_id="daily_id")
120+
assert "daily_id" in result.columns
121+
assert "id" not in result.columns
122+
123+
124+
def test_arrange_cols_keeps_geometry_when_present():
125+
"""Geometry must come along even if the caller didn't list it."""
126+
df = pd.DataFrame({"id": ["a"], "value": [1.0], "geometry": ["p1"]})
127+
result = _arrange_cols(df, ["value"], output_id="daily_id")
128+
assert "geometry" in result.columns

0 commit comments

Comments
 (0)