Skip to content

Commit f224666

Browse files
committed
Merge remote-tracking branch 'upstream/main' into fix-enums-site-type
2 parents 2a21b14 + dd70cfa commit f224666

2 files changed

Lines changed: 55 additions & 11 deletions

File tree

dataretrieval/waterdata/utils.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -713,18 +713,16 @@ def _arrange_cols(
713713
# Rename id column to output_id
714714
df = df.rename(columns={"id": output_id})
715715

716-
# If properties are provided, filter to only those columns
717-
# plus geometry if skip_geometry is False
718716
if properties and not all(pd.isna(properties)):
719-
# Make sure geometry stays in the dataframe if skip_geometry is False
720-
if "geometry" in df.columns and "geometry" not in properties:
721-
properties.append("geometry")
722-
# id is technically a valid column from the service, but these
723-
# functions make the name more specific. So, if someone requests
724-
# 'id', give them the output_id column
725-
if "id" in properties:
726-
properties[properties.index("id")] = output_id
727-
df = df.loc[:, [col for col in properties if col in df.columns]]
717+
# Don't alias the caller's list — we mutate below.
718+
local_properties = list(properties)
719+
if "geometry" in df.columns and "geometry" not in local_properties:
720+
local_properties.append("geometry")
721+
# 'id' is a valid service column, but expose it under the
722+
# service-specific output_id name instead.
723+
if "id" in local_properties:
724+
local_properties[local_properties.index("id")] = output_id
725+
df = df.loc[:, [col for col in local_properties if col in df.columns]]
728726

729727
# Move meaningless-to-user, extra id columns to the end
730728
# of the dataframe, if they exist

tests/waterdata_utils_test.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from unittest import mock
22

3+
import pandas as pd
34
import requests
45

56
from dataretrieval.waterdata.utils import (
7+
_arrange_cols,
68
_format_api_dates,
79
_get_args,
810
_handle_stats_nesting,
@@ -114,6 +116,50 @@ def test_handle_stats_nesting_tolerates_missing_drop_columns():
114116
assert df["monitoring_location_id"].iloc[0] == "USGS-12345"
115117

116118

119+
# --- _arrange_cols ----------------------------------------------------------
120+
121+
122+
def test_arrange_cols_does_not_mutate_caller_properties():
123+
"""`_arrange_cols` must not mutate the caller's `properties` list.
124+
125+
Regression: previously the function did
126+
``properties.append("geometry")`` and
127+
``properties[properties.index("id")] = output_id`` in place, so the
128+
caller's list grew and was rewritten across successive calls.
129+
"""
130+
df = pd.DataFrame(
131+
{
132+
"id": ["a", "b"],
133+
"value": [1.0, 2.0],
134+
"geometry": ["p1", "p2"],
135+
}
136+
)
137+
properties = ["id", "value"]
138+
snapshot = list(properties)
139+
140+
_arrange_cols(df, properties, output_id="daily_id")
141+
_arrange_cols(df, properties, output_id="daily_id")
142+
143+
assert properties == snapshot, (
144+
f"caller's properties list was mutated: {properties!r} != {snapshot!r}"
145+
)
146+
147+
148+
def test_arrange_cols_swaps_id_in_returned_columns():
149+
"""`'id'` in `properties` should still resolve to the output_id column."""
150+
df = pd.DataFrame({"id": ["a"], "value": [1.0]})
151+
result = _arrange_cols(df, ["id", "value"], output_id="daily_id")
152+
assert "daily_id" in result.columns
153+
assert "id" not in result.columns
154+
155+
156+
def test_arrange_cols_keeps_geometry_when_present():
157+
"""Geometry must come along even if the caller didn't list it."""
158+
df = pd.DataFrame({"id": ["a"], "value": [1.0], "geometry": ["p1"]})
159+
result = _arrange_cols(df, ["value"], output_id="daily_id")
160+
assert "geometry" in result.columns
161+
162+
117163
# --- _format_api_dates -------------------------------------------------------
118164

119165

0 commit comments

Comments
 (0)