Merge remote-tracking branch 'upstream/main' into fix-enums-site-type

thodson-usgs · thodson-usgs · commit f224666d6e9f · 2026-05-05T10:41:19.000-05:00
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
@@ -713,18 +713,16 @@ def _arrange_cols(
     # Rename id column to output_id
     df = df.rename(columns={"id": output_id})
 
-    # If properties are provided, filter to only those columns
-    # plus geometry if skip_geometry is False
     if properties and not all(pd.isna(properties)):
-        # Make sure geometry stays in the dataframe if skip_geometry is False
-        if "geometry" in df.columns and "geometry" not in properties:
-            properties.append("geometry")
-        # id is technically a valid column from the service, but these
-        # functions make the name more specific. So, if someone requests
-        # 'id', give them the output_id column
-        if "id" in properties:
-            properties[properties.index("id")] = output_id
-        df = df.loc[:, [col for col in properties if col in df.columns]]
+        # Don't alias the caller's list — we mutate below.
+        local_properties = list(properties)
+        if "geometry" in df.columns and "geometry" not in local_properties:
+            local_properties.append("geometry")
+        # 'id' is a valid service column, but expose it under the
+        # service-specific output_id name instead.
+        if "id" in local_properties:
+            local_properties[local_properties.index("id")] = output_id
+        df = df.loc[:, [col for col in local_properties if col in df.columns]]
 
     # Move meaningless-to-user, extra id columns to the end
     # of the dataframe, if they exist
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
@@ -1,8 +1,10 @@
 from unittest import mock
 
+import pandas as pd
 import requests
 
 from dataretrieval.waterdata.utils import (
+    _arrange_cols,
     _format_api_dates,
     _get_args,
     _handle_stats_nesting,
@@ -114,6 +116,50 @@ def test_handle_stats_nesting_tolerates_missing_drop_columns():
     assert df["monitoring_location_id"].iloc[0] == "USGS-12345"
 
 
+# --- _arrange_cols ----------------------------------------------------------
+
+
+def test_arrange_cols_does_not_mutate_caller_properties():
+    """`_arrange_cols` must not mutate the caller's `properties` list.
+
+    Regression: previously the function did
+    ``properties.append("geometry")`` and
+    ``properties[properties.index("id")] = output_id`` in place, so the
+    caller's list grew and was rewritten across successive calls.
+    """
+    df = pd.DataFrame(
+        {
+            "id": ["a", "b"],
+            "value": [1.0, 2.0],
+            "geometry": ["p1", "p2"],
+        }
+    )
+    properties = ["id", "value"]
+    snapshot = list(properties)
+
+    _arrange_cols(df, properties, output_id="daily_id")
+    _arrange_cols(df, properties, output_id="daily_id")
+
+    assert properties == snapshot, (
+        f"caller's properties list was mutated: {properties!r} != {snapshot!r}"
+    )
+
+
+def test_arrange_cols_swaps_id_in_returned_columns():
+    """`'id'` in `properties` should still resolve to the output_id column."""
+    df = pd.DataFrame({"id": ["a"], "value": [1.0]})
+    result = _arrange_cols(df, ["id", "value"], output_id="daily_id")
+    assert "daily_id" in result.columns
+    assert "id" not in result.columns
+
+
+def test_arrange_cols_keeps_geometry_when_present():
+    """Geometry must come along even if the caller didn't list it."""
+    df = pd.DataFrame({"id": ["a"], "value": [1.0], "geometry": ["p1"]})
+    result = _arrange_cols(df, ["value"], output_id="daily_id")
+    assert "geometry" in result.columns
+
+
 # --- _format_api_dates -------------------------------------------------------