@@ -232,7 +232,27 @@ def _lonlat(geom):
232232
233233
234234def _point_coords (df , site ):
235- """lon/lat dicts keyed by site from point geometry, or None."""
235+ """lon/lat dicts keyed by site, or None.
236+
237+ Reads either a ``geometry`` column (the time-series getters' OGC response) or
238+ explicit ``longitude`` / ``latitude`` columns (the Samples profile, mapped via
239+ :data:`_SAMPLES_RENAME`) -- so every service surfaces station coordinates.
240+ """
241+ if {"longitude" , "latitude" }.issubset (df .columns ):
242+ geo = df .dropna (subset = ["longitude" , "latitude" ]).drop_duplicates (site )
243+ if geo .empty :
244+ return None
245+ lon , lat = {}, {}
246+ for site_id , x , y in zip (
247+ geo [site ].to_numpy (),
248+ geo ["longitude" ].to_numpy (),
249+ geo ["latitude" ].to_numpy (),
250+ ):
251+ try :
252+ lon [site_id ], lat [site_id ] = float (x ), float (y )
253+ except (TypeError , ValueError ):
254+ continue
255+ return (lon , lat ) if lon else None
236256 if "geometry" not in df .columns :
237257 return None
238258 geo = df .dropna (subset = ["geometry" ]).drop_duplicates (site )
@@ -408,8 +428,9 @@ def lookup(self, site_ids):
408428 """
409429 sites = sorted ({str (s ) for s in site_ids if _pd .notna (s )})
410430 # Racy read of the keys is fine: a concurrent miss just re-fetches (the
411- # fetch is idempotent); only the writes in _ingest take the lock.
431+ # fetch is idempotent); only the writes in _store take the lock.
412432 todo = [s for s in sites if s not in self ._entries ]
433+ fresh : dict [str , dict ] = {}
413434 if todo :
414435 try :
415436 meta , _ = self ._getter (monitoring_location_id = todo )
@@ -420,12 +441,17 @@ def lookup(self, site_ids):
420441 stacklevel = 2 ,
421442 )
422443 else :
423- self ._ingest (meta , todo )
444+ fresh = self ._parse (meta , todo )
445+ self ._store (fresh )
424446 param_meta : dict [str , dict ] = {}
425447 site_meta : dict [str , dict ] = {}
426448 with self ._lock :
427449 for s in sites :
428- entry = self ._entries .get (s , {})
450+ # Prefer this call's freshly-parsed entry over the cache: the
451+ # bounded cache may have already evicted just-fetched sites when a
452+ # single pull's ``todo`` exceeds maxsize, but the current call
453+ # must still see every site it fetched.
454+ entry = fresh .get (s ) or self ._entries .get (s , {})
429455 param_meta .update (entry .get ("params" , {}))
430456 if entry .get ("site" ):
431457 site_meta [s ] = entry ["site" ]
@@ -439,12 +465,8 @@ def clear(self):
439465 def __len__ (self ):
440466 return len (self ._entries )
441467
442- def _ingest (self , meta , todo ):
443- """Parse ``meta`` into per-site entries, then merge + evict under lock.
444-
445- The parsing runs lock-free on a local dict; only the (cheap) merge into
446- the shared cache and the FIFO eviction past ``maxsize`` hold the lock.
447- """
468+ def _parse (self , meta , todo ):
469+ """Parse ``meta`` into per-site ``{params, site}`` entries (lock-free)."""
448470 fresh = {s : {"params" : {}, "site" : {}} for s in todo }
449471 if not meta .empty :
450472 name_cols = [c for c in _NAME_DESCRIPTORS if c in meta .columns ]
@@ -470,8 +492,20 @@ def _ingest(self, meta, todo):
470492 }
471493 if desc :
472494 fresh [site ]["site" ] = desc
495+ return fresh
496+
def _store(self, fresh):
    """Merge non-empty entries into the bounded cache (FIFO eviction).

    Parameters
    ----------
    fresh : dict
        Per-site entries shaped ``{"params": {...}, "site": {...}}``.

    Sites whose ``params`` and ``site`` are both empty are *not* cached, so a
    later call retries them instead of being stuck with a sticky empty
    result. Only the merge and the eviction run under ``self._lock``.
    """
    keep = {s: e for s, e in fresh.items() if e["params"] or e["site"]}
    if not keep:
        return
    with self._lock:
        self._entries.update(keep)
        # Dicts preserve insertion order, so the first key is the oldest.
        # The emptiness guard stops a negative ``maxsize`` from draining the
        # cache and then raising StopIteration from ``next(iter({}))`` while
        # the lock is held.
        while self._entries and len(self._entries) > self._maxsize:
            self._entries.pop(next(iter(self._entries)))
477511
@@ -524,14 +558,24 @@ def select_series(ds, **keys):
524558 "so select by name instead, e.g. "
525559 "ds[variable].sel(monitoring_location_id=...)."
526560 )
527- inst_coords = [c for c in ds .coords if ds [c ].dims == ("timeseries" ,)]
561+ # Selectable keys are the series *identity* coordinates only -- exclude the
562+ # per-series descriptors (lon/lat are a float-equality footgun; unit/HUC/state
563+ # are not series identifiers).
564+ descriptors = {"longitude" , "latitude" , "unit_of_measure" , * _SITE_DESCRIPTORS }
565+ inst_coords = [
566+ c for c in ds .coords if ds [c ].dims == ("timeseries" ,) and c not in descriptors
567+ ]
528568 mask = _np .ones (ds .sizes ["timeseries" ], dtype = bool )
529569 for key , value in keys .items ():
530570 if key not in inst_coords :
531571 raise KeyError (
532- f"{ key !r} is not a per-series coordinate; choose from { inst_coords } ."
572+ f"{ key !r} is not a per-series identity coordinate; choose from "
573+ f"{ inst_coords } ."
533574 )
534- mask &= ds [key ].to_numpy () == value
575+ arr = ds [key ].to_numpy ()
576+ # NaN never equals anything, so match a missing instance key (e.g. a
577+ # characteristic with no sample fraction) by null-ness instead.
578+ mask &= _pd .isna (arr ) if _is_missing (value ) else (arr == value )
535579 matches = _np .flatnonzero (mask )
536580 if matches .size == 0 :
537581 raise KeyError (f"no time series matches { keys } ." )
@@ -563,6 +607,10 @@ def select_series(ds, **keys):
563607 "Result_SampleFraction" : "sample_fraction" ,
564608 "Result_ResultDetectionCondition" : "detection_condition" ,
565609 "Result_MeasureStatusIdentifier" : "status" ,
610+ # Samples carry position as explicit columns (no OGC ``geometry``); map them
611+ # to the canonical names so _point_coords surfaces station lon/lat.
612+ "Location_Longitude" : "longitude" ,
613+ "Location_Latitude" : "latitude" ,
566614}
567615_CANONICAL_COORD_ATTRS = {
568616 "parameter_code" : {"long_name" : "USGS parameter code" },
@@ -786,7 +834,9 @@ def _assemble(self, work, inst_cols, ancillary, has_unit):
786834 )
787835 data_vars = {
788836 "value" : ("obs" , work ["value" ].to_numpy ()),
789- "row_size" : ("timeseries" , row_size .to_numpy ().astype ("int32" )),
837+ # int64 (not int32): a single long, high-frequency series can exceed
838+ # 2^31 observations, and the select_series cumsum must not overflow.
839+ "row_size" : ("timeseries" , row_size .to_numpy ().astype ("int64" )),
790840 }
791841 for c in ancillary :
792842 data_vars [c ] = ("obs" , work [c ].to_numpy ())
@@ -899,16 +949,25 @@ def _build_series(self, work, group_cols, ancillary, has_unit):
899949
900950 def _variable_datasets (self , work , group_cols , ancillary , has_unit ):
901951 """One pivoted ``(site, time)`` Dataset per (parameter, statistic)."""
902- datasets , used = [], set ()
952+ # First pass: gather each group's identity and base name, so naming can
953+ # see the whole set (a bare name is only used when it is unambiguous).
954+ specs = []
903955 for _ , group in work .groupby (group_cols , dropna = False ):
904956 pcode = _first_present (group , "parameter_code" )
905957 stat = _first_present (group , "statistic_id" )
906- group_units = group ["unit_of_measure" ].dropna ().unique () if has_unit else ()
907- unit = group_units [0 ] if len (group_units ) else None
908958 desc = self .series_meta .get (str (pcode ), {}) if pcode is not None else {}
909-
910- name = self ._variable_name (desc , pcode , stat , used )
911- used .add (name )
959+ base = _slug (_none_if_nan (desc .get ("parameter_name" )) or pcode or "value" )
960+ specs .append ((group , pcode , stat , desc , base ))
961+ names = self ._disambiguate ([s [4 ] for s in specs ], [(s [1 ], s [2 ]) for s in specs ])
962+
963+ datasets = []
964+ for (group , pcode , stat , desc , _base ), name in zip (specs , names ):
965+ # Sort the units so the chosen label is deterministic across pulls
966+ # (values are not converted either way; see the multi-unit warning).
967+ group_units = (
968+ sorted (group ["unit_of_measure" ].dropna ().unique ()) if has_unit else []
969+ )
970+ unit = group_units [0 ] if group_units else None
912971
913972 if len (group_units ) > 1 :
914973 # One variable can carry only one ``units`` attr; surface the
@@ -951,15 +1010,34 @@ def _variable_datasets(self, work, group_cols, ancillary, has_unit):
9511010 return datasets
9521011
9531012 @staticmethod
954- def _variable_name (desc , pcode , stat , used ):
955- """A unique slug for a variable; disambiguate same-parameter series."""
956- name = _slug (_none_if_nan (desc .get ("parameter_name" )) or pcode or "value" )
957- if name in used : # same parameter, different statistic -> distinct var
958- op = CF_CELL_METHODS .get (str (stat )) or (str (stat ) if stat else None )
959- name = f"{ name } _{ _slug (op )} " if op else name
960- while name in used :
961- name += "_x"
962- return name
def _disambiguate(bases, keys):
    """Map per-group base slugs to unique, deterministic variable names.

    Parameters
    ----------
    bases : sequence of str
        One base slug per group.
    keys : sequence of tuple
        Parallel to ``bases``; ``keys[i]`` is the group's
        ``(parameter_code, statistic_id)``.

    Returns
    -------
    list of str
        One unique name per group, in input order.

    A base used by exactly one group stays bare (e.g. ``discharge``). A base
    shared by several groups is suffixed for *all* of them: first by the
    statistic's ``CF_CELL_METHODS`` operator (or the raw statistic id), and,
    where that still does not separate the groups, by the parameter code.
    The statistic-vs-parameter decision is made from the whole set of
    preferred names rather than from the names assigned so far, so groups
    that collide on the statistic are all renamed the same way regardless of
    their order. A group with no parameter code keeps its statistic suffix
    instead of regressing to the bare base. Any remaining clash is broken
    with ``_x`` suffixes.
    """
    from collections import Counter

    base_counts = Counter(bases)

    # Pass 1: preferred name per group -- bare when the base is unique,
    # otherwise the base plus a statistic label when one is available.
    preferred = []
    for base, (_pcode, stat) in zip(bases, keys):
        name = base
        if base_counts[base] > 1 and stat is not None:
            label = CF_CELL_METHODS.get(str(stat)) or str(stat)
            if label:
                name = f"{base}_{_slug(label)}"
        preferred.append(name)
    preferred_counts = Counter(preferred)

    # Pass 2: where the statistic did not separate a shared base (name is
    # still bare, or several groups prefer the same name), fall back to the
    # parameter code; then guarantee uniqueness.
    names, used = [], set()
    for base, (pcode, _stat), name in zip(bases, keys, preferred):
        unresolved = name == base or preferred_counts[name] > 1
        if base_counts[base] > 1 and unresolved and pcode is not None:
            name = f"{base}_{_slug(pcode)}"
        while name in used:
            name += "_x"
        used.add(name)
        names.append(name)
    return names
9631041
9641042
9651043class _StatsBuilder (_DatasetBuilder ):
0 commit comments