Skip to content

Commit 8b9d7e9

Browse files
thodson-usgs and claude committed
Split _split_top_level_or into generator + consumer
Pull the state machine out into ``_iter_or_boundaries``, a generator that yields ``(start, end)`` spans of each top-level ``OR`` separator, and reduce ``_split_top_level_or`` to a short slice loop over those spans.

Behaviour is unchanged (all 26 existing tests pass); the win is readability — each function now has one job instead of three, and the producer/consumer split mirrors how ``re.finditer`` / ``tokenize`` are structured in the stdlib.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 5709cd5 commit 8b9d7e9

1 file changed

Lines changed: 23 additions & 11 deletions

File tree

dataretrieval/waterdata/utils.py

Lines changed: 23 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import logging
55
import os
66
import re
7+
from collections.abc import Iterator
78
from datetime import datetime
89
from typing import Any, get_args
910

@@ -232,18 +233,16 @@ def _format_api_dates(
232233
_CQL_FILTER_CHUNK_LEN = 5000
233234

234235

235-
def _split_top_level_or(expr: str) -> list[str]:
236-
"""Split a CQL expression at each top-level ``OR`` separator.
236+
def _iter_or_boundaries(expr: str) -> Iterator[tuple[int, int]]:
237+
"""Yield ``(start, end)`` spans of each top-level ``OR`` separator.
237238
238-
Respects parentheses and single/double-quoted string literals so that
239-
``OR`` tokens inside ``(A OR B)`` or ``'word OR word'`` are left alone.
240-
Matching is case-insensitive. Whitespace around each emitted part is
241-
stripped; empty parts are dropped.
239+
Tracks single/double-quoted string literals and parenthesized
240+
sub-expressions so that ``OR`` tokens inside them are skipped.
241+
Matching is case-insensitive and the yielded span covers the
242+
surrounding whitespace on both sides.
242243
"""
243-
parts = []
244244
depth = 0
245245
in_quote = None
246-
last = 0
247246
i = 0
248247
n = len(expr)
249248
while i < n:
@@ -265,22 +264,35 @@ def _split_top_level_or(expr: str) -> list[str]:
265264
depth -= 1
266265
i += 1
267266
continue
268-
# Match whitespace + OR + whitespace at depth 0, case-insensitive.
269267
if depth == 0 and ch.isspace():
270268
j = i + 1
271269
while j < n and expr[j].isspace():
272270
j += 1
273271
if j + 2 <= n and expr[j : j + 2].lower() == "or":
274272
k = j + 2
275273
if k < n and expr[k].isspace():
276-
parts.append(expr[last:i].strip())
277274
m = k + 1
278275
while m < n and expr[m].isspace():
279276
m += 1
280-
last = m
277+
yield i, m
281278
i = m
282279
continue
283280
i += 1
281+
282+
283+
def _split_top_level_or(expr: str) -> list[str]:
284+
"""Split a CQL expression at each top-level ``OR`` separator.
285+
286+
Respects parentheses and single/double-quoted string literals so that
287+
``OR`` tokens inside ``(A OR B)`` or ``'word OR word'`` are left alone.
288+
Matching is case-insensitive. Whitespace around each emitted part is
289+
stripped; empty parts are dropped.
290+
"""
291+
parts = []
292+
last = 0
293+
for start, end in _iter_or_boundaries(expr):
294+
parts.append(expr[last:start].strip())
295+
last = end
284296
parts.append(expr[last:].strip())
285297
return [p for p in parts if p]
286298

0 commit comments

Comments
 (0)