Skip to content

Commit b55d9f5

Browse files
Remove legacy datasets from python clients (#192)
* Remove geobuf * Remove legacy datasets
1 parent 62bdd94 commit b55d9f5

43 files changed

Lines changed: 1160 additions & 2113 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ repos:
66
- id: end-of-file-fixer
77
- repo: https://github.com/charliermarsh/ruff-pre-commit
88
# keep the version here in sync with the version in uv.lock
9-
rev: "v0.11.5"
9+
rev: "v0.11.12"
1010
hooks:
11-
- id: ruff
11+
- id: ruff-check
1212
args: [--fix, --exit-non-zero-on-fix]
1313
- id: ruff-format
1414
- repo: https://github.com/RobertCraigie/pyright-python
1515
# keep the version here in sync with the version in uv.lock
16-
rev: v1.1.399
16+
rev: v1.1.400
1717
hooks:
1818
- id: pyright

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ dev = [
2323
# DeprecationWarning: Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0)
2424
"pyarrow>=17.0.0",
2525
# some dev tooling
26-
"ruff>=0.6.4",
27-
"pyright>=1.1.379",
26+
"ruff>=0.11.10",
27+
# pyright 1.1.401 reports many false positives, let's wait until that is fixed before upgrading
28+
"pyright>=1.1.379,<1.1.401",
2829
"pre-commit>=3.8.0",
2930
"types-protobuf>=5.27.0.20240907",
3031
"junitparser>=3.2.0",

tilebox-datasets/tests/data/datapoint.py

Lines changed: 3 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,8 @@
3333
from tests.example_dataset.example_dataset_pb2 import ExampleDatapoint
3434
from tilebox.datasets.data.datapoint import (
3535
AnyMessage,
36-
Datapoint,
3736
DatapointInterval,
3837
DatapointIntervalLike,
39-
DatapointPage,
4038
IngestResponse,
4139
QueryResultPage,
4240
RepeatedAny,
@@ -203,37 +201,6 @@ def datapoint_metadata_messages(draw: DrawFn) -> core_pb2.DatapointMetadata:
203201
return core_pb2.DatapointMetadata(event_time=event_time, ingestion_time=ingestion_time, id=data_point_id)
204202

205203

206-
@composite
207-
def datapoints(draw: DrawFn, generated_fields: bool = False, missing_fields: bool = False) -> Datapoint:
208-
"""A hypothesis strategy for generating random datapoints"""
209-
meta = draw(datapoint_metadata_messages())
210-
data = draw(anys(generated_fields, missing_fields))
211-
return Datapoint(meta, data)
212-
213-
214-
@composite
215-
def datapoint_pages(
216-
draw: DrawFn, empty_next_page: bool | None = None, generated_fields: bool = True, missing_fields: bool = False
217-
) -> DatapointPage:
218-
"""
219-
A hypothesis strategy for generating random datapoint pages
220-
221-
Args:
222-
empty_next_page: Whether the next page should be empty or not. If None, it will randomly be either an
223-
empty or non-empty next page.
224-
generated_fields: Whether to generate datapoints with all generated fields (id and ingestion_time) set as well.
225-
If True, datapoints will have all meta fields set. If False, those fields will be set to None, similar
226-
to how a datapoint for ingestion would look like.
227-
missing_fields: Whether to generate datapoints with missing custom fields. If True, datapoints
228-
will randomly have some fields missing. If False, all fields will be set with data.
229-
"""
230-
meta = draw(lists(datapoint_metadata_messages(), min_size=1, max_size=5))
231-
data = draw(repeated_anys(generated_fields, missing_fields, fixed_length=len(meta)))
232-
next_page = draw(paginations(empty_next_page))
233-
byte_size = sum(len(m) for m in data.value)
234-
return DatapointPage(meta, data, next_page, byte_size)
235-
236-
237204
@composite
238205
def query_result_pages(
239206
draw: DrawFn, empty_next_page: bool | None = None, generated_fields: bool = True, missing_fields: bool = False
@@ -257,11 +224,11 @@ def query_result_pages(
257224

258225

259226
@composite
260-
def paginated_datapoint_for_interval_responses(draw: DrawFn) -> list[DatapointPage]:
227+
def paginated_query_results(draw: DrawFn) -> list[QueryResultPage]:
261228
"""A hypothesis strategy for generating random datapoint pages for a time interval"""
262229
# let's generate a couple of pages, that each have a next page set, indicating that there are more pages
263-
first_pages = draw(lists(datapoint_pages(empty_next_page=False), min_size=0, max_size=5))
264-
last_page = draw(datapoint_pages(empty_next_page=True))
230+
first_pages = draw(lists(query_result_pages(empty_next_page=False), min_size=0, max_size=5))
231+
last_page = draw(query_result_pages(empty_next_page=True))
265232
return [*first_pages, last_page]
266233

267234

tilebox-datasets/tests/data/test_datapoint.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,14 @@
44
anys,
55
datapoint_intervals,
66
datapoint_intervals_like,
7-
datapoint_pages,
8-
datapoints,
97
ingest_datapoints_responses,
108
query_result_pages,
119
repeated_anys,
1210
)
1311
from tilebox.datasets.data.datapoint import (
1412
AnyMessage,
15-
Datapoint,
1613
DatapointInterval,
1714
DatapointIntervalLike,
18-
DatapointPage,
1915
IngestResponse,
2016
QueryResultPage,
2117
RepeatedAny,
@@ -50,16 +46,6 @@ def test_repeated_anys_to_message_and_back(repeated_any: RepeatedAny) -> None:
5046
assert RepeatedAny.from_message(repeated_any.to_message()) == repeated_any
5147

5248

53-
@given(datapoints())
54-
def test_datapoints_to_message_and_back(datapoint: Datapoint) -> None:
55-
assert Datapoint.from_message(datapoint.to_message()) == datapoint
56-
57-
58-
@given(datapoint_pages())
59-
def test_datapoint_pages_to_message_and_back(page: DatapointPage) -> None:
60-
assert DatapointPage.from_message(page.to_message()) == page
61-
62-
6349
@given(query_result_pages())
6450
def test_query_result_pages_to_message_and_back(page: QueryResultPage) -> None:
6551
assert QueryResultPage.from_message(page.to_message()) == page

tilebox-datasets/tests/data/well_known_types.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -69,18 +69,6 @@ def latlonalt_messages(draw: DrawFn) -> well_known_types_pb2.LatLonAlt:
6969
)
7070

7171

72-
@composite
73-
def geobuf_messages(draw: DrawFn) -> well_known_types_pb2.GeobufData:
74-
encoded_geobufs = [
75-
# POLYGON ((-24.7917 80.1388, -20.6726 80.9065, -42.9612 82.8902, -45.6082 81.9369, -24.7917 80.1388))
76-
b"\x10\x02\x18\x042\x1e\x08\x05\x12\x01\x04\x1a\x17\xd9\xa1\x1e\xd8\xe9a\xce\x83\x05\xfaw\xcb\x9a\x1b\xfa\xb5\x02\xcb\x9d\x03\xf9\x94\x01",
77-
# POLYGON ((37.1165 82.5691, 60.97982545 85.05115, 3.218114686 85.05115, 3.0578 83.8312, 37.1165 82.5691))
78-
b"\x10\x02\x18\t2/\x08\x05\x12\x01\x04\x1a(\xc0\xe0\x86\xc5\x94\x02\xc0\xeb\x83\x98\xe7\x04\x94\xcc\xeb\xe5\xb1\x01\xa0\xcf\x88\xbf\x12\xd7\x8a\xee\xad\xae\x03\x00\xfb\xd4\xf1\x98\x01\xdf\xd6\xb7\x8b\t",
79-
]
80-
data = draw(sampled_from(encoded_geobufs))
81-
return well_known_types_pb2.GeobufData.FromString(data)
82-
83-
8472
@composite
8573
def shapely_polygons(draw: DrawFn) -> Polygon:
8674
xmin = draw(floats(min_value=-180, max_value=160))

tilebox-datasets/tests/protobuf_conversion/test_geobuf.py

Lines changed: 0 additions & 76 deletions
This file was deleted.

tilebox-datasets/tests/protobuf_conversion/test_protobuf_xarray.py

Lines changed: 2 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,10 @@
88
from shapely import MultiPolygon, Polygon, from_wkb
99
from xarray.testing import assert_equal
1010

11-
from tests.data.datapoint import datapoint_pages, datapoints, example_datapoints
11+
from tests.data.datapoint import example_datapoints
1212
from tests.example_dataset.example_dataset_pb2 import ExampleDatapoint
13-
from tilebox.datasets.data.datapoint import Datapoint, DatapointPage
1413
from tilebox.datasets.data.time_interval import timestamp_to_datetime, us_to_datetime
15-
from tilebox.datasets.protobuf_conversion.protobuf_xarray import (
16-
MessageToXarrayConverter,
17-
TimeseriesToXarrayConverter,
18-
)
14+
from tilebox.datasets.protobuf_conversion.protobuf_xarray import MessageToXarrayConverter
1915

2016

2117
@given(example_datapoints(generated_fields=True, missing_fields=False))
@@ -102,20 +98,6 @@ def test_convert_datapoint(datapoint: ExampleDatapoint) -> None: # noqa: PLR091
10298
assert isinstance(dataset.some_repeated_geometry[i].item(), Polygon | MultiPolygon)
10399

104100

105-
@given(datapoints(generated_fields=False, missing_fields=True))
106-
def test_convert_timeseries_datapoint(datapoint: Datapoint) -> None:
107-
converter = TimeseriesToXarrayConverter()
108-
converter.convert(datapoint)
109-
dataset = converter.finalize()
110-
111-
assert dataset.sizes["time"] == 1
112-
assert dataset.id.item() == datapoint.meta.id
113-
event_time = dataset.time.item() // 1000
114-
assert us_to_datetime(event_time) == timestamp_to_datetime(datapoint.meta.event_time)
115-
ingestion_time = dataset.ingestion_time.item() // 1000
116-
assert us_to_datetime(ingestion_time) == timestamp_to_datetime(datapoint.meta.ingestion_time)
117-
118-
119101
@given(lists(example_datapoints(generated_fields=True, missing_fields=True), min_size=5, max_size=30))
120102
def test_convert_datapoints(datapoints: list[ExampleDatapoint]) -> None: # noqa: C901, PLR0912
121103
converter = MessageToXarrayConverter()
@@ -187,23 +169,6 @@ def test_convert_datapoints(datapoints: list[ExampleDatapoint]) -> None: # noqa
187169
assert bytes_ is None or isinstance(bytes_, bytes)
188170

189171

190-
@given(datapoint_pages(empty_next_page=True, missing_fields=True))
191-
def test_convert_timeseries_datapoints(page: DatapointPage) -> None:
192-
converter = TimeseriesToXarrayConverter()
193-
converter.convert_all(page)
194-
dataset = converter.finalize()
195-
196-
assert dataset.sizes["time"] == len(page.meta)
197-
for i in range(len(page.meta)):
198-
datapoint = dataset.isel(time=i)
199-
meta = page.meta[i]
200-
assert datapoint.id.item() == meta.id
201-
event_time = datapoint.time.item() // 1000
202-
assert us_to_datetime(event_time) == timestamp_to_datetime(meta.event_time)
203-
ingestion_time = datapoint.ingestion_time.item() // 1000
204-
assert us_to_datetime(ingestion_time) == timestamp_to_datetime(meta.ingestion_time)
205-
206-
207172
@given(lists(example_datapoints(missing_fields=True), min_size=1, max_size=10))
208173
@settings(max_examples=10)
209174
def test_convert_datapoints_all_at_once_or_one_by_one_same_result(

tilebox-datasets/tests/test_client.py

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from _tilebox.grpc.error import NotFoundError
1111
from _tilebox.grpc.replay import open_recording_channel, open_replay_channel
1212
from tilebox.datasets import Client, DatasetClient
13-
from tilebox.datasets.data.datapoint import DatapointPage
13+
from tilebox.datasets.data.datapoint import QueryResultPage
1414
from tilebox.datasets.data.time_interval import us_to_datetime
1515

1616

@@ -35,7 +35,7 @@ def record_client(recording_file: str) -> Client:
3535
# this will open a channel to api.tilebox.com, which will send real requests to the server, and record them
3636
# for later offline replay
3737
recording_channel = open_recording_channel(
38-
"http://localhost:8083", os.environ["TILEBOX_OPENDATA_ONLY_API_KEY"], recording
38+
"https://api.tilebox.com", os.environ["TILEBOX_OPENDATA_ONLY_API_KEY"], recording
3939
)
4040

4141
with patch("tilebox.datasets.sync.client.open_channel") as open_channel_mock:
@@ -101,12 +101,11 @@ def test_find_datapoint() -> None:
101101
collection = s2_dataset.collection("S2A_S2MSI1C")
102102

103103
for skip_data in (False, True):
104-
datapoint = collection.find("0181f4ef-2040-1613-3eed-1c970dde6d2b", skip_data=skip_data)
104+
datapoint = collection.find("0181f4ef-2040-13e7-ba1f-d5575e2a32a4", skip_data=skip_data)
105105
assert isinstance(datapoint, xr.Dataset)
106106

107-
assert datapoint.id.item() == "0181f4ef-2040-1613-3eed-1c970dde6d2b"
107+
assert datapoint.id.item() == "0181f4ef-2040-13e7-ba1f-d5575e2a32a4"
108108
assert _dt(datapoint.time.item()) == datetime(2022, 7, 13, 0, 22, 1, 24000, tzinfo=timezone.utc)
109-
assert _dt(datapoint.ingestion_time.item()) == datetime(2024, 7, 17, 4, 14, 29, 349263, tzinfo=timezone.utc)
110109

111110
if not skip_data:
112111
assert datapoint.granule_name.item() == "S2A_MSIL1C_20220713T002201_N0400_R102_T08XNS_20220713T015332.SAFE"
@@ -128,47 +127,47 @@ def test_datapoint_not_found() -> None:
128127
collection = s2_dataset.collection("S2A_S2MSI1C")
129128

130129
with pytest.raises(NotFoundError, match="No such datapoint.*"):
131-
collection.find("0181f4dc-53c0-4912-acda-e35a368994fc") # is in another collection
130+
collection.find("0181f4ef-2040-101a-1423-d818e4d1895e") # is in another collection
132131

133132

134-
def test_load_data() -> None:
135-
client = replay_client("load_s2_data_interval.rpcs.bin")
133+
def test_query() -> None:
134+
client = replay_client("query_sentinel2.rpcs.bin")
136135

137136
s2_dataset = client.dataset("open_data.copernicus.sentinel2_msi")
138137
collection = s2_dataset.collection("S2A_S2MSI1C")
139138

140139
for skip_data in (False, True):
141-
data = collection.load(("2022-07-13", "2022-07-13T02:00"), skip_data=skip_data)
140+
data = collection.query(temporal_extent=("2022-07-13", "2022-07-13T02:00"), skip_data=skip_data)
142141
assert isinstance(data, xr.Dataset)
143142

144143
assert data.sizes["time"] == 756
145-
assert data.id[0] == "0181f4ef-2040-0566-def5-50246aabcabc"
146-
assert data.id[-1] == "0181f506-51c0-ffb7-20cb-a2ab4f5057cf"
144+
assert data.id[0] == "0181f4ef-2040-1004-5540-5bca22067ac8"
145+
assert data.id[-1] == "0181f506-51c0-a351-f295-6502e81f8ecf"
147146

148147
if not skip_data:
149-
assert data.granule_name[0] == "S2A_MSIL1C_20220713T002201_N0400_R102_T09XWK_20220713T015332.SAFE"
150-
assert data.granule_name[-1] == "S2A_MSIL1C_20220713T004721_N0400_R102_T53HNA_20220713T021615.SAFE"
148+
assert data.granule_name[0] == "S2A_MSIL1C_20220713T002201_N0400_R102_T09XWL_20220713T015332.SAFE"
149+
assert data.granule_name[-1] == "S2A_MSIL1C_20220713T004721_N0400_R102_T53HPV_20220713T021615.SAFE"
151150
else:
152151
assert "granule_name" not in data
153152

154153

155-
def test_load_data_pagination() -> None:
156-
client = replay_client("load_s2_data_interval_paging.rpcs.bin")
154+
def test_query_pagination() -> None:
155+
client = replay_client("query_sentinel2_paging.rpcs.bin")
157156

158157
s2_dataset = client.dataset("open_data.copernicus.sentinel2_msi")
159158
collection = s2_dataset.collection("S2A_S2MSI1C")
160159

161-
pages = list(collection._iter_pages_legacy(("2022-07-13", "2022-07-13T02:00"), page_size=10))
160+
pages = list(collection._iter_pages(("2022-07-13", "2022-07-13T02:00"), page_size=10))
162161

163162
assert len(pages) == 76 # we have 756 datapoints, so 76 pages, and the last page has only 6 datapoints
164163

165164
for i, page in enumerate(pages):
166-
assert isinstance(page, DatapointPage)
167-
assert page.meta[0].id >= "0181f4ef-2040-0566-def5-50246aabcabc"
168-
assert page.meta[-1].id <= "0181f506-51c0-ffb7-20cb-a2ab4f5057cf"
165+
assert isinstance(page, QueryResultPage)
166+
assert str(page.min_id) >= "0181f4ef-2040-1004-5540-5bca22067ac8"
167+
assert str(page.max_id) <= "0181f506-51c0-a351-f295-6502e81f8ecf"
169168
is_last_page = i == len(pages) - 1
170169
expected_len = 6 if is_last_page else 10
171-
assert len(page.meta) == expected_len
170+
assert page.n_datapoints == expected_len
172171

173172

174173
def _dt(timestamp_nano: int) -> datetime:

0 commit comments

Comments
 (0)