Skip to content

Commit c980d45

Browse files
committed
fix(weave): retry chunked file reads when replication is lagging
1 parent 439e6be commit c980d45

3 files changed

Lines changed: 24 additions & 7 deletions

File tree

tests/trace_server/test_clickhouse_trace_server_batched.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1244,6 +1244,8 @@ def test_file_content_read_retries_eventual_consistency():
12441244
assert mock_read_once.call_count == 2
12451245

12461246
# Real miss: after exhausting retries, the original NotFoundError still surfaces.
1247+
# `_file_content_read_with_retry` defaults to 5 attempts so chunked-file reads
1248+
# can absorb replication lag across multiple shards/replicas.
12471249
with patch.object(
12481250
server,
12491251
"_file_content_read_once",
@@ -1255,7 +1257,7 @@ def test_file_content_read_retries_eventual_consistency():
12551257
):
12561258
server.file_content_read(req)
12571259

1258-
assert mock_read_once.call_count == 2
1260+
assert mock_read_once.call_count == 5
12591261

12601262

12611263
@pytest.mark.disable_logging_error_check

weave/trace_server/clickhouse_trace_server_batched.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3446,9 +3446,7 @@ def annotator_queue_items_progress_update(
34463446
annotation_state=req.annotation_state,
34473447
pb=update_pb,
34483448
cluster_name=self.clickhouse_cluster_name,
3449-
table_name=self._mutation_table_name(
3450-
"annotator_queue_items_progress"
3451-
),
3449+
table_name=self._mutation_table_name("annotator_queue_items_progress"),
34523450
)
34533451
self._command(
34543452
update_query,
@@ -5557,9 +5555,15 @@ def _obj_read_with_retry(
55575555
name="clickhouse_trace_server_batched._file_content_read_with_retry"
55585556
)
55595557
def _file_content_read_with_retry(
5560-
self, req: tsi.FileContentReadReq, max_attempts: int = 2
5558+
self, req: tsi.FileContentReadReq, max_attempts: int = 5
55615559
) -> tsi.FileContentReadRes:
5562-
"""Read file content with retry for ClickHouse eventual consistency."""
5560+
"""Read file content with retry for ClickHouse eventual consistency.
5561+
5562+
Higher attempt count than `_obj_read_with_retry` because chunked-file
5563+
reads need all chunks visible across all shards/replicas: in
5564+
replicated/distributed mode each missing chunk on any replica trips
5565+
the retry, and lag can briefly exceed the 50ms-1s window.
5566+
"""
55635567
return self._read_with_retry(
55645568
lambda: self._file_content_read_once(req), max_attempts=max_attempts
55655569
)
@@ -5995,7 +5999,12 @@ def _file_content_read_once(
59955999
result_rows = list(query_result.result_rows)
59966000

59976001
if len(result_rows) < n_chunks:
5998-
raise ValueError("Missing chunks")
6002+
# Treat as not-found so the tenacity retry in `_read_with_retry`
6003+
# picks it up. Replicated/distributed reads can transiently see
6004+
# fewer rows than `n_chunks` while replication catches up.
6005+
raise NotFoundError(
6006+
f"File with digest {req.digest} has {len(result_rows)}/{n_chunks} chunks visible"
6007+
)
59996008
elif len(result_rows) > n_chunks:
60006009
# The general case where this can occur is when there are multiple
60016010
# writes of the same digest AND the effective `FILE_CHUNK_SIZE`

weave/trace_server/clickhouse_trace_server_migrator.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,12 @@ def _is_transient_ch_error(exc: BaseException) -> bool:
175175
# "versions for agent" queries have the same locality as the agent row.
176176
"agents": "project_id, agent_name",
177177
"agent_versions": "project_id, agent_name",
178+
# Files are chunked: `_file_content_read_once` selects all rows for a
179+
# (project_id, digest) and checks the count against `n_chunks`. With
180+
# rand() sharding chunks land on different shards, so any per-shard
181+
# replication lag manifests as "Missing chunks". Co-locate chunks of
182+
# one file on one shard so the read sees an atomic set.
183+
"files": "project_id, digest",
178184
}
179185

180186

0 commit comments

Comments
 (0)