Skip to content

Commit ac21704

Browse files
authored
Merge branch 'master' into kim/commitlog/index-log-noise
2 parents 4afd916 + c982195 commit ac21704

44 files changed

Lines changed: 2168 additions & 880 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

crates/client-api-messages/src/websocket.rs

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -622,13 +622,19 @@ pub struct TableUpdate<F: WebsocketFormat> {
622622
pub updates: SmallVec<[F::QueryUpdate; 1]>,
623623
}
624624

625+
/// Computed update for a single query, annotated with the number of matching rows.
626+
pub struct SingleQueryUpdate<F: WebsocketFormat> {
627+
pub update: F::QueryUpdate,
628+
pub num_rows: u64,
629+
}
630+
625631
impl<F: WebsocketFormat> TableUpdate<F> {
626-
pub fn new(table_id: TableId, table_name: Box<str>, (update, num_rows): (F::QueryUpdate, u64)) -> Self {
632+
pub fn new(table_id: TableId, table_name: Box<str>, update: SingleQueryUpdate<F>) -> Self {
627633
Self {
628634
table_id,
629635
table_name,
630-
num_rows,
631-
updates: [update].into(),
636+
num_rows: update.num_rows,
637+
updates: [update.update].into(),
632638
}
633639
}
634640

@@ -641,9 +647,9 @@ impl<F: WebsocketFormat> TableUpdate<F> {
641647
}
642648
}
643649

644-
pub fn push(&mut self, (update, num_rows): (F::QueryUpdate, u64)) {
645-
self.updates.push(update);
646-
self.num_rows += num_rows;
650+
pub fn push(&mut self, update: SingleQueryUpdate<F>) {
651+
self.updates.push(update.update);
652+
self.num_rows += update.num_rows;
647653
}
648654

649655
pub fn num_rows(&self) -> usize {

crates/client-api/src/routes/subscribe.rs

Lines changed: 50 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,24 @@ async fn ws_client_actor_inner(
228228

229229
let addr = client.module.info().database_identity;
230230

231+
// Grab handles on the total incoming and outgoing queue length metrics,
232+
// which we'll increment and decrement as we push into and pull out of those queues.
233+
// Note that `total_outgoing_queue_length` is incremented separately,
234+
// by `ClientConnectionSender::send` in core/src/client/client_connection.rs;
235+
// we're only responsible for decrementing that one.
236+
// Also note that much care must be taken to clean up these metrics when the connection closes!
237+
// Any path which exits this function must decrement each of these metrics
238+
// by the number of messages still waiting in this client's queue,
239+
// or else they will grow without bound as clients disconnect, and be useless.
240+
let incoming_queue_length_metric = WORKER_METRICS.total_incoming_queue_length.with_label_values(&addr);
241+
let outgoing_queue_length_metric = WORKER_METRICS.total_outgoing_queue_length.with_label_values(&addr);
242+
243+
let clean_up_metrics = |message_queue: &VecDeque<(DataMessage, Instant)>,
244+
sendrx: &mpsc::Receiver<SerializableMessage>| {
245+
incoming_queue_length_metric.sub(message_queue.len() as _);
246+
outgoing_queue_length_metric.sub(sendrx.len() as _);
247+
};
248+
231249
loop {
232250
rx_buf.clear();
233251
enum Item {
@@ -236,6 +254,7 @@ async fn ws_client_actor_inner(
236254
}
237255
if let MaybeDone::Gone = *current_message {
238256
if let Some((message, timer)) = message_queue.pop_front() {
257+
incoming_queue_length_metric.dec();
239258
let client = client.clone();
240259
let fut = async move { client.handle_message(message, timer).await };
241260
current_message.set(MaybeDone::Future(fut));
@@ -263,43 +282,47 @@ async fn ws_client_actor_inner(
263282
continue;
264283
}
265284
// the client sent us a close frame
266-
None => break,
285+
None => {
286+
clean_up_metrics(&message_queue, &sendrx);
287+
break
288+
},
267289
},
268290

269291
// If we have an outgoing message to send, send it off.
270292
// No incoming `message` to handle, so `continue`.
271293
Some(n) = sendrx.recv_many(&mut rx_buf, 32).map(|n| (n != 0).then_some(n)) => {
294+
outgoing_queue_length_metric.sub(n as _);
272295
if closed {
273296
// TODO: this isn't great. when we receive a close request from the peer,
274297
// tungstenite doesn't let us send any new messages on the socket,
275298
// even though the websocket RFC allows it. should we fork tungstenite?
276299
log::info!("dropping messages due to ws already being closed: {:?}", &rx_buf[..n]);
277300
} else {
278-
let send_all = async {
279-
for msg in rx_buf.drain(..n) {
280-
let workload = msg.workload();
281-
let num_rows = msg.num_rows();
282-
283-
let msg = datamsg_to_wsmsg(serialize(msg, client.config));
284-
285-
// These metrics should be updated together,
286-
// or not at all.
287-
if let (Some(workload), Some(num_rows)) = (workload, num_rows) {
288-
WORKER_METRICS
289-
.websocket_sent_num_rows
290-
.with_label_values(&addr, &workload)
291-
.observe(num_rows as f64);
292-
WORKER_METRICS
293-
.websocket_sent_msg_size
294-
.with_label_values(&addr, &workload)
295-
.observe(msg.len() as f64);
296-
}
297-
// feed() buffers the message, but does not necessarily send it
298-
ws.feed(msg).await?;
301+
let send_all = async {
302+
for msg in rx_buf.drain(..n) {
303+
let workload = msg.workload();
304+
let num_rows = msg.num_rows();
305+
306+
let msg = datamsg_to_wsmsg(serialize(msg, client.config));
307+
308+
// These metrics should be updated together,
309+
// or not at all.
310+
if let (Some(workload), Some(num_rows)) = (workload, num_rows) {
311+
WORKER_METRICS
312+
.websocket_sent_num_rows
313+
.with_label_values(&addr, &workload)
314+
.observe(num_rows as f64);
315+
WORKER_METRICS
316+
.websocket_sent_msg_size
317+
.with_label_values(&addr, &workload)
318+
.observe(msg.len() as f64);
299319
}
300-
// now we flush all the messages to the socket
301-
ws.flush().await
302-
};
320+
// feed() buffers the message, but does not necessarily send it
321+
ws.feed(msg).await?;
322+
}
323+
// now we flush all the messages to the socket
324+
ws.flush().await
325+
};
303326
// Flush the websocket while continuing to poll the `handle_queue`,
304327
// to avoid deadlocks or delays due to enqueued futures holding resources.
305328
let send_all = also_poll(send_all, make_progress(&mut current_message));
@@ -348,6 +371,7 @@ async fn ws_client_actor_inner(
348371
} else {
349372
// the client never responded to our ping; drop them without trying to send them a Close
350373
log::warn!("client {} timed out", client.id);
374+
clean_up_metrics(&message_queue, &sendrx);
351375
break;
352376
}
353377
}
@@ -362,6 +386,7 @@ async fn ws_client_actor_inner(
362386
match message {
363387
Item::Message(ClientMessage::Message(message)) => {
364388
let timer = Instant::now();
389+
incoming_queue_length_metric.inc();
365390
message_queue.push_back((message, timer))
366391
}
367392
Item::HandleResult(res) => {

crates/commitlog/src/index/indexfile.rs

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use std::{
55
mem,
66
};
77

8-
use log::debug;
8+
use log::{debug, trace};
99
use memmap2::MmapMut;
1010
use spacetimedb_paths::server::OffsetIndexFile;
1111

@@ -190,19 +190,32 @@ impl<Key: Into<u64> + From<u64>> IndexFileMut<Key> {
190190
self.inner.flush_async()
191191
}
192192

193-
/// Truncates the index file starting from the entry with a key greater than or equal to the given key.
193+
/// Truncates the index file starting from the entry with a key greater than
194+
/// or equal to the given key.
195+
///
196+
/// If successful, `key` will no longer be in the index.
194197
pub(crate) fn truncate(&mut self, key: Key) -> Result<(), IndexError> {
195198
let key = key.into();
196-
let (found_key, index) = self.find_index(Key::from(key))?;
199+
let (found_key, index) = self
200+
.find_index(Key::from(key))
201+
.map(|(found, index)| (found.into(), index))?;
197202

198-
// If returned key is smalled than asked key, truncate from next entry
199-
self.num_entries = if found_key.into() == key {
203+
// If returned key is smaller than asked key, truncate from next entry
204+
self.num_entries = if found_key == key {
200205
index as usize
201206
} else {
202207
index as usize + 1
203208
};
204209

205210
let start = self.num_entries * ENTRY_SIZE;
211+
trace!(
212+
"truncate key={} found={} index={} num-entries={} start={}",
213+
key,
214+
found_key,
215+
index,
216+
self.num_entries,
217+
start
218+
);
206219

207220
if start < self.inner.len() {
208221
self.inner[start..].fill(0);

crates/commitlog/src/segment.rs

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -697,6 +697,7 @@ mod tests {
697697
use super::*;
698698
use crate::{payload::ArrayDecoder, repo, Options};
699699
use itertools::Itertools;
700+
use pretty_assertions::assert_matches;
700701
use proptest::prelude::*;
701702
use spacetimedb_paths::server::CommitLogDir;
702703
use tempfile::tempdir;
@@ -926,16 +927,28 @@ mod tests {
926927

927928
// Truncating to any offset in the written range or larger
928929
// retains that offset - 1, or the max offset written.
929-
let truncate_to: TxOffset = rand::random_range(1..=32);
930-
let retained_key = truncate_to.saturating_sub(1).min(10);
931-
let retained_val = retained_key * 128;
932-
let retained = (retained_key, retained_val);
933-
934-
writer.ftruncate(truncate_to, rand::random()).unwrap();
935-
assert_eq!(writer.head.key_lookup(truncate_to).unwrap(), retained);
936-
// Make sure this also holds after reopen.
937-
drop(writer);
938-
let index = TxOffsetIndex::open_index_file(&index_path).unwrap();
939-
assert_eq!(index.key_lookup(truncate_to).unwrap(), retained);
930+
for truncate_to in (2..=10u64).rev() {
931+
let retained_key = truncate_to.saturating_sub(1).min(10);
932+
let retained_val = retained_key * 128;
933+
let retained = (retained_key, retained_val);
934+
935+
writer.ftruncate(truncate_to, rand::random()).unwrap();
936+
assert_matches!(
937+
writer.head.key_lookup(truncate_to),
938+
Ok(x) if x == retained,
939+
"truncate to {truncate_to} should retain {retained:?}"
940+
);
941+
// Make sure this also holds after reopen.
942+
let index = TxOffsetIndex::open_index_file(&index_path).unwrap();
943+
assert_matches!(
944+
index.key_lookup(truncate_to),
945+
Ok(x) if x == retained,
946+
"truncate to {truncate_to} should retain {retained:?} after reopen"
947+
);
948+
}
949+
950+
// Truncating to 1 leaves no entries in the index
951+
writer.ftruncate(1, rand::random()).unwrap();
952+
assert_matches!(writer.head.key_lookup(1), Err(IndexError::KeyNotFound));
940953
}
941954
}

crates/core/src/client/client_connection.rs

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use bytes::Bytes;
1616
use bytestring::ByteString;
1717
use derive_more::From;
1818
use futures::prelude::*;
19-
use prometheus::{Histogram, IntCounter};
19+
use prometheus::{Histogram, IntCounter, IntGauge};
2020
use spacetimedb_client_api_messages::websocket::{
2121
BsatnFormat, CallReducerFlags, Compression, FormatSwitch, JsonFormat, SubscribeMulti, SubscribeSingle, Unsubscribe,
2222
UnsubscribeMulti, WebsocketFormat,
@@ -91,6 +91,15 @@ pub struct ClientConnectionSender {
9191
pub struct ClientConnectionMetrics {
9292
pub websocket_request_msg_size: Histogram,
9393
pub websocket_requests: IntCounter,
94+
95+
/// The `total_outgoing_queue_length` metric labeled with this database's `Identity`,
96+
/// which we'll increment whenever sending a message.
97+
///
98+
/// This metric will be decremented, and cleaned up,
99+
/// by `ws_client_actor_inner` in client-api/src/routes/subscribe.rs.
100+
/// Care must be taken not to increment it after the client has disconnected
101+
/// and performed its clean-up.
102+
pub sendtx_queue_size: IntGauge,
94103
}
95104

96105
impl ClientConnectionMetrics {
@@ -102,10 +111,14 @@ impl ClientConnectionMetrics {
102111
let websocket_requests = WORKER_METRICS
103112
.websocket_requests
104113
.with_label_values(&database_identity, message_kind);
114+
let sendtx_queue_size = WORKER_METRICS
115+
.total_outgoing_queue_length
116+
.with_label_values(&database_identity);
105117

106118
Self {
107119
websocket_request_msg_size,
108120
websocket_requests,
121+
sendtx_queue_size,
109122
}
110123
}
111124
}
@@ -126,6 +139,7 @@ impl ClientConnectionSender {
126139
Ok(h) => h.spawn(async {}).abort_handle(),
127140
Err(_) => tokio::runtime::Runtime::new().unwrap().spawn(async {}).abort_handle(),
128141
};
142+
129143
let cancelled = AtomicBool::new(false);
130144
let sender = Self {
131145
id,
@@ -150,17 +164,27 @@ impl ClientConnectionSender {
150164
if self.cancelled.load(Relaxed) {
151165
return Err(ClientSendError::Cancelled);
152166
}
153-
self.sendtx.try_send(message).map_err(|e| match e {
154-
mpsc::error::TrySendError::Full(_) => {
167+
168+
match self.sendtx.try_send(message) {
169+
Err(mpsc::error::TrySendError::Full(_)) => {
155170
// we've hit CLIENT_CHANNEL_CAPACITY messages backed up in
156171
// the channel, so forcibly kick the client
157172
tracing::warn!(identity = %self.id.identity, connection_id = %self.id.connection_id, "client channel capacity exceeded");
158173
self.abort_handle.abort();
159174
self.cancelled.store(true, Relaxed);
160-
ClientSendError::Cancelled
175+
return Err(ClientSendError::Cancelled);
161176
}
162-
mpsc::error::TrySendError::Closed(_) => ClientSendError::Disconnected,
163-
})?;
177+
Err(mpsc::error::TrySendError::Closed(_)) => return Err(ClientSendError::Disconnected),
178+
Ok(()) => {
179+
// If we successfully pushed a message into the queue, increment the queue size metric.
180+
// Don't do this before pushing because, if the client has disconnected,
181+
// it will already have performed its clean-up,
182+
// and so would never perform the corresponding `dec` to this `inc`.
183+
if let Some(metrics) = &self.metrics {
184+
metrics.sendtx_queue_size.inc();
185+
}
186+
}
187+
}
164188

165189
Ok(())
166190
}

0 commit comments

Comments
 (0)