Skip to content

Commit f7a9801

Browse files
authored
Option 2: Retroactively modify migration 229 to shrink sessions table (#9868)
Alternate version of #9867 that does not expire all existing console sessions. See more detailed description in #9866 and in the comment in the migration file.
1 parent eb54046 commit f7a9801

2 files changed

Lines changed: 71 additions & 4 deletions

File tree

  • nexus/tests/integration_tests
  • schema/crdb/fix-session-token-column-order

nexus/tests/integration_tests/schema.rs

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4129,12 +4129,19 @@ fn after_222_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> {
41294129
// Migration 229 fixes column ordering in console_session and device_access_token.
41304130
// The "id" column was added via ALTER TABLE in migration 145, placing it at the
41314131
// end. This migration recreates the tables with "id" as the first column.
4132+
// It also drops console sessions created more than 24 hours ago.
41324133
const SESSION_229_ID: Uuid =
41334134
Uuid::from_u128(0x22900001_0000_0000_0000_000000000001);
41344135
const SESSION_229_TOKEN: &str = "tok-console-229-migration-test";
41354136
const SESSION_229_SILO_USER: Uuid =
41364137
Uuid::from_u128(0x22900001_0000_0000_0000_000000000002);
41374138

4139+
const SESSION_229_OLD_ID: Uuid =
4140+
Uuid::from_u128(0x22900001_0000_0000_0000_000000000003);
4141+
const SESSION_229_OLD_TOKEN: &str = "tok-console-229-old-session";
4142+
const SESSION_229_OLD_SILO_USER: Uuid =
4143+
Uuid::from_u128(0x22900001_0000_0000_0000_000000000004);
4144+
41384145
const DEVICE_229_ID: Uuid =
41394146
Uuid::from_u128(0x22900002_0000_0000_0000_000000000001);
41404147
const DEVICE_229_TOKEN: &str = "tok-device-229-migration-test";
@@ -4148,6 +4155,8 @@ fn before_229_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> {
41484155
Box::pin(async move {
41494156
// Insert test data into console_session and device_access_token.
41504157
// These tables currently have "id" as the last column (from migration 145).
4158+
// We insert two console sessions: one recent (should be kept) and one
4159+
// created over 24 hours ago (should be dropped).
41514160
ctx.client
41524161
.batch_execute(&format!(
41534162
"
@@ -4156,6 +4165,13 @@ fn before_229_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> {
41564165
VALUES
41574166
('{SESSION_229_ID}', '{SESSION_229_TOKEN}', now(), now(), '{SESSION_229_SILO_USER}');
41584167
4168+
INSERT INTO omicron.public.console_session
4169+
(id, token, time_created, time_last_used, silo_user_id)
4170+
VALUES
4171+
('{SESSION_229_OLD_ID}', '{SESSION_229_OLD_TOKEN}',
4172+
now() - INTERVAL '48 hours', now() - INTERVAL '48 hours',
4173+
'{SESSION_229_OLD_SILO_USER}');
4174+
41594175
INSERT INTO omicron.public.device_access_token
41604176
(id, token, client_id, device_code, silo_user_id, time_requested, time_created)
41614177
VALUES
@@ -4172,7 +4188,7 @@ fn after_229_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> {
41724188
Box::pin(async move {
41734189
// Verify data was preserved after the column reordering migration.
41744190

4175-
// Check console_session
4191+
// Check that the recent console_session was kept
41764192
let rows = ctx
41774193
.client
41784194
.query(
@@ -4184,7 +4200,11 @@ fn after_229_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> {
41844200
)
41854201
.await
41864202
.expect("failed to query post-migration console_session");
4187-
assert_eq!(rows.len(), 1, "console_session row should still exist");
4203+
assert_eq!(
4204+
rows.len(),
4205+
1,
4206+
"recent console_session row should still exist"
4207+
);
41884208

41894209
let id: Uuid = rows[0].get("id");
41904210
assert_eq!(id, SESSION_229_ID);
@@ -4193,6 +4213,26 @@ fn after_229_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> {
41934213
let silo_user_id: Uuid = rows[0].get("silo_user_id");
41944214
assert_eq!(silo_user_id, SESSION_229_SILO_USER);
41954215

4216+
// Check that the old (>24h) console_session was dropped
4217+
let rows = ctx
4218+
.client
4219+
.query(
4220+
&format!(
4221+
"SELECT id FROM omicron.public.console_session
4222+
WHERE id = '{SESSION_229_OLD_ID}'"
4223+
),
4224+
&[],
4225+
)
4226+
.await
4227+
.expect(
4228+
"failed to query post-migration console_session for old row",
4229+
);
4230+
assert_eq!(
4231+
rows.len(),
4232+
0,
4233+
"console_session row older than 24h should have been dropped"
4234+
);
4235+
41964236
// Check device_access_token
41974237
let rows = ctx
41984238
.client
Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,37 @@
1-
-- Copy data from console_session to console_session_new
2-
-- Full table scan is unavoidable when copying all data
1+
-- This step used to copy everything from console_session to console_session_new
2+
-- (see https://github.com/oxidecomputer/omicron/pull/9816), but we discovered
3+
-- an exciting new class of bug when applying this migration on the colo rack.
4+
-- That bug is https://github.com/oxidecomputer/omicron/issues/9866.
5+
--
6+
-- CREATE INDEX in CRDB proceeds in two steps: one that creates the index
7+
-- metadata allowing it to be found by name, and another that actually makes
8+
-- the schema change and populates the index. Nexus instance A starts the
9+
-- CREATE INDEX step (up06, up07, up08) but because the table is so large (3
10+
-- million rows in this case) the backfill takes a while. While that's going,
11+
-- Nexus B also tries to pick up step 6, and it immediately succeeds because
12+
-- CREATE INDEX IF NOT EXISTS sees that the index exists. This marks the step
13+
-- as successful even though it's actually still running. Then the backfill runs
14+
-- out of memory and fails, and the index metadata is rolled back, so there is
15+
-- no index, but the migration system thinks it worked.
16+
--
17+
-- We are working on more reliable ways to avoid this (it could happen while
18+
-- creating an index on any large table) but in the meantime we can avoid
19+
-- this particular manifestation by making sure the console_session table is
20+
-- small when the CREATE INDEX runs. So, instead of copying all the rows from
21+
-- console_session, we only copy recent ones by filtering for
22+
--
23+
-- time_created >= now() - INTERVAL '24 hours'.
24+
--
25+
-- The absolute session timeout is 24 hours (session_absolute_timeout_minutes
26+
-- = 1440), so any session older than that guaranteed to be expired already.
27+
--
28+
-- Full table scan is unavoidable when copying data
329
SET LOCAL disallow_full_table_scans = off;
430

531
INSERT INTO omicron.public.console_session_new
632
(id, token, time_created, time_last_used, silo_user_id)
733
SELECT
834
id, token, time_created, time_last_used, silo_user_id
935
FROM omicron.public.console_session
36+
WHERE time_created >= now() - INTERVAL '24 hours'
1037
ON CONFLICT DO NOTHING;

0 commit comments

Comments
 (0)