Skip to content

Commit 545db8f

Browse files
wjones127 and claude committed
feat: clean up compaction data files on failed commit or post-write error
The most common production failure (commit conflict) would leave all compacted data files as orphans. This adds cleanup in two places: 1. commit_compaction(): if apply_commit fails (e.g. commit conflict), delete all data files from the completed compaction tasks. 2. rewrite_files(): if post-write steps fail (rechunking stable row ids, recalculating versions), delete the newly written fragments' data files. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent a804c5a commit 545db8f

2 files changed

Lines changed: 45 additions & 23 deletions

File tree

rust/lance/src/dataset.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ use self::fragment::FileFragment;
9595
use self::refs::Refs;
9696
use self::scanner::{DatasetRecordBatchStream, Scanner};
9797
use self::transaction::{Operation, Transaction, TransactionBuilder, UpdateMapEntry};
98-
use self::write::write_fragments_internal;
98+
use self::write::{cleanup_data_fragments, write_fragments_internal};
9999
use crate::dataset::branch_location::BranchLocation;
100100
use crate::dataset::cleanup::{CleanupPolicy, CleanupPolicyBuilder};
101101
use crate::dataset::refs::{BranchContents, BranchIdentifier, Branches, Tags};

rust/lance/src/dataset/optimize.rs

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ use super::transaction::{
9393
Operation, RewriteGroup, RewrittenIndex, Transaction, TransactionBuilder,
9494
};
9595
use super::utils::make_rowid_capture_stream;
96-
use super::{WriteMode, WriteParams, write_fragments_internal};
96+
use super::{WriteMode, WriteParams, cleanup_data_fragments, write_fragments_internal};
9797
use crate::Dataset;
9898
use crate::Result;
9999
use crate::dataset::utils::CapturedRowIds;
@@ -1263,26 +1263,37 @@ async fn rewrite_files(
12631263

12641264
log::info!("Compaction task {}: file written", task_id);
12651265

1266-
let row_addrs = if let Some(row_ids_rx) = row_ids_rx {
1267-
let captured_ids = row_ids_rx
1268-
.try_recv()
1269-
.map_err(|err| Error::internal(format!("Failed to receive row ids: {}", err)))?;
1270-
let row_addrs = captured_ids.row_addrs(None).into_owned();
1271-
let mut serialized = Vec::with_capacity(row_addrs.serialized_size());
1272-
row_addrs.serialize_into(&mut serialized)?;
1273-
Some(serialized)
1274-
} else {
1275-
if dataset.manifest.uses_stable_row_ids() {
1276-
log::info!("Compaction task {}: rechunking stable row ids", task_id);
1277-
rechunk_stable_row_ids(dataset.as_ref(), &mut new_fragments, &fragments).await?;
1278-
recalc_versions_for_rewritten_fragments(
1279-
dataset.as_ref(),
1280-
&mut new_fragments,
1281-
&fragments,
1282-
)
1283-
.await?;
1266+
let row_addrs_result: Result<Option<Vec<u8>>> = async {
1267+
if let Some(row_ids_rx) = row_ids_rx {
1268+
let captured_ids = row_ids_rx
1269+
.try_recv()
1270+
.map_err(|err| Error::internal(format!("Failed to receive row ids: {}", err)))?;
1271+
let row_addrs = captured_ids.row_addrs(None).into_owned();
1272+
let mut serialized = Vec::with_capacity(row_addrs.serialized_size());
1273+
row_addrs.serialize_into(&mut serialized)?;
1274+
Ok(Some(serialized))
1275+
} else {
1276+
if dataset.manifest.uses_stable_row_ids() {
1277+
log::info!("Compaction task {}: rechunking stable row ids", task_id);
1278+
rechunk_stable_row_ids(dataset.as_ref(), &mut new_fragments, &fragments).await?;
1279+
recalc_versions_for_rewritten_fragments(
1280+
dataset.as_ref(),
1281+
&mut new_fragments,
1282+
&fragments,
1283+
)
1284+
.await?;
1285+
}
1286+
Ok(None)
1287+
}
1288+
}
1289+
.await;
1290+
1291+
let row_addrs = match row_addrs_result {
1292+
Ok(v) => v,
1293+
Err(e) => {
1294+
cleanup_data_fragments(&dataset.object_store, &dataset.base, &new_fragments).await;
1295+
return Err(e);
12841296
}
1285-
None
12861297
};
12871298

12881299
metrics.files_removed = task
@@ -1585,6 +1596,13 @@ pub async fn commit_compaction(
15851596
None
15861597
};
15871598

1599+
// Collect new fragment paths before moving rewrite_groups into the transaction,
1600+
// so we can clean them up if the commit fails.
1601+
let all_new_fragments: Vec<Fragment> = rewrite_groups
1602+
.iter()
1603+
.flat_map(|g| g.new_fragments.iter().cloned())
1604+
.collect();
1605+
15881606
let transaction = TransactionBuilder::new(
15891607
dataset.manifest.version,
15901608
Operation::Rewrite {
@@ -1596,9 +1614,13 @@ pub async fn commit_compaction(
15961614
.transaction_properties(options.transaction_properties.clone())
15971615
.build();
15981616

1599-
dataset
1617+
if let Err(e) = dataset
16001618
.apply_commit(transaction, &Default::default(), &Default::default())
1601-
.await?;
1619+
.await
1620+
{
1621+
cleanup_data_fragments(&dataset.object_store, &dataset.base, &all_new_fragments).await;
1622+
return Err(e);
1623+
}
16021624

16031625
Ok(metrics)
16041626
}

0 commit comments

Comments (0)