Skip to content

Commit c7ffc1a

Browse files
SOLR-17949: Verify checksums on Azure Blob backup and document flush() no-op
Verify Lucene checksums when copying index files to the Azure Blob backup repository (mirroring the S3 repository), rewriting the footer after validation and rejecting too-small or corrupt files. Clarify that AzureBlobOutputStream.flush() is intentionally a no-op so frequent flushes cannot exhaust Azure's committed-block limit, and document the pathExists/isDirectory asymmetry. Add covering tests and group test helpers and lifecycle methods consistently. Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 75a5db7 commit c7ffc1a

12 files changed

Lines changed: 384 additions & 132 deletions

File tree

solr/modules/azure-blob-repository/build.gradle

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ dependencies {
6464

6565
testImplementation libs.azure.core.http.okhttp
6666
testImplementation libs.squareup.okhttp3.okhttp
67-
testImplementation libs.projectreactor.core
6867

6968
// Testcontainers for Azurite integration testing
7069
testImplementation libs.testcontainers

solr/modules/azure-blob-repository/src/java/org/apache/solr/azureblob/AzureBlobBackupRepository.java

Lines changed: 68 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
import java.util.Objects;
3030
import java.util.Set;
3131
import java.util.stream.Collectors;
32+
import org.apache.lucene.codecs.CodecUtil;
33+
import org.apache.lucene.index.CorruptIndexException;
34+
import org.apache.lucene.store.ChecksumIndexInput;
3235
import org.apache.lucene.store.Directory;
3336
import org.apache.lucene.store.IOContext;
3437
import org.apache.lucene.store.IndexInput;
@@ -146,11 +149,7 @@ public void createDirectory(URI path) throws IOException {
146149
log.debug("Create directory '{}'", blobPath);
147150
}
148151

149-
try {
150-
client.createDirectory(blobPath);
151-
} catch (AzureBlobException e) {
152-
throw new IOException("Failed to create directory " + blobPath, e);
153-
}
152+
client.createDirectory(blobPath);
154153
}
155154

156155
@Override
@@ -163,11 +162,7 @@ public void deleteDirectory(URI path) throws IOException {
163162
log.debug("Delete directory '{}'", blobPath);
164163
}
165164

166-
try {
167-
client.deleteDirectory(blobPath);
168-
} catch (AzureBlobException e) {
169-
throw new IOException("Failed to delete directory " + blobPath, e);
170-
}
165+
client.deleteDirectory(blobPath);
171166
}
172167

173168
@Override
@@ -177,13 +172,9 @@ public void delete(URI path, Collection<String> files) throws IOException {
177172

178173
String basePath = getBlobPath(path);
179174

180-
try {
181-
if (!client.isDirectory(basePath)) {
182-
int lastSlash = basePath.lastIndexOf('/');
183-
basePath = lastSlash >= 0 ? basePath.substring(0, lastSlash) : "";
184-
}
185-
} catch (AzureBlobException e) {
186-
throw new IOException("Failed to check path type for " + basePath, e);
175+
if (!client.isDirectory(basePath)) {
176+
int lastSlash = basePath.lastIndexOf('/');
177+
basePath = lastSlash >= 0 ? basePath.substring(0, lastSlash) : "";
187178
}
188179

189180
final String prefix;
@@ -202,11 +193,7 @@ public void delete(URI path, Collection<String> files) throws IOException {
202193
log.debug("Delete files '{}'", fullPaths);
203194
}
204195

205-
try {
206-
client.delete(fullPaths);
207-
} catch (AzureBlobException e) {
208-
throw new IOException("Failed to delete files " + fullPaths, e);
209-
}
196+
client.delete(fullPaths);
210197
}
211198

212199
@Override
@@ -219,11 +206,7 @@ public boolean exists(URI path) throws IOException {
219206
log.debug("Check existence '{}'", blobPath);
220207
}
221208

222-
try {
223-
return client.pathExists(blobPath);
224-
} catch (AzureBlobException e) {
225-
throw new IOException("Failed to check existence of " + blobPath, e);
226-
}
209+
return client.pathExists(blobPath);
227210
}
228211

229212
@Override
@@ -236,14 +219,10 @@ public PathType getPathType(URI path) throws IOException {
236219
log.debug("Get path type '{}'", blobPath);
237220
}
238221

239-
try {
240-
if (client.isDirectory(blobPath)) {
241-
return BackupRepository.PathType.DIRECTORY;
242-
} else {
243-
return BackupRepository.PathType.FILE;
244-
}
245-
} catch (AzureBlobException e) {
246-
throw new IOException("Failed to get path type for " + blobPath, e);
222+
if (client.isDirectory(blobPath)) {
223+
return BackupRepository.PathType.DIRECTORY;
224+
} else {
225+
return BackupRepository.PathType.FILE;
247226
}
248227
}
249228

@@ -257,11 +236,7 @@ public String[] listAll(URI path) throws IOException {
257236
log.debug("List all '{}'", blobPath);
258237
}
259238

260-
try {
261-
return client.listDir(blobPath);
262-
} catch (AzureBlobException e) {
263-
throw new IOException("Failed to list directory " + blobPath, e);
264-
}
239+
return client.listDir(blobPath);
265240
}
266241

267242
@Override
@@ -276,11 +251,7 @@ public IndexInput openInput(URI dirPath, String fileName, IOContext ctx) throws
276251
log.debug("Open input '{}'", blobPath);
277252
}
278253

279-
try {
280-
return new AzureBlobIndexInput(client, blobPath, client.length(blobPath));
281-
} catch (AzureBlobException e) {
282-
throw new IOException("Failed to open input stream for " + blobPath, e);
283-
}
254+
return new AzureBlobIndexInput(client, blobPath, client.length(blobPath));
284255
}
285256

286257
@Override
@@ -293,11 +264,7 @@ public OutputStream createOutput(URI path) throws IOException {
293264
log.debug("Create output '{}'", blobPath);
294265
}
295266

296-
try {
297-
return client.pushStream(blobPath);
298-
} catch (AzureBlobException e) {
299-
throw new IOException("Failed to create output stream for " + blobPath, e);
300-
}
267+
return client.pushStream(blobPath);
301268
}
302269

303270
@Override
@@ -330,18 +297,29 @@ public void copyIndexFileFrom(
330297
// ignore; write will surface real issues
331298
}
332299

333-
try (IndexInput input = sourceDir.openInput(sourceFileName, IOContext.DEFAULT);
334-
OutputStream output = client.pushStream(blobPath)) {
335-
byte[] buffer = new byte[COPY_BUFFER_SIZE];
336-
long remaining = input.length();
337-
while (remaining > 0) {
338-
int toRead = (int) Math.min(buffer.length, remaining);
339-
input.readBytes(buffer, 0, toRead);
340-
output.write(buffer, 0, toRead);
341-
remaining -= toRead;
300+
try (IndexInput input =
301+
shouldVerifyChecksum
302+
? sourceDir.openChecksumInput(sourceFileName)
303+
: sourceDir.openInput(sourceFileName, IOContext.READONCE)) {
304+
if (input.length() <= CodecUtil.footerLength()) {
305+
throw new CorruptIndexException("file is too small:" + input.length(), input);
306+
}
307+
308+
try (OutputStream output = client.pushStream(blobPath)) {
309+
byte[] buffer = new byte[COPY_BUFFER_SIZE];
310+
long remaining =
311+
shouldVerifyChecksum ? input.length() - CodecUtil.footerLength() : input.length();
312+
while (remaining > 0) {
313+
int toRead = (int) Math.min(buffer.length, remaining);
314+
input.readBytes(buffer, 0, toRead);
315+
output.write(buffer, 0, toRead);
316+
remaining -= toRead;
317+
}
318+
if (shouldVerifyChecksum) {
319+
long checksum = CodecUtil.checkFooter((ChecksumIndexInput) input);
320+
writeFooter(checksum, output);
321+
}
342322
}
343-
} catch (AzureBlobException e) {
344-
throw new IOException("Failed to copy file from " + sourceFileName + " to " + blobPath, e);
345323
}
346324
}
347325

@@ -379,8 +357,6 @@ public void copyIndexFileTo(
379357
while ((len = inputStream.read(buffer)) != -1) {
380358
indexOutput.writeBytes(buffer, 0, len);
381359
}
382-
} catch (AzureBlobException e) {
383-
throw new IOException("Failed to copy file from " + blobPath + " to " + destFileName, e);
384360
}
385361

386362
long timeElapsed = Duration.between(start, Instant.now()).toMillis();
@@ -403,4 +379,33 @@ private String getBlobPath(URI uri) {
403379
}
404380
return uri.getPath();
405381
}
382+
383+
private void writeFooter(long checksum, OutputStream outputStream) throws IOException {
384+
IndexOutput out =
385+
new IndexOutput("", "") {
386+
@Override
387+
public void writeByte(byte b) throws IOException {
388+
outputStream.write(b);
389+
}
390+
391+
@Override
392+
public void writeBytes(byte[] b, int offset, int length) throws IOException {
393+
outputStream.write(b, offset, length);
394+
}
395+
396+
@Override
397+
public void close() {}
398+
399+
@Override
400+
public long getFilePointer() {
401+
return 0;
402+
}
403+
404+
@Override
405+
public long getChecksum() {
406+
return checksum;
407+
}
408+
};
409+
CodecUtil.writeFooter(out);
410+
}
406411
}

solr/modules/azure-blob-repository/src/java/org/apache/solr/azureblob/AzureBlobException.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@
1616
*/
1717
package org.apache.solr.azureblob;
1818

19+
import java.io.IOException;
20+
1921
/**
2022
* Generic exception for Blob Storage related failures. Could originate from the {@link
2123
* AzureBlobBackupRepository} or from its underlying {@link AzureBlobStorageClient}.
2224
*/
23-
public class AzureBlobException extends Exception {
25+
public class AzureBlobException extends IOException {
26+
public AzureBlobException(Throwable cause) {
27+
super(cause);
28+
}
29+
2430
public AzureBlobException(String message) {
2531
super(message);
2632
}

solr/modules/azure-blob-repository/src/java/org/apache/solr/azureblob/AzureBlobIndexInput.java

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -143,12 +143,7 @@ private void ensureStreamAt(long targetAbsolutePos) throws IOException {
143143
"read past EOF: pos=" + targetAbsolutePos + " vs end=" + (absoluteOffset + length));
144144
}
145145

146-
try {
147-
inputStream = client.pullRangeStream(path, targetAbsolutePos, remaining);
148-
} catch (AzureBlobException e) {
149-
throw new IOException(
150-
"Failed to open range stream for " + path + " at offset " + targetAbsolutePos, e);
151-
}
146+
inputStream = client.pullRangeStream(path, targetAbsolutePos, remaining);
152147
streamAbsolutePos = targetAbsolutePos;
153148
}
154149

solr/modules/azure-blob-repository/src/java/org/apache/solr/azureblob/AzureBlobNotFoundException.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818

1919
/** Exception thrown when a blob is not found in Azure Blob Storage. */
2020
public class AzureBlobNotFoundException extends AzureBlobException {
21+
public AzureBlobNotFoundException(Throwable cause) {
22+
super(cause);
23+
}
24+
2125
public AzureBlobNotFoundException(String message) {
2226
super(message);
2327
}

solr/modules/azure-blob-repository/src/java/org/apache/solr/azureblob/AzureBlobOutputStream.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
* OutputStream implementation for Azure Blob Storage using block blobs. Supports chunked uploads
3737
* for large files.
3838
*/
39-
public class AzureBlobOutputStream extends OutputStream {
39+
class AzureBlobOutputStream extends OutputStream {
4040
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
4141

4242
static final int BLOCK_SIZE = 4 * 1024 * 1024;
@@ -47,7 +47,7 @@ public class AzureBlobOutputStream extends OutputStream {
4747
private final ByteBuffer buffer;
4848
private BlockUpload blockUpload;
4949

50-
public AzureBlobOutputStream(BlobClient blobClient, String blobPath) {
50+
AzureBlobOutputStream(BlobClient blobClient, String blobPath) {
5151
this.blobClient = blobClient;
5252
this.blobPath = blobPath;
5353
this.closed = false;
@@ -138,9 +138,9 @@ public void flush() throws IOException {
138138
throw new IOException("Stream closed");
139139
}
140140

141-
if (buffer.position() > 0) {
142-
uploadBlock();
143-
}
141+
// Intentionally a no-op. Full blocks are staged as the buffer fills in write(), and the
142+
// partial tail is staged in close(). Staging on every flush() would create tiny blocks and a
143+
// frequently-flushing caller could exhaust Azure's 50,000-committed-block limit on small files.
144144
}
145145

146146
@Override

solr/modules/azure-blob-repository/src/java/org/apache/solr/azureblob/AzureBlobStorageClient.java

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,6 @@ public class AzureBlobStorageClient {
105105
AzureBlobStorageClient(BlobServiceClient blobServiceClient, String containerName) {
106106
this.containerClient = blobServiceClient.getBlobContainerClient(containerName);
107107
this.batchClient = new BlobBatchClientBuilder(blobServiceClient).buildClient();
108-
try {
109-
containerClient.create();
110-
} catch (BlobStorageException e) {
111-
if (e.getStatusCode() != HTTP_CONFLICT) {
112-
throw e;
113-
}
114-
}
115108
}
116109

117110
private static BlobServiceClient createInternalClient(
@@ -235,6 +228,13 @@ String[] listDir(String path) throws AzureBlobException {
235228
}
236229
}
237230

231+
/**
232+
* Checks existence by resolving the exact blob (a HEAD request). This module always writes {@code
233+
* hdi_isfolder} marker blobs for directories, so it is self-consistent. Note the asymmetry with
234+
* {@link #isDirectory(String)}: a marker-less "virtual" directory created by an external tool
235+
* (e.g. azcopy) returns {@code false} here even though {@code isDirectory} reports it as a
236+
* directory via prefix listing.
237+
*/
238238
boolean pathExists(String path) throws AzureBlobException {
239239
final String blobPath = sanitizedPath(path);
240240

@@ -434,7 +434,21 @@ OutputStream pushStream(String path) throws AzureBlobException {
434434
}
435435
}
436436

437-
void close() {}
437+
void close() {
438+
// No-op: the underlying OkHttp client is SPI-loaded and shared process-wide, so there is
439+
// nothing per-instance to release here.
440+
}
441+
442+
@VisibleForTesting
443+
void createContainerForTests() {
444+
try {
445+
containerClient.create();
446+
} catch (BlobStorageException e) {
447+
if (e.getStatusCode() != HTTP_CONFLICT) {
448+
throw e;
449+
}
450+
}
451+
}
438452

439453
@VisibleForTesting
440454
void deleteContainerForTests() {

0 commit comments

Comments
 (0)