Skip to content

Commit bcbf22a

Browse files
authored
medialibrary(index) : reducing the index file size and saving them in batches (#413)
* medialibrary(index) : reducing the index file size and saving them in batches with introducing wait times in between to avoid da-admin worker issues * media-library : better logging to identify chunk failures
1 parent ed8fd6e commit bcbf22a

7 files changed

Lines changed: 131 additions & 195 deletions

File tree

nx/blocks/media-library/core/constants.js

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ export const IndexConfig = Object.freeze({
1414
DISCOVERY_MAX_PATHS_PER_JOB: 250,
1515
/* Larger batch to minimize UI update overhead - updates every ~100 seconds */
1616
USAGE_MAP_PROGRESSIVE_BATCH_SIZE: 1000,
17-
/* Index chunking configuration */
18-
MEDIA_INDEX_CHUNK_SIZE: 20_000, /* Entries per chunk (~15-20MB per chunk) */
1917
LOCK_HEARTBEAT_INTERVAL_MS: 60_000,
2018
LOCK_STALE_THRESHOLD_MS: 10 * 60_000,
2119
BUILD_MAX_DURATION_MS: 30 * 60 * 1000,

nx/blocks/media-library/indexing/admin-api.js

Lines changed: 0 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -98,20 +98,6 @@ function getChunkFileName(chunkNum) {
9898
return `${IndexFiles.MEDIA_INDEX_CHUNK_PREFIX}${String(chunkNum).padStart(3, '0')}.json`;
9999
}
100100

101-
/**
102-
* Split media sheet into chunks
103-
* @param {Array} mediaData - Full media sheet data
104-
* @param {number} chunkSize - Entries per chunk
105-
* @returns {Array<Array>} Array of chunks
106-
*/
107-
function chunkMediaSheet(mediaData, chunkSize) {
108-
const chunks = [];
109-
for (let i = 0; i < mediaData.length; i += chunkSize) {
110-
chunks.push(mediaData.slice(i, i + chunkSize));
111-
}
112-
return chunks;
113-
}
114-
115101
const DEFAULT_TIMEFRAME_DAYS = 3650; /* 10 years */
116102

117103
export async function fetchWithAuth(url, opts = {}) {
@@ -321,59 +307,6 @@ export async function loadIndexChunks(basePath, chunkCount, sheetName, onProgres
321307
return results.map((r) => r.data).flat();
322308
}
323309

324-
/**
325-
* Save index as chunks
326-
* @param {string} basePath - Base path without filename
327-
* @param {Array} mediaData - Media sheet data (must be pre-sorted)
328-
* @param {Array} usageData - Usage sheet data
329-
* @param {number} chunkSize - Entries per chunk
330-
* @returns {Promise<number>} Number of chunks created
331-
*/
332-
export async function saveIndexChunks(basePath, mediaData, usageData, chunkSize) {
333-
const mediaChunks = chunkMediaSheet(mediaData, chunkSize);
334-
335-
// Always save at least chunk 0, even if empty (for consistency)
336-
const chunksToSave = mediaChunks.length > 0 ? mediaChunks : [[]];
337-
const savePromises = [];
338-
339-
for (let i = 0; i < chunksToSave.length; i += 1) {
340-
const chunkFileName = getChunkFileName(i);
341-
const chunkPath = `${basePath}/${chunkFileName}`;
342-
343-
// Only include usage sheet in first chunk to avoid duplication
344-
const sheets = {
345-
media: chunksToSave[i],
346-
usage: i === 0 ? usageData : [],
347-
};
348-
349-
const formData = await createMultiSheet(sheets);
350-
const savePromise = daFetch(`${DA_ORIGIN}/source${chunkPath}`, {
351-
method: 'PUT',
352-
body: formData,
353-
});
354-
355-
savePromises.push(savePromise);
356-
}
357-
358-
const responses = await Promise.all(savePromises);
359-
360-
// Validate all chunks saved successfully
361-
const failedChunks = [];
362-
responses.forEach((resp, i) => {
363-
if (!resp.ok) {
364-
failedChunks.push({ chunk: i, status: resp.status });
365-
}
366-
});
367-
368-
if (failedChunks.length > 0) {
369-
const error = new Error(`Failed to save ${failedChunks.length}/${chunksToSave.length} chunks: ${failedChunks.map((f) => `chunk ${f.chunk} (${f.status})`).join(', ')}`);
370-
error.failedChunks = failedChunks;
371-
throw error;
372-
}
373-
374-
return chunksToSave.length;
375-
}
376-
377310
export async function saveSheet(data, path) {
378311
const formData = await createSheet(data);
379312
return daFetch(`${DA_ORIGIN}/source${path}`, {

nx/blocks/media-library/indexing/medialog.js

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -315,12 +315,27 @@ export function processPageMediaUpdates(
315315
const pageData = pageMediaMap.get(normalizedPath);
316316
const newEntries = pageData ? pageData.entries : [];
317317

318+
// Filter oldHashes to only include medialog-sourced entries for accurate comparison
319+
// External media (extlinks-parsed, markdown-parsed) should not be compared against medialog
320+
// because they're never in medialog - they come from markdown parsing
321+
const oldMedialogHashes = new Set();
322+
oldHashes.forEach((hash) => {
323+
const entry = updatedIndex.find((e) => e.hash === hash && e.doc === normalizedPath);
324+
if (entry) {
325+
const op = entry.operation || entry.source;
326+
const isFromMedialog = op !== 'extlinks-parsed' && op !== 'markdown-parsed' && op !== 'auditlog-parsed';
327+
if (isFromMedialog) {
328+
oldMedialogHashes.add(hash);
329+
}
330+
}
331+
});
332+
318333
onLog(`--- Page: ${normalizedPath} ---`);
319-
onLog(` Old (bypage): ${oldHashes.size}, New (page-based): ${newEntries.length}`);
334+
onLog(` Old (bypage): ${oldMedialogHashes.size}, New (page-based): ${newEntries.length}`);
320335

321336
const newHashes = new Set(newEntries.map((e) => e.hash));
322-
const toRemove = [...oldHashes].filter((h) => !newHashes.has(h));
323-
const toAdd = [...newHashes].filter((h) => !oldHashes.has(h));
337+
const toRemove = [...oldMedialogHashes].filter((h) => !newHashes.has(h));
338+
const toAdd = [...newHashes].filter((h) => !oldMedialogHashes.has(h));
324339

325340
if (toRemove.length || toAdd.length) {
326341
onLog(` Diff: remove ${toRemove.length}, add ${toAdd.length}`);
@@ -329,18 +344,12 @@ export function processPageMediaUpdates(
329344
toRemove.forEach((hash) => {
330345
const oldEntry = updatedIndex.find((e) => e.hash === hash && e.doc === normalizedPath);
331346
if (oldEntry) {
332-
// Don't remove external media (extlinks-parsed/markdown-parsed) or auditlog-parsed entries
333-
// They come from markdown parsing, not medialog, so they're handled by processLinkedContent
334-
const op = oldEntry.operation || oldEntry.source;
335-
const isFromMarkdown = op === 'extlinks-parsed' || op === 'markdown-parsed' || op === 'auditlog-parsed';
336-
if (!isFromMarkdown) {
337-
removed += removeOrOrphanMedia(
338-
updatedIndex,
339-
oldEntry,
340-
normalizedPath,
341-
medialogEntries,
342-
);
343-
}
347+
removed += removeOrOrphanMedia(
348+
updatedIndex,
349+
oldEntry,
350+
normalizedPath,
351+
medialogEntries,
352+
);
344353
}
345354
});
346355

nx/blocks/media-library/indexing/worker/fetch.js

Lines changed: 81 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -729,74 +729,114 @@ function chunkMediaSheet(mediaData, chunkSize) {
729729
}
730730

731731
/**
732-
* Worker-safe version of saveIndexChunks from admin-api.js
732+
* Determine optimal chunk size based on total entry count
733733
*
734-
* @param {string} basePath - Base path for chunks (e.g., /org/repo/.da/media-insights)
735-
* @param {Array} mediaData - Media sheet data
736-
* @param {Array} usageData - Usage sheet data
737-
* @param {number} chunkSize - Entries per chunk
738-
* @param {string} daOrigin - DA origin (e.g., https://admin.da.live)
734+
* Rationale:
735+
* - Small sites (<10k entries): Single file (100k chunk size ensures no chunking)
736+
* - No overhead from loading multiple chunks
737+
* - Simpler debugging and inspection
738+
*
739+
* - Medium sites (10k-200k): 8k entries per chunk (~4-5MB files)
740+
* - Prevents CF Worker 128MB memory limit errors during PUT
741+
* - Balances file size vs chunk count overhead
742+
* - Progressive loading: chunk 0 loads quickly for default Images view
743+
*
744+
* - Large sites (>200k entries): 6k entries per chunk (~3-4MB files)
745+
* - Smaller files for better reliability on massive indexes
746+
* - More chunks acceptable given already high chunk count
747+
* - Further reduces memory pressure on uploads
748+
*
749+
* Chunk size targets file sizes ≤5MB to avoid DA Admin/S3 timeouts
750+
* Average media entry size: ~550 bytes (URL + metadata + doc field)
751+
*
752+
* @param {number} totalEntries - Total number of media entries in index
753+
* @returns {number} Optimal chunk size (entries per chunk)
754+
*/
755+
export function getAdaptiveChunkSize(totalEntries) {
756+
if (totalEntries < 10_000) {
757+
return 100_000;
758+
}
759+
760+
if (totalEntries < 200_000) {
761+
return 8_000;
762+
}
763+
764+
return 6_000;
765+
}
766+
767+
/**
768+
* Save index as chunks with batched uploads to prevent rate limiting
769+
* Uploads 3 chunks concurrently with 500ms delays between batches
770+
*
771+
* @param {string} basePath - Base path without filename (e.g., '/site/.da/media-insights')
772+
* @param {Array} mediaData - Media sheet data (must be pre-sorted)
773+
* @param {number} chunkSize - Entries per chunk (from getAdaptiveChunkSize)
774+
* @param {string} daOrigin - DA origin (e.g., 'https://admin.da.live')
739775
* @param {string} imsToken - IMS access token
740776
* @param {string} indexFilesChunkPrefix - Chunk filename prefix (e.g., 'index-')
741-
* @returns {Promise<number>} Number of chunks saved
777+
* @returns {Promise<number>} Number of chunks created
742778
*/
743779
export async function saveIndexChunks(
744780
basePath,
745781
mediaData,
746-
usageData,
747782
chunkSize,
748783
daOrigin,
749784
imsToken,
750785
indexFilesChunkPrefix,
751786
) {
752787
const mediaChunks = chunkMediaSheet(mediaData, chunkSize);
753-
754-
// Always save at least chunk 0, even if empty (for consistency)
755788
const chunksToSave = mediaChunks.length > 0 ? mediaChunks : [[]];
756-
const savePromises = [];
757789

758-
for (let i = 0; i < chunksToSave.length; i += 1) {
759-
const chunkFileName = getChunkFileName(i, indexFilesChunkPrefix);
760-
const chunkPath = `${basePath}/${chunkFileName}`;
761-
762-
// Only include usage sheet in first chunk to avoid duplication
763-
const sheets = {
764-
media: chunksToSave[i],
765-
usage: i === 0 ? usageData : [],
766-
};
767-
768-
const formData = await createMultiSheet(sheets);
769-
const savePromise = workerDaFetch(`${daOrigin}/source${chunkPath}`, imsToken, {
770-
method: 'PUT',
771-
body: formData,
790+
// Rate limiting to prevent DA Admin endpoint overload:
791+
// - batchSize=3: Limit concurrent uploads (prevents 503 errors)
792+
// - delayMs=500: 500ms delay between batches (~20 req/sec rate limit)
793+
// - Prevents CF Worker 128MB memory errors from large concurrent PUTs
794+
const batchSize = 3;
795+
const delayMs = 500;
796+
797+
for (let i = 0; i < chunksToSave.length; i += batchSize) {
798+
const batch = chunksToSave.slice(i, i + batchSize);
799+
const batchPromises = batch.map(async (chunk, idx) => {
800+
const chunkNum = i + idx;
801+
const chunkFileName = getChunkFileName(chunkNum, indexFilesChunkPrefix);
802+
const chunkPath = `${basePath}/${chunkFileName}`;
803+
const sheets = { media: chunk };
804+
805+
const formData = await createMultiSheet(sheets);
806+
return workerDaFetch(`${daOrigin}/source${chunkPath}`, imsToken, {
807+
method: 'PUT',
808+
body: formData,
809+
});
772810
});
773811

774-
savePromises.push(savePromise);
775-
}
776-
777-
const responses = await Promise.all(savePromises);
812+
const responses = await Promise.all(batchPromises);
778813

779-
// Validate all chunks saved successfully
780-
const failedChunks = [];
781-
responses.forEach((resp, i) => {
782-
if (!resp.ok) {
783-
failedChunks.push(i);
814+
const failed = [];
815+
responses.forEach((r, idx) => {
816+
if (!r.ok) {
817+
failed.push({ chunkNum: i + idx, status: r.status });
818+
}
819+
});
820+
if (failed.length > 0) {
821+
const chunkNums = failed.map((f) => `${f.chunkNum} (${f.status})`).join(', ');
822+
throw new Error(`Failed to save ${failed.length} chunk(s): ${chunkNums}`);
784823
}
785-
});
786824

787-
if (failedChunks.length > 0) {
788-
throw new Error(`Failed to save chunks: ${failedChunks.join(', ')}`);
825+
if (i + batchSize < chunksToSave.length) {
826+
await new Promise((resolve) => { setTimeout(resolve, delayMs); });
827+
}
789828
}
790829

791830
return chunksToSave.length;
792831
}
793832

794833
/**
795-
* Worker-safe version of saveIndexMeta from admin-api.js
834+
* Save index metadata to DA storage
835+
* Must be called AFTER saveIndexChunks to ensure chunkCount is accurate
796836
*
797-
* @param {object} meta - Metadata object
798-
* @param {string} path - Full path to meta file
799-
* @param {string} daOrigin - DA origin (e.g., https://admin.da.live)
837+
* @param {object} meta - Metadata object containing indexType, timestamp, chunkCount, etc.
838+
* @param {string} path - Full path to meta file (e.g., '/site/.da/media-insights/index-meta.json')
839+
* @param {string} daOrigin - DA origin (e.g., 'https://admin.da.live')
800840
* @param {string} imsToken - IMS access token
801841
* @returns {Promise<Response>}
802842
*/

0 commit comments

Comments
 (0)