Skip to content

Commit 68a2f13

Browse files
bram-atmire and claude committed
Add DAO-level pagination to DOIOrganiser bulk operations
DOIOrganiser loaded ALL matching DOIs into memory before processing, causing OOM with 10K+ DOIs. This adds paginated findByStatus at the DAO/service level and an automatic batched processing loop in DOIOrganiser that fetches 100 DOIs at a time from offset 0. Successful operations change DOI status so they drop out of subsequent queries. Includes infinite-loop protection when an entire batch fails. Resolves DSpace#9622 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 301c6f3 commit 68a2f13

5 files changed

Lines changed: 123 additions & 93 deletions

File tree

dspace-api/src/main/java/org/dspace/identifier/DOIServiceImpl.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,12 @@ public List<DOI> getDOIsByStatus(Context context, List<Integer> statuses) throws
174174
return doiDAO.findByStatus(context, statuses);
175175
}
176176

177+
@Override
178+
public List<DOI> getDOIsByStatus(Context context, List<Integer> statuses, int limit, int offset)
179+
throws SQLException {
180+
return doiDAO.findByStatus(context, statuses, limit, offset);
181+
}
182+
177183
@Override
178184
public List<DOI> getSimilarDOIsNotInState(Context context, String doiPattern, List<Integer> statuses,
179185
boolean dsoIsNotNull)

dspace-api/src/main/java/org/dspace/identifier/dao/DOIDAO.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,18 @@ public List<DOI> findSimilarNotInState(Context context, String doi, List<Integer
3333

3434
public List<DOI> findByStatus(Context context, List<Integer> statuses) throws SQLException;
3535

36+
/**
37+
* Find one page of the DOIs that have any of the given statuses, ordered by ascending ID.
38+
*
39+
* @param context current DSpace session.
40+
* @param statuses desired statuses.
41+
* @param limit maximum number of results to return (-1 for unlimited).
42+
* @param offset number of results to skip (-1 for none).
43+
* @return matching DOIs ordered by ID.
44+
* @throws SQLException passed through.
45+
*/
46+
public List<DOI> findByStatus(Context context, List<Integer> statuses, int limit, int offset)
47+
throws SQLException;
48+
3649
public DOI findDOIByDSpaceObject(Context context, DSpaceObject dso) throws SQLException;
3750
}

dspace-api/src/main/java/org/dspace/identifier/dao/impl/DOIDAOImpl.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ public DOI findDOIByDSpaceObject(Context context, DSpaceObject dso, List<Integer
7171

7272
@Override
7373
public List<DOI> findByStatus(Context context, List<Integer> statuses) throws SQLException {
74+
return findByStatus(context, statuses, -1, -1);
75+
}
76+
77+
@Override
78+
public List<DOI> findByStatus(Context context, List<Integer> statuses, int limit, int offset)
79+
throws SQLException {
7480
CriteriaBuilder criteriaBuilder = getCriteriaBuilder(context);
7581
CriteriaQuery criteriaQuery = getCriteriaQuery(criteriaBuilder, DOI.class);
7682
Root<DOI> doiRoot = criteriaQuery.from(DOI.class);
@@ -80,7 +86,8 @@ public List<DOI> findByStatus(Context context, List<Integer> statuses) throws SQ
8086
orPredicates.add(criteriaBuilder.equal(doiRoot.get(DOI_.status), status));
8187
}
8288
criteriaQuery.where(criteriaBuilder.or(orPredicates.toArray(new Predicate[] {})));
83-
return list(context, criteriaQuery, false, DOI.class, -1, -1);
89+
criteriaQuery.orderBy(criteriaBuilder.asc(doiRoot.get(DOI_.id)));
90+
return list(context, criteriaQuery, false, DOI.class, limit, offset);
8491
}
8592

8693
@Override

dspace-api/src/main/java/org/dspace/identifier/doi/DOIOrganiser.java

Lines changed: 83 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,25 @@ public class DOIOrganiser {
5656

5757
private static final Logger LOG = LogManager.getLogger(DOIOrganiser.class);
5858

59+
/**
60+
* Number of DOIs to fetch per batch during bulk operations.
61+
*/
62+
private static final int BATCH_SIZE = 100;
63+
64+
/**
65+
* A processing step applied to a single DOI; unlike {@code java.util.function.Consumer} it may throw a checked exception.
66+
*/
67+
@FunctionalInterface
68+
private interface DOIOperation {
69+
/**
70+
* Process a single DOI.
71+
*
72+
* @param doi the DOI to process.
73+
* @throws Exception if processing fails.
74+
*/
75+
void process(DOI doi) throws Exception;
76+
}
77+
5978
private final DOIIdentifierProvider provider;
6079
private final Context context;
6180
private boolean quiet;
@@ -217,107 +236,27 @@ public static void runCLI(Context context, DOIOrganiser organiser, String[] args
217236
}
218237

219238
if (line.hasOption('s')) {
220-
try {
221-
List<DOI> dois = doiService
222-
.getDOIsByStatus(context, Arrays.asList(DOIIdentifierProvider.TO_BE_RESERVED));
223-
if (dois.isEmpty()) {
224-
System.err.println("There are no objects in the database "
225-
+ "that could be reserved.");
226-
}
227-
228-
for (DOI doi : dois) {
229-
doi = context.reloadEntity(doi);
230-
try {
231-
organiser.reserve(doi);
232-
context.commit();
233-
} catch (RuntimeException e) {
234-
System.err.format("DOI %s for object %s reservation failed, skipping: %s%n",
235-
doi.getDSpaceObject().getID().toString(),
236-
doi.getDoi(), e.getMessage());
237-
context.rollback();
238-
}
239-
}
240-
} catch (SQLException ex) {
241-
System.err.println("Error in database connection:" + ex.getMessage());
242-
ex.printStackTrace(System.err);
243-
}
239+
List<Integer> statuses = Arrays.asList(DOIIdentifierProvider.TO_BE_RESERVED);
240+
processBatched(context, doiService, statuses, organiser::reserve, "reservation");
244241
}
245242

246243
if (line.hasOption('r')) {
247-
try {
248-
List<DOI> dois = doiService
249-
.getDOIsByStatus(context, Arrays.asList(DOIIdentifierProvider.TO_BE_REGISTERED));
250-
if (dois.isEmpty()) {
251-
System.err.println("There are no objects in the database "
252-
+ "that could be registered.");
253-
}
254-
for (DOI doi : dois) {
255-
doi = context.reloadEntity(doi);
256-
try {
257-
organiser.register(doi);
258-
context.commit();
259-
} catch (SQLException e) {
260-
System.err.format("DOI %s for object %s registration failed, skipping: %s%n",
261-
doi.getDSpaceObject().getID().toString(),
262-
doi.getDoi(), e.getMessage());
263-
context.rollback();
264-
}
265-
}
266-
} catch (SQLException ex) {
267-
System.err.format("Error in database connection: %s%n", ex.getMessage());
268-
ex.printStackTrace(System.err);
269-
} catch (RuntimeException ex) {
270-
System.err.format("Error registering DOI identifier: %s%n", ex.getMessage());
271-
}
244+
List<Integer> statuses = Arrays.asList(DOIIdentifierProvider.TO_BE_REGISTERED);
245+
processBatched(context, doiService, statuses, organiser::register, "registration");
272246
}
273247

274248
if (line.hasOption('u')) {
275-
try {
276-
List<DOI> dois = doiService.getDOIsByStatus(context, Arrays.asList(
277-
DOIIdentifierProvider.UPDATE_BEFORE_REGISTRATION,
278-
DOIIdentifierProvider.UPDATE_RESERVED,
279-
DOIIdentifierProvider.UPDATE_REGISTERED));
280-
if (dois.isEmpty()) {
281-
System.err.println("There are no objects in the database "
282-
+ "whose metadata needs an update.");
283-
}
284-
285-
for (DOI doi : dois) {
286-
doi = context.reloadEntity(doi);
287-
organiser.update(doi);
288-
context.commit();
289-
}
290-
} catch (SQLException ex) {
291-
System.err.println("Error in database connection:" + ex.getMessage());
292-
ex.printStackTrace(System.err);
293-
}
249+
List<Integer> statuses = Arrays.asList(
250+
DOIIdentifierProvider.UPDATE_BEFORE_REGISTRATION,
251+
DOIIdentifierProvider.UPDATE_RESERVED,
252+
DOIIdentifierProvider.UPDATE_REGISTERED);
253+
processBatched(context, doiService, statuses, organiser::update, "update");
294254
}
295255

296256
if (line.hasOption('d')) {
297-
try {
298-
List<DOI> dois = doiService
299-
.getDOIsByStatus(context, Arrays.asList(DOIIdentifierProvider.TO_BE_DELETED));
300-
if (dois.isEmpty()) {
301-
System.err.println("There are no objects in the database "
302-
+ "that could be deleted.");
303-
}
304-
305-
for (DOI doi : dois) {
306-
doi = context.reloadEntity(doi);
307-
try {
308-
organiser.delete(doi.getDoi());
309-
context.commit();
310-
} catch (SQLException e) {
311-
System.err.format("DOI %s for object %s deletion failed, skipping: %s%n",
312-
doi.getDSpaceObject().getID().toString(),
313-
doi.getDoi(), e.getMessage());
314-
context.rollback();
315-
}
316-
}
317-
} catch (SQLException ex) {
318-
System.err.println("Error in database connection:" + ex.getMessage());
319-
ex.printStackTrace(System.err);
320-
}
257+
List<Integer> statuses = Arrays.asList(DOIIdentifierProvider.TO_BE_DELETED);
258+
processBatched(context, doiService, statuses,
259+
doi -> organiser.delete(doi.getDoi()), "deletion");
321260
}
322261

323262
if (line.hasOption("reserve-doi")) {
@@ -381,6 +320,58 @@ public static void runCLI(Context context, DOIOrganiser organiser, String[] args
381320

382321
}
383322

323+
/**
324+
* Process all DOIs matching the given statuses in batches.
325+
* Each batch queries from offset 0 because successfully processed DOIs change status and
326+
* drop out of subsequent queries. Stops when a batch is empty or an entire batch fails
327+
* (to prevent infinite loops).
328+
*
329+
* @param context current DSpace session.
330+
* @param doiService the DOI service to query.
331+
* @param statuses the statuses to query for.
332+
* @param operation the operation to perform on each DOI.
333+
* @param processName a human-readable name for the operation (for logging).
334+
*/
335+
private static void processBatched(Context context, DOIService doiService,
336+
List<Integer> statuses, DOIOperation operation,
337+
String processName) {
338+
try {
339+
List<DOI> batch;
340+
boolean firstBatch = true;
341+
do {
342+
batch = doiService.getDOIsByStatus(context, statuses, BATCH_SIZE, 0);
343+
if (firstBatch && batch.isEmpty()) {
344+
System.err.println("There are no objects in the database "
345+
+ "that could be processed for " + processName + ".");
346+
}
347+
firstBatch = false;
348+
349+
int succeeded = 0;
350+
for (DOI doi : batch) {
351+
doi = context.reloadEntity(doi);
352+
try {
353+
operation.process(doi);
354+
context.commit();
355+
succeeded++;
356+
} catch (Exception e) {
357+
System.err.format("DOI %s %s failed, skipping: %s%n",
358+
doi.getDoi(), processName, e.getMessage());
359+
context.rollback();
360+
}
361+
}
362+
// If no DOI in this batch succeeded, stop to prevent an infinite loop.
363+
if (!batch.isEmpty() && succeeded == 0) {
364+
System.err.println("Entire batch failed for " + processName
365+
+ ", stopping to prevent infinite loop.");
366+
break;
367+
}
368+
} while (!batch.isEmpty());
369+
} catch (SQLException ex) {
370+
System.err.println("Error in database connection: " + ex.getMessage());
371+
ex.printStackTrace(System.err);
372+
}
373+
}
374+
384375
/**
385376
* list DOIs queued for reservation or registration
386377
* @param processName - process name for display

dspace-api/src/main/java/org/dspace/identifier/service/DOIService.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,19 @@ public String formatIdentifier(String identifier)
141141
*/
142142
public List<DOI> getDOIsByStatus(Context context, List<Integer> statuses) throws SQLException;
143143

144+
/**
145+
* Find DOIs that have one of a given set of statuses, with pagination support.
146+
*
147+
* @param context current DSpace session.
148+
* @param statuses desired statuses.
149+
* @param limit maximum number of results to return (-1 for unlimited).
150+
* @param offset number of results to skip (-1 for none).
151+
* @return matching DOIs ordered by ID.
152+
* @throws SQLException passed through.
153+
*/
154+
public List<DOI> getDOIsByStatus(Context context, List<Integer> statuses, int limit, int offset)
155+
throws SQLException;
156+
144157
/**
145158
* Find all DOIs that are similar to the specified pattern and not in the
146159
* specified states.

0 commit comments

Comments
 (0)