Skip to content

Commit 4a8f349

Browse files
committed
replace phaser with future
1 parent d745321 commit 4a8f349

13 files changed

Lines changed: 290 additions & 415 deletions

File tree

sdk/src/main/java/com/amazonaws/lambda/durable/DurableFuture.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ public interface DurableFuture<T> {
99
/**
1010
* Blocks until the operation completes and returns the result.
1111
*
12-
* <p>This delegates to operation.get() which handles: - Phaser blocking (arriveAndAwaitAdvance) - Thread
13-
* deregistration (allows suspension) - Thread reactivation (resumes execution) - Result retrieval
12+
* <p>This delegates to operation.get() which handles: - Thread deregistration (allows suspension) - Thread
13+
* reactivation (resumes execution) - Result retrieval
1414
*
1515
* @return the operation result
1616
*/

sdk/src/main/java/com/amazonaws/lambda/durable/execution/CheckpointBatcher.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,10 +139,6 @@ private void checkpointBatch(List<OperationUpdate> updates) {
139139
logger.debug("Durable checkpoint API called (latency={}ns): {}.", System.nanoTime() - startTime, response);
140140

141141
// Notify callback of completion
142-
// TODO: sam local backend returns no new execution state when called with zero
143-
// updates. WHY?
144-
// This means the polling will never receive an operation update and complete
145-
// the Phaser.
146142
checkpointToken = response.checkpointToken();
147143
if (response.newExecutionState() != null) {
148144
// fetch all pages of operations

sdk/src/main/java/com/amazonaws/lambda/durable/execution/ExecutionManager.java

Lines changed: 24 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44

55
import com.amazonaws.lambda.durable.DurableConfig;
66
import com.amazonaws.lambda.durable.exception.UnrecoverableDurableExecutionException;
7+
import com.amazonaws.lambda.durable.operation.BaseDurableOperation;
78
import java.time.Duration;
89
import java.util.Collections;
910
import java.util.HashMap;
1011
import java.util.List;
1112
import java.util.Map;
1213
import java.util.concurrent.CompletableFuture;
13-
import java.util.concurrent.Phaser;
1414
import java.util.concurrent.atomic.AtomicReference;
1515
import java.util.stream.Collectors;
1616
import org.slf4j.Logger;
@@ -28,8 +28,8 @@
2828
* <ul>
2929
* <li>Execution state (operations, checkpoint token)
3030
* <li>Thread lifecycle (registration/deregistration)
31-
* <li>Phaser management (coordination)
32-
* <li>Checkpoint batching (via CheckpointManager)
31+
* <li>Checkpoint batching (via CheckpointBatcher)
32+
* <li>Checkpoint result handling (CheckpointBatcher callback)
3333
* <li>Polling (for waits and retries)
3434
* </ul>
3535
*
@@ -43,15 +43,16 @@ public class ExecutionManager {
4343
private static final Logger logger = LoggerFactory.getLogger(ExecutionManager.class);
4444

4545
// ===== Execution State =====
46-
private final Map<String, Operation> operations;
46+
private final Map<String, Operation> operationStorage;
4747
private final String executionOperationId;
4848
private final String durableExecutionArn;
4949
private final AtomicReference<ExecutionMode> executionMode;
5050

5151
// ===== Thread Coordination =====
52+
private final Map<String, BaseDurableOperation<?>> registeredOperations =
53+
Collections.synchronizedMap(new HashMap<>());
5254
private final Map<String, ThreadType> activeThreads = Collections.synchronizedMap(new HashMap<>());
5355
private static final ThreadLocal<OperationContext> currentContext = new ThreadLocal<>();
54-
private final Map<String, Phaser> openPhasers = Collections.synchronizedMap(new HashMap<>());
5556
private final CompletableFuture<Void> executionExceptionFuture = new CompletableFuture<>();
5657

5758
// ===== Checkpoint Batching =====
@@ -69,12 +70,12 @@ public ExecutionManager(
6970
this.checkpointBatcher =
7071
new CheckpointBatcher(config, durableExecutionArn, checkpointToken, this::onCheckpointComplete);
7172

72-
this.operations = checkpointBatcher.fetchAllPages(initialExecutionState).stream()
73+
this.operationStorage = checkpointBatcher.fetchAllPages(initialExecutionState).stream()
7374
.collect(Collectors.toConcurrentMap(Operation::id, op -> op));
7475

7576
// Start in REPLAY mode if we have more than just the initial EXECUTION operation
7677
this.executionMode =
77-
new AtomicReference<>(operations.size() > 1 ? ExecutionMode.REPLAY : ExecutionMode.EXECUTION);
78+
new AtomicReference<>(operationStorage.size() > 1 ? ExecutionMode.REPLAY : ExecutionMode.EXECUTION);
7879
}
7980

8081
// ===== State Management =====
@@ -87,24 +88,22 @@ public boolean isReplaying() {
8788
return executionMode.get() == ExecutionMode.REPLAY;
8889
}
8990

91+
public void registerOperation(BaseDurableOperation<?> operation) {
92+
registeredOperations.put(operation.getOperationId(), operation);
93+
}
94+
9095
// ===== Checkpoint Completion Handler =====
91-
/** Called by CheckpointManager when a checkpoint completes. Updates state and advances phasers. */
96+
/** Called by CheckpointManager when a checkpoint completes. Updates operationStorage and notify operations . */
9297
private void onCheckpointComplete(List<Operation> newOperations) {
93-
// Update operation storage
94-
newOperations.forEach(op -> operations.put(op.id(), op));
95-
96-
// Advance phasers for completed operations
97-
for (Operation operation : newOperations) {
98-
if (openPhasers.containsKey(operation.id()) && isTerminalStatus(operation.status())) {
99-
var phaser = openPhasers.get(operation.id());
100-
101-
// Two-phase completion
102-
logger.trace("Advancing phaser 0 -> 1: {}", phaser);
103-
phaser.arriveAndAwaitAdvance();
104-
logger.trace("Advancing phaser 1 -> 2: {}", phaser);
105-
phaser.arriveAndAwaitAdvance();
106-
}
107-
}
98+
newOperations.forEach(op -> {
99+
// Update operation storage
100+
operationStorage.put(op.id(), op);
101+
// call registered operation's onCheckpointComplete method for completed operations
102+
registeredOperations.computeIfPresent(op.id(), (id, operation) -> {
103+
operation.onCheckpointComplete(op);
104+
return operation;
105+
});
106+
});
108107
}
109108

110109
/**
@@ -115,7 +114,7 @@ private void onCheckpointComplete(List<Operation> newOperations) {
115114
* @return the existing operation, or null if not found (first execution)
116115
*/
117116
public Operation getOperationAndUpdateReplayState(String operationId) {
118-
var existing = operations.get(operationId);
117+
var existing = operationStorage.get(operationId);
119118
if (executionMode.get() == ExecutionMode.REPLAY) {
120119
if (existing == null || !isTerminalStatus(existing.status())) {
121120
if (executionMode.compareAndSet(ExecutionMode.REPLAY, ExecutionMode.EXECUTION)) {
@@ -127,7 +126,7 @@ public Operation getOperationAndUpdateReplayState(String operationId) {
127126
}
128127

129128
public Operation getExecutionOperation() {
130-
return operations.get(executionOperationId);
129+
return operationStorage.get(executionOperationId);
131130
}
132131

133132
// ===== Thread Coordination =====
@@ -185,15 +184,6 @@ public void deregisterActiveThreadAndUnsetCurrentContext(String threadId) {
185184
}
186185
}
187186

188-
// ===== Phaser Management =====
189-
190-
public Phaser startPhaser(String operationId) {
191-
var phaser = new Phaser(1);
192-
openPhasers.put(operationId, phaser);
193-
logger.trace("Started phaser for operation '{}'", operationId);
194-
return phaser;
195-
}
196-
197187
// ===== Checkpointing =====
198188

199189
// This method will checkpoint the operation updates to the durable backend and return a future which completes

sdk/src/main/java/com/amazonaws/lambda/durable/execution/ExecutionPhase.java

Lines changed: 0 additions & 18 deletions
This file was deleted.

sdk/src/main/java/com/amazonaws/lambda/durable/execution/InternalExecutor.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,8 @@
99
/**
1010
* Shared executor for internal SDK coordination tasks.
1111
*
12-
* <p>This executor is used for SDK-internal operations such as checkpoint batching, polling for wait completion, and
13-
* phaser coordination. It is separate from the user-configured executor (via {@code DurableConfig}) which runs
14-
* user-defined operations.
12+
* <p>This executor is used for SDK-internal operations such as checkpoint batching. It is separate from the
13+
* user-configured executor (via {@code DurableConfig}) which runs user-defined operations.
1514
*
1615
* <p>Using a dedicated thread pool ensures SDK coordination tasks are isolated from user code.
1716
*/

sdk/src/main/java/com/amazonaws/lambda/durable/operation/BaseDurableOperation.java

Lines changed: 67 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,17 @@
99
import com.amazonaws.lambda.durable.exception.SerDesException;
1010
import com.amazonaws.lambda.durable.exception.UnrecoverableDurableExecutionException;
1111
import com.amazonaws.lambda.durable.execution.ExecutionManager;
12-
import com.amazonaws.lambda.durable.execution.ExecutionPhase;
1312
import com.amazonaws.lambda.durable.execution.ThreadType;
1413
import com.amazonaws.lambda.durable.serde.SerDes;
1514
import com.amazonaws.lambda.durable.util.ExceptionHelper;
1615
import java.time.Duration;
1716
import java.util.Objects;
1817
import java.util.concurrent.CompletableFuture;
19-
import java.util.concurrent.Phaser;
2018
import org.slf4j.Logger;
2119
import org.slf4j.LoggerFactory;
2220
import software.amazon.awssdk.services.lambda.model.ErrorObject;
2321
import software.amazon.awssdk.services.lambda.model.Operation;
22+
import software.amazon.awssdk.services.lambda.model.OperationStatus;
2423
import software.amazon.awssdk.services.lambda.model.OperationType;
2524
import software.amazon.awssdk.services.lambda.model.OperationUpdate;
2625

@@ -39,7 +38,7 @@
3938
* <ul>
4039
* <li>Starting multiple async operations quickly
4140
* <li>Blocking on results later when needed
42-
* <li>Proper thread coordination via Phasers
41+
* <li>Proper thread coordination via future
4342
* </ul>
4443
*/
4544
public abstract class BaseDurableOperation<T> implements DurableFuture<T> {
@@ -51,7 +50,7 @@ public abstract class BaseDurableOperation<T> implements DurableFuture<T> {
5150
private final ExecutionManager executionManager;
5251
private final TypeToken<T> resultTypeToken;
5352
private final SerDes resultSerDes;
54-
private final Phaser phaser;
53+
private final CompletableFuture<Void> completionFuture;
5554

5655
public BaseDurableOperation(
5756
String operationId,
@@ -67,8 +66,10 @@ public BaseDurableOperation(
6766
this.resultTypeToken = resultTypeToken;
6867
this.resultSerDes = resultSerDes;
6968

70-
// todo: phaser could be used only in ExecutionManager and invisible from operations.
71-
this.phaser = executionManager.startPhaser(operationId);
69+
this.completionFuture = new CompletableFuture<>();
70+
71+
// register this operation in ExecutionManager so that the operation can receive updates from ExecutionManager
72+
executionManager.registerOperation(this);
7273
}
7374

7475
/** Gets the unique identifier for this operation. */
@@ -96,7 +97,7 @@ public OperationType getType() {
9697
*
9798
* <ul>
9899
* <li>Thread deregistration (allows suspension)
99-
* <li>Phaser blocking (waits for operation to complete)
100+
* <li>Blocking (waits for operation to complete)
100101
* <li>Thread reactivation (resumes execution)
101102
* <li>Result retrieval
102103
* </ul>
@@ -116,7 +117,7 @@ protected Operation getOperation() {
116117
}
117118

118119
/**
119-
* check if it's called from a Step.
120+
* Checks if it's called from a Step.
120121
*
121122
* @throws IllegalDurableOperationException if it's in a step
122123
*/
@@ -131,43 +132,43 @@ private void validateCurrentThreadType() {
131132
}
132133
}
133134

134-
// phase control utilities
135+
/** Checks if this operation is completed */
136+
protected boolean isOperationCompleted() {
137+
return completionFuture.isDone();
138+
}
139+
140+
/** Waits for the operation to complete and suspends the execution if no active thread is running */
135141
protected Operation waitForOperationCompletion() {
136142

137143
validateCurrentThreadType();
138144

139-
// register to prevent the state from advancing
140-
phaser.register();
141-
142-
// If we are in a replay where the operation is already complete (SUCCEEDED /
143-
// FAILED), the Phaser will be
144-
// advanced in .execute() already and we don't block but return the result
145-
// immediately.
146-
if (phaser.getPhase() == ExecutionPhase.RUNNING.getValue()) {
147-
// Operation not done yet
148-
var context = executionManager.getCurrentContext();
149-
// Deregister current context - allows suspension
150-
logger.debug(
151-
"get() on {} attempting to deregister context: {}",
152-
getType(),
153-
executionManager.getCurrentContext().contextId());
154-
deregisterActiveThreadAndUnsetCurrentContext(context.contextId());
155-
156-
// Block until operation completes
157-
logger.trace("Waiting for operation to finish {} (Phaser: {})", getOperationId(), phaser);
158-
phaser.arriveAndAwaitAdvance();
159-
160-
// Reactivate current context
161-
registerActiveThread(context.contextId(), context.threadType());
162-
setCurrentContext(context.contextId(), context.threadType());
163-
164-
// Complete phase 1
165-
phaser.arriveAndDeregister();
166-
} else {
167-
// The phaser is already completed. Deregister now.
168-
phaser.arriveAndDeregister();
145+
var context = executionManager.getCurrentContext();
146+
147+
// Use a synchronized block here to prevent the completionFuture from being completed by the execution thread
148+
// (a step or child context thread) when it's inside the `if` block where the completion check is done (not
149+
// completed) while the callback isn't added to the completionFuture or the current thread isn't deregistered.
150+
synchronized (completionFuture) {
151+
if (!isOperationCompleted()) {
152+
// Operation not done yet
153+
logger.debug("get() on {} attempting to deregister context: {}", getType(), context.contextId());
154+
155+
// Add a callback to completionFuture so that when the completionFuture is completed,
156+
// it will register the current Context thread synchronously to make sure it is always registered
157+
// before the execution thread (Step or child context) is deregistered.
158+
completionFuture.thenRun(() -> registerActiveThread(context.contextId(), context.threadType()));
159+
160+
// Deregister the current thread to allow suspension
161+
deregisterActiveThreadAndUnsetCurrentContext(context.contextId());
162+
}
169163
}
170164

165+
// Block until operation completes. No-op if the future is already completed.
166+
logger.trace("Waiting for operation to finish {} ({})", getOperationId(), completionFuture);
167+
completionFuture.join();
168+
169+
// Reactivate current context. No-op if this is called twice.
170+
setCurrentContext(context.contextId(), context.threadType());
171+
171172
// Get result based on status
172173
var op = getOperation();
173174
if (op == null) {
@@ -177,13 +178,27 @@ protected Operation waitForOperationCompletion() {
177178
return op;
178179
}
179180

181+
/** Receives operation updates from ExecutionManager and updates the internal state of the operation */
182+
public void onCheckpointComplete(Operation operation) {
183+
if (isTerminalStatus(operation.status())) {
184+
synchronized (completionFuture) {
185+
logger.trace("In onCheckpointComplete, completing operation {} ({})", operationId, completionFuture);
186+
completionFuture.complete(null);
187+
}
188+
}
189+
}
190+
191+
/** Marks the operation as already completed (in replay). */
180192
protected void markAlreadyCompleted() {
181-
// Operation is already completed in a relay. We advance and deregister from the Phaser
182-
// so that get method doesn't block and returns the result immediately.
183-
logger.trace("Detected terminal status during replay. Advancing phaser 0 -> 1 {}.", phaser);
184-
phaser.arriveAndDeregister(); // Phase 0 -> 1
193+
// When the operation is already completed in a replay, we complete completionFuture immediately
194+
// so that the `get` method will be unblocked and the context thread will be registered
195+
logger.trace("In markAlreadyCompleted, completing operation: {} ({}).", operationId, completionFuture);
196+
synchronized (completionFuture) {
197+
completionFuture.complete(null);
198+
}
185199
}
186200

201+
// terminate the execution
187202
protected T terminateExecution(UnrecoverableDurableExecutionException exception) {
188203
executionManager.terminateExecution(exception);
189204
// Exception is already thrown from above. Keep the throw statement below to make tests happy
@@ -194,7 +209,7 @@ protected T terminateExecutionWithIllegalDurableOperationException(String messag
194209
return terminateExecution(new IllegalDurableOperationException(message));
195210
}
196211

197-
// advanced phase control used by Step only
212+
// advanced thread and context control
198213
protected void deregisterActiveThreadAndUnsetCurrentContext(String threadId) {
199214
executionManager.deregisterActiveThreadAndUnsetCurrentContext(threadId);
200215
}
@@ -298,4 +313,12 @@ protected void validateReplay(Operation checkpointed) {
298313
operationId, checkpointed.name(), getName())));
299314
}
300315
}
316+
317+
private boolean isTerminalStatus(OperationStatus status) {
318+
return status == OperationStatus.SUCCEEDED
319+
|| status == OperationStatus.FAILED
320+
|| status == OperationStatus.CANCELLED
321+
|| status == OperationStatus.TIMED_OUT
322+
|| status == OperationStatus.STOPPED;
323+
}
301324
}

sdk/src/main/java/com/amazonaws/lambda/durable/operation/CallbackOperation.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ public void execute() {
5252

5353
switch (existing.status()) {
5454
case SUCCEEDED, FAILED, TIMED_OUT -> {
55-
// Terminal state - complete phaser immediately
55+
// Terminal state - complete the operation immediately
5656
markAlreadyCompleted();
5757
return;
5858
}

0 commit comments

Comments
 (0)