Skip to content

Commit ec6762c

Browse files
committed
* Implement RFC 8555 async order finalization to fix read timeout issue
RFC 8555 Section 7.4 explicitly defines "processing" as a distinct order state to decouple the finalize request (submit CSR) from certificate issuance (CA signing). This async handoff lets ACME servers back slow CAs without blocking the client. Current ACME implementation in step-ca skips this state which causes issues when an external CA sometimes takes a long time to return a signed certificate. With this change, FinalizeOrder now immediately returns status=processing with Retry-After instead of blocking until the external CA responds. This prevents certbot's 45s & cert-manager's 60s default read timeout from firing when certificate signing takes longer. Certbot's poll_finalization loop handles the processing state natively. esnet/acme-proxy#22 smallstep#2636
1 parent 8c05c7f commit ec6762c

5 files changed

Lines changed: 177 additions & 37 deletions

File tree

acme/api/order.go

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"encoding/base64"
77
"encoding/json"
88
"fmt"
9+
"log/slog"
910
"net"
1011
"net/http"
1112
"strings"
@@ -481,15 +482,53 @@ func FinalizeOrder(w http.ResponseWriter, r *http.Request) {
481482
}
482483

483484
ca := mustAuthority(ctx)
484-
if err = o.Finalize(ctx, db, fr.csr, ca, prov); err != nil {
485-
render.Error(w, r, acme.WrapErrorISE(err, "error finalizing order"))
485+
486+
// Validate the order is ready without signing yet.
487+
if err = o.UpdateStatus(ctx, db); err != nil {
488+
render.Error(w, r, acme.WrapErrorISE(err, "error updating order status"))
489+
return
490+
}
491+
if o.Status != acme.StatusReady {
492+
render.Error(w, r, acme.NewError(acme.ErrorOrderNotReadyType, "order %s is not ready", o.ID))
486493
return
487494
}
488495

496+
// Immediately mark as processing and return — RFC 8555 §7.4 async finalization.
497+
// This prevents ACME clients with short read timeouts (e.g. certbot's hardcoded
498+
// 45s DEFAULT_NETWORK_TIMEOUT) from timing out while a slow external CA signs.
499+
o.Status = acme.StatusProcessing
500+
if err = db.UpdateOrder(ctx, o); err != nil {
501+
render.Error(w, r, acme.WrapErrorISE(err, "error setting order to processing"))
502+
return
503+
}
489504
linker.LinkOrder(ctx, o)
490-
491505
w.Header().Set("Location", linker.GetLink(ctx, acme.OrderLinkType, o.ID))
506+
w.Header().Set("Retry-After", "15")
492507
render.JSON(w, r, o)
508+
509+
// Sign the certificate in the background. DB status drives client polling responses.
510+
// We capture orderID and csr rather than the order pointer to avoid data races with
511+
// the HTTP response still being flushed.
512+
orderID, csr := o.ID, fr.csr
513+
go func() {
514+
bgCtx := context.Background()
515+
// Re-fetch from DB to get a clean copy, then reset to ready so
516+
// Finalize()'s internal switch statement can proceed normally.
517+
bgOrder, err := db.GetOrder(bgCtx, orderID)
518+
if err != nil {
519+
slog.Error("async finalization: failed to re-fetch order", "order", orderID, "err", err)
520+
return
521+
}
522+
bgOrder.Status = acme.StatusReady
523+
if err := bgOrder.Finalize(bgCtx, db, csr, ca, prov); err != nil {
524+
slog.Error("async finalization failed", "order", orderID, "err", err)
525+
bgOrder.Status = acme.StatusInvalid
526+
_ = db.UpdateOrder(bgCtx, bgOrder)
527+
return
528+
}
529+
// On success, Finalize() sets StatusValid and calls db.UpdateOrder internally.
530+
slog.Info("async finalization succeeded", "order", orderID)
531+
}()
493532
}
494533

495534
// challengeTypes determines the types of challenges that should be used

acme/api/order_test.go

Lines changed: 73 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2424,10 +2424,11 @@ func TestHandler_FinalizeOrder(t *testing.T) {
24242424
assert.FatalError(t, err)
24252425

24262426
type test struct {
2427-
db acme.DB
2428-
ctx context.Context
2429-
statusCode int
2430-
err *acme.Error
2427+
db acme.DB
2428+
ctx context.Context
2429+
statusCode int
2430+
err *acme.Error
2431+
expectedStatus acme.Status // for success cases; empty means skip status assertion
24312432
}
24322433
var tests = map[string]func(t *testing.T) test{
24332434
"fail/no-account": func(t *testing.T) test {
@@ -2569,7 +2570,9 @@ func TestHandler_FinalizeOrder(t *testing.T) {
25692570
err: acme.NewError(acme.ErrorUnauthorizedType, "provisioner id mismatch"),
25702571
}
25712572
},
2572-
"fail/order-finalize-error": func(t *testing.T) test {
2573+
// Tests that an expired ready order fails at UpdateStatus, which tries to
2574+
// write StatusInvalid to the DB and gets an error back.
2575+
"fail/update-status-error": func(t *testing.T) test {
25732576
acc := &acme.Account{ID: "accountID"}
25742577
ctx := acme.NewProvisionerContext(context.Background(), prov)
25752578
ctx = context.WithValue(ctx, accContextKey, acc)
@@ -2594,7 +2597,8 @@ func TestHandler_FinalizeOrder(t *testing.T) {
25942597
err: acme.NewErrorISE("force"),
25952598
}
25962599
},
2597-
"ok": func(t *testing.T) test {
2600+
// Tests that a DB failure when writing the processing status returns 500.
2601+
"fail/db.UpdateOrder-processing-error": func(t *testing.T) test {
25982602
acc := &acme.Account{ID: "accountID"}
25992603
ctx := acme.NewProvisionerContext(context.Background(), prov)
26002604
ctx = context.WithValue(ctx, accContextKey, acc)
@@ -2604,30 +2608,66 @@ func TestHandler_FinalizeOrder(t *testing.T) {
26042608
db: &acme.MockDB{
26052609
MockGetOrder: func(ctx context.Context, id string) (*acme.Order, error) {
26062610
return &acme.Order{
2607-
ID: "orderID",
2608-
AccountID: "accountID",
2609-
ProvisionerID: fmt.Sprintf("acme/%s", prov.GetName()),
2610-
ExpiresAt: naf,
2611-
Status: acme.StatusValid,
2612-
AuthorizationIDs: []string{"foo", "bar", "baz"},
2613-
NotBefore: nbf,
2614-
NotAfter: naf,
2615-
Identifiers: []acme.Identifier{
2616-
{
2617-
Type: "dns",
2618-
Value: "example.com",
2619-
},
2620-
{
2621-
Type: "dns",
2622-
Value: "*.smallstep.com",
2623-
},
2624-
},
2625-
CertificateID: "certID",
2611+
ID: "orderID",
2612+
AccountID: "accountID",
2613+
ProvisionerID: fmt.Sprintf("acme/%s", prov.GetName()),
2614+
ExpiresAt: naf,
2615+
Status: acme.StatusReady,
26262616
}, nil
26272617
},
2618+
MockUpdateOrder: func(ctx context.Context, o *acme.Order) error {
2619+
return acme.NewErrorISE("force")
2620+
},
26282621
},
26292622
ctx: ctx,
2630-
statusCode: 200,
2623+
statusCode: 500,
2624+
err: acme.NewErrorISE("force"),
2625+
}
2626+
},
2627+
// Tests the happy path: a ready order immediately returns processing + Retry-After,
2628+
// and the background goroutine is short-circuited by an error on re-fetch.
2629+
"ok": func(t *testing.T) test {
2630+
acc := &acme.Account{ID: "accountID"}
2631+
ctx := acme.NewProvisionerContext(context.Background(), prov)
2632+
ctx = context.WithValue(ctx, accContextKey, acc)
2633+
ctx = context.WithValue(ctx, payloadContextKey, &payloadInfo{value: payloadBytes})
2634+
ctx = context.WithValue(ctx, chi.RouteCtxKey, chiCtx)
2635+
// firstCall acts as a one-shot token: the HTTP handler consumes it,
2636+
// the background goroutine gets the default error path and exits early.
2637+
firstCall := make(chan struct{}, 1)
2638+
firstCall <- struct{}{}
2639+
return test{
2640+
db: &acme.MockDB{
2641+
MockGetOrder: func(ctx context.Context, id string) (*acme.Order, error) {
2642+
select {
2643+
case <-firstCall:
2644+
return &acme.Order{
2645+
ID: "orderID",
2646+
AccountID: "accountID",
2647+
ProvisionerID: fmt.Sprintf("acme/%s", prov.GetName()),
2648+
ExpiresAt: naf,
2649+
Status: acme.StatusReady,
2650+
AuthorizationIDs: []string{"foo", "bar", "baz"},
2651+
NotBefore: nbf,
2652+
NotAfter: naf,
2653+
Identifiers: []acme.Identifier{
2654+
{Type: "dns", Value: "example.com"},
2655+
{Type: "dns", Value: "*.smallstep.com"},
2656+
},
2657+
}, nil
2658+
default:
2659+
// Second call is from the background goroutine; return an
2660+
// error to short-circuit it so it doesn't race with assertions.
2661+
return nil, acme.NewErrorISE("test: short-circuit goroutine")
2662+
}
2663+
},
2664+
MockUpdateOrder: func(ctx context.Context, o *acme.Order) error {
2665+
return nil
2666+
},
2667+
},
2668+
ctx: ctx,
2669+
statusCode: 200,
2670+
expectedStatus: acme.StatusProcessing,
26312671
}
26322672
},
26332673
}
@@ -2656,13 +2696,15 @@ func TestHandler_FinalizeOrder(t *testing.T) {
26562696
assert.Equals(t, ae.Subproblems, tc.err.Subproblems)
26572697
assert.Equals(t, res.Header["Content-Type"], []string{"application/problem+json"})
26582698
} else {
2659-
expB, err := json.Marshal(o)
2660-
assert.FatalError(t, err)
2661-
26622699
ro := new(acme.Order)
2663-
assert.FatalError(t, json.Unmarshal(body, ro))
2700+
assert.FatalError(t, json.Unmarshal(bytes.TrimSpace(body), ro))
26642701

2665-
assert.Equals(t, bytes.TrimSpace(body), expB)
2702+
if tc.expectedStatus != "" {
2703+
assert.Equals(t, ro.Status, tc.expectedStatus)
2704+
}
2705+
if tc.expectedStatus == acme.StatusProcessing {
2706+
assert.Equals(t, res.Header["Retry-After"], []string{"15"})
2707+
}
26662708
assert.Equals(t, res.Header["Location"], []string{u})
26672709
assert.Equals(t, res.Header["Content-Type"], []string{"application/json"})
26682710
}

acme/order.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,15 @@ func (o *Order) UpdateStatus(ctx context.Context, db DB) error {
8282
return nil
8383
case StatusValid:
8484
return nil
85+
case StatusProcessing:
86+
// Check expiry so orphaned orders (e.g. from a server restart mid-goroutine)
87+
// eventually transition to invalid rather than staying stuck forever.
88+
if now.After(o.ExpiresAt) {
89+
o.Status = StatusInvalid
90+
o.Error = NewError(ErrorMalformedType, "order has expired")
91+
break
92+
}
93+
return nil
8594
case StatusReady:
8695
// Check expiry
8796
if now.After(o.ExpiresAt) {

acme/order_test.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,55 @@ func TestOrder_UpdateStatus(t *testing.T) {
4848
o: o,
4949
}
5050
},
51+
"ok/already-processing": func(t *testing.T) test {
52+
now := clock.Now()
53+
o := &Order{
54+
Status: StatusProcessing,
55+
ExpiresAt: now.Add(5 * time.Minute),
56+
}
57+
return test{
58+
o: o,
59+
}
60+
},
61+
"ok/processing-expired": func(t *testing.T) test {
62+
now := clock.Now()
63+
o := &Order{
64+
ID: "oID",
65+
AccountID: "accID",
66+
Status: StatusProcessing,
67+
ExpiresAt: now.Add(-5 * time.Minute),
68+
}
69+
return test{
70+
o: o,
71+
db: &MockDB{
72+
MockUpdateOrder: func(ctx context.Context, updo *Order) error {
73+
assert.Equals(t, updo.ID, o.ID)
74+
assert.Equals(t, updo.AccountID, o.AccountID)
75+
assert.Equals(t, updo.Status, StatusInvalid)
76+
assert.Equals(t, updo.ExpiresAt, o.ExpiresAt)
77+
return nil
78+
},
79+
},
80+
}
81+
},
82+
"fail/processing-expired-db.UpdateOrder-error": func(t *testing.T) test {
83+
now := clock.Now()
84+
o := &Order{
85+
ID: "oID",
86+
AccountID: "accID",
87+
Status: StatusProcessing,
88+
ExpiresAt: now.Add(-5 * time.Minute),
89+
}
90+
return test{
91+
o: o,
92+
db: &MockDB{
93+
MockUpdateOrder: func(ctx context.Context, updo *Order) error {
94+
return errors.New("force")
95+
},
96+
},
97+
err: NewErrorISE("error updating order: force"),
98+
}
99+
},
51100
"fail/error-unexpected-status": func(t *testing.T) test {
52101
o := &Order{
53102
Status: "foo",

acme/status.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ var (
1414
StatusDeactivated = Status("deactivated")
1515
// StatusReady -- ready; e.g. for an Order that is ready to be finalized.
1616
StatusReady = Status("ready")
17-
//statusExpired = "expired"
18-
//statusActive = "active"
19-
//statusProcessing = "processing"
17+
// StatusProcessing -- processing; the certificate is being issued.
18+
StatusProcessing = Status("processing")
19+
//statusExpired = "expired"
20+
//statusActive = "active"
2021
)

0 commit comments

Comments
 (0)