Skip to content

Commit e15a2c5

Browse files
sameh-faroukclaude
andauthored
fix(bridge): quarantine undeliverable refunds instead of crashlooping (#1092)
A refund whose Stellar target is permanently undeliverable (e.g. the target removed its trustline) caused submitTransaction to return a fatal error, exiting the bridge. The on-chain on_finalize retry then re-emitted the refund every RetryInterval blocks, producing an endless crash loop (observed: 10k+ restarts over ~40 days). Make refund submission non-fatal: - Classify Stellar payment result codes. Target-side, bridge-unresolvable failures (op_no_trust, op_no_destination, op_not_authorized) are forfeited: the refund is marked executed on chain to stop the retry loop, with a refund_quarantined alert. - All other submit failures (incl. op_line_full and transient codes) are postponed and retried via the on-chain re-emit, matching the existing withdraw path. - op_underfunded (bridge wallet empty) raises a bridge_account_underfunded vault alert at submit time, covering both refund and withdraw paths. Adds unit tests for the result-code classifiers and documents the two new structured-logging events. Refs #1089 Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 7722f4f commit e15a2c5

5 files changed

Lines changed: 311 additions & 3 deletions

File tree

bridge/docs/observability.md

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ For example, if a customer is complaining that their deposit never bridged, you
6060
- `refund_skipped`: a refund request skipped by the bridge instance as it has already been refunded.
6161
- `refund_proposed`: a refund has proposed or signed by the bridge instance.
6262
- `refund_postponed`: a refund has postponed due to a problem in sending this transaction to the stellar network and will be retried later.
63+
- `refund_quarantined`: a refund is permanently undeliverable to the target account (e.g. the target removed its trustline, the account no longer exists, or it is not authorized to hold the asset) and has been marked executed on chain so the bridge stops retrying it. The refunded TFT is forfeited.
6364
- `refund_completed`: a refund has completed and received on the target stellar account.
6465

6566
##### Withdraw related
@@ -74,6 +75,7 @@ For example, if a customer is complaining that their deposit never bridged, you
7475
##### Bridge vault account related
7576
- `payment_received` : This event represents successful payment to the bridge account (a deposit).
7677
- `stellar_transaction_submitted` : This event represents successful transaction from the bridge account (a refund or a withdraw).
78+
- `bridge_account_underfunded` : The bridge account has insufficient funds to submit a transaction. The transaction is retried, but the bridge wallet must be refilled by an operator.
7779

7880
### Metrics:
7981

@@ -659,6 +661,38 @@ the source field set contains all fields which are included in the source object
659661
</tbody>
660662
</table>
661663

664+
##### refund_quarantined
665+
666+
- kind: alert
667+
668+
- category: refund
669+
670+
<table>
671+
<thead>
672+
<tr><th colspan="4"><div>refund_quarantined Event Properties</div></th></tr>
673+
<tr>
674+
<th>Property</th>
675+
<th>Type</th>
676+
<th>Required</th>
677+
<th>Description</th>
678+
</tr>
679+
</thead>
680+
<tbody>
681+
<tr>
682+
<td><code>target</code></td>
683+
<td>string</td>
684+
<td>yes</td>
685+
<td>The stellar target account the refund could not be delivered to. </td>
686+
</tr>
687+
<tr>
688+
<td><code>amount</code></td>
689+
<td>number</td>
690+
<td>yes</td>
691+
<td>The forfeited refund amount in stroops. </td>
692+
</tr>
693+
</tbody>
694+
</table>
695+
662696
##### event_burn_tx_created_received
663697

664698
- kind: event
@@ -908,6 +942,29 @@ the source field set contains all fields which are included in the source object
908942
</tbody>
909943
</table>
910944

945+
##### bridge_account_underfunded
946+
947+
- kind: alert
948+
949+
- category: vault
950+
951+
<table>
952+
<thead>
953+
<tr><th colspan="4"><div>bridge_account_underfunded Event Properties</div></th></tr>
954+
<tr>
955+
<th>Property</th>
956+
<th>Type</th>
957+
<th>Required</th>
958+
<th>Description</th>
959+
</tr>
960+
</thead>
961+
<tbody>
962+
<tr>
963+
<td colspan="4"><div>No metadata</div></td>
964+
</tr>
965+
</tbody>
966+
</table>
967+
911968
##### wallet_balance
912969

913970
- kind: metric

bridge/tfchain_bridge/pkg/bridge/refund.go

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,43 @@ func (bridge *Bridge) handleRefundReady(ctx context.Context, refundReadyEvent su
100100

101101
// Todo, retry here?
102102
if err = bridge.wallet.CreateRefundPaymentWithSignaturesAndSubmit(ctx, refund.Target, uint64(refund.Amount), refund.TxHash, refund.Signatures, int64(refund.SequenceNumber)); err != nil {
103-
return err
103+
// A refund submission must never crash the bridge. A single failure would
104+
// otherwise propagate up to a fatal exit, and the on-chain on_finalize
105+
// retry would re-emit the event every RetryInterval blocks, producing an
106+
// endless crash loop.
107+
if errors.Is(err, pkg.ErrStellarTransactionUndeliverable) {
108+
// Target-side, bridge-unresolvable (e.g. the target removed its Stellar
109+
// trustline). It can never be delivered, so quarantine it by marking it
110+
// executed on chain to stop the on-chain retries. The refunded TFT is
111+
// forfeited.
112+
logger.Warn().
113+
Err(err).
114+
Str("event_action", "refund_quarantined").
115+
Str("event_kind", "alert").
116+
Str("category", "refund").
117+
Dict("metadata", zerolog.Dict().
118+
Str("target", refund.Target).
119+
Uint64("amount", uint64(refund.Amount))).
120+
Msg("the refund is permanently undeliverable to the target account and has been quarantined; the bridge will continue")
121+
122+
if execErr := bridge.subClient.RetrySetRefundTransactionExecutedTx(ctx, refund.TxHash); execErr != nil {
123+
return execErr
124+
}
125+
return nil
126+
}
127+
128+
// Any other failure (transient network/sequence issue, or a recoverable
129+
// target condition such as op_line_full) is postponed: log and continue,
130+
// relying on the on-chain on_finalize retry to re-emit the event so we try
131+
// again later. Matches handleWithdrawReady's behaviour.
132+
logger.Info().
133+
Str("event_action", "refund_postponed").
134+
Str("event_kind", "event").
135+
Str("category", "refund").
136+
Dict("metadata", zerolog.Dict().
137+
Str("reason", err.Error())).
138+
Msg("the refund has been postponed due to a problem submitting the transaction to the Stellar network; it will be retried")
139+
return nil
104140
}
105141

106142
err = bridge.subClient.RetrySetRefundTransactionExecutedTx(ctx, refund.TxHash)

bridge/tfchain_bridge/pkg/config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,9 @@ var ErrTransactionAlreadyRefunded = errors.New("transaction is already refunded"
3030
var ErrTransactionAlreadyMinted = errors.New("transaction is already minted")
3131
var ErrTransactionAlreadyBurned = errors.New("transaction is already burned")
3232
var ErrNoSignatures = errors.New("transaction has no signatures")
33+
34+
// ErrStellarTransactionUndeliverable signals that a Stellar submission failed
35+
// with a permanent, target-side error (e.g. the destination removed its
36+
// trustline) that can never succeed on retry. The refund must be quarantined
37+
// and the bridge must continue, rather than crashing in a retry loop.
38+
var ErrStellarTransactionUndeliverable = errors.New("stellar transaction permanently undeliverable to target account")

bridge/tfchain_bridge/pkg/stellar/stellar.go

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,58 @@ func (w *StellarWallet) createTransaction(ctx context.Context, txn txnbuild.Tran
276276
return tx, nil
277277
}
278278

279+
// terminalSubmitOperationCodes are Horizon payment operation result codes that
280+
// represent a target-side failure the bridge can never resolve on its own:
281+
// delivery requires a deliberate action by the destination owner (add a
282+
// trustline, create the account, get issuer authorization) that the bridge has
283+
// no visibility into. A refund hitting one of these is forfeited rather than
284+
// retried forever.
285+
//
286+
// Deliberately excluded:
287+
// - op_line_full: the destination CAN receive the asset but is momentarily at
288+
// its trustline limit; it may succeed once the balance is spent down, so it
289+
// is postponed and retried rather than forfeited.
290+
// - op_underfunded / op_src_no_trust / op_src_not_authorized: bridge-side
291+
// (source) problems that must alert/halt, never forfeit a user's refund.
292+
// - op_malformed / op_no_issuer: bridge bug or global asset misconfiguration
293+
// affecting all transactions, not a per-target condition.
294+
var terminalSubmitOperationCodes = map[string]struct{}{
295+
"op_no_trust": {}, // destination has no trustline for the asset
296+
"op_no_destination": {}, // destination account does not exist
297+
"op_not_authorized": {}, // destination is not authorized to hold the asset
298+
}
299+
300+
// isTerminalSubmitError reports whether a Stellar submission failure is a
301+
// target-side error that the bridge cannot resolve by retrying, and so should
302+
// be forfeited rather than postponed.
303+
func isTerminalSubmitError(codes *hProtocol.TransactionResultCodes) bool {
304+
if codes == nil {
305+
return false
306+
}
307+
for _, op := range codes.OperationCodes {
308+
if _, ok := terminalSubmitOperationCodes[op]; ok {
309+
return true
310+
}
311+
}
312+
return false
313+
}
314+
315+
// isInsufficientFundsError reports whether a Stellar submission failed because
316+
// the bridge (source) account is out of funds. This is an operational
317+
// condition that requires the bridge wallet to be refilled; it is not the
318+
// target's fault and the transaction must not be forfeited.
319+
func isInsufficientFundsError(codes *hProtocol.TransactionResultCodes) bool {
320+
if codes == nil {
321+
return false
322+
}
323+
for _, op := range codes.OperationCodes {
324+
if op == "op_underfunded" {
325+
return true
326+
}
327+
}
328+
return false
329+
}
330+
279331
func (w *StellarWallet) submitTransaction(ctx context.Context, txn *txnbuild.Transaction) error {
280332
client, err := w.getHorizonClient()
281333
if err != nil {
@@ -287,8 +339,26 @@ func (w *StellarWallet) submitTransaction(ctx context.Context, txn *txnbuild.Tra
287339
if err != nil {
288340
log.Info().Msg(err.Error())
289341
if hError, ok := err.(*horizonclient.Error); ok {
290-
if ok {
291-
log.Err(err).Msgf("error while submitting transaction %+v", hError.Problem.Extras)
342+
log.Err(err).Msgf("error while submitting transaction %+v", hError.Problem.Extras)
343+
if codes, rcErr := hError.ResultCodes(); rcErr == nil {
344+
// A target-side failure (e.g. the destination removed its trustline)
345+
// can never succeed on retry. Surface it as a typed error so the
346+
// caller can quarantine the transaction instead of retrying forever.
347+
// The account sequence is irrelevant here, so don't reset it.
348+
if isTerminalSubmitError(codes) {
349+
return errors.Wrapf(pkg.ErrStellarTransactionUndeliverable, "operation result codes %v", codes.OperationCodes)
350+
}
351+
// The bridge account is out of funds. This is recoverable (the
352+
// transaction will be retried) but needs operator attention to refill
353+
// the bridge wallet, so raise an alert rather than only postponing.
354+
if isInsufficientFundsError(codes) {
355+
log.Warn().
356+
Str("trace_id", fmt.Sprint(ctx.Value(TraceIdKey{}))).
357+
Str("event_action", "bridge_account_underfunded").
358+
Str("event_kind", "alert").
359+
Str("category", "vault").
360+
Msg("the bridge account has insufficient funds to submit the transaction; it will be retried but the bridge wallet must be refilled")
361+
}
292362
}
293363
}
294364
errSequence := w.resetAccountSequence()
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
package stellar
2+
3+
import (
4+
"testing"
5+
6+
hProtocol "github.com/stellar/go/protocols/horizon"
7+
)
8+
9+
func TestIsTerminalSubmitError(t *testing.T) {
10+
tests := []struct {
11+
name string
12+
codes *hProtocol.TransactionResultCodes
13+
want bool
14+
}{
15+
{
16+
name: "nil codes are not terminal",
17+
codes: nil,
18+
want: false,
19+
},
20+
{
21+
name: "op_no_trust is terminal (target removed trustline)",
22+
codes: &hProtocol.TransactionResultCodes{
23+
TransactionCode: "tx_failed",
24+
OperationCodes: []string{"op_no_trust"},
25+
},
26+
want: true,
27+
},
28+
{
29+
name: "op_no_destination is terminal (target account gone)",
30+
codes: &hProtocol.TransactionResultCodes{
31+
TransactionCode: "tx_failed",
32+
OperationCodes: []string{"op_no_destination"},
33+
},
34+
want: true,
35+
},
36+
{
37+
name: "op_not_authorized is terminal (target not authorized to hold the asset)",
38+
codes: &hProtocol.TransactionResultCodes{
39+
TransactionCode: "tx_failed",
40+
OperationCodes: []string{"op_not_authorized"},
41+
},
42+
want: true,
43+
},
44+
{
45+
name: "op_line_full is NOT terminal (target trustline momentarily full; may succeed on retry - postpone, do not forfeit)",
46+
codes: &hProtocol.TransactionResultCodes{
47+
TransactionCode: "tx_failed",
48+
OperationCodes: []string{"op_line_full"},
49+
},
50+
want: false,
51+
},
52+
{
53+
name: "op_underfunded is NOT terminal (bridge wallet out of funds - operational, must alert not forfeit)",
54+
codes: &hProtocol.TransactionResultCodes{
55+
TransactionCode: "tx_failed",
56+
OperationCodes: []string{"op_underfunded"},
57+
},
58+
want: false,
59+
},
60+
{
61+
name: "op_src_no_trust is NOT terminal (bridge-side trustline missing, not the target's fault)",
62+
codes: &hProtocol.TransactionResultCodes{
63+
TransactionCode: "tx_failed",
64+
OperationCodes: []string{"op_src_no_trust"},
65+
},
66+
want: false,
67+
},
68+
{
69+
name: "tx_bad_seq is NOT terminal (transient, resolved by sequence reset)",
70+
codes: &hProtocol.TransactionResultCodes{
71+
TransactionCode: "tx_bad_seq",
72+
OperationCodes: nil,
73+
},
74+
want: false,
75+
},
76+
{
77+
name: "terminal code mixed with others is still terminal",
78+
codes: &hProtocol.TransactionResultCodes{
79+
TransactionCode: "tx_failed",
80+
OperationCodes: []string{"op_success", "op_no_trust"},
81+
},
82+
want: true,
83+
},
84+
}
85+
86+
for _, tt := range tests {
87+
t.Run(tt.name, func(t *testing.T) {
88+
if got := isTerminalSubmitError(tt.codes); got != tt.want {
89+
t.Errorf("isTerminalSubmitError() = %v, want %v", got, tt.want)
90+
}
91+
})
92+
}
93+
}
94+
95+
func TestIsInsufficientFundsError(t *testing.T) {
96+
tests := []struct {
97+
name string
98+
codes *hProtocol.TransactionResultCodes
99+
want bool
100+
}{
101+
{
102+
name: "nil codes are not an underfunded condition",
103+
codes: nil,
104+
want: false,
105+
},
106+
{
107+
name: "op_underfunded means the bridge wallet is out of funds",
108+
codes: &hProtocol.TransactionResultCodes{
109+
TransactionCode: "tx_failed",
110+
OperationCodes: []string{"op_underfunded"},
111+
},
112+
want: true,
113+
},
114+
{
115+
name: "op_no_trust is a target-side failure, not underfunded",
116+
codes: &hProtocol.TransactionResultCodes{
117+
TransactionCode: "tx_failed",
118+
OperationCodes: []string{"op_no_trust"},
119+
},
120+
want: false,
121+
},
122+
{
123+
name: "op_underfunded mixed with others is still underfunded",
124+
codes: &hProtocol.TransactionResultCodes{
125+
TransactionCode: "tx_failed",
126+
OperationCodes: []string{"op_success", "op_underfunded"},
127+
},
128+
want: true,
129+
},
130+
}
131+
132+
for _, tt := range tests {
133+
t.Run(tt.name, func(t *testing.T) {
134+
if got := isInsufficientFundsError(tt.codes); got != tt.want {
135+
t.Errorf("isInsufficientFundsError() = %v, want %v", got, tt.want)
136+
}
137+
})
138+
}
139+
}

0 commit comments

Comments
 (0)