Skip to content

Commit 70b133f

Browse files
sameh-farouk and claude committed
feat(bridge): crash-safe idempotency for withdraws and refunds
Adds a persistent idempotency store and startup reconciliation so a crash between submitting a Stellar payment and confirming it on TFChain is recovered without double-paying or double-confirming. Per-transaction submit is retained (batching is intentionally excluded/deferred).

- pkg/idempotency.go: bbolt-backed store tracking PROCESSING/COMPLETED state per withdraw (by burn tx id) and refund (by tx hash); refuses to downgrade COMPLETED back to PROCESSING. Reset() wipes the store.
- handleWithdrawReady / handleRefundReady: check the store first (skip if COMPLETED); on PROCESSING, look up Horizon for an existing outgoing tx and only complete the TFChain confirmation if found, otherwise mark PROCESSING, submit, confirm, then mark COMPLETED. The existing #1092 undeliverable-refund quarantine is preserved and now also marks the refund COMPLETED.
- Withdraw recovery: withdraw payments are now tagged with the burn tx id as a Stellar text memo (traceability + recovery by memo), with the account sequence number as a fallback for pre-memo submissions. The memo is part of the signed tx and is set identically at both build sites (CreatePaymentAndReturnSignature + CreatePaymentWithSignaturesAndSubmit). Refund recovery uses the existing MemoReturn hash, sequence as fallback.
- reconcilePendingTransactions: runs once at startup to recover entries left PROCESSING by a previous run, using one Horizon page for all lookups.
- Event loop: process Ready events before Created/Expired (Ready submits time-sensitive Stellar signatures).
- Chain-reset safety: the store is chain-scoped (burn tx ids restart after a reset), so it is wiped via Reset() when started with RescanBridgeAccount (the same flag that zeroes the Stellar cursor).

Deployment note: the withdraw memo is part of the signed transaction, so all validators must run this version together. During a mixed-version rollout, withdraw submissions whose signature set spans both versions are rejected (tx_bad_auth) and postponed/retried — withdraws stall but do not crash or double-pay, and self-heal once all validators are upgraded. No runtime upgrade.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent a08dd29 commit 70b133f

7 files changed

Lines changed: 665 additions & 34 deletions

File tree

bridge/tfchain_bridge/go.mod

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,15 @@ require (
1515
github.com/rs/zerolog v1.26.0
1616
github.com/sirupsen/logrus v1.4.2 // indirect
1717
github.com/stellar/go v0.0.0-20210922122349-e6f322c047c5
18-
github.com/stretchr/objx v0.3.0 // indirect
18+
github.com/stretchr/objx v0.5.0 // indirect
1919
github.com/vedhavyas/go-subkey v1.0.3
2020
)
2121

2222
require (
2323
github.com/cenkalti/backoff/v4 v4.1.3
2424
github.com/hashicorp/go-retryablehttp v0.7.7
2525
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230607082553-5605bca61c79
26+
go.etcd.io/bbolt v1.3.9
2627
)
2728

2829
require (
@@ -50,7 +51,7 @@ require (
5051
github.com/rs/cors v1.8.2 // indirect
5152
github.com/segmentio/go-loggly v0.5.1-0.20171222203950-eb91657e62b2 // indirect
5253
github.com/stellar/go-xdr v0.0.0-20201028102745-f80a23dac78a // indirect
53-
github.com/stretchr/testify v1.7.2 // indirect
54+
github.com/stretchr/testify v1.8.1 // indirect
5455
golang.org/x/crypto v0.7.0 // indirect
5556
golang.org/x/sys v0.20.0 // indirect
5657
gopkg.in/natefinch/npipe.v2 v2.0.0-20160621034901-c1b8fa8bdcce // indirect

bridge/tfchain_bridge/go.sum

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -339,15 +339,17 @@ github.com/stellar/go-xdr v0.0.0-20201028102745-f80a23dac78a/go.mod h1:yoxyU/M8n
339339
github.com/stellar/throttled v2.2.3-0.20190823235211-89d75816f59d+incompatible/go.mod h1:7CJ23pXirXBJq45DqvO6clzTEGM/l1SfKrgrzLry8b4=
340340
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
341341
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
342-
github.com/stretchr/objx v0.3.0 h1:NGXK3lHquSN08v5vWalVI/L8XU9hdzE/G6xsrze47As=
343-
github.com/stretchr/objx v0.3.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
342+
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
343+
github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c=
344+
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
344345
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
345-
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
346346
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
347347
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
348348
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
349-
github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s=
350-
github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
349+
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
350+
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
351+
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
352+
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
351353
github.com/tklauser/go-sysconf v0.3.9 h1:JeUVdAOWhhxVcU6Eqr/ATFHgXk/mmiItdKeJPev3vTo=
352354
github.com/tklauser/go-sysconf v0.3.9/go.mod h1:11DU/5sG7UexIrp/O6g35hrWzu0JxlwQ3LSFUzyeuhs=
353355
github.com/tklauser/numcpus v0.2.2 h1:oyhllyrScuYI6g+h/zUvNXNp1wy7x8qQy3t/piefldA=
@@ -381,6 +383,8 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec
381383
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
382384
github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
383385
github.com/ziutek/mymysql v1.5.4/go.mod h1:LMSpPZ6DbqWFxNCHW77HeMg9I646SAhApZ/wKdgO/C0=
386+
go.etcd.io/bbolt v1.3.9 h1:8x7aARPEXiXbHmtUwAIv7eV2fQFHrLLavdiJ3uzJXoI=
387+
go.etcd.io/bbolt v1.3.9/go.mod h1:zaO32+Ti0PK1ivdPtgMESzuzL2VPoIG1PCQNvOdo/dE=
384388
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
385389
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
386390
go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
@@ -502,6 +506,8 @@ golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJ
502506
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
503507
golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
504508
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
509+
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
510+
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
505511
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
506512
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
507513
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=

bridge/tfchain_bridge/pkg/bridge/bridge.go

Lines changed: 155 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,15 @@ package bridge
22

33
import (
44
"context"
5+
"fmt"
56
"strconv"
67
"time"
78

9+
"github.com/centrifuge/go-substrate-rpc-client/v4/types"
810
"github.com/pkg/errors"
911
"github.com/rs/zerolog"
1012
"github.com/rs/zerolog/log"
13+
hProtocol "github.com/stellar/go/protocols/horizon"
1114
"github.com/threefoldtech/tfchain/bridge/tfchain_bridge/pkg"
1215
"github.com/threefoldtech/tfchain/bridge/tfchain_bridge/pkg/stellar"
1316
subpkg "github.com/threefoldtech/tfchain/bridge/tfchain_bridge/pkg/substrate"
@@ -26,6 +29,7 @@ type Bridge struct {
2629
blockPersistency *pkg.ChainPersistency
2730
config *pkg.BridgeConfig
2831
depositFee int64
32+
idempotency *pkg.IdempotencyStore
2933
}
3034

3135
func NewBridge(ctx context.Context, cfg pkg.BridgeConfig) (*Bridge, string, error) {
@@ -64,12 +68,33 @@ func NewBridge(ctx context.Context, cfg pkg.BridgeConfig) (*Bridge, string, erro
6468
return nil, "", err
6569
}
6670

71+
// Crash-safe idempotency store, kept alongside the block persistency file.
72+
// Records the PROCESSING/COMPLETED state of withdraws and refunds so that a
73+
// crash between submitting a Stellar payment and confirming it on TFChain is
74+
// recovered without double-paying or double-confirming.
75+
idempotency, err := pkg.NewIdempotencyStore(cfg.PersistencyFile + ".idem.db")
76+
if err != nil {
77+
return nil, "", errors.Wrap(err, "failed to open idempotency store")
78+
}
79+
80+
// The idempotency store is chain-scoped: withdraw keys are TFChain burn tx ids,
81+
// which restart from a low number after a chain reset and would otherwise collide
82+
// with stale COMPLETED entries, causing new withdraws to be wrongly skipped. The
83+
// rescan flag marks a fresh start (it also zeroes the Stellar cursor above), so
84+
// clear the store here too.
85+
if cfg.RescanBridgeAccount {
86+
if err := idempotency.Reset(); err != nil {
87+
return nil, "", errors.Wrap(err, "failed to reset idempotency store")
88+
}
89+
}
90+
6791
bridge := &Bridge{
6892
subClient: subClient,
6993
blockPersistency: blockPersistency,
7094
wallet: wallet,
7195
config: &cfg,
7296
depositFee: depositFee,
97+
idempotency: idempotency,
7398
}
7499
// stat deposit fee?
75100
return bridge, wallet.GetKeypair().Address(), nil
@@ -93,11 +118,25 @@ func (bridge *Bridge) preCheckBalance(ctx context.Context) error {
93118
}
94119

95120
func (bridge *Bridge) Start(ctx context.Context) error {
121+
// Close the idempotency store when Start returns.
122+
defer func() {
123+
if err := bridge.idempotency.Close(); err != nil {
124+
log.Warn().Err(err).Msg("failed to close idempotency store")
125+
}
126+
}()
127+
96128
// pre-check wallet balance
97129
if err := bridge.preCheckBalance(ctx); err != nil {
98130
return err
99131
}
100132

133+
// Crash recovery: reconcile any transactions left in PROCESSING state by a
134+
// previous run before we start consuming new events. A failure to read the pending
135+
// entries from the idempotency store aborts startup; Horizon/RPC failures are only logged.
136+
if err := bridge.reconcilePendingTransactions(ctx); err != nil {
137+
return errors.Wrap(err, "startup reconciliation failed")
138+
}
139+
101140
log.Info().
102141
Str("event_action", "bridge_started").
103142
Str("event_kind", "event").
@@ -148,6 +187,27 @@ func (bridge *Bridge) Start(ctx context.Context) error {
148187
if data.Err != nil {
149188
return errors.Wrap(data.Err, "failed to get tfchain events")
150189
}
190+
// Ready events are processed before Created/Expired events: a Ready
191+
// event submits a payment to Stellar whose signatures are time-sensitive
192+
// (they expire), so it must not wait behind proposal/expiry handling.
193+
for _, withdawReadyEvent := range data.Events.WithdrawReadyEvents {
194+
err := bridge.handleWithdrawReady(ctx, withdawReadyEvent)
195+
if err != nil {
196+
if errors.Is(err, pkg.ErrTransactionAlreadyBurned) {
197+
continue
198+
}
199+
return errors.Wrap(err, "an error occurred while handling WithdrawReadyEvents")
200+
}
201+
}
202+
for _, refundReadyEvent := range data.Events.RefundReadyEvents {
203+
err := bridge.handleRefundReady(ctx, refundReadyEvent)
204+
if err != nil {
205+
if errors.Is(err, pkg.ErrTransactionAlreadyRefunded) {
206+
continue
207+
}
208+
return errors.Wrap(err, "an error occurred while handling RefundReadyEvents")
209+
}
210+
}
151211
for _, withdrawCreatedEvent := range data.Events.WithdrawCreatedEvents {
152212
err := bridge.handleWithdrawCreated(ctx, withdrawCreatedEvent)
153213
if err != nil {
@@ -164,30 +224,12 @@ func (bridge *Bridge) Start(ctx context.Context) error {
164224
return errors.Wrap(err, "an error occurred while handling WithdrawExpiredEvents")
165225
}
166226
}
167-
for _, withdawReadyEvent := range data.Events.WithdrawReadyEvents {
168-
err := bridge.handleWithdrawReady(ctx, withdawReadyEvent)
169-
if err != nil {
170-
if errors.Is(err, pkg.ErrTransactionAlreadyBurned) {
171-
continue
172-
}
173-
return errors.Wrap(err, "an error occurred while handling WithdrawReadyEvents")
174-
}
175-
}
176227
for _, refundExpiredEvent := range data.Events.RefundExpiredEvents {
177228
err := bridge.handleRefundExpired(ctx, refundExpiredEvent)
178229
if err != nil {
179230
return errors.Wrap(err, "an error occurred while handling RefundExpiredEvents")
180231
}
181232
}
182-
for _, refundReadyEvent := range data.Events.RefundReadyEvents {
183-
err := bridge.handleRefundReady(ctx, refundReadyEvent)
184-
if err != nil {
185-
if errors.Is(err, pkg.ErrTransactionAlreadyRefunded) {
186-
continue
187-
}
188-
return errors.Wrap(err, "an error occurred while handling RefundReadyEvents")
189-
}
190-
}
191233
case data := <-stellarSub:
192234
if data.Err != nil {
193235
return errors.Wrap(data.Err, "failed to get stellar payments")
@@ -222,3 +264,98 @@ func (bridge *Bridge) Start(ctx context.Context) error {
222264
time.Sleep(1 * time.Second)
223265
}
224266
}
267+
268+
// reconcilePendingTransactions runs once at startup to recover transactions that a
269+
// previous run left in PROCESSING state — i.e. the Stellar payment may or may not
270+
// have been submitted before the bridge stopped. For each pending withdraw/refund it
271+
// looks for a matching outgoing Stellar transaction (by memo, falling back to the
272+
// sequence number for pre-memo submissions). If found, the funds already left the
273+
// bridge, so it only completes the TFChain confirmation and marks the entry COMPLETED.
274+
// If not found, the entry is left PROCESSING and will be retried when its Ready event
275+
// fires again. Horizon/RPC failures are logged and skipped so that a transient problem does
276+
// not stop the bridge from starting; only a failure to read the idempotency store is returned.
277+
func (bridge *Bridge) reconcilePendingTransactions(ctx context.Context) error {
278+
pendingWithdraws, err := bridge.idempotency.GetPendingWithdraws()
279+
if err != nil {
280+
return errors.Wrap(err, "failed to get pending withdraws")
281+
}
282+
pendingRefunds, err := bridge.idempotency.GetPendingRefunds()
283+
if err != nil {
284+
return errors.Wrap(err, "failed to get pending refunds")
285+
}
286+
287+
if len(pendingWithdraws) == 0 && len(pendingRefunds) == 0 {
288+
return nil
289+
}
290+
291+
log.Info().
292+
Int("pending_withdraws", len(pendingWithdraws)).
293+
Int("pending_refunds", len(pendingRefunds)).
294+
Msg("reconciling pending transactions from previous run")
295+
296+
// Fetch outgoing transactions once and reuse the page for all lookups, avoiding
297+
// one Horizon HTTP call per pending transaction.
298+
outgoingPage, err := bridge.wallet.FetchOutgoingTransactionsPage(ctx)
299+
if err != nil {
300+
// Non-fatal: pending txs are retried when their next Ready event fires.
301+
log.Warn().Err(err).Msg("failed to fetch Horizon transactions for reconciliation, pending transactions will retry on next event")
302+
outgoingPage = hProtocol.TransactionsPage{}
303+
}
304+
305+
for _, txID := range pendingWithdraws {
306+
// Recover by the text memo (burn tx id), falling back to the account sequence
307+
// number for payments submitted by a pre-memo bridge version.
308+
stellarTx := bridge.wallet.FindPaymentByMemoInPage(outgoingPage, fmt.Sprint(txID))
309+
if stellarTx == nil {
310+
burnTx, err := bridge.subClient.GetBurnTransaction(types.U64(txID))
311+
if err != nil {
312+
log.Warn().Err(err).Uint64("tx_id", txID).Msg("failed to get burn tx for sequence lookup during reconciliation")
313+
} else {
314+
stellarTx = bridge.wallet.FindPaymentBySequenceInPage(outgoingPage, int64(burnTx.SequenceNumber))
315+
}
316+
}
317+
318+
if stellarTx == nil {
319+
log.Info().Uint64("tx_id", txID).Msg("reconcile: no Stellar tx found by memo or sequence, will retry on next event")
320+
continue
321+
}
322+
323+
log.Info().Uint64("tx_id", txID).Msg("reconcile: found existing Stellar payment, completing TFChain confirmation")
324+
if err := bridge.subClient.RetrySetWithdrawExecuted(ctx, txID); err != nil {
325+
log.Warn().Err(err).Uint64("tx_id", txID).Msg("failed to set withdraw executed during reconciliation")
326+
continue
327+
}
328+
if err := bridge.idempotency.MarkWithdrawCompleted(txID); err != nil {
329+
log.Warn().Err(err).Uint64("tx_id", txID).Msg("failed to mark withdraw completed during reconciliation")
330+
}
331+
}
332+
333+
for _, txHash := range pendingRefunds {
334+
stellarTx := bridge.wallet.FindRefundByReturnHashInPage(outgoingPage, txHash)
335+
if stellarTx == nil {
336+
refundTx, err := bridge.subClient.GetRefundTransaction(txHash)
337+
if err != nil {
338+
log.Warn().Err(err).Str("tx_hash", txHash).Msg("failed to get refund tx for sequence lookup during reconciliation")
339+
} else {
340+
stellarTx = bridge.wallet.FindPaymentBySequenceInPage(outgoingPage, int64(refundTx.SequenceNumber))
341+
}
342+
}
343+
344+
if stellarTx == nil {
345+
log.Info().Str("tx_hash", txHash).Msg("reconcile: no Stellar refund found by return hash or sequence, will retry on next event")
346+
continue
347+
}
348+
349+
log.Info().Str("tx_hash", txHash).Msg("reconcile: found existing Stellar refund, completing TFChain confirmation")
350+
if err := bridge.subClient.RetrySetRefundTransactionExecutedTx(ctx, txHash); err != nil {
351+
log.Warn().Err(err).Str("tx_hash", txHash).Msg("failed to set refund executed during reconciliation")
352+
continue
353+
}
354+
if err := bridge.idempotency.MarkRefundCompleted(txHash); err != nil {
355+
log.Warn().Err(err).Str("tx_hash", txHash).Msg("failed to mark refund completed during reconciliation")
356+
}
357+
}
358+
359+
log.Info().Msg("reconciliation complete")
360+
return nil
361+
}

0 commit comments

Comments
 (0)