-
Notifications
You must be signed in to change notification settings - Fork 1
fix: prod robustness #68
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,17 @@ function sleep(ms) { | |
| return new Promise((r) => setTimeout(r, ms)); | ||
| } | ||
|
|
||
| // Dormant after this many consecutive task errors (thrown or agent-reported) | ||
| // with no successful task between. Prevents an orphaned bot from spinning | ||
| // against a dead token / stale key / misconfigured server forever. | ||
| // | ||
| // Only *task* errors count — top-up failures log but don't retire a bot, | ||
| // because a billing-portal outage simultaneously hitting all 20 bots would | ||
| // otherwise cause mass dormancy. If top-ups fail long enough that the bot | ||
| // runs out of credit, subsequent task failures (billing denials) will | ||
| // correctly increment this counter. | ||
| const MAX_CONSECUTIVE_TASK_ERRORS = 5; | ||
|
|
||
| export class VirtualBot { | ||
| #agent; | ||
| #keyPath; | ||
|
|
@@ -41,8 +52,10 @@ export class VirtualBot { | |
| #taskIndex = 0; | ||
| #tasksRun = 0; | ||
| #topUpsDone = 0; | ||
| #consecutiveTaskErrors = 0; | ||
| #nextTaskAt = 0; | ||
| #nextTopUpAt = 0; | ||
| #loopPromise = null; | ||
|
|
||
| constructor({ | ||
| keyPath, | ||
|
|
@@ -107,7 +120,13 @@ export class VirtualBot { | |
| const result = await this.#agent.executeTask(task); | ||
| this.#stats.addTaskResults([result], { persona: this.#persona.name }); | ||
| this.#tasksRun++; | ||
| if (result.error) { | ||
| this.#consecutiveTaskErrors++; | ||
| } else { | ||
| this.#consecutiveTaskErrors = 0; | ||
| } | ||
| } catch (err) { | ||
| this.#consecutiveTaskErrors++; | ||
| console.log( | ||
| `[EVENT] ${iso()} bot_task_crash partyId=${this.partyId?.slice(0, 40) || "unknown"} persona=${this.#persona.name} error=${JSON.stringify(err.message)}` | ||
| ); | ||
|
|
@@ -130,6 +149,7 @@ export class VirtualBot { | |
| amountCC: this.#persona.topUpAmountCC, | ||
| }); | ||
| } catch (err) { | ||
| // Intentionally does NOT bump the task-error counter; see constant comment. | ||
| console.log( | ||
| `[WARN] ${iso()} bot_topup_failed partyId=${this.partyId?.slice(0, 40) || "unknown"} persona=${this.#persona.name} error=${JSON.stringify(err.message)}` | ||
| ); | ||
|
|
@@ -160,10 +180,10 @@ export class VirtualBot { | |
| } | ||
|
|
||
| startLoop() { | ||
| if (this.#running) return; | ||
| if (this.#running) return this.#loopPromise; | ||
| this.#running = true; | ||
|
|
||
| (async () => { | ||
| this.#loopPromise = (async () => { | ||
| const now = Date.now(); | ||
| this.#scheduleNextTask(now); | ||
| this.#scheduleNextTopUp(now); | ||
|
|
@@ -185,13 +205,28 @@ export class VirtualBot { | |
| this.#goDormant("lifetime_cap"); | ||
| break; | ||
| } | ||
| if (this.#consecutiveTaskErrors >= MAX_CONSECUTIVE_TASK_ERRORS) { | ||
| this.#goDormant("task_error_threshold"); | ||
| break; | ||
| } | ||
| this.#scheduleNextTask(Date.now()); | ||
| } else if (fired >= this.#nextTopUpAt) { | ||
| await this.#runTopUp(); | ||
| if (this.#consecutiveTaskErrors >= MAX_CONSECUTIVE_TASK_ERRORS) { | ||
| this.#goDormant("task_error_threshold"); | ||
| break; | ||
| } | ||
| this.#scheduleNextTopUp(Date.now()); | ||
|
Comment on lines
+215
to
219
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This error threshold check is redundant in the top-up block. The this.#scheduleNextTopUp(Date.now()); |
||
| } | ||
| } | ||
| this.#running = false; | ||
| })(); | ||
|
|
||
| return this.#loopPromise; | ||
| } | ||
|
|
||
| /** Promise resolving when the loop has exited. Used by orchestrator.stop(). */ | ||
| get loopPromise() { | ||
| return this.#loopPromise; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment mentions taking a "snapshot" of the bot list, but the code iterates over
this.#botsdirectly. While safe in this specific implementation becauseb.stop()is synchronous and doesn't trigger immediate removal, it is better practice to actually snapshot the array (e.g., using the spread operator) to ensure the set of bots being signaled is isolated from any concurrent mutations to the collection during iteration.