Skip to content

Commit db8cecb

Browse files
committed
fix: retry HA Discovery TreeXML on C-Gate startup race
C-Gate accepts TCP connections on the command port before its project's networks are loaded, so the initial TREEXML query at bridge startup could return 401 "Network not found" and HA Discovery would silently give up. Devices never appeared in Home Assistant even though events flowed normally. HaDiscovery now retries failed TreeXML requests with exponential backoff (2s up to 60s, max 8 attempts), driven by both the 401 fast-fail and an 8s no-response watchdog. Once the retry limit is exhausted, the warning explains how to recover via cbus/write/<network>///gettree. Bumps to 1.8.1.
1 parent 8834f50 commit db8cecb

6 files changed

Lines changed: 293 additions & 6 deletions

File tree

homeassistant-addon/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ All notable changes to the C-Gate Web Bridge Home Assistant add-on will be docum
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [1.8.1] - 2026-05-04
9+
10+
### Fixed
11+
- **HA Discovery startup race**: C-Gate accepts TCP connections on the command port before its project's networks are loaded, so the initial `TREEXML` query could return `401 Network not found` and HA Discovery would silently give up — devices never appeared in Home Assistant even though events flowed normally. `HaDiscovery` now retries failed TreeXML requests with exponential backoff (2s → 60s, up to 8 attempts), driven both by the `401 Network not found` fast-fail and an 8s no-response watchdog. After the retry limit, a clear warning explains how to recover via `cbus/write/<network>///gettree`.
12+
813
## [1.8.0] - 2026-04-29
914

1015
### Changed

homeassistant-addon/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: "C-Gate Web Bridge"
2-
version: "1.8.0"
2+
version: "1.8.1"
33
slug: cgateweb
44
description: "Bridge between Clipsal C-Bus systems and MQTT/Home Assistant"
55
url: "https://github.com/dougrathbone/cgateweb"

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "cgateweb",
3-
"version": "1.8.0",
3+
"version": "1.8.1",
44
"description": "Node.js bridge connecting Clipsal C-Bus automation systems to MQTT for Home Assistant integration",
55
"keywords": [
66
"cbus",

src/bridgeInitializationService.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,8 +305,14 @@ class BridgeInitializationService {
305305
* Handles C-Gate command errors. If a 401 (not found) is received for a path
306306
* that is being periodically polled, the polling timer is cancelled to prevent
307307
* recurring error logs for apps that don't exist on this C-Bus installation.
308+
* Also forwards 401 errors to HaDiscovery so it can retry TreeXML requests
309+
* that fail because C-Gate hasn't finished loading networks at startup.
308310
*/
309311
handleCommandError(code, statusData) {
312+
if (this.bridge.haDiscovery && typeof this.bridge.haDiscovery.handleCommandError === 'function') {
313+
this.bridge.haDiscovery.handleCommandError(code, statusData);
314+
}
315+
310316
if (code !== '401') return;
311317
// Extract network/app path from statusData like:
312318
// "Bad object or device ID: //CLIPSAL/254/203/* (Object not found)"
@@ -338,6 +344,9 @@ class BridgeInitializationService {
338344
this.bridge.labelLoader.unwatch();
339345

340346
if (this.bridge.haDiscovery) {
347+
if (typeof this.bridge.haDiscovery.stop === 'function') {
348+
this.bridge.haDiscovery.stop();
349+
}
341350
this.bridge.haDiscovery.removeAllListeners?.();
342351
this.bridge.haDiscovery = null;
343352
this.bridge.commandResponseProcessor.haDiscovery = null;

src/haDiscovery.js

Lines changed: 131 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,18 @@ class HaDiscovery {
6969
// Tracks all discovery config topics published in this session so that
7070
// stale retained messages can be cleared when devices are excluded or change type.
7171
this._publishedTopics = new Set();
72+
73+
// C-Gate accepts TCP connections on the command port before its project
74+
// networks are loaded. Initial TREEXML can therefore return 401 "Network
75+
// not found" until C-Gate finishes startup. These maps drive a
76+
// per-network retry loop with exponential backoff so HA Discovery
77+
// recovers automatically without restarting the bridge.
78+
this._treeWatchdogs = new Map(); // networkId -> timeoutHandle
79+
this._treeRetryState = new Map(); // networkId -> { attempts, retryHandle }
80+
this._maxTreeRetryAttempts = 8;
81+
this._treeRetryInitialDelayMs = 2000;
82+
this._treeRetryMaxDelayMs = 60000;
83+
this._treeRequestTimeoutMs = 8000;
7284
}
7385

7486
/**
@@ -145,19 +157,136 @@ class HaDiscovery {
145157

146158
queueTreeRequest(networkId) {
147159
const normalizedNetwork = String(networkId);
160+
161+
// Cancel any pending retry timer — we're sending a fresh request now.
162+
this._cancelTreeRetry(normalizedNetwork, /* keepAttempts */ true);
163+
148164
this.logger.info(`Requesting TreeXML for network ${normalizedNetwork}...`);
149-
this.pendingTreeNetworks.push(normalizedNetwork);
165+
166+
// Avoid duplicate pending entries when a retry races a late response.
167+
if (!this.pendingTreeNetworks.includes(normalizedNetwork)) {
168+
this.pendingTreeNetworks.push(normalizedNetwork);
169+
}
170+
171+
this._armTreeWatchdog(normalizedNetwork);
150172
this._sendCommand(`${CGATE_CMD_TREEXML} ${normalizedNetwork}${NEWLINE}`);
151173
}
152174

175+
_armTreeWatchdog(networkId) {
176+
this._cancelTreeWatchdog(networkId);
177+
const handle = setTimeout(() => {
178+
this._treeWatchdogs.delete(networkId);
179+
this._handleTreeRequestFailure(networkId, 'no response within timeout');
180+
}, this._treeRequestTimeoutMs);
181+
if (typeof handle.unref === 'function') handle.unref();
182+
this._treeWatchdogs.set(networkId, handle);
183+
}
184+
185+
_cancelTreeWatchdog(networkId) {
186+
const handle = this._treeWatchdogs.get(networkId);
187+
if (handle) {
188+
clearTimeout(handle);
189+
this._treeWatchdogs.delete(networkId);
190+
}
191+
}
192+
193+
_cancelTreeRetry(networkId, keepAttempts = false) {
194+
const state = this._treeRetryState.get(networkId);
195+
if (!state) return;
196+
if (state.retryHandle) {
197+
clearTimeout(state.retryHandle);
198+
state.retryHandle = null;
199+
}
200+
if (!keepAttempts) {
201+
this._treeRetryState.delete(networkId);
202+
}
203+
}
204+
205+
/**
206+
* Receives a 4xx/5xx C-Gate command error and, if it indicates that an
207+
* in-flight TreeXML request failed because the network isn't loaded yet,
208+
* fast-fails the head of the pending queue and schedules a retry.
209+
*
210+
* Tree-related "Network not found" errors come back without a path
211+
* ("401 Bad object or device ID: Network not found"), whereas getall errors
212+
* include a path ("401 Bad object or device ID: //PROJECT/254/56/* (...)").
213+
* That difference lets us distinguish the two.
214+
*/
215+
handleCommandError(code, statusData) {
216+
if (code !== '401') return;
217+
const data = statusData || '';
218+
if (!/Network not found/i.test(data)) return;
219+
if (/\/\/[^/]+\/\d+/.test(data)) return;
220+
if (this.pendingTreeNetworks.length === 0) return;
221+
222+
const failedNetwork = this.pendingTreeNetworks.shift();
223+
this._cancelTreeWatchdog(failedNetwork);
224+
this._handleTreeRequestFailure(failedNetwork, '401 Network not found');
225+
}
226+
227+
_handleTreeRequestFailure(networkId, reason) {
228+
const state = this._treeRetryState.get(networkId) || { attempts: 0, retryHandle: null };
229+
state.attempts += 1;
230+
231+
if (state.attempts > this._maxTreeRetryAttempts) {
232+
this.logger.warn(
233+
`HA Discovery: TreeXML for network ${networkId} failed after ${this._maxTreeRetryAttempts} attempts (${reason}). ` +
234+
`Auto-discovery for this network is paused. ` +
235+
`Verify the network is configured and reachable in C-Gate, then restart the bridge or publish to cbus/write/${networkId}///gettree to retry.`
236+
);
237+
this._treeRetryState.delete(networkId);
238+
return;
239+
}
240+
241+
const delay = Math.min(
242+
this._treeRetryInitialDelayMs * Math.pow(2, state.attempts - 1),
243+
this._treeRetryMaxDelayMs
244+
);
245+
246+
this.logger.warn(
247+
`HA Discovery: TreeXML for network ${networkId} failed (${reason}). ` +
248+
`Retrying in ${Math.round(delay / 1000)}s (attempt ${state.attempts}/${this._maxTreeRetryAttempts}). ` +
249+
`This typically means C-Gate is still loading networks at startup.`
250+
);
251+
252+
if (state.retryHandle) clearTimeout(state.retryHandle);
253+
state.retryHandle = setTimeout(() => {
254+
state.retryHandle = null;
255+
this.queueTreeRequest(networkId);
256+
}, delay);
257+
if (typeof state.retryHandle.unref === 'function') state.retryHandle.unref();
258+
this._treeRetryState.set(networkId, state);
259+
}
260+
261+
/**
262+
* Cancels all retry timers and watchdogs. Call on bridge shutdown.
263+
*/
264+
stop() {
265+
for (const handle of this._treeWatchdogs.values()) {
266+
clearTimeout(handle);
267+
}
268+
this._treeWatchdogs.clear();
269+
for (const state of this._treeRetryState.values()) {
270+
if (state.retryHandle) clearTimeout(state.retryHandle);
271+
}
272+
this._treeRetryState.clear();
273+
}
274+
153275
handleTreeStart(_statusData) {
154276
if (this.activeTreeSession && this.activeTreeSession.bufferParts.length > 0) {
155277
this.logger.warn(`Received a new TreeXML start before previous tree completed; dropping incomplete tree for network ${this.activeTreeSession.network}`);
156278
}
157279

158280
const nextNetwork = this.pendingTreeNetworks.shift() || this.treeNetwork || 'unknown';
281+
const networkKey = String(nextNetwork);
282+
283+
// The request succeeded — cancel watchdog and any pending retry so a
284+
// late retry doesn't issue a redundant TREEXML.
285+
this._cancelTreeWatchdog(networkKey);
286+
this._cancelTreeRetry(networkKey);
287+
159288
this.activeTreeSession = {
160-
network: String(nextNetwork),
289+
network: networkKey,
161290
bufferParts: []
162291
};
163292

tests/haDiscovery.test.js

Lines changed: 146 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -524,16 +524,160 @@ describe('HaDiscovery', () => {
524524
describe('Integration with CgateWebBridge', () => {
525525
it('should handle tree responses from C-Gate correctly', () => {
526526
haDiscovery.treeNetwork = '254';
527-
527+
528528
haDiscovery.handleTreeStart('start');
529529
expect(haDiscovery.treeBufferParts).toEqual([]);
530-
530+
531531
haDiscovery.handleTreeData('data1');
532532
haDiscovery.handleTreeData('data2');
533533
expect(haDiscovery.treeBufferParts).toEqual(['data1', 'data2']);
534534
});
535535
});
536536

537+
describe('TreeXML retry on startup race (401 Network not found)', () => {
538+
// Each case runs against Jest fake timers with a single configured
// discovery network.
beforeEach(() => {
    mockSettings.ha_discovery_networks = ['254'];
    jest.useFakeTimers();
});

// Cancel any timers the case left armed before switching back to real timers.
afterEach(() => {
    haDiscovery.stop();
    jest.useRealTimers();
});
547+
548+
it('schedules a retry after a 401 Network not found error for an in-flight TreeXML', () => {
    const treeError = 'Bad object or device ID: Network not found';

    // The initial trigger sends exactly one TREEXML and records it as pending.
    haDiscovery.trigger();
    expect(mockSendCommandFn).toHaveBeenCalledTimes(1);
    expect(haDiscovery.pendingTreeNetworks).toEqual(['254']);

    haDiscovery.handleCommandError('401', treeError);

    // The failed entry leaves the pending queue right away, so a late
    // tree-start can't be misattributed to it...
    expect(haDiscovery.pendingTreeNetworks).toEqual([]);
    // ...but nothing is re-sent yet: the retry is only scheduled.
    expect(mockSendCommandFn).toHaveBeenCalledTimes(1);

    // Once the initial 2s backoff elapses the request is queued and sent again.
    jest.advanceTimersByTime(2000);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(2);
    expect(haDiscovery.pendingTreeNetworks).toEqual(['254']);
});
565+
566+
it('ignores 401 errors that include a path (not a tree request)', () => {
    const pathScopedError = 'Bad object or device ID: //PROJECT/254/56/* (Network not found)';

    haDiscovery.trigger();
    haDiscovery.handleCommandError('401', pathScopedError);

    // The in-flight tree request is untouched...
    expect(haDiscovery.pendingTreeNetworks).toEqual(['254']);

    // ...and no retry is sent after the initial backoff window.
    jest.advanceTimersByTime(2000);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(1);
});
573+
574+
it('ignores 401 errors when no TreeXML is in flight', () => {
    const treeError = 'Bad object or device ID: Network not found';

    // No trigger() first: the pending queue is empty when the error arrives.
    haDiscovery.handleCommandError('401', treeError);

    jest.advanceTimersByTime(60000);
    expect(mockSendCommandFn).not.toHaveBeenCalled();
});
579+
580+
it('uses exponential backoff between successive retries', () => {
    const treeError = 'Bad object or device ID: Network not found';
    const failOnce = () => haDiscovery.handleCommandError('401', treeError);

    haDiscovery.trigger();

    // Failure 1: retry after 2s.
    failOnce();
    jest.advanceTimersByTime(2000);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(2);

    // Failure 2: retry after 4s, and not a millisecond earlier.
    failOnce();
    jest.advanceTimersByTime(3999);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(2);
    jest.advanceTimersByTime(1);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(3);

    // Failure 3: retry after 8s.
    failOnce();
    jest.advanceTimersByTime(8000);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(4);
});
598+
599+
it('gives up after the maximum number of attempts', () => {
    const treeError = 'Bad object or device ID: Network not found';
    const warnSpy = jest.spyOn(haDiscovery.logger, 'warn');

    haDiscovery.trigger();

    // Eight failures each schedule a retry; running the pending timers
    // fires that retry so the next failure has a request in flight.
    for (let failure = 0; failure < 8; failure += 1) {
        haDiscovery.handleCommandError('401', treeError);
        jest.runOnlyPendingTimers();
    }
    // The ninth failure exceeds the retry budget.
    haDiscovery.handleCommandError('401', treeError);

    // From here on nothing further is sent, however long we wait.
    const sendsAtGiveUp = mockSendCommandFn.mock.calls.length;
    jest.advanceTimersByTime(120000);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(sendsAtGiveUp);

    const loggedGiveUp = warnSpy.mock.calls.some(([msg]) => /failed after 8 attempts/i.test(msg));
    expect(loggedGiveUp).toBe(true);
});
617+
618+
it('falls back to the watchdog when no response arrives within the request timeout', () => {
    haDiscovery.trigger();
    expect(mockSendCommandFn).toHaveBeenCalledTimes(1);

    // No response at all: the 8s request watchdog expires first...
    jest.advanceTimersByTime(8000);
    // ...then the initial 2s backoff runs before the request is re-sent.
    jest.advanceTimersByTime(2000);

    expect(mockSendCommandFn).toHaveBeenCalledTimes(2);
});
627+
628+
it('cancels pending retry when the next TreeXML succeeds', () => {
    // First request fails with 401; the retry is sent after the 2s backoff.
    haDiscovery.trigger();
    haDiscovery.handleCommandError('401', 'Bad object or device ID: Network not found');
    jest.advanceTimersByTime(2000);

    // Simulate a successful tree response to that retry.
    haDiscovery.handleTreeStart('start');
    haDiscovery.handleTreeData('<xml/>');
    // NOTE(review): this spy is not restored inside the test; confirm a
    // suite-level hook or Jest config restores mocks between cases.
    jest.spyOn(require('xml2js'), 'parseString').mockImplementation((xml, _opts, cb) => cb(null, {}));
    haDiscovery.handleTreeEnd('end');

    // Neither a watchdog nor any retry state may be left behind.
    expect(haDiscovery._treeWatchdogs.size).toBe(0);
    expect(haDiscovery._treeRetryState.size).toBe(0);

    // Waiting well past any possible backoff must not produce more commands.
    const sendsAfterSuccess = mockSendCommandFn.mock.calls.length;
    jest.advanceTimersByTime(120000);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(sendsAfterSuccess);
});
648+
649+
it('retries each network independently when multiple fail', () => {
    const treeError = 'Bad object or device ID: Network not found';
    mockSettings.ha_discovery_networks = ['254', '200'];

    haDiscovery.trigger();
    expect(mockSendCommandFn).toHaveBeenCalledTimes(2);
    expect(haDiscovery.pendingTreeNetworks).toEqual(['254', '200']);

    // Errors are matched to pending requests in FIFO order:
    // the first 401 belongs to 254, the second to 200.
    haDiscovery.handleCommandError('401', treeError);
    haDiscovery.handleCommandError('401', treeError);
    expect(haDiscovery.pendingTreeNetworks).toEqual([]);

    // After the initial backoff both networks are re-queued and re-sent.
    jest.advanceTimersByTime(2000);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(4);
    expect(haDiscovery.pendingTreeNetworks).toEqual(expect.arrayContaining(['254', '200']));
});
665+
666+
it('stop() clears all retry timers and watchdogs', () => {
    // One failed request leaves exactly one retry-state entry behind.
    haDiscovery.trigger();
    haDiscovery.handleCommandError('401', 'Bad object or device ID: Network not found');
    expect(haDiscovery._treeRetryState.size).toBe(1);

    haDiscovery.stop();

    // Both registries are emptied...
    expect(haDiscovery._treeWatchdogs.size).toBe(0);
    expect(haDiscovery._treeRetryState.size).toBe(0);

    // ...and the cancelled retry never fires afterwards.
    const sendsAtStop = mockSendCommandFn.mock.calls.length;
    jest.advanceTimersByTime(120000);
    expect(mockSendCommandFn).toHaveBeenCalledTimes(sendsAtStop);
});
679+
});
680+
537681
describe('Custom Label Override (three-tier priority)', () => {
538682
beforeEach(() => {
539683
jest.spyOn(require('xml2js'), 'parseString').mockImplementation((xml, _opts, callback) => {

0 commit comments

Comments
 (0)