Skip to content

Commit ad33b42

Browse files
Fix service location failure (#818)
* Retry service location auto-connect on startup * listen for address change * dont retry if succeeded * cleanup * bump * Apply suggestions from code review Co-authored-by: Adam <adam@NetBSD.org> * simplify code * simplify further * fix * Update .npmrc * Update lint.yaml --------- Co-authored-by: Adam <adam@NetBSD.org>
1 parent 5c0402c commit ad33b42

File tree

10 files changed

+154
-43
lines changed

10 files changed

+154
-43
lines changed

.github/workflows/lint.yaml

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
1-
name: "lint"
1+
name: 'lint'
22

33
on:
44
push:
55
branches:
66
- main
77
- dev
8-
- "release/**"
8+
- 'release/**'
99
paths-ignore:
10-
- "*.md"
11-
- "LICENSE"
10+
- '*.md'
11+
- 'LICENSE'
1212
pull_request:
1313
branches:
1414
- main
1515
- dev
16-
- "release/**"
16+
- 'release/**'
1717
paths-ignore:
18-
- "*.md"
19-
- "LICENSE"
18+
- '*.md'
19+
- 'LICENSE'
2020

2121
jobs:
2222
lint-web:
@@ -30,7 +30,7 @@ jobs:
3030

3131
- uses: actions/setup-node@v6
3232
with:
33-
node-version: "24"
33+
node-version: '24'
3434

3535
- uses: pnpm/action-setup@v5
3636
with:
@@ -56,5 +56,6 @@ jobs:
5656
- name: Run Biome and Prettier Lint
5757
run: pnpm lint
5858

59-
- name: Audit
60-
run: pnpm audit --prod
59+
# TODO: Restore when it works again: https://github.com/pnpm/pnpm/issues/11265
60+
# - name: Audit
61+
# run: pnpm audit --prod

.npmrc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
strict-peer-dependencies=false
1+
strict-peer-dependencies=false
2+
audit=false

.trivyignore.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
vulnerabilities:
22
- id: GHSA-wrw7-89jp-8q8g
3-
expired_at: 2026-04-18
3+
expired_at: 2026-05-16
44
statement: 'glib is a transitive dependency of Tauri which we cannot update ourselves. Waiting for tauri to finish migration to gtk4-rs: https://github.com/tauri-apps/tauri/issues/12563'

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@
8585
"html-react-parser": "^5.2.17",
8686
"itertools": "^2.6.0",
8787
"js-base64": "^3.7.8",
88-
"lodash-es": "^4.17.23",
88+
"lodash-es": "^4.18.1",
8989
"merge-refs": "^2.0.0",
9090
"millify": "^6.1.0",
9191
"motion": "^12.38.0",

pnpm-lock.yaml

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src-tauri/Cargo.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src-tauri/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,9 @@ windows-sys = { version = "0.61", features = [
160160
# HANDLE & file functions
161161
"Win32_System_IO",
162162
"Win32_System_Threading",
163+
164+
# Network address change notifications (NotifyAddrChange)
165+
"Win32_NetworkManagement_IpHelper",
163166
] }
164167

165168
[features]

src-tauri/src/enterprise/service_locations/windows.rs

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use defguard_wireguard_rs::{
1515
};
1616
use known_folders::get_known_folder_path;
1717
use log::{debug, error, warn};
18+
use tokio::time::sleep;
1819
use windows::{
1920
core::PSTR,
2021
Win32::System::RemoteDesktop::{
@@ -23,6 +24,7 @@ use windows::{
2324
},
2425
};
2526
use windows_acl::acl::ACL;
27+
use windows_sys::Win32::NetworkManagement::IpHelper::NotifyAddrChange;
2628

2729
use crate::{
2830
enterprise::service_locations::{
@@ -36,10 +38,59 @@ use crate::{
3638
};
3739

3840
const LOGIN_LOGOFF_EVENT_RETRY_DELAY_SECS: u64 = 5;
41+
// How long to wait after a network change before attempting to connect.
42+
// Gives DHCP time to complete and DNS to become available.
43+
const NETWORK_STABILIZATION_DELAY: Duration = Duration::from_secs(3);
44+
// How long to wait before restarting the network change watcher on error.
45+
const NETWORK_CHANGE_MONITOR_RESTART_DELAY: Duration = Duration::from_secs(5);
3946
const DEFAULT_WIREGUARD_PORT: u16 = 51820;
4047
const DEFGUARD_DIR: &str = "Defguard";
4148
const SERVICE_LOCATIONS_SUBDIR: &str = "service_locations";
4249

50+
/// Watches for IP address changes on any network interface and attempts to connect to any
51+
/// service locations that are not yet connected. This handles the case where the endpoint
52+
/// hostname cannot be resolved at service startup because the network (e.g. Wi-Fi) is not
53+
/// yet available. When the network comes up and an IP is assigned, this watcher fires and
54+
/// retries the connection.
55+
///
56+
/// Note: `NotifyAddrChange` also fires when WireGuard interfaces are created. This is
57+
/// harmless because `connect_to_service_locations` skips already-connected locations.
58+
pub(crate) async fn watch_for_network_change(
59+
service_location_manager: Arc<RwLock<ServiceLocationManager>>,
60+
) {
61+
loop {
62+
// NotifyAddrChange blocks until any IP address is added or removed on any interface.
63+
// Passing NULL for both handle and overlapped selects the synchronous (blocking) mode.
64+
let result = unsafe { NotifyAddrChange(std::ptr::null_mut(), std::ptr::null()) };
65+
66+
if result != 0 {
67+
error!("NotifyAddrChange failed with error code: {result}");
68+
sleep(NETWORK_CHANGE_MONITOR_RESTART_DELAY).await;
69+
continue;
70+
}
71+
72+
debug!(
73+
"Network address change detected, waiting {NETWORK_STABILIZATION_DELAY:?} for \
74+
network to stabilize before attempting service location connections..."
75+
);
76+
sleep(NETWORK_STABILIZATION_DELAY).await;
77+
78+
debug!("Attempting to connect to service locations after network change");
79+
match service_location_manager
80+
.write()
81+
.unwrap()
82+
.connect_to_service_locations()
83+
{
84+
Ok(_) => {
85+
debug!("Service location connect attempt after network change completed");
86+
}
87+
Err(err) => {
88+
warn!("Failed to connect to service locations after network change: {err}");
89+
}
90+
}
91+
}
92+
}
93+
4394
pub(crate) async fn watch_for_login_logoff(
4495
service_location_manager: Arc<RwLock<ServiceLocationManager>>,
4596
) -> Result<(), ServiceLocationError> {
@@ -59,7 +110,7 @@ pub(crate) async fn watch_for_login_logoff(
59110
}
60111
Err(err) => {
61112
error!("Failed waiting for login/logoff event: {err:?}");
62-
tokio::time::sleep(Duration::from_secs(LOGIN_LOGOFF_EVENT_RETRY_DELAY_SECS)).await;
113+
sleep(Duration::from_secs(LOGIN_LOGOFF_EVENT_RETRY_DELAY_SECS)).await;
63114
continue;
64115
}
65116
};
@@ -680,12 +731,19 @@ impl ServiceLocationManager {
680731
Ok(())
681732
}
682733

683-
pub(crate) fn connect_to_service_locations(&mut self) -> Result<(), ServiceLocationError> {
734+
/// Attempts to connect to all service locations that are not already connected.
735+
///
736+
/// Returns `Ok(true)` if every location is now connected (either it was already connected or
737+
/// it was successfully connected during this call), and `Ok(false)` if at least one location
738+
/// failed to connect (indicating that a retry may be worthwhile).
739+
pub(crate) fn connect_to_service_locations(&mut self) -> Result<bool, ServiceLocationError> {
684740
debug!("Attempting to auto-connect to VPN...");
685741

686742
let data = self.load_service_locations()?;
687743
debug!("Loaded {} instance(s) from ServiceLocationApi", data.len());
688744

745+
let mut all_connected = true;
746+
689747
for instance_data in data {
690748
debug!(
691749
"Found service locations for instance ID: {}",
@@ -725,10 +783,11 @@ impl ServiceLocationManager {
725783
if let Err(err) =
726784
self.setup_service_location_interface(&location, &instance_data.private_key)
727785
{
728-
debug!(
786+
warn!(
729787
"Failed to setup service location interface for '{}': {err:?}",
730788
location.name
731789
);
790+
all_connected = false;
732791
continue;
733792
}
734793

@@ -749,7 +808,7 @@ impl ServiceLocationManager {
749808

750809
debug!("Auto-connect attempt completed");
751810

752-
Ok(())
811+
Ok(all_connected)
753812
}
754813

755814
pub fn save_service_locations(

src-tauri/src/service/windows.rs

Lines changed: 63 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use std::{
77

88
use clap::Parser;
99
use error;
10-
use tokio::runtime::Runtime;
10+
use tokio::{runtime::Runtime, time::sleep};
1111
use windows_service::{
1212
define_windows_service,
1313
service::{
@@ -20,7 +20,8 @@ use windows_service::{
2020

2121
use crate::{
2222
enterprise::service_locations::{
23-
windows::watch_for_login_logoff, ServiceLocationError, ServiceLocationManager,
23+
windows::{watch_for_login_logoff, watch_for_network_change},
24+
ServiceLocationError, ServiceLocationManager,
2425
},
2526
service::{
2627
config::Config,
@@ -32,6 +33,8 @@ use crate::{
3233
static SERVICE_NAME: &str = "DefguardService";
3334
const SERVICE_TYPE: ServiceType = ServiceType::OWN_PROCESS;
3435
const LOGIN_LOGOFF_MONITORING_RESTART_DELAY_SECS: Duration = Duration::from_secs(5);
36+
const SERVICE_LOCATION_CONNECT_RETRY_COUNT: u32 = 5;
37+
const SERVICE_LOCATION_CONNECT_RETRY_DELAY: Duration = Duration::from_secs(30);
3538

3639
pub fn run() -> Result<(), windows_service::Error> {
3740
// Register generated `ffi_service_main` with the system and start the service, blocking
@@ -112,25 +115,69 @@ fn run_service() -> Result<(), DaemonError> {
112115

113116
let service_location_manager = Arc::new(RwLock::new(service_location_manager));
114117

115-
// Spawn service location management task
118+
// Spawn network change monitoring task first so NotifyAddrChange is registered as early
119+
// as possible, minimising the window in which a network event could be missed before
120+
// the watcher is listening. The retry task below is the backstop for any event that
121+
// still slips through that window.
116122
let service_location_manager_clone = service_location_manager.clone();
117123
runtime.spawn(async move {
118124
let manager = service_location_manager_clone;
125+
info!("Starting network change monitoring");
126+
watch_for_network_change(manager.clone()).await;
127+
error!("Network change monitoring ended unexpectedly.");
128+
});
119129

120-
info!("Starting service location management task");
121-
122-
info!("Attempting to auto-connect to service locations");
123-
match manager.write().unwrap().connect_to_service_locations() {
124-
Ok(()) => {
125-
info!("Auto-connect to service locations completed successfully");
130+
// Spawn service location auto-connect task with retries.
131+
// Each attempt skips locations that are already connected, so it is safe to call
132+
// connect_to_service_locations repeatedly. The retry loop exists to handle the case
133+
// where the connection may fail initially at startup because the network
134+
// (e.g. Wi-Fi) is not yet available (mainly DNS resolution issues), and serves as
135+
// a backstop for any network events missed by the watcher above.
136+
// If all locations connect successfully on a given attempt, no further retries are made.
137+
let service_location_manager_connect = service_location_manager.clone();
138+
runtime.spawn(async move {
139+
for attempt in 1..=SERVICE_LOCATION_CONNECT_RETRY_COUNT {
140+
info!(
141+
"Attempting to auto-connect to service locations \
142+
(attempt {attempt}/{SERVICE_LOCATION_CONNECT_RETRY_COUNT})"
143+
);
144+
match service_location_manager_connect
145+
.write()
146+
.unwrap()
147+
.connect_to_service_locations()
148+
{
149+
Ok(true) => {
150+
info!(
151+
"All service locations connected successfully \
152+
(attempt {attempt}/{SERVICE_LOCATION_CONNECT_RETRY_COUNT})"
153+
);
154+
break;
155+
}
156+
Ok(false) => {
157+
warn!(
158+
"Auto-connect attempt {attempt}/{SERVICE_LOCATION_CONNECT_RETRY_COUNT} \
159+
completed with some failures"
160+
);
161+
}
162+
Err(err) => {
163+
warn!(
164+
"Auto-connect attempt {attempt}/{SERVICE_LOCATION_CONNECT_RETRY_COUNT} \
165+
failed: {err}"
166+
);
167+
}
126168
}
127-
Err(err) => {
128-
warn!(
129-
"Error while trying to auto-connect to service locations: {err}. \
130-
Will continue monitoring for login/logoff events.",
131-
);
169+
170+
if attempt < SERVICE_LOCATION_CONNECT_RETRY_COUNT {
171+
sleep(SERVICE_LOCATION_CONNECT_RETRY_DELAY).await;
132172
}
133173
}
174+
info!("Service location auto-connect task finished");
175+
});
176+
177+
// Spawn login/logoff monitoring task, runs concurrently with the tasks above.
178+
let service_location_manager_clone = service_location_manager.clone();
179+
runtime.spawn(async move {
180+
let manager = service_location_manager_clone;
134181

135182
info!("Starting login/logoff event monitoring");
136183
loop {
@@ -140,14 +187,14 @@ fn run_service() -> Result<(), DaemonError> {
140187
"Login/logoff event monitoring ended unexpectedly. Restarting in \
141188
{LOGIN_LOGOFF_MONITORING_RESTART_DELAY_SECS:?}..."
142189
);
143-
tokio::time::sleep(LOGIN_LOGOFF_MONITORING_RESTART_DELAY_SECS).await;
190+
sleep(LOGIN_LOGOFF_MONITORING_RESTART_DELAY_SECS).await;
144191
}
145192
Err(e) => {
146193
error!(
147194
"Error in login/logoff event monitoring: {e}. Restarting in \
148195
{LOGIN_LOGOFF_MONITORING_RESTART_DELAY_SECS:?}...",
149196
);
150-
tokio::time::sleep(LOGIN_LOGOFF_MONITORING_RESTART_DELAY_SECS).await;
197+
sleep(LOGIN_LOGOFF_MONITORING_RESTART_DELAY_SECS).await;
151198
info!("Restarting login/logoff event monitoring");
152199
}
153200
}

src-tauri/src/wg_config.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1+
use std::{array::TryFromSliceError, net::IpAddr, path::Path};
2+
13
use base64::{prelude::BASE64_STANDARD, DecodeError, Engine};
2-
use std::path::Path;
3-
use std::{array::TryFromSliceError, net::IpAddr};
44
use thiserror::Error;
55
use x25519_dalek::{PublicKey, StaticSecret};
66

0 commit comments

Comments
 (0)