Skip to content

Commit 0d2c7a1

Browse files
authored
Merge pull request #3433 from jimklimov/issue-3365
Revise `upsd` behavior of `MAXCONN` vs. `ulimit -n` (`getrlimit()`)
2 parents 3cf1984 + 7af375f commit 0d2c7a1

6 files changed

Lines changed: 208 additions & 20 deletions

File tree

NEWS.adoc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,16 @@ https://github.com/networkupstools/nut/milestone/13
121121
ended up as literal HTML on KDE Plasma 6; now that rich text formatting
122122
is only used for main window status labels. [PR #3430]
123123

124+
- `upsd` data server updates:
125+
* If we hit "Too many open files" during configuration reload, close
126+
the oldest client connection and retry. [issue #3365]
127+
* If the `MAXCONN` requested in the configuration file exceeds the OS
128+
allowance on open file descriptors, fail early since the requested
129+
configuration can not be guaranteed and can mis-fire unexpectedly
130+
much later (tell the sysadmin to increase `ulimit` or set up a more
131+
conservative `MAXCONN`). If there is a separate soft and hard limit,
132+
and `MAXCONN` exceeds the soft limit, try to raise the bar. [issue #3365]
133+
124134
- Recipes, CI and helper script updates not classified above:
125135
* Introduced `ci_build.sh` settings and respective CI workflow settings
126136
to optionally re-use a `config.cache` file from older runs, and similar
@@ -134,6 +144,7 @@ https://github.com/networkupstools/nut/milestone/13
134144
dependencies were not previously discovered and bundled). [issue #3420,
135145
PRs #3429, #3432]
136146

147+
137148
Release notes for NUT 2.8.5 - what's new since 2.8.4
138149
----------------------------------------------------
139150

UPGRADING.adoc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,13 @@ Changes from 2.8.5 to 2.8.6
3939
in this regard, at the cost of adding arguments to methods introduced in
4040
the previous release. [issue #3331, PR #3408]
4141
42+
- Potentially a breaking change for existing deployments with a (large)
43+
`MAXCONN` setting in `upsd.conf`: now this value is checked against the
44+
`getrlimit()` (e.g. `ulimit -n`) setting of the operating system for this
45+
daemon, where available, and the `upsd` data server would refuse to start
46+
if the requested value is larger than what is allowed (minus some reserve
47+
for configuration files and other use-cases). [issue #3365]
48+
4249
4350
Changes from 2.8.4 to 2.8.5
4451
---------------------------

docs/nut.dict

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2242,6 +2242,7 @@ getenv
22422242
gethostbyname
22432243
getopt
22442244
getproctag
2245+
getrlimit
22452246
getter
22462247
getters
22472248
gettext

server/conf.c

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "netssl.h"
2828
#include "nut_stdint.h"
2929
#include <ctype.h>
30+
#include <errno.h>
3031

3132
static ups_t *upstable = NULL;
3233
int num_ups = 0;
@@ -420,7 +421,12 @@ void load_upsdconf(int reloading)
420421

421422
pconf_init(&ctx, upsd_conf_err);
422423

424+
retry:
423425
if (!pconf_file_begin(&ctx, fn)) {
426+
if (errno == EMFILE && reloading == 2) {
427+
close_oldest_client();
428+
goto retry;
429+
}
424430
pconf_finish(&ctx);
425431

426432
if (!reloading)
@@ -491,6 +497,24 @@ void load_upsdconf(int reloading)
491497
pconf_finish(&ctx);
492498
}
493499

500+
static int load_upsconf(int reloading) {
501+
int ret;
502+
503+
ret = read_upsconf(0); /* 0 = do not abort fatally just yet */
504+
if (ret == -1) {
505+
if (errno == EMFILE && reloading == 2) {
506+
upsdebugx(1, "%s: close an oldest client connection and try reading config again", __func__);
507+
close_oldest_client();
508+
ret = read_upsconf(1); /* 1 = may abort upon fundamental errors */
509+
} else {
510+
/* Not fatalx(), the method above already reported the problem */
511+
exit(EXIT_FAILURE);
512+
}
513+
}
514+
515+
return ret;
516+
}
517+
494518
/* callback during parsing of ups.conf */
495519
void do_upsconf_args(char *upsname, char *var, char *val)
496520
{
@@ -654,12 +678,19 @@ static int check_file(const char *fn)
654678
{
655679
char chkfn[NUT_PATH_MAX];
656680
FILE *f;
681+
int retries = 0;
657682

658683
snprintf(chkfn, sizeof(chkfn), "%s/%s", confpath(), fn);
659684

685+
retry:
660686
f = fopen(chkfn, "r");
661687

662688
if (!f) {
689+
if (errno == EMFILE && retries < 10) {
690+
close_oldest_client();
691+
retries++;
692+
goto retry;
693+
}
663694
upslog_with_errno(LOG_ERR, "Reload failed: can't open %s", chkfn);
664695
return 0; /* failed */
665696
}
@@ -687,11 +718,11 @@ void conf_reload(void)
687718
}
688719

689720
/* reload from ups.conf */
690-
read_upsconf(1); /* 1 = may abort upon fundamental errors */
721+
load_upsconf(2); /* 2 = reloading, and may retry by closing clients if EMFILE */
691722
upsconf_add(1); /* 1 = reloading */
692723

693724
/* now reread upsd.conf */
694-
load_upsdconf(1); /* 1 = reloading */
725+
load_upsdconf(2); /* 2 = reloading, and may retry by closing clients if EMFILE */
695726

696727
/* now delete all UPS entries that didn't get reloaded */
697728

server/upsd.c

Lines changed: 117 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@
4545
# include <signal.h>
4646
/* #include <poll.h> */
4747
# endif
48+
# ifdef HAVE_SYS_RESOURCE_H
49+
# include <sys/resource.h> /* for getrlimit() and struct rlimit */
50+
# endif
4851
#else /* WIN32 */
4952
/* Those 2 files for support of getaddrinfo, getnameinfo and freeaddrinfo
5053
on Windows 2000 and older versions */
@@ -94,12 +97,14 @@ int allow_no_device = 0;
9497
*/
9598
int allow_not_all_listeners = 0;
9699

97-
/* preloaded to POSIX sysconf(_SC_OPEN_MAX) or WIN32 MAX_WAIT_OBJECTS in main
100+
/* Preloaded to POSIX sysconf(_SC_OPEN_MAX) or WIN32 MAX_WAIT_OBJECTS in main
98101
* and elsewhere, the run-time value can be overridden via upsd.conf `MAXCONN`
99102
* option (may cause partial waits chunk by chunk, if sysmaxconn is smaller).
103+
* The sysmaxconn_hard is derived from getrlimit() (aka `ulimit` on allowed
104+
* opened file descriptors) where available.
100105
*/
101106
nfds_t maxconn = 0;
102-
static nfds_t sysmaxconn = 0;
107+
static nfds_t sysmaxconn = 0, sysmaxconn_hard = 0;
103108

104109
/* preloaded to STATEPATH in main, can be overridden via upsd.conf */
105110
char *statepath = NULL;
@@ -1258,12 +1263,68 @@ static void update_sysmaxconn(void)
12581263
char *s = getenv("NUT_SYSMAXCONN_LIMIT");
12591264

12601265
#ifndef WIN32
1266+
# ifdef HAVE_SYS_RESOURCE_H
1267+
struct rlimit limit;
1268+
# endif /* HAVE_SYS_RESOURCE_H */
1269+
12611270
/* default to system limit (may be overridden in upsd.conf) */
12621271
/* FIXME: Check for overflows (and int size of nfds_t vs. long) - see get_max_pid_t() for example */
12631272
l = sysconf(_SC_OPEN_MAX);
1273+
1274+
# ifdef HAVE_SYS_RESOURCE_H
1275+
/* Try to use getrlimit/setrlimit to detect and possibly increase the limit */
1276+
if (getrlimit(RLIMIT_NOFILE, &limit) == 0) {
1277+
upsdebugx(2, "%s: System file descriptor limits: soft=%ld, hard=%ld",
1278+
__func__, (long)limit.rlim_cur, (long)limit.rlim_max);
1279+
1280+
/* If we requested a specific MAXCONN, try to ensure we have enough FDs */
1281+
if (maxconn > 0) {
1282+
rlim_t needed = (rlim_t)maxconn + RESERVE_FD_COUNT_UPSD;
1283+
1284+
if (limit.rlim_cur < needed) {
1285+
if (needed <= limit.rlim_max) {
1286+
upslogx(LOG_INFO, "Increasing file descriptor limit to %ld", (long)needed);
1287+
1288+
limit.rlim_cur = needed;
1289+
if (setrlimit(RLIMIT_NOFILE, &limit) != 0) {
1290+
upslog_with_errno(LOG_WARNING, "setrlimit(RLIMIT_NOFILE) to %ld failed", (long)needed);
1291+
}
1292+
} else {
1293+
upslogx(LOG_WARNING, "WARNING: Requested MAXCONN %" PRIdMAX
1294+
" requires %ld FDs overall "
1295+
"(with %ld reserved for non-connection purposes), "
1296+
"but system hard limit is %ld",
1297+
(intmax_t)maxconn, (long)needed,
1298+
(long)RESERVE_FD_COUNT_UPSD,
1299+
(long)limit.rlim_max);
1300+
1301+
/* We might still try to bump to hard limit */
1302+
if (limit.rlim_cur < limit.rlim_max) {
1303+
limit.rlim_cur = limit.rlim_max;
1304+
setrlimit(RLIMIT_NOFILE, &limit);
1305+
}
1306+
}
1307+
}
1308+
}
1309+
1310+
/* Refresh limit after possible update */
1311+
getrlimit(RLIMIT_NOFILE, &limit);
1312+
sysmaxconn_hard = (long)limit.rlim_cur;
1313+
} else {
1314+
# endif /* HAVE_SYS_RESOURCE_H */
1315+
/* Fallback to sysconf if getrlimit fails or is absent */
1316+
/* TOTHINK: Any other reasonable fallback hard limit? */
1317+
sysmaxconn_hard = (nfds_t)l;
1318+
# ifdef HAVE_SYS_RESOURCE_H
1319+
}
1320+
# endif /* HAVE_SYS_RESOURCE_H */
1321+
12641322
#else /* WIN32 */
12651323
/* hard-coded 64 (from ddk/wdm.h or winnt.h) */
12661324
l = (long)MAXIMUM_WAIT_OBJECTS;
1325+
1326+
/* No known limit, do not check */
1327+
sysmaxconn_hard = 0;
12671328
#endif /* WIN32 */
12681329

12691330
if (l < 1) {
@@ -1276,11 +1337,35 @@ static void update_sysmaxconn(void)
12761337
l);
12771338
}
12781339

1340+
if (sysmaxconn_hard > 0 && sysmaxconn_hard < RESERVE_FD_COUNT_UPSD + 10) {
1341+
fatalx(EXIT_FAILURE,
1342+
"System reported an absurd value %ld (below the %ld reservation for\n"
1343+
"non-connection purposes and some 10 for driver/client/... connections)\n"
1344+
"as its hard maximum number of connections.\n"
1345+
"The server won't start until this problem is resolved.\n",
1346+
(long)sysmaxconn_hard, (long)RESERVE_FD_COUNT_UPSD);
1347+
}
1348+
12791349
/* Note this historically also serves as
12801350
* the initial/default MAXCONN setting
12811351
* (so site/platform-dependent).
12821352
*/
1283-
sysmaxconn = (nfds_t)l;
1353+
if (sysmaxconn_hard > 0) {
1354+
if (l < RESERVE_FD_COUNT_UPSD + 10) {
1355+
fatalx(EXIT_FAILURE,
1356+
"System reported an absurd value %ld (below the %ld reservation for\n"
1357+
"non-connection purposes and some 10 for driver/client/... connections)\n"
1358+
"as its sysconf maximum number of connections.\n"
1359+
"The server won't start until this problem is resolved.\n",
1360+
l, (long)RESERVE_FD_COUNT_UPSD);
1361+
}
1362+
1363+
sysmaxconn = (nfds_t)(l - RESERVE_FD_COUNT_UPSD);
1364+
} else {
1365+
/* No known limit on open FDs/handles, whether connections or files or other streams */
1366+
sysmaxconn = (nfds_t)l;
1367+
}
1368+
12841369
if (maxconn < 1) {
12851370
upsdebugx(1, "%s: defaulting maxconn to sysmaxconn: %ld",
12861371
__func__, l);
@@ -1308,6 +1393,15 @@ static void poll_reload(void)
13081393
/* Not likely this would change, but refresh just in case */
13091394
update_sysmaxconn();
13101395

1396+
if (sysmaxconn_hard > 0 && (maxconn > sysmaxconn_hard - RESERVE_FD_COUNT_UPSD)) {
1397+
fatalx(EXIT_FAILURE,
1398+
"You requested %" PRIdMAX " as maximum number of connections,\n"
1399+
"but the system only allows %" PRIdMAX " and we need %d for ourselves.\n"
1400+
"The server won't start until this problem is resolved\n"
1401+
"(reduce MAXCONN or increase ulimit or similar settings).\n",
1402+
(intmax_t)maxconn, (intmax_t)sysmaxconn, RESERVE_FD_COUNT_UPSD);
1403+
}
1404+
13111405
if ((intmax_t)sysmaxconn < (intmax_t)maxconn) {
13121406
upslogx(LOG_WARNING,
13131407
"Your system limits the maximum number of connections to %" PRIdMAX "\n"
@@ -1593,6 +1687,7 @@ static void mainloop(void)
15931687
if (reload_flag) {
15941688
upsnotify(NOTIFY_STATE_RELOADING, NULL);
15951689
conf_reload();
1690+
/* Among other things, re-detect sysmaxconn after loading config, because MAXCONN might have changed */
15961691
poll_reload();
15971692
reload_flag = 0;
15981693
upsnotify(NOTIFY_STATE_READY, NULL);
@@ -2419,6 +2514,22 @@ void check_perms(const char *fn)
24192514
#endif /* WIN32 */
24202515
}
24212516

2517+
void close_oldest_client(void)
2518+
{
2519+
nut_ctype_t *client, *oldest = NULL;
2520+
2521+
for (client = firstclient; client; client = client->next) {
2522+
if (!oldest || client->last_heard < oldest->last_heard) {
2523+
oldest = client;
2524+
}
2525+
}
2526+
2527+
if (oldest) {
2528+
upslogx(LOG_INFO, "Closing oldest client connection from %s to free up file descriptors", oldest->addr);
2529+
client_disconnect(oldest);
2530+
}
2531+
}
2532+
24222533
int main(int argc, char **argv)
24232534
{
24242535
int opt_ret = 0, cmdret = 0, foreground = -1;
@@ -2708,6 +2819,9 @@ int main(int argc, char **argv)
27082819
/* handle upsd.conf */
27092820
load_upsdconf(0); /* 0 = initial */
27102821

2822+
/* Re-detect sysmaxconn after loading config, because MAXCONN might have changed */
2823+
update_sysmaxconn();
2824+
27112825
/* CLI debug level can not be smaller than debug_min specified
27122826
* in upsd.conf. Note that non-zero debug_min does not impact
27132827
* foreground running mode.

0 commit comments

Comments
 (0)