Skip to content

Commit b6e2122

Browse files
committed
Emit named IN_CREATE/IN_DELETE for inotify directory watches
EVFILT_VNODE reports that a watched directory changed but not which child, so directory events were queued with no name. fsnotify-based consumers (notably the k0s manifest applier, which re-applies only when a *.yaml entry appears) filter on the entry name and silently drop nameless events, so manifests written after the watch was established were never picked up. Keep a per-watch snapshot of the directory's entry names; on each NOTE_WRITE re-list the directory and diff against the snapshot to emit a named IN_CREATE per added child and IN_DELETE per removed one, matching real inotify semantics. The blocking-read and non-blocking collect paths share one process_vnode_event() helper. The snapshot is allocated on add_watch and freed on rm_watch and inotify_close. Add a regression test (test-inotify Test 6) that watches a fresh directory, creates a child, and asserts a named IN_CREATE for it is delivered; this fails before the fix (the event arrives without a name). Validated with make check on Apple Silicon. (cherry picked from commit 2a5fa28b5257ceb5ed116c231aecc7c4a2ef54ab)
1 parent 75fb59b commit b6e2122

2 files changed

Lines changed: 254 additions & 35 deletions

File tree

src/syscall/inotify.c

Lines changed: 184 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
#include <errno.h>
3030
#include <limits.h>
3131
#include <pthread.h>
32+
#include <dirent.h>
33+
#include <stdlib.h>
3234
#include <sys/event.h>
3335
#include <sys/stat.h>
3436
#include <sys/uio.h>
@@ -83,6 +85,11 @@ typedef struct {
8385
bool is_dir; /* true if watching a directory */
8486
dev_t dev; /* Device ID (for re-add lookup by inode) */
8587
ino_t ino; /* Inode number (for re-add lookup by inode) */
88+
/* Dir watches only: path + entry-name snapshot, diffed on change to
89+
* recover the child name kqueue omits. NULL/0 for file watches. */
90+
char *path;
91+
char **entries;
92+
int n_entries;
8693
} inotify_watch_t;
8794

8895
typedef struct {
@@ -296,8 +303,133 @@ static void pipe_drain(inotify_instance_t *inst)
296303
;
297304
}
298305

306+
/* Snapshot the entry names of a directory (excluding "." and "..").
 *
 * On success *out is a malloc'd array of malloc'd strings with *n_out entries
 * (release with free_dir_snapshot); an empty directory yields NULL / 0.
 *
 * On any failure -- the directory cannot be opened, readdir reports an error,
 * or an allocation fails part-way through -- the snapshot is left empty
 * (*out == NULL, *n_out == 0). A partial listing is never published, so
 * callers only ever see a complete listing or nothing.
 *
 * Note: errno is reset to 0 before each readdir call, which is the only way
 * to tell a read error apart from end-of-directory.
 */
static void dir_snapshot(const char *path, char ***out, int *n_out)
{
    *out = NULL;
    *n_out = 0;

    DIR *d = opendir(path);
    if (!d)
        return;

    char **names = NULL;
    int n = 0, cap = 0;
    bool failed = false;

    for (;;) {
        errno = 0;
        struct dirent *de = readdir(d);
        if (!de) {
            /* NULL with errno still 0 is end-of-directory, not an error. */
            if (errno != 0)
                failed = true;
            break;
        }
        if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."))
            continue;

        if (n == cap) {
            /* Refuse to grow past what an int capacity can represent. */
            if (cap > INT_MAX / 2) {
                failed = true;
                break;
            }
            int ncap = cap ? cap * 2 : 16;
            char **tmp = realloc(names, (size_t) ncap * sizeof *tmp);
            if (!tmp) {
                failed = true;
                break;
            }
            names = tmp;
            cap = ncap;
        }

        names[n] = strdup(de->d_name);
        if (!names[n]) {
            failed = true;
            break;
        }
        n++;
    }
    closedir(d);

    if (failed) {
        /* Discard the partial listing; *out / *n_out stay NULL / 0. */
        for (int i = 0; i < n; i++)
            free(names[i]);
        free(names);
        return;
    }

    *out = names;
    *n_out = n;
}
343+
344+
/* Release a snapshot produced by dir_snapshot: the first n strings of
 * `entries`, then the array itself. A NULL `entries` is a no-op.
 */
static void free_dir_snapshot(char **entries, int n)
{
    if (entries != NULL) {
        char **slot = entries;
        for (int left = n; left > 0; left--)
            free(*slot++);
        free(entries);
    }
}
352+
353+
/* Report whether `name` is one of the first n strings in `entries`, using an
 * exact strcmp match. With n <= 0 nothing is inspected and the answer is
 * false.
 */
static bool snapshot_contains(char *const *entries, int n, const char *name)
{
    int idx = 0;
    while (idx < n) {
        if (strcmp(entries[idx], name) == 0)
            return true;
        idx++;
    }
    return false;
}
360+
299361
/* Collect events from kqueue. */
300362

363+
/* Translate one EVFILT_VNODE notification into queued inotify events for the
364+
* watch on host_fd. Returns the number queued, or -1 on buffer overflow (an
365+
* IN_Q_OVERFLOW marker is queued). Caller holds inotify_lock.
366+
*/
367+
static int process_vnode_event(inotify_instance_t *inst,
368+
int host_fd,
369+
uint32_t fflags)
370+
{
371+
int widx = watch_find_by_hostfd(inst, host_fd);
372+
if (widx < 0)
373+
return 0;
374+
375+
inotify_watch_t *w = &inst->watches[widx];
376+
int queued = 0;
377+
bool overflow = false;
378+
379+
if (w->is_dir && (fflags & NOTE_WRITE) && w->path) {
380+
char **now = NULL;
381+
int now_n = 0;
382+
dir_snapshot(w->path, &now, &now_n);
383+
384+
for (int j = 0; j < now_n && !overflow; j++) {
385+
if ((w->mask & IN_CREATE) &&
386+
!snapshot_contains(w->entries, w->n_entries, now[j])) {
387+
if (queue_event(inst, w->wd, IN_CREATE, 0, now[j]) < 0)
388+
overflow = true;
389+
else
390+
queued++;
391+
}
392+
}
393+
for (int j = 0; j < w->n_entries && !overflow; j++) {
394+
if ((w->mask & IN_DELETE) &&
395+
!snapshot_contains(now, now_n, w->entries[j])) {
396+
if (queue_event(inst, w->wd, IN_DELETE, 0, w->entries[j]) < 0)
397+
overflow = true;
398+
else
399+
queued++;
400+
}
401+
}
402+
403+
/* Advance the snapshot regardless: the directory state has moved on,
404+
* and any names dropped under overflow are covered by IN_Q_OVERFLOW.
405+
*/
406+
free_dir_snapshot(w->entries, w->n_entries);
407+
w->entries = now;
408+
w->n_entries = now_n;
409+
}
410+
411+
if (!overflow) {
412+
uint32_t in_mask = notes_to_in_mask(fflags, w->mask, w->is_dir);
413+
/* The per-child create/delete is emitted by the diff above; only emit
414+
* the bare-mask event for file watches or non-create/delete changes.
415+
*/
416+
if (in_mask != 0 &&
417+
!(w->is_dir && (in_mask & (IN_CREATE | IN_DELETE)))) {
418+
if (queue_event(inst, w->wd, in_mask, 0, NULL) < 0)
419+
overflow = true;
420+
else
421+
queued++;
422+
}
423+
}
424+
425+
if (overflow) {
426+
/* IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics. */
427+
queue_event(inst, -1, 0x4000, 0, NULL);
428+
return -1;
429+
}
430+
return queued;
431+
}
432+
301433
/* Poll the kqueue for pending vnode events and translate them into
302434
* inotify events in the instance buffer. Returns the number of
303435
* events collected.
@@ -312,35 +444,19 @@ static int collect_events(inotify_instance_t *inst)
312444
return 0;
313445

314446
int collected = 0;
447+
bool overflow = false;
315448
for (int i = 0; i < nev; i++) {
316-
int host_fd = (int) kevs[i].ident;
317-
int widx = watch_find_by_hostfd(inst, host_fd);
318-
if (widx < 0)
319-
continue;
320-
321-
inotify_watch_t *w = &inst->watches[widx];
322-
uint32_t in_mask =
323-
notes_to_in_mask((uint32_t) kevs[i].fflags, w->mask, w->is_dir);
324-
if (in_mask == 0)
325-
continue;
326-
327-
/* Queue event without a filename for file watches. For directory
328-
* watches, inotify emulation also omits the filename since kqueue
329-
* EVFILT_VNODE does not report which child changed.
330-
*/
331-
if (queue_event(inst, w->wd, in_mask, 0, NULL) == 0) {
332-
collected++;
333-
} else {
334-
/* Fixed inotify queue is full; queue IN_Q_OVERFLOW and stop.
335-
* IN_Q_OVERFLOW (0x4000) uses wd=-1 per Linux semantics.
336-
*/
337-
queue_event(inst, -1, 0x4000, 0, NULL);
449+
int r = process_vnode_event(inst, (int) kevs[i].ident,
450+
(uint32_t) kevs[i].fflags);
451+
if (r < 0) {
452+
overflow = true;
338453
break;
339454
}
455+
collected += r;
340456
}
341457

342458
/* Signal the self-pipe so poll/epoll sees readability */
343-
if (collected > 0)
459+
if (collected > 0 || overflow)
344460
pipe_signal(inst);
345461

346462
return collected;
@@ -438,12 +554,27 @@ int64_t sys_inotify_add_watch(guest_t *g,
438554
/* Strip IN_MASK_ADD control flag before storing */
439555
uint32_t event_mask = mask & ~(uint32_t) IN_MASK_ADD;
440556

557+
/* For directory watches, snapshot the path + current entries up-front
558+
* (outside the lock) so collect_events can diff on each change to emit
559+
* named IN_CREATE/IN_DELETE. Ownership moves to the watch slot on success;
560+
* every early-exit path below frees these.
561+
*/
562+
char *wpath = NULL;
563+
char **wentries = NULL;
564+
int wn = 0;
565+
if (is_dir) {
566+
wpath = strdup(path);
567+
dir_snapshot(path, &wentries, &wn);
568+
}
569+
441570
pthread_mutex_lock(&inotify_lock);
442571

443572
int slot = inotify_find(inotify_fd);
444573
if (slot < 0) {
445574
pthread_mutex_unlock(&inotify_lock);
446575
close(host_fd);
576+
free_dir_snapshot(wentries, wn);
577+
free(wpath);
447578
return -LINUX_EBADF;
448579
}
449580

@@ -466,8 +597,12 @@ int64_t sys_inotify_add_watch(guest_t *g,
466597
uint32_t snapshot_mask = w->mask; /* Snapshot before unlock */
467598
pthread_mutex_unlock(&inotify_lock);
468599

469-
/* Close the duplicate fd; inotify emulation keeps the original */
600+
/* Close the duplicate fd; inotify emulation keeps the original.
601+
* The existing watch keeps its snapshot; drop this call's copy.
602+
*/
470603
close(host_fd);
604+
free_dir_snapshot(wentries, wn);
605+
free(wpath);
471606

472607
/* Update kevent filter with the new mask (use snapshot --
473608
* w->mask may be modified by another thread after unlock)
@@ -486,6 +621,8 @@ int64_t sys_inotify_add_watch(guest_t *g,
486621
if (widx < 0) {
487622
pthread_mutex_unlock(&inotify_lock);
488623
close(host_fd);
624+
free_dir_snapshot(wentries, wn);
625+
free(wpath);
489626
return -LINUX_ENOSPC;
490627
}
491628

@@ -501,6 +638,9 @@ int64_t sys_inotify_add_watch(guest_t *g,
501638
w->is_dir = is_dir;
502639
w->dev = st.st_dev;
503640
w->ino = st.st_ino;
641+
w->path = wpath;
642+
w->entries = wentries;
643+
w->n_entries = wn;
504644

505645
/* Capture kq_fd while under lock */
506646
int kq_fd = inst->kq_fd;
@@ -521,6 +661,11 @@ int64_t sys_inotify_add_watch(guest_t *g,
521661
pthread_mutex_lock(&inotify_lock);
522662
w->wd = 0;
523663
w->host_fd = 0;
664+
free_dir_snapshot(w->entries, w->n_entries);
665+
w->entries = NULL;
666+
w->n_entries = 0;
667+
free(w->path);
668+
w->path = NULL;
524669
pthread_mutex_unlock(&inotify_lock);
525670
close(host_fd);
526671
errno = saved;
@@ -555,6 +700,11 @@ int64_t sys_inotify_rm_watch(int inotify_fd, int wd)
555700
w->host_fd = 0;
556701
w->mask = 0;
557702
w->is_dir = 0;
703+
free_dir_snapshot(w->entries, w->n_entries);
704+
w->entries = NULL;
705+
w->n_entries = 0;
706+
free(w->path);
707+
w->path = NULL;
558708
pthread_mutex_unlock(&inotify_lock);
559709

560710
/* Remove from kqueue and close outside lock */
@@ -619,18 +769,12 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
619769
}
620770
inst = &inotify_state[slot];
621771

622-
/* Process the received event */
772+
/* Process the received event (same named-directory diff as the
773+
* non-blocking collect path).
774+
*/
623775
int host_fd = (int) kev.ident;
624-
int widx = watch_find_by_hostfd(inst, host_fd);
625-
if (widx >= 0) {
626-
inotify_watch_t *w = &inst->watches[widx];
627-
uint32_t in_mask =
628-
notes_to_in_mask((uint32_t) kev.fflags, w->mask, w->is_dir);
629-
if (in_mask != 0) {
630-
queue_event(inst, w->wd, in_mask, 0, NULL);
631-
pipe_signal(inst);
632-
}
633-
}
776+
if (process_vnode_event(inst, host_fd, (uint32_t) kev.fflags) != 0)
777+
pipe_signal(inst);
634778
}
635779
}
636780

@@ -711,6 +855,11 @@ static void inotify_close(int guest_fd)
711855
watch_fds[nfds++] = inst->watches[i].host_fd;
712856
inst->watches[i].wd = 0;
713857
}
858+
free_dir_snapshot(inst->watches[i].entries, inst->watches[i].n_entries);
859+
inst->watches[i].entries = NULL;
860+
inst->watches[i].n_entries = 0;
861+
free(inst->watches[i].path);
862+
inst->watches[i].path = NULL;
714863
}
715864

716865
inst->guest_fd = -1;

0 commit comments

Comments
 (0)