Skip to content

Commit 5fcc9c0

Browse files
authored
Merge pull request #71 from sysprog21/mem
Short-circuit mprotect on matching tracker prot
2 parents 3fb7e18 + 41d43fd commit 5fcc9c0

7 files changed

Lines changed: 335 additions & 123 deletions

File tree

src/core/guest.c

Lines changed: 177 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1737,6 +1737,40 @@ static bool regions_mergeable(const guest_region_t *a, const guest_region_t *b)
17371737
return a->offset + (a->end - a->start) == b->offset;
17381738
}
17391739

1740+
/* First region whose start is >= start. regions[] is sorted by start. */
1741+
static int region_lower_bound_start(const guest_t *g, uint64_t start)
1742+
{
1743+
int lo = 0;
1744+
int hi = g->nregions;
1745+
1746+
while (lo < hi) {
1747+
int mid = lo + (hi - lo) / 2;
1748+
if (g->regions[mid].start < start)
1749+
lo = mid + 1;
1750+
else
1751+
hi = mid;
1752+
}
1753+
return lo;
1754+
}
1755+
1756+
/* First region whose end is > addr. See guest.h for the contract; also used
1757+
* inside this file to skip the untouched prefix for remove and set_prot.
1758+
*/
1759+
int guest_region_first_end_above(const guest_t *g, uint64_t addr)
1760+
{
1761+
int lo = 0;
1762+
int hi = g->nregions;
1763+
1764+
while (lo < hi) {
1765+
int mid = lo + (hi - lo) / 2;
1766+
if (g->regions[mid].end <= addr)
1767+
lo = mid + 1;
1768+
else
1769+
hi = mid;
1770+
}
1771+
return lo;
1772+
}
1773+
17401774
/* Merge region at index i with its right neighbor (i+1) when their layouts
17411775
* agree. No-op if i is the last region or layouts differ.
17421776
*/
@@ -1857,12 +1891,10 @@ int guest_region_add_ex_owned_gpa(guest_t *g,
18571891
return -1;
18581892
}
18591893

1860-
/* Find insertion point (keep sorted by start address) */
1861-
int i = g->nregions;
1862-
while (i > 0 && g->regions[i - 1].start > start) {
1863-
g->regions[i] = g->regions[i - 1];
1864-
i--;
1865-
}
1894+
/* Find insertion point (keep sorted by start address). */
1895+
int i = region_lower_bound_start(g, start);
1896+
memmove(&g->regions[i + 1], &g->regions[i],
1897+
(g->nregions - i) * sizeof(guest_region_t));
18661898

18671899
guest_region_t *r = &g->regions[i];
18681900
r->start = start;
@@ -1926,104 +1958,117 @@ int guest_preannounce(guest_t *g,
19261958

19271959
void guest_region_remove(guest_t *g, uint64_t start, uint64_t end)
19281960
{
1929-
int i = 0;
1930-
while (i < g->nregions) {
1931-
guest_region_t *r = &g->regions[i];
1961+
if (end <= start)
1962+
return;
19321963

1933-
/* No overlap: region is entirely before the removal range */
1934-
if (r->end <= start) {
1935-
i++;
1936-
continue;
1937-
}
1964+
/* In-place compaction: 'out' is the next output slot, 'in' is the next
1965+
* input slot. Since the prefix [0, first) is untouched (it sorts strictly
1966+
* before [start, end) by guest_region_first_end_above), both cursors begin
1967+
* at 'first'. The non-overlap invariant guarantees out <= in throughout the
1968+
* loop, so writes at g->regions[out] never clobber slots not yet read.
1969+
*/
1970+
int first = guest_region_first_end_above(g, start);
1971+
int out = first;
1972+
int in = first;
19381973

1939-
/* No overlap: region is entirely after the removal range */
1974+
while (in < g->nregions) {
1975+
guest_region_t *r = &g->regions[in];
19401976
if (r->start >= end)
1941-
break; /* sorted, so done */
1942-
1943-
/* Full containment: remove the entire region */
1944-
if (r->start >= start && r->end <= end) {
1945-
if (r->backing_fd >= 0)
1946-
close(r->backing_fd);
1947-
memmove(&g->regions[i], &g->regions[i + 1],
1948-
(g->nregions - i - 1) * sizeof(guest_region_t));
1949-
g->nregions--;
1950-
continue; /* do not increment i */
1951-
}
1952-
1953-
/* Partial overlap: removal range cuts the beginning */
1954-
if (r->start >= start && r->end > end) {
1955-
uint64_t trimmed = end - r->start;
1956-
r->offset += trimmed;
1957-
r->gpa_base += trimmed;
1958-
r->start = end;
1959-
guest_region_clip_overlay(r);
1960-
i++;
1961-
continue;
1962-
}
1977+
break;
19631978

1964-
/* Partial overlap: removal range cuts the end */
1965-
if (r->start < start && r->end > start && r->end <= end) {
1966-
r->end = start;
1967-
guest_region_clip_overlay(r);
1968-
i++;
1969-
continue;
1970-
}
1979+
bool keep_left = r->start < start;
1980+
bool keep_right = r->end > end;
19711981

1972-
/* Split: removal range is entirely inside the region */
1973-
if (r->start < start && r->end > end) {
1974-
/* Need to split into two regions: [r->start, start) and [end,
1975-
* r->end)
1976-
*/
1982+
/* Interior split: removal range lies strictly inside *r, producing
1983+
* two output entries from one input slot. This is the only growth
1984+
* path; handle it explicitly so the untouched suffix is shifted out
1985+
* of the way before either half is written. After this branch no
1986+
* further input regions can overlap [start, end), so the loop is
1987+
* done.
1988+
*/
1989+
if (keep_left && keep_right) {
19771990
if (g->nregions >= GUEST_MAX_REGIONS) {
1978-
/* Region table is full; trim to [r->start, start) and drop
1979-
* the tail. The tail [end, r->end) becomes untracked in
1980-
* /proc/self/maps but remains mapped in page tables.
1991+
/* Table full: drop the tail [end, r->end) and fall through to
1992+
* the simple "trim end" treatment of *r. The tail stays mapped
1993+
* in page tables but is now untracked, so a later mprotect over
1994+
* that range would otherwise see vacuously uniform prot in the
1995+
* tracker and skip PTE work. Mark the tracker permanently stale
1996+
* to disarm the mprotect fast path for the lifetime of the
1997+
* process.
19811998
*/
19821999
log_error(
19832000
"guest: region table full, "
19842001
"munmap split drops tail [0x%llx-0x%llx)",
19852002
(unsigned long long) end, (unsigned long long) r->end);
1986-
r->end = start;
1987-
i++;
1988-
continue;
1989-
}
1990-
/* Make room for the new region after i */
1991-
memmove(&g->regions[i + 2], &g->regions[i + 1],
1992-
(g->nregions - i - 1) * sizeof(guest_region_t));
1993-
1994-
/* Right half: [end, old_end) */
1995-
guest_region_t *right = &g->regions[i + 1];
1996-
*right = *r; /* Copy attributes */
1997-
right->offset += (end - r->start);
1998-
right->gpa_base += (end - r->start);
1999-
right->start = end;
2000-
if (r->backing_fd >= 0) {
2001-
/* A dup failure leaves backing_fd=-1, silently converting this
2002-
* half to anonymous semantics (msync and MADV_DONTNEED skip
2003-
* regions with backing_fd<0). Propagating the error would
2004-
* require making all region split callers (mprotect, munmap)
2005-
* fallible.
2006-
*/
2007-
right->backing_fd = dup(r->backing_fd);
2008-
if (right->backing_fd < 0)
2009-
log_error(
2010-
"guest: dup() failed for region split "
2011-
"backing fd %d: %s",
2012-
r->backing_fd, strerror(errno));
2013-
}
2003+
g->regions_tracker_stale = true;
2004+
keep_right = false;
2005+
} else {
2006+
guest_region_t orig = *r;
2007+
int suffix_count = g->nregions - in - 1;
2008+
if (suffix_count > 0)
2009+
memmove(&g->regions[out + 2], &g->regions[in + 1],
2010+
suffix_count * sizeof(guest_region_t));
2011+
2012+
guest_region_t left = orig;
2013+
left.end = start;
2014+
guest_region_clip_overlay(&left);
2015+
g->regions[out] = left;
2016+
2017+
guest_region_t right = orig;
2018+
uint64_t trimmed = end - orig.start;
2019+
right.offset += trimmed;
2020+
right.gpa_base += trimmed;
2021+
right.start = end;
2022+
if (orig.backing_fd >= 0) {
2023+
right.backing_fd = dup(orig.backing_fd);
2024+
if (right.backing_fd < 0)
2025+
log_error(
2026+
"guest: dup() failed for region split "
2027+
"backing fd %d: %s",
2028+
orig.backing_fd, strerror(errno));
2029+
}
2030+
guest_region_clip_overlay(&right);
2031+
g->regions[out + 1] = right;
20142032

2015-
/* Left half keeps the original entry and shortens its end. */
2016-
r->end = start;
2017-
guest_region_clip_overlay(r);
2018-
guest_region_clip_overlay(right);
2033+
g->nregions = out + 2 + suffix_count;
2034+
return;
2035+
}
2036+
}
20192037

2020-
g->nregions++;
2021-
i += 2; /* skip both halves */
2038+
if (!keep_left && !keep_right) {
2039+
if (r->backing_fd >= 0)
2040+
close(r->backing_fd);
2041+
in++;
20222042
continue;
20232043
}
20242044

2025-
i++;
2045+
/* Trim-only paths: exactly one of keep_left and keep_right is true. Build the
2046+
* surviving half from the source slot, then publish it to g->regions
2047+
* [out]. The original backing_fd transfers to whichever half survives;
2048+
* no dup is needed because only one half remains.
2049+
*/
2050+
guest_region_t survivor = *r;
2051+
if (keep_left) {
2052+
survivor.end = start;
2053+
} else {
2054+
uint64_t trimmed = end - r->start;
2055+
survivor.offset += trimmed;
2056+
survivor.gpa_base += trimmed;
2057+
survivor.start = end;
2058+
}
2059+
guest_region_clip_overlay(&survivor);
2060+
g->regions[out++] = survivor;
2061+
in++;
20262062
}
2063+
2064+
/* Append the unread suffix (regions whose start >= end) after the
2065+
* compacted overlap area, shifting only if compaction left a hole.
2066+
*/
2067+
int tail = g->nregions - in;
2068+
if (tail > 0 && out != in)
2069+
memmove(&g->regions[out], &g->regions[in],
2070+
tail * sizeof(guest_region_t));
2071+
g->nregions = out + tail;
20272072
}
20282073

20292074
const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr)
@@ -2043,6 +2088,35 @@ const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr)
20432088
return NULL;
20442089
}
20452090

2091+
bool guest_region_range_prot_uniform(const guest_t *g,
2092+
uint64_t start,
2093+
uint64_t end,
2094+
int prot)
2095+
{
2096+
for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) {
2097+
const guest_region_t *r = &g->regions[i];
2098+
if (r->start >= end)
2099+
break;
2100+
if (r->prot != prot)
2101+
return false;
2102+
}
2103+
return true;
2104+
}
2105+
2106+
bool guest_region_range_has_noreserve(const guest_t *g,
2107+
uint64_t start,
2108+
uint64_t end)
2109+
{
2110+
for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) {
2111+
const guest_region_t *r = &g->regions[i];
2112+
if (r->start >= end)
2113+
break;
2114+
if (r->noreserve)
2115+
return true;
2116+
}
2117+
return false;
2118+
}
2119+
20462120
void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
20472121
{
20482122
/* Walk regions overlapping [start, end), split at boundaries, update prot.
@@ -2051,20 +2125,28 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
20512125
*/
20522126
int first_modified = -1, last_modified = -1;
20532127

2054-
for (int i = 0; i < g->nregions; i++) {
2128+
/* The prefix skip ensures regions[i].end > start for i >= first; the
2129+
* non-overlap invariant carries it through all later iterations.
2130+
*/
2131+
for (int i = guest_region_first_end_above(g, start); i < g->nregions; i++) {
20552132
guest_region_t *r = &g->regions[i];
2056-
if (r->end <= start)
2057-
continue;
20582133
if (r->start >= end)
20592134
break;
20602135

20612136
/* If region extends before start, split at start */
20622137
if (r->start < start) {
20632138
if (g->nregions >= GUEST_MAX_REGIONS) {
2139+
/* The region keeps its old prot in the tracker, but PTEs for
2140+
* [start, r->end) have already been updated. Mark the tracker
2141+
* permanently stale so the mprotect fast path falls back to
2142+
* unconditional PTE work and cannot be fooled by a tracker
2143+
* that lags actual PTE state.
2144+
*/
20642145
log_error(
20652146
"guest: region table full, "
20662147
"mprotect split skipped at 0x%llx",
20672148
(unsigned long long) start);
2149+
g->regions_tracker_stale = true;
20682150
continue;
20692151
}
20702152
memmove(&g->regions[i + 1], &g->regions[i],
@@ -2094,15 +2176,18 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
20942176
/* If region extends past end, split at end */
20952177
if (r->end > end) {
20962178
if (g->nregions >= GUEST_MAX_REGIONS) {
2097-
/* Split failure applies prot to the whole region.
2098-
* The tail [end, r->end) gets new prot too.
2179+
/* Over-apply prot to the whole region: the tail [end, r->end)
2180+
* now claims new prot in the tracker even though PTE work
2181+
* did not cover it. Mark the tracker stale so the mprotect
2182+
* fast path stops trusting prot uniformity.
20992183
*/
21002184
log_error(
21012185
"guest: region table full, "
21022186
"mprotect split skipped at 0x%llx "
21032187
"(region [0x%llx-0x%llx) gets prot %d entirely)",
21042188
(unsigned long long) end, (unsigned long long) r->start,
21052189
(unsigned long long) r->end, prot);
2190+
g->regions_tracker_stale = true;
21062191
r->prot = prot;
21072192
if (first_modified < 0)
21082193
first_modified = i;

src/core/guest.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,14 @@ typedef struct {
494494
/* Semantic region tracking for munmap/mprotect/proc-self-maps */
495495
guest_region_t regions[GUEST_MAX_REGIONS];
496496
int nregions; /* Number of active regions */
497+
/* Sticky flag set when guest_region_set_prot or guest_region_remove could not fully honor a request
498+
* because the region table was full. After this point the tracker no
499+
* longer faithfully reflects PTE state, so the mprotect fast path must
500+
* fall back to unconditional PTE work. Propagated across fork IPC with
501+
* the semantic region snapshot so children inherit the same fast-path
502+
* guard as the parent.
503+
*/
504+
bool regions_tracker_stale;
497505
guest_region_t preannounced[GUEST_MAX_PREANNOUNCED];
498506
int npreannounced; /* /proc/self/maps-only shadow regions */
499507

@@ -1174,6 +1182,32 @@ const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr);
11741182
*/
11751183
void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot);
11761184

1185+
/* Index of the first region whose end is strictly above addr. Earlier
1186+
* regions sort entirely below addr (regions[] is start-sorted and
1187+
* non-overlapping, so ends are monotonic). Callers use this to skip the
1188+
* untouched prefix in O(log n) before a linear walk over the overlap.
1189+
*/
1190+
int guest_region_first_end_above(const guest_t *g, uint64_t addr);
1191+
1192+
/* True if every tracked region overlapping [start, end) already has prot.
1193+
* Returns true vacuously when no region overlaps the range, since callers
1194+
* use this to decide whether page-table maintenance can be skipped and an
1195+
* untracked sub-range has no PTEs of its own to update.
1196+
*/
1197+
bool guest_region_range_prot_uniform(const guest_t *g,
1198+
uint64_t start,
1199+
uint64_t end,
1200+
int prot);
1201+
1202+
/* True if any tracked region overlapping [start, end) is MAP_NORESERVE.
1203+
* Callers must run page-table maintenance for such ranges even when prot
1204+
* already matches, because lazy materialization may have produced PTEs
1205+
* that need re-permissioning.
1206+
*/
1207+
bool guest_region_range_has_noreserve(const guest_t *g,
1208+
uint64_t start,
1209+
uint64_t end);
1210+
11771211
/* Try to materialize a lazy (MAP_NORESERVE) page at the given offset.
11781212
* Called from the data/instruction abort handler when the faulting address
11791213
* falls within a noreserve region. Creates page table entries for one 2MiB

0 commit comments

Comments
 (0)