@@ -1737,6 +1737,40 @@ static bool regions_mergeable(const guest_region_t *a, const guest_region_t *b)
17371737 return a -> offset + (a -> end - a -> start ) == b -> offset ;
17381738}
17391739
1740+ /* Index of the first region whose start is >= 'start', or g->nregions if there is none. Binary search: regions[] is sorted by start. */
1741+ static int region_lower_bound_start (const guest_t * g , uint64_t start )
1742+ {
1743+ int lo = 0 ;
1744+ int hi = g -> nregions ; /* the answer lies in the index range [lo, hi] */
1745+
1746+ while (lo < hi ) {
1747+ int mid = lo + (hi - lo ) / 2 ; /* midpoint without forming lo + hi */
1748+ if (g -> regions [mid ].start < start )
1749+ lo = mid + 1 ; /* mid starts below 'start': answer is to its right */
1750+ else
1751+ hi = mid ; /* mid qualifies: keep it as a candidate */
1752+ }
1753+ return lo ; /* lo == hi here; equals g->nregions when no region qualifies */
1754+ }
1755+
1756+ /* Index of the first region whose end is > addr, or g->nregions if there is none. See guest.h for the contract; also used
1757+ * inside this file to skip the untouched prefix for remove, set_prot and the range queries.
1758+ */
1759+ int guest_region_first_end_above (const guest_t * g , uint64_t addr )
1760+ {
1761+ int lo = 0 ;
1762+ int hi = g -> nregions ; /* the answer lies in the index range [lo, hi] */
1763+
1764+ while (lo < hi ) {
1765+ int mid = lo + (hi - lo ) / 2 ; /* midpoint without forming lo + hi */
1766+ if (g -> regions [mid ].end <= addr )
1767+ lo = mid + 1 ; /* mid ends at or before addr: answer is to its right */
1768+ else
1769+ hi = mid ; /* mid ends above addr: keep it as a candidate */
1770+ }
1771+ return lo ; /* lo == hi here; equals g->nregions when no region qualifies */
1772+ }
1773+
17401774/* Merge region at index i with its right neighbor (i+1) when their layouts
17411775 * agree. No-op if i is the last region or layouts differ.
17421776 */
@@ -1857,12 +1891,10 @@ int guest_region_add_ex_owned_gpa(guest_t *g,
18571891 return -1 ;
18581892 }
18591893
1860- /* Find insertion point (keep sorted by start address) */
1861- int i = g -> nregions ;
1862- while (i > 0 && g -> regions [i - 1 ].start > start ) {
1863- g -> regions [i ] = g -> regions [i - 1 ];
1864- i -- ;
1865- }
1894+ /* Find insertion point (keep sorted by start address) and shift the tail up one slot to open it. */
1895+ int i = region_lower_bound_start (g , start );
1896+ memmove (& g -> regions [i + 1 ], & g -> regions [i ],
1897+ (g -> nregions - i ) * sizeof (guest_region_t ));
18661898
18671899 guest_region_t * r = & g -> regions [i ];
18681900 r -> start = start ;
@@ -1926,104 +1958,117 @@ int guest_preannounce(guest_t *g,
19261958
19271959void guest_region_remove (guest_t * g , uint64_t start , uint64_t end )
19281960{
1929- int i = 0 ;
1930- while (i < g -> nregions ) {
1931- guest_region_t * r = & g -> regions [i ];
1961+ if (end <= start )
1962+ return ;
19321963
1933- /* No overlap: region is entirely before the removal range */
1934- if (r -> end <= start ) {
1935- i ++ ;
1936- continue ;
1937- }
1964+ /* In-place compaction: 'out' is the next output slot, 'in' is the next
1965+ * input slot. The prefix [0, first) is untouched: every region in it ends at
1966+ * or before 'start' (see guest_region_first_end_above), so both cursors begin
1967+ * at 'first'. The non-overlap invariant guarantees out <= in throughout the
1968+ * loop, so writes at g->regions[out] never clobber slots not yet read.
1969+ */
1970+ int first = guest_region_first_end_above (g , start );
1971+ int out = first ;
1972+ int in = first ;
19381973
1939- /* No overlap: region is entirely after the removal range */
1974+ while (in < g -> nregions ) {
1975+ guest_region_t * r = & g -> regions [in ];
19401976 if (r -> start >= end )
1941- break ; /* sorted, so done */
1942-
1943- /* Full containment: remove the entire region */
1944- if (r -> start >= start && r -> end <= end ) {
1945- if (r -> backing_fd >= 0 )
1946- close (r -> backing_fd );
1947- memmove (& g -> regions [i ], & g -> regions [i + 1 ],
1948- (g -> nregions - i - 1 ) * sizeof (guest_region_t ));
1949- g -> nregions -- ;
1950- continue ; /* do not increment i */
1951- }
1952-
1953- /* Partial overlap: removal range cuts the beginning */
1954- if (r -> start >= start && r -> end > end ) {
1955- uint64_t trimmed = end - r -> start ;
1956- r -> offset += trimmed ;
1957- r -> gpa_base += trimmed ;
1958- r -> start = end ;
1959- guest_region_clip_overlay (r );
1960- i ++ ;
1961- continue ;
1962- }
1977+ break ;
19631978
1964- /* Partial overlap: removal range cuts the end */
1965- if (r -> start < start && r -> end > start && r -> end <= end ) {
1966- r -> end = start ;
1967- guest_region_clip_overlay (r );
1968- i ++ ;
1969- continue ;
1970- }
1979+ bool keep_left = r -> start < start ;
1980+ bool keep_right = r -> end > end ;
19711981
1972- /* Split: removal range is entirely inside the region */
1973- if (r -> start < start && r -> end > end ) {
1974- /* Need to split into two regions: [r->start, start) and [end,
1975- * r->end)
1976- */
1982+ /* Interior split: removal range lies strictly inside *r, producing
1983+ * two output entries from one input slot. This is the only growth
1984+ * path; handle it explicitly so the untouched suffix is shifted out
1985+ * of the way before either half is written. After this branch no
1986+ * further input regions can overlap [start, end), so the loop is
1987+ * done.
1988+ */
1989+ if (keep_left && keep_right ) {
19771990 if (g -> nregions >= GUEST_MAX_REGIONS ) {
1978- /* Region table is full; trim to [r->start, start) and drop
1979- * the tail. The tail [end, r->end) becomes untracked in
1980- * /proc/self/maps but remains mapped in page tables.
1991+ /* Table full: drop the tail [end, r->end) and fall through to
1992+ * the simple "trim end" treatment of *r. The tail stays mapped
1993+ * in page tables but is now untracked, so a later mprotect over
1994+ * that range would otherwise see vacuously uniform prot in the
1995+ * tracker and skip PTE work. Mark the tracker permanently stale
1996+ * to disarm the mprotect fast path for the lifetime of the
1997+ * process.
19811998 */
19821999 log_error (
19832000 "guest: region table full, "
19842001 "munmap split drops tail [0x%llx-0x%llx)" ,
19852002 (unsigned long long ) end , (unsigned long long ) r -> end );
1986- r -> end = start ;
1987- i ++ ;
1988- continue ;
1989- }
1990- /* Make room for the new region after i */
1991- memmove (& g -> regions [i + 2 ], & g -> regions [i + 1 ],
1992- (g -> nregions - i - 1 ) * sizeof (guest_region_t ));
1993-
1994- /* Right half: [end, old_end) */
1995- guest_region_t * right = & g -> regions [i + 1 ];
1996- * right = * r ; /* Copy attributes */
1997- right -> offset += (end - r -> start );
1998- right -> gpa_base += (end - r -> start );
1999- right -> start = end ;
2000- if (r -> backing_fd >= 0 ) {
2001- /* A dup failure leaves backing_fd=-1, silently converting this
2002- * half to anonymous semantics (msync and MADV_DONTNEED skip
2003- * regions with backing_fd<0). Propagating the error would
2004- * require making all region split callers (mprotect, munmap)
2005- * fallible.
2006- */
2007- right -> backing_fd = dup (r -> backing_fd );
2008- if (right -> backing_fd < 0 )
2009- log_error (
2010- "guest: dup() failed for region split "
2011- "backing fd %d: %s" ,
2012- r -> backing_fd , strerror (errno ));
2013- }
2003+ g -> regions_tracker_stale = true;
2004+ keep_right = false;
2005+ } else {
2006+ guest_region_t orig = * r ;
2007+ int suffix_count = g -> nregions - in - 1 ;
2008+ if (suffix_count > 0 )
2009+ memmove (& g -> regions [out + 2 ], & g -> regions [in + 1 ],
2010+ suffix_count * sizeof (guest_region_t ));
2011+
2012+ guest_region_t left = orig ;
2013+ left .end = start ;
2014+ guest_region_clip_overlay (& left );
2015+ g -> regions [out ] = left ;
2016+
2017+ guest_region_t right = orig ;
2018+ uint64_t trimmed = end - orig .start ;
2019+ right .offset += trimmed ;
2020+ right .gpa_base += trimmed ;
2021+ right .start = end ;
2022+ if (orig .backing_fd >= 0 ) {
2023+ right .backing_fd = dup (orig .backing_fd );
2024+ if (right .backing_fd < 0 )
2025+ log_error (
2026+ "guest: dup() failed for region split "
2027+ "backing fd %d: %s" ,
2028+ orig .backing_fd , strerror (errno ));
2029+ }
2030+ guest_region_clip_overlay (& right );
2031+ g -> regions [out + 1 ] = right ;
20142032
2015- /* Left half keeps the original entry and shortens its end. */
2016- r -> end = start ;
2017- guest_region_clip_overlay ( r );
2018- guest_region_clip_overlay ( right );
2033+ g -> nregions = out + 2 + suffix_count ;
2034+ return ;
2035+ }
2036+ }
20192037
2020- g -> nregions ++ ;
2021- i += 2 ; /* skip both halves */
2038+ if (!keep_left && !keep_right ) {
2039+ if (r -> backing_fd >= 0 )
2040+ close (r -> backing_fd );
2041+ in ++ ;
20222042 continue ;
20232043 }
20242044
2025- i ++ ;
2045+ /* Trim-only paths: exactly one of keep_left and keep_right is true. Build
2046+ * the surviving half from the source slot, then publish it to
2047+ * g->regions[out]. The original backing_fd transfers to whichever half
2048+ * survives; no dup is needed because only one half remains.
2049+ */
2050+ guest_region_t survivor = * r ;
2051+ if (keep_left ) {
2052+ survivor .end = start ;
2053+ } else {
2054+ uint64_t trimmed = end - r -> start ;
2055+ survivor .offset += trimmed ;
2056+ survivor .gpa_base += trimmed ;
2057+ survivor .start = end ;
2058+ }
2059+ guest_region_clip_overlay (& survivor );
2060+ g -> regions [out ++ ] = survivor ;
2061+ in ++ ;
20262062 }
2063+
2064+ /* Append the unread suffix (regions whose start >= end) after the
2065+ * compacted overlap area, shifting only if compaction left a hole.
2066+ */
2067+ int tail = g -> nregions - in ;
2068+ if (tail > 0 && out != in )
2069+ memmove (& g -> regions [out ], & g -> regions [in ],
2070+ tail * sizeof (guest_region_t ));
2071+ g -> nregions = out + tail ;
20272072}
20282073
20292074const guest_region_t * guest_region_find (const guest_t * g , uint64_t addr )
@@ -2043,6 +2088,35 @@ const guest_region_t *guest_region_find(const guest_t *g, uint64_t addr)
20432088 return NULL ;
20442089}
20452090
2091+ bool guest_region_range_prot_uniform (const guest_t * g ,
2092+ uint64_t start ,
2093+ uint64_t end ,
2094+ int prot )
2095+ { /* True when every tracked region overlapping [start, end) has prot equal to 'prot'. */
2096+ for (int i = guest_region_first_end_above (g , start ); i < g -> nregions ; i ++ ) {
2097+ const guest_region_t * r = & g -> regions [i ];
2098+ if (r -> start >= end )
2099+ break ; /* this region begins at or after 'end': stop scanning */
2100+ if (r -> prot != prot )
2101+ return false;
2102+ }
2103+ return true; /* also reached when no region overlaps the range at all */
2104+ }
2105+
2106+ bool guest_region_range_has_noreserve (const guest_t * g ,
2107+ uint64_t start ,
2108+ uint64_t end )
2109+ { /* True when at least one tracked region overlapping [start, end) has noreserve set. */
2110+ for (int i = guest_region_first_end_above (g , start ); i < g -> nregions ; i ++ ) {
2111+ const guest_region_t * r = & g -> regions [i ];
2112+ if (r -> start >= end )
2113+ break ; /* this region begins at or after 'end': stop scanning */
2114+ if (r -> noreserve )
2115+ return true;
2116+ }
2117+ return false; /* no overlapping region had noreserve set (or none overlapped) */
2118+ }
2118+ }
2119+
20462120void guest_region_set_prot (guest_t * g , uint64_t start , uint64_t end , int prot )
20472121{
20482122 /* Walk regions overlapping [start, end), split at boundaries, update prot.
@@ -2051,20 +2125,28 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
20512125 */
20522126 int first_modified = -1 , last_modified = -1 ;
20532127
2054- for (int i = 0 ; i < g -> nregions ; i ++ ) {
2128+ /* The prefix skip ensures regions[i].end > start at the starting index; the
2129+ * non-overlap invariant carries it through all later iterations.
2130+ */
2131+ for (int i = guest_region_first_end_above (g , start ); i < g -> nregions ; i ++ ) {
20552132 guest_region_t * r = & g -> regions [i ];
2056- if (r -> end <= start )
2057- continue ;
20582133 if (r -> start >= end )
20592134 break ;
20602135
20612136 /* If region extends before start, split at start */
20622137 if (r -> start < start ) {
20632138 if (g -> nregions >= GUEST_MAX_REGIONS ) {
2139+ /* The region keeps its old prot in the tracker, but PTEs for
2140+ * [start, r->end) have already been updated. Mark the tracker
2141+ * permanently stale so the mprotect fast path falls back to
2142+ * unconditional PTE work and cannot be fooled by a tracker
2143+ * that lags actual PTE state.
2144+ */
20642145 log_error (
20652146 "guest: region table full, "
20662147 "mprotect split skipped at 0x%llx" ,
20672148 (unsigned long long ) start );
2149+ g -> regions_tracker_stale = true;
20682150 continue ;
20692151 }
20702152 memmove (& g -> regions [i + 1 ], & g -> regions [i ],
@@ -2094,15 +2176,18 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
20942176 /* If region extends past end, split at end */
20952177 if (r -> end > end ) {
20962178 if (g -> nregions >= GUEST_MAX_REGIONS ) {
2097- /* Split failure applies prot to the whole region.
2098- * The tail [end, r->end) gets new prot too.
2179+ /* Over-apply prot to the whole region: the tail [end, r->end)
2180+ * now claims new prot in the tracker even though PTE work
2181+ * did not cover it. Mark the tracker stale so the mprotect
2182+ * fast path stops trusting prot uniformity.
20992183 */
21002184 log_error (
21012185 "guest: region table full, "
21022186 "mprotect split skipped at 0x%llx "
21032187 "(region [0x%llx-0x%llx) gets prot %d entirely)" ,
21042188 (unsigned long long ) end , (unsigned long long ) r -> start ,
21052189 (unsigned long long ) r -> end , prot );
2190+ g -> regions_tracker_stale = true;
21062191 r -> prot = prot ;
21072192 if (first_modified < 0 )
21082193 first_modified = i ;
0 commit comments