[None][feat] Parallelize host KV cache pool prefault and add THP control (#15431)

nafis271 · thorjohnsen · web-flow · commit c9b6518c8a6c · 2026-06-29T12:15:30.000-04:00
Signed-off-by: Md Nafis Ul Haque Shifat <nafis@deepinfra.com>
Co-authored-by: Thor Johnsen <41591019+thorjohnsen@users.noreply.github.com>
diff --git a/tensorrt_llm/runtime/kv_cache_manager_v2/_utils.py b/tensorrt_llm/runtime/kv_cache_manager_v2/_utils.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import array
+import concurrent.futures
 import ctypes
 import errno
 import functools
@@ -394,6 +395,21 @@ def find_index(seq: Iterable[T], predicate: Callable[[T], bool]) -> int:
 _libc.posix_fallocate.argtypes = [ctypes.c_int, ctypes.c_longlong, ctypes.c_longlong]
 
 MADV_HUGEPAGE: Final[int] = 14
+MADV_NOHUGEPAGE: Final[int] = 15  # Linux madvise advice: no THP for the range
+MADV_POPULATE_WRITE: Final[int] = 23  # Linux >= 5.14: prefault pages writable
+
+# TLLM_KV_CACHE_MANAGER_V2_THP=0 backs host pools with regular 4KB pages
+# (MADV_NOHUGEPAGE). On nodes with fragmented physical memory and THP
+# defrag=madvise, every 2MB THP fault stalls in direct compaction that
+# rarely succeeds, slowing pool population from GB/s to GB/min.
+USE_THP: Final[bool] = os.environ.get("TLLM_KV_CACHE_MANAGER_V2_THP", "1") == "1"
+# TLLM_KV_CACHE_MANAGER_V2_PREFAULT_THREADS=0 disables prefaulting; pages are
+# then faulted in lazily, single-threaded, inside cuMemHostRegister. The default
+# is clamped to >= 1 so a single-CPU host (1 // 2 == 0) still prefaults.
+_DEFAULT_PREFAULT_THREADS: Final[int] = max(1, min(64, (os.cpu_count() or 32) // 2))
+PREFAULT_THREADS: Final[int] = int(
+    os.environ.get("TLLM_KV_CACHE_MANAGER_V2_PREFAULT_THREADS", _DEFAULT_PREFAULT_THREADS)
+)
 
 
 def _madvise(ptr: int, size: int, advice: int) -> None:
@@ -508,8 +524,10 @@ def __init__(self, size: int) -> None:
 
         # Opportunistically advise huge pages for the whole range.
         # The kernel will use huge pages for aligned 2MB chunks within this range.
-        _madvise(self._address, self._size, MADV_HUGEPAGE)
+        _madvise(self._address, self._size, MADV_HUGEPAGE if USE_THP else MADV_NOHUGEPAGE)
 
+        if PREFAULT_THREADS > 0:
+            self._parallel_prefault(PREFAULT_THREADS)
         self._register_to_cuda()
 
     def resize(self, new_size: int) -> None:
@@ -519,8 +537,8 @@ def resize(self, new_size: int) -> None:
             assert self._address % self.ALIGNMENT == 0
             self._size = new_size
 
-            # Re-advise HUGEPAGE for the new range
-            _madvise(self._address, self._size, MADV_HUGEPAGE)
+            # Re-advise the configured page mode for the new range.
+            _madvise(self._address, self._size, MADV_HUGEPAGE if USE_THP else MADV_NOHUGEPAGE)
         finally:
             self._register_to_cuda()
 
@@ -535,6 +553,50 @@ def destroy(self) -> None:
     def __del__(self) -> None:
         self.destroy()
 
+    def _parallel_prefault(self, nthreads: int) -> None:
+        """Populate every page of the pool from ``nthreads`` worker threads.
+
+        Runs ahead of cuMemHostRegister so that registration only has to pin
+        pages that already exist. Left to fault lazily inside
+        cuMemHostRegister, population is single-threaded and can take minutes
+        for multi-hundred-GiB pools under memory pressure or THP compaction
+        stalls. Work is split into small MADV_POPULATE_WRITE calls: that
+        bounds each mmap_lock hold (one giant madvise per thread serializes
+        the other threads behind it) and lets the workers load-balance.
+        """
+        step = 512 << 20
+
+        def fault_in(start: int) -> None:
+            span = min(step, self._size - start)
+            if span <= 0:
+                return
+            status = _libc.madvise(
+                ctypes.c_void_p(self._address + start),
+                ctypes.c_size_t(span),
+                ctypes.c_int(MADV_POPULATE_WRITE),
+            )
+            if status == 0:
+                return
+            err = ctypes.get_errno()
+            if err in (errno.EINVAL, getattr(errno, "ENOSYS", -1)):
+                # Kernels older than Linux 5.14 lack MADV_POPULATE_WRITE;
+                # write to the range instead so every page gets touched.
+                ctypes.memset(self._address + start, 0, span)
+                return
+            name = errno.errorcode.get(err, "Unknown error")
+            if err != errno.ENOMEM:
+                raise OSError(err, f"madvise(MADV_POPULATE_WRITE) failed: {name}")
+            # Report a genuine allocation failure as-is; falling back to
+            # memset here would instead provoke a system OOM kill.
+            raise HostOOMError(
+                f"madvise(MADV_POPULATE_WRITE) failed with errno {err}: {name}"
+            )
+
+        offsets = range(0, self._size, step)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=nthreads) as pool:
+            for _ in pool.map(fault_in, offsets):
+                pass
+
     def _register_to_cuda(self) -> None:
         assert self._num_registered_chunks == 0
         for addr, size in self._iterate_chunks():