1414# limitations under the License.
1515"""
1616
17+ import ctypes
18+ import os
1719import threading
1820import time
1921from concurrent .futures import ThreadPoolExecutor
@@ -99,6 +101,9 @@ def __init__(self, config: "FDConfig", local_rank: int, device_id: int):
99101
100102 self ._initialized = True
101103
104+ # NUMA binding flag
105+ self ._numa_bound = False
106+
102107 @property
103108 def write_policy (self ) -> Optional [str ]:
104109 """Get the write policy for cache operations."""
@@ -384,6 +389,126 @@ def initialize_mtp_kv_cache(
384389
385390 return cache_kvs_list
386391
392+ def _get_numa_node_for_gpu (self , device_id : int ) -> int :
393+ """
394+ Get the NUMA node closest to the specified GPU device.
395+
396+ Tries multiple methods in order:
397+ 1. nvidia-smi topo -C -i <gpu_id> (fastest and most reliable)
398+ 2. /sys/class/nvidia-gpu/ (direct sysfs)
399+ 3. /sys/bus/pci/devices/ (fallback)
400+
401+ Args:
402+ device_id: CUDA device ID.
403+
404+ Returns:
405+ NUMA node index, or -1 if cannot be determined.
406+ """
407+ try :
408+ # Method 1: Use nvidia-smi topo -C -i (fastest, SGLang-style)
409+ # This directly outputs the NUMA ID for the specific GPU
410+ try :
411+ import subprocess
412+
413+ result = subprocess .run (
414+ ["nvidia-smi" , "topo" , "-C" , "-i" , str (device_id )], capture_output = True , text = True , timeout = 5
415+ )
416+ if result .returncode == 0 :
417+ output_line = result .stdout .strip ()
418+ prefix = "NUMA IDs of closest CPU:"
419+ if output_line .startswith (prefix ):
420+ numa_str = output_line [len (prefix ) :].strip ()
421+ # Handle comma-separated or range values (e.g., "0" or "0,1" or "0-1")
422+ if numa_str :
423+ # Take the first NUMA node if multiple are listed
424+ first_numa = numa_str .split ("," )[0 ].split ("-" )[0 ].strip ()
425+ if first_numa .isdigit ():
426+ return int (first_numa )
427+ except (subprocess .TimeoutExpired , FileNotFoundError , Exception ) as e :
428+ logger .debug (f"[CacheController] nvidia-smi topo -C method failed: { e } " )
429+
430+ # Method 2: Try to read from /sys filesystem
431+ sys_path = f"/sys/class/nvidia-gpu/nvidia{ device_id } /device/numa_node"
432+ if os .path .exists (sys_path ):
433+ with open (sys_path , "r" ) as f :
434+ return int (f .read ().strip ())
435+
436+ # Method 3: Fallback - check all NVIDIA PCI devices
437+ import glob
438+
439+ numa_paths = glob .glob ("/sys/bus/pci/devices/*/numa_node" )
440+ for path in numa_paths :
441+ vendor_path = path .replace ("numa_node" , "vendor" )
442+ if os .path .exists (vendor_path ):
443+ with open (vendor_path , "r" ) as f :
444+ vendor = f .read ().strip ()
445+ if vendor == "0x10de" : # NVIDIA vendor ID
446+ with open (path , "r" ) as f :
447+ return int (f .read ().strip ())
448+
449+ return - 1
450+ except Exception as e :
451+ logger .debug (f"[CacheController] Failed to get NUMA node for GPU { device_id } : { e } " )
452+ return - 1
453+
454+ def _bind_to_closest_numa_node (self ) -> bool :
455+ """
456+ Bind current thread and memory allocation to the NUMA node closest to the GPU.
457+
458+ This should be called before allocating host memory to ensure the memory
459+ is allocated on the NUMA node local to the GPU, reducing cross-NUMA access
460+ latency during H2D transfers.
461+
462+ Returns:
463+ True if binding was successful, False otherwise.
464+ """
465+ if self ._numa_bound :
466+ return True
467+
468+ try :
469+ # Load libnuma
470+ try :
471+ libnuma = ctypes .CDLL ("libnuma.so.1" )
472+ except OSError :
473+ try :
474+ libnuma = ctypes .CDLL ("libnuma.so" )
475+ except OSError :
476+ logger .warning ("[CacheController] libnuma not found, NUMA binding skipped" )
477+ return False
478+
479+ # Check if NUMA is available
480+ if libnuma .numa_available () < 0 :
481+ logger .warning ("[CacheController] NUMA is not available on this system" )
482+ return False
483+
484+ # Get NUMA node for current GPU
485+ numa_node = self ._get_numa_node_for_gpu (self ._device_id )
486+
487+ if numa_node < 0 :
488+ logger .warning (f"[CacheController] Could not determine NUMA node for GPU { self ._device_id } " )
489+ return False
490+
491+ # Bind current thread to specific NUMA node
492+ # numa_run_on_node binds the current thread to run on the specified node
493+ result = libnuma .numa_run_on_node (numa_node )
494+ if result < 0 :
495+ logger .warning (f"[CacheController] numa_run_on_node({ numa_node } ) failed" )
496+ return False
497+
498+ # Set memory allocation preference to the specified NUMA node
499+ # This affects subsequent memory allocations (including cudaHostAlloc)
500+ libnuma .numa_set_preferred (numa_node )
501+
502+ self ._numa_bound = True
503+ logger .info (
504+ f"[CacheController] NUMA binding successful: " f"GPU { self ._device_id } bound to NUMA node { numa_node } "
505+ )
506+ return True
507+
508+ except Exception as e :
509+ logger .warning (f"[CacheController] NUMA binding failed: { e } " )
510+ return False
511+
387512 def initialize_host_cache (
388513 self ,
389514 attn_backend : Any ,
@@ -408,6 +533,11 @@ def initialize_host_cache(
408533 if len (self .host_cache_kvs_map ) > 0 :
409534 return
410535
536+ # Step 0: Bind to closest NUMA node before allocating host memory
537+ # This ensures subsequent cuda_host_alloc allocations are on the local NUMA node
538+ if not self ._numa_bound :
539+ self ._bind_to_closest_numa_node ()
540+
411541 # Get kv cache quantization type
412542 kv_cache_quant_type = self ._get_kv_cache_quant_type ()
413543
0 commit comments