3131from fastdeploy .utils import get_host_ip
3232
# Size defaults and limits; every value is a byte count.
DEFAULT_GLOBAL_SEGMENT_SIZE = 1 << 30  # 1 GiB
DEFAULT_LOCAL_BUFFER_SIZE = 1 << 20  # 1 MiB
DEFAULT_MC_MAX_MR_SIZE = 4 << 30  # 4 GiB, used when MC_MAX_MR_SIZE is unset
MIN_MC_MAX_MR_SIZE = 1 << 30  # 1 GiB, lower clamp applied to MC_MAX_MR_SIZE
MAX_MC_MAX_MR_SIZE = 6 << 30  # 6 GiB, upper clamp applied to MC_MAX_MR_SIZE
38+
39+
def byte_to_gb(byte):
    """Convert a byte count to gibibytes (1 GiB = 1024 ** 3 bytes).

    Args:
        byte: Size in bytes.

    Returns:
        The size as a float number of GiB (true division, not rounded).
    """
    bytes_per_gb = 1024 ** 3
    return byte / bytes_per_gb
3542
3643
3744@dataclass
@@ -111,9 +118,25 @@ def __init__(self, tp_rank=None):
111118 host_ip = get_host_ip ()
112119 os .environ ["MC_TCP_BIND_ADDRESS" ] = host_ip
113120 logger .info (f"Set MC_TCP_BIND_ADDRESS to { host_ip } " )
114- if os .environ .get ("MC_MAX_MR_SIZE" ) is None :
115- os .environ ["MC_MAX_MR_SIZE" ] = "4294967296" # 4GB
116- logger .info ("MC_MAX_MR_SIZE is not set, default to 4GB." )
121+
122+ # Set MC_MAX_MR_SIZE for mooncake store to control the maximum mr size
123+ self .mc_max_mr_size = int (os .environ .get ("MC_MAX_MR_SIZE" , 0 ))
124+ if self .mc_max_mr_size == 0 :
125+ self .mc_max_mr_size = DEFAULT_MC_MAX_MR_SIZE
126+ logger .info (f"MC_MAX_MR_SIZE is not set, default to { byte_to_gb (DEFAULT_MC_MAX_MR_SIZE )} GB." )
127+ elif self .mc_max_mr_size < MIN_MC_MAX_MR_SIZE :
128+ self .mc_max_mr_size = MIN_MC_MAX_MR_SIZE
129+ logger .info (
130+ f"MC_MAX_MR_SIZE is smaller than { byte_to_gb (MIN_MC_MAX_MR_SIZE )} GB, set to { byte_to_gb (MIN_MC_MAX_MR_SIZE )} GB."
131+ )
132+ elif self .mc_max_mr_size > MAX_MC_MAX_MR_SIZE :
133+ self .mc_max_mr_size = MAX_MC_MAX_MR_SIZE
134+ logger .info (
135+ f"MC_MAX_MR_SIZE is larger than { byte_to_gb (MAX_MC_MAX_MR_SIZE )} GB, set to { byte_to_gb (MAX_MC_MAX_MR_SIZE )} GB."
136+ )
137+ else :
138+ logger .info (f"MC_MAX_MR_SIZE is set to { self .mc_max_mr_size } bytes." )
139+ os .environ ["MC_MAX_MR_SIZE" ] = str (self .mc_max_mr_size )
117140
118141 try :
119142 from mooncake .store import MooncakeDistributedStore
@@ -129,6 +152,11 @@ def __init__(self, tp_rank=None):
129152 self .config = MooncakeStoreConfig .create ()
130153 if self .tp_rank is not None :
131154 self .config .select_rdma_device (self .tp_rank )
155+ if self .config .local_buffer_size > self .mc_max_mr_size :
156+ raise ValueError (
157+ f"local_buffer_size { self .config .local_buffer_size } must be "
158+ f"smaller than mc_max_mr_size { self .mc_max_mr_size } "
159+ )
132160 logger .info (f"Mooncake Configuration loaded, { self .config } ." )
133161
134162 ret_code = self .store .setup (
@@ -162,13 +190,38 @@ def warmup(self):
162190 self .store .remove (warmup_key )
163191
def register_buffer(self, buffer_ptr, buffer_size) -> None:
    """Register a buffer with Mooncake Store, splitting it when too large.

    A buffer no larger than ``self.mc_max_mr_size`` is registered with a
    single ``self.store.register_buffer`` call. A larger buffer is
    registered as consecutive chunks of at most ``self.mc_max_mr_size``
    bytes, each starting at ``buffer_ptr + offset``.

    Args:
        buffer_ptr: Integer start address of the buffer.
        buffer_size: Size of the buffer in bytes.

    Raises:
        RuntimeError: If the store returns a non-zero code for the buffer
            or for any chunk.
        TypeError: If the store call itself raises ``TypeError``.

    NOTE(review): chunked registration assumes the buffer is one contiguous
    address range so that pointer offsets address valid sub-regions; the
    previous docstring attributed this to cuda_host_alloc -- confirm
    against the callers.
    NOTE(review): when a later chunk fails, chunks registered before it are
    not unregistered here -- confirm whether rollback is needed.
    """
    max_mr_size = self.mc_max_mr_size
    chunked = buffer_size > max_mr_size
    if chunked:
        # Ceiling division: the last chunk may be shorter than max_mr_size.
        num_chunks = (buffer_size + max_mr_size - 1) // max_mr_size
        logger.info(
            f"Registering buffer of {byte_to_gb(buffer_size):.2f} GB in {num_chunks} chunks "
            f"(max_mr_size={byte_to_gb(max_mr_size):.2f} GB per chunk)"
        )
    else:
        num_chunks = 1

    for index in range(num_chunks):
        offset = index * max_mr_size
        chunk_size = min(max_mr_size, buffer_size - offset)
        target = f"chunk {index}/{num_chunks}" if chunked else "buffer"
        try:
            ret_code = self.store.register_buffer(buffer_ptr + offset, chunk_size)
        except TypeError as err:
            logger.error("Failed to register %s to Mooncake Store: %s", target, err)
            raise TypeError("Mooncake Store Register Buffer Error.") from err

        # Raise explicitly instead of using ``assert``: assertions are
        # removed under ``python -O`` and the failure would go unnoticed.
        if ret_code != 0:
            if chunked:
                message = (
                    f"failed to register chunk {index}/{num_chunks}, "
                    f"size={byte_to_gb(chunk_size):.2f} GB, error code: {ret_code}"
                )
            else:
                message = f"failed to register buffer, error code: {ret_code}"
            raise RuntimeError(message)
172225
173226 def set (
174227 self ,
0 commit comments