|
59 | 59 | from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue |
60 | 60 | from fastdeploy.inter_communicator import ( |
61 | 61 | ExistTaskStatus, |
| 62 | + IPCLock, |
62 | 63 | IPCSignal, |
63 | 64 | ModelWeightsStatus, |
64 | 65 | RearrangeExpertStatus, |
@@ -284,6 +285,14 @@ def init_health_status(self) -> None: |
284 | 285 | create=False, |
285 | 286 | ) |
286 | 287 |
|
| 288 | + # gpu_cache_lock: file-based lock for mutual exclusion between worker |
| 289 | + # and CPU transfer when accessing GPU KV cache. |
| 290 | + self.gpu_cache_lock = IPCLock( |
| 291 | + name="gpu_cache_lock", |
| 292 | + suffix=self.parallel_config.local_engine_worker_queue_port, |
| 293 | + create=False, |
| 294 | + ) |
| 295 | + |
287 | 296 | def update_weights_from_tensor(self, mmap_infos): |
288 | 297 | """ |
289 | 298 | update_weights_from_tensor |
@@ -426,6 +435,35 @@ def _run_eplb(self, tp_rank): |
426 | 435 | self.rearrange_experts_signal.value[0] = RearrangeExpertStatus.DONE.value |
427 | 436 | logger.info("redundant_expert: done") |
428 | 437 |
|
def _acquire_kvcache_lock(self, tp_rank):
    """Take the GPU KV cache lock before the worker runs the model.

    ``self.gpu_cache_lock`` is an ``IPCLock``, described in this file as a
    file-based lock (fcntl.flock) that provides mutual exclusion between
    the worker and the CPU transfer process while the GPU KV cache is in
    use. The call is a no-op when ``envs.FD_USE_KVCACHE_LOCK`` is falsy.

    Args:
        tp_rank: Tensor parallel rank of the current worker. Only rank 0
            takes the lock, so tensor-parallel workers do not deadlock
            against one another; every other rank returns immediately.
    """
    # The feature flag is checked first, then the rank, matching the
    # order of the two guard clauses this condition replaces.
    should_lock = envs.FD_USE_KVCACHE_LOCK and tp_rank == 0
    if should_lock:
        self.gpu_cache_lock.acquire()
| 454 | + |
def _release_kvcache_lock(self, tp_rank):
    """Give back the GPU KV cache lock held by the worker process.

    Calls ``self.gpu_cache_lock.release()`` under the same conditions in
    which ``_acquire_kvcache_lock`` takes the lock. The call is a no-op
    when ``envs.FD_USE_KVCACHE_LOCK`` is falsy.

    Args:
        tp_rank: Tensor parallel rank of the current worker. Only rank 0
            releases the lock; every other rank returns immediately.
    """
    # The feature flag is checked first, then the rank, matching the
    # order of the two guard clauses this condition replaces.
    should_unlock = envs.FD_USE_KVCACHE_LOCK and tp_rank == 0
    if should_unlock:
        self.gpu_cache_lock.release()
| 466 | + |
429 | 467 | def event_loop_normal(self) -> None: |
430 | 468 | """Main event loop for Paddle Distributed Workers. |
431 | 469 | TODO(gongshaotian): support remote calling of functions that control worker. |
@@ -572,7 +610,11 @@ def event_loop_normal(self) -> None: |
572 | 610 | # Execute model to generate token. The generated token will be written to the buffer. |
573 | 611 | # These generated tokens can be obtained through get_output op. |
574 | 612 | start_execute_time = time.time() |
| 613 | + |
| 614 | + self._acquire_kvcache_lock(tp_rank) |
575 | 615 | self.worker.execute_model(req_dicts, max_occupied_batch_index) |
| 616 | + self._release_kvcache_lock(tp_rank) |
| 617 | + |
576 | 618 | # Only v0 use this signal |
577 | 619 | if not envs.ENABLE_V1_KVCACHE_SCHEDULER: |
578 | 620 | self.exist_prefill_task_signal.value[0] = self.worker.exist_prefill() |
|
0 commit comments