TensorRT-LLM/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp at 5f391a6b3d6a4735f54f4e5f2ca1cc4f300f6cf2 · NVIDIA/TensorRT-LLM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/*
 * SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "moeBindings.h"
#include "tensorrt_llm/nanobind/common/customCasters.h"
#include "tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.h"
#include "tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h"
#include <nanobind/nanobind.h>
#include <nanobind/stl/shared_ptr.h>
#include <vector>

namespace nb = nanobind;
namespace tr = tensorrt_llm::runtime;
namespace tk = tensorrt_llm::kernels;

namespace tensorrt_llm::nanobind::runtime
{

// Python-facing wrapper around tr::doReplication (bound below as "do_replication").
//
// Checks that expertLoadFactor has exactly metaInfo.expertCount entries and then passes the
// vector's underlying buffer, together with metaInfo and cpuPlacement, to tr::doReplication.
// A failed check is reported through TLLM_CHECK_WITH_INFO.
void pyDoReplication(tk::MoeLoadBalanceMetaInfo const& metaInfo, std::vector<float>& expertLoadFactor,
    tr::MoePlacementCpuInfo* cpuPlacement)
{
    auto const loadFactorCount = expertLoadFactor.size();
    bool const countsAgree = (metaInfo.expertCount == loadFactorCount);
    TLLM_CHECK_WITH_INFO(countsAgree, "expert_count and expert_load_factor size mismatch");

    // The runtime routine works on the raw float buffer rather than on the vector itself.
    float* const loadFactorData = expertLoadFactor.data();
    tr::doReplication(metaInfo, loadFactorData, cpuPlacement);
}

// Python-facing wrapper around tr::doPlacement (bound below as "do_placement").
//
// Rejects, via TLLM_CHECK_WITH_INFO, any call where the number of load factors differs from
// metaInfo.expertCount; otherwise forwards metaInfo, the load-factor buffer and cpuPlacement to
// tr::doPlacement.
void pyDoPlacement(tk::MoeLoadBalanceMetaInfo const& metaInfo, std::vector<float>& expertLoadFactor,
    tr::MoePlacementCpuInfo* cpuPlacement)
{
    auto const suppliedFactorCount = expertLoadFactor.size();
    bool const hasOneFactorPerExpert = (metaInfo.expertCount == suppliedFactorCount);
    TLLM_CHECK_WITH_INFO(hasOneFactorPerExpert, "expert_count and expert_load_factor size mismatch");

    // Hand over the contiguous float storage owned by the vector.
    float* const factorBuffer = expertLoadFactor.data();
    tr::doPlacement(metaInfo, factorBuffer, cpuPlacement);
}

// Registers the MoE load-balancing bindings on module `m`: the MoeWeight, MoeLoadBalanceMetaInfo and
// MoePlacementCpuInfo data types, the SingleLayerMoeLoadBalancer and MoeLoadBalancer classes, and the
// module-level functions is_host_accessible_device_memory_supported, do_replication and do_placement.
void initMoeBindings(nb::module_& m)
{
    // Short local names for the bound C++ types.
    using Weight = tr::MoeWeight;
    using MetaInfo = tk::MoeLoadBalanceMetaInfo;
    using PlacementCpuInfo = tr::MoePlacementCpuInfo;
    using LayerBalancer = tr::SingleLayerMoeLoadBalancer;
    using Balancer = tr::MoeLoadBalancer;

    // Call guard attached to every SingleLayerMoeLoadBalancer / MoeLoadBalancer binding below so the
    // GIL is released while the bound C++ code runs.
    using ReleaseGil = nb::call_guard<nb::gil_scoped_release>;

    // ---- MoeWeight -------------------------------------------------------------------------------
    // The weight pointer goes through its getter/setter pair; the dimensions are plain fields.
    nb::class_<Weight> weightCls(m, "MoeWeight");
    weightCls.def(nb::init<>());
    weightCls.def_prop_rw("weight_ptr", &Weight::getWeightPtr, &Weight::setWeightPtr);
    weightCls.def_rw("height", &Weight::mHeight);
    weightCls.def_rw("width", &Weight::mWidth);
    weightCls.def_rw("pitch", &Weight::mPitch);
    weightCls.def("__repr__",
        [](Weight const& self)
        {
            // Assemble "<MoeWeight ptr=... height=... width=... pitch=...>" piece by piece.
            std::string text = "<MoeWeight ptr=";
            text += std::to_string(self.getWeightPtr());
            text += " height=";
            text += std::to_string(self.mHeight);
            text += " width=";
            text += std::to_string(self.mWidth);
            text += " pitch=";
            text += std::to_string(self.mPitch);
            text += ">";
            return text;
        });

    // ---- MoeLoadBalanceMetaInfo ------------------------------------------------------------------
    // Constructed from five ints; each of them is also exposed as a read/write attribute.
    nb::class_<MetaInfo> metaInfoCls(m, "MoeLoadBalanceMetaInfo");
    metaInfoCls.def(nb::init<int, int, int, int, int>(), nb::arg("expert_count"), nb::arg("top_k"),
        nb::arg("ep_rank"), nb::arg("ep_size"), nb::arg("slot_count_per_rank"));
    metaInfoCls.def_rw("expert_count", &MetaInfo::expertCount);
    metaInfoCls.def_rw("top_k", &MetaInfo::topK);
    metaInfoCls.def_rw("ep_rank", &MetaInfo::epRank);
    metaInfoCls.def_rw("ep_size", &MetaInfo::epSize);
    metaInfoCls.def_rw("slot_count_per_rank", &MetaInfo::slotCountPerRank);

    // ---- MoePlacementCpuInfo ---------------------------------------------------------------------
    nb::class_<PlacementCpuInfo> placementCls(m, "MoePlacementCpuInfo");
    placementCls.def(nb::init<>());
    placementCls.def_rw("expert_replica_count", &PlacementCpuInfo::expertReplicaCount);
    placementCls.def_rw("rank_expert_ids", &PlacementCpuInfo::rankExpertIds);

    // ---- SingleLayerMoeLoadBalancer --------------------------------------------------------------
    // No constructor is bound for this class here; only its methods are exposed.
    nb::class_<LayerBalancer> layerCls(m, "SingleLayerMoeLoadBalancer");
    layerCls.def("add_single_weight_slot", &LayerBalancer::addSingleWeightSlot, nb::arg("slot_id"),
        nb::arg("name"), nb::arg("weight_slot"), "Add a single weight slot for a specific slot ID", ReleaseGil());
    layerCls.def("add_single_host_weight", &LayerBalancer::addSingleHostWeight, nb::arg("expert_id"),
        nb::arg("name"), nb::arg("host_weight"), "Add a single host weight for a specific expert ID",
        ReleaseGil());
    layerCls.def("set_initial_weight_assignments", &LayerBalancer::setInitialWeightAssignments,
        nb::arg("initial_weight_assignments"), "Set initial weight assignments for each slot", ReleaseGil());
    layerCls.def("get_pointer", &LayerBalancer::getSelfPtr, "Get the pointer of the SingleLayerMoeLoadBalancer",
        ReleaseGil());
    layerCls.def("get_layer_id", &LayerBalancer::getLayerId, "Get the layer id of the SingleLayerMoeLoadBalancer",
        ReleaseGil());
    layerCls.def("get_old_rank_expert_ids", &LayerBalancer::getOldRankExpertIds,
        "Get the old rank expert ids of the SingleLayerMoeLoadBalancer", ReleaseGil());

    // ---- MoeLoadBalancer -------------------------------------------------------------------------
    nb::class_<Balancer> balancerCls(m, "MoeLoadBalancer");
    balancerCls.def(nb::init<int, int, int>(), nb::arg("ep_rank"), nb::arg("ep_size"),
        nb::arg("layer_updates_per_iter"),
        "Initialize the MoeLoadBalancer with the specified expert parallel rank, size, and update frequency",
        ReleaseGil());
    balancerCls.def("set_use_gpu_memcpy", &Balancer::setUseGpuMemcpy, nb::arg("use_gpu_memcpy"),
        "Set whether to use GPU memcpy for weight updates", ReleaseGil());
    balancerCls.def("add_layer", &Balancer::AddLayer, nb::arg("expert_count"), nb::arg("top_k"),
        nb::arg("slot_count_per_rank"), "Add a new MOE layer to the load balancer", ReleaseGil());
    balancerCls.def("finalize_model", &Balancer::finalizeModel,
        "Finalize the model structure, must be called after all layers are added", ReleaseGil());
    balancerCls.def("set_warm_up_iter_count", &Balancer::setWarmUpIterCount, nb::arg("iter_count"),
        "Set the number of warm-up iterations", ReleaseGil());
    balancerCls.def("start_iter", &Balancer::startIter, nb::arg("iter_id"), nb::arg("enable_statistic"),
        nb::arg("enable_update_weights"), "Start a new iteration with the given ID and settings", ReleaseGil());
    balancerCls.def("end_iter", &Balancer::endIter, nb::arg("iter_id"), "End the iteration with the given ID",
        ReleaseGil());
    balancerCls.def("reconfigure_mask_only", &Balancer::reconfigureMaskOnly, nb::arg("dead_ranks"),
        "Reconfigure EPLB routing metadata so slots on dead EP ranks are unreachable", ReleaseGil());
    balancerCls.def("shutdown", &Balancer::shutdown, "Shutdown the load balancer and clean up resources",
        ReleaseGil());

    // ---- Module-level functions ------------------------------------------------------------------
    // These three are bound without a call guard.
    m.def("is_host_accessible_device_memory_supported", &tr::HostAccessibleDeviceAllocator::isSupported,
        "If current system support host accessible device memory");

    // Size-checked wrappers around doReplication / doPlacement, exposed for testing.
    m.def("do_replication", &pyDoReplication, nb::arg("meta_info"), nb::arg("expert_load_factor"),
        nb::arg("cpu_placement"), "Do replication");
    m.def("do_placement", &pyDoPlacement, nb::arg("meta_info"), nb::arg("expert_load_factor"),
        nb::arg("cpu_placement"), "Do placement");
}

} // namespace tensorrt_llm::nanobind::runtime