From 621dfd58292bf8181ba9815ec556eec8e2551043 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Sun, 22 Mar 2026 16:37:18 +0000 Subject: [PATCH] libibverbs: Introduce Completion Counters verbs Extend verbs interface to support Completion Counters that can be seen as a light-weight alternative to polling CQ. A completion counter object separately counts successful and error completions, can be attached to multiple QPs and be configured to count completions of a subset of operation types. This is especially useful for batch or credit based workloads running on accelerators but can serve many other types of applications as well. Expose supported number of completion counters through query device extended verb. Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin --- libibverbs/examples/devinfo.c | 1 + libibverbs/man/CMakeLists.txt | 9 + libibverbs/man/ibv_create_comp_cntr.3.md | 246 ++++++++++++++++++++ libibverbs/man/ibv_qp_attach_comp_cntr.3.md | 120 ++++++++++ libibverbs/man/ibv_query_device_ex.3 | 1 + libibverbs/verbs.h | 115 +++++++++ 6 files changed, 492 insertions(+) create mode 100644 libibverbs/man/ibv_create_comp_cntr.3.md create mode 100644 libibverbs/man/ibv_qp_attach_comp_cntr.3.md diff --git a/libibverbs/examples/devinfo.c b/libibverbs/examples/devinfo.c index c245b1f28..f44dd18b8 100644 --- a/libibverbs/examples/devinfo.c +++ b/libibverbs/examples/devinfo.c @@ -585,6 +585,7 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) printf("\tmax_srq_sge:\t\t\t%d\n", device_attr.orig_attr.max_srq_sge); } printf("\tmax_pkeys:\t\t\t%d\n", device_attr.orig_attr.max_pkeys); + printf("\tmax_comp_cntr:\t\t\t%u\n", device_attr.max_comp_cntr); printf("\tlocal_ca_ack_delay:\t\t%d\n", device_attr.orig_attr.local_ca_ack_delay); print_odp_caps(&device_attr); diff --git a/libibverbs/man/CMakeLists.txt b/libibverbs/man/CMakeLists.txt index f498c1532..f6a470ce4 100644 --- a/libibverbs/man/CMakeLists.txt +++ b/libibverbs/man/CMakeLists.txt @@ 
-14,7 +14,9 @@ rdma_man_pages( ibv_create_ah.3 ibv_create_ah_from_wc.3 ibv_create_comp_channel.3 + ibv_create_comp_cntr.3.md ibv_create_counters.3.md + ibv_qp_attach_comp_cntr.3.md ibv_create_cq.3 ibv_create_cq_ex.3 ibv_modify_cq.3 @@ -98,6 +100,13 @@ rdma_alias_man_pages( ibv_create_ah.3 ibv_destroy_ah.3 ibv_create_ah_from_wc.3 ibv_init_ah_from_wc.3 ibv_create_comp_channel.3 ibv_destroy_comp_channel.3 + ibv_create_comp_cntr.3 ibv_destroy_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_set_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_set_err_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_inc_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_inc_err_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_read_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_read_err_comp_cntr.3 ibv_create_counters.3 ibv_destroy_counters.3 ibv_create_cq.3 ibv_destroy_cq.3 ibv_create_flow.3 ibv_destroy_flow.3 diff --git a/libibverbs/man/ibv_create_comp_cntr.3.md b/libibverbs/man/ibv_create_comp_cntr.3.md new file mode 100644 index 000000000..b3b7d0591 --- /dev/null +++ b/libibverbs/man/ibv_create_comp_cntr.3.md @@ -0,0 +1,246 @@ +--- +date: 2026-02-09 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: ibv_create_comp_cntr +tagline: Verbs +--- + +# NAME + +**ibv_create_comp_cntr**, **ibv_destroy_comp_cntr** - Create or destroy a +completion counter + +**ibv_set_comp_cntr**, **ibv_set_err_comp_cntr** - Set the value of a +completion or error counter + +**ibv_inc_comp_cntr**, **ibv_inc_err_comp_cntr** - Increment a completion or +error counter + +**ibv_read_comp_cntr**, **ibv_read_err_comp_cntr** - Read the value of a +completion or error counter + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +struct ibv_comp_cntr *ibv_create_comp_cntr(struct ibv_context *context, + struct ibv_comp_cntr_init_attr *cc_attr); + +int ibv_destroy_comp_cntr(struct ibv_comp_cntr *comp_cntr); + +int ibv_set_comp_cntr(struct ibv_comp_cntr 
*comp_cntr, uint64_t value); +int ibv_set_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t value); +int ibv_inc_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t amount); +int ibv_inc_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t amount); +int ibv_read_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t *value); +int ibv_read_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t *value); +``` + +# DESCRIPTION + +Completion counters provide a lightweight completion mechanism as an +alternative or extension to completion queues (CQs). Rather than generating +individual completion queue entries, a completion counter tracks the aggregate +number of completed operations. This makes them well suited for applications +that need to know how many requests have completed without requiring +per-request details, such as credit based flow control or tracking responses +from remote peers. + +Each completion counter maintains two distinct 64-bit values: a completion +count that is incremented on successful completions, and an error count that +is incremented when operations complete in error. + +**ibv_create_comp_cntr**() allocates a new completion counter for the RDMA +device context *context*. The properties of the counter are defined by +*cc_attr*. The maximum number of completion counters a device supports is +reported by the *max_comp_cntr* field of **ibv_device_attr_ex**. + +**ibv_destroy_comp_cntr**() releases all resources associated with the +completion counter *comp_cntr*. The counter must not be attached to any QP +when destroyed. + +**ibv_set_comp_cntr**() sets the completion count of *comp_cntr* to *value*. + +**ibv_set_err_comp_cntr**() sets the error count of *comp_cntr* to *value*. + +**ibv_inc_comp_cntr**() increments the completion count of *comp_cntr* by +*amount*. + +**ibv_inc_err_comp_cntr**() increments the error count of *comp_cntr* by +*amount*. 
+ +**ibv_read_comp_cntr**() reads the current completion count of *comp_cntr* +into *value*. + +**ibv_read_err_comp_cntr**() reads the current error count of *comp_cntr* +into *value*. + +## External memory + +By default, the memory backing the counter values is allocated internally. +When the **IBV_COMP_CNTR_INIT_WITH_EXTERNAL_MEM** flag is set in +*ibv_comp_cntr_init_attr.flags*, the application provides its own memory for +the completion and error counts via the *comp_cntr_ext_mem* and +*err_cntr_ext_mem* fields. The external memory is described by an +**ibv_memory_location** structure which supports two modes: a virtual address +(**IBV_MEMORY_LOCATION_VA**), where the application supplies a direct pointer, or +a DMA-BUF reference (**IBV_MEMORY_LOCATION_DMABUF**), where the application +supplies a file descriptor and offset into an exported DMA-BUF. When using +DMA-BUF, the *ptr* field may also be set to provide a process-accessible +mapping of the memory, which may enable more efficient counter reads. Using +external memory allows the counter values to +reside in application-managed buffers or in memory exported through DMA-BUF, +enabling zero-copy observation of completion progress by co-located processes +or devices. + +# ARGUMENTS + +## ibv_comp_cntr + +```c +struct ibv_comp_cntr { + struct ibv_context *context; + uint32_t handle; + uint64_t comp_count_max_value; + uint64_t err_count_max_value; +}; +``` + +*context* +: Device context associated with the completion counter. + +*handle* +: Kernel object handle for the completion counter. + +*comp_count_max_value* +: The maximum value the completion count can hold. A subsequent + increment that would exceed this value wraps the counter to zero. + +*err_count_max_value* +: The maximum value the error count can hold. A subsequent increment + that would exceed this value wraps the counter to zero. 
+ +## ibv_comp_cntr_init_attr + +```c +struct ibv_comp_cntr_init_attr { + uint32_t comp_mask; + uint32_t flags; + struct ibv_memory_location comp_cntr_ext_mem; + struct ibv_memory_location err_cntr_ext_mem; +}; +``` + +*comp_mask* +: Bitmask specifying what fields in the structure are valid. + +*flags* +: Creation flags. The following flags are supported: + + **IBV_COMP_CNTR_INIT_WITH_EXTERNAL_MEM** - Use application-provided + memory for the counter values, as specified by *comp_cntr_ext_mem* + and *err_cntr_ext_mem*. + +*comp_cntr_ext_mem* +: Memory location for the completion count when using external memory. + +*err_cntr_ext_mem* +: Memory location for the error count when using external memory. + +## ibv_memory_location + +```c +enum ibv_memory_location_type { + IBV_MEMORY_LOCATION_VA, + IBV_MEMORY_LOCATION_DMABUF, +}; + +struct ibv_memory_location { + uint8_t *ptr; + struct { + uint64_t offset; + int32_t fd; + uint32_t reserved; + } dmabuf; + uint8_t type; + uint8_t reserved[7]; +}; +``` + +*type* +: The type of memory location. **IBV_MEMORY_LOCATION_VA** for a virtual + address, or **IBV_MEMORY_LOCATION_DMABUF** for a DMA-BUF reference. + +*ptr* +: Virtual address pointer. Required when type is + **IBV_MEMORY_LOCATION_VA**. When type is + **IBV_MEMORY_LOCATION_DMABUF**, may optionally be set to provide a + process-accessible mapping of the DMA-BUF memory. Otherwise should be + NULL. + +*dmabuf.fd* +: DMA-BUF file descriptor (used when type is + **IBV_MEMORY_LOCATION_DMABUF**). + +*dmabuf.offset* +: Offset within the DMA-BUF. + +# RETURN VALUE + +**ibv_create_comp_cntr**() returns a pointer to the allocated ibv_comp_cntr +object, or NULL if the request fails (and sets errno to indicate the failure +reason). 
+ +**ibv_destroy_comp_cntr**(), **ibv_set_comp_cntr**(), +**ibv_set_err_comp_cntr**(), **ibv_inc_comp_cntr**(), +**ibv_inc_err_comp_cntr**(), **ibv_read_comp_cntr**(), and +**ibv_read_err_comp_cntr**() return 0 on success, or the value of errno on +failure (which indicates the failure reason). + +# ERRORS + +ENOTSUP +: Completion counters are not supported on this device, or the + requested operation is not supported for the given counter + configuration. + +ENOMEM +: Not enough resources to create the completion counter. + +EINVAL +: Invalid argument(s) passed. + +EBUSY +: The completion counter is still attached to a QP + (**ibv_destroy_comp_cntr**() only). + +# NOTES + +Counter values must only be updated using **ibv_set_comp_cntr**(), +**ibv_set_err_comp_cntr**(), **ibv_inc_comp_cntr**(), or +**ibv_inc_err_comp_cntr**(). Counter memory supplied by the application +must not be modified directly. + +Updates made to counter values (e.g. via **ibv_set_comp_cntr**() or +**ibv_inc_comp_cntr**()) may not be immediately visible when reading the +counter via **ibv_read_comp_cntr**() or **ibv_read_err_comp_cntr**(). A small +delay may occur between the update and the observed value. However, the final +updated value will eventually be reflected. + +Applications should ensure that the counter value is stable before calling +**ibv_set_comp_cntr**() or **ibv_set_err_comp_cntr**(). Otherwise, concurrent +updates may be lost. 
+ +# SEE ALSO + +**ibv_qp_attach_comp_cntr**(3), **ibv_create_cq**(3), +**ibv_create_cq_ex**(3), **ibv_create_qp**(3) + +# AUTHORS + +Michael Margolin diff --git a/libibverbs/man/ibv_qp_attach_comp_cntr.3.md b/libibverbs/man/ibv_qp_attach_comp_cntr.3.md new file mode 100644 index 000000000..64e24de26 --- /dev/null +++ b/libibverbs/man/ibv_qp_attach_comp_cntr.3.md @@ -0,0 +1,120 @@ +--- +date: 2026-02-09 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: ibv_qp_attach_comp_cntr +tagline: Verbs +--- + +# NAME + +**ibv_qp_attach_comp_cntr** - Attach a completion counter to a QP + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +int ibv_qp_attach_comp_cntr(struct ibv_qp *qp, + struct ibv_comp_cntr *comp_cntr, + struct ibv_comp_cntr_attach_attr *attr); +``` + +# DESCRIPTION + +**ibv_qp_attach_comp_cntr**() attaches the completion counter *comp_cntr* to +the queue pair *qp*. The *attr* argument specifies which operation types +should update the counter. + +The QP must be in **IBV_QPS_RESET** or **IBV_QPS_INIT** state when attaching +a completion counter. Attempting to attach a counter to a QP in any other +state will fail with EINVAL. + +The completion counter starts collecting values for the specified QP once +attached. Attaching the same completion counter to multiple QPs will +accumulate values from all attached QPs into the same counter. + +The *op_mask* field controls which operation completions are counted. Local +operations (**IBV_COMP_CNTR_ATTACH_OP_SEND**, **IBV_COMP_CNTR_ATTACH_OP_RECV**, +**IBV_COMP_CNTR_ATTACH_OP_RDMA_READ**, **IBV_COMP_CNTR_ATTACH_OP_RDMA_WRITE**) +count completions initiated by the local QP. Remote operations +(**IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_READ**, +**IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_WRITE**) count completions of incoming +RDMA operations initiated by the remote side. 
Supported *op_mask* values may +vary by device; unsupported values will result in an ENOTSUP error. + +Multiple completion counters can be attached to the same QP, provided their +*op_mask* values do not overlap. Each QP and operation type pair can be +associated with at most one completion counter. Attempting to attach a +counter with an *op_mask* that conflicts with an already attached counter +will fail. + +There is no explicit detach operation. A completion counter is implicitly +detached when the QP it is attached to is destroyed. A completion counter +cannot be destroyed while it is still attached to any QP; the QP must be +destroyed first. + +# ARGUMENTS + +*qp* +: The queue pair to attach the completion counter to. + +*comp_cntr* +: The completion counter to attach, previously created with + **ibv_create_comp_cntr**(). + +*attr* +: Attach attributes specifying which operation types update the counter. + +## ibv_comp_cntr_attach_attr + +```c +enum ibv_comp_cntr_attach_op { + IBV_COMP_CNTR_ATTACH_OP_SEND = 1 << 0, + IBV_COMP_CNTR_ATTACH_OP_RECV = 1 << 1, + IBV_COMP_CNTR_ATTACH_OP_RDMA_READ = 1 << 2, + IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_READ = 1 << 3, + IBV_COMP_CNTR_ATTACH_OP_RDMA_WRITE = 1 << 4, + IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_WRITE = 1 << 5, +}; + +struct ibv_comp_cntr_attach_attr { + uint32_t comp_mask; + uint32_t op_mask; +}; +``` + +*comp_mask* +: Bitmask specifying what fields in the structure are valid. + +*op_mask* +: Bitmask of **ibv_comp_cntr_attach_op** values specifying which + operation types should update the counter. + +# RETURN VALUE + +**ibv_qp_attach_comp_cntr**() returns 0 on success, or the value of errno on +failure (which indicates the failure reason). + +# ERRORS + +EINVAL +: Invalid argument(s) passed. + +ENOTSUP +: Requested operation is not supported on this device. + +EBUSY +: The *op_mask* overlaps with a completion counter already attached + to this QP. 
+ +# SEE ALSO + +**ibv_create_comp_cntr**(3), **ibv_create_qp**(3) + +# AUTHORS + +Michael Margolin diff --git a/libibverbs/man/ibv_query_device_ex.3 b/libibverbs/man/ibv_query_device_ex.3 index c77e8b4f8..2d502e6ac 100644 --- a/libibverbs/man/ibv_query_device_ex.3 +++ b/libibverbs/man/ibv_query_device_ex.3 @@ -44,6 +44,7 @@ uint64_t max_dm_size; /* Max Device Memory size (in bytes) avail struct ibv_pci_atomic_caps atomic_caps; /* PCI atomic operations capabilities, use enum ibv_pci_atomic_op_size */ uint32_t xrc_odp_caps; /* Mask with enum ibv_odp_transport_cap_bits to know which operations are supported. */ uint32_t phys_port_cnt_ex /* Extended number of physical port count, allows exposing more than 255 ports device */ +uint32_t max_comp_cntr; /* Maximum number of completion counters supported (0 = unsupported) */ .in -8 }; diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 36d120eec..439326108 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -361,6 +361,7 @@ struct ibv_device_attr_ex { struct ibv_pci_atomic_caps pci_atomic_caps; uint32_t xrc_odp_caps; uint32_t phys_port_cnt_ex; + uint32_t max_comp_cntr; }; enum ibv_mtu { @@ -485,6 +486,22 @@ struct ibv_async_event { enum ibv_event_type event_type; }; +enum ibv_memory_location_type { + IBV_MEMORY_LOCATION_VA, + IBV_MEMORY_LOCATION_DMABUF, +}; + +struct ibv_memory_location { + uint8_t *ptr; + struct { + uint64_t offset; + int32_t fd; + uint32_t reserved; + } dmabuf; + uint8_t type; /* Use ibv_memory_location_type */ + uint8_t reserved[7]; +}; + enum ibv_wc_status { IBV_WC_SUCCESS, IBV_WC_LOC_LEN_ERR, @@ -3018,6 +3035,81 @@ static inline int ibv_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *at return vctx->modify_cq(cq, attr); } + +struct ibv_comp_cntr { + struct ibv_context *context; + uint32_t handle; + uint64_t comp_count_max_value; + uint64_t err_count_max_value; +}; + +enum { + IBV_COMP_CNTR_INIT_WITH_EXTERNAL_MEM = 1 << 0, +}; + +struct ibv_comp_cntr_init_attr { + uint32_t comp_mask; /* 
Compatibility mask */ + uint32_t flags; + struct ibv_memory_location comp_cntr_ext_mem; + struct ibv_memory_location err_cntr_ext_mem; +}; + +/** + * ibv_create_comp_cntr - Create a completion counter + * @context: Device context to create the counter on. + * @cc_attr: Attributes for the completion counter. + */ +struct ibv_comp_cntr *ibv_create_comp_cntr(struct ibv_context *context, + struct ibv_comp_cntr_init_attr *cc_attr); + +/** + * ibv_destroy_comp_cntr - Destroy a completion counter + * @comp_cntr: The completion counter to destroy. + */ +int ibv_destroy_comp_cntr(struct ibv_comp_cntr *comp_cntr); + +/** + * ibv_set_comp_cntr - Set the completion count value + * @comp_cntr: The completion counter to update. + * @value: The value to set. + */ +int ibv_set_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t value); + +/** + * ibv_set_err_comp_cntr - Set the error count value + * @comp_cntr: The completion counter to update. + * @value: The value to set. + */ +int ibv_set_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t value); + +/** + * ibv_inc_comp_cntr - Increment the completion count + * @comp_cntr: The completion counter to increment. + * @amount: The amount to increment by. + */ +int ibv_inc_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t amount); + +/** + * ibv_inc_err_comp_cntr - Increment the error count + * @comp_cntr: The completion counter to increment. + * @amount: The amount to increment by. + */ +int ibv_inc_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t amount); + +/** + * ibv_read_comp_cntr - Read the completion count value + * @comp_cntr: The completion counter to read. + * @value: Output pointer to store the current completion count. + */ +int ibv_read_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t *value); + +/** + * ibv_read_err_comp_cntr - Read the error count value + * @comp_cntr: The completion counter to read. + * @value: Output pointer to store the current error count. 
+ */ +int ibv_read_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t *value); + /** * ibv_create_srq - Creates a SRQ associated with the specified protection * domain. @@ -3293,6 +3385,29 @@ ibv_modify_qp_rate_limit(struct ibv_qp *qp, return vctx->modify_qp_rate_limit(qp, attr); } +enum ibv_comp_cntr_attach_op { + IBV_COMP_CNTR_ATTACH_OP_SEND = 1 << 0, + IBV_COMP_CNTR_ATTACH_OP_RECV = 1 << 1, + IBV_COMP_CNTR_ATTACH_OP_RDMA_READ = 1 << 2, + IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_READ = 1 << 3, + IBV_COMP_CNTR_ATTACH_OP_RDMA_WRITE = 1 << 4, + IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_WRITE = 1 << 5, +}; + +struct ibv_comp_cntr_attach_attr { + uint32_t comp_mask; /* Compatibility mask */ + uint32_t op_mask; /* Use ibv_comp_cntr_attach_op */ +}; + +/** + * ibv_qp_attach_comp_cntr - Attach a completion counter to a QP + * @qp: The queue pair to attach the counter to. + * @comp_cntr: The completion counter to attach. + * @attr: Attach attributes. + */ +int ibv_qp_attach_comp_cntr(struct ibv_qp *qp, struct ibv_comp_cntr *comp_cntr, + struct ibv_comp_cntr_attach_attr *attr); + /** * ibv_query_qp_data_in_order - Checks whether the data is guaranteed to be * written in-order.