InfiniTensor
diff --git a/‎include/infiniop/ops/repetition_penalty.h‎
Lines changed: 72 additions & 0 deletions b/‎include/infiniop/ops/repetition_penalty.h‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎src/infiniop/ops/repetition_penalty/cpu/repetition_penalty_cpu.cc‎
Lines changed: 123 additions & 0 deletions b/‎src/infiniop/ops/repetition_penalty/cpu/repetition_penalty_cpu.cc‎
Lines changed: 123 additions & 0 deletions
diff --git a/‎src/infiniop/ops/repetition_penalty/cpu/repetition_penalty_cpu.h‎
Lines changed: 8 additions & 0 deletions b/‎src/infiniop/ops/repetition_penalty/cpu/repetition_penalty_cpu.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/infiniop/ops/repetition_penalty/info.h‎
Lines changed: 34 additions & 0 deletions b/‎src/infiniop/ops/repetition_penalty/info.h‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎src/infiniop/ops/repetition_penalty/metax/repetition_penalty_kernel.h‎
Lines changed: 60 additions & 0 deletions b/‎src/infiniop/ops/repetition_penalty/metax/repetition_penalty_kernel.h‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎src/infiniop/ops/repetition_penalty/metax/repetition_penalty_metax.h‎
Lines changed: 8 additions & 0 deletions b/‎src/infiniop/ops/repetition_penalty/metax/repetition_penalty_metax.h‎
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,72 @@
+#ifndef __INFINIOP_REPETITION_PENALTY_API_H__
+#define __INFINIOP_REPETITION_PENALTY_API_H__
+
+#include "../operator_descriptor.h"
+#include <stdint.h>
+
+/** Handle type for a repetition penalty operator descriptor (pointer to struct InfiniopDescriptor). */
+typedef struct InfiniopDescriptor *infiniopRepetitionPenaltyDescriptor_t;
+
+/**
+ * @brief Creates a repetition penalty operator descriptor.
+ *
+ * @param handle InfiniCore handle
+ * @param desc_ptr Output descriptor pointer
+ * @param logits_desc Descriptor of the logits tensor [num_seqs, vocab_size]; the tensor it describes
+ *                    is the one later modified in-place by infiniopApplyRepetitionPenalty.
+ *                    The validation visible in RepetitionPenaltyInfo::create (used by the CPU backend)
+ *                    requires exactly 2 dimensions and an F16, BF16, F32 or F64 dtype.
+ * @return infiniStatus_t Status code
+ */
+__C __export infiniStatus_t infiniopCreateRepetitionPenaltyDescriptor(
+    infiniopHandle_t handle,
+    infiniopRepetitionPenaltyDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t logits_desc);
+
+/**
+ * @brief Gets the workspace size required for repetition penalty operation.
+ *
+ * @param desc Operator descriptor
+ * @param size Output workspace size (presumably in bytes - TODO confirm; the CPU backend
+ *             constructs its descriptor with a workspace size of 0)
+ * @return infiniStatus_t Status code
+ */
+__C __export infiniStatus_t infiniopGetRepetitionPenaltyWorkspaceSize(
+    infiniopRepetitionPenaltyDescriptor_t desc,
+    size_t *size);
+
+/**
+ * @brief Applies repetition penalty to logits in-place using token indices only.
+ *
+ * In the CPU and MetaX implementations, for every token id listed for a sequence a logit
+ * greater than zero is divided by that sequence's penalty, and any other logit is multiplied
+ * by it. Token ids >= vocab_size are skipped, and a sequence whose penalty equals 1.0 is
+ * left untouched.
+ *
+ * @param desc Operator descriptor
+ * @param workspace Workspace buffer (not read by the CPU implementation)
+ * @param workspace_size Workspace size (not read by the CPU implementation)
+ * @param logits Logits tensor [num_seqs, vocab_size] - modified in-place. Device pointer for GPU
+ *               backends; the CPU implementation dereferences it on the host. The visible
+ *               implementations index it as a dense row-major array (row stride == vocab_size).
+ * @param repetition_penalties Repetition penalty values [num_seqs] - device pointer for GPU backends, host pointer for CPU
+ * @param token_indices Flattened token ids to penalize - device pointer for GPU backends; the CPU
+ *                      implementation dereferences it on the host, so it must be host-accessible there
+ * @param token_offsets Prefix sums into token_indices, length = num_seqs + 1 - device pointer for GPU
+ *                      backends, host-accessible pointer for CPU (same reason as token_indices)
+ * @param total_indices Total number of token indices across all sequences (token_offsets[num_seqs]).
+ *                      The MetaX kernel uses it as its thread bound; the CPU implementation does not read it.
+ * @param stream Backend stream (e.g. a CUDA stream); not used by the CPU implementation
+ * @return infiniStatus_t Status code
+ *
+ * @note For CUDA graph compatibility:
+ *       - repetition_penalties and token buffers must be device pointers for GPU backends
+ *       - total_indices must be computed on host before graph capture
+ *       - The caller is responsible for copying penalty values and token buffers to device before graph capture
+ */
+__C __export infiniStatus_t infiniopApplyRepetitionPenalty(
+    infiniopRepetitionPenaltyDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *logits,
+    const float *repetition_penalties,
+    const uint32_t *token_indices,   // flattened token ids to penalize
+    const size_t *token_offsets,     // prefix sum, len = num_seqs + 1
+    size_t total_indices,            // total number of indices (token_offsets[num_seqs])
+    void *stream);
+
+/**
+ * @brief Destroys a repetition penalty operator descriptor.
+ *
+ * @param desc Operator descriptor (presumably one obtained from
+ *             infiniopCreateRepetitionPenaltyDescriptor - confirm against the implementation)
+ * @return infiniStatus_t Status code
+ */
+__C __export infiniStatus_t infiniopDestroyRepetitionPenaltyDescriptor(
+    infiniopRepetitionPenaltyDescriptor_t desc);
+
+#endif
@@ -0,0 +1,123 @@
+#include "repetition_penalty_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../info.h"
+#include "infinicore.h"
+#include <algorithm>
+
+namespace op::repetition_penalty::cpu {
+
+// Defaulted destructor: the CPU descriptor defines no custom teardown logic here.
+Descriptor::~Descriptor() = default;
+
+// Builds a CPU descriptor for the repetition penalty operator.
+//
+// `logits_desc` is validated by RepetitionPenaltyInfo::create; CHECK_RESULT is
+// expected to hand a failed result back to the caller. On success `*desc_ptr`
+// receives a heap-allocated Descriptor constructed with a workspace size of 0,
+// and INFINI_STATUS_SUCCESS is returned.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t logits_desc) {
+    auto info = RepetitionPenaltyInfo::create(logits_desc);
+    CHECK_RESULT(info);
+
+    auto cpu_handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    *desc_ptr = new Descriptor(
+        info.take(),
+        0, // the CPU path needs no scratch memory
+        nullptr,
+        cpu_handle->device,
+        cpu_handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Returns the stored minimum workspace size (_min_workspace_size) of this descriptor.
+size_t Descriptor::minWorkspaceSize() const {
+    return _min_workspace_size;
+}
+
+// Penalizes one double-precision logit. This non-template overload is selected
+// for double logits so the arithmetic stays in double instead of being narrowed
+// to float and widened back.
+static inline double penalize_logit(double logit, float penalty) {
+    const double p = static_cast<double>(penalty);
+    // val / p if val > 0 else val * p
+    return logit > 0.0 ? logit / p : logit * p;
+}
+
+// Penalizes one logit of any other element type, computing in float.
+template <typename T>
+static inline T penalize_logit(T logit, float penalty) {
+    const float value = utils::cast<float>(logit);
+    // val / p if val > 0 else val * p (the else branch covers negative and zero)
+    return utils::cast<T>(value > 0.0f ? value / penalty : value * penalty);
+}
+
+// Applies per-sequence repetition penalties to `logits` in place.
+//
+// Parameters:
+//   logits               - indexed as a dense row-major [num_seqs, vocab_size] array
+//   repetition_penalties - one penalty per sequence (num_seqs entries are read)
+//   token_indices        - flattened token ids to penalize
+//   token_offsets        - prefix sums into token_indices; entries [0, num_seqs] are read
+//   num_seqs, vocab_size - logical shape of `logits`
+//
+// Behavior:
+//   - A sequence whose penalty is exactly 1.0f is skipped.
+//   - Token ids >= vocab_size are ignored.
+//   - A token id listed more than once for a sequence is penalized once per occurrence.
+//   - Matches the PyTorch formulation: val / p if val > 0 else val * p.
+template <typename T>
+void apply_penalty_cpu(
+    T *logits,
+    const float *repetition_penalties,
+    const uint32_t *token_indices,
+    const size_t *token_offsets,
+    size_t num_seqs,
+    size_t vocab_size) {
+
+    for (size_t seq_idx = 0; seq_idx < num_seqs; seq_idx++) {
+        const float penalty = repetition_penalties[seq_idx];
+        if (penalty == 1.0f) {
+            continue; // Skip if no penalty
+        }
+
+        // Start of this sequence's row; hoisted out of the inner loop.
+        T *row = logits + seq_idx * vocab_size;
+        const size_t start = token_offsets[seq_idx];
+        const size_t end = token_offsets[seq_idx + 1];
+        for (size_t i = start; i < end; i++) {
+            const uint32_t token_id = token_indices[i];
+            if (token_id >= vocab_size) {
+                continue; // skip out-of-range ids
+            }
+            row[token_id] = penalize_logit(row[token_id], penalty);
+        }
+    }
+}
+
+// Runs the repetition penalty on the host, dispatching on the logits dtype stored
+// in `_info`.
+//
+// `logits` is reinterpreted as the element type matching `_info.dt_logits`
+// (F16, BF16, F32 or F64) and forwarded to apply_penalty_cpu together with
+// `_info.num_seqs` and `_info.vocab_size`. Any other dtype yields
+// INFINI_STATUS_BAD_TENSOR_DTYPE. `workspace`, `workspace_size`, `total_indices`
+// and `stream` are not read by this implementation.
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *logits,
+    const float *repetition_penalties,
+    const uint32_t *token_indices,
+    const size_t *token_offsets,
+    size_t total_indices,
+    void *stream) const {
+
+    // Shared call site: the element type is deduced from the typed pointer.
+    auto run = [&](auto *typed_logits) {
+        apply_penalty_cpu(
+            typed_logits,
+            repetition_penalties,
+            token_indices,
+            token_offsets,
+            _info.num_seqs,
+            _info.vocab_size);
+        return INFINI_STATUS_SUCCESS;
+    };
+
+    switch (_info.dt_logits) {
+    case INFINI_DTYPE_F16:
+        return run(reinterpret_cast<fp16_t *>(logits));
+    case INFINI_DTYPE_BF16:
+        return run(reinterpret_cast<bf16_t *>(logits));
+    case INFINI_DTYPE_F32:
+        return run(reinterpret_cast<float *>(logits));
+    case INFINI_DTYPE_F64:
+        return run(reinterpret_cast<double *>(logits));
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::repetition_penalty::cpu
@@ -0,0 +1,8 @@
+#ifndef __REPETITION_PENALTY_CPU_H__
+#define __REPETITION_PENALTY_CPU_H__
+
+#include "../repetition_penalty.h"
+
+// Expands the DESCRIPTOR macro (expected to come from ../repetition_penalty.h) for the
+// `cpu` backend; presumably this declares op::repetition_penalty::cpu::Descriptor, whose
+// members are defined in repetition_penalty_cpu.cc - confirm against the macro definition.
+DESCRIPTOR(cpu)
+
+#endif // __REPETITION_PENALTY_CPU_H__
@@ -0,0 +1,34 @@
+#ifndef __REPETITION_PENALTY_INFO_H__
+#define __REPETITION_PENALTY_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+
+namespace op::repetition_penalty {
+
+// Shape and dtype summary of the logits tensor a repetition penalty operator works on.
+struct RepetitionPenaltyInfo {
+    infiniDtype_t dt_logits; // element type of the logits tensor
+    size_t num_seqs;         // extent of dimension 0
+    size_t vocab_size;       // extent of dimension 1
+
+    // Validates `logits_desc` and captures its dtype and both extents.
+    //
+    // The descriptor must have exactly two dimensions (otherwise
+    // INFINI_STATUS_BAD_TENSOR_SHAPE is reported via CHECK_OR_RETURN) and its dtype
+    // must pass CHECK_DTYPE for F16, BF16, F32 or F64.
+    static utils::Result<RepetitionPenaltyInfo> create(
+        infiniopTensorDescriptor_t logits_desc) {
+
+        CHECK_OR_RETURN(logits_desc->ndim() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        const auto logits_dtype = logits_desc->dtype();
+        CHECK_DTYPE(logits_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+        const auto rows = logits_desc->dim(0);
+        const auto cols = logits_desc->dim(1);
+
+        return utils::Result<RepetitionPenaltyInfo>(
+            RepetitionPenaltyInfo{logits_dtype, rows, cols});
+    }
+};
+
+} // namespace op::repetition_penalty
+
+#endif // __REPETITION_PENALTY_INFO_H__
@@ -0,0 +1,60 @@
+#ifndef __REPETITION_PENALTY_KERNEL_H__
+#define __REPETITION_PENALTY_KERNEL_H__
+
+#include "../../../devices/metax/metax_common.h"
+#include "../info.h"
+
+namespace op::repetition_penalty::metax {
+
+// CUDA graph compatible kernel - all operations on device, no host-device memcpy.
+//
+// Work split: one thread per entry of `token_indices`. Only the x dimension of the
+// grid and block is used; the launch must supply at least `total_indices` threads,
+// and surplus threads exit immediately.
+//
+// Parameters:
+//   logits               - addressed as a dense row-major [num_seqs, vocab_size] array; updated in place
+//   repetition_penalties - one penalty per sequence
+//   token_indices        - flattened token ids to penalize
+//   token_offsets        - prefix sums (length num_seqs + 1) delimiting each sequence's slice of
+//                          token_indices; must be non-decreasing for the binary search to be valid
+//   total_indices        - number of entries of token_indices to process
+//
+// Behavior: token ids >= vocab_size and sequences whose penalty is exactly 1.0f are
+// skipped; otherwise the logit becomes val / p if val > 0 else val * p.
+//
+// NOTE: the logit update is a plain read-modify-write. If the same token id appears
+// more than once within one sequence, several threads update the same logit without
+// synchronization, so callers should pass de-duplicated ids per sequence.
+template <typename T>
+static __global__ void applyRepetitionPenaltyKernel(
+    T *__restrict__ logits,
+    const float *__restrict__ repetition_penalties,
+    const uint32_t *__restrict__ token_indices,
+    const size_t *__restrict__ token_offsets,
+    size_t num_seqs,
+    size_t vocab_size,
+    size_t total_indices) {
+
+    // Widen before multiplying: blockIdx.x and blockDim.x are 32-bit, so their
+    // product would wrap before being stored in a size_t.
+    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    if (idx >= total_indices) {
+        return;
+    }
+
+    // Binary search over token_offsets to find seq_idx such that
+    // token_offsets[seq_idx] <= idx < token_offsets[seq_idx + 1]
+    size_t lo = 0;
+    size_t hi = num_seqs;
+    while (lo < hi) {
+        const size_t mid = lo + ((hi - lo) >> 1);
+        if (token_offsets[mid + 1] <= idx) {
+            lo = mid + 1;
+        } else {
+            hi = mid;
+        }
+    }
+    const size_t seq_idx = lo;
+    if (seq_idx >= num_seqs) {
+        // idx lies at or beyond token_offsets[num_seqs] (total_indices larger than the
+        // offsets describe): no sequence owns it, so touch nothing rather than index
+        // past the end of repetition_penalties / logits.
+        return;
+    }
+
+    const uint32_t token_id = token_indices[idx];
+    if (token_id >= vocab_size) {
+        return; // out-of-range id, skip
+    }
+
+    const float penalty = repetition_penalties[seq_idx];
+    if (penalty == 1.0f) {
+        return; // No penalty, skip
+    }
+
+    const size_t offset = seq_idx * vocab_size + token_id;
+    const float logit_val = static_cast<float>(logits[offset]);
+    if (logit_val > 0.0f) {
+        logits[offset] = static_cast<T>(logit_val / penalty);
+    } else {
+        // Covers negative and zero logits.
+        logits[offset] = static_cast<T>(logit_val * penalty);
+    }
+}
+
+} // namespace op::repetition_penalty::metax
+
+#endif // __REPETITION_PENALTY_KERNEL_H__
@@ -0,0 +1,8 @@
+#ifndef __REPETITION_PENALTY_METAX_H__
+#define __REPETITION_PENALTY_METAX_H__
+
+#include "../repetition_penalty.h"
+
+// Expands the DESCRIPTOR macro (expected to come from ../repetition_penalty.h) for the
+// `metax` backend; presumably this declares op::repetition_penalty::metax::Descriptor -
+// confirm against the macro definition.
+DESCRIPTOR(metax)
+
+#endif // __REPETITION_PENALTY_METAX_H__