Skip to content

Commit 174d3ad

Browse files
authored
Add LogitProcessor interface for pre-sampling logit transforms (pytorch#19517)
Differential Revision: D104767967 Pull Request resolved: pytorch#19517
1 parent d8e4ffd commit 174d3ad

7 files changed

Lines changed: 421 additions & 0 deletions

File tree

extension/llm/runner/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def define_common_targets():
6868
visibility = ["PUBLIC"],
6969
exported_deps = [
7070
":text_decoder_runner" + aten_suffix,
71+
"//executorch/extension/llm/sampler:sampler" + aten_suffix,
7172
"//pytorch/tokenizers:headers",
7273
"//executorch/extension/module:module" + aten_suffix,
7374
"//executorch/extension/tensor:tensor" + aten_suffix,

extension/llm/runner/test/test_text_llm_runner.cpp

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,16 @@
1212
#include <executorch/extension/llm/runner/text_llm_runner.h>
1313
#include <executorch/extension/llm/runner/text_prefiller.h>
1414
#include <executorch/extension/llm/runner/text_token_generator.h>
15+
#include <executorch/extension/llm/sampler/logit_processor.h>
1516
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
1617
#include <gmock/gmock.h>
1718
#include <gtest/gtest.h>
1819

20+
#include <limits>
21+
1922
using namespace ::testing;
2023
using executorch::extension::llm::GenerationConfig;
24+
using executorch::extension::llm::LogitProcessor;
2125
using executorch::extension::llm::Stats;
2226
using executorch::extension::llm::TextDecoderRunner;
2327
using executorch::extension::llm::TextLLMRunner;
@@ -97,6 +101,29 @@ class MockTextPrefiller : public TextPrefiller {
97101
MOCK_METHOD(bool, is_loaded, (), ());
98102
};
99103

104+
class MaskTokenProcessor : public LogitProcessor {
105+
public:
106+
explicit MaskTokenProcessor(int32_t banned_token)
107+
: banned_token_(banned_token) {}
108+
109+
::executorch::runtime::Error process(
110+
::executorch::aten::Tensor logits) override {
111+
const int32_t vocab_size = logits.size(logits.dim() - 1);
112+
int32_t offset = 0;
113+
if (logits.dim() == 3) {
114+
offset = (logits.size(1) - 1) * vocab_size;
115+
}
116+
float* data = logits.mutable_data_ptr<float>();
117+
if (banned_token_ >= 0 && banned_token_ < vocab_size) {
118+
data[offset + banned_token_] = -std::numeric_limits<float>::infinity();
119+
}
120+
return ::executorch::runtime::Error::Ok;
121+
}
122+
123+
private:
124+
int32_t banned_token_;
125+
};
126+
100127
// Callback counter class for tests
101128
class CallbackCounter {
102129
public:
@@ -618,4 +645,95 @@ TEST_F(RunnerTest, MultiTurnWithSeqLenRespectsPos) {
618645
EXPECT_EQ(counter.getCount(), 10);
619646
}
620647

648+
// Verify that a LogitProcessor injected into TextTokenGenerator actually
649+
// affects token selection. Without the processor, greedy argmax of
650+
// {0.1, 0.2, 0.3, 0.4} picks token 3. Masking token 3 should pick token 2.
651+
TEST_F(RunnerTest, TextTokenGeneratorWithProcessorMasksToken) {
652+
auto tokenizer = createMockTokenizer();
653+
auto text_decoder_runner = createMockTextDecoderRunner();
654+
Stats stats;
655+
auto generator = createTextTokenGenerator(
656+
tokenizer.get(), text_decoder_runner.get(), &stats);
657+
658+
generator->add_logit_processor(
659+
std::make_shared<MaskTokenProcessor>(/*banned_token=*/3));
660+
661+
std::vector<uint64_t> generated_tokens;
662+
ON_CALL(*tokenizer, decode)
663+
.WillByDefault(
664+
[&](uint64_t,
665+
uint64_t cur,
666+
bool) -> ::tokenizers::Result<std::string> {
667+
generated_tokens.push_back(cur);
668+
return ::tokenizers::Result<std::string>(std::string("token"));
669+
});
670+
671+
std::vector<uint64_t> tokens = {1, 2, 3};
672+
auto result =
673+
generator->generate(tokens, 3, 3, 0.0f, [](const std::string&) {});
674+
675+
EXPECT_TRUE(result.ok());
676+
const std::vector<uint64_t> expected(3, 2);
677+
EXPECT_EQ(generated_tokens, expected);
678+
}
679+
680+
// Multiple processors in chain should all take effect.
681+
TEST_F(RunnerTest, TextTokenGeneratorProcessorChainMasksMultipleTokens) {
682+
auto tokenizer = createMockTokenizer();
683+
auto text_decoder_runner = createMockTextDecoderRunner();
684+
Stats stats;
685+
auto generator = createTextTokenGenerator(
686+
tokenizer.get(), text_decoder_runner.get(), &stats);
687+
688+
generator->add_logit_processor(
689+
std::make_shared<MaskTokenProcessor>(/*banned_token=*/3));
690+
generator->add_logit_processor(
691+
std::make_shared<MaskTokenProcessor>(/*banned_token=*/2));
692+
693+
std::vector<uint64_t> generated_tokens;
694+
ON_CALL(*tokenizer, decode)
695+
.WillByDefault(
696+
[&](uint64_t,
697+
uint64_t cur,
698+
bool) -> ::tokenizers::Result<std::string> {
699+
generated_tokens.push_back(cur);
700+
return ::tokenizers::Result<std::string>(std::string("token"));
701+
});
702+
703+
std::vector<uint64_t> tokens = {1, 2, 3};
704+
auto result =
705+
generator->generate(tokens, 3, 3, 0.0f, [](const std::string&) {});
706+
707+
EXPECT_TRUE(result.ok());
708+
const std::vector<uint64_t> expected(3, 1);
709+
EXPECT_EQ(generated_tokens, expected);
710+
}
711+
712+
// Without any processors, greedy argmax picks token 3 (zero-overhead path).
713+
TEST_F(RunnerTest, TextTokenGeneratorWithoutProcessorPicksArgmax) {
714+
auto tokenizer = createMockTokenizer();
715+
auto text_decoder_runner = createMockTextDecoderRunner();
716+
Stats stats;
717+
auto generator = createTextTokenGenerator(
718+
tokenizer.get(), text_decoder_runner.get(), &stats);
719+
720+
std::vector<uint64_t> generated_tokens;
721+
ON_CALL(*tokenizer, decode)
722+
.WillByDefault(
723+
[&](uint64_t,
724+
uint64_t cur,
725+
bool) -> ::tokenizers::Result<std::string> {
726+
generated_tokens.push_back(cur);
727+
return ::tokenizers::Result<std::string>(std::string("token"));
728+
});
729+
730+
std::vector<uint64_t> tokens = {1, 2, 3};
731+
auto result =
732+
generator->generate(tokens, 3, 3, 0.0f, [](const std::string&) {});
733+
734+
EXPECT_TRUE(result.ok());
735+
const std::vector<uint64_t> expected(3, 3);
736+
EXPECT_EQ(generated_tokens, expected);
737+
}
738+
621739
} // namespace

extension/llm/runner/text_token_generator.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,12 @@
1010
#pragma once
1111

1212
#include <atomic>
13+
#include <memory>
14+
#include <vector>
1315

1416
#include <executorch/extension/llm/runner/stats.h>
1517
#include <executorch/extension/llm/runner/text_decoder_runner.h>
18+
#include <executorch/extension/llm/sampler/logit_processor.h>
1619
#include <executorch/extension/tensor/tensor.h>
1720
#include <pytorch/tokenizers/tokenizer.h>
1821

@@ -38,6 +41,20 @@ class ET_EXPERIMENTAL TextTokenGenerator {
3841
ignore_eos_ = ignore_eos;
3942
}
4043

44+
void add_logit_processor(std::shared_ptr<LogitProcessor> processor) {
45+
if (processor) {
46+
logit_processors_.push_back(std::move(processor));
47+
}
48+
}
49+
50+
/// Drops every registered processor, restoring the unprocessed
/// (zero-overhead) sampling path.
void clear_logit_processors() {
  logit_processors_.clear();
}
53+
54+
size_t num_logit_processors() const {
55+
return logit_processors_.size();
56+
}
57+
4158
virtual ~TextTokenGenerator() = default;
4259

4360
/**
@@ -109,6 +126,10 @@ class ET_EXPERIMENTAL TextTokenGenerator {
109126

110127
prev_token = cur_token;
111128

129+
for (auto& processor : logit_processors_) {
130+
ET_CHECK_OK_OR_RETURN_ERROR(processor->process(logits_tensor));
131+
}
132+
112133
stats_->on_sampling_begin();
113134
cur_token =
114135
text_decoder_runner_->logits_to_token(logits_tensor, temperature);
@@ -189,6 +210,8 @@ class ET_EXPERIMENTAL TextTokenGenerator {
189210
bool use_kv_cache_;
190211
bool ignore_eos_ = false;
191212

213+
std::vector<std::shared_ptr<LogitProcessor>> logit_processors_;
214+
192215
// state machine
193216
std::atomic<bool> should_stop_{false};
194217

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <executorch/runtime/core/error.h>
12+
#include <executorch/runtime/core/exec_aten/exec_aten.h>
13+
#include <executorch/runtime/platform/compiler.h>
14+
15+
namespace executorch {
16+
namespace extension {
17+
namespace llm {
18+
19+
/**
20+
* In-place logit transform applied between the model forward pass and the
21+
* sampler. Examples: grammar masks, logit bias, repetition penalty.
22+
*
23+
* `TextTokenGenerator` runs registered processors in order; each sees
24+
* prior processors' edits. Called once per decoded token — keep it cheap.
25+
*
26+
* Tensor contract:
27+
* rank 2 [batch, vocab] — operate on the full last dim
28+
* rank 3 [batch, seq, vocab] — operate on the LAST sequence position
29+
* other ranks — undefined behavior
30+
*
31+
* Implementations dispatch their own dtype (the chain runner neither casts
32+
* nor copies the tensor). Return non-Ok to abort the chain.
33+
*/
34+
class ET_EXPERIMENTAL LogitProcessor {
35+
public:
36+
virtual ~LogitProcessor() = default;
37+
38+
virtual ::executorch::runtime::Error process(
39+
::executorch::aten::Tensor logits) = 0;
40+
};
41+
42+
} // namespace llm
43+
} // namespace extension
44+
} // namespace executorch

extension/llm/sampler/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ def define_common_targets():
77
runtime.cxx_library(
88
name = "sampler" + aten_suffix,
99
exported_headers = [
10+
"logit_processor.h",
1011
"sampler.h",
1112
"util.h",
1213
],

extension/llm/sampler/test/targets.bzl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,14 @@ def define_common_targets():
2222
"//caffe2:torch-cpp",
2323
],
2424
)
25+
26+
# Unit test target for the LogitProcessor interface.
runtime.cxx_test(
    name = "test_logit_processor",
    srcs = ["test_logit_processor.cpp"],
    deps = [
        "//executorch/extension/llm/sampler:sampler",
        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
    ],
)

0 commit comments

Comments
 (0)