@@ -1076,6 +1076,44 @@ def test_min_tokens(use_speculative: bool):
10761076 assert len (res .outputs [0 ].token_ids ) == output_len
10771077
10781078
@pytest.mark.part0
def test_min_tokens_long_prompt():
    """Verify min_tokens still holds when the prompt outnumbers min_tokens.

    Guards against NVBug 5823135. There, _apply_min_length_penalty measured
    the combined length (prompt + generated tokens) against the raw
    min_tokens value rather than counting generated tokens alone. With
    prompt_len >= min_tokens, EOS suppression never switched on and
    generation could stop early.
    """
    min_new_tokens, max_new_tokens = 50, 100

    # The prompt must exceed min_new_tokens in token count. Per the original
    # author's estimate, "Hello " is roughly 1-2 tokens for most tokenizers,
    # so 200 copies should land around 200-400 tokens, far above the minimum.
    prompt = "Hello " * 200

    llm = LLM(model=llama_model_path,
              max_batch_size=2,
              kv_cache_config=global_kvcache_config,
              max_num_tokens=2048)
    params = SamplingParams(max_tokens=max_new_tokens,
                            min_tokens=min_new_tokens,
                            temperature=1)

    result = llm.generate(prompt, sampling_params=params)

    # A single prompt is expected to produce exactly one output sequence.
    assert len(result.outputs) == 1

    num_generated = len(result.outputs[0].token_ids)
    failure_msg = (
        f"Generated only {num_generated} tokens with min_tokens={min_new_tokens} "
        f"and a long prompt. Bug 5823135 regression.")
    assert num_generated >= min_new_tokens, failure_msg
1115+
1116+
10791117@skip_ray
10801118@pytest .mark .parametrize (
10811119 "prompt_logprobs, logprobs, return_context_logits, return_generation_logits, backend" ,
0 commit comments