diff --git a/example_notebooks/vllm/text_retrieval_logits_processor.ipynb b/example_notebooks/vllm/text_retrieval_logits_processor.ipynb new file mode 100644 index 0000000..8203dc1 --- /dev/null +++ b/example_notebooks/vllm/text_retrieval_logits_processor.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "28ed6952", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/aerdem/projects/nvidia/logits-processor-zoo\n" + ] + } + ], + "source": [ + "%cd ../.." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b89279fe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 05-21 13:05:12 [__init__.py:239] Automatically detected platform cuda.\n", + "WARNING 05-21 13:05:15 [config.py:2972] Casting torch.bfloat16 to torch.float16.\n", + "INFO 05-21 13:05:20 [config.py:717] This model supports multiple tasks: {'embed', 'classify', 'score', 'reward', 'generate'}. Defaulting to 'generate'.\n", + "WARNING 05-21 13:05:20 [cuda.py:93] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used\n", + "INFO 05-21 13:05:20 [llm_engine.py:240] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=Qwen/Qwen2.5-1.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={\"splitting_ops\":[],\"compile_sizes\":[],\"cudagraph_capture_sizes\":[],\"max_capture_size\":0}, use_cached_outputs=False, \n", + "INFO 05-21 13:05:22 [cuda.py:292] Using Flash Attention backend.\n", + "INFO 05-21 13:05:22 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0\n", + "INFO 05-21 13:05:22 [model_runner.py:1108] Starting to load model Qwen/Qwen2.5-1.5B-Instruct...\n", + "INFO 05-21 13:05:23 [weight_utils.py:265] Using model weights format ['*.safetensors']\n", + "INFO 05-21 13:05:23 [weight_utils.py:315] No model.safetensors.index.json found in remote.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a62d6b15778b4da397cb4f540673f035", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 torch.Tensor: + if not past_token_ids: + next_tokens = self.start_tokens + else: + next_tokens = self._find_all_next_tokens(list(past_token_ids)) + + if (past_token_ids[-1] == self.sep_token) or (len(next_tokens) == 0): + next_tokens.append(self.tokenizer.eos_token_id) + + scores = enforce_tokens(scores, next_tokens) + return scores