openvinotoolkit
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎demos/common/export_models/README.md‎
Lines changed: 2 additions & 2 deletions b/‎demos/common/export_models/README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎demos/common/export_models/export_model.py‎
Lines changed: 1 addition & 1 deletion b/‎demos/common/export_models/export_model.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎demos/common/export_models/requirements.txt‎
Lines changed: 3 additions & 3 deletions b/‎demos/common/export_models/requirements.txt‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎demos/python_demos/clip_image_classification/README.md‎
Lines changed: 1 addition & 1 deletion b/‎demos/python_demos/clip_image_classification/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎demos/python_demos/clip_image_classification/download_model_requirements.txt‎
Lines changed: 0 additions & 8 deletions b/‎demos/python_demos/clip_image_classification/download_model_requirements.txt‎
Lines changed: 0 additions & 8 deletions
diff --git a/‎docs/model_server_rest_api_chat.md‎
Lines changed: 4 additions & 4 deletions b/‎docs/model_server_rest_api_chat.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/model_server_rest_api_completions.md‎
Lines changed: 4 additions & 4 deletions b/‎docs/model_server_rest_api_completions.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/model_server_rest_api_responses.md‎
Lines changed: 4 additions & 2 deletions b/‎docs/model_server_rest_api_responses.md‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎docs/parameters.md‎
Lines changed: 2 additions & 2 deletions b/‎docs/parameters.md‎
Lines changed: 2 additions & 2 deletions
@@ -39,3 +39,4 @@ tmp/
 *.tar.gz
 models
 genhtml
+.github/skills/
@@ -45,7 +45,7 @@ Expected Output:
 usage: export_model.py text_generation [-h] [--model_repository_path MODEL_REPOSITORY_PATH] --source_model SOURCE_MODEL [--model_name MODEL_NAME] [--weight-format PRECISION] [--config_file_path CONFIG_FILE_PATH] [--overwrite_models] [--target_device TARGET_DEVICE] [--ov_cache_dir OV_CACHE_DIR]
                                        [--extra_quantization_params EXTRA_QUANTIZATION_PARAMS] [--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}] [--kv_cache_precision {u8}] [--enable_prefix_caching ENABLE_PREFIX_CACHING] [--disable_dynamic_split_fuse] [--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS] [--max_num_seqs MAX_NUM_SEQS]
                                        [--cache_size CACHE_SIZE] [--draft_source_model DRAFT_SOURCE_MODEL] [--draft_model_name DRAFT_MODEL_NAME] [--draft_eagle3_mode] [--max_prompt_len MAX_PROMPT_LEN] [--prompt_lookup_decoding] [--reasoning_parser {qwen3,gptoss}]
-                                       [--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral}] [--enable_tool_guided_generation]
+                                       [--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral,lfm2}] [--enable_tool_guided_generation]
 
 options:
   -h, --help            show this help message and exit
@@ -91,7 +91,7 @@ options:
                         Set pipeline to use prompt lookup decoding
   --reasoning_parser {qwen3,gptoss}
                         Set the type of the reasoning parser for reasoning content extraction
-  --tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral}
+  --tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral,lfm2}
                         Set the type of the tool parser for tool calls extraction
   --enable_tool_guided_generation
                         Enables enforcing tool schema during generation. Requires setting tool_parser
 
@@ -54,7 +54,7 @@ def add_common_arguments(parser):
                          'Not effective if target device is not NPU', dest='max_prompt_len')
 parser_text.add_argument('--prompt_lookup_decoding', action='store_true', help='Set pipeline to use prompt lookup decoding', dest='prompt_lookup_decoding')
 parser_text.add_argument('--reasoning_parser', choices=["qwen3", "gptoss"], help='Set the type of the reasoning parser for reasoning content extraction', dest='reasoning_parser')
-parser_text.add_argument('--tool_parser', choices=["llama3", "phi4", "hermes3", "mistral", "qwen3coder", "gptoss", "devstral"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
+parser_text.add_argument('--tool_parser', choices=["llama3", "phi4", "hermes3", "mistral", "qwen3coder", "gptoss", "devstral", "lfm2"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
 parser_text.add_argument('--enable_tool_guided_generation', action='store_true', help='Enables enforcing tool schema during generation. Requires setting tool_parser', dest='enable_tool_guided_generation')
 
 parser_embeddings_ov = subparsers.add_parser('embeddings_ov', help='export model for embeddings endpoint with directory structure aligned with OpenVINO tools')
 
@@ -1,14 +1,14 @@
 --extra-index-url "https://download.pytorch.org/whl/cpu"
 --extra-index-url "https://storage.openvinotoolkit.org/simple/wheels/nightly"
 --extra-index-url "https://storage.openvinotoolkit.org/simple/wheels/pre-release"
-optimum-intel@git+https://github.com/huggingface/optimum-intel.git@39121884e050a32f9ded590f8597760d7e7ac205
+optimum-intel@git+https://github.com/huggingface/optimum-intel.git@d4dd21a3aa89c0671d85b704847ac06a378e761c
 accelerate
 datasets
 diffusers  # for image generation
 nncf
 numpy
-openvino-tokenizers==2026.2.0.dev20260501
-openvino==2026.2.0.dev20260501
+openvino-tokenizers==2026.2.0.dev20260509
+openvino==2026.2.0.dev20260509
 pillow
 sentence_transformers==5.3.0
 sentencepiece  # Required by: transformers`
 
@@ -23,7 +23,7 @@ pip3 install -r requirements.txt
 ## Download and convert model
 
 ```bash
-pip3 install -r download_model_requirements.txt
+pip3 install --pre --extra-index-url "https://download.pytorch.org/whl/cpu" --extra-index-url "https://storage.openvinotoolkit.org/simple/wheels/nightly" "openvino==2026.1.*" "numpy<2.0" "pillow==12.2.0" "torch==2.8.0+cpu" "transformers<=4.53.0"
 ```
 
 ```bash
 
@@ -221,6 +221,7 @@ Some parameters, especially related to sampling (like `temperature`, `top_p` etc
 | tool_choice | ✅ | ✅ | ✅ | string or object | Controls which (if any) tool is called by the model. `none` means the model will not call any tool and instead generates a message. `auto` means the model can pick between generating a message or calling one or more tools. `required` means that model should call at least one tool. Specifying a particular tool via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that tool. See [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice) for more details. |
 | response_format | ✅ | ✅ | ✅ | object | An object specifying the format that the model must output. Setting to `{ "type": "json_schema", "json_schema": {...} }` enables Structured Outputs which ensures the model will match your supplied JSON schema according to [OpenAI reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format). Learn more in the [Structured Outputs demo](../demos/continuous_batching/structured_output/README.md). Additionally, `response_format` can accept [XGrammar structural tags format](https://github.com/mlc-ai/xgrammar/blob/main/docs/tutorials/structural_tag.md#format-types) (not part of OpenAI API). For example: `{ "type": "const_string", "value": "Hello World!" }`. **Note** that if model server fails to process the format, the request will still be processed, but the format will not be imposed. |
 | chat_template_kwargs | ✅ | ❌ | ✅ |  object | Enables passing additional parameters to chat template engine. Example `{"enable_thinking": false}`. Note that values like `messages`, `eos_token`, `bos_token` etc. are provided natively to the template engine, so including them in `chat_template_kwargs` will cause error. |
+| skip_special_tokens | ✅ | ❌ | ✅ | bool (default: `true`) | Whether to remove special tokens (e.g. `<\|endoftext\|>`, `<\|im_end\|>`) from the generated output. Set to `false` to include them, which is useful when the model uses special tokens to encode structured information (e.g. bounding boxes, reasoning markers). When `false`, any tool or reasoning parser configured on the endpoint is silently disabled for the request, so the raw token stream is returned. This option works with most detokenizers exported with OpenVINO Tokenizers 2024.5 or later, unless they are based on custom ops. |
 
 #### Beam search sampling specific
 | Param | OpenVINO Model Server | OpenAI /chat/completions API | vLLM Serving Sampling Params | Type | Description |
@@ -234,11 +235,12 @@ Some parameters, especially related to sampling (like `temperature`, `top_p` etc
 |-------|----------|----------|----------|---------|-----|
 | temperature | ✅ | ✅ | ✅ | float (default: `1.0`) | The value is used to modulate token probabilities for multinomial sampling. It enables multinomial sampling when set to `> 0.0`. |
 | top_p | ✅ | ✅ | ✅ | float (default: `1.0`) | Controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens. |
-| top_k | ✅ | ❌ | ✅ | int (default: all tokens) | Controls the number of top tokens to consider. Set to empty or -1 to consider all tokens. |
+| min_p | ✅ | ❌ | ✅ | float (default: `0.0`) | Minimum probability threshold relative to the most likely token. Tokens with probability below `min_p` × the top token probability are filtered out. `0.0` (default) disables the filter. Typical values: `0.05`–`0.1`. Must be in `[0.0, 1.0)`. |
+| top_k | ✅ | ❌ | ✅ | int (default: `40`) | Controls the number of top tokens to consider. When multinomial sampling is active, defaults to `40` if not set. Set to `-1` to consider all tokens. |
 | repetition_penalty | ✅ | ❌ | ✅ | float (default: `1.0`) | Penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat tokens. `1.0` means no penalty. |
 | frequency_penalty | ✅ | ✅ | ✅ | float (default: `0.0`) | Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. |
 | presence_penalty | ✅ | ✅ | ✅ | float (default: `0.0`) | Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. |
-| seed | ✅ | ✅ | ✅ | integer (default: `0`) | Random seed to use for the generation. |
+| seed | ✅ | ✅ | ✅ | integer (default: random) | Random seed for generation in range `[0, 4294967295]`. Omit to use a random seed (non-deterministic). Set explicitly to get reproducible output. Note: `rng_seed` set in `generation_config.json` is not honoured for multinomial sampling — only a per-request seed is applied. |
 
 #### Speculative decoding specific
 
@@ -274,14 +276,12 @@ If any of those parameters is not specified and request is made to Prompt Lookup
 - functions
 
 #### Unsupported params from vLLM:
-- min_p
 - use_beam_search (**In OpenVINO Model Server just simply increase _best_of_ param to enable beam search**)
 - early_stopping
 - stop_token_ids
 - min_tokens
 - prompt_logprobs
 - detokenize
-- skip_special_tokens
 - spaces_between_special_tokens
 - logits_processors
 - truncate_prompt_tokens
 
@@ -62,6 +62,7 @@ curl http://localhost/v3/completions \
 | include_stop_str_in_output | ✅ | ❌ | ✅ | bool (default: `false` if `stream=false`, `true` if `stream=true`) | Whether to include matched stop string in output. Setting it to false when `stream=true` is invalid configuration and will result in error. |
 | logprobs | ⚠️ | ✅ | ✅ | integer (optional) | Include the log probabilities on the logprob of the returned output token. **_ in stream mode logprobs are not returned. Only value 1 is accepted which returns logarithm or the chosen token _** |
 | echo | ✅ | ✅ | ✅ | boolean (optional) | Echo back the prompt in addition to the completion |
+| skip_special_tokens | ✅ | ❌ | ✅ | bool (default: `true`) | Whether to remove special tokens (e.g. `<\|endoftext\|>`, `<\|im_end\|>`) from the generated output. Set to `false` to include them, which is useful when the model uses special tokens to encode structured information. This option works with most detokenizers exported with OpenVINO Tokenizers 2024.5 or later, unless they are based on custom ops. |
 
 #### Beam search sampling specific
 | Param | OpenVINO Model Server | OpenAI /completions API | vLLM Serving Sampling Params | Type | Description |
@@ -75,11 +76,12 @@ curl http://localhost/v3/completions \
 |-------|----------|----------|----------|---------|-----|
 | temperature | ✅ | ✅ | ✅ | float (default: `1.0`) | The value is used to modulate token probabilities for multinomial sampling. It enables multinomial sampling when set to `> 0.0`. |
 | top_p | ✅ | ✅ | ✅ | float (default: `1.0`) | Controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens. |
-| top_k | ✅ | ❌ | ✅ | int (default: all tokens) | Controls the number of top tokens to consider. Set to empty or -1 to consider all tokens. |
+| min_p | ✅ | ❌ | ✅ | float (default: `0.0`) | Minimum probability threshold relative to the most likely token. Tokens with probability below `min_p` × the top token probability are filtered out. `0.0` (default) disables the filter. Typical values: `0.05`–`0.1`. Must be in `[0.0, 1.0)`. |
+| top_k | ✅ | ❌ | ✅ | int (default: `40`) | Controls the number of top tokens to consider. When multinomial sampling is active, defaults to `40` if not set. Set to `-1` to consider all tokens. |
 | repetition_penalty | ✅ | ❌ | ✅ | float (default: `1.0`) | Penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat tokens. `1.0` means no penalty. |
 | frequency_penalty | ✅ | ✅ | ✅ | float (default: `0.0`) | Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. |
 | presence_penalty | ✅ | ✅ | ✅ | float (default: `0.0`) | Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. |
-| seed | ✅ | ✅ | ✅ | integer (default: `0`) | Random seed to use for the generation. |
+| seed | ✅ | ✅ | ✅ | integer (default: random) | Random seed for generation in range `[0, 4294967295]`. Omit to use a random seed (non-deterministic). Set explicitly to get reproducible output. Note: `rng_seed` set in `generation_config.json` is not honoured for multinomial sampling — only a per-request seed is applied. |
 
 #### Speculative decoding specific
 
@@ -105,14 +107,12 @@ Note that below parameters are valid only for prompt lookup pipeline. Add `"prom
 
 
 #### Unsupported params from vLLM:
-- min_p
 - use_beam_search (**In OpenVINO Model Server just simply increase _best_of_ param to enable beam search**)
 - early_stopping
 - stop_token_ids
 - min_tokens
 - prompt_logprobs
 - detokenize
-- skip_special_tokens
 - spaces_between_special_tokens
 - logits_processors
 - truncate_prompt_tokens
 
@@ -105,6 +105,7 @@ curl http://localhost/v3/responses \
 | tool_choice | ✅ | ✅ | string or object (optional) | Controls which (if any) tool is called by the model. `none` means the model will not call any tool and instead generates a message. `auto` means the model can pick between generating a message or calling one or more tools. `required` means that model should call at least one tool. Specifying a particular function via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that tool. |
 | reasoning | ⚠️ | ✅ | object (optional) | Configuration for reasoning/thinking mode. The `effort` field accepts `"low"`, `"medium"`, or `"high"` — any value enables thinking mode (`enable_thinking: true` is injected into chat template kwargs). The `summary` field is accepted but ignored. |
 | chat_template_kwargs | ✅ | ❌ | object (optional) | Additional keyword arguments passed to the chat template. When `reasoning` is also provided, `enable_thinking: true` is merged into these kwargs. |
+| skip_special_tokens | ✅ | ❌ | bool (default: `true`) | Whether to remove special tokens (e.g. `<\|endoftext\|>`, `<\|im_end\|>`) from the generated output. Set to `false` to include them, which is useful when the model uses special tokens to encode structured information (e.g. bounding boxes, reasoning markers). When `false`, any tool or reasoning parser configured on the endpoint is silently disabled for the request, so the raw token stream is returned. This option works with most detokenizers exported with OpenVINO Tokenizers 2024.5 or later, unless they are based on custom ops. |
 | stream_options | ❌ | ❌ | | Not supported in Responses API. Usage statistics are always included in the `response.completed` event. |
 
 #### Beam search sampling specific
@@ -119,11 +120,12 @@ curl http://localhost/v3/responses \
 |-------|----------|----------|---------|-----|
 | temperature | ✅ | ✅ | float (default: `1.0`) | The value is used to modulate token probabilities for multinomial sampling. It enables multinomial sampling when set to `> 0.0`. |
 | top_p | ✅ | ✅ | float (default: `1.0`) | Controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens. |
-| top_k | ✅ | ❌ | int (default: all tokens) | Controls the number of top tokens to consider. Set to empty or -1 to consider all tokens. |
+| min_p | ✅ | ❌ | float (default: `0.0`) | Minimum probability threshold relative to the most likely token. Tokens with probability below `min_p` × the top token probability are filtered out. `0.0` (default) disables the filter. Typical values: `0.05`–`0.1`. Must be in `[0.0, 1.0)`. |
+| top_k | ✅ | ❌ | int (default: `40`) | Controls the number of top tokens to consider. When multinomial sampling is active, defaults to `40` if not set. Set to `-1` to consider all tokens. |
 | repetition_penalty | ✅ | ❌ | float (default: `1.0`) | Penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat tokens. `1.0` means no penalty. |
 | frequency_penalty | ✅ | ❌ | float (default: `0.0`) | Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. |
 | presence_penalty | ✅ | ❌ | float (default: `0.0`) | Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. |
-| seed | ✅ | ❌ | integer (default: `0`) | Random seed to use for the generation. |
+| seed | ✅ | ❌ | integer (default: random) | Random seed for generation in range `[0, 4294967295]`. Omit to use a random seed (non-deterministic). Set explicitly to get reproducible output. Note: `rng_seed` set in `generation_config.json` is not honoured for multinomial sampling — only a per-request seed is applied. |
 
 #### Speculative decoding specific
 
 
@@ -138,8 +138,8 @@ Task specific parameters for different tasks (text generation/image generation/e
 | `--max_prompt_len`                    | `integer`    | Sets NPU specific property for maximum number of tokens in the prompt.                                                     |
 | `--kv_cache_precision`                | `string`     | Reduced kv cache precision to `u8` lowers the cache size consumption. Accepted values: `u8` or empty (default).            |
 | `--model_distribution_policy`         | `string`     | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |
-| `--reasoning_parser`                  | `string`     | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3]                     |
-| `--tool_parser`                       | `string`     | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, hermes3, phi4]            |
+| `--reasoning_parser`                  | `string`     | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gptoss]                     |
+| `--tool_parser`                       | `string`     | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, phi4, hermes3, mistral, qwen3coder, gptoss, devstral, lfm2]            |
 | `--enable_tool_guided_generation`     | `bool`       | Enables enforcing tool schema during generation. Requires setting response parser. Default: false.                         |
 
 ### Image generation