58 changes: 3 additions & 55 deletions demos/common/export_models/README.md
@@ -40,62 +40,10 @@ For every use case subcommand there is an adjusted list of parameters:
```console
python export_model.py text_generation --help
```
Expected Output:
```console
usage: export_model.py text_generation [-h] [--model_repository_path MODEL_REPOSITORY_PATH] --source_model SOURCE_MODEL [--model_name MODEL_NAME] [--weight-format PRECISION] [--config_file_path CONFIG_FILE_PATH] [--overwrite_models] [--target_device TARGET_DEVICE] [--ov_cache_dir OV_CACHE_DIR]
[--extra_quantization_params EXTRA_QUANTIZATION_PARAMS] [--pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}] [--kv_cache_precision {u8}] [--enable_prefix_caching ENABLE_PREFIX_CACHING] [--disable_dynamic_split_fuse] [--max_num_batched_tokens MAX_NUM_BATCHED_TOKENS] [--max_num_seqs MAX_NUM_SEQS]
[--cache_size CACHE_SIZE] [--draft_source_model DRAFT_SOURCE_MODEL] [--draft_model_name DRAFT_MODEL_NAME] [--draft_eagle3_mode] [--max_prompt_len MAX_PROMPT_LEN] [--prompt_lookup_decoding] [--reasoning_parser {qwen3,gptoss}]
[--tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral,lfm2}] [--enable_tool_guided_generation]

options:
  -h, --help            show this help message and exit
  --model_repository_path MODEL_REPOSITORY_PATH
                        Where the model should be exported to
  --source_model SOURCE_MODEL
                        HF model name or path to the local folder with PyTorch or OpenVINO model
  --model_name MODEL_NAME
                        Model name that should be used in the deployment. Equal to source_model if HF model name is used
  --weight-format PRECISION
                        precision of the exported model
  --config_file_path CONFIG_FILE_PATH
                        path to the config file
  --overwrite_models    Overwrite the model if it already exists in the models repository
  --target_device TARGET_DEVICE
                        CPU, GPU, NPU or HETERO, default is CPU
  --ov_cache_dir OV_CACHE_DIR
                        Folder path for compilation cache to speedup initialization time
  --extra_quantization_params EXTRA_QUANTIZATION_PARAMS
                        Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"
  --pipeline_type {LM,LM_CB,VLM,VLM_CB,AUTO}
                        Type of the pipeline to be used. AUTO is used by default
  --kv_cache_precision {u8}
                        u8 or empty (model default). Reducing the KV cache precision to u8 lowers cache size consumption.
  --enable_prefix_caching ENABLE_PREFIX_CACHING
                        This algorithm is used to cache the prompt tokens. Default is True.
  --disable_dynamic_split_fuse
                        Disables the dynamic split fuse scheduling algorithm.
  --max_num_batched_tokens MAX_NUM_BATCHED_TOKENS
                        empty or integer. The maximum number of tokens that can be batched together.
  --max_num_seqs MAX_NUM_SEQS
                        256 by default. The maximum number of sequences that can be processed together.
  --cache_size CACHE_SIZE
                        KV cache size in GB. If not set, cache is allocated dynamically.
  --draft_source_model DRAFT_SOURCE_MODEL
                        HF model name or path to the local folder with PyTorch or OpenVINO draft model. Using this option will create configuration for speculative decoding
  --draft_model_name DRAFT_MODEL_NAME
                        Draft model name that should be used in the deployment. Equal to draft_source_model if HF model name is used. Available only if draft_source_model has been specified.
  --draft_eagle3_mode   Set this flag if you use EAGLE3 draft model for speculative decoding
  --max_prompt_len MAX_PROMPT_LEN
                        Sets NPU-specific property for the maximum number of tokens in the prompt. Not effective if target device is not NPU
  --prompt_lookup_decoding
                        Set pipeline to use prompt lookup decoding
  --reasoning_parser {qwen3,gptoss}
                        Set the type of the reasoning parser for reasoning content extraction
  --tool_parser {llama3,phi4,hermes3,mistral,qwen3coder,gptoss,devstral,lfm2}
                        Set the type of the tool parser for tool calls extraction
  --enable_tool_guided_generation
                        Enables enforcing tool schema during generation. Requires setting tool_parser
```
> Note: Exporting some models might require a different transformers version than the one specified in requirements.txt. Check the [supported models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/) list. If a custom transformers version is required, install it afterwards via `pip install transformers==<version>`
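For example, a sketch of that workflow, assuming the commands are run from this directory; `<version>` is a placeholder to be taken from the supported models list, not a recommendation:

```console
pip3 install -U -r requirements.txt
pip3 install transformers==<version>
```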



## Model Export Examples
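As a sketch of how the flags documented above combine, an export configured for speculative decoding with a compressed KV cache might look as follows; `<draft_model>` is a placeholder for a compatible draft model, and the command is illustrative rather than a tested recipe:

```console
mkdir -p models
python export_model.py text_generation \
    --source_model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --draft_source_model <draft_model> \
    --weight-format fp16 \
    --kv_cache_precision u8 \
    --config_file_path models/config.json \
    --model_repository_path models
```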

1 change: 1 addition & 0 deletions demos/common/export_models/requirements.txt
@@ -14,3 +14,4 @@ sentence_transformers==5.3.0
sentencepiece # Required by: transformers
torchvision
requests
+einops
36 changes: 10 additions & 26 deletions demos/continuous_batching/accuracy/README.md
@@ -14,33 +14,17 @@ Install the framework via pip:
pip3 install --extra-index-url "https://download.pytorch.org/whl/cpu" lm_eval[api] langdetect immutabledict dotenv openai
```
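As a quick smoke test of the installation (this assumes pip placed the `lm-eval` entry point on PATH):

```bash
lm-eval --help
```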

-## Exporting the models
-```bash
-git clone https://github.com/openvinotoolkit/model_server.git
-cd model_server
-pip3 install -U -r demos/common/export_models/requirements.txt
-mkdir models
-python demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3.1-8B-Instruct --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models
-python demos/common/export_models/export_model.py text_generation --source_model meta-llama/Meta-Llama-3.1-8B --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models
-python demos/common/export_models/export_model.py text_generation --source_model OpenGVLab/InternVL2_5-8B --weight-format fp16 --config_file_path models/config.json --model_repository_path models
-python demos/common/export_models/export_model.py text_generation --source_model Qwen/Qwen3-8B --model_name openvino-qwen3-8b-int8 --weight-format int8 --config_file_path models/config.json --model_repository_path models --tool_parser hermes3 --overwrite_models
-```

-## Starting the model server

-### With Docker
-```bash
-docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:latest --rest_port 8000 --config_path /workspace/config.json
-```

-### On Baremetal
-```bash
-ovms --rest_port 8000 --config_path ./models/config.json
-```
+Examples of LLM and VLM model deployment are documented in other demos, such as
+[Agentic usage for LLM models](../agentic_ai/README.md) and
+[Using VLM models](../vlm/README.md).
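For a quick local run, a minimal sketch; it assumes the models were already exported to `./models` with a `config.json`, as shown in the linked demos, and uses the server's config status endpoint as a readiness check:

```bash
docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:latest --rest_port 8000 --config_path /workspace/config.json
curl http://localhost:8000/v1/config
```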


## Running the tests for LLM models

-```bash
+```text
lm-eval --model local-chat-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://localhost:8000/v3/chat/completions,num_concurrent=1,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path test/ --seed 1 --apply_chat_template --limit 100

local-chat-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'base_url': 'http://localhost:8000/v3/chat/completions', 'num_concurrent': 10, 'max_retries': 3, 'tokenized_requests': False}), gen_kwargs: ({}), limit: 100.0, num_fewshot: None, batch_size: 1
@@ -52,7 +36,7 @@ local-chat-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'base

While testing a non-chat model against the `completions` endpoint, the command would look like this:

-```bash
+```text
lm-eval --model local-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B,base_url=http://localhost:8000/v3/completions,num_concurrent=1,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path results/ --seed 1 --limit 100

local-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B', 'base_url': 'http://localhost:8000/v3/completions', 'num_concurrent': 10, 'max_retries': 3, 'tokenized_requests': False}), gen_kwargs: ({}), limit: 100.0, num_fewshot: None, batch_size: 1
@@ -64,19 +48,19 @@ local-completions ({'model': 'meta-llama/Meta-Llama-3.1-8B', 'base_url': 'http:/

Other examples are below:

-```bash
+```text
lm-eval --model local-chat-completions --tasks leaderboard_ifeval --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://localhost:8000/v3/chat/completions,num_concurrent=10,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path test/ --seed 1 --limit 100 --apply_chat_template
```

-```bash
+```text
lm-eval --model local-completions --tasks wikitext --model_args model=meta-llama/Meta-Llama-3.1-8B,base_url=http://localhost:8000/v3/completions,num_concurrent=10,max_retries=3,tokenized_requests=False --verbosity DEBUG --log_samples --output_path test/ --seed 1 --limit 100
```

## Running the tests for VLM models

Use the [lmms-eval project](https://github.com/EvolvingLMMs-Lab/lmms-eval) with the `mme` and `mmmu_val` tasks.

-```bash
+```text
export OPENAI_BASE_URL=http://localhost:8000/v3
export OPENAI_API_KEY="unused"
git clone https://github.com/EvolvingLMMs-Lab/lmms-eval
@@ -85,7 +69,7 @@ git checkout 88b23e2bfa16a1edbc16e9e238ed82130b3a4f56
pip install -e . --extra-index-url "https://download.pytorch.org/whl/cpu"
python -m lmms_eval \
--model openai_compatible \
-    --model_args model_version=OpenGVLab/InternVL2_5-8B,max_retries=1 \
+    --model_args model_version=OpenVINO/InternVL2_5-8B_int4-ov,max_retries=1 \
--tasks mme,mmmu_val \
--batch_size 1 \
--log_samples \