Skip to content

Commit 8844bc8

Browse files
mzegla authored and porlows1 committed
Use mainstream optimum in speculative decoding demo (#4190)
1 parent df2fe7e commit 8844bc8

3 files changed

Lines changed: 21 additions & 13 deletions

File tree

demos/common/export_models/export_model.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -403,11 +403,13 @@ def export_text_generation_model(model_repository_path, source_model, model_name
403403
print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1")
404404
task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1"
405405
optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
406+
print('Running command: ', optimum_command) # for debug purposes
406407
if os.system(optimum_command):
407408
raise ValueError("Failed to export llm model", source_model)
408409
if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))):
409410
print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model")
410411
convert_tokenizer_command = f"convert_tokenizer --with-detokenizer --trust-remote-code -o {llm_model_path} {source_model}"
412+
print('Running command: ', convert_tokenizer_command) # for debug purposes
411413
if os.system(convert_tokenizer_command):
412414
raise ValueError("Failed to export tokenizer and detokenizer", source_model)
413415
### Export draft model for speculative decoding
@@ -430,8 +432,9 @@ def export_text_generation_model(model_repository_path, source_model, model_name
430432
additional_options = ""
431433
if args["draft_eagle3_mode"]:
432434
print("Using eagle3 option for the draft model export")
433-
additional_options += " --eagle3 --task text-generation-with-past"
435+
additional_options += " --task text-generation-with-past"
434436
optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path)
437+
print('Running command: ', optimum_command) # for debug purposes
435438
if os.system(optimum_command):
436439
raise ValueError("Failed to export llm model", source_model)
437440

@@ -482,12 +485,12 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
482485
print("Exporting embeddings model to ",destination_path)
483486
if not os.path.isdir(destination_path) or args['overwrite_models']:
484487
optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
485-
print('Running command:', optimum_command) # for debug purposes
488+
print('Running command: ', optimum_command) # for debug purposes
486489
if os.system(optimum_command):
487490
raise ValueError("Failed to export embeddings model", source_model)
488491
print("Exporting tokenizer to ", destination_path)
489492
convert_tokenizer_command = "convert_tokenizer -o {} {} {}".format(destination_path, source_model, set_max_context_length)
490-
print('Running command:', convert_tokenizer_command) # for debug purposes
493+
print('Running command: ', convert_tokenizer_command) # for debug purposes
491494
if (os.system(convert_tokenizer_command)):
492495
raise ValueError("Failed to export tokenizer model", source_model)
493496
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(embedding_graph_ov_template)
@@ -502,6 +505,7 @@ def export_text2speech_model(model_repository_path, source_model, model_name, pr
502505
print("Exporting text2speech model to ",destination_path)
503506
if not os.path.isdir(destination_path) or args['overwrite_models']:
504507
optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path)
508+
print('Running command: ', optimum_command) # for debug purposes
505509
if os.system(optimum_command):
506510
raise ValueError("Failed to export text2speech model", source_model)
507511
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(t2s_graph_template)
@@ -516,6 +520,7 @@ def export_speech2text_model(model_repository_path, source_model, model_name, pr
516520
print("Exporting speech2text model to ",destination_path)
517521
if not os.path.isdir(destination_path) or args['overwrite_models']:
518522
optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path)
523+
print('Running command: ', optimum_command) # for debug purposes
519524
if os.system(optimum_command):
520525
raise ValueError("Failed to export speech2text model", source_model)
521526
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(s2t_graph_template)
@@ -530,6 +535,7 @@ def export_rerank_model_ov(model_repository_path, source_model, model_name, prec
530535
print("Exporting rerank model to ",destination_path)
531536
if not os.path.isdir(destination_path) or args['overwrite_models']:
532537
optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
538+
print('Running command: ', optimum_command) # for debug purposes
533539
if os.system(optimum_command):
534540
raise ValueError("Failed to export rerank model", source_model)
535541
print("Exporting tokenizer to ", destination_path)
@@ -551,7 +557,7 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
551557
print("Model index file already exists. Skipping conversion, re-generating graph only.")
552558
else:
553559
optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} {}".format(source_model, precision, task_parameters['extra_quantization_params'], target_path)
554-
print(f'optimum cli command: {optimum_command}')
560+
print('Running command: ', optimum_command) # for debug purposes
555561
if os.system(optimum_command):
556562
raise ValueError("Failed to export image generation model", source_model)
557563

demos/common/export_models/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ optimum-intel@git+https://github.com/huggingface/optimum-intel.git@d4dd21a3aa89c
55
accelerate
66
datasets
77
diffusers # for image generation
8+
einops
89
nncf
910
numpy
1011
openvino-tokenizers==2026.2.0.0rc2

demos/continuous_batching/speculative_decoding/README.md

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -34,16 +34,13 @@ Python environment setup:
3434
curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py
3535
pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt
3636

37-
# Override optimum-intel with version supporting eagle3
38-
python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@xufang/add_eagle3_draft_model_conversion
39-
4037
mkdir models
4138
```
4239

4340
Run `export_model.py` script to download and quantize the model:
4441

4542
```console
46-
python export_model.py text_generation --source_model Qwen/Qwen3-8B --draft_source_model Tengyunw/qwen3_8b_eagle3 --draft_eagle3_mode --weight-format int4 --config_file_path models/config.json --model_repository_path models
43+
python export_model.py text_generation --source_model Qwen/Qwen3-8B --draft_source_model AngelSlim/Qwen3-8B_eagle3 --draft_eagle3_mode --weight-format int4 --config_file_path models/config.json --model_repository_path models
4744
```
4845

4946
Draft model inherits all scheduler properties from the main model.
@@ -55,6 +52,12 @@ models
5552
└── Qwen
5653
└── Qwen3-8B
5754
├── added_tokens.json
55+
├── AngelSlim-Qwen3-8B_eagle3
56+
│   ├── config.json
57+
│   ├── generation_config.json
58+
│   ├── openvino_config.json
59+
│   ├── openvino_model.bin
60+
│   └── openvino_model.xml
5861
├── chat_template.jinja
5962
├── config.json
6063
├── generation_config.json
@@ -68,14 +71,10 @@ models
6871
├── openvino_tokenizer.bin
6972
├── openvino_tokenizer.xml
7073
├── special_tokens_map.json
71-
├── Tengyunw-qwen3_8b_eagle3
72-
│   ├── config.json
73-
│   ├── generation_config.json
74-
│   ├── openvino_model.bin
75-
│   └── openvino_model.xml
7674
├── tokenizer_config.json
7775
├── tokenizer.json
7876
└── vocab.json
77+
7978
```
8079

8180
## Server Deployment
@@ -316,6 +315,8 @@ for chunk in stream:
316315
```
317316

318317
Output:
318+
319+
```
319320
if len(numbers) <= 1:
320321
return numbers
321322
else:

0 commit comments

Comments
 (0)