-
Notifications
You must be signed in to change notification settings - Fork 260
Expand file tree
/
Copy pathexport_model.py
More file actions
741 lines (686 loc) · 45 KB
/
Copy pathexport_model.py
File metadata and controls
741 lines (686 loc) · 45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
#
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import jinja2
import json
import shutil
import tempfile
from pathlib import Path
from huggingface_hub import snapshot_download
def add_common_arguments(parser):
parser.add_argument('--model_repository_path', required=False, default='models', help='Where the model should be exported to', dest='model_repository_path')
parser.add_argument('--source_model', required=True, help='HF model name or path to the local folder with PyTorch or OpenVINO model', dest='source_model')
parser.add_argument('--model_name', required=False, default=None, help='Model name that should be used in the deployment. Equal to source_model if HF model name is used', dest='model_name')
parser.add_argument('--weight-format', default='int8', help='precision of the exported model', dest='precision')
parser.add_argument('--config_file_path', default='config.json', help='path to the config file', dest='config_file_path')
parser.add_argument('--overwrite_models', default=False, action='store_true', help='Overwrite the model if it already exists in the models repository', dest='overwrite_models')
parser.add_argument('--target_device', default="CPU", help='CPU, GPU, NPU or HETERO, default is CPU', dest='target_device')
parser.add_argument('--ov_cache_dir', default=None, help='Folder path for compilation cache to speedup initialization time', dest='ov_cache_dir')
parser.add_argument('--extra_quantization_params', required=False, help='Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"', dest='extra_quantization_params')
# Top-level CLI: exactly one sub-command must be given; its name is stored under 'task'.
parser = argparse.ArgumentParser(description='Export Hugging face models to OVMS models repository including all configuration for deployments')
subparsers = parser.add_subparsers(help='subcommand help', required=True, dest='task')

# text_generation sub-command: the common options plus pipeline, scheduler/KV-cache,
# speculative decoding, NPU and output-parser settings.
parser_text = subparsers.add_parser('text_generation', help='export model for chat and completion endpoints')
add_common_arguments(parser_text)
parser_text.add_argument('--pipeline_type', default=None, choices=["LM", "LM_CB", "VLM", "VLM_CB", "AUTO"], help='Type of the pipeline to be used. AUTO is used by default', dest='pipeline_type')
parser_text.add_argument('--kv_cache_precision', default=None, choices=["u8"], help='u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.', dest='kv_cache_precision')
# Only the literal string "true" (case-insensitive) enables prefix caching; any other value disables it.
parser_text.add_argument('--enable_prefix_caching', type=lambda x: (str(x).lower() == 'true'), default=True, help='This algorithm is used to cache the prompt tokens. Default is True.', dest='enable_prefix_caching')
# store_false: 'dynamic_split_fuse' is True unless this flag is passed.
# The help text previously duplicated the --max_num_batched_tokens description.
parser_text.add_argument('--disable_dynamic_split_fuse', action='store_false', help='Disable the dynamic split fuse algorithm. It is enabled by default.', dest='dynamic_split_fuse')
parser_text.add_argument('--max_num_batched_tokens', default=None, help='empty or integer. The maximum number of tokens that can be batched together.', dest='max_num_batched_tokens')
parser_text.add_argument('--max_num_seqs', default=None, help='256 by default. The maximum number of sequences that can be processed together.', dest='max_num_seqs')
parser_text.add_argument('--cache_size', default=0, type=int, help='KV cache size in GB. If not set, cache is allocated dynamically.', dest='cache_size')
parser_text.add_argument('--draft_source_model', required=False, default=None, help='HF model name or path to the local folder with PyTorch or OpenVINO draft model. '
                         'Using this option will create configuration for speculative decoding', dest='draft_source_model')
parser_text.add_argument('--draft_model_name', required=False, default=None, help='Draft model name that should be used in the deployment. '
                         'Equal to draft_source_model if HF model name is used. Available only in draft_source_model has been specified.', dest='draft_model_name')
parser_text.add_argument('--draft_eagle3_mode', action='store_true', help='Set this flag if you use EAGLE3 draft model for speculative decoding', dest='draft_eagle3_mode')
parser_text.add_argument('--max_prompt_len', required=False, type=int, default=None, help='Sets NPU specific property for maximum number of tokens in the prompt. '
                         'Not effective if target device is not NPU', dest='max_prompt_len')
parser_text.add_argument('--prompt_lookup_decoding', action='store_true', help='Set pipeline to use prompt lookup decoding', dest='prompt_lookup_decoding')
parser_text.add_argument('--reasoning_parser', choices=["qwen3", "gptoss"], help='Set the type of the reasoning parser for reasoning content extraction', dest='reasoning_parser')
parser_text.add_argument('--tool_parser', choices=["llama3", "phi4", "hermes3", "mistral", "qwen3coder", "gptoss", "devstral", "lfm2"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
parser_text.add_argument('--enable_tool_guided_generation', action='store_true', help='Enables enforcing tool schema during generation. Requires setting tool_parser', dest='enable_tool_guided_generation')
# embeddings_ov sub-command: the common options plus normalization, pooling,
# truncation and stream-count settings.
parser_embeddings_ov = subparsers.add_parser(
    'embeddings_ov',
    help='export model for embeddings endpoint with directory structure aligned with OpenVINO tools',
)
add_common_arguments(parser_embeddings_ov)
# store_false: 'normalize' stays True unless --skip_normalize is passed.
parser_embeddings_ov.add_argument(
    '--skip_normalize',
    dest='normalize',
    action='store_false',
    default=True,
    help='Skip normalize the embeddings.',
)
parser_embeddings_ov.add_argument(
    '--pooling',
    dest='pooling',
    choices=["CLS", "LAST", "MEAN"],
    default="CLS",
    help='Embeddings pooling mode',
)
parser_embeddings_ov.add_argument(
    '--truncate',
    dest='truncate',
    action='store_true',
    default=False,
    help='Truncate the prompts to fit to the embeddings model',
)
parser_embeddings_ov.add_argument(
    '--num_streams',
    dest='num_streams',
    type=int,
    default=1,
    help='The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.',
)
# rerank_ov sub-command: the common options plus stream count and maximum document length.
parser_rerank_ov = subparsers.add_parser(
    'rerank_ov',
    help='export model for rerank endpoint with directory structure aligned with OpenVINO tools',
)
add_common_arguments(parser_rerank_ov)
parser_rerank_ov.add_argument(
    '--num_streams',
    dest='num_streams',
    type=int,
    default=1,
    help='The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.',
)
parser_rerank_ov.add_argument(
    '--max_doc_length',
    dest='max_doc_length',
    type=int,
    default=16000,
    help='Maximum length of input documents in tokens',
)
# image_generation sub-command: the common options plus resolution, guidance,
# per-request limits and LoRA adapter settings.
parser_image_generation = subparsers.add_parser(
    'image_generation',
    help='export model for image generation endpoint',
)
add_common_arguments(parser_image_generation)
parser_image_generation.add_argument(
    '--num_streams',
    dest='num_streams',
    type=int,
    default=0,
    help='The number of parallel execution streams to use for the models in the pipeline.',
)
# The following string-valued options default to "" (empty string) when not given.
parser_image_generation.add_argument(
    '--resolution',
    dest='resolution',
    default="",
    help='Selection of allowed resolutions in a format of WxH; W=width H=height, space separated. If only one is selected, the pipeline will be reshaped to static.',
)
parser_image_generation.add_argument(
    '--guidance_scale',
    dest='guidance_scale',
    default="",
    help='Static guidance scale for the image generation requests. If not specified, default 7.5f is used.',
)
parser_image_generation.add_argument(
    '--num_images_per_prompt',
    dest='num_images_per_prompt',
    default="",
    help='Static number of images to be generated per the image generation request. If not specified, default 1 is used.',
)
parser_image_generation.add_argument(
    '--max_resolution',
    dest='max_resolution',
    default="",
    help='Max allowed resolution in a format of WxH; W=width H=height',
)
parser_image_generation.add_argument(
    '--default_resolution',
    dest='default_resolution',
    default="",
    help='Default resolution when not specified by client',
)
# The following integer limits default to 0 when not given.
parser_image_generation.add_argument(
    '--max_num_images_per_prompt',
    dest='max_num_images_per_prompt',
    type=int,
    default=0,
    help='Max allowed number of images client is allowed to request for a given prompt',
)
parser_image_generation.add_argument(
    '--default_num_inference_steps',
    dest='default_num_inference_steps',
    type=int,
    default=0,
    help='Default number of inference steps when not specified by client',
)
parser_image_generation.add_argument(
    '--max_num_inference_steps',
    dest='max_num_inference_steps',
    type=int,
    default=0,
    help='Max allowed number of inference steps client is allowed to request for a given prompt',
)
parser_image_generation.add_argument(
    '--source_loras',
    dest='source_loras',
    default=None,
    help=(
        'LoRA adapters to apply. Format: alias1=org1/repo1[:alpha],alias2=org2/repo2[@file.safetensors][:alpha],'
        'composite=@alias1:alpha+@alias2:alpha '
        '@filename specifies which .safetensors file (auto-detected when repo has exactly one). '
        ':alpha sets adapter alpha (default 1.0). '
        'Composite entries (source starts with @) blend multiple adapters. Only for image_generation task.'
    ),
)
# text2speech sub-command: the common options plus stream count, TTS model type
# and the speecht5-specific vocoder/speaker settings.
parser_text2speech = subparsers.add_parser(
    'text2speech',
    help='export model for text2speech endpoint',
)
add_common_arguments(parser_text2speech)
parser_text2speech.add_argument(
    '--num_streams',
    dest='num_streams',
    type=int,
    default=0,
    help='The number of parallel execution streams to use for the models in the pipeline.',
)
parser_text2speech.add_argument(
    '--model_type',
    dest='model_type',
    choices=['speecht5', 'kokoro'],
    default='speecht5',
    help='Type of the source TTS model. speecht5 uses optimum-cli; kokoro uses a dedicated PyTorch->OpenVINO conversion path.',
)
parser_text2speech.add_argument(
    '--vocoder',
    dest='vocoder',
    type=str,
    help='The vocoder model to use for speecht5. For example microsoft/speecht5_hifigan. Ignored for kokoro.',
)
parser_text2speech.add_argument(
    '--speaker_name',
    dest='speaker_name',
    type=str,
    help='Name of the speaker (speecht5 only; for kokoro all voices from the HF repo are exported).',
)
parser_text2speech.add_argument(
    '--speaker_path',
    dest='speaker_path',
    type=str,
    help='Path to the speaker.bin file (speecht5 only; for kokoro all voices from the HF repo are exported).',
)
# speech2text sub-command: the common options plus stream count and word-timestamp support.
parser_speech2text = subparsers.add_parser(
    'speech2text',
    help='export model for speech2text endpoint',
)
add_common_arguments(parser_speech2text)
parser_speech2text.add_argument(
    '--num_streams',
    dest='num_streams',
    type=int,
    default=0,
    help='The number of parallel execution streams to use for the models in the pipeline.',
)
parser_speech2text.add_argument(
    '--enable_word_timestamps',
    dest='enable_word_timestamps',
    action='store_true',
    default=False,
    help='Load model with word timestamps support.',
)

# Parse the command line once at module level; the export functions read this dict as a global.
args = vars(parser.parse_args())
# Graph template for the text2speech servable: a single "T2sCalculator" node
# wired to the HTTP request/response payload streams.
# Placeholders use Jinja syntax: model_path, num_streams (falls back to 1 when
# unset or falsy), target_device (falls back to "CPU") and an optional list of
# voices, each with a name and a path.
# NOTE(review): the rendering call is not visible in this part of the file —
# presumably rendered with jinja2 like the other graph templates; confirm.
t2s_graph_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
name: "T2sExecutor"
input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
calculator: "T2sCalculator"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node_options: {
[type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
models_path: "{{model_path}}",
plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
target_device: "{{target_device|default("CPU", true)}}",
{%- if voices %}
voices: [
{%- for v in voices %}
{
name: "{{v.name}}",
path: "{{v.path}}"
}{% if not loop.last %},{% endif %}
{%- endfor %}
]
{%- endif %}
}
}
}
"""
# Graph template for the speech2text servable: a single "S2tCalculator" node
# with a LOOPBACK back-edge stream handled by a SyncSetInputStreamHandler.
# Placeholders use Jinja syntax: model_path, num_streams (falls back to 1 when
# unset or falsy), target_device (falls back to "CPU") and
# enable_word_timestamps (emitted as true/false).
# NOTE(review): the rendering call is not visible in this part of the file —
# presumably rendered with jinja2 like the other graph templates; confirm.
s2t_graph_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
name: "S2tExecutor"
input_side_packet: "STT_NODE_RESOURCES:s2t_servable"
calculator: "S2tCalculator"
input_stream: "LOOPBACK:loopback"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "LOOPBACK:loopback"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
input_stream_info: {
tag_index: 'LOOPBACK:0',
back_edge: true
}
node_options: {
[type.googleapis.com / mediapipe.S2tCalculatorOptions]: {
models_path: "{{model_path}}",
plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
target_device: "{{target_device|default("CPU", true)}}",
enable_word_timestamps: {% if not enable_word_timestamps %}false{% else %}true{% endif%},
}
}
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler",
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "LOOPBACK:0"
}
}
}
}
}
"""
# Graph template for the embeddings servable: a single "EmbeddingsCalculatorOV"
# node. Rendered with jinja2 by export_embeddings_model_ov and written to graph.pbtxt.
# Placeholders: model_path, num_streams, normalize (emitted as true/false),
# optional pooling and truncate, and target_device (falls back to "CPU").
embedding_graph_ov_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
input_stream: "REQUEST_PAYLOAD:input"
output_stream: "RESPONSE_PAYLOAD:output"
node {
name: "EmbeddingsExecutor"
input_side_packet: "EMBEDDINGS_NODE_RESOURCES:embeddings_servable"
calculator: "EmbeddingsCalculatorOV"
input_stream: "REQUEST_PAYLOAD:input"
output_stream: "RESPONSE_PAYLOAD:output"
node_options: {
[type.googleapis.com / mediapipe.EmbeddingsCalculatorOVOptions]: {
models_path: "{{model_path}}",
plugin_config: '{"NUM_STREAMS": "{{num_streams}}" }',
normalize_embeddings: {% if not normalize %}false{% else %}true{% endif%},
{%- if pooling %}
pooling: {{pooling}},{% endif %}
{%- if truncate %}
truncate: true,{% endif %}
target_device: "{{target_device|default("CPU", true)}}"
}
}
}
"""
# Graph template for the rerank servable: a single "RerankCalculatorOV" node.
# Placeholders use Jinja syntax: model_path, num_streams and target_device
# (falls back to "CPU").
# NOTE(review): the rendering call is not visible in this part of the file —
# presumably rendered with jinja2 like the other graph templates; confirm.
rerank_graph_ov_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
input_stream: "REQUEST_PAYLOAD:input"
output_stream: "RESPONSE_PAYLOAD:output"
node {
name: "RerankExecutor"
input_side_packet: "RERANK_NODE_RESOURCES:rerank_servable"
calculator: "RerankCalculatorOV"
input_stream: "REQUEST_PAYLOAD:input"
output_stream: "RESPONSE_PAYLOAD:output"
node_options: {
[type.googleapis.com / mediapipe.RerankCalculatorOVOptions]: {
models_path: "{{model_path}}",
plugin_config: '{"NUM_STREAMS": "{{num_streams}}" }',
target_device: "{{target_device|default("CPU", true)}}"
}
}
}
"""
# Graph template for the text generation servable: a single "HttpLLMCalculator"
# node with a LOOPBACK back-edge stream. Rendered with jinja2 by
# export_text_generation_model and written to graph.pbtxt.
# Placeholders: optional pipeline_type, model_path, plugin_config (a JSON string),
# enable_prefix_caching, cache_size (falls back to 0), optional
# max_num_batched_tokens, dynamic_split_fuse (only emitted when disabled),
# max_num_seqs (forced to 1 when draft_eagle3_mode is set, otherwise falls back
# to 256), target_device (falls back to "CPU"), and the optional draft-model,
# reasoning_parser, tool_parser and enable_tool_guided_generation sections.
text_generation_graph_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node: {
name: "LLMExecutor"
calculator: "HttpLLMCalculator"
input_stream: "LOOPBACK:loopback"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
input_side_packet: "LLM_NODE_RESOURCES:llm"
output_stream: "LOOPBACK:loopback"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
input_stream_info: {
tag_index: 'LOOPBACK:0',
back_edge: true
}
node_options: {
[type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
{%- if pipeline_type %}
pipeline_type: {{pipeline_type}},{% endif %}
models_path: "{{model_path}}",
plugin_config: '{{plugin_config}}',
enable_prefix_caching: {% if not enable_prefix_caching %}false{% else %} true{% endif%},
cache_size: {{cache_size|default("0", true)}},
{%- if max_num_batched_tokens %}
max_num_batched_tokens: {{max_num_batched_tokens}},{% endif %}
{%- if not dynamic_split_fuse %}
dynamic_split_fuse: false, {% endif %}
max_num_seqs: {% if draft_eagle3_mode %}1{% else %}{{max_num_seqs|default("256", true)}}{% endif %},
device: "{{target_device|default("CPU", true)}}",
{%- if draft_model_dir_name %}
# Speculative decoding configuration
draft_models_path: "./{{draft_model_dir_name}}",
draft_device: "{{target_device|default("CPU", true)}}",
draft_eagle3_mode: {{draft_eagle3_mode|default(false)}},{% endif %}
{%- if reasoning_parser %}
reasoning_parser: "{{reasoning_parser}}",{% endif %}
{%- if tool_parser %}
tool_parser: "{{tool_parser}}",{% endif %}
{%- if enable_tool_guided_generation %}
enable_tool_guided_generation: {% if not enable_tool_guided_generation %}false{% else %} true{% endif%},{% endif %}
}
}
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler",
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "LOOPBACK:0"
}
}
}
}
}"""
# Graph template for the image generation servable: a single "ImageGenCalculator" node.
# Placeholders use Jinja syntax: model_path, optional plugin_config_str,
# target_device (falls back to "CPU"), optional resolution / num_images_per_prompt /
# guidance_scale / max_resolution / default_resolution, integer limits that are
# only emitted when greater than 0, and repeated lora_adapters and
# composite_lora_adapters entries (a component's alpha is omitted when it equals 1.0).
# NOTE(review): the rendering call is not visible in this part of the file —
# presumably rendered with jinja2 like the other graph templates; confirm.
image_generation_graph_template = """# OVMS_GRAPH_QUEUE_MAX_SIZE: AUTO
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node: {
name: "ImageGenExecutor"
calculator: "ImageGenCalculator"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
input_side_packet: "IMAGE_GEN_NODE_RESOURCES:pipes"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node_options: {
[type.googleapis.com / mediapipe.ImageGenCalculatorOptions]: {
models_path: "{{model_path}}",
{%- if plugin_config_str %}
plugin_config: '{{plugin_config_str}}',{% endif %}
device: "{{target_device|default("CPU", true)}}",
{%- if resolution %}
resolution: "{{resolution}}",{% endif %}
{%- if num_images_per_prompt %}
num_images_per_prompt: {{num_images_per_prompt}},{% endif %}
{%- if guidance_scale %}
guidance_scale: {{guidance_scale}},{% endif %}
{%- if max_resolution %}
max_resolution: '{{max_resolution}}',{% endif %}
{%- if default_resolution %}
default_resolution: '{{default_resolution}}',{% endif %}
{%- if max_num_images_per_prompt > 0 %}
max_num_images_per_prompt: {{max_num_images_per_prompt}},{% endif %}
{%- if default_num_inference_steps > 0 %}
default_num_inference_steps: {{default_num_inference_steps}},{% endif %}
{%- if max_num_inference_steps > 0 %}
max_num_inference_steps: {{max_num_inference_steps}},{% endif %}
{%- for lora in lora_adapters %}
lora_adapters { alias: "{{lora.alias}}" path: "{{lora.path}}"{% if lora.alpha is not none %} alpha: {{lora.alpha}}{% endif %} mode: DYNAMIC }
{%- endfor %}
{%- for composite in composite_lora_adapters %}
composite_lora_adapters {
alias: "{{composite.alias}}"
{%- for comp in composite.components %}
components { adapter_alias: "{{comp.adapter_alias}}"{% if comp.alpha != 1.0 %} alpha: {{comp.alpha}}{% endif %} }
{%- endfor %}
}
{%- endfor %}
}
}
}"""
def export_rerank_tokenizer(source_model, destination_path, max_length):
    """Save the Hugging Face tokenizer and its OpenVINO conversion to ``destination_path``.

    The tokenizer is loaded from ``source_model``, its ``model_max_length`` is
    overridden with ``max_length``, and it is saved in HF format. It is then
    converted (without adding special tokens) and stored as
    ``openvino_tokenizer.xml`` in the same folder.
    """
    # Heavy third-party dependencies are imported only when this export path runs.
    import openvino as ov
    from openvino_tokenizers import convert_tokenizer
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(source_model)
    tokenizer.model_max_length = max_length
    tokenizer.save_pretrained(destination_path)

    converted_tokenizer = convert_tokenizer(tokenizer, add_special_tokens=False)
    tokenizer_xml_path = os.path.join(destination_path, "openvino_tokenizer.xml")
    ov.save_model(converted_tokenizer, tokenizer_xml_path)
def set_rt_info(model_folder_path, model_filename, config_filename):
    """Copy every top-level entry of a JSON config into the model's ``model_info`` rt_info.

    The model ``model_filename`` is read from ``model_folder_path``, each
    key/value pair of ``config_filename`` is stored under
    ``['model_info', key]``, and the model is saved back over the original
    ``.xml``/``.bin`` files via a temporary ``*_temp`` copy.
    """
    import openvino as ov

    model_xml_path = os.path.join(model_folder_path, model_filename)
    model = ov.Core().read_model(model_xml_path)

    with open(os.path.join(model_folder_path, config_filename), 'r') as config_file:
        config_entries = json.load(config_file)

    for entry_name, entry_value in config_entries.items():
        try:
            model.set_rt_info(entry_value, ['model_info', entry_name])
        except Exception:
            # Values that cannot be stored as-is are stored as their string form.
            model.set_rt_info(str(entry_value), ['model_info', entry_name])

    staged_xml = model_filename.replace('.xml', '_temp.xml')
    staged_bin = staged_xml.replace('.xml', '.bin')
    final_bin = model_filename.replace('.xml', '.bin')

    ov.save_model(model, os.path.join(model_folder_path, staged_xml))
    # Drop the loaded model before replacing the files it was read from
    # (presumably so they are no longer in use — confirm).
    del model

    for staged_name, final_name in ((staged_xml, model_filename), (staged_bin, final_bin)):
        shutil.move(os.path.join(model_folder_path, staged_name), os.path.join(model_folder_path, final_name))
def get_models_max_context(tmpdirname, config_filename):
    """Return the model's maximum context length read from a JSON config file.

    The value of ``max_position_embeddings`` is preferred; ``n_positions`` is
    used as a fallback. A key that is missing is treated the same as a key set
    to ``null``, so configs that define only ``n_positions`` (or neither key)
    are handled instead of raising ``KeyError``.

    Args:
        tmpdirname: Folder containing the config file.
        config_filename: Name of the JSON config file inside ``tmpdirname``.

    Returns:
        The first non-null value among the two keys, or ``None`` when neither
        is available.
    """
    with open(os.path.join(tmpdirname, config_filename), 'r') as config_file:
        config_data = json.load(config_file)
    for key in ('max_position_embeddings', 'n_positions'):
        value = config_data.get(key)
        if value is not None:
            return value
    return None
def add_servable_to_config(config_path, model_name, base_path):
    """Add or update the servable ``model_name`` in the JSON config at ``config_path``.

    A new config file is created when none exists. Entries from a legacy
    ``mediapipe_config_list`` are migrated into ``model_config_list`` under the
    name ``<name>_model`` (unless already present) and the legacy list is
    removed. Every existing entry named ``model_name`` gets its ``base_path``
    replaced; if there is none, a new entry is appended. ``base_path`` is
    stored with forward slashes.
    """
    base_path = Path(base_path).as_posix()
    print(config_path, model_name, base_path)

    if not os.path.isfile(config_path):
        print("Creating new config file")
        with open(config_path, 'w') as fresh_file:
            json.dump({'mediapipe_config_list': [], "model_config_list": []}, fresh_file, indent=4)

    with open(config_path, 'r') as current_file:
        config_data = json.load(current_file)

    if 'model_config_list' not in config_data:
        config_data['model_config_list'] = []
    servables = config_data['model_config_list']

    # Fold the legacy mediapipe_config_list into model_config_list, then drop it.
    if 'mediapipe_config_list' in config_data:
        for legacy_entry in config_data['mediapipe_config_list']:
            if 'name' not in legacy_entry or 'base_path' not in legacy_entry:
                continue
            already_migrated = any(
                entry['config']['name'] == legacy_entry['name'] + "_model" for entry in servables
            )
            if already_migrated:
                continue
            servables.append({'config': {'name': legacy_entry['name'] + "_model", 'base_path': legacy_entry['base_path']}})
        del config_data['mediapipe_config_list']

    matching_entries = [entry for entry in servables if entry['config']['name'] == model_name]
    if matching_entries:
        for entry in matching_entries:
            entry['config']['base_path'] = base_path
    else:
        servables.append({'config': {'name': model_name, 'base_path': base_path}})

    with open(config_path, 'w') as updated_file:
        json.dump(config_data, updated_file, indent=4)
    print("Added servable to config file", config_path)
def export_text_generation_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    """Export a text generation model (and optional draft model) and register it.

    Steps:
      1. Validate the gptoss tool/reasoning parser combination.
      2. Obtain the main model: reuse a local OpenVINO folder, download an
         ``OpenVINO/`` Hugging Face repository, or convert with ``optimum-cli``.
      3. If ``draft_source_model`` is set, obtain the draft model into a
         sub-folder of the main model folder (speculative decoding).
      4. Build the plugin config JSON, render ``graph.pbtxt`` and add the
         servable to the config file.

    Args:
        model_repository_path: Root folder of the models repository.
        source_model: HF model name or local folder of the main model.
        model_name: Servable name; also the sub-folder inside the repository.
        precision: Weight format passed to ``optimum-cli``.
        task_parameters: Parsed CLI options; mutated here (``plugin_config`` is
            added and ``extra_quantization_params`` may get NPU defaults).
        config_file_path: Path of the server config file to update.

    Raises:
        ValueError: On an invalid parser combination, a failed external
            command, or an invalid ``max_prompt_len``.

    Note:
        Reads the module-level ``args`` dict for ``overwrite_models`` and
        ``draft_eagle3_mode``.
    """
    model_path = "./"
    # validation for tool parsing
    if (task_parameters.get('tool_parser', None) == 'gptoss' or task_parameters.get('reasoning_parser', None) == 'gptoss'):
        if (task_parameters.get('tool_parser', None) != task_parameters.get('reasoning_parser', None)):
            raise ValueError("Both tool_parser and reasoning_parser need to be set to gptoss when one of them is set to gptoss")

    ### Export model
    if os.path.isfile(os.path.join(source_model, 'openvino_model.xml')) or os.path.isfile(os.path.join(source_model, 'openvino_language_model.xml')):
        print("OV model is source folder. Skipping conversion.")
        # The graph points directly at the existing OpenVINO folder.
        model_path = source_model
    elif source_model.startswith("OpenVINO/"):
        if precision:
            print("Precision change is not supported for OpenVINO models. Parameter --weight-format {} will be ignored.".format(precision))
        hugging_face_cmd = "huggingface-cli download {} --local-dir {} ".format(source_model, os.path.join(model_repository_path, model_name))
        if os.system(hugging_face_cmd):
            raise ValueError("Failed to download llm model", source_model)
    else:  # assume HF model name or local pytorch model folder
        llm_model_path = os.path.join(model_repository_path, model_name)
        print("Exporting LLM model to ", llm_model_path)
        if not os.path.isdir(llm_model_path) or args['overwrite_models']:
            if task_parameters['target_device'] == 'NPU':
                if precision != 'int4' and precision != 'nf4':
                    print("NPU target device requires int4 or nf4 precision. Changing to int4")
                    precision = 'int4'
                if task_parameters['extra_quantization_params'] == "":
                    print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1")
                    task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1"
            optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
            print('Running command: ', optimum_command)  # for debug purposes
            if os.system(optimum_command):
                raise ValueError("Failed to export llm model", source_model)
            if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))):
                print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model")
                convert_tokenizer_command = f"convert_tokenizer --with-detokenizer --trust-remote-code -o {llm_model_path} {source_model}"
                print('Running command: ', convert_tokenizer_command)  # for debug purposes
                if os.system(convert_tokenizer_command):
                    raise ValueError("Failed to export tokenizer and detokenizer", source_model)

    ### Export draft model for speculative decoding
    draft_source_model = task_parameters.get("draft_source_model", None)
    draft_model_dir_name = None
    if draft_source_model:
        draft_model_dir_name = draft_source_model.replace("/", "-")  # flatten the name so we don't create nested directory structure
        draft_llm_model_path = os.path.join(model_repository_path, model_name, draft_model_dir_name)
        if os.path.isfile(os.path.join(draft_llm_model_path, 'openvino_model.xml')):
            print("OV model is source folder. Skipping conversion.")
        elif draft_source_model.startswith("OpenVINO/"):
            # Decide on, and download, the DRAFT repository. It must land in
            # draft_llm_model_path because the graph references
            # "./<draft_model_dir_name>" as draft_models_path.
            if precision:
                print("Precision change is not supported for OpenVINO models. Parameter --weight-format {} will be ignored.".format(precision))
            hugging_face_cmd = "huggingface-cli download {} --local-dir {} ".format(draft_source_model, draft_llm_model_path)
            if os.system(hugging_face_cmd):
                raise ValueError("Failed to download draft llm model", draft_source_model)
        else:  # assume HF model name or local pytorch model folder
            print("Exporting draft LLM model to ", draft_llm_model_path)
            if not os.path.isdir(draft_llm_model_path) or args['overwrite_models']:
                additional_options = ""
                if args["draft_eagle3_mode"]:
                    print("Using eagle3 option for the draft model export")
                    additional_options += " --task text-generation-with-past"
                optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path)
                print('Running command: ', optimum_command)  # for debug purposes
                if os.system(optimum_command):
                    raise ValueError("Failed to export draft llm model", draft_source_model)

    ### Prepare plugin config string for jinja rendering
    plugin_config = {}
    if task_parameters['kv_cache_precision'] is not None:
        plugin_config['KV_CACHE_PRECISION'] = task_parameters['kv_cache_precision']
    if task_parameters['max_prompt_len'] is not None:
        if task_parameters['target_device'] != 'NPU':
            raise ValueError("max_prompt_len is only supported for NPU target device")
        if task_parameters['max_prompt_len'] <= 0:
            raise ValueError("max_prompt_len should be a positive integer")
    if task_parameters['ov_cache_dir'] is not None:
        plugin_config['CACHE_DIR'] = task_parameters['ov_cache_dir']
    if task_parameters['prompt_lookup_decoding']:
        plugin_config['prompt_lookup'] = True
    # Additional plugin properties for HETERO
    if "HETERO" in task_parameters['target_device']:
        plugin_config['MODEL_DISTRIBUTION_POLICY'] = 'PIPELINE_PARALLEL'
    if task_parameters['target_device'] == 'NPU':
        # NPU-specific settings are nested under DEVICE_PROPERTIES -> NPU.
        max_prompt_len = task_parameters['max_prompt_len']
        npu_properties = {}
        if max_prompt_len is not None:
            npu_properties['MAX_PROMPT_LEN'] = max_prompt_len
        if task_parameters['enable_prefix_caching']:
            npu_properties['NPUW_LLM_ENABLE_PREFIX_CACHING'] = True
        device_properties = {"NPU": npu_properties}
        plugin_config['DEVICE_PROPERTIES'] = device_properties
    plugin_config_str = json.dumps(plugin_config)
    task_parameters['plugin_config'] = plugin_config_str

    ### Render the graph and register the servable
    os.makedirs(os.path.join(model_repository_path, model_name), exist_ok=True)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(text_generation_graph_template)
    print("task_parameters", task_parameters)
    graph_content = gtemplate.render(model_path=model_path, draft_model_dir_name=draft_model_dir_name, **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))
def export_embeddings_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, truncate=True):
    """Export an embeddings model with its tokenizer, write graph.pbtxt and register it.

    When the destination folder does not exist yet (or ``overwrite_models`` is
    set in the module-level ``args``), the model is converted with
    ``optimum-cli`` and the tokenizer with ``convert_tokenizer``. The graph
    file is always rendered from ``embedding_graph_ov_template`` and the
    servable is added to ``config_file_path``.

    The ``truncate`` parameter is accepted but not used by this function.

    Raises:
        ValueError: If one of the external export commands fails.
    """
    # Extra convert_tokenizer arguments; always empty here, but kept so the
    # command string (including its trailing space) stays unchanged.
    tokenizer_extra_args = ""
    servable_dir = os.path.join(model_repository_path, model_name)
    graph_file = os.path.join(servable_dir, 'graph.pbtxt')

    print("Exporting embeddings model to ",servable_dir)
    if not os.path.isdir(servable_dir) or args['overwrite_models']:
        export_command = (
            f"optimum-cli export openvino --model {source_model} --disable-convert-tokenizer "
            f"--task feature-extraction --weight-format {precision} "
            f"{task_parameters['extra_quantization_params']} --trust-remote-code {servable_dir}"
        )
        print('Running command: ', export_command)  # for debug purposes
        if os.system(export_command) != 0:
            raise ValueError("Failed to export embeddings model", source_model)

        print("Exporting tokenizer to ", servable_dir)
        tokenizer_command = f"convert_tokenizer -o {servable_dir} {source_model} {tokenizer_extra_args}"
        print('Running command: ', tokenizer_command)  # for debug purposes
        if os.system(tokenizer_command) != 0:
            raise ValueError("Failed to export tokenizer model", source_model)

    graph_template = jinja2.Environment(loader=jinja2.BaseLoader).from_string(embedding_graph_ov_template)
    rendered_graph = graph_template.render(model_path="./", **task_parameters)
    with open(graph_file, 'w') as graph_out:
        graph_out.write(rendered_graph)
    print("Created graph {}".format(graph_file))

    relative_base_path = os.path.relpath(servable_dir, os.path.dirname(config_file_path))
    add_servable_to_config(config_file_path, model_name, relative_base_path)
def _list_kokoro_voices(destination_path):
"""optimum-cli's Kokoro exporter writes per-voice speaker embeddings to
<destination_path>/voices/<name>.bin. Return the sorted list of voice names."""
voices_dir = os.path.join(destination_path, "voices")
if not os.path.isdir(voices_dir):
print("Warning: no voices/ directory found under", destination_path)
return []
return sorted(Path(p).stem for p in Path(voices_dir).glob("*.bin"))
def export_text2speech_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    """Export a text-to-speech model and (re)write its serving graph.

    ``task_parameters['model_type']`` selects the flow and defaults to
    ``'speecht5'``:

    * ``'kokoro'``: runs ``optimum-cli`` with ``--task text-to-audio`` when
      ``openvino_model.xml`` is missing (or overwrite is requested), then
      fills ``task_parameters['voices']`` with every voice found by
      ``_list_kokoro_voices``.
    * anything else: runs ``optimum-cli`` with the vocoder passed through
      ``--model-kwargs`` when the destination directory is missing (or
      overwrite is requested). If both ``speaker_name`` and ``speaker_path``
      are set, they become the single entry of ``task_parameters['voices']``.

    ``graph.pbtxt`` is rendered from ``t2s_graph_template`` on every call and
    the servable is registered through ``add_servable_to_config``.

    Raises:
        ValueError: when the vocoder is missing for a speecht5 export, or when
            the export command returns a non-zero status.
    """
    servable_dir = os.path.join(model_repository_path, model_name)
    print("Exporting text2speech model to ",servable_dir)

    is_kokoro = task_parameters.get('model_type', 'speecht5') == 'kokoro'
    if not is_kokoro:
        if not os.path.isdir(servable_dir) or args['overwrite_models']:
            if not task_parameters.get('vocoder'):
                raise ValueError("--vocoder is required when --model_type=speecht5")
            export_cmd = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(
                source_model, precision, task_parameters['vocoder'], servable_dir)
            print('Running command: ', export_cmd)
            if os.system(export_cmd) != 0:
                raise ValueError("Failed to export text2speech model", source_model)
        speaker_name = task_parameters.get('speaker_name')
        speaker_path = task_parameters.get('speaker_path')
        if speaker_name and speaker_path:
            task_parameters['voices'] = [{'name': speaker_name, 'path': speaker_path}]
    else:
        # optimum-intel registers Kokoro under library_name=kokoro / task=text-to-audio.
        # The kokoro exporter also dumps each speaker embedding to voices/<name>.bin.
        model_xml = os.path.join(servable_dir, 'openvino_model.xml')
        if not os.path.isfile(model_xml) or args['overwrite_models']:
            export_cmd = "optimum-cli export openvino --model {} --task text-to-audio --weight-format {} {} --trust-remote-code {}".format(
                source_model, precision, task_parameters['extra_quantization_params'], servable_dir)
            print('Running command:', export_cmd)
            if os.system(export_cmd) != 0:
                raise ValueError("Failed to export kokoro model", source_model)
        # Every available voice goes into the graph; paths are relative to graph.pbtxt.
        voices = []
        for voice in _list_kokoro_voices(servable_dir):
            voices.append({'name': voice, 'path': f'./voices/{voice}.bin'})
        task_parameters['voices'] = voices

    graph_file = os.path.join(servable_dir, 'graph.pbtxt')
    environment = jinja2.Environment(loader=jinja2.BaseLoader)
    rendered_graph = environment.from_string(t2s_graph_template).render(model_path="./", **task_parameters)
    with open(graph_file, 'w') as graph_handle:
        graph_handle.write(rendered_graph)
    print(f"Created graph {graph_file}")

    config_dir = os.path.dirname(config_file_path)
    add_servable_to_config(config_file_path, model_name, os.path.relpath(servable_dir, config_dir))
def export_speech2text_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    """Export a speech-to-text model and (re)write its serving graph.

    ``optimum-cli`` is run when the destination directory is missing or
    ``args['overwrite_models']`` is truthy. ``graph.pbtxt`` is rendered from
    ``s2t_graph_template`` on every call and the servable is registered
    through ``add_servable_to_config``.

    Raises:
        ValueError: when the export command returns a non-zero status.
    """
    servable_dir = os.path.join(model_repository_path, model_name)
    print("Exporting speech2text model to ",servable_dir)

    # args is consulted only when the directory already exists.
    needs_export = args['overwrite_models'] if os.path.isdir(servable_dir) else True
    if needs_export:
        export_cmd = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(
            source_model, precision, servable_dir)
        print('Running command: ', export_cmd)  # for debug purposes
        if os.system(export_cmd) != 0:
            raise ValueError("Failed to export speech2text model", source_model)

    graph_file = os.path.join(servable_dir, 'graph.pbtxt')
    environment = jinja2.Environment(loader=jinja2.BaseLoader)
    rendered_graph = environment.from_string(s2t_graph_template).render(model_path="./", **task_parameters)
    with open(graph_file, 'w') as graph_handle:
        graph_handle.write(rendered_graph)
    print(f"Created graph {graph_file}")

    config_dir = os.path.dirname(config_file_path)
    add_servable_to_config(config_file_path, model_name, os.path.relpath(servable_dir, config_dir))
def export_rerank_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, max_doc_length):
    """Export a rerank model plus tokenizer and (re)write its serving graph.

    When the destination directory is missing, or ``args['overwrite_models']``
    is truthy, the model is converted with ``optimum-cli`` (task
    ``text-classification``) and the tokenizer is produced by
    ``export_rerank_tokenizer``, which receives ``max_doc_length``.
    ``graph.pbtxt`` is rendered from ``rerank_graph_ov_template`` on every
    call and the servable is registered through ``add_servable_to_config``.

    Raises:
        ValueError: when the export command returns a non-zero status.
    """
    servable_dir = os.path.join(model_repository_path, model_name)
    print("Exporting rerank model to ",servable_dir)

    # args is consulted only when the directory already exists.
    needs_export = args['overwrite_models'] if os.path.isdir(servable_dir) else True
    if needs_export:
        export_cmd = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task text-classification --weight-format {} {} --trust-remote-code {}".format(
            source_model,
            precision,
            task_parameters['extra_quantization_params'],
            servable_dir,
        )
        print('Running command: ', export_cmd)  # for debug purposes
        if os.system(export_cmd) != 0:
            raise ValueError("Failed to export rerank model", source_model)
        print("Exporting tokenizer to ", servable_dir)
        export_rerank_tokenizer(source_model, servable_dir, max_doc_length)

    graph_file = os.path.join(servable_dir, 'graph.pbtxt')
    environment = jinja2.Environment(loader=jinja2.BaseLoader)
    rendered_graph = environment.from_string(rerank_graph_ov_template).render(model_path="./", **task_parameters)
    with open(graph_file, 'w') as graph_handle:
        graph_handle.write(rendered_graph)
    print(f"Created graph {graph_file}")

    config_dir = os.path.dirname(config_file_path)
    add_servable_to_config(config_file_path, model_name, os.path.relpath(servable_dir, config_dir))
def _parse_composite_lora(source):
    """Split a composite LoRA spec (``@a:0.5+@b``) into weighted components.

    Components are separated by ``+``; a leading ``@`` on each is dropped and
    an optional trailing ``:<float>`` is the component alpha (default 1.0).
    Raises ValueError if an alpha suffix is not a valid float.
    """
    components = []
    for token in source.split('+'):
        token = token.strip().lstrip('@')
        if ':' in token:
            ref, alpha_text = token.rsplit(':', 1)
            alpha = float(alpha_text)
        else:
            ref, alpha = token, 1.0
        components.append({'adapter_alias': ref, 'alpha': alpha})
    return components


def _split_lora_source(source):
    """Split ``repo[@file.safetensors][:alpha]`` into ``(repo, file, alpha)``.

    ``alpha`` is None when there is no trailing ``:<float>``; ``file`` is an
    empty string when no ``@file`` part is present.
    """
    alpha = None
    head, colon, tail = source.rpartition(':')
    if colon:
        try:
            alpha = float(tail)
            source = head
        except ValueError:
            pass  # Not an alpha suffix (could be part of URL)
    repo, at_sign, safetensors_file = source.rpartition('@')
    if not at_sign:
        repo, safetensors_file = source, ''
    return repo, safetensors_file, alpha


def _normalize_resolution(param, value):
    """Validate a ``WxH`` string and return it with canonical integer parts."""
    parts = value.split('x')
    if len(parts) != 2:
        raise ValueError(param + " should be in WxH format, e.g. 1024x768")
    width, height = parts
    if not (width.isdigit() and height.isdigit()):
        raise ValueError(param + " should be in WxH format with positive integers, e.g. 1024x768")
    return '{}x{}'.format(int(width), int(height))


def export_image_generation_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, num_streams, source_loras):
    """Export an image generation model, resolve LoRA adapters and write the graph.

    Steps, in order:

    1. Run ``optimum-cli export openvino`` unless ``model_index.json`` already
       exists in the target directory.
    2. Parse ``source_loras``, a comma-separated list of ``alias=source`` or
       bare ``source`` entries (the alias then is the text after the last
       ``/``). Empty entries are ignored. A source starting with ``@`` is a
       composite of existing adapters (``@a:0.5+@b``). Any other source is
       ``repo[@file.safetensors][:alpha]``; the repo is fetched with
       ``snapshot_download`` into ``<target>/loras/<repo>`` unless that
       directory already exists. Results are stored in
       ``task_parameters['lora_adapters']`` and
       ``task_parameters['composite_lora_adapters']``.
    3. Build ``plugin_config_str`` from ``num_streams`` and ``ov_cache_dir``.
    4. Validate and normalize ``max_resolution`` / ``default_resolution``.
    5. Render ``graph.pbtxt`` from ``image_generation_graph_template`` and
       register the servable through ``add_servable_to_config``.

    Note that ``task_parameters`` is modified in place.

    Raises:
        ValueError: on export failure, a malformed LoRA alpha or resolution,
            a negative ``num_streams``, or when a LoRA repo holds zero or
            several ``.safetensors`` files and none was selected with ``@``.
    """
    model_path = "./"
    target_path = os.path.join(model_repository_path, model_name)
    model_index_path = os.path.join(target_path, 'model_index.json')

    if os.path.isfile(model_index_path):
        print("Model index file already exists. Skipping conversion, re-generating graph only.")
    else:
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} {}".format(source_model, precision, task_parameters['extra_quantization_params'], target_path)
        print('Running command: ', optimum_command)  # for debug purposes
        if os.system(optimum_command):
            raise ValueError("Failed to export image generation model", source_model)

    # Download and resolve LoRA adapters
    lora_adapters = []
    composite_lora_adapters = []
    for entry in (source_loras or '').split(','):
        entry = entry.strip()
        if not entry:
            # Tolerate stray separators such as a trailing comma.
            continue
        if '=' in entry:
            alias, source = entry.split('=', 1)
        else:
            source = entry
            alias = entry.rsplit('/', 1)[-1]

        # Composite LoRA: source starts with @
        if source.startswith('@'):
            components = _parse_composite_lora(source)
            composite_lora_adapters.append({'alias': alias, 'components': components})
            print(f"Composite LoRA: {alias} -> {components}")
            continue

        repo, safetensors_file, alpha = _split_lora_source(source)
        lora_dir = os.path.join(target_path, 'loras', repo)
        if not os.path.isdir(lora_dir):
            print(f"Downloading LoRA adapter: {repo} to {lora_dir}")
            snapshot_download(repo_id=repo, local_dir=lora_dir)
        else:
            print(f"LoRA adapter directory already exists: {lora_dir}")

        if not safetensors_file:
            # No explicit file: the repo must contain exactly one candidate.
            st_files = [f for f in os.listdir(lora_dir) if f.endswith('.safetensors')]
            if not st_files:
                raise ValueError(f"No .safetensors files found in LoRA adapter: {repo}")
            if len(st_files) > 1:
                raise ValueError(f"Multiple .safetensors files in LoRA adapter: {repo}. Use @filename to specify.")
            safetensors_file = st_files[0]

        # Built with '/' regardless of platform; this path goes into the graph.
        lora_path = 'loras/' + repo + '/' + safetensors_file
        lora_adapters.append({'alias': alias, 'path': lora_path, 'alpha': alpha})
        # "is not None" so that an explicit alpha of 0.0 is still reported.
        print(f"LoRA adapter: {alias} -> {lora_path}" + (f" (alpha={alpha})" if alpha is not None else ""))

    task_parameters['lora_adapters'] = lora_adapters
    task_parameters['composite_lora_adapters'] = composite_lora_adapters

    # Explicit raise instead of assert: asserts are removed under "python -O".
    if num_streams < 0:
        raise ValueError("num_streams should be a non-negative integer")
    plugin_config = {}
    if num_streams > 0:
        plugin_config['NUM_STREAMS'] = num_streams
    if task_parameters.get('ov_cache_dir') is not None:
        plugin_config['CACHE_DIR'] = task_parameters['ov_cache_dir']
    if plugin_config:
        task_parameters['plugin_config_str'] = json.dumps(plugin_config)

    # Resolutions, when given, must be in WxH format.
    for param in ('max_resolution', 'default_resolution'):
        value = task_parameters.get(param)
        if value:
            task_parameters[param] = _normalize_resolution(param, value)

    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(image_generation_graph_template)
    graph_content = gtemplate.render(model_path=model_path, **task_parameters)
    graph_path = os.path.join(target_path, 'graph.pbtxt')
    with open(graph_path, 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(graph_path))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(target_path, os.path.dirname(config_file_path)))
# ---- Script body: validate arguments, then dispatch to the task exporter ----

if not os.path.isdir(args['model_repository_path']):
    raise ValueError(f"The model repository path '{args['model_repository_path']}' is not a valid directory.")

# model_name and source_model default to each other; at least one is required.
if args['source_model'] is None and args['model_name'] is None:
    raise ValueError("Either model_name or source_model should be provided")
if args['source_model'] is None:
    args['source_model'] = args['model_name']
elif args['model_name'] is None:
    args['model_name'] = args['source_model']

### Speculative decoding specific
# The draft model name and draft source default to each other in the same way.
if args['task'] == 'text_generation':
    if args['draft_source_model'] is None:
        args['draft_source_model'] = args['draft_model_name']
    elif args['draft_model_name'] is None:
        args['draft_model_name'] = args['draft_source_model']
###

if args['extra_quantization_params'] is None:
    args['extra_quantization_params'] = ""

# Everything except the bookkeeping arguments is forwarded to the graph template.
template_parameters = {
    key: value
    for key, value in args.items()
    if key not in (
        'model_repository_path',
        'source_model',
        'model_name',
        'precision',
        'version',
        'config_file_path',
        'overwrite_models',
    )
}
print("template params:", template_parameters)

if args['task'] == 'text_generation':
    export_text_generation_model(
        args['model_repository_path'],
        args['source_model'],
        args['model_name'],
        args['precision'],
        template_parameters,
        args['config_file_path'],
    )
elif args['task'] == 'embeddings_ov':
    export_embeddings_model_ov(
        args['model_repository_path'],
        args['source_model'],
        args['model_name'],
        args['precision'],
        template_parameters,
        args['config_file_path'],
        args['truncate'],
    )
elif args['task'] == 'rerank_ov':
    export_rerank_model_ov(
        args['model_repository_path'],
        args['source_model'],
        args['model_name'],
        args['precision'],
        template_parameters,
        args['config_file_path'],
        args['max_doc_length'],
    )
elif args['task'] == 'text2speech':
    export_text2speech_model(
        args['model_repository_path'],
        args['source_model'],
        args['model_name'],
        args['precision'],
        template_parameters,
        args['config_file_path'],
    )
elif args['task'] == 'speech2text':
    export_speech2text_model(
        args['model_repository_path'],
        args['source_model'],
        args['model_name'],
        args['precision'],
        template_parameters,
        args['config_file_path'],
    )
elif args['task'] == 'image_generation':
    # Image generation forwards an explicit allow-list of template parameters.
    template_parameters = {
        key: value
        for key, value in args.items()
        if key in (
            'ov_cache_dir',
            'target_device',
            'resolution',
            'num_images_per_prompt',
            'guidance_scale',
            'max_resolution',
            'default_resolution',
            'max_num_images_per_prompt',
            'default_num_inference_steps',
            'max_num_inference_steps',
            'extra_quantization_params',
        )
    }
    export_image_generation_model(
        args['model_repository_path'],
        args['source_model'],
        args['model_name'],
        args['precision'],
        template_parameters,
        args['config_file_path'],
        args['num_streams'],
        args['source_loras'],
    )