@@ -3013,15 +3013,15 @@ public final class ALMModelFactory: ModelFactory, @unchecked Sendable {
30133013 ) async throws -> ModelContext {
30143014 let context = try await LLMModelFactory . shared. _load ( configuration: configuration, tokenizerLoader: tokenizerLoader)
30153015
3016- let numAudioEmbeddings = OmniModelFactory . extractNumAudioEmbeddings ( configuration: configuration)
3016+ let tokens = OmniModelFactory . extractMultimodalTokens ( configuration: configuration)
30173017 let messageGenerator = DefaultMessageGenerator ( )
30183018 let processor = ALMUserInputProcessor (
30193019 tokenizer: context. tokenizer,
30203020 configuration: context. configuration,
30213021 messageGenerator: messageGenerator,
3022- boaToken: 255010 ,
3023- eoaToken: 255011 ,
3024- numAudioEmbeddings: numAudioEmbeddings
3022+ boaToken: tokens . boa ,
3023+ eoaToken: tokens . eoa ,
3024+ numAudioEmbeddings: tokens . numAudio
30253025 )
30263026
30273027 return . init(
@@ -3081,10 +3081,12 @@ public final class OmniModelFactory: ModelFactory, @unchecked Sendable {
30813081 tokenizerLoader: any TokenizerLoader
30823082 ) async throws -> ModelContext {
30833083 let vlmContext = try await VLMModelFactory . shared. _load ( configuration: configuration, tokenizerLoader: tokenizerLoader)
3084- let numAudioEmbeddings = OmniModelFactory . extractNumAudioEmbeddings ( configuration: configuration)
3084+ let tokens = OmniModelFactory . extractMultimodalTokens ( configuration: configuration)
30853085 let omniProcessor = OmniUserInputProcessor (
30863086 vlmProcessor: vlmContext. processor,
3087- numAudioEmbeddings: numAudioEmbeddings
3087+ boaToken: tokens. boa,
3088+ eoaToken: tokens. eoa,
3089+ numAudioEmbeddings: tokens. numAudio
30883090 )
30893091
30903092 return . init(
@@ -3095,19 +3097,35 @@ public final class OmniModelFactory: ModelFactory, @unchecked Sendable {
30953097 )
30963098 }
30973099
/// Returns the `numAudio` component of `extractMultimodalTokens(configuration:)`.
///
/// Deprecated forwarding shim: it performs no work of its own and simply
/// discards the other two tuple elements.
///
/// - Parameter configuration: Passed through unchanged to
///   `extractMultimodalTokens(configuration:)`.
/// - Returns: The first element (`numAudio`) of the extracted tuple.
@available(*, deprecated, message: "Use extractMultimodalTokens(configuration:).numAudio instead")
public static func extractNumAudioEmbeddings(configuration: ResolvedModelConfiguration) -> Int {
    let (audioEmbeddingCount, _, _) = extractMultimodalTokens(configuration: configuration)
    return audioEmbeddingCount
}
3104+
/// Reads the audio-related settings from the model's `config.json`.
///
/// The file is looked up inside `configuration.modelDirectory` and parsed as a
/// JSON object. Values are resolved as follows:
///
/// - `numAudio`: the first element of the top-level `subsampling_conv_channels`
///   key when it holds an integer array; otherwise
///   `audio_config.num_audio_embeddings`.
/// - `boa` / `eoa`: `boa_token_id` / `eoa_token_id`, read from the top level
///   first and from `audio_config` second.
///
/// The lookup is best-effort and never throws: a missing, unreadable, or
/// malformed file, or a missing / wrongly typed key, leaves the corresponding
/// value at its default (128, 255010 and 255011 respectively).
///
/// - Parameter configuration: Resolved configuration whose `modelDirectory`
///   is searched for `config.json`.
/// - Returns: A tuple holding the `numAudio` count and the `boa` and `eoa`
///   token ids.
public static func extractMultimodalTokens(configuration: ResolvedModelConfiguration) -> (numAudio: Int, boa: Int, eoa: Int) {
    // Fallbacks used whenever the file or an individual key is unavailable.
    let defaultNumAudio = 128
    let defaultBOA = 255010
    let defaultEOA = 255011

    let configurationURL = configuration.modelDirectory.appending(component: "config.json")
    guard let data = try? Data(contentsOf: configurationURL),
          let dict = try? JSONSerialization.jsonObject(with: data) as? [String: Any]
    else {
        return (defaultNumAudio, defaultBOA, defaultEOA)
    }

    // Cast the nested section once; it is consulted by all three lookups.
    let audioConfig = dict["audio_config"] as? [String: Any]

    /// Looks up an integer under `key`, preferring the top level over `audio_config`.
    func tokenID(_ key: String) -> Int? {
        (dict[key] as? Int) ?? (audioConfig?[key] as? Int)
    }

    let numAudio: Int
    if let subsampling = dict["subsampling_conv_channels"] as? [Int] {
        // An empty array still counts as a match: the default is used and
        // `audio_config` is deliberately not consulted.
        numAudio = subsampling.first ?? defaultNumAudio
    } else {
        numAudio = (audioConfig?["num_audio_embeddings"] as? Int) ?? defaultNumAudio
    }

    return (
        numAudio,
        tokenID("boa_token_id") ?? defaultBOA,
        tokenID("eoa_token_id") ?? defaultEOA
    )
}
31133131}
0 commit comments