Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion Libraries/MLXLMCommon/LanguageModel.swift
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ public struct LMInput {
public let text: Text
public let image: ProcessedImage?
public let video: ProcessedVideo?
public let audio: ProcessedAudio?

/// Representation of tokenized input text.
public struct Text {
Expand Down Expand Up @@ -120,17 +121,33 @@ public struct LMInput {
}
}

/// Representation of prepared input audio.
///
/// Carried by `LMInput` next to the text, image and video payloads. The type only
/// stores what it is given; nothing here validates the feature shape or the mask.
public struct ProcessedAudio {

/// Mel spectrogram features, shape [batch, frames, melBins] or [frames, melBins].
public let features: MLXArray
/// Optional attention mask indicating padding frames (True = padding).
// NOTE(review): the mask's shape relative to `features` is not shown here — confirm with the producer.
public let mask: MLXArray?

/// Creates a processed-audio value from precomputed features.
///
/// - Parameters:
///   - features: The audio features; stored unchanged.
///   - mask: Optional padding mask; defaults to `nil` when omitted.
public init(features: MLXArray, mask: MLXArray? = nil) {
self.features = features
self.mask = mask
}
}

/// Creates a text-only input from tokens and an optional mask.
///
/// Wraps the arguments in a `Text` value and forwards to the initializer that
/// takes `text:`; the media arguments are left at that initializer's defaults.
///
/// - Parameters:
///   - tokens: The tokenized text.
///   - mask: Optional mask passed through to `Text`; defaults to `nil`.
public init(tokens: MLXArray, mask: MLXArray? = nil) {
self.init(text: .init(tokens: tokens, mask: mask))
}

/// Creates an input from tokenized text and optional processed media.
///
/// Each argument is stored unchanged in the property of the same name.
///
/// - Parameters:
///   - text: The tokenized text.
///   - image: Optional processed image; defaults to `nil`.
///   - video: Optional processed video; defaults to `nil`.
///   - audio: Optional processed audio; defaults to `nil`.
///
// NOTE(review): `video:` appears twice in the parameter list below. These look like
// the removed and added lines of the diff hunk shown side by side; presumably only
// the second one (with the trailing comma) exists in the resulting file — confirm
// against the actual source before copying this signature.
public init(
text: LMInput.Text, image: LMInput.ProcessedImage? = nil,
video: LMInput.ProcessedVideo? = nil
video: LMInput.ProcessedVideo? = nil,
audio: LMInput.ProcessedAudio? = nil
) {
self.text = text
self.image = image
self.video = video
self.audio = audio
}
}

Expand Down
3 changes: 3 additions & 0 deletions Libraries/MLXLMCommon/UserInput.swift
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,9 @@ public struct UserInput {
/// collect the videos from the chat messages, otherwise these are the stored videos with the ``UserInput``.
public var videos = [Video]()

/// Audio inputs as raw PCM float arrays (16 kHz mono expected by Gemma 4).
public var audios = [[Float]]()
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please introduce an Audio type modeled after the Image/Video types.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, this needs to be represented in ChatSession.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The input format is specific to Gemma 4 today, but, as with images, it should be up to the input processor to convert the audio to the desired format.


public var tools: [ToolSpec]?

/// Additional values provided for the chat template rendering context
Expand Down
Loading
Loading