From 54e810889c3e8b19005b6d0b1ab599bde089c1ba Mon Sep 17 00:00:00 2001 From: Dharma Bellamkonda Date: Mon, 18 May 2026 22:21:39 -0600 Subject: [PATCH] Pass location names to recognizer initial prompt Adds a WithLocations functional option to NewWhisperRecognizer and the OpenAI recognizers. When set, location names are appended to the initial prompt so Whisper biases transcription towards known place names (e.g. "Incirlik" instead of "INSULIC"). The application wires this up automatically from config.Locations. Co-Authored-By: Claude Opus 4.7 --- internal/application/app.go | 20 +++++++++++--------- pkg/recognizer/openai.go | 27 ++++++++++++++++----------- pkg/recognizer/options.go | 16 ++++++++++++++++ pkg/recognizer/prompt.go | 13 ++++++++++--- pkg/recognizer/whisper.go | 11 ++++++++--- 5 files changed, 61 insertions(+), 26 deletions(-) create mode 100644 pkg/recognizer/options.go diff --git a/internal/application/app.go b/internal/application/app.go index 66eb11a7..2da0081d 100644 --- a/internal/application/app.go +++ b/internal/application/app.go @@ -153,26 +153,28 @@ func NewApplication(config conf.Configuration) (*Application, error) { ) } + locationNames := make([]string, 0) + for _, loc := range config.Locations { + locationNames = append(locationNames, loc.Names...) + } + + recognizerOpts := []recognizer.Option{recognizer.WithLocations(locationNames)} + log.Info().Msg("constructing speech-to-text recognizer") var speechRecognizer recognizer.Recognizer switch config.Recognizer { case conf.WhisperLocal: - speechRecognizer = recognizer.NewWhisperRecognizer(config.WhisperModel, config.Callsign) + speechRecognizer = recognizer.NewWhisperRecognizer(config.WhisperModel, config.Callsign, recognizerOpts...) case conf.WhisperAPI: - speechRecognizer = recognizer.NewWhisperAPIRecognizer(config.OpenAIAPIKey, config.Callsign) + speechRecognizer = recognizer.NewWhisperAPIRecognizer(config.OpenAIAPIKey, config.Callsign, recognizerOpts...) case conf.GPT4o: - speechRecognizer = recognizer.NewGPT4oRecognizer(config.OpenAIAPIKey, config.Callsign) + speechRecognizer = recognizer.NewGPT4oRecognizer(config.OpenAIAPIKey, config.Callsign, recognizerOpts...) case conf.GPT4oMini: - speechRecognizer = recognizer.NewGPT4oMiniRecognizer(config.OpenAIAPIKey, config.Callsign) + speechRecognizer = recognizer.NewGPT4oMiniRecognizer(config.OpenAIAPIKey, config.Callsign, recognizerOpts...) default: return nil, fmt.Errorf("failed to construct application: unrecognized recognizer %q", config.Recognizer) } - locationNames := make([]string, 0) - for _, loc := range config.Locations { - locationNames = append(locationNames, loc.Names...) - } - log.Info().Msg("constructing request parser") requestParser := parser.New(config.Callsign, locationNames, config.EnableTranscriptionLogging) diff --git a/pkg/recognizer/openai.go b/pkg/recognizer/openai.go index b4b71d1e..5746b6f4 100644 --- a/pkg/recognizer/openai.go +++ b/pkg/recognizer/openai.go @@ -18,39 +18,44 @@ type openAIRecognizer struct { callsign string client *openai.Client model string + recognizerOptions } var _ Recognizer = &openAIRecognizer{} -func newOpenAIRecognizer(apiKey, model, callsign string) Recognizer { - return &openAIRecognizer{ +func newOpenAIRecognizer(apiKey, model, callsign string, opts ...Option) Recognizer { + r := &openAIRecognizer{ callsign: callsign, client: openai.NewClient( option.WithAPIKey(apiKey), ), model: model, } + for _, opt := range opts { + opt(&r.recognizerOptions) + } + return r } -func NewWhisperAPIRecognizer(apiKey, callsign string) Recognizer { - return newOpenAIRecognizer(apiKey, "whisper-1", callsign) +func NewWhisperAPIRecognizer(apiKey, callsign string, opts ...Option) Recognizer { + return newOpenAIRecognizer(apiKey, "whisper-1", callsign, opts...) } // NewGPT4oRecognizer creates a new recognizer using OpenAI Platform's GPT-4o model. -func NewGPT4oRecognizer(apiKey, callsign string) Recognizer { - return newOpenAIRecognizer(apiKey, "gpt-4o-transcribe", callsign) +func NewGPT4oRecognizer(apiKey, callsign string, opts ...Option) Recognizer { + return newOpenAIRecognizer(apiKey, "gpt-4o-transcribe", callsign, opts...) } // NewGPT4oMiniRecognizer creates a new recognizer using OpenAI Platform's GPT-4o Mini model. -func NewGPT4oMiniRecognizer(apiKey, callsign string) Recognizer { - return newOpenAIRecognizer(apiKey, "gpt-4o-mini-transcribe", callsign) +func NewGPT4oMiniRecognizer(apiKey, callsign string, opts ...Option) Recognizer { + return newOpenAIRecognizer(apiKey, "gpt-4o-mini-transcribe", callsign, opts...) } // NewOpenAIRecognizer creates a new recognizer using OpenAI Platform. // // Deprecated: Use NewWhisperAPIRecognizer, NewGPT4oRecognizer, or NewGPT4oMiniRecognizer instead. -func NewOpenAIRecognizer(apiKey, callsign string) Recognizer { // nolint: revive // Ignore deprecated function - return NewWhisperAPIRecognizer(apiKey, callsign) +func NewOpenAIRecognizer(apiKey, callsign string, opts ...Option) Recognizer { // nolint: revive // Ignore deprecated function + return NewWhisperAPIRecognizer(apiKey, callsign, opts...) } // Recognize implements [Recognizer.Recognize] using OpenAI Platform's hosted GPT4 transcription model. @@ -65,7 +70,7 @@ func (r *openAIRecognizer) Recognize(ctx context.Context, sample []float32, _ bo File: openai.FileParam(buf, "audio.wav", "audio/wav"), Model: openai.String(r.model), Language: openai.String("en"), - Prompt: openai.String(prompt(r.callsign)), + Prompt: openai.String(prompt(r.callsign, r.locations)), } log.Info().Str("model", r.model).Msg("calling OpenAI Audio Transcriptions API") diff --git a/pkg/recognizer/options.go b/pkg/recognizer/options.go new file mode 100644 index 00000000..654585bb --- /dev/null +++ b/pkg/recognizer/options.go @@ -0,0 +1,16 @@ +package recognizer + +type recognizerOptions struct { + locations []string +} + +// Option configures a recognizer. +type Option func(*recognizerOptions) + +// WithLocations adds location names to the recognizer's initial prompt, +// improving transcription accuracy for place names. +func WithLocations(locations []string) Option { + return func(o *recognizerOptions) { + o.locations = locations + } +} diff --git a/pkg/recognizer/prompt.go b/pkg/recognizer/prompt.go index 57fec9dc..45e4a849 100644 --- a/pkg/recognizer/prompt.go +++ b/pkg/recognizer/prompt.go @@ -1,8 +1,15 @@ package recognizer -import "fmt" +import ( + "fmt" + "strings" +) // prompt constructs a prompt for OpenAI's audio transcription models. See https://platform.openai.com/docs/guides/speech-to-text#prompting -func prompt(callsign string) string { - return fmt.Sprintf("Either ANYFACE or %s, PILOT CALLSIGN, DIGITS, one of 'RADIO' 'ALPHA' 'BOGEY' 'PICTURE' 'DECLARE' 'SNAPLOCK' 'SPIKED', ARGUMENTS such as BULLSEYE, BRAA, numbers or digits.", callsign) +func prompt(callsign string, locations []string) string { + s := fmt.Sprintf("Either ANYFACE or %s, PILOT CALLSIGN, DIGITS, one of 'RADIO' 'ALPHA' 'BOGEY' 'PICTURE' 'DECLARE' 'SNAPLOCK' 'SPIKED', ARGUMENTS such as BULLSEYE, BRAA, numbers or digits.", callsign) + if len(locations) > 0 { + s += " Locations: " + strings.Join(locations, ", ") + "." + } + return s } diff --git a/pkg/recognizer/whisper.go b/pkg/recognizer/whisper.go index 3ad908fc..37c385f1 100644 --- a/pkg/recognizer/whisper.go +++ b/pkg/recognizer/whisper.go @@ -14,16 +14,21 @@ import ( type whisperRecognizer struct { model whisper.Model callsign string + recognizerOptions } var _ Recognizer = &whisperRecognizer{} // NewWhisperRecognizer creates a new recognizer using OpenAI Whisper. -func NewWhisperRecognizer(model *whisper.Model, callsign string) Recognizer { - return &whisperRecognizer{ +func NewWhisperRecognizer(model *whisper.Model, callsign string, opts ...Option) Recognizer { + r := &whisperRecognizer{ model: *model, callsign: callsign, } + for _, opt := range opts { + opt(&r.recognizerOptions) + } + return r } const maxSize = 256 * 1024 @@ -40,7 +45,7 @@ func (r *whisperRecognizer) Recognize(ctx context.Context, sample []float32, ena return "", fmt.Errorf("error creating whisper context: %w", err) } - wCtx.SetInitialPrompt(prompt(r.callsign)) + wCtx.SetInitialPrompt(prompt(r.callsign, r.locations)) if wCtx.IsMultilingual() { _ = wCtx.SetLanguage("en")