diff --git a/internal/application/app.go b/internal/application/app.go
index 66eb11a7..2da0081d 100644
--- a/internal/application/app.go
+++ b/internal/application/app.go
@@ -153,26 +153,28 @@ func NewApplication(config conf.Configuration) (*Application, error) {
 		)
 	}
 
+	locationNames := make([]string, 0)
+	for _, loc := range config.Locations {
+		locationNames = append(locationNames, loc.Names...)
+	}
+
+	recognizerOpts := []recognizer.Option{recognizer.WithLocations(locationNames)}
+
 	log.Info().Msg("constructing speech-to-text recognizer")
 	var speechRecognizer recognizer.Recognizer
 	switch config.Recognizer {
 	case conf.WhisperLocal:
-		speechRecognizer = recognizer.NewWhisperRecognizer(config.WhisperModel, config.Callsign)
+		speechRecognizer = recognizer.NewWhisperRecognizer(config.WhisperModel, config.Callsign, recognizerOpts...)
 	case conf.WhisperAPI:
-		speechRecognizer = recognizer.NewWhisperAPIRecognizer(config.OpenAIAPIKey, config.Callsign)
+		speechRecognizer = recognizer.NewWhisperAPIRecognizer(config.OpenAIAPIKey, config.Callsign, recognizerOpts...)
 	case conf.GPT4o:
-		speechRecognizer = recognizer.NewGPT4oRecognizer(config.OpenAIAPIKey, config.Callsign)
+		speechRecognizer = recognizer.NewGPT4oRecognizer(config.OpenAIAPIKey, config.Callsign, recognizerOpts...)
 	case conf.GPT4oMini:
-		speechRecognizer = recognizer.NewGPT4oMiniRecognizer(config.OpenAIAPIKey, config.Callsign)
+		speechRecognizer = recognizer.NewGPT4oMiniRecognizer(config.OpenAIAPIKey, config.Callsign, recognizerOpts...)
 	default:
 		return nil, fmt.Errorf("failed to construct application: unrecognized recognizer %q", config.Recognizer)
 	}
 
-	locationNames := make([]string, 0)
-	for _, loc := range config.Locations {
-		locationNames = append(locationNames, loc.Names...)
-	}
-
 	log.Info().Msg("constructing request parser")
 	requestParser := parser.New(config.Callsign, locationNames, config.EnableTranscriptionLogging)
 
diff --git a/pkg/recognizer/openai.go b/pkg/recognizer/openai.go
index b4b71d1e..5746b6f4 100644
--- a/pkg/recognizer/openai.go
+++ b/pkg/recognizer/openai.go
@@ -18,39 +18,44 @@ type openAIRecognizer struct {
 	callsign string
 	client   *openai.Client
 	model    string
+	recognizerOptions
 }
 
 var _ Recognizer = &openAIRecognizer{}
 
-func newOpenAIRecognizer(apiKey, model, callsign string) Recognizer {
-	return &openAIRecognizer{
+func newOpenAIRecognizer(apiKey, model, callsign string, opts ...Option) Recognizer {
+	r := &openAIRecognizer{
 		callsign: callsign,
 		client: openai.NewClient(
 			option.WithAPIKey(apiKey),
 		),
 		model: model,
 	}
+	for _, opt := range opts {
+		opt(&r.recognizerOptions)
+	}
+	return r
 }
 
-func NewWhisperAPIRecognizer(apiKey, callsign string) Recognizer {
-	return newOpenAIRecognizer(apiKey, "whisper-1", callsign)
+func NewWhisperAPIRecognizer(apiKey, callsign string, opts ...Option) Recognizer {
+	return newOpenAIRecognizer(apiKey, "whisper-1", callsign, opts...)
 }
 
 // NewGPT4oRecognizer creates a new recognizer using OpenAI Platform's GPT-4o model.
-func NewGPT4oRecognizer(apiKey, callsign string) Recognizer {
-	return newOpenAIRecognizer(apiKey, "gpt-4o-transcribe", callsign)
+func NewGPT4oRecognizer(apiKey, callsign string, opts ...Option) Recognizer {
+	return newOpenAIRecognizer(apiKey, "gpt-4o-transcribe", callsign, opts...)
 }
 
 // NewGPT4oMiniRecognizer creates a new recognizer using OpenAI Platform's GPT-4o Mini model.
-func NewGPT4oMiniRecognizer(apiKey, callsign string) Recognizer {
-	return newOpenAIRecognizer(apiKey, "gpt-4o-mini-transcribe", callsign)
+func NewGPT4oMiniRecognizer(apiKey, callsign string, opts ...Option) Recognizer {
+	return newOpenAIRecognizer(apiKey, "gpt-4o-mini-transcribe", callsign, opts...)
 }
 
 // NewOpenAIRecognizer creates a new recognizer using OpenAI Platform.
 //
 // Deprecated: Use NewWhisperAPIRecognizer, NewGPT4oRecognizer, or NewGPT4oMiniRecognizer instead.
-func NewOpenAIRecognizer(apiKey, callsign string) Recognizer { // nolint: revive // Ignore deprecated function
-	return NewWhisperAPIRecognizer(apiKey, callsign)
+func NewOpenAIRecognizer(apiKey, callsign string, opts ...Option) Recognizer { // nolint: revive // Ignore deprecated function
+	return NewWhisperAPIRecognizer(apiKey, callsign, opts...)
 }
 
 // Recognize implements [Recognizer.Recognize] using OpenAI Platform's hosted GPT4 transcription model.
@@ -65,7 +70,7 @@ func (r *openAIRecognizer) Recognize(ctx context.Context, sample []float32, _ bo
 		File:     openai.FileParam(buf, "audio.wav", "audio/wav"),
 		Model:    openai.String(r.model),
 		Language: openai.String("en"),
-		Prompt:   openai.String(prompt(r.callsign)),
+		Prompt:   openai.String(prompt(r.callsign, r.locations)),
 	}
 
 	log.Info().Str("model", r.model).Msg("calling OpenAI Audio Transcriptions API")
diff --git a/pkg/recognizer/options.go b/pkg/recognizer/options.go
new file mode 100644
index 00000000..654585bb
--- /dev/null
+++ b/pkg/recognizer/options.go
@@ -0,0 +1,19 @@
+package recognizer
+
+// recognizerOptions holds the optional settings embedded in each recognizer implementation.
+type recognizerOptions struct {
+	// locations are the place names included in the transcription prompt.
+	locations []string
+}
+
+// Option configures a recognizer.
+type Option func(*recognizerOptions)
+
+// WithLocations adds location names to the recognizer's initial prompt,
+// improving transcription accuracy for place names. The names are copied,
+// so later changes to the caller's slice do not affect the recognizer.
+func WithLocations(locations []string) Option {
+	return func(o *recognizerOptions) {
+		o.locations = append(o.locations, locations...)
+	}
+}
diff --git a/pkg/recognizer/prompt.go b/pkg/recognizer/prompt.go
index 57fec9dc..45e4a849 100644
--- a/pkg/recognizer/prompt.go
+++ b/pkg/recognizer/prompt.go
@@ -1,8 +1,17 @@
 package recognizer
 
-import "fmt"
+import (
+	"fmt"
+	"strings"
+)
 
 // prompt constructs a prompt for OpenAI's audio transcription models. See https://platform.openai.com/docs/guides/speech-to-text#prompting
-func prompt(callsign string) string {
-	return fmt.Sprintf("Either ANYFACE or %s, PILOT CALLSIGN, DIGITS, one of 'RADIO' 'ALPHA' 'BOGEY' 'PICTURE' 'DECLARE' 'SNAPLOCK' 'SPIKED', ARGUMENTS such as BULLSEYE, BRAA, numbers or digits.", callsign)
+func prompt(callsign string, locations []string) string {
+	s := fmt.Sprintf("Either ANYFACE or %s, PILOT CALLSIGN, DIGITS, one of 'RADIO' 'ALPHA' 'BOGEY' 'PICTURE' 'DECLARE' 'SNAPLOCK' 'SPIKED', ARGUMENTS such as BULLSEYE, BRAA, numbers or digits.", callsign)
+	if len(locations) > 0 {
+		// Per the prompting guide linked above, whisper-1 only considers the final 224 tokens
+		// of the prompt, so locations go first: a long list must not push out the core vocabulary.
+		s = "Locations: " + strings.Join(locations, ", ") + ". " + s
+	}
+	return s
 }
diff --git a/pkg/recognizer/whisper.go b/pkg/recognizer/whisper.go
index 3ad908fc..37c385f1 100644
--- a/pkg/recognizer/whisper.go
+++ b/pkg/recognizer/whisper.go
@@ -14,16 +14,21 @@ import (
 type whisperRecognizer struct {
 	model    whisper.Model
 	callsign string
+	recognizerOptions
 }
 
 var _ Recognizer = &whisperRecognizer{}
 
 // NewWhisperRecognizer creates a new recognizer using OpenAI Whisper.
-func NewWhisperRecognizer(model *whisper.Model, callsign string) Recognizer {
-	return &whisperRecognizer{
+func NewWhisperRecognizer(model *whisper.Model, callsign string, opts ...Option) Recognizer {
+	r := &whisperRecognizer{
 		model:    *model,
 		callsign: callsign,
 	}
+	for _, opt := range opts {
+		opt(&r.recognizerOptions)
+	}
+	return r
 }
 
 const maxSize = 256 * 1024
@@ -40,7 +45,7 @@ func (r *whisperRecognizer) Recognize(ctx context.Context, sample []float32, ena
 		return "", fmt.Errorf("error creating whisper context: %w", err)
 	}
 
-	wCtx.SetInitialPrompt(prompt(r.callsign))
+	wCtx.SetInitialPrompt(prompt(r.callsign, r.locations))
 	if wCtx.IsMultilingual() {
 		_ = wCtx.SetLanguage("en")