Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions internal/application/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,26 +153,28 @@ func NewApplication(config conf.Configuration) (*Application, error) {
)
}

locationNames := make([]string, 0)
for _, loc := range config.Locations {
locationNames = append(locationNames, loc.Names...)
}

recognizerOpts := []recognizer.Option{recognizer.WithLocations(locationNames)}

log.Info().Msg("constructing speech-to-text recognizer")
var speechRecognizer recognizer.Recognizer
switch config.Recognizer {
case conf.WhisperLocal:
speechRecognizer = recognizer.NewWhisperRecognizer(config.WhisperModel, config.Callsign)
speechRecognizer = recognizer.NewWhisperRecognizer(config.WhisperModel, config.Callsign, recognizerOpts...)
case conf.WhisperAPI:
speechRecognizer = recognizer.NewWhisperAPIRecognizer(config.OpenAIAPIKey, config.Callsign)
speechRecognizer = recognizer.NewWhisperAPIRecognizer(config.OpenAIAPIKey, config.Callsign, recognizerOpts...)
case conf.GPT4o:
speechRecognizer = recognizer.NewGPT4oRecognizer(config.OpenAIAPIKey, config.Callsign)
speechRecognizer = recognizer.NewGPT4oRecognizer(config.OpenAIAPIKey, config.Callsign, recognizerOpts...)
case conf.GPT4oMini:
speechRecognizer = recognizer.NewGPT4oMiniRecognizer(config.OpenAIAPIKey, config.Callsign)
speechRecognizer = recognizer.NewGPT4oMiniRecognizer(config.OpenAIAPIKey, config.Callsign, recognizerOpts...)
default:
return nil, fmt.Errorf("failed to construct application: unrecognized recognizer %q", config.Recognizer)
}

locationNames := make([]string, 0)
for _, loc := range config.Locations {
locationNames = append(locationNames, loc.Names...)
}

log.Info().Msg("constructing request parser")
requestParser := parser.New(config.Callsign, locationNames, config.EnableTranscriptionLogging)

Expand Down
27 changes: 16 additions & 11 deletions pkg/recognizer/openai.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,39 +18,44 @@ type openAIRecognizer struct {
callsign string
client *openai.Client
model string
recognizerOptions
}

var _ Recognizer = &openAIRecognizer{}

func newOpenAIRecognizer(apiKey, model, callsign string) Recognizer {
return &openAIRecognizer{
// newOpenAIRecognizer builds an openAIRecognizer for the given model name,
// using a client authenticated with apiKey, and then applies each option in
// order to the recognizer's embedded options.
func newOpenAIRecognizer(apiKey, model, callsign string, opts ...Option) Recognizer {
	client := openai.NewClient(option.WithAPIKey(apiKey))
	rec := openAIRecognizer{
		callsign: callsign,
		client:   client,
		model:    model,
	}
	// Options are applied after the required fields are set; later options
	// overwrite whatever earlier ones stored.
	for i := range opts {
		opts[i](&rec.recognizerOptions)
	}
	return &rec
}

func NewWhisperAPIRecognizer(apiKey, callsign string) Recognizer {
return newOpenAIRecognizer(apiKey, "whisper-1", callsign)
// NewWhisperAPIRecognizer creates a new recognizer that requests the
// "whisper-1" model. Any options are forwarded unchanged.
func NewWhisperAPIRecognizer(apiKey, callsign string, opts ...Option) Recognizer {
	const model = "whisper-1"
	return newOpenAIRecognizer(apiKey, model, callsign, opts...)
}

// NewGPT4oRecognizer creates a new recognizer using OpenAI Platform's GPT-4o model.
func NewGPT4oRecognizer(apiKey, callsign string) Recognizer {
return newOpenAIRecognizer(apiKey, "gpt-4o-transcribe", callsign)
func NewGPT4oRecognizer(apiKey, callsign string, opts ...Option) Recognizer {
	// Model identifier passed through to the shared OpenAI constructor.
	const model = "gpt-4o-transcribe"
	return newOpenAIRecognizer(apiKey, model, callsign, opts...)
}

// NewGPT4oMiniRecognizer creates a new recognizer using OpenAI Platform's GPT-4o Mini model.
func NewGPT4oMiniRecognizer(apiKey, callsign string) Recognizer {
return newOpenAIRecognizer(apiKey, "gpt-4o-mini-transcribe", callsign)
func NewGPT4oMiniRecognizer(apiKey, callsign string, opts ...Option) Recognizer {
	// Model identifier passed through to the shared OpenAI constructor.
	const model = "gpt-4o-mini-transcribe"
	return newOpenAIRecognizer(apiKey, model, callsign, opts...)
}

// NewOpenAIRecognizer creates a new recognizer using OpenAI Platform.
//
// Deprecated: Use NewWhisperAPIRecognizer, NewGPT4oRecognizer, or NewGPT4oMiniRecognizer instead.
func NewOpenAIRecognizer(apiKey, callsign string) Recognizer { // nolint: revive // Ignore deprecated function
return NewWhisperAPIRecognizer(apiKey, callsign)
func NewOpenAIRecognizer(apiKey, callsign string, opts ...Option) Recognizer { // nolint: revive // Ignore deprecated function
	// Thin alias: delegates to NewWhisperAPIRecognizer with the same arguments.
	rec := NewWhisperAPIRecognizer(apiKey, callsign, opts...)
	return rec
}

// Recognize implements [Recognizer.Recognize] using OpenAI Platform's hosted GPT4 transcription model.
Expand All @@ -65,7 +70,7 @@ func (r *openAIRecognizer) Recognize(ctx context.Context, sample []float32, _ bo
File: openai.FileParam(buf, "audio.wav", "audio/wav"),
Model: openai.String(r.model),
Language: openai.String("en"),
Prompt: openai.String(prompt(r.callsign)),
Prompt: openai.String(prompt(r.callsign, r.locations)),
}

log.Info().Str("model", r.model).Msg("calling OpenAI Audio Transcriptions API")
Expand Down
16 changes: 16 additions & 0 deletions pkg/recognizer/options.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package recognizer

// recognizerOptions holds the optional settings shared by recognizer
// implementations. It is populated by applying Option values.
type recognizerOptions struct {
	// locations are the place names appended to the transcription prompt.
	locations []string
}

// Option configures a recognizer.
type Option func(*recognizerOptions)

// WithLocations sets the location names to include in the recognizer's
// initial prompt, intended to improve transcription accuracy for place names.
//
// The option replaces any previously configured locations rather than
// appending to them. The slice is copied when WithLocations is called, so
// later changes the caller makes to locations do not affect the recognizer.
func WithLocations(locations []string) Option {
	// Defensive copy: slices share their backing array, and the caller may
	// keep using (and mutating) the one it passed in.
	names := append([]string(nil), locations...)
	return func(o *recognizerOptions) {
		o.locations = names
	}
}
13 changes: 10 additions & 3 deletions pkg/recognizer/prompt.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
package recognizer

import "fmt"
import (
"fmt"
"strings"
)

// prompt constructs a prompt for OpenAI's audio transcription models. See https://platform.openai.com/docs/guides/speech-to-text#prompting
func prompt(callsign string) string {
return fmt.Sprintf("Either ANYFACE or %s, PILOT CALLSIGN, DIGITS, one of 'RADIO' 'ALPHA' 'BOGEY' 'PICTURE' 'DECLARE' 'SNAPLOCK' 'SPIKED', ARGUMENTS such as BULLSEYE, BRAA, numbers or digits.", callsign)
func prompt(callsign string, locations []string) string {
	// Fixed vocabulary hint; the callsign is the only interpolated value.
	base := fmt.Sprintf("Either ANYFACE or %s, PILOT CALLSIGN, DIGITS, one of 'RADIO' 'ALPHA' 'BOGEY' 'PICTURE' 'DECLARE' 'SNAPLOCK' 'SPIKED', ARGUMENTS such as BULLSEYE, BRAA, numbers or digits.", callsign)
	if len(locations) == 0 {
		// No location names configured: the base prompt is returned as-is.
		return base
	}
	// Append the location names as a comma-separated sentence.
	return base + " Locations: " + strings.Join(locations, ", ") + "."
}
11 changes: 8 additions & 3 deletions pkg/recognizer/whisper.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,21 @@ import (
// whisperRecognizer is the Recognizer implementation constructed by
// NewWhisperRecognizer.
type whisperRecognizer struct {
// model is the whisper model value; the constructor stores a copy of the
// value its *whisper.Model argument points to.
model whisper.Model
// callsign is passed to prompt when building the initial prompt.
callsign string
// recognizerOptions is embedded so optional settings (such as locations)
// are reachable directly on the recognizer.
recognizerOptions
}

var _ Recognizer = &whisperRecognizer{}

// NewWhisperRecognizer creates a new recognizer using OpenAI Whisper.
func NewWhisperRecognizer(model *whisper.Model, callsign string) Recognizer {
return &whisperRecognizer{
func NewWhisperRecognizer(model *whisper.Model, callsign string, opts ...Option) Recognizer {
	rec := new(whisperRecognizer)
	// The model is stored by value, so the pointer is dereferenced here.
	rec.model = *model
	rec.callsign = callsign
	// Apply options in the order given; each one writes into the embedded
	// recognizerOptions.
	for i := range opts {
		opts[i](&rec.recognizerOptions)
	}
	return rec
}

const maxSize = 256 * 1024
Expand All @@ -40,7 +45,7 @@ func (r *whisperRecognizer) Recognize(ctx context.Context, sample []float32, ena
return "", fmt.Errorf("error creating whisper context: %w", err)
}

wCtx.SetInitialPrompt(prompt(r.callsign))
wCtx.SetInitialPrompt(prompt(r.callsign, r.locations))

if wCtx.IsMultilingual() {
_ = wCtx.SetLanguage("en")
Expand Down
Loading