88 "io"
99 "log/slog"
1010 "mime"
11+ "mime/multipart"
1112 "net/http"
1213 "net/http/httputil"
1314 "net/url"
@@ -19,6 +20,7 @@ const maxTTSRequestBody = 16 << 20
// Adapter identifiers. handleTTS compares the value returned by adapterFor
// against these to choose a backend-specific request path.
1920const (
// adapterLiteTTSHTTP: handleTTS hands matching requests to handleLiteTTS.
2021 adapterLiteTTSHTTP = "litetts_http"
// adapterMooERGRPC: not referenced in the visible part of this file;
// presumably selects a MooER gRPC backend — TODO confirm against its users.
2122 adapterMooERGRPC = "mooer_grpc"
// adapterVoxCPMClone: handleTTS hands matching requests that also carry
// reference audio to handleVoxCPMClone.
23+ adapterVoxCPMClone = "voxcpm_clone"
2224)
2325
2426// RegisterRoutes returns a function that registers AIMA inference proxy routes.
@@ -74,10 +76,15 @@ func (d *Deps) handleTTS(w http.ResponseWriter, r *http.Request) {
7476 return
7577 }
7678
77- if d .adapterFor (model , r .URL .Path ) == adapterLiteTTSHTTP {
79+ adapter := d .adapterFor (model , r .URL .Path )
80+ if adapter == adapterLiteTTSHTTP {
7881 d .handleLiteTTS (w , r , backend , raw )
7982 return
8083 }
84+ if adapter == adapterVoxCPMClone && hasTTSReferenceAudio (raw ) {
85+ d .handleVoxCPMClone (w , r , backend , raw , body )
86+ return
87+ }
8188
8289 switch r .URL .Path {
8390 case "/v1/tts" :
@@ -504,6 +511,122 @@ func (d *Deps) forwardTTSJSON(w http.ResponseWriter, r *http.Request, backend *B
504511 writeBackendResponse (w , resp , respBody )
505512}
506513
514+ func (d * Deps ) handleVoxCPMClone (w http.ResponseWriter , r * http.Request , backend * Backend , raw map [string ]any , requestBody []byte ) {
515+ body , contentType , err := buildVoxCPMCloneRequest (raw )
516+ if err != nil {
517+ http .Error (w , err .Error (), http .StatusBadRequest )
518+ return
519+ }
520+
521+ resp , respBody , err := d .callBackend (r , backend .Address , "/v1/clone" , contentType , body )
522+ if err != nil {
523+ slog .Warn ("aima proxy: VoxCPM clone backend request failed" , "backend" , backend .Address , "err" , err )
524+ http .Error (w , "backend unreachable" , http .StatusBadGateway )
525+ return
526+ }
527+
528+ if r .URL .Path == "/v1/tts" && resp .StatusCode >= 200 && resp .StatusCode < 300 && isAudioContent (resp .Header .Get ("Content-Type" )) {
529+ writeAudioJSON (w , respBody , requestBody , resp .Header .Get ("Content-Type" ), resp .StatusCode )
530+ return
531+ }
532+ if r .URL .Path == "/v1/audio/speech" && resp .StatusCode >= 200 && resp .StatusCode < 300 && writeAudioFromJSON (w , respBody , requestBody , resp .StatusCode ) {
533+ return
534+ }
535+ writeBackendResponse (w , resp , respBody )
536+ }
537+
538+ func buildVoxCPMCloneRequest (raw map [string ]any ) ([]byte , string , error ) {
539+ text := extractTTSText (raw )
540+ if text == "" {
541+ return nil , "" , fmt .Errorf (`{"error":"missing or invalid input field"}` )
542+ }
543+ refAudio := firstTTSString (raw , "reference_audio" , "ref_audio" )
544+ if refAudio == "" {
545+ return nil , "" , fmt .Errorf (`{"error":"missing or invalid reference_audio field"}` )
546+ }
547+ audio , filename , err := decodeReferenceAudio (refAudio )
548+ if err != nil {
549+ return nil , "" , err
550+ }
551+
552+ var body bytes.Buffer
553+ writer := multipart .NewWriter (& body )
554+ if err := writer .WriteField ("text" , text ); err != nil {
555+ return nil , "" , err
556+ }
557+ if refText := firstTTSString (raw , "reference_text" , "ref_text" ); refText != "" {
558+ if err := writer .WriteField ("ref_text" , refText ); err != nil {
559+ return nil , "" , err
560+ }
561+ }
562+ for _ , key := range []string {"response_format" , "temperature" , "cfg" , "max_length" } {
563+ if value , ok := raw [key ]; ok {
564+ if err := writer .WriteField (key , fmt .Sprint (value )); err != nil {
565+ return nil , "" , err
566+ }
567+ }
568+ }
569+ part , err := writer .CreateFormFile ("ref_audio" , filename )
570+ if err != nil {
571+ return nil , "" , err
572+ }
573+ if _ , err := part .Write (audio ); err != nil {
574+ return nil , "" , err
575+ }
576+ if err := writer .Close (); err != nil {
577+ return nil , "" , err
578+ }
579+ return body .Bytes (), writer .FormDataContentType (), nil
580+ }
581+
582+ func hasTTSReferenceAudio (raw map [string ]any ) bool {
583+ return firstTTSString (raw , "reference_audio" , "ref_audio" ) != ""
584+ }
585+
// firstTTSString walks keys in order and returns the first value in raw that
// is a string with content other than whitespace, trimmed of surrounding
// whitespace. Missing keys, non-string values and blank strings are passed
// over; "" is returned when nothing qualifies.
func firstTTSString(raw map[string]any, keys ...string) string {
	for i := range keys {
		text, isString := raw[keys[i]].(string)
		if !isString {
			continue
		}
		if trimmed := strings.TrimSpace(text); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
594+
595+ func decodeReferenceAudio (value string ) ([]byte , string , error ) {
596+ value = strings .TrimSpace (value )
597+ if strings .HasPrefix (strings .ToLower (value ), "data:" ) {
598+ return decodeReferenceAudioDataURL (value )
599+ }
600+ audio , err := base64 .StdEncoding .DecodeString (value )
601+ if err != nil {
602+ return nil , "" , fmt .Errorf (`{"error":"reference_audio must be a data URL or base64 audio"}` )
603+ }
604+ return audio , "reference.wav" , nil
605+ }
606+
607+ func decodeReferenceAudioDataURL (value string ) ([]byte , string , error ) {
608+ comma := strings .IndexByte (value , ',' )
609+ if comma < 0 {
610+ return nil , "" , fmt .Errorf (`{"error":"invalid reference_audio data URL"}` )
611+ }
612+ meta := value [len ("data:" ):comma ]
613+ payload := value [comma + 1 :]
614+ if ! strings .Contains (strings .ToLower (meta ), ";base64" ) {
615+ return nil , "" , fmt .Errorf (`{"error":"reference_audio data URL must be base64 encoded"}` )
616+ }
617+ audio , err := base64 .StdEncoding .DecodeString (payload )
618+ if err != nil {
619+ return nil , "" , fmt .Errorf (`{"error":"invalid reference_audio base64 data"}` )
620+ }
621+
622+ contentType := strings .TrimSpace (strings .Split (meta , ";" )[0 ])
623+ format := audioFormatFromContentType (contentType )
624+ if format == "" {
625+ format = "wav"
626+ }
627+ return audio , "reference." + format , nil
628+ }
629+
507630func (d * Deps ) callBackend (r * http.Request , targetAddr , targetPath , contentType string , body []byte ) (* http.Response , []byte , error ) {
508631 if ! strings .HasPrefix (targetAddr , "http://" ) && ! strings .HasPrefix (targetAddr , "https://" ) {
509632 targetAddr = "http://" + targetAddr
0 commit comments