mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-27 07:16:19 +09:00
fix: separate audio domain and restore transcription templates
Move speech and transcription internals into the audio domain, restore template-driven transcription providers, and regenerate Swagger/SDK so the frontend can stop hand-calling /transcription-* APIs.
This commit is contained in:
@@ -72,8 +72,7 @@ func TestSpawnAndNotify(t *testing.T) {
|
||||
task := mgr.Get(taskID)
|
||||
if task == nil {
|
||||
t.Fatal("task not found after completion")
|
||||
}
|
||||
if task.Status != TaskCompleted {
|
||||
} else if task.Status != TaskCompleted {
|
||||
t.Errorf("expected task status completed, got %s", task.Status)
|
||||
}
|
||||
}
|
||||
@@ -130,8 +129,7 @@ func TestKillTask(t *testing.T) {
|
||||
task := mgr.Get(taskID)
|
||||
if task == nil {
|
||||
t.Fatal("task not found")
|
||||
}
|
||||
if task.Status != TaskKilled {
|
||||
} else if task.Status != TaskKilled {
|
||||
t.Errorf("expected status killed, got %s", task.Status)
|
||||
}
|
||||
|
||||
|
||||
@@ -16,9 +16,9 @@ import (
|
||||
|
||||
sdk "github.com/memohai/twilight-ai/sdk"
|
||||
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/media"
|
||||
"github.com/memohai/memoh/internal/settings"
|
||||
ttspkg "github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
const mediaDataPrefix = "/data/media/"
|
||||
@@ -26,19 +26,19 @@ const mediaDataPrefix = "/data/media/"
|
||||
type TranscriptionProvider struct {
|
||||
logger *slog.Logger
|
||||
settings *settings.Service
|
||||
tts *ttspkg.Service
|
||||
audio *audiopkg.Service
|
||||
media *media.Service
|
||||
http *http.Client
|
||||
}
|
||||
|
||||
func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
|
||||
func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
|
||||
if log == nil {
|
||||
log = slog.Default()
|
||||
}
|
||||
return &TranscriptionProvider{
|
||||
logger: log.With(slog.String("tool", "transcribe_audio")),
|
||||
settings: settingsSvc,
|
||||
tts: ttsSvc,
|
||||
audio: audioSvc,
|
||||
media: mediaSvc,
|
||||
http: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
@@ -56,7 +56,7 @@ func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, t
|
||||
}
|
||||
|
||||
func (p *TranscriptionProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
|
||||
if session.IsSubagent || p.settings == nil || p.tts == nil || p.media == nil {
|
||||
if session.IsSubagent || p.settings == nil || p.audio == nil || p.media == nil {
|
||||
return nil, nil
|
||||
}
|
||||
botID := strings.TrimSpace(session.BotID)
|
||||
@@ -120,7 +120,7 @@ func (p *TranscriptionProvider) execTranscribe(ctx context.Context, session Sess
|
||||
if prompt := FirstStringArg(args, "prompt"); prompt != "" {
|
||||
override["prompt"] = prompt
|
||||
}
|
||||
result, err := p.tts.Transcribe(ctx, modelID, audio, filename, contentType, override)
|
||||
result, err := p.audio.Transcribe(ctx, modelID, audio, filename, contentType, override)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -10,9 +10,9 @@ import (
|
||||
|
||||
sdk "github.com/memohai/twilight-ai/sdk"
|
||||
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/channel"
|
||||
"github.com/memohai/memoh/internal/settings"
|
||||
ttspkg "github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
const ttsMaxTextLen = 500
|
||||
@@ -30,26 +30,26 @@ type TTSChannelResolver interface {
|
||||
type TTSProvider struct {
|
||||
logger *slog.Logger
|
||||
settings *settings.Service
|
||||
tts *ttspkg.Service
|
||||
audio *audiopkg.Service
|
||||
sender TTSSender
|
||||
resolver TTSChannelResolver
|
||||
}
|
||||
|
||||
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
|
||||
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
|
||||
if log == nil {
|
||||
log = slog.Default()
|
||||
}
|
||||
return &TTSProvider{
|
||||
logger: log.With(slog.String("tool", "tts")),
|
||||
settings: settingsSvc,
|
||||
tts: ttsSvc,
|
||||
audio: audioSvc,
|
||||
sender: sender,
|
||||
resolver: resolver,
|
||||
}
|
||||
}
|
||||
|
||||
func (p *TTSProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
|
||||
if session.IsSubagent || p.settings == nil || p.tts == nil || p.sender == nil || p.resolver == nil {
|
||||
if session.IsSubagent || p.settings == nil || p.audio == nil || p.sender == nil || p.resolver == nil {
|
||||
return nil, nil
|
||||
}
|
||||
botID := strings.TrimSpace(session.BotID)
|
||||
@@ -115,7 +115,7 @@ func (p *TTSProvider) execSpeak(ctx context.Context, session SessionContext, arg
|
||||
if botSettings.TtsModelID == "" {
|
||||
return nil, errors.New("bot has no TTS model configured")
|
||||
}
|
||||
audioData, contentType, synthErr := p.tts.Synthesize(ctx, botSettings.TtsModelID, text, nil)
|
||||
audioData, contentType, synthErr := p.audio.Synthesize(ctx, botSettings.TtsModelID, text, nil)
|
||||
if synthErr != nil {
|
||||
return nil, fmt.Errorf("speech synthesis failed: %s", synthErr.Error())
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package tts
|
||||
package audio
|
||||
|
||||
import "context"
|
||||
|
||||
@@ -6,10 +6,10 @@ import (
|
||||
"log/slog"
|
||||
"strings"
|
||||
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
)
|
||||
|
||||
const TtsTypeEdge tts.TtsType = "edge"
|
||||
const TtsTypeEdge audio.TtsType = "edge"
|
||||
|
||||
const edgeModelReadAloud = "edge-read-aloud"
|
||||
|
||||
@@ -33,12 +33,12 @@ func NewEdgeAdapterWithClient(log *slog.Logger, client *EdgeWsClient) *EdgeAdapt
|
||||
}
|
||||
}
|
||||
|
||||
func (*EdgeAdapter) Type() tts.TtsType {
|
||||
func (*EdgeAdapter) Type() audio.TtsType {
|
||||
return TtsTypeEdge
|
||||
}
|
||||
|
||||
func (*EdgeAdapter) Meta() tts.TtsMeta {
|
||||
return tts.TtsMeta{
|
||||
func (*EdgeAdapter) Meta() audio.TtsMeta {
|
||||
return audio.TtsMeta{
|
||||
Provider: "Microsoft Edge",
|
||||
Description: "Microsoft Edge TTS",
|
||||
}
|
||||
@@ -54,32 +54,32 @@ var edgeFormats = []string{
|
||||
"webm-24khz-16bit-mono-opus",
|
||||
}
|
||||
|
||||
var edgeSpeedConstraint = &tts.ParamConstraint{
|
||||
var edgeSpeedConstraint = &audio.ParamConstraint{
|
||||
Options: []float64{0.5, 1.0, 2.0, 3.0},
|
||||
Default: 1.0,
|
||||
}
|
||||
|
||||
var edgePitchConstraint = &tts.ParamConstraint{
|
||||
var edgePitchConstraint = &audio.ParamConstraint{
|
||||
Min: -100,
|
||||
Max: 100,
|
||||
Default: 0,
|
||||
}
|
||||
|
||||
func (*EdgeAdapter) Models() []tts.ModelInfo {
|
||||
var voices []tts.VoiceInfo
|
||||
func (*EdgeAdapter) Models() []audio.ModelInfo {
|
||||
var voices []audio.VoiceInfo
|
||||
for lang, ids := range EdgeTTSVoices {
|
||||
for _, id := range ids {
|
||||
name := strings.TrimPrefix(id, lang+"-")
|
||||
name = strings.TrimSuffix(name, "Neural")
|
||||
voices = append(voices, tts.VoiceInfo{ID: id, Lang: lang, Name: name})
|
||||
voices = append(voices, audio.VoiceInfo{ID: id, Lang: lang, Name: name})
|
||||
}
|
||||
}
|
||||
return []tts.ModelInfo{
|
||||
return []audio.ModelInfo{
|
||||
{
|
||||
ID: edgeModelReadAloud,
|
||||
Name: "Edge Read Aloud",
|
||||
Description: "Built-in Edge Read Aloud speech model",
|
||||
Capabilities: tts.ModelCapabilities{
|
||||
Capabilities: audio.ModelCapabilities{
|
||||
Voices: voices,
|
||||
Formats: edgeFormats,
|
||||
Speed: edgeSpeedConstraint,
|
||||
@@ -100,14 +100,14 @@ func (*EdgeAdapter) ResolveModel(model string) (string, error) {
|
||||
return edgeModelReadAloud, nil
|
||||
}
|
||||
|
||||
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config tts.AudioConfig) ([]byte, error) {
|
||||
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config audio.AudioConfig) ([]byte, error) {
|
||||
if err := config.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("edge tts: invalid config: %w", err)
|
||||
}
|
||||
return a.client.Synthesize(ctx, text, config)
|
||||
}
|
||||
|
||||
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config tts.AudioConfig) (chan []byte, chan error) {
|
||||
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config audio.AudioConfig) (chan []byte, chan error) {
|
||||
if err := config.Validate(); err != nil {
|
||||
errCh := make(chan error, 1)
|
||||
errCh <- fmt.Errorf("edge tts: invalid config: %w", err)
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
)
|
||||
|
||||
func TestEdgeAdapter_TypeAndMeta(t *testing.T) {
|
||||
@@ -37,7 +37,7 @@ func TestEdgeAdapter_Synthesize_WithMockServer(t *testing.T) {
|
||||
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
|
||||
|
||||
ctx := context.Background()
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
audio, err := adapter.Synthesize(ctx, "Hello", edgeModelReadAloud, config)
|
||||
if err != nil {
|
||||
t.Fatalf("Synthesize: %v", err)
|
||||
@@ -61,7 +61,7 @@ func TestEdgeAdapter_Stream_WithMockServer(t *testing.T) {
|
||||
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
|
||||
|
||||
ctx := context.Background()
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
ch, errCh := adapter.Stream(ctx, "Hi", edgeModelReadAloud, config)
|
||||
var chunks [][]byte
|
||||
for b := range ch {
|
||||
@@ -86,7 +86,7 @@ func TestEdgeAdapter_Synthesize_NotConnected(t *testing.T) {
|
||||
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
|
||||
|
||||
ctx := context.Background()
|
||||
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, tts.AudioConfig{})
|
||||
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, audio.AudioConfig{})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when connection fails")
|
||||
}
|
||||
@@ -20,7 +20,7 @@ import (
|
||||
"github.com/google/uuid"
|
||||
"github.com/gorilla/websocket"
|
||||
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
)
|
||||
|
||||
// Edge TTS WebSocket client.
|
||||
@@ -184,7 +184,7 @@ func (c *EdgeWsClient) sendFrame(path, contentType, body string, extraHeaders ma
|
||||
}
|
||||
|
||||
// Configure sends the speech.config message (output format, etc.).
|
||||
func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) error {
|
||||
func (c *EdgeWsClient) Configure(ctx context.Context, config audio.AudioConfig) error {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
if c.conn == nil {
|
||||
@@ -207,7 +207,7 @@ func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) er
|
||||
}
|
||||
|
||||
// buildSSML builds SSML with rate and pitch for Edge TTS prosody.
|
||||
func buildSSML(text string, voice tts.VoiceConfig, speed, pitch float64) string {
|
||||
func buildSSML(text string, voice audio.VoiceConfig, speed, pitch float64) string {
|
||||
voiceID := voice.ID
|
||||
if voiceID == "" {
|
||||
voiceID = DEFAULT_VOICE
|
||||
@@ -241,7 +241,7 @@ func escapeSSML(s string) string {
|
||||
|
||||
// Synthesize sends SSML and synchronously collects all audio data.
|
||||
// It handles the full lifecycle: connect → configure → send → receive → close.
|
||||
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config tts.AudioConfig) ([]byte, error) {
|
||||
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config audio.AudioConfig) ([]byte, error) {
|
||||
if err := c.Connect(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -338,7 +338,7 @@ func parseAudioChunk(data []byte) ([]byte, error) {
|
||||
|
||||
// Stream sends SSML and returns audio chunks via channel.
|
||||
// It handles the full lifecycle: connect → configure → send → stream → close.
|
||||
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config tts.AudioConfig) (ch chan []byte, errCh chan error) {
|
||||
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config audio.AudioConfig) (ch chan []byte, errCh chan error) {
|
||||
ch = make(chan []byte, 8)
|
||||
errCh = make(chan error, 1)
|
||||
go func() {
|
||||
+12
-12
@@ -9,7 +9,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
)
|
||||
|
||||
// Real Edge TTS integration tests. Not compiled by default (requires -tags=integration).
|
||||
@@ -17,14 +17,14 @@ import (
|
||||
//
|
||||
// Run:
|
||||
//
|
||||
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS -v
|
||||
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS -v
|
||||
|
||||
func TestRealEdgeTTS_Synthesize(t *testing.T) {
|
||||
client := NewEdgeWsClient()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
|
||||
audio, err := client.Synthesize(ctx, "Hello, this is a real Edge TTS test.", config)
|
||||
if err != nil {
|
||||
t.Fatalf("Synthesize: %v", err)
|
||||
@@ -40,7 +40,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
|
||||
ch, errCh := client.Stream(ctx, "你好,这是流式测试。", config)
|
||||
var total int
|
||||
for b := range ch {
|
||||
@@ -57,7 +57,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
|
||||
|
||||
// TestRealEdgeTTS_Formats tries every candidate format and reports which ones are supported.
|
||||
//
|
||||
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_Formats -v
|
||||
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_Formats -v
|
||||
func TestRealEdgeTTS_Formats(t *testing.T) {
|
||||
formats := []string{
|
||||
"audio-24khz-48kbitrate-mono-mp3",
|
||||
@@ -71,8 +71,8 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
config := tts.AudioConfig{
|
||||
Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
|
||||
config := audio.AudioConfig{
|
||||
Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
|
||||
Format: fmt,
|
||||
Speed: 1.0,
|
||||
}
|
||||
@@ -88,7 +88,7 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
|
||||
|
||||
// TestRealEdgeTTS_SaveAudio synthesizes speech and writes the result to a file for manual inspection.
|
||||
//
|
||||
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
|
||||
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
|
||||
func TestRealEdgeTTS_SaveAudio(t *testing.T) {
|
||||
client := NewEdgeWsClient()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
@@ -97,11 +97,11 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
text string
|
||||
voice tts.VoiceConfig
|
||||
voice audio.VoiceConfig
|
||||
file string
|
||||
}{
|
||||
{"en", "Hello, this is an Edge TTS audio save test.", tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
|
||||
{"zh", "你好,这是一段中文语音合成测试。", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
|
||||
{"en", "Hello, this is an Edge TTS audio save test.", audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
|
||||
{"zh", "你好,这是一段中文语音合成测试。", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
|
||||
}
|
||||
|
||||
outDir := filepath.Join(os.TempDir(), "edge_tts_test")
|
||||
@@ -111,7 +111,7 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
config := tts.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
|
||||
config := audio.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
|
||||
audio, err := client.Synthesize(ctx, tc.text, config)
|
||||
if err != nil {
|
||||
t.Fatalf("Synthesize: %v", err)
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
|
||||
"github.com/gorilla/websocket"
|
||||
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
"github.com/memohai/memoh/internal/audio"
|
||||
)
|
||||
|
||||
var upgrader = websocket.Upgrader{
|
||||
@@ -95,7 +95,7 @@ func TestEdgeWsClient_ConnectAndSynthesize(t *testing.T) {
|
||||
client := NewEdgeWsClient()
|
||||
client.BaseURL = wsURL
|
||||
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
|
||||
audio, err := client.Synthesize(t.Context(), "Hello world", config)
|
||||
if err != nil {
|
||||
t.Fatalf("Synthesize: %v", err)
|
||||
@@ -114,7 +114,7 @@ func TestEdgeWsClient_Stream(t *testing.T) {
|
||||
client := NewEdgeWsClient()
|
||||
client.BaseURL = wsURL
|
||||
|
||||
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
|
||||
ch, errCh := client.Stream(t.Context(), "Hi", config)
|
||||
var chunks [][]byte
|
||||
for b := range ch {
|
||||
@@ -197,7 +197,7 @@ func TestParseAudioChunk_EmptyOrShort(t *testing.T) {
|
||||
|
||||
func TestBuildSSML(t *testing.T) {
|
||||
t.Parallel()
|
||||
ssml := buildSSML("Hello", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
|
||||
ssml := buildSSML("Hello", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
|
||||
if !strings.Contains(ssml, "zh-CN-XiaoxiaoNeural") {
|
||||
t.Errorf("ssml should contain voice: %s", ssml)
|
||||
}
|
||||
@@ -1,11 +1,13 @@
|
||||
package tts
|
||||
package audio
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgtype"
|
||||
|
||||
"github.com/memohai/memoh/internal/db/sqlc"
|
||||
@@ -14,23 +16,23 @@ import (
|
||||
|
||||
func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
|
||||
for _, def := range registry.List() {
|
||||
configJSON, err := json.Marshal(map[string]any{})
|
||||
provider, err := queries.GetProviderByClientType(ctx, string(def.ClientType))
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal speech provider config: %w", err)
|
||||
}
|
||||
var icon pgtype.Text
|
||||
if def.Icon != "" {
|
||||
icon = pgtype.Text{String: def.Icon, Valid: true}
|
||||
}
|
||||
|
||||
provider, err := queries.UpsertRegistryProvider(ctx, sqlc.UpsertRegistryProviderParams{
|
||||
Name: def.DisplayName,
|
||||
ClientType: string(def.ClientType),
|
||||
Icon: icon,
|
||||
Config: configJSON,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("upsert speech provider %s: %w", def.ClientType, err)
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
if logger != nil {
|
||||
logger.Warn("audio registry skipped provider without template",
|
||||
slog.String("provider", string(def.ClientType)),
|
||||
slog.String("display_name", def.DisplayName))
|
||||
}
|
||||
continue
|
||||
}
|
||||
if logger != nil {
|
||||
logger.Warn("audio registry failed to load provider template",
|
||||
slog.String("provider", string(def.ClientType)),
|
||||
slog.String("display_name", def.DisplayName),
|
||||
slog.Any("error", err))
|
||||
}
|
||||
return fmt.Errorf("get provider by client type %s: %w", def.ClientType, err)
|
||||
}
|
||||
|
||||
synced := 0
|
||||
@@ -1,4 +1,4 @@
|
||||
package tts
|
||||
package audio
|
||||
|
||||
// VoiceConfig is kept for backward compatibility with the legacy Edge adapter tests.
|
||||
type VoiceConfig struct {
|
||||
@@ -1,4 +1,4 @@
|
||||
package tts
|
||||
package audio
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@@ -1,4 +1,4 @@
|
||||
package tts
|
||||
package audio
|
||||
|
||||
import (
|
||||
"context"
|
||||
@@ -24,7 +24,7 @@ type Service struct {
|
||||
func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
|
||||
return &Service{
|
||||
queries: queries,
|
||||
logger: log.With(slog.String("service", "tts")),
|
||||
logger: log.With(slog.String("service", "audio")),
|
||||
registry: registry,
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package tts
|
||||
package audio
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@@ -13,7 +13,7 @@ import (
|
||||
const (
|
||||
defaultTTL = 10 * time.Minute
|
||||
cleanupInterval = 1 * time.Minute
|
||||
tempDirName = "tts_temp"
|
||||
tempDirName = "audio_temp"
|
||||
)
|
||||
|
||||
// TempStore manages temporary audio files on disk with automatic TTL-based cleanup.
|
||||
@@ -30,7 +30,7 @@ type TempStore struct {
|
||||
func NewTempStore(baseDir string) (*TempStore, error) {
|
||||
dir := filepath.Join(baseDir, tempDirName)
|
||||
if err := os.MkdirAll(dir, 0o750); err != nil {
|
||||
return nil, fmt.Errorf("create tts temp dir: %w", err)
|
||||
return nil, fmt.Errorf("create audio temp dir: %w", err)
|
||||
}
|
||||
return &TempStore{
|
||||
dir: dir,
|
||||
@@ -1,4 +1,4 @@
|
||||
package tts
|
||||
package audio
|
||||
|
||||
import "time"
|
||||
|
||||
@@ -58,14 +58,14 @@ type mediaIngestor interface {
|
||||
channel.ContainerAttachmentIngester
|
||||
}
|
||||
|
||||
// ttsSynthesizer synthesizes text to speech audio.
|
||||
type ttsSynthesizer interface {
|
||||
// speechSynthesizer synthesizes text to speech audio.
|
||||
type speechSynthesizer interface {
|
||||
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
|
||||
}
|
||||
|
||||
// ttsModelResolver looks up the TTS model ID configured for a bot.
|
||||
type ttsModelResolver interface {
|
||||
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
|
||||
// speechModelResolver looks up the speech model ID configured for a bot.
|
||||
type speechModelResolver interface {
|
||||
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
|
||||
}
|
||||
|
||||
// TranscriptionResult is the minimal speech-to-text response shape needed by inbound routing.
|
||||
@@ -101,29 +101,29 @@ type SessionResult struct {
|
||||
|
||||
// ChannelInboundProcessor routes channel inbound messages to the chat gateway.
|
||||
type ChannelInboundProcessor struct {
|
||||
runner flow.Runner
|
||||
routeResolver RouteResolver
|
||||
message messagepkg.Writer
|
||||
mediaService mediaIngestor
|
||||
reactor channelReactor
|
||||
commandHandler *command.Handler
|
||||
registry *channel.Registry
|
||||
logger *slog.Logger
|
||||
jwtSecret string
|
||||
tokenTTL time.Duration
|
||||
identity *IdentityResolver
|
||||
policy PolicyService
|
||||
dispatcher *RouteDispatcher
|
||||
acl chatACL
|
||||
observer channel.StreamObserver
|
||||
ttsService ttsSynthesizer
|
||||
ttsModelResolver ttsModelResolver
|
||||
transcriber transcriptionRecognizer
|
||||
sttModelResolver transcriptionModelResolver
|
||||
sessionEnsurer SessionEnsurer
|
||||
pipeline *pipelinepkg.Pipeline
|
||||
eventStore *pipelinepkg.EventStore
|
||||
discussDriver *pipelinepkg.DiscussDriver
|
||||
runner flow.Runner
|
||||
routeResolver RouteResolver
|
||||
message messagepkg.Writer
|
||||
mediaService mediaIngestor
|
||||
reactor channelReactor
|
||||
commandHandler *command.Handler
|
||||
registry *channel.Registry
|
||||
logger *slog.Logger
|
||||
jwtSecret string
|
||||
tokenTTL time.Duration
|
||||
identity *IdentityResolver
|
||||
policy PolicyService
|
||||
dispatcher *RouteDispatcher
|
||||
acl chatACL
|
||||
observer channel.StreamObserver
|
||||
speechService speechSynthesizer
|
||||
speechModelResolver speechModelResolver
|
||||
transcriber transcriptionRecognizer
|
||||
sttModelResolver transcriptionModelResolver
|
||||
sessionEnsurer SessionEnsurer
|
||||
pipeline *pipelinepkg.Pipeline
|
||||
eventStore *pipelinepkg.EventStore
|
||||
discussDriver *pipelinepkg.DiscussDriver
|
||||
|
||||
// activeStreams maps "botID:routeID" to a context.CancelFunc for the
|
||||
// currently running agent stream. Used by /stop to abort generation
|
||||
@@ -205,14 +205,14 @@ func (p *ChannelInboundProcessor) SetStreamObserver(observer channel.StreamObser
|
||||
p.observer = observer
|
||||
}
|
||||
|
||||
// SetTtsService configures the TTS synthesizer and settings reader for handling
|
||||
// <speech> tag events (speech_delta) that require server-side audio synthesis.
|
||||
func (p *ChannelInboundProcessor) SetTtsService(synth ttsSynthesizer, modelResolver ttsModelResolver) {
|
||||
// SetSpeechService configures the speech synthesizer and settings reader for
|
||||
// handling <speech> tag events (speech_delta) that require server-side audio synthesis.
|
||||
func (p *ChannelInboundProcessor) SetSpeechService(synth speechSynthesizer, modelResolver speechModelResolver) {
|
||||
if p == nil {
|
||||
return
|
||||
}
|
||||
p.ttsService = synth
|
||||
p.ttsModelResolver = modelResolver
|
||||
p.speechService = synth
|
||||
p.speechModelResolver = modelResolver
|
||||
}
|
||||
|
||||
// SetTranscriptionService configures speech-to-text processing for inbound audio attachments.
|
||||
@@ -2304,13 +2304,13 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
|
||||
outboundAssetRefs *[]conversation.OutboundAssetRef,
|
||||
assetMu *sync.Mutex,
|
||||
) {
|
||||
if p.ttsService == nil || p.ttsModelResolver == nil {
|
||||
if p.speechService == nil || p.speechModelResolver == nil {
|
||||
if p.logger != nil {
|
||||
p.logger.Warn("speech_delta received but TTS service not configured")
|
||||
}
|
||||
return
|
||||
}
|
||||
modelID, err := p.ttsModelResolver.ResolveTtsModelID(ctx, botID)
|
||||
modelID, err := p.speechModelResolver.ResolveSpeechModelID(ctx, botID)
|
||||
if err != nil || strings.TrimSpace(modelID) == "" {
|
||||
if p.logger != nil {
|
||||
p.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
|
||||
@@ -2322,7 +2322,7 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
|
||||
if text == "" {
|
||||
continue
|
||||
}
|
||||
audioData, contentType, synthErr := p.ttsService.Synthesize(ctx, modelID, text, nil)
|
||||
audioData, contentType, synthErr := p.speechService.Synthesize(ctx, modelID, text, nil)
|
||||
if synthErr != nil {
|
||||
if p.logger != nil {
|
||||
p.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
|
||||
|
||||
@@ -318,6 +318,27 @@ func (q *Queries) GetModelByProviderAndModelID(ctx context.Context, arg GetModel
|
||||
return i, err
|
||||
}
|
||||
|
||||
const getProviderByClientType = `-- name: GetProviderByClientType :one
|
||||
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE client_type = $1
|
||||
`
|
||||
|
||||
func (q *Queries) GetProviderByClientType(ctx context.Context, clientType string) (Provider, error) {
|
||||
row := q.db.QueryRow(ctx, getProviderByClientType, clientType)
|
||||
var i Provider
|
||||
err := row.Scan(
|
||||
&i.ID,
|
||||
&i.Name,
|
||||
&i.ClientType,
|
||||
&i.Icon,
|
||||
&i.Enable,
|
||||
&i.Config,
|
||||
&i.Metadata,
|
||||
&i.CreatedAt,
|
||||
&i.UpdatedAt,
|
||||
)
|
||||
return i, err
|
||||
}
|
||||
|
||||
const getProviderByID = `-- name: GetProviderByID :one
|
||||
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE id = $1
|
||||
`
|
||||
|
||||
@@ -7,28 +7,28 @@ import (
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/settings"
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
// BotTtsHandler handles per-bot TTS synthesis requests from the agent tool.
|
||||
type BotTtsHandler struct {
|
||||
ttsService *tts.Service
|
||||
// BotAudioHandler handles per-bot speech synthesis requests from the agent tool.
|
||||
type BotAudioHandler struct {
|
||||
audioService *audiopkg.Service
|
||||
settingsService *settings.Service
|
||||
tempStore *tts.TempStore
|
||||
tempStore *audiopkg.TempStore
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
func NewBotTtsHandler(log *slog.Logger, ttsService *tts.Service, settingsService *settings.Service, tempStore *tts.TempStore) *BotTtsHandler {
|
||||
return &BotTtsHandler{
|
||||
ttsService: ttsService,
|
||||
func NewBotAudioHandler(log *slog.Logger, audioService *audiopkg.Service, settingsService *settings.Service, tempStore *audiopkg.TempStore) *BotAudioHandler {
|
||||
return &BotAudioHandler{
|
||||
audioService: audioService,
|
||||
settingsService: settingsService,
|
||||
tempStore: tempStore,
|
||||
logger: log.With(slog.String("handler", "bot_tts")),
|
||||
logger: log.With(slog.String("handler", "bot_audio")),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *BotTtsHandler) Register(e *echo.Echo) {
|
||||
func (h *BotAudioHandler) Register(e *echo.Echo) {
|
||||
e.POST("/bots/:bot_id/tts/synthesize", h.Synthesize)
|
||||
}
|
||||
|
||||
@@ -54,7 +54,7 @@ type synthesizeResponse struct {
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /bots/{bot_id}/tts/synthesize [post].
|
||||
func (h *BotTtsHandler) Synthesize(c echo.Context) error {
|
||||
func (h *BotAudioHandler) Synthesize(c echo.Context) error {
|
||||
botID := strings.TrimSpace(c.Param("bot_id"))
|
||||
if botID == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "bot_id is required")
|
||||
@@ -88,10 +88,10 @@ func (h *BotTtsHandler) Synthesize(c echo.Context) error {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, "failed to create temp file")
|
||||
}
|
||||
|
||||
contentType, streamErr := h.ttsService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
|
||||
contentType, streamErr := h.audioService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
|
||||
closeErr := f.Close()
|
||||
if streamErr != nil {
|
||||
h.logger.Error("tts synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
|
||||
h.logger.Error("speech synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
|
||||
h.tempStore.Delete(tempID)
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, streamErr.Error())
|
||||
}
|
||||
|
||||
@@ -30,30 +30,30 @@ import (
|
||||
messagepkg "github.com/memohai/memoh/internal/message"
|
||||
)
|
||||
|
||||
// localTtsSynthesizer synthesizes text to speech audio.
|
||||
type localTtsSynthesizer interface {
|
||||
// localSpeechSynthesizer synthesizes text to speech audio.
|
||||
type localSpeechSynthesizer interface {
|
||||
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
|
||||
}
|
||||
|
||||
// localTtsModelResolver resolves TTS model IDs for bots.
|
||||
type localTtsModelResolver interface {
|
||||
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
|
||||
// localSpeechModelResolver resolves speech model IDs for bots.
|
||||
type localSpeechModelResolver interface {
|
||||
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
|
||||
}
|
||||
|
||||
// LocalChannelHandler handles local channel routes (WebUI / API) backed by bot history.
|
||||
type LocalChannelHandler struct {
|
||||
channelType channel.ChannelType
|
||||
channelManager *channel.Manager
|
||||
channelStore *channel.Store
|
||||
chatService *conversation.Service
|
||||
routeHub *local.RouteHub
|
||||
botService *bots.Service
|
||||
accountService *accounts.Service
|
||||
resolver *flow.Resolver
|
||||
mediaService *media.Service
|
||||
ttsService localTtsSynthesizer
|
||||
ttsModelResolver localTtsModelResolver
|
||||
logger *slog.Logger
|
||||
channelType channel.ChannelType
|
||||
channelManager *channel.Manager
|
||||
channelStore *channel.Store
|
||||
chatService *conversation.Service
|
||||
routeHub *local.RouteHub
|
||||
botService *bots.Service
|
||||
accountService *accounts.Service
|
||||
resolver *flow.Resolver
|
||||
mediaService *media.Service
|
||||
speechService localSpeechSynthesizer
|
||||
speechModelResolver localSpeechModelResolver
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// NewLocalChannelHandler creates a local channel handler.
|
||||
@@ -80,10 +80,10 @@ func (h *LocalChannelHandler) SetMediaService(svc *media.Service) {
|
||||
h.mediaService = svc
|
||||
}
|
||||
|
||||
// SetTtsService configures TTS synthesis for handling speech_delta events.
|
||||
func (h *LocalChannelHandler) SetTtsService(synth localTtsSynthesizer, resolver localTtsModelResolver) {
|
||||
h.ttsService = synth
|
||||
h.ttsModelResolver = resolver
|
||||
// SetSpeechService configures speech synthesis for handling speech_delta events.
|
||||
func (h *LocalChannelHandler) SetSpeechService(synth localSpeechSynthesizer, resolver localSpeechModelResolver) {
|
||||
h.speechService = synth
|
||||
h.speechModelResolver = resolver
|
||||
}
|
||||
|
||||
// Register registers the local channel routes.
|
||||
@@ -719,12 +719,12 @@ func (h *LocalChannelHandler) ingestSingleAttachment(ctx context.Context, botID,
|
||||
// wsSynthesizeSpeech handles speech_delta events by synthesizing audio and
|
||||
// injecting attachment_delta events with the resulting voice attachments.
|
||||
func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID string, original json.RawMessage) []json.RawMessage {
|
||||
if h.ttsService == nil || h.ttsModelResolver == nil {
|
||||
if h.speechService == nil || h.speechModelResolver == nil {
|
||||
h.logger.Warn("speech_delta received but TTS service not configured")
|
||||
return nil
|
||||
}
|
||||
|
||||
modelID, err := h.ttsModelResolver.ResolveTtsModelID(ctx, botID)
|
||||
modelID, err := h.speechModelResolver.ResolveSpeechModelID(ctx, botID)
|
||||
if err != nil || strings.TrimSpace(modelID) == "" {
|
||||
h.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
|
||||
return nil
|
||||
@@ -746,7 +746,7 @@ func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID stri
|
||||
continue
|
||||
}
|
||||
|
||||
audioData, contentType, synthErr := h.ttsService.Synthesize(ctx, modelID, text, nil)
|
||||
audioData, contentType, synthErr := h.speechService.Synthesize(ctx, modelID, text, nil)
|
||||
if synthErr != nil {
|
||||
h.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
|
||||
continue
|
||||
|
||||
@@ -12,25 +12,25 @@ import (
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
|
||||
audiopkg "github.com/memohai/memoh/internal/audio"
|
||||
"github.com/memohai/memoh/internal/models"
|
||||
"github.com/memohai/memoh/internal/tts"
|
||||
)
|
||||
|
||||
type SpeechHandler struct {
|
||||
service *tts.Service
|
||||
type AudioHandler struct {
|
||||
service *audiopkg.Service
|
||||
modelsService *models.Service
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
func NewSpeechHandler(log *slog.Logger, service *tts.Service, modelsService *models.Service) *SpeechHandler {
|
||||
return &SpeechHandler{
|
||||
func NewAudioHandler(log *slog.Logger, service *audiopkg.Service, modelsService *models.Service) *AudioHandler {
|
||||
return &AudioHandler{
|
||||
service: service,
|
||||
modelsService: modelsService,
|
||||
logger: log.With(slog.String("handler", "speech")),
|
||||
logger: log.With(slog.String("handler", "audio")),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) Register(e *echo.Echo) {
|
||||
func (h *AudioHandler) Register(e *echo.Echo) {
|
||||
pg := e.Group("/speech-providers")
|
||||
pg.GET("", h.ListProviders)
|
||||
pg.GET("/:id", h.GetProvider)
|
||||
@@ -64,13 +64,19 @@ func (h *SpeechHandler) Register(e *echo.Echo) {
|
||||
// @Summary List speech provider metadata
|
||||
// @Description List available speech provider types with their models and capabilities
|
||||
// @Tags speech-providers
|
||||
// @Success 200 {array} tts.ProviderMetaResponse
|
||||
// @Success 200 {array} audiopkg.ProviderMetaResponse
|
||||
// @Router /speech-providers/meta [get].
|
||||
func (h *SpeechHandler) ListSpeechMeta(c echo.Context) error {
|
||||
func (h *AudioHandler) ListSpeechMeta(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, h.service.ListSpeechMeta(c.Request().Context()))
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) ListTranscriptionMeta(c echo.Context) error {
|
||||
// ListTranscriptionMeta godoc
|
||||
// @Summary List transcription provider metadata
|
||||
// @Description List available transcription provider types with their models and capabilities
|
||||
// @Tags transcription-providers
|
||||
// @Success 200 {array} audiopkg.ProviderMetaResponse
|
||||
// @Router /transcription-providers/meta [get].
|
||||
func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, h.service.ListTranscriptionMeta(c.Request().Context()))
|
||||
}
|
||||
|
||||
@@ -79,10 +85,10 @@ func (h *SpeechHandler) ListTranscriptionMeta(c echo.Context) error {
|
||||
// @Description List providers that support speech (filtered view of unified providers table)
|
||||
// @Tags speech-providers
|
||||
// @Produce json
|
||||
// @Success 200 {array} tts.SpeechProviderResponse
|
||||
// @Success 200 {array} audiopkg.SpeechProviderResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-providers [get].
|
||||
func (h *SpeechHandler) ListProviders(c echo.Context) error {
|
||||
func (h *AudioHandler) ListProviders(c echo.Context) error {
|
||||
items, err := h.service.ListSpeechProviders(c.Request().Context())
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
@@ -90,7 +96,15 @@ func (h *SpeechHandler) ListProviders(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, items)
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) ListTranscriptionProviders(c echo.Context) error {
|
||||
// ListTranscriptionProviders godoc
|
||||
// @Summary List transcription providers
|
||||
// @Description List providers that support transcription (filtered view of unified providers table)
|
||||
// @Tags transcription-providers
|
||||
// @Produce json
|
||||
// @Success 200 {array} audiopkg.SpeechProviderResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-providers [get].
|
||||
func (h *AudioHandler) ListTranscriptionProviders(c echo.Context) error {
|
||||
items, err := h.service.ListTranscriptionProviders(c.Request().Context())
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
@@ -104,11 +118,12 @@ func (h *SpeechHandler) ListTranscriptionProviders(c echo.Context) error {
|
||||
// @Tags speech-providers
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {object} tts.SpeechProviderResponse
|
||||
// @Success 200 {object} audiopkg.SpeechProviderResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /speech-providers/{id} [get].
|
||||
func (h *SpeechHandler) GetProvider(c echo.Context) error {
|
||||
// @Router /transcription-providers/{id} [get].
|
||||
func (h *AudioHandler) GetProvider(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -126,11 +141,11 @@ func (h *SpeechHandler) GetProvider(c echo.Context) error {
|
||||
// @Tags speech-providers
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {array} tts.SpeechModelResponse
|
||||
// @Success 200 {array} audiopkg.SpeechModelResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-providers/{id}/models [get].
|
||||
func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
|
||||
func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -149,12 +164,12 @@ func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {object} tts.ImportModelsResponse
|
||||
// @Success 200 {object} audiopkg.ImportModelsResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-providers/{id}/import-models [post].
|
||||
func (h *SpeechHandler) ImportModels(c echo.Context) error {
|
||||
func (h *AudioHandler) ImportModels(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -165,7 +180,7 @@ func (h *SpeechHandler) ImportModels(c echo.Context) error {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote speech models: %v", err))
|
||||
}
|
||||
|
||||
resp := tts.ImportModelsResponse{
|
||||
resp := audiopkg.ImportModelsResponse{
|
||||
Models: make([]string, 0, len(remoteModels)),
|
||||
}
|
||||
|
||||
@@ -197,7 +212,17 @@ func (h *SpeechHandler) ImportModels(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
|
||||
// ListTranscriptionModelsByProvider godoc
|
||||
// @Summary List transcription models by provider
|
||||
// @Description List models of type 'transcription' for a specific transcription provider
|
||||
// @Tags transcription-providers
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {array} audiopkg.TranscriptionModelResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-providers/{id}/models [get].
|
||||
func (h *AudioHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -209,7 +234,19 @@ func (h *SpeechHandler) ListTranscriptionModelsByProvider(c echo.Context) error
|
||||
return c.JSON(http.StatusOK, items)
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
|
||||
// ImportTranscriptionModels godoc
|
||||
// @Summary Import transcription models from provider
|
||||
// @Description Fetch models using the configured transcription provider and import them into the unified models table
|
||||
// @Tags transcription-providers
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param id path string true "Provider ID (UUID)"
|
||||
// @Success 200 {object} audiopkg.ImportModelsResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-providers/{id}/import-models [post].
|
||||
func (h *AudioHandler) ImportTranscriptionModels(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -220,7 +257,7 @@ func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote transcription models: %v", err))
|
||||
}
|
||||
|
||||
resp := tts.ImportModelsResponse{
|
||||
resp := audiopkg.ImportModelsResponse{
|
||||
Models: make([]string, 0, len(remoteModels)),
|
||||
}
|
||||
|
||||
@@ -257,10 +294,10 @@ func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
|
||||
// @Description List all models of type 'speech' (filtered view of unified models table)
|
||||
// @Tags speech-models
|
||||
// @Produce json
|
||||
// @Success 200 {array} tts.SpeechModelResponse
|
||||
// @Success 200 {array} audiopkg.SpeechModelResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-models [get].
|
||||
func (h *SpeechHandler) ListModels(c echo.Context) error {
|
||||
func (h *AudioHandler) ListModels(c echo.Context) error {
|
||||
items, err := h.service.ListSpeechModels(c.Request().Context())
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
@@ -268,7 +305,15 @@ func (h *SpeechHandler) ListModels(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, items)
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) ListTranscriptionModels(c echo.Context) error {
|
||||
// ListTranscriptionModels godoc
|
||||
// @Summary List all transcription models
|
||||
// @Description List all models of type 'transcription' (filtered view of unified models table)
|
||||
// @Tags transcription-models
|
||||
// @Produce json
|
||||
// @Success 200 {array} audiopkg.TranscriptionModelResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-models [get].
|
||||
func (h *AudioHandler) ListTranscriptionModels(c echo.Context) error {
|
||||
items, err := h.service.ListTranscriptionModels(c.Request().Context())
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
@@ -281,10 +326,10 @@ func (h *SpeechHandler) ListTranscriptionModels(c echo.Context) error {
|
||||
// @Tags speech-models
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Success 200 {object} tts.SpeechModelResponse
|
||||
// @Success 200 {object} audiopkg.SpeechModelResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /speech-models/{id} [get].
|
||||
func (h *SpeechHandler) GetModel(c echo.Context) error {
|
||||
func (h *AudioHandler) GetModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -296,12 +341,23 @@ func (h *SpeechHandler) GetModel(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) UpdateModel(c echo.Context) error {
|
||||
// UpdateModel godoc
|
||||
// @Summary Update a speech model
|
||||
// @Tags speech-models
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
|
||||
// @Success 200 {object} audiopkg.SpeechModelResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-models/{id} [put].
|
||||
func (h *AudioHandler) UpdateModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
var req tts.UpdateSpeechModelRequest
|
||||
var req audiopkg.UpdateSpeechModelRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
|
||||
}
|
||||
@@ -312,7 +368,15 @@ func (h *SpeechHandler) UpdateModel(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) GetTranscriptionModel(c echo.Context) error {
|
||||
// GetTranscriptionModel godoc
|
||||
// @Summary Get a transcription model
|
||||
// @Tags transcription-models
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Success 200 {object} audiopkg.TranscriptionModelResponse
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /transcription-models/{id} [get].
|
||||
func (h *AudioHandler) GetTranscriptionModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -324,12 +388,23 @@ func (h *SpeechHandler) GetTranscriptionModel(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) UpdateTranscriptionModel(c echo.Context) error {
|
||||
// UpdateTranscriptionModel godoc
|
||||
// @Summary Update a transcription model
|
||||
// @Tags transcription-models
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
|
||||
// @Success 200 {object} audiopkg.TranscriptionModelResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-models/{id} [put].
|
||||
func (h *AudioHandler) UpdateTranscriptionModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
var req tts.UpdateSpeechModelRequest
|
||||
var req audiopkg.UpdateSpeechModelRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
|
||||
}
|
||||
@@ -345,10 +420,10 @@ func (h *SpeechHandler) UpdateTranscriptionModel(c echo.Context) error {
|
||||
// @Tags speech-models
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Success 200 {object} tts.ModelCapabilities
|
||||
// @Success 200 {object} audiopkg.ModelCapabilities
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /speech-models/{id}/capabilities [get].
|
||||
func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
|
||||
func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -360,7 +435,15 @@ func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, caps)
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
|
||||
// GetTranscriptionModelCapabilities godoc
|
||||
// @Summary Get transcription model capabilities
|
||||
// @Tags transcription-models
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Success 200 {object} audiopkg.ModelCapabilities
|
||||
// @Failure 404 {object} ErrorResponse
|
||||
// @Router /transcription-models/{id}/capabilities [get].
|
||||
func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -379,17 +462,17 @@ func (h *SpeechHandler) GetTranscriptionModelCapabilities(c echo.Context) error
|
||||
// @Accept json
|
||||
// @Produce application/octet-stream
|
||||
// @Param id path string true "Model ID"
|
||||
// @Param request body tts.TestSynthesizeRequest true "Text to synthesize"
|
||||
// @Param request body audiopkg.TestSynthesizeRequest true "Text to synthesize"
|
||||
// @Success 200 {file} binary "Audio data"
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /speech-models/{id}/test [post].
|
||||
func (h *SpeechHandler) TestModel(c echo.Context) error {
|
||||
func (h *AudioHandler) TestModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
}
|
||||
var req tts.TestSynthesizeRequest
|
||||
var req audiopkg.TestSynthesizeRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
|
||||
}
|
||||
@@ -408,7 +491,20 @@ func (h *SpeechHandler) TestModel(c echo.Context) error {
|
||||
return c.Blob(http.StatusOK, contentType, audio)
|
||||
}
|
||||
|
||||
func (h *SpeechHandler) TestTranscriptionModel(c echo.Context) error {
|
||||
// TestTranscriptionModel godoc
|
||||
// @Summary Test transcription model recognition
|
||||
// @Description Transcribe uploaded audio using a specific model's config and return structured text output
|
||||
// @Tags transcription-models
|
||||
// @Accept mpfd
|
||||
// @Produce json
|
||||
// @Param id path string true "Model ID"
|
||||
// @Param file formData file true "Audio file"
|
||||
// @Param config formData string false "Optional JSON config"
|
||||
// @Success 200 {object} audiopkg.TestTranscriptionResponse
|
||||
// @Failure 400 {object} ErrorResponse
|
||||
// @Failure 500 {object} ErrorResponse
|
||||
// @Router /transcription-models/{id}/test [post].
|
||||
func (h *AudioHandler) TestTranscriptionModel(c echo.Context) error {
|
||||
id := strings.TrimSpace(c.Param("id"))
|
||||
if id == "" {
|
||||
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
|
||||
@@ -441,16 +537,16 @@ func (h *SpeechHandler) TestTranscriptionModel(c echo.Context) error {
|
||||
if err != nil {
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
|
||||
}
|
||||
resp := tts.TestTranscriptionResponse{
|
||||
resp := audiopkg.TestTranscriptionResponse{
|
||||
Text: result.Text,
|
||||
Language: result.Language,
|
||||
DurationSeconds: result.DurationSeconds,
|
||||
Metadata: result.ProviderMetadata,
|
||||
}
|
||||
if len(result.Words) > 0 {
|
||||
resp.Words = make([]tts.TranscriptionWord, 0, len(result.Words))
|
||||
resp.Words = make([]audiopkg.TranscriptionWord, 0, len(result.Words))
|
||||
for _, word := range result.Words {
|
||||
resp.Words = append(resp.Words, tts.TranscriptionWord{
|
||||
resp.Words = append(resp.Words, audiopkg.TranscriptionWord{
|
||||
Text: word.Text,
|
||||
Start: word.Start,
|
||||
End: word.End,
|
||||
|
||||
Reference in New Issue
Block a user