Revert "Feat/speech support (#392)" (#393)

This reverts commit c9dcfe287f.
2026-04-25 07:00:48 +09:00 · 2026-04-22 00:11:16 +08:00
parent c9dcfe287f
commit fd8f1ec078
70 changed files with 1689 additions and 6609 deletions
@@ -7,28 +7,28 @@ import (

 	"github.com/labstack/echo/v4"

-	audiopkg "github.com/memohai/memoh/internal/audio"
 	"github.com/memohai/memoh/internal/settings"
+	"github.com/memohai/memoh/internal/tts"
 )

-// BotAudioHandler handles per-bot speech synthesis requests from the agent tool.
-type BotAudioHandler struct {
-	audioService    *audiopkg.Service
+// BotTtsHandler handles per-bot TTS synthesis requests from the agent tool.
+type BotTtsHandler struct {
+	ttsService      *tts.Service
 	settingsService *settings.Service
-	tempStore       *audiopkg.TempStore
+	tempStore       *tts.TempStore
 	logger          *slog.Logger
 }

-func NewBotAudioHandler(log *slog.Logger, audioService *audiopkg.Service, settingsService *settings.Service, tempStore *audiopkg.TempStore) *BotAudioHandler {
-	return &BotAudioHandler{
-		audioService:    audioService,
+func NewBotTtsHandler(log *slog.Logger, ttsService *tts.Service, settingsService *settings.Service, tempStore *tts.TempStore) *BotTtsHandler {
+	return &BotTtsHandler{
+		ttsService:      ttsService,
 		settingsService: settingsService,
 		tempStore:       tempStore,
-		logger:          log.With(slog.String("handler", "bot_audio")),
+		logger:          log.With(slog.String("handler", "bot_tts")),
 	}
 }

-func (h *BotAudioHandler) Register(e *echo.Echo) {
+func (h *BotTtsHandler) Register(e *echo.Echo) {
 	e.POST("/bots/:bot_id/tts/synthesize", h.Synthesize)
 }

@@ -54,7 +54,7 @@ type synthesizeResponse struct {
 // @Failure 400 {object} ErrorResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /bots/{bot_id}/tts/synthesize [post].
-func (h *BotAudioHandler) Synthesize(c echo.Context) error {
+func (h *BotTtsHandler) Synthesize(c echo.Context) error {
 	botID := strings.TrimSpace(c.Param("bot_id"))
 	if botID == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "bot_id is required")
@@ -88,10 +88,10 @@ func (h *BotAudioHandler) Synthesize(c echo.Context) error {
 		return echo.NewHTTPError(http.StatusInternalServerError, "failed to create temp file")
 	}

-	contentType, streamErr := h.audioService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
+	contentType, streamErr := h.ttsService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
 	closeErr := f.Close()
 	if streamErr != nil {
-		h.logger.Error("speech synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
+		h.logger.Error("tts synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
 		h.tempStore.Delete(tempID)
 		return echo.NewHTTPError(http.StatusInternalServerError, streamErr.Error())
 	}
@@ -30,30 +30,30 @@ import (
 	messagepkg "github.com/memohai/memoh/internal/message"
 )

-// localSpeechSynthesizer synthesizes text to speech audio.
-type localSpeechSynthesizer interface {
+// localTtsSynthesizer synthesizes text to speech audio.
+type localTtsSynthesizer interface {
 	Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
 }

-// localSpeechModelResolver resolves speech model IDs for bots.
-type localSpeechModelResolver interface {
-	ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
+// localTtsModelResolver resolves TTS model IDs for bots.
+type localTtsModelResolver interface {
+	ResolveTtsModelID(ctx context.Context, botID string) (string, error)
 }

 // LocalChannelHandler handles local channel routes (WebUI / API) backed by bot history.
 type LocalChannelHandler struct {
-	channelType         channel.ChannelType
-	channelManager      *channel.Manager
-	channelStore        *channel.Store
-	chatService         *conversation.Service
-	routeHub            *local.RouteHub
-	botService          *bots.Service
-	accountService      *accounts.Service
-	resolver            *flow.Resolver
-	mediaService        *media.Service
-	speechService       localSpeechSynthesizer
-	speechModelResolver localSpeechModelResolver
-	logger              *slog.Logger
+	channelType      channel.ChannelType
+	channelManager   *channel.Manager
+	channelStore     *channel.Store
+	chatService      *conversation.Service
+	routeHub         *local.RouteHub
+	botService       *bots.Service
+	accountService   *accounts.Service
+	resolver         *flow.Resolver
+	mediaService     *media.Service
+	ttsService       localTtsSynthesizer
+	ttsModelResolver localTtsModelResolver
+	logger           *slog.Logger
 }

 // NewLocalChannelHandler creates a local channel handler.
@@ -80,10 +80,10 @@ func (h *LocalChannelHandler) SetMediaService(svc *media.Service) {
 	h.mediaService = svc
 }

-// SetSpeechService configures speech synthesis for handling speech_delta events.
-func (h *LocalChannelHandler) SetSpeechService(synth localSpeechSynthesizer, resolver localSpeechModelResolver) {
-	h.speechService = synth
-	h.speechModelResolver = resolver
+// SetTtsService configures TTS synthesis for handling speech_delta events.
+func (h *LocalChannelHandler) SetTtsService(synth localTtsSynthesizer, resolver localTtsModelResolver) {
+	h.ttsService = synth
+	h.ttsModelResolver = resolver
 }

 // Register registers the local channel routes.
@@ -719,12 +719,12 @@ func (h *LocalChannelHandler) ingestSingleAttachment(ctx context.Context, botID,
 // wsSynthesizeSpeech handles speech_delta events by synthesizing audio and
 // injecting attachment_delta events with the resulting voice attachments.
 func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID string, original json.RawMessage) []json.RawMessage {
-	if h.speechService == nil || h.speechModelResolver == nil {
+	if h.ttsService == nil || h.ttsModelResolver == nil {
 		h.logger.Warn("speech_delta received but TTS service not configured")
 		return nil
 	}

-	modelID, err := h.speechModelResolver.ResolveSpeechModelID(ctx, botID)
+	modelID, err := h.ttsModelResolver.ResolveTtsModelID(ctx, botID)
 	if err != nil || strings.TrimSpace(modelID) == "" {
 		h.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
 		return nil
@@ -746,7 +746,7 @@ func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID stri
 			continue
 		}

-		audioData, contentType, synthErr := h.speechService.Synthesize(ctx, modelID, text, nil)
+		audioData, contentType, synthErr := h.ttsService.Synthesize(ctx, modelID, text, nil)
 		if synthErr != nil {
 			h.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
 			continue
@@ -1,83 +1,55 @@
 package handlers

 import (
-	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
 	"log/slog"
-	"mime/multipart"
 	"net/http"
 	"strings"

 	"github.com/labstack/echo/v4"

-	audiopkg "github.com/memohai/memoh/internal/audio"
 	"github.com/memohai/memoh/internal/models"
+	"github.com/memohai/memoh/internal/tts"
 )

-type AudioHandler struct {
-	service       *audiopkg.Service
+type SpeechHandler struct {
+	service       *tts.Service
 	modelsService *models.Service
 	logger        *slog.Logger
 }

-func NewAudioHandler(log *slog.Logger, service *audiopkg.Service, modelsService *models.Service) *AudioHandler {
-	return &AudioHandler{
+func NewSpeechHandler(log *slog.Logger, service *tts.Service, modelsService *models.Service) *SpeechHandler {
+	return &SpeechHandler{
 		service:       service,
 		modelsService: modelsService,
-		logger:        log.With(slog.String("handler", "audio")),
+		logger:        log.With(slog.String("handler", "speech")),
 	}
 }

-func (h *AudioHandler) Register(e *echo.Echo) {
+func (h *SpeechHandler) Register(e *echo.Echo) {
 	pg := e.Group("/speech-providers")
 	pg.GET("", h.ListProviders)
 	pg.GET("/:id", h.GetProvider)
-	pg.GET("/meta", h.ListSpeechMeta)
+	pg.GET("/meta", h.ListMeta)
 	pg.GET("/:id/models", h.ListModelsByProvider)
 	pg.POST("/:id/import-models", h.ImportModels)

-	tpg := e.Group("/transcription-providers")
-	tpg.GET("", h.ListTranscriptionProviders)
-	tpg.GET("/meta", h.ListTranscriptionMeta)
-	tpg.GET("/:id", h.GetProvider)
-	tpg.GET("/:id/models", h.ListTranscriptionModelsByProvider)
-	tpg.POST("/:id/import-models", h.ImportTranscriptionModels)
-
 	mg := e.Group("/speech-models")
 	mg.GET("", h.ListModels)
 	mg.GET("/:id", h.GetModel)
-	mg.PUT("/:id", h.UpdateModel)
 	mg.GET("/:id/capabilities", h.GetModelCapabilities)
 	mg.POST("/:id/test", h.TestModel)
-
-	tg := e.Group("/transcription-models")
-	tg.GET("", h.ListTranscriptionModels)
-	tg.GET("/:id", h.GetTranscriptionModel)
-	tg.PUT("/:id", h.UpdateTranscriptionModel)
-	tg.GET("/:id/capabilities", h.GetTranscriptionModelCapabilities)
-	tg.POST("/:id/test", h.TestTranscriptionModel)
 }

 // ListMeta godoc
 // @Summary List speech provider metadata
 // @Description List available speech provider types with their models and capabilities
 // @Tags speech-providers
-// @Success 200 {array} audiopkg.ProviderMetaResponse
+// @Success 200 {array} tts.ProviderMetaResponse
 // @Router /speech-providers/meta [get].
-func (h *AudioHandler) ListSpeechMeta(c echo.Context) error {
-	return c.JSON(http.StatusOK, h.service.ListSpeechMeta(c.Request().Context()))
-}
-
-// ListTranscriptionMeta godoc
-// @Summary List transcription provider metadata
-// @Description List available transcription provider types with their models and capabilities
-// @Tags transcription-providers
-// @Success 200 {array} audiopkg.ProviderMetaResponse
-// @Router /transcription-providers/meta [get].
-func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
-	return c.JSON(http.StatusOK, h.service.ListTranscriptionMeta(c.Request().Context()))
+func (h *SpeechHandler) ListMeta(c echo.Context) error {
+	return c.JSON(http.StatusOK, h.service.ListMeta(c.Request().Context()))
 }

 // ListProviders godoc
@@ -85,10 +57,10 @@ func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
 // @Description List providers that support speech (filtered view of unified providers table)
 // @Tags speech-providers
 // @Produce json
-// @Success 200 {array} audiopkg.SpeechProviderResponse
+// @Success 200 {array} tts.SpeechProviderResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-providers [get].
-func (h *AudioHandler) ListProviders(c echo.Context) error {
+func (h *SpeechHandler) ListProviders(c echo.Context) error {
 	items, err := h.service.ListSpeechProviders(c.Request().Context())
 	if err != nil {
 		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -96,34 +68,17 @@ func (h *AudioHandler) ListProviders(c echo.Context) error {
 	return c.JSON(http.StatusOK, items)
 }

-// ListTranscriptionProviders godoc
-// @Summary List transcription providers
-// @Description List providers that support transcription (filtered view of unified providers table)
-// @Tags transcription-providers
-// @Produce json
-// @Success 200 {array} audiopkg.SpeechProviderResponse
-// @Failure 500 {object} ErrorResponse
-// @Router /transcription-providers [get].
-func (h *AudioHandler) ListTranscriptionProviders(c echo.Context) error {
-	items, err := h.service.ListTranscriptionProviders(c.Request().Context())
-	if err != nil {
-		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
-	}
-	return c.JSON(http.StatusOK, items)
-}
-
 // GetProvider godoc
 // @Summary Get speech provider
 // @Description Get a speech provider with masked config values
 // @Tags speech-providers
 // @Produce json
 // @Param id path string true "Provider ID (UUID)"
-// @Success 200 {object} audiopkg.SpeechProviderResponse
+// @Success 200 {object} tts.SpeechProviderResponse
 // @Failure 400 {object} ErrorResponse
 // @Failure 404 {object} ErrorResponse
 // @Router /speech-providers/{id} [get].
-// @Router /transcription-providers/{id} [get].
-func (h *AudioHandler) GetProvider(c echo.Context) error {
+func (h *SpeechHandler) GetProvider(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -141,11 +96,11 @@ func (h *AudioHandler) GetProvider(c echo.Context) error {
 // @Tags speech-providers
 // @Produce json
 // @Param id path string true "Provider ID (UUID)"
-// @Success 200 {array} audiopkg.SpeechModelResponse
+// @Success 200 {array} tts.SpeechModelResponse
 // @Failure 400 {object} ErrorResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-providers/{id}/models [get].
-func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
+func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -164,12 +119,12 @@ func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
 // @Accept json
 // @Produce json
 // @Param id path string true "Provider ID (UUID)"
-// @Success 200 {object} audiopkg.ImportModelsResponse
+// @Success 200 {object} tts.ImportModelsResponse
 // @Failure 400 {object} ErrorResponse
 // @Failure 404 {object} ErrorResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-providers/{id}/import-models [post].
-func (h *AudioHandler) ImportModels(c echo.Context) error {
+func (h *SpeechHandler) ImportModels(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -180,7 +135,7 @@ func (h *AudioHandler) ImportModels(c echo.Context) error {
 		return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote speech models: %v", err))
 	}

-	resp := audiopkg.ImportModelsResponse{
+	resp := tts.ImportModelsResponse{
 		Models: make([]string, 0, len(remoteModels)),
 	}

@@ -212,92 +167,15 @@ func (h *AudioHandler) ImportModels(c echo.Context) error {
 	return c.JSON(http.StatusOK, resp)
 }

-// ListTranscriptionModelsByProvider godoc
-// @Summary List transcription models by provider
-// @Description List models of type 'transcription' for a specific transcription provider
-// @Tags transcription-providers
-// @Produce json
-// @Param id path string true "Provider ID (UUID)"
-// @Success 200 {array} audiopkg.TranscriptionModelResponse
-// @Failure 400 {object} ErrorResponse
-// @Failure 500 {object} ErrorResponse
-// @Router /transcription-providers/{id}/models [get].
-func (h *AudioHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
-	id := strings.TrimSpace(c.Param("id"))
-	if id == "" {
-		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
-	}
-	items, err := h.service.ListTranscriptionModelsByProvider(c.Request().Context(), id)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
-	}
-	return c.JSON(http.StatusOK, items)
-}
-
-// ImportTranscriptionModels godoc
-// @Summary Import transcription models from provider
-// @Description Fetch models using the configured transcription provider and import them into the unified models table
-// @Tags transcription-providers
-// @Accept json
-// @Produce json
-// @Param id path string true "Provider ID (UUID)"
-// @Success 200 {object} audiopkg.ImportModelsResponse
-// @Failure 400 {object} ErrorResponse
-// @Failure 404 {object} ErrorResponse
-// @Failure 500 {object} ErrorResponse
-// @Router /transcription-providers/{id}/import-models [post].
-func (h *AudioHandler) ImportTranscriptionModels(c echo.Context) error {
-	id := strings.TrimSpace(c.Param("id"))
-	if id == "" {
-		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
-	}
-
-	remoteModels, err := h.service.FetchRemoteTranscriptionModels(c.Request().Context(), id)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote transcription models: %v", err))
-	}
-
-	resp := audiopkg.ImportModelsResponse{
-		Models: make([]string, 0, len(remoteModels)),
-	}
-
-	for _, model := range remoteModels {
-		name := strings.TrimSpace(model.Name)
-		if name == "" {
-			name = model.ID
-		}
-
-		_, err := h.modelsService.Create(c.Request().Context(), models.AddRequest{
-			ModelID:    model.ID,
-			Name:       name,
-			ProviderID: id,
-			Type:       models.ModelTypeTranscription,
-			Config:     models.ModelConfig{},
-		})
-		if err != nil {
-			if errors.Is(err, models.ErrModelIDAlreadyExists) {
-				resp.Skipped++
-				continue
-			}
-			h.logger.Warn("failed to import transcription model", slog.String("model_id", model.ID), slog.Any("error", err))
-			continue
-		}
-		resp.Created++
-		resp.Models = append(resp.Models, model.ID)
-	}
-
-	return c.JSON(http.StatusOK, resp)
-}
-
 // ListModels godoc
 // @Summary List all speech models
 // @Description List all models of type 'speech' (filtered view of unified models table)
 // @Tags speech-models
 // @Produce json
-// @Success 200 {array} audiopkg.SpeechModelResponse
+// @Success 200 {array} tts.SpeechModelResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-models [get].
-func (h *AudioHandler) ListModels(c echo.Context) error {
+func (h *SpeechHandler) ListModels(c echo.Context) error {
 	items, err := h.service.ListSpeechModels(c.Request().Context())
 	if err != nil {
 		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -305,31 +183,15 @@ func (h *AudioHandler) ListModels(c echo.Context) error {
 	return c.JSON(http.StatusOK, items)
 }

-// ListTranscriptionModels godoc
-// @Summary List all transcription models
-// @Description List all models of type 'transcription' (filtered view of unified models table)
-// @Tags transcription-models
-// @Produce json
-// @Success 200 {array} audiopkg.TranscriptionModelResponse
-// @Failure 500 {object} ErrorResponse
-// @Router /transcription-models [get].
-func (h *AudioHandler) ListTranscriptionModels(c echo.Context) error {
-	items, err := h.service.ListTranscriptionModels(c.Request().Context())
-	if err != nil {
-		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
-	}
-	return c.JSON(http.StatusOK, items)
-}
-
 // GetModel godoc
 // @Summary Get a speech model
 // @Tags speech-models
 // @Produce json
 // @Param id path string true "Model ID"
-// @Success 200 {object} audiopkg.SpeechModelResponse
+// @Success 200 {object} tts.SpeechModelResponse
 // @Failure 404 {object} ErrorResponse
 // @Router /speech-models/{id} [get].
-func (h *AudioHandler) GetModel(c echo.Context) error {
+func (h *SpeechHandler) GetModel(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -341,89 +203,15 @@ func (h *AudioHandler) GetModel(c echo.Context) error {
 	return c.JSON(http.StatusOK, resp)
 }

-// UpdateModel godoc
-// @Summary Update a speech model
-// @Tags speech-models
-// @Accept json
-// @Produce json
-// @Param id path string true "Model ID"
-// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
-// @Success 200 {object} audiopkg.SpeechModelResponse
-// @Failure 400 {object} ErrorResponse
-// @Failure 500 {object} ErrorResponse
-// @Router /speech-models/{id} [put].
-func (h *AudioHandler) UpdateModel(c echo.Context) error {
-	id := strings.TrimSpace(c.Param("id"))
-	if id == "" {
-		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
-	}
-	var req audiopkg.UpdateSpeechModelRequest
-	if err := c.Bind(&req); err != nil {
-		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
-	}
-	resp, err := h.service.UpdateSpeechModel(c.Request().Context(), id, req)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
-	}
-	return c.JSON(http.StatusOK, resp)
-}
-
-// GetTranscriptionModel godoc
-// @Summary Get a transcription model
-// @Tags transcription-models
-// @Produce json
-// @Param id path string true "Model ID"
-// @Success 200 {object} audiopkg.TranscriptionModelResponse
-// @Failure 404 {object} ErrorResponse
-// @Router /transcription-models/{id} [get].
-func (h *AudioHandler) GetTranscriptionModel(c echo.Context) error {
-	id := strings.TrimSpace(c.Param("id"))
-	if id == "" {
-		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
-	}
-	resp, err := h.service.GetTranscriptionModel(c.Request().Context(), id)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusNotFound, err.Error())
-	}
-	return c.JSON(http.StatusOK, resp)
-}
-
-// UpdateTranscriptionModel godoc
-// @Summary Update a transcription model
-// @Tags transcription-models
-// @Accept json
-// @Produce json
-// @Param id path string true "Model ID"
-// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
-// @Success 200 {object} audiopkg.TranscriptionModelResponse
-// @Failure 400 {object} ErrorResponse
-// @Failure 500 {object} ErrorResponse
-// @Router /transcription-models/{id} [put].
-func (h *AudioHandler) UpdateTranscriptionModel(c echo.Context) error {
-	id := strings.TrimSpace(c.Param("id"))
-	if id == "" {
-		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
-	}
-	var req audiopkg.UpdateSpeechModelRequest
-	if err := c.Bind(&req); err != nil {
-		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
-	}
-	resp, err := h.service.UpdateTranscriptionModel(c.Request().Context(), id, req)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
-	}
-	return c.JSON(http.StatusOK, resp)
-}
-
 // GetModelCapabilities godoc
 // @Summary Get speech model capabilities
 // @Tags speech-models
 // @Produce json
 // @Param id path string true "Model ID"
-// @Success 200 {object} audiopkg.ModelCapabilities
+// @Success 200 {object} tts.ModelCapabilities
 // @Failure 404 {object} ErrorResponse
 // @Router /speech-models/{id}/capabilities [get].
-func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
+func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -435,26 +223,6 @@ func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
 	return c.JSON(http.StatusOK, caps)
 }

-// GetTranscriptionModelCapabilities godoc
-// @Summary Get transcription model capabilities
-// @Tags transcription-models
-// @Produce json
-// @Param id path string true "Model ID"
-// @Success 200 {object} audiopkg.ModelCapabilities
-// @Failure 404 {object} ErrorResponse
-// @Router /transcription-models/{id}/capabilities [get].
-func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
-	id := strings.TrimSpace(c.Param("id"))
-	if id == "" {
-		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
-	}
-	caps, err := h.service.GetTranscriptionModelCapabilities(c.Request().Context(), id)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusNotFound, err.Error())
-	}
-	return c.JSON(http.StatusOK, caps)
-}
-
 // TestModel godoc
 // @Summary Test speech model synthesis
 // @Description Synthesize text using a specific model's config and return audio
@@ -462,17 +230,17 @@ func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
 // @Accept json
 // @Produce application/octet-stream
 // @Param id path string true "Model ID"
-// @Param request body audiopkg.TestSynthesizeRequest true "Text to synthesize"
+// @Param request body tts.TestSynthesizeRequest true "Text to synthesize"
 // @Success 200 {file} binary "Audio data"
 // @Failure 400 {object} ErrorResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-models/{id}/test [post].
-func (h *AudioHandler) TestModel(c echo.Context) error {
+func (h *SpeechHandler) TestModel(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
 	}
-	var req audiopkg.TestSynthesizeRequest
+	var req tts.TestSynthesizeRequest
 	if err := c.Bind(&req); err != nil {
 		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
 	}
@@ -490,69 +258,3 @@ func (h *AudioHandler) TestModel(c echo.Context) error {
 	}
 	return c.Blob(http.StatusOK, contentType, audio)
 }
-
-// TestTranscriptionModel godoc
-// @Summary Test transcription model recognition
-// @Description Transcribe uploaded audio using a specific model's config and return structured text output
-// @Tags transcription-models
-// @Accept mpfd
-// @Produce json
-// @Param id path string true "Model ID"
-// @Param file formData file true "Audio file"
-// @Param config formData string false "Optional JSON config"
-// @Success 200 {object} audiopkg.TestTranscriptionResponse
-// @Failure 400 {object} ErrorResponse
-// @Failure 500 {object} ErrorResponse
-// @Router /transcription-models/{id}/test [post].
-func (h *AudioHandler) TestTranscriptionModel(c echo.Context) error {
-	id := strings.TrimSpace(c.Param("id"))
-	if id == "" {
-		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
-	}
-	file, err := c.FormFile("file")
-	if err != nil {
-		return echo.NewHTTPError(http.StatusBadRequest, "file is required")
-	}
-	src, err := file.Open()
-	if err != nil {
-		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
-	}
-	defer func(src multipart.File) {
-		err := src.Close()
-		if err != nil {
-			h.logger.Warn("failed to close uploaded file", slog.Any("error", err))
-		}
-	}(src)
-	audio, err := io.ReadAll(src)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
-	}
-	var cfg map[string]any
-	if raw := strings.TrimSpace(c.FormValue("config")); raw != "" {
-		if err := json.Unmarshal([]byte(raw), &cfg); err != nil {
-			return echo.NewHTTPError(http.StatusBadRequest, "invalid config")
-		}
-	}
-	result, err := h.service.Transcribe(c.Request().Context(), id, audio, file.Filename, file.Header.Get("Content-Type"), cfg)
-	if err != nil {
-		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
-	}
-	resp := audiopkg.TestTranscriptionResponse{
-		Text:            result.Text,
-		Language:        result.Language,
-		DurationSeconds: result.DurationSeconds,
-		Metadata:        result.ProviderMetadata,
-	}
-	if len(result.Words) > 0 {
-		resp.Words = make([]audiopkg.TranscriptionWord, 0, len(result.Words))
-		for _, word := range result.Words {
-			resp.Words = append(resp.Words, audiopkg.TranscriptionWord{
-				Text:      word.Text,
-				Start:     word.Start,
-				End:       word.End,
-				SpeakerID: word.SpeakerID,
-			})
-		}
-	}
-	return c.JSON(http.StatusOK, resp)
-}