mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-25 07:00:48 +09:00
8d5c38f0e5
* refactor: unify providers and models tables
- Rename `llm_providers` → `providers`, `llm_provider_oauth_tokens` → `provider_oauth_tokens`
- Remove `tts_providers` and `tts_models` tables; speech models now live in the unified `models` table with `type = 'speech'`
- Replace top-level `api_key`/`base_url` columns with a JSONB `config` field on `providers`
- Rename `llm_provider_id` → `provider_id` across all references
- Add `edge-speech` client type and `conf/providers/edge.yaml` default provider
- Create new read-only speech endpoints (`/speech-providers`, `/speech-models`) backed by filtered views of the unified tables
- Remove old TTS CRUD handlers; simplify speech page to read-only + test
- Update registry loader to skip malformed YAML files instead of failing entirely
- Fix YAML quoting for model names containing colons in openrouter.yaml
- Regenerate sqlc, swagger, and TypeScript SDK
* fix: exclude speech providers from providers list endpoint
ListProviders now filters out client_type matching '%-speech' so Edge
and future speech providers no longer appear on the Providers page.
ListSpeechProviders uses the same pattern match instead of hard-coding
'edge-speech'.
* fix: use explicit client_type list instead of LIKE pattern
Replace '%-speech' pattern with explicit IN ('edge-speech') for both
ListProviders (exclusion) and ListSpeechProviders (inclusion). New
speech client types must be added to both queries.
* fix: use EXECUTE for dynamic SQL in migrations referencing old schema
PL/pgSQL pre-validates column/table references in static SQL statements
inside DO blocks before evaluating IF/RETURN guards. This caused
migrations 0010-0061 to fail on fresh databases where the canonical
schema uses `providers`/`provider_id` instead of `llm_providers`/
`llm_provider_id`.
Wrap all SQL that references potentially non-existent old schema objects
(llm_providers, llm_provider_id, tts_providers, tts_models, etc.) in
EXECUTE strings so they are only parsed at runtime when actually reached.
* fix: revert canonical schema to use llm_providers for migration compatibility
The CI migrations workflow (up → down → up) failed because 0061 down
renames `providers` back to `llm_providers`, but 0001 down only dropped
`providers` — leaving `llm_providers` as a remnant. On the second
migrate up, 0010 found the stale `llm_providers` and tried to reference
`models.llm_provider_id` which no longer existed.
Revert 0001 canonical schema to use original names (llm_providers,
tts_providers, tts_models) so incremental migrations work naturally and
0061 handles the final rename. Remove EXECUTE wrappers and unnecessary
guards from migrations that now always operate on llm_providers.
* fix: icons
* fix: sync canonical schema with 0061 migration to fix sqlc column mismatch
0001_init.up.sql still used old names (llm_providers, llm_provider_id)
and included dropped tts_providers/tts_models tables. sqlc could not
parse the PL/pgSQL EXECUTE in migration 0061, so generated code retained
stale columns (input_modalities, supports_reasoning) causing runtime
"column does not exist" errors when adding models.
- Update 0001_init.up.sql to current schema (providers, provider_id,
no tts tables, add provider_oauth_tokens)
- Use ALTER TABLE IF EXISTS in 0010/0041/0042 for backward compat
- Regenerate sqlc
* fix: guard all legacy migrations against fresh schema for CI compat
On fresh databases, 0001_init.up.sql creates providers/provider_id
(not llm_providers/llm_provider_id). Migrations 0013, 0041, 0046, 0047
referenced the old names without guards, causing CI migration failures.
- 0013: check llm_provider_id column exists before adding old constraint
- 0041: check llm_providers table exists before backfill/constraint DDL
- 0046: wrap CREATE TABLE in DO block with llm_providers existence check
- 0047: use ALTER TABLE IF EXISTS + DO block guard
368 lines
10 KiB
Go
368 lines
10 KiB
Go
package tts
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"strings"
|
|
|
|
"github.com/memohai/memoh/internal/db"
|
|
"github.com/memohai/memoh/internal/db/sqlc"
|
|
)
|
|
|
|
type Service struct {
|
|
queries *sqlc.Queries
|
|
logger *slog.Logger
|
|
registry *Registry
|
|
}
|
|
|
|
func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
|
|
return &Service{
|
|
queries: queries,
|
|
logger: log.With(slog.String("service", "tts")),
|
|
registry: registry,
|
|
}
|
|
}
|
|
|
|
// Registry exposes the adapter registry this service was constructed with.
func (s *Service) Registry() *Registry {
	return s.registry
}
|
|
|
|
func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
|
|
return s.registry.ListMeta()
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Read helpers (speech-filtered views of unified tables)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// ListSpeechProviders returns providers with speech client types.
|
|
func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
|
|
rows, err := s.queries.ListSpeechProviders(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("list speech providers: %w", err)
|
|
}
|
|
items := make([]SpeechProviderResponse, 0, len(rows))
|
|
for _, row := range rows {
|
|
items = append(items, toSpeechProviderResponse(row))
|
|
}
|
|
return items, nil
|
|
}
|
|
|
|
// ListSpeechModels returns all speech-type models.
|
|
func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse, error) {
|
|
rows, err := s.queries.ListSpeechModels(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("list speech models: %w", err)
|
|
}
|
|
items := make([]SpeechModelResponse, 0, len(rows))
|
|
for _, row := range rows {
|
|
items = append(items, toSpeechModelFromListRow(row))
|
|
}
|
|
return items, nil
|
|
}
|
|
|
|
// ListSpeechModelsByProvider returns speech models for a given provider.
|
|
func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
|
|
pgID, err := db.ParseUUID(providerID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
rows, err := s.queries.ListSpeechModelsByProviderID(ctx, pgID)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("list speech models by provider: %w", err)
|
|
}
|
|
items := make([]SpeechModelResponse, 0, len(rows))
|
|
for _, row := range rows {
|
|
items = append(items, toSpeechModelFromModel(row, ""))
|
|
}
|
|
return items, nil
|
|
}
|
|
|
|
// GetSpeechModel returns a speech model by ID.
|
|
func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
|
|
pgID, err := db.ParseUUID(id)
|
|
if err != nil {
|
|
return SpeechModelResponse{}, err
|
|
}
|
|
row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
|
if err != nil {
|
|
return SpeechModelResponse{}, fmt.Errorf("get speech model: %w", err)
|
|
}
|
|
return toSpeechModelWithProviderResponse(row), nil
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Synthesis
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Synthesize runs text-to-speech using the saved model config, optionally
|
|
// overridden by fields in overrideCfg. Returns raw audio bytes.
|
|
func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
|
|
pgID, err := db.ParseUUID(modelID)
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("get speech model: %w", err)
|
|
}
|
|
adapterType := clientTypeToTtsType(modelRow.ProviderType)
|
|
adapter, err := s.registry.Get(adapterType)
|
|
if err != nil {
|
|
return nil, "", fmt.Errorf("unsupported provider: %s", modelRow.ProviderType)
|
|
}
|
|
|
|
savedCfg := parseModelConfig(modelRow.Config)
|
|
for k, v := range overrideCfg {
|
|
savedCfg[k] = v
|
|
}
|
|
|
|
audioCfg := buildAudioConfig(savedCfg)
|
|
if err := audioCfg.Validate(); err != nil {
|
|
return nil, "", fmt.Errorf("invalid audio config: %w", err)
|
|
}
|
|
|
|
resolvedModel, _ := adapter.ResolveModel(modelRow.ModelID)
|
|
audio, synthErr := adapter.Synthesize(ctx, text, resolvedModel, audioCfg)
|
|
if synthErr != nil {
|
|
return nil, "", fmt.Errorf("synthesize: %w", synthErr)
|
|
}
|
|
|
|
contentType := resolveContentType(audioCfg.Format)
|
|
return audio, contentType, nil
|
|
}
|
|
|
|
// StreamToFile runs text-to-speech using Stream() and writes audio chunks
|
|
// directly to the given writer, keeping peak memory low for large audio.
|
|
func (s *Service) StreamToFile(ctx context.Context, modelID string, text string, w io.Writer) (string, error) {
|
|
pgID, err := db.ParseUUID(modelID)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
|
if err != nil {
|
|
return "", fmt.Errorf("get speech model: %w", err)
|
|
}
|
|
adapterType := clientTypeToTtsType(modelRow.ProviderType)
|
|
adapter, err := s.registry.Get(adapterType)
|
|
if err != nil {
|
|
return "", fmt.Errorf("unsupported provider: %s", modelRow.ProviderType)
|
|
}
|
|
|
|
savedCfg := parseModelConfig(modelRow.Config)
|
|
audioCfg := buildAudioConfig(savedCfg)
|
|
if err := audioCfg.Validate(); err != nil {
|
|
return "", fmt.Errorf("invalid audio config: %w", err)
|
|
}
|
|
|
|
resolvedModel, _ := adapter.ResolveModel(modelRow.ModelID)
|
|
dataCh, errCh := adapter.Stream(ctx, text, resolvedModel, audioCfg)
|
|
if dataCh == nil {
|
|
select {
|
|
case streamErr := <-errCh:
|
|
return "", fmt.Errorf("stream: %w", streamErr)
|
|
default:
|
|
return "", errors.New("stream returned nil channels")
|
|
}
|
|
}
|
|
|
|
for chunk := range dataCh {
|
|
if _, writeErr := w.Write(chunk); writeErr != nil {
|
|
return "", fmt.Errorf("write chunk: %w", writeErr)
|
|
}
|
|
}
|
|
if streamErr, ok := <-errCh; ok && streamErr != nil {
|
|
return "", fmt.Errorf("stream: %w", streamErr)
|
|
}
|
|
|
|
return resolveContentType(audioCfg.Format), nil
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Capabilities
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// GetModelCapabilities returns the adapter-level capabilities for a stored model.
|
|
func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
|
|
pgID, err := db.ParseUUID(modelID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("get speech model: %w", err)
|
|
}
|
|
adapterType := clientTypeToTtsType(modelRow.ProviderType)
|
|
adapter, err := s.registry.Get(adapterType)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unsupported provider: %s", modelRow.ProviderType)
|
|
}
|
|
for _, m := range adapter.Models() {
|
|
if m.ID == modelRow.ModelID {
|
|
return &m.Capabilities, nil
|
|
}
|
|
}
|
|
return nil, fmt.Errorf("model %s not found in adapter", modelRow.ModelID)
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// clientTypeToTtsType maps the unified client_type to the TTS adapter type.
|
|
func clientTypeToTtsType(clientType string) TtsType {
|
|
switch clientType {
|
|
case "edge-speech":
|
|
return "edge"
|
|
default:
|
|
return TtsType(clientType)
|
|
}
|
|
}
|
|
|
|
// parseModelConfig decodes a model's raw JSON config into a map.
//
// It always returns a non-nil, writable map: empty input, JSON that fails to
// decode into an object, and a JSON null all produce a fresh empty map.
// Decode errors are deliberately not reported to the caller.
func parseModelConfig(raw []byte) map[string]any {
	if len(raw) != 0 {
		var decoded map[string]any
		if json.Unmarshal(raw, &decoded) == nil && decoded != nil {
			return decoded
		}
	}
	return map[string]any{}
}
|
|
|
|
func buildAudioConfig(cfg map[string]any) AudioConfig {
|
|
ac := AudioConfig{}
|
|
if voice, ok := cfg["voice"].(map[string]any); ok {
|
|
if id, ok := voice["id"].(string); ok {
|
|
ac.Voice.ID = id
|
|
}
|
|
if lang, ok := voice["lang"].(string); ok {
|
|
ac.Voice.Lang = lang
|
|
}
|
|
}
|
|
if format, ok := cfg["format"].(string); ok {
|
|
ac.Format = format
|
|
}
|
|
if speed, ok := toFloat(cfg["speed"]); ok {
|
|
ac.Speed = speed
|
|
}
|
|
if pitch, ok := toFloat(cfg["pitch"]); ok {
|
|
ac.Pitch = pitch
|
|
}
|
|
if sr, ok := toFloat(cfg["sample_rate"]); ok {
|
|
ac.SampleRate = int(sr)
|
|
}
|
|
return ac
|
|
}
|
|
|
|
// toFloat converts a dynamically typed numeric value to float64.
//
// It accepts every built-in Go integer and floating-point type as well as
// json.Number, so values arrive intact whether they came from a JSON decode
// (float64, or json.Number when UseNumber is enabled) or were placed in the
// map directly by Go code using any integer width. The boolean result is
// false for nil, for non-numeric types, and for a json.Number that does not
// parse as a float.
func toFloat(v any) (float64, bool) {
	switch n := v.(type) {
	case float64:
		return n, true
	case float32:
		return float64(n), true
	case int:
		return float64(n), true
	case int8:
		return float64(n), true
	case int16:
		return float64(n), true
	case int32:
		return float64(n), true
	case int64:
		return float64(n), true
	case uint:
		return float64(n), true
	case uint8:
		return float64(n), true
	case uint16:
		return float64(n), true
	case uint32:
		return float64(n), true
	case uint64:
		return float64(n), true
	case json.Number:
		f, err := n.Float64()
		if err != nil {
			return 0, false
		}
		return f, true
	default:
		return 0, false
	}
}
|
|
|
|
// resolveContentType picks the MIME type for an audio format name by
// case-sensitive substring match. Markers are tried in the fixed order mp3,
// opus, ogg, webm, wav, and the first hit wins; a format matching none of
// them (including the empty string) maps to "audio/mpeg".
func resolveContentType(format string) string {
	// Order matters: a name containing several markers resolves to the
	// earliest entry in this table.
	table := [...]struct {
		marker string
		mime   string
	}{
		{"mp3", "audio/mpeg"},
		{"opus", "audio/opus"},
		{"ogg", "audio/ogg"},
		{"webm", "audio/webm"},
		{"wav", "audio/wav"},
	}
	for _, entry := range table {
		if strings.Contains(format, entry.marker) {
			return entry.mime
		}
	}
	return "audio/mpeg"
}
|
|
|
|
func toSpeechProviderResponse(row sqlc.Provider) SpeechProviderResponse {
|
|
return SpeechProviderResponse{
|
|
ID: row.ID.String(),
|
|
Name: row.Name,
|
|
ClientType: row.ClientType,
|
|
Enable: row.Enable,
|
|
CreatedAt: row.CreatedAt.Time,
|
|
UpdatedAt: row.UpdatedAt.Time,
|
|
}
|
|
}
|
|
|
|
func toSpeechModelFromListRow(row sqlc.ListSpeechModelsRow) SpeechModelResponse {
|
|
var cfg map[string]any
|
|
if len(row.Config) > 0 {
|
|
_ = json.Unmarshal(row.Config, &cfg)
|
|
}
|
|
name := ""
|
|
if row.Name.Valid {
|
|
name = row.Name.String
|
|
}
|
|
return SpeechModelResponse{
|
|
ID: row.ID.String(),
|
|
ModelID: row.ModelID,
|
|
Name: name,
|
|
ProviderID: row.ProviderID.String(),
|
|
ProviderType: row.ProviderType,
|
|
Config: cfg,
|
|
CreatedAt: row.CreatedAt.Time,
|
|
UpdatedAt: row.UpdatedAt.Time,
|
|
}
|
|
}
|
|
|
|
func toSpeechModelFromModel(row sqlc.Model, providerType string) SpeechModelResponse {
|
|
var cfg map[string]any
|
|
if len(row.Config) > 0 {
|
|
_ = json.Unmarshal(row.Config, &cfg)
|
|
}
|
|
name := ""
|
|
if row.Name.Valid {
|
|
name = row.Name.String
|
|
}
|
|
return SpeechModelResponse{
|
|
ID: row.ID.String(),
|
|
ModelID: row.ModelID,
|
|
Name: name,
|
|
ProviderID: row.ProviderID.String(),
|
|
ProviderType: providerType,
|
|
Config: cfg,
|
|
CreatedAt: row.CreatedAt.Time,
|
|
UpdatedAt: row.UpdatedAt.Time,
|
|
}
|
|
}
|
|
|
|
func toSpeechModelWithProviderResponse(row sqlc.GetSpeechModelWithProviderRow) SpeechModelResponse {
|
|
var cfg map[string]any
|
|
if len(row.Config) > 0 {
|
|
_ = json.Unmarshal(row.Config, &cfg)
|
|
}
|
|
name := ""
|
|
if row.Name.Valid {
|
|
name = row.Name.String
|
|
}
|
|
return SpeechModelResponse{
|
|
ID: row.ID.String(),
|
|
ModelID: row.ModelID,
|
|
Name: name,
|
|
ProviderID: row.ProviderID.String(),
|
|
ProviderType: row.ProviderType,
|
|
Config: cfg,
|
|
CreatedAt: row.CreatedAt.Time,
|
|
UpdatedAt: row.UpdatedAt.Time,
|
|
}
|
|
}
|