Files
Memoh/internal/audio/types.go
T
Acbox c9dcfe287f Feat/speech support (#392)
* feat: expand speech provider support with new client types and configuration schema

* feat: add icon support for speech providers and update related configurations

* feat: add SVG support for Deepgram and Elevenlabs with Vue components

* feat: exclude *-speech client types from LLM providers

* feat: enhance speech provider functionality with advanced settings and model import capabilities

* chore: remove go.mod replace

* feat: enhance speech provider functionality with advanced settings and model import capabilities

* chore: update go module dependencies

* feat: Ear and Mouth

* fix: separate ear/mouth page

* fix: separate audio domain and restore transcription templates

Move speech and transcription internals into the audio domain, restore template-driven transcription providers, and regenerate Swagger/SDK so the frontend can stop hand-calling /transcription-* APIs.

---------

Co-authored-by: aki <arisu@ieee.org>
2026-04-22 00:09:46 +08:00

103 lines
4.2 KiB
Go

package audio
import "time"
// ProviderMetaResponse carries the metadata an adapter publishes about
// itself. The values come from the registry, not from the database.
//
// Provider, DisplayName and Description are always serialized; every other
// field is tagged omitempty.
type ProviderMetaResponse struct {
	// Identity and human-readable labels.
	Provider    string `json:"provider"`
	DisplayName string `json:"display_name"`
	Description string `json:"description"`

	// NOTE(review): omitempty only takes effect for map, slice, pointer,
	// interface, string, numeric and bool kinds. ConfigSchema is declared
	// outside this section; if it is a struct the tag option is a no-op.
	ConfigSchema ConfigSchema `json:"config_schema,omitempty"`

	// Unqualified default model and model list.
	DefaultModel string      `json:"default_model,omitempty"`
	Models       []ModelInfo `json:"models,omitempty"`

	// Synthesis-specific default, model list and listing flag. A false
	// SupportsSynthesisList is dropped from the JSON output.
	DefaultSynthesisModel string      `json:"default_synthesis_model,omitempty"`
	SynthesisModels       []ModelInfo `json:"synthesis_models,omitempty"`
	SupportsSynthesisList bool        `json:"supports_synthesis_list,omitempty"`

	// Transcription-specific default, model list and listing flag. A false
	// SupportsTranscriptionList is dropped from the JSON output.
	DefaultTranscriptionModel string      `json:"default_transcription_model,omitempty"`
	TranscriptionModels       []ModelInfo `json:"transcription_models,omitempty"`
	SupportsTranscriptionList bool        `json:"supports_transcription_list,omitempty"`
}
// SpeechProviderResponse is the API representation of a speech-capable
// provider stored in the unified providers table.
type SpeechProviderResponse struct {
	// Identity fields. Icon is left out of the JSON when it is empty.
	ID         string `json:"id"`
	Name       string `json:"name"`
	ClientType string `json:"client_type"`
	Icon       string `json:"icon,omitempty"`

	// Enable is always serialized, including when false. Config is omitted
	// when the map is nil or empty.
	Enable bool           `json:"enable"`
	Config map[string]any `json:"config,omitempty"`

	// Timestamps, always serialized.
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
// SpeechModelResponse is the API representation of a speech model stored in
// the unified models table.
type SpeechModelResponse struct {
	// Model identity.
	ID      string `json:"id"`
	ModelID string `json:"model_id"`
	Name    string `json:"name"`

	// Owning provider. ProviderType is left out of the JSON when empty.
	ProviderID   string `json:"provider_id"`
	ProviderType string `json:"provider_type,omitempty"`

	// Config is omitted when the map is nil or empty.
	Config map[string]any `json:"config,omitempty"`

	// Timestamps, always serialized.
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
// TranscriptionModelResponse is the API representation of a transcription
// model stored in the unified models table.
type TranscriptionModelResponse struct {
	// Model identity.
	ID      string `json:"id"`
	ModelID string `json:"model_id"`
	Name    string `json:"name"`

	// Owning provider. ProviderType is left out of the JSON when empty.
	ProviderID   string `json:"provider_id"`
	ProviderType string `json:"provider_type,omitempty"`

	// Config is omitted when the map is nil or empty.
	Config map[string]any `json:"config,omitempty"`

	// Timestamps, always serialized.
	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
// UpdateSpeechProviderRequest is the payload for updating a speech provider.
//
// Both fields are pointers: a key that is absent from the decoded JSON
// leaves the pointer nil, which is distinguishable from an explicit zero
// value such as "enable": false. Nil pointers are omitted when encoding.
type UpdateSpeechProviderRequest struct {
	Name   *string `json:"name,omitempty"`
	Enable *bool   `json:"enable,omitempty"`
}
// UpdateSpeechModelRequest is the payload for updating a speech model.
//
// Name is a pointer, so an absent "name" key decodes to nil rather than to
// an empty string. Config is omitted from encoded output when nil or empty.
type UpdateSpeechModelRequest struct {
	Name   *string        `json:"name,omitempty"`
	Config map[string]any `json:"config,omitempty"`
}
// TestSynthesizeRequest is the payload of a text-to-speech test call.
//
// Text is always serialized, even when empty; Config is omitted when the
// map is nil or empty.
type TestSynthesizeRequest struct {
	Text   string         `json:"text"`
	Config map[string]any `json:"config,omitempty"`
}
// TestTranscriptionRequest is the JSON payload of an audio-to-text test
// call. It carries only an optional Config map, which is omitted from
// encoded output when nil or empty.
type TestTranscriptionRequest struct {
	Config map[string]any `json:"config,omitempty"`
}
// TestTranscriptionResponse is the result returned by a transcription test.
//
// Text is always serialized. All other fields are tagged omitempty, so an
// empty Language, a zero DurationSeconds, an empty Words slice or an empty
// Metadata map is left out of the JSON.
type TestTranscriptionResponse struct {
	// Transcribed text and its language.
	Text     string `json:"text"`
	Language string `json:"language,omitempty"`

	// Audio duration in seconds and per-word alignment entries.
	DurationSeconds float64             `json:"duration_seconds,omitempty"`
	Words           []TranscriptionWord `json:"words,omitempty"`

	// Free-form additional data.
	Metadata map[string]any `json:"metadata,omitempty"`
}
// TranscriptionWord is one word-alignment entry of a transcription result.
//
// Start and End are tagged omitempty, so an offset of exactly 0 is dropped
// from the encoded JSON: a word with Start == 0 is serialized without a
// "start" key. Consumers must treat a missing key as zero.
type TranscriptionWord struct {
	Text      string  `json:"text"`
	Start     float64 `json:"start,omitempty"`
	End       float64 `json:"end,omitempty"`
	SpeakerID string  `json:"speaker_id,omitempty"`
}
// ImportModelsResponse is the response returned after importing speech
// models.
//
// No field is tagged omitempty: zero counters are serialized as 0, and a
// nil Models slice is serialized as JSON null rather than [].
type ImportModelsResponse struct {
	Created int      `json:"created"`
	Skipped int      `json:"skipped"`
	Models  []string `json:"models"`
}