mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-27 07:16:19 +09:00
627b673a5c
* refactor: restructure memory into multi-provider adapters, remove manifest.json dependency - Rename internal/memory/provider to internal/memory/adapters with per-provider subdirectories (builtin, mem0, openviking) - Replace manifest.json-based delete/update with scan-based index from daily files - Add mem0 and openviking provider adapters with HTTP client, chat hooks, MCP tools, and CRUD - Wire provider lifecycle into registry (auto-instantiate on create, evict on update/delete) - Split docker-compose into base stack + optional overlays (qdrant, browser, mem0, openviking) - Update admin UI to support dynamic provider config schema rendering * chore(lint): fix all golangci-lint issues for clean CI * refactor(docker): replace compose overlay files with profiles * feat(memory): add built-in memory multi modes * fix(ci): golangci lint * feat(memory): edit built-in memory sparse design
415 lines
12 KiB
Go
415 lines
12 KiB
Go
package edge
|
||
|
||
import (
|
||
"bytes"
|
||
"context"
|
||
"crypto/rand"
|
||
"crypto/sha256"
|
||
"crypto/tls"
|
||
"encoding/binary"
|
||
"encoding/hex"
|
||
"errors"
|
||
"fmt"
|
||
"io"
|
||
"net/http"
|
||
"net/url"
|
||
"strings"
|
||
"sync"
|
||
"time"
|
||
|
||
"github.com/google/uuid"
|
||
"github.com/gorilla/websocket"
|
||
|
||
"github.com/memohai/memoh/internal/tts"
|
||
)
|
||
|
||
// Edge TTS WebSocket client.
|
||
// Reference: https://github.com/readest/readest/blob/main/apps/readest-app/src/libs/edgeTTS.ts
|
||
//
|
||
// Protocol flow:
|
||
//
|
||
// Client ── Establish Connection ──────────> Edge TTS Server
|
||
// Client ── speech.config (JSON) ─────────> Edge TTS Server
|
||
// Client ── ssml (XML) ───────────────────> Edge TTS Server
|
||
// Client <─ turn.start / response (Text) ── Edge TTS Server
|
||
// Client <─ Audio binary frames ─────────── Edge TTS Server
|
||
// Client <─ turn.end (Text) ─────────────── Edge TTS Server
|
||
//
|
||
// Important implementation notes:
|
||
//
|
||
// - TLS must negotiate HTTP/1.1 (NextProtos: ["http/1.1"]); the server returns 404 on HTTP/2.
|
||
// - Do NOT set Sec-WebSocket-Version in headers; gorilla/websocket adds it automatically
|
||
// and a duplicate causes "duplicate header not allowed" errors.
|
||
// - ConnectionId (URL param) and X-RequestId (SSML header) MUST be the same 32-hex value.
|
||
// - Each synthesis uses a one-shot connection: connect → config → SSML → audio → turn.end → close.
|
||
// The server closes the WebSocket after turn.end, so connections cannot be reused.
|
||
// - Binary audio frames use big-endian for the 2-byte header-length prefix.
|
||
// JavaScript's DataView.getInt16() defaults to big-endian; using little-endian silently
|
||
// produces wrong offsets, causing all audio chunks to be discarded.
|
||
// - speech.config message must include an X-Timestamp header.
|
||
// - SSML must declare xmlns:mstts and set xml:lang from the voice name.
|
||
|
||
type EdgeWsClient struct {
|
||
conn *websocket.Conn
|
||
connID string
|
||
mu sync.Mutex
|
||
outputFormat string // like audio-24khz-48kbitrate-mono-mp3
|
||
BaseURL string // for mock, empty will use EDGE_SPEECH_URL
|
||
}
|
||
|
||
func NewEdgeWsClient() *EdgeWsClient {
|
||
return &EdgeWsClient{
|
||
outputFormat: "audio-24khz-48kbitrate-mono-mp3",
|
||
}
|
||
}
|
||
|
||
// Generate Sec-MS-GEC
|
||
// @see https://github.com/readest/readest/blob/main/apps/readest-app/src/libs/edgeTTS.ts#L208
|
||
func generateSecMSGec() string {
|
||
ticks := time.Now().Unix() + WIN_EPOCH_OFFSET
|
||
ticks -= ticks % 300
|
||
ticks100ns := ticks * (S_TO_NS / 100)
|
||
strToHash := fmt.Sprintf("%d%s", ticks100ns, EDGE_API_TOKEN)
|
||
sum := sha256.Sum256([]byte(strToHash))
|
||
return strings.ToUpper(hex.EncodeToString(sum[:]))
|
||
}
|
||
|
||
// generateMuid returns 16 random bytes encoded as 32 upper-case hex characters,
// used as the "muid" cookie value.
func generateMuid() string {
	var raw [16]byte
	// Best effort, as before: the result of the random read is deliberately ignored.
	_, _ = rand.Read(raw[:])
	return strings.ToUpper(hex.EncodeToString(raw[:]))
}
|
||
|
||
func (c *EdgeWsClient) buildWsURL() string {
|
||
base := EDGE_SPEECH_URL
|
||
if c.BaseURL != "" {
|
||
base = c.BaseURL
|
||
}
|
||
u, _ := url.Parse(base)
|
||
q := u.Query()
|
||
q.Set("TrustedClientToken", EDGE_API_TOKEN)
|
||
q.Set("Sec-MS-GEC", generateSecMSGec())
|
||
q.Set("Sec-MS-GEC-Version", "1-"+CHROMIUM_FULL_VERSION)
|
||
q.Set("ConnectionId", c.connID)
|
||
u.RawQuery = q.Encode()
|
||
return u.String()
|
||
}
|
||
|
||
func buildWSSHeaders() http.Header {
|
||
h := http.Header{}
|
||
h.Set("User-Agent",
|
||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"+
|
||
" (KHTML, like Gecko) Chrome/"+CHROMIUM_MAJOR_VERSION+".0.0.0 Safari/537.36"+
|
||
" Edg/"+CHROMIUM_MAJOR_VERSION+".0.0.0")
|
||
h.Set("Accept-Encoding", "gzip, deflate, br, zstd")
|
||
h.Set("Accept-Language", "en-US,en;q=0.9")
|
||
h.Set("Pragma", "no-cache")
|
||
h.Set("Cache-Control", "no-cache")
|
||
h.Set("Origin", WSSOrigin)
|
||
h.Set("Cookie", "muid="+generateMuid()+";")
|
||
return h
|
||
}
|
||
|
||
// Connect establishes a new WebSocket connection to Edge TTS.
|
||
// Each call generates a fresh connID (matching readest's one-connection-per-request model).
|
||
// If already connected, this is a no-op; call Close first to force a reconnect.
|
||
func (c *EdgeWsClient) Connect(ctx context.Context) error {
|
||
c.mu.Lock()
|
||
defer c.mu.Unlock()
|
||
if c.conn != nil {
|
||
return nil
|
||
}
|
||
c.connID = strings.ReplaceAll(uuid.New().String(), "-", "")
|
||
|
||
tlsConfig := &tls.Config{
|
||
MinVersion: tls.VersionTLS12,
|
||
NextProtos: []string{"http/1.1"},
|
||
}
|
||
d := websocket.Dialer{
|
||
Proxy: http.ProxyFromEnvironment,
|
||
TLSClientConfig: tlsConfig,
|
||
HandshakeTimeout: 15 * time.Second,
|
||
}
|
||
wsURL := c.buildWsURL()
|
||
reqHeader := buildWSSHeaders()
|
||
conn, resp, err := d.DialContext(ctx, wsURL, reqHeader)
|
||
if err != nil {
|
||
if resp != nil {
|
||
body, _ := io.ReadAll(resp.Body)
|
||
_ = resp.Body.Close()
|
||
return fmt.Errorf("edge tts ws dial: %w (status=%s body=%s)", err, resp.Status, string(bytes.TrimSpace(body)))
|
||
}
|
||
return fmt.Errorf("edge tts ws dial: %w", err)
|
||
}
|
||
c.conn = conn
|
||
return nil
|
||
}
|
||
|
||
// Close closes the WebSocket connection.
|
||
func (c *EdgeWsClient) Close() error {
|
||
c.mu.Lock()
|
||
defer c.mu.Unlock()
|
||
return c.resetLocked()
|
||
}
|
||
|
||
// resetLocked closes the current connection and clears state. Caller must hold c.mu.
|
||
func (c *EdgeWsClient) resetLocked() error {
|
||
conn := c.conn
|
||
c.conn = nil
|
||
if conn == nil {
|
||
return nil
|
||
}
|
||
return conn.Close()
|
||
}
|
||
|
||
// sendFrame send a text frame with Path header (Edge protocol: header + empty line + body).
|
||
func (c *EdgeWsClient) sendFrame(path, contentType, body string, extraHeaders map[string]string) error {
|
||
var b strings.Builder
|
||
b.WriteString("Path: ")
|
||
b.WriteString(path)
|
||
b.WriteString("\r\n")
|
||
b.WriteString("Content-Type: ")
|
||
b.WriteString(contentType)
|
||
b.WriteString("\r\n")
|
||
for k, v := range extraHeaders {
|
||
b.WriteString(k)
|
||
b.WriteString(": ")
|
||
b.WriteString(v)
|
||
b.WriteString("\r\n")
|
||
}
|
||
b.WriteString("\r\n")
|
||
b.WriteString(body)
|
||
return c.conn.WriteMessage(websocket.TextMessage, []byte(b.String()))
|
||
}
|
||
|
||
// Configure sends the speech.config message (output format, etc.).
|
||
func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) error {
|
||
c.mu.Lock()
|
||
defer c.mu.Unlock()
|
||
if c.conn == nil {
|
||
return errors.New("edge tts: not connected")
|
||
}
|
||
if deadline, ok := ctx.Deadline(); ok {
|
||
_ = c.conn.SetWriteDeadline(deadline)
|
||
defer func() { _ = c.conn.SetWriteDeadline(time.Time{}) }()
|
||
}
|
||
format := c.outputFormat
|
||
if config.Format != "" {
|
||
format = config.Format
|
||
}
|
||
// like readest: outputFormat + boolean is JSON false
|
||
body := fmt.Sprintf(`{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},"outputFormat":"%s"}}}}`, format)
|
||
extra := map[string]string{
|
||
"X-Timestamp": time.Now().String(),
|
||
}
|
||
return c.sendFrame("speech.config", "application/json; charset=utf-8", body, extra)
|
||
}
|
||
|
||
// buildSSML builds SSML with rate and pitch for Edge TTS prosody.
|
||
func buildSSML(text string, voice tts.VoiceConfig, speed, pitch float64) string {
|
||
voiceID := voice.ID
|
||
if voiceID == "" {
|
||
voiceID = DEFAULT_VOICE
|
||
}
|
||
lang := voice.Lang
|
||
if lang == "" {
|
||
lang = "en-US"
|
||
}
|
||
|
||
rate := 0
|
||
if speed > 0 {
|
||
rate = int((speed - 1) * 100)
|
||
}
|
||
rateStr := fmt.Sprintf("%+d%%", rate)
|
||
pitchStr := fmt.Sprintf("%+dHz", int(pitch))
|
||
|
||
return fmt.Sprintf(
|
||
`<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="%s">`+
|
||
`<voice name="%s"><prosody rate="%s" pitch="%s">%s</prosody></voice></speak>`,
|
||
lang, voiceID, rateStr, pitchStr, escapeSSML(text))
|
||
}
|
||
|
||
// ssmlEscaper replaces the five XML special characters with their predefined
// entities in a single pass, so the "&" introduced by one replacement is never
// escaped a second time.
var ssmlEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	"'", "&apos;",
	"\"", "&quot;",
)

// escapeSSML escapes s for safe inclusion in SSML text content or attribute
// values: & < > ' " become &amp; &lt; &gt; &apos; &quot;. All other characters
// are returned unchanged.
func escapeSSML(s string) string {
	return ssmlEscaper.Replace(s)
}
|
||
|
||
// Synthesize sends SSML and synchronously collects all audio data.
|
||
// It handles the full lifecycle: connect → configure → send → receive → close.
|
||
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config tts.AudioConfig) ([]byte, error) {
|
||
if err := c.Connect(ctx); err != nil {
|
||
return nil, err
|
||
}
|
||
if err := c.Configure(ctx, config); err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
c.mu.Lock()
|
||
conn := c.conn
|
||
connID := c.connID
|
||
c.mu.Unlock()
|
||
if conn == nil {
|
||
return nil, errors.New("edge tts: not connected")
|
||
}
|
||
|
||
ssml := buildSSML(text, config.Voice, config.Speed, config.Pitch)
|
||
extra := map[string]string{
|
||
"X-RequestId": connID,
|
||
"X-Timestamp": time.Now().String(),
|
||
}
|
||
if err := c.sendFrame("ssml", "application/ssml+xml", ssml, extra); err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
var out []byte
|
||
for {
|
||
select {
|
||
case <-ctx.Done():
|
||
return nil, ctx.Err()
|
||
default:
|
||
}
|
||
mt, data, err := conn.ReadMessage()
|
||
if err != nil {
|
||
if websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) {
|
||
if len(out) > 0 {
|
||
c.mu.Lock()
|
||
_ = c.resetLocked()
|
||
c.mu.Unlock()
|
||
return out, nil
|
||
}
|
||
}
|
||
c.mu.Lock()
|
||
_ = c.resetLocked()
|
||
c.mu.Unlock()
|
||
return nil, fmt.Errorf("edge tts read (format=%q voice=%q): %w", config.Format, config.Voice.ID, err)
|
||
}
|
||
switch mt {
|
||
case websocket.TextMessage:
|
||
if parsePath(data) == "turn.end" {
|
||
c.mu.Lock()
|
||
_ = c.resetLocked()
|
||
c.mu.Unlock()
|
||
return out, nil
|
||
}
|
||
case websocket.BinaryMessage:
|
||
audio, err := parseAudioChunk(data)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if len(audio) > 0 {
|
||
out = append(out, audio...)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// parsePath extracts the value of the Path header from an Edge text frame.
//
// It looks for the first occurrence of "Path:" in data and returns what
// follows, up to the next CRLF (or the end of data when there is none), with
// surrounding whitespace trimmed. It returns "" when "Path:" is absent.
func parsePath(data []byte) string {
	const marker = "Path:"

	start := bytes.Index(data, []byte(marker))
	if start < 0 {
		return ""
	}

	value := data[start+len(marker):]
	if eol := bytes.Index(value, []byte("\r\n")); eol >= 0 {
		value = value[:eol]
	}
	return strings.TrimSpace(string(value))
}
|
||
|
||
// parseAudioChunk extracts the audio payload from an Edge binary frame.
//
// Frame layout: a 2-byte big-endian header length, the header text of that
// length, then the audio data. The returned slice aliases data.
//
// A frame shorter than the 2-byte prefix, or one whose declared header length
// runs past the end of the frame, yields (nil, nil); it is skipped rather than
// treated as an error. The error result is always nil.
func parseAudioChunk(data []byte) ([]byte, error) {
	const prefixLen = 2
	if len(data) < prefixLen {
		return nil, nil
	}

	payloadAt := prefixLen + int(binary.BigEndian.Uint16(data))
	if payloadAt > len(data) {
		return nil, nil
	}
	return data[payloadAt:], nil
}
|
||
|
||
// Stream sends SSML and returns audio chunks via channel.
|
||
// It handles the full lifecycle: connect → configure → send → stream → close.
|
||
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config tts.AudioConfig) (ch chan []byte, errCh chan error) {
|
||
ch = make(chan []byte, 8)
|
||
errCh = make(chan error, 1)
|
||
go func() {
|
||
defer close(ch)
|
||
defer close(errCh)
|
||
|
||
if err := c.Connect(ctx); err != nil {
|
||
errCh <- err
|
||
return
|
||
}
|
||
if err := c.Configure(ctx, config); err != nil {
|
||
errCh <- err
|
||
return
|
||
}
|
||
|
||
c.mu.Lock()
|
||
conn := c.conn
|
||
connID := c.connID
|
||
c.mu.Unlock()
|
||
if conn == nil {
|
||
errCh <- errors.New("edge tts: not connected")
|
||
return
|
||
}
|
||
|
||
ssml := buildSSML(text, config.Voice, config.Speed, config.Pitch)
|
||
extra := map[string]string{
|
||
"X-RequestId": connID,
|
||
"X-Timestamp": time.Now().String(),
|
||
}
|
||
if err := c.sendFrame("ssml", "application/ssml+xml", ssml, extra); err != nil {
|
||
errCh <- err
|
||
return
|
||
}
|
||
|
||
for {
|
||
select {
|
||
case <-ctx.Done():
|
||
errCh <- ctx.Err()
|
||
return
|
||
default:
|
||
}
|
||
mt, data, err := conn.ReadMessage()
|
||
if err != nil {
|
||
errCh <- fmt.Errorf("edge tts read: %w", err)
|
||
return
|
||
}
|
||
switch mt {
|
||
case websocket.TextMessage:
|
||
if parsePath(data) == "turn.end" {
|
||
c.mu.Lock()
|
||
_ = c.resetLocked()
|
||
c.mu.Unlock()
|
||
return
|
||
}
|
||
case websocket.BinaryMessage:
|
||
audio, err := parseAudioChunk(data)
|
||
if err != nil {
|
||
errCh <- err
|
||
return
|
||
}
|
||
if len(audio) > 0 {
|
||
select {
|
||
case ch <- audio:
|
||
case <-ctx.Done():
|
||
errCh <- ctx.Err()
|
||
return
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}()
|
||
return ch, errCh
|
||
}
|