package edge import ( "bytes" "context" "crypto/rand" "crypto/sha256" "crypto/tls" "encoding/binary" "encoding/hex" "errors" "fmt" "io" "net/http" "net/url" "strings" "sync" "time" "github.com/google/uuid" "github.com/gorilla/websocket" "github.com/memohai/memoh/internal/tts" ) // Edge TTS WebSocket client. // Reference: https://github.com/readest/readest/blob/main/apps/readest-app/src/libs/edgeTTS.ts // // Protocol flow: // // Client ── Establish Connection ──────────> Edge TTS Server // Client ── speech.config (JSON) ─────────> Edge TTS Server // Client ── ssml (XML) ───────────────────> Edge TTS Server // Client <─ turn.start / response (Text) ── Edge TTS Server // Client <─ Audio binary frames ─────────── Edge TTS Server // Client <─ turn.end (Text) ─────────────── Edge TTS Server // // Important implementation notes: // // - TLS must negotiate HTTP/1.1 (NextProtos: ["http/1.1"]); the server returns 404 on HTTP/2. // - Do NOT set Sec-WebSocket-Version in headers; gorilla/websocket adds it automatically // and a duplicate causes "duplicate header not allowed" errors. // - ConnectionId (URL param) and X-RequestId (SSML header) MUST be the same 32-hex value. // - Each synthesis uses a one-shot connection: connect → config → SSML → audio → turn.end → close. // The server closes the WebSocket after turn.end, so connections cannot be reused. // - Binary audio frames use big-endian for the 2-byte header-length prefix. // JavaScript's DataView.getInt16() defaults to big-endian; using little-endian silently // produces wrong offsets, causing all audio chunks to be discarded. // - speech.config message must include an X-Timestamp header. // - SSML must declare xmlns:mstts and set xml:lang from the voice name. type EdgeWsClient struct { conn *websocket.Conn connID string mu sync.Mutex outputFormat string // like audio-24khz-48kbitrate-mono-mp3 BaseURL string // for mock, empty will use EDGE_SPEECH_URL } func NewEdgeWsClient() *EdgeWsClient { return &EdgeWsClient{ outputFormat: "audio-24khz-48kbitrate-mono-mp3", } } // Generate Sec-MS-GEC // @see https://github.com/readest/readest/blob/main/apps/readest-app/src/libs/edgeTTS.ts#L208 func generateSecMSGec() string { ticks := time.Now().Unix() + WIN_EPOCH_OFFSET ticks -= ticks % 300 ticks100ns := ticks * (S_TO_NS / 100) strToHash := fmt.Sprintf("%d%s", ticks100ns, EDGE_API_TOKEN) sum := sha256.Sum256([]byte(strToHash)) return strings.ToUpper(hex.EncodeToString(sum[:])) } // generateMuid MUID,for Cookie. func generateMuid() string { b := make([]byte, 16) _, _ = rand.Read(b) return strings.ToUpper(hex.EncodeToString(b)) } func (c *EdgeWsClient) buildWsURL() string { base := EDGE_SPEECH_URL if c.BaseURL != "" { base = c.BaseURL } u, _ := url.Parse(base) q := u.Query() q.Set("TrustedClientToken", EDGE_API_TOKEN) q.Set("Sec-MS-GEC", generateSecMSGec()) q.Set("Sec-MS-GEC-Version", "1-"+CHROMIUM_FULL_VERSION) q.Set("ConnectionId", c.connID) u.RawQuery = q.Encode() return u.String() } func buildWSSHeaders() http.Header { h := http.Header{} h.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"+ " (KHTML, like Gecko) Chrome/"+CHROMIUM_MAJOR_VERSION+".0.0.0 Safari/537.36"+ " Edg/"+CHROMIUM_MAJOR_VERSION+".0.0.0") h.Set("Accept-Encoding", "gzip, deflate, br, zstd") h.Set("Accept-Language", "en-US,en;q=0.9") h.Set("Pragma", "no-cache") h.Set("Cache-Control", "no-cache") h.Set("Origin", WSSOrigin) h.Set("Cookie", "muid="+generateMuid()+";") return h } // Connect establishes a new WebSocket connection to Edge TTS. // Each call generates a fresh connID (matching readest's one-connection-per-request model). // If already connected, this is a no-op; call Close first to force a reconnect. func (c *EdgeWsClient) Connect(ctx context.Context) error { c.mu.Lock() defer c.mu.Unlock() if c.conn != nil { return nil } c.connID = strings.ReplaceAll(uuid.New().String(), "-", "") tlsConfig := &tls.Config{ MinVersion: tls.VersionTLS12, NextProtos: []string{"http/1.1"}, } d := websocket.Dialer{ Proxy: http.ProxyFromEnvironment, TLSClientConfig: tlsConfig, HandshakeTimeout: 15 * time.Second, } wsURL := c.buildWsURL() reqHeader := buildWSSHeaders() conn, resp, err := d.DialContext(ctx, wsURL, reqHeader) if err != nil { if resp != nil { body, _ := io.ReadAll(resp.Body) _ = resp.Body.Close() return fmt.Errorf("edge tts ws dial: %w (status=%s body=%s)", err, resp.Status, string(bytes.TrimSpace(body))) } return fmt.Errorf("edge tts ws dial: %w", err) } c.conn = conn return nil } // Close closes the WebSocket connection. func (c *EdgeWsClient) Close() error { c.mu.Lock() defer c.mu.Unlock() return c.resetLocked() } // resetLocked closes the current connection and clears state. Caller must hold c.mu. func (c *EdgeWsClient) resetLocked() error { conn := c.conn c.conn = nil if conn == nil { return nil } return conn.Close() } // sendFrame send a text frame with Path header (Edge protocol: header + empty line + body). func (c *EdgeWsClient) sendFrame(path, contentType, body string, extraHeaders map[string]string) error { var b strings.Builder b.WriteString("Path: ") b.WriteString(path) b.WriteString("\r\n") b.WriteString("Content-Type: ") b.WriteString(contentType) b.WriteString("\r\n") for k, v := range extraHeaders { b.WriteString(k) b.WriteString(": ") b.WriteString(v) b.WriteString("\r\n") } b.WriteString("\r\n") b.WriteString(body) return c.conn.WriteMessage(websocket.TextMessage, []byte(b.String())) } // Configure sends the speech.config message (output format, etc.). func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) error { c.mu.Lock() defer c.mu.Unlock() if c.conn == nil { return errors.New("edge tts: not connected") } if deadline, ok := ctx.Deadline(); ok { _ = c.conn.SetWriteDeadline(deadline) defer func() { _ = c.conn.SetWriteDeadline(time.Time{}) }() } format := c.outputFormat if config.Format != "" { format = config.Format } // like readest: outputFormat + boolean is JSON false body := fmt.Sprintf(`{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},"outputFormat":"%s"}}}}`, format) extra := map[string]string{ "X-Timestamp": time.Now().String(), } return c.sendFrame("speech.config", "application/json; charset=utf-8", body, extra) } // buildSSML builds SSML with rate and pitch for Edge TTS prosody. func buildSSML(text string, voice tts.VoiceConfig, speed, pitch float64) string { voiceID := voice.ID if voiceID == "" { voiceID = DEFAULT_VOICE } lang := voice.Lang if lang == "" { lang = "en-US" } rate := 0 if speed > 0 { rate = int((speed - 1) * 100) } rateStr := fmt.Sprintf("%+d%%", rate) pitchStr := fmt.Sprintf("%+dHz", int(pitch)) return fmt.Sprintf( ``+ `%s`, lang, voiceID, rateStr, pitchStr, escapeSSML(text)) } func escapeSSML(s string) string { s = strings.ReplaceAll(s, "&", "&") s = strings.ReplaceAll(s, "<", "<") s = strings.ReplaceAll(s, ">", ">") s = strings.ReplaceAll(s, "'", "'") s = strings.ReplaceAll(s, "\"", """) return s } // Synthesize sends SSML and synchronously collects all audio data. // It handles the full lifecycle: connect → configure → send → receive → close. func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config tts.AudioConfig) ([]byte, error) { if err := c.Connect(ctx); err != nil { return nil, err } if err := c.Configure(ctx, config); err != nil { return nil, err } c.mu.Lock() conn := c.conn connID := c.connID c.mu.Unlock() if conn == nil { return nil, errors.New("edge tts: not connected") } ssml := buildSSML(text, config.Voice, config.Speed, config.Pitch) extra := map[string]string{ "X-RequestId": connID, "X-Timestamp": time.Now().String(), } if err := c.sendFrame("ssml", "application/ssml+xml", ssml, extra); err != nil { return nil, err } var out []byte for { select { case <-ctx.Done(): return nil, ctx.Err() default: } mt, data, err := conn.ReadMessage() if err != nil { if websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) { if len(out) > 0 { c.mu.Lock() _ = c.resetLocked() c.mu.Unlock() return out, nil } } c.mu.Lock() _ = c.resetLocked() c.mu.Unlock() return nil, fmt.Errorf("edge tts read (format=%q voice=%q): %w", config.Format, config.Voice.ID, err) } switch mt { case websocket.TextMessage: if parsePath(data) == "turn.end" { c.mu.Lock() _ = c.resetLocked() c.mu.Unlock() return out, nil } case websocket.BinaryMessage: audio, err := parseAudioChunk(data) if err != nil { return nil, err } if len(audio) > 0 { out = append(out, audio...) } } } } // parsePath parse Path header from Edge text frame. func parsePath(data []byte) string { idx := bytes.Index(data, []byte("Path:")) if idx < 0 { return "" } lineEnd := bytes.Index(data[idx:], []byte("\r\n")) if lineEnd < 0 { lineEnd = len(data) - idx } pathLine := data[idx+5 : idx+lineEnd] return strings.TrimSpace(string(pathLine)) } // parseAudioChunk parse Edge binary audio frame: the first 2 bytes are the header length (big endian), followed by the header text, and then the audio data. // if the data is too short or the header length is invalid, return nil, nil, not considered an error. func parseAudioChunk(data []byte) ([]byte, error) { if len(data) < 2 { return nil, nil } headerLen := binary.BigEndian.Uint16(data[:2]) audioStart := 2 + int(headerLen) if audioStart > len(data) { return nil, nil } return data[audioStart:], nil } // Stream sends SSML and returns audio chunks via channel. // It handles the full lifecycle: connect → configure → send → stream → close. func (c *EdgeWsClient) Stream(ctx context.Context, text string, config tts.AudioConfig) (ch chan []byte, errCh chan error) { ch = make(chan []byte, 8) errCh = make(chan error, 1) go func() { defer close(ch) defer close(errCh) if err := c.Connect(ctx); err != nil { errCh <- err return } if err := c.Configure(ctx, config); err != nil { errCh <- err return } c.mu.Lock() conn := c.conn connID := c.connID c.mu.Unlock() if conn == nil { errCh <- errors.New("edge tts: not connected") return } ssml := buildSSML(text, config.Voice, config.Speed, config.Pitch) extra := map[string]string{ "X-RequestId": connID, "X-Timestamp": time.Now().String(), } if err := c.sendFrame("ssml", "application/ssml+xml", ssml, extra); err != nil { errCh <- err return } for { select { case <-ctx.Done(): errCh <- ctx.Err() return default: } mt, data, err := conn.ReadMessage() if err != nil { errCh <- fmt.Errorf("edge tts read: %w", err) return } switch mt { case websocket.TextMessage: if parsePath(data) == "turn.end" { c.mu.Lock() _ = c.resetLocked() c.mu.Unlock() return } case websocket.BinaryMessage: audio, err := parseAudioChunk(data) if err != nil { errCh <- err return } if len(audio) > 0 { select { case ch <- audio: case <-ctx.Done(): errCh <- ctx.Err() return } } } } }() return ch, errCh }