mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-25 07:00:48 +09:00
fix(text): avoid breaking UTF-8 during truncation
Use rune-aware truncation for user-facing text and log previews so multibyte content is not corrupted in memory context, Telegram messages, or diagnostics.
This commit is contained in:
@@ -1,7 +1,11 @@
|
||||
// Package common provides shared utilities for channel adapters.
|
||||
package common
|
||||
|
||||
import "strings"
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/memohai/memoh/internal/textutil"
|
||||
)
|
||||
|
||||
// SummarizeText returns a truncated preview of the text, limited to 120 characters.
|
||||
func SummarizeText(text string) string {
|
||||
@@ -10,8 +14,5 @@ func SummarizeText(text string) string {
|
||||
return ""
|
||||
}
|
||||
const limit = 120
|
||||
if len(value) <= limit {
|
||||
return value
|
||||
}
|
||||
return value[:limit] + "..."
|
||||
return textutil.TruncateRunesWithSuffix(value, limit, "...")
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ import (
|
||||
"github.com/memohai/memoh/internal/channel"
|
||||
"github.com/memohai/memoh/internal/channel/adapters/common"
|
||||
"github.com/memohai/memoh/internal/media"
|
||||
"github.com/memohai/memoh/internal/textutil"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -1507,16 +1508,7 @@ func sanitizeTelegramText(text string) string {
|
||||
// truncateTelegramText truncates text to telegramMaxMessageLength on a valid
|
||||
// UTF-8 rune boundary, appending "..." when truncation occurs.
|
||||
func truncateTelegramText(text string) string {
|
||||
if len(text) <= telegramMaxMessageLength {
|
||||
return text
|
||||
}
|
||||
const suffix = "..."
|
||||
limit := telegramMaxMessageLength - len(suffix)
|
||||
// Walk backwards to a rune boundary.
|
||||
for limit > 0 && !utf8.RuneStart(text[limit]) {
|
||||
limit--
|
||||
}
|
||||
return text[:limit] + suffix
|
||||
return textutil.TruncateRunesWithSuffix(text, telegramMaxMessageLength, "...")
|
||||
}
|
||||
|
||||
// ProcessingStarted sends a "typing" chat action to indicate processing.
|
||||
|
||||
@@ -593,22 +593,25 @@ func TestTruncateTelegramText(t *testing.T) {
|
||||
// Over limit with ASCII.
|
||||
over := strings.Repeat("a", telegramMaxMessageLength+100)
|
||||
got := truncateTelegramText(over)
|
||||
if len(got) > telegramMaxMessageLength {
|
||||
t.Fatalf("truncated text should be <= %d bytes: got %d", telegramMaxMessageLength, len(got))
|
||||
if utf8.RuneCountInString(got) > telegramMaxMessageLength {
|
||||
t.Fatalf("truncated text should be <= %d chars: got %d", telegramMaxMessageLength, utf8.RuneCountInString(got))
|
||||
}
|
||||
if !strings.HasSuffix(got, "...") {
|
||||
t.Fatalf("truncated text should end with '...': %q", got[len(got)-10:])
|
||||
}
|
||||
|
||||
// Over limit with multi-byte characters (Chinese: 3 bytes each).
|
||||
multi := strings.Repeat("\u4f60", telegramMaxMessageLength)
|
||||
multi := strings.Repeat("\u4f60", telegramMaxMessageLength+1)
|
||||
got = truncateTelegramText(multi)
|
||||
if len(got) > telegramMaxMessageLength {
|
||||
t.Fatalf("truncated multi-byte text should be <= %d bytes: got %d", telegramMaxMessageLength, len(got))
|
||||
if utf8.RuneCountInString(got) > telegramMaxMessageLength {
|
||||
t.Fatalf("truncated multi-byte text should be <= %d chars: got %d", telegramMaxMessageLength, utf8.RuneCountInString(got))
|
||||
}
|
||||
if !strings.HasSuffix(got, "...") {
|
||||
t.Fatal("truncated multi-byte text should end with '...'")
|
||||
}
|
||||
if utf8.RuneCountInString(got) != telegramMaxMessageLength {
|
||||
t.Fatalf("truncated multi-byte text should keep exact char budget: got %d", utf8.RuneCountInString(got))
|
||||
}
|
||||
// Verify no broken runes.
|
||||
trimmed := strings.TrimSuffix(got, "...")
|
||||
for i := 0; i < len(trimmed); {
|
||||
|
||||
@@ -28,6 +28,7 @@ import (
|
||||
"github.com/memohai/memoh/internal/models"
|
||||
"github.com/memohai/memoh/internal/schedule"
|
||||
"github.com/memohai/memoh/internal/settings"
|
||||
"github.com/memohai/memoh/internal/textutil"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -1972,10 +1973,7 @@ func nonNilModelMessages(m []conversation.ModelMessage) []conversation.ModelMess
|
||||
}
|
||||
|
||||
func truncate(s string, n int) string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
return s[:n] + "..."
|
||||
return textutil.TruncateRunesWithSuffix(s, n, "...")
|
||||
}
|
||||
|
||||
func parseResolverUUID(id string) (pgtype.UUID, error) {
|
||||
|
||||
@@ -19,6 +19,7 @@ import (
|
||||
|
||||
"github.com/memohai/memoh/internal/db"
|
||||
"github.com/memohai/memoh/internal/db/sqlc"
|
||||
"github.com/memohai/memoh/internal/textutil"
|
||||
)
|
||||
|
||||
// OAuthService manages OAuth flows for MCP connections.
|
||||
@@ -667,10 +668,7 @@ func parseTokenResponse(body []byte) (*tokenResponse, error) {
|
||||
}
|
||||
|
||||
func truncate(s string, maxLen int) string {
|
||||
if len(s) <= maxLen {
|
||||
return s
|
||||
}
|
||||
return s[:maxLen] + "..."
|
||||
return textutil.TruncateRunesWithSuffix(s, maxLen, "...")
|
||||
}
|
||||
|
||||
func (s *OAuthService) refreshToken(ctx context.Context, tokenEndpoint, refreshToken, clientID, resourceURI string) (*tokenResponse, error) {
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
|
||||
"github.com/memohai/memoh/internal/conversation"
|
||||
"github.com/memohai/memoh/internal/mcp"
|
||||
"github.com/memohai/memoh/internal/textutil"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -379,10 +380,11 @@ func (p *BuiltinProvider) Usage(ctx context.Context, filters map[string]any) (Us
|
||||
|
||||
func truncateSnippet(s string, n int) string {
|
||||
trimmed := strings.TrimSpace(s)
|
||||
if len(trimmed) <= n {
|
||||
truncated := textutil.TruncateRunes(trimmed, n)
|
||||
if truncated == trimmed {
|
||||
return trimmed
|
||||
}
|
||||
return strings.TrimSpace(trimmed[:n]) + "..."
|
||||
return strings.TrimSpace(truncated) + "..."
|
||||
}
|
||||
|
||||
func deduplicateItems(items []MemoryItem) []MemoryItem {
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
package textutil
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// TruncateRunes returns s truncated to at most maxRunes Unicode code points.
|
||||
func TruncateRunes(s string, maxRunes int) string {
|
||||
if maxRunes <= 0 || s == "" {
|
||||
return ""
|
||||
}
|
||||
cut, truncated := byteIndexAfterRunes(s, maxRunes)
|
||||
if !truncated {
|
||||
return s
|
||||
}
|
||||
return s[:cut]
|
||||
}
|
||||
|
||||
// TruncateRunesWithSuffix returns s limited to at most maxRunes Unicode code
// points, with suffix counted against that budget when truncation occurs.
//
// s is returned unchanged when it already fits. An empty s or a non-positive
// maxRunes yields "". When suffix is empty, or is itself at least maxRunes
// runes long, there is no room for it and the result is a plain cut at
// maxRunes with no suffix appended. Each byte of invalid UTF-8 counts as one
// rune, matching how a range loop walks a string.
func TruncateRunesWithSuffix(s string, maxRunes int, suffix string) string {
	if maxRunes <= 0 || s == "" {
		return ""
	}

	// Decide how many runes of s survive a truncation. A suffix that cannot
	// fit inside the budget is dropped rather than overflowing it.
	keepRunes := maxRunes
	tail := ""
	if n := utf8.RuneCountInString(suffix); n < maxRunes {
		keepRunes = maxRunes - n
		tail = suffix
	}

	// Single pass: remember the byte offset where the kept prefix ends, and
	// only truncate once a rune beyond the maxRunes budget is actually seen.
	cut := len(s)
	seen := 0
	for i := range s {
		if seen == keepRunes {
			cut = i
		}
		if seen == maxRunes {
			return s[:cut] + tail
		}
		seen++
	}
	return s
}
|
||||
|
||||
// byteIndexAfterRunes reports the byte offset in s that follows its first
// maxRunes runes, and whether any of s lies beyond that offset. When s holds
// maxRunes runes or fewer it returns len(s) and false. A non-positive maxRunes
// returns offset 0, flagged as truncating only if s is non-empty.
func byteIndexAfterRunes(s string, maxRunes int) (int, bool) {
	if s == "" {
		return 0, false
	}
	if maxRunes <= 0 {
		return 0, true
	}

	remaining := maxRunes
	for offset := range s {
		// A rune starting after the budget is spent marks the cut point.
		if remaining == 0 {
			return offset, true
		}
		remaining--
	}
	return len(s), false
}
|
||||
@@ -0,0 +1,55 @@
|
||||
package textutil
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func TestTruncateRunes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
text := "你好世界"
|
||||
got := TruncateRunes(text, 3)
|
||||
if got != "你好世" {
|
||||
t.Fatalf("TruncateRunes() = %q, want %q", got, "你好世")
|
||||
}
|
||||
if !utf8.ValidString(got) {
|
||||
t.Fatalf("TruncateRunes() returned invalid UTF-8: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTruncateRunesWithSuffix(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
text := strings.Repeat("你", 10) + "abc"
|
||||
got := TruncateRunesWithSuffix(text, 8, "...")
|
||||
if utf8.RuneCountInString(got) != 8 {
|
||||
t.Fatalf("TruncateRunesWithSuffix() rune count = %d, want 8", utf8.RuneCountInString(got))
|
||||
}
|
||||
if got != strings.Repeat("你", 5)+"..." {
|
||||
t.Fatalf("TruncateRunesWithSuffix() = %q", got)
|
||||
}
|
||||
if !utf8.ValidString(got) {
|
||||
t.Fatalf("TruncateRunesWithSuffix() returned invalid UTF-8: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTruncateRunesWithSuffixNoTruncation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
text := "你好世界"
|
||||
if got := TruncateRunesWithSuffix(text, 8, "..."); got != text {
|
||||
t.Fatalf("TruncateRunesWithSuffix() = %q, want %q", got, text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTruncateRunesWithSuffixKeepsInvalidUTF8Bytes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
text := "ab\xffcd"
|
||||
got := TruncateRunesWithSuffix(text, 4, "...")
|
||||
if got != "a..." {
|
||||
t.Fatalf("TruncateRunesWithSuffix() = %q, want %q", got, "a...")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user