fix(text): avoid breaking UTF-8 during truncation

Use rune-aware truncation for user-facing text and log previews so multibyte content is not corrupted in memory context, Telegram messages, or diagnostics.
2026-04-25 07:00:48 +09:00 · 2026-03-08 16:29:38 +08:00
parent 09cdb8c87f
commit 3739def43f
8 changed files with 131 additions and 30 deletions
@@ -1,7 +1,11 @@
 // Package common provides shared utilities for channel adapters.
 package common

-import "strings"
+import (
+	"strings"
+
+	"github.com/memohai/memoh/internal/textutil"
+)

 // SummarizeText returns a truncated preview of the text, limited to 120 characters.
 func SummarizeText(text string) string {
@@ -10,8 +14,5 @@ func SummarizeText(text string) string {
 		return ""
 	}
 	const limit = 120
-	if len(value) <= limit {
-		return value
-	}
-	return value[:limit] + "..."
+	return textutil.TruncateRunesWithSuffix(value, limit, "...")
 }
@@ -21,6 +21,7 @@ import (
 	"github.com/memohai/memoh/internal/channel"
 	"github.com/memohai/memoh/internal/channel/adapters/common"
 	"github.com/memohai/memoh/internal/media"
+	"github.com/memohai/memoh/internal/textutil"
 )

 const (
@@ -1507,16 +1508,7 @@ func sanitizeTelegramText(text string) string {
 // truncateTelegramText truncates text to telegramMaxMessageLength on a valid
 // UTF-8 rune boundary, appending "..." when truncation occurs.
 func truncateTelegramText(text string) string {
-	if len(text) <= telegramMaxMessageLength {
-		return text
-	}
-	const suffix = "..."
-	limit := telegramMaxMessageLength - len(suffix)
-	// Walk backwards to a rune boundary.
-	for limit > 0 && !utf8.RuneStart(text[limit]) {
-		limit--
-	}
-	return text[:limit] + suffix
+	return textutil.TruncateRunesWithSuffix(text, telegramMaxMessageLength, "...")
 }

 // ProcessingStarted sends a "typing" chat action to indicate processing.
@@ -593,22 +593,25 @@ func TestTruncateTelegramText(t *testing.T) {
 	// Over limit with ASCII.
 	over := strings.Repeat("a", telegramMaxMessageLength+100)
 	got := truncateTelegramText(over)
-	if len(got) > telegramMaxMessageLength {
-		t.Fatalf("truncated text should be <= %d bytes: got %d", telegramMaxMessageLength, len(got))
+	if utf8.RuneCountInString(got) > telegramMaxMessageLength {
+		t.Fatalf("truncated text should be <= %d chars: got %d", telegramMaxMessageLength, utf8.RuneCountInString(got))
 	}
 	if !strings.HasSuffix(got, "...") {
 		t.Fatalf("truncated text should end with '...': %q", got[len(got)-10:])
 	}

 	// Over limit with multi-byte characters (Chinese: 3 bytes each).
-	multi := strings.Repeat("\u4f60", telegramMaxMessageLength)
+	multi := strings.Repeat("\u4f60", telegramMaxMessageLength+1)
 	got = truncateTelegramText(multi)
-	if len(got) > telegramMaxMessageLength {
-		t.Fatalf("truncated multi-byte text should be <= %d bytes: got %d", telegramMaxMessageLength, len(got))
+	if utf8.RuneCountInString(got) > telegramMaxMessageLength {
+		t.Fatalf("truncated multi-byte text should be <= %d chars: got %d", telegramMaxMessageLength, utf8.RuneCountInString(got))
 	}
 	if !strings.HasSuffix(got, "...") {
 		t.Fatal("truncated multi-byte text should end with '...'")
 	}
+	if utf8.RuneCountInString(got) != telegramMaxMessageLength {
+		t.Fatalf("truncated multi-byte text should keep exact char budget: got %d", utf8.RuneCountInString(got))
+	}
 	// Verify no broken runes.
 	trimmed := strings.TrimSuffix(got, "...")
 	for i := 0; i < len(trimmed); {
@@ -28,6 +28,7 @@ import (
 	"github.com/memohai/memoh/internal/models"
 	"github.com/memohai/memoh/internal/schedule"
 	"github.com/memohai/memoh/internal/settings"
+	"github.com/memohai/memoh/internal/textutil"
 )

 const (
@@ -1972,10 +1973,7 @@ func nonNilModelMessages(m []conversation.ModelMessage) []conversation.ModelMess
 }

 func truncate(s string, n int) string {
-	if len(s) <= n {
-		return s
-	}
-	return s[:n] + "..."
+	return textutil.TruncateRunesWithSuffix(s, n, "...")
 }

 func parseResolverUUID(id string) (pgtype.UUID, error) {
@@ -19,6 +19,7 @@ import (

 	"github.com/memohai/memoh/internal/db"
 	"github.com/memohai/memoh/internal/db/sqlc"
+	"github.com/memohai/memoh/internal/textutil"
 )

 // OAuthService manages OAuth flows for MCP connections.
@@ -667,10 +668,7 @@ func parseTokenResponse(body []byte) (*tokenResponse, error) {
 }

 func truncate(s string, maxLen int) string {
-	if len(s) <= maxLen {
-		return s
-	}
-	return s[:maxLen] + "..."
+	return textutil.TruncateRunesWithSuffix(s, maxLen, "...")
 }

 func (s *OAuthService) refreshToken(ctx context.Context, tokenEndpoint, refreshToken, clientID, resourceURI string) (*tokenResponse, error) {
@@ -9,6 +9,7 @@ import (

 	"github.com/memohai/memoh/internal/conversation"
 	"github.com/memohai/memoh/internal/mcp"
+	"github.com/memohai/memoh/internal/textutil"
 )

 const (
@@ -379,10 +380,11 @@ func (p *BuiltinProvider) Usage(ctx context.Context, filters map[string]any) (Us

 func truncateSnippet(s string, n int) string {
 	trimmed := strings.TrimSpace(s)
-	if len(trimmed) <= n {
+	truncated := textutil.TruncateRunes(trimmed, n)
+	if truncated == trimmed {
 		return trimmed
 	}
-	return strings.TrimSpace(trimmed[:n]) + "..."
+	return strings.TrimSpace(truncated) + "..."
 }

 func deduplicateItems(items []MemoryItem) []MemoryItem {
@@ -0,0 +1,52 @@
+package textutil
+
+import "unicode/utf8"
+
+// TruncateRunes returns s truncated to at most maxRunes Unicode code points (each invalid UTF-8 byte counts as one); maxRunes <= 0 yields "".
+func TruncateRunes(s string, maxRunes int) string {
+	if maxRunes <= 0 || s == "" {
+		return ""
+	}
+	cut, truncated := byteIndexAfterRunes(s, maxRunes)
+	if !truncated {
+		return s
+	}
+	return s[:cut]
+}
+
+// TruncateRunesWithSuffix returns s unchanged when it has at most maxRunes Unicode code points;
+// otherwise the result, suffix included, has at most maxRunes (a suffix of maxRunes or more code points is omitted).
+func TruncateRunesWithSuffix(s string, maxRunes int, suffix string) string {
+	if maxRunes <= 0 || s == "" {
+		return ""
+	}
+	if _, truncated := byteIndexAfterRunes(s, maxRunes); !truncated {
+		return s
+	}
+	if suffix == "" {
+		return TruncateRunes(s, maxRunes)
+	}
+	suffixRunes := utf8.RuneCountInString(suffix)
+	if suffixRunes >= maxRunes {
+		return TruncateRunes(s, maxRunes)
+	}
+	cut, truncated := byteIndexAfterRunes(s, maxRunes-suffixRunes)
+	if !truncated {
+		return s
+	}
+	return s[:cut] + suffix
+}
+
+func byteIndexAfterRunes(s string, maxRunes int) (int, bool) {
+	if maxRunes <= 0 || s == "" {
+		return 0, len(s) > 0
+	}
+	count := 0
+	for i := range s {
+		if count == maxRunes {
+			return i, true
+		}
+		count++
+	}
+	return len(s), false
+}
@@ -0,0 +1,55 @@
+package textutil
+
+import (
+	"strings"
+	"testing"
+	"unicode/utf8"
+)
+
+func TestTruncateRunes(t *testing.T) {
+	t.Parallel()
+
+	text := "你好世界"
+	got := TruncateRunes(text, 3)
+	if got != "你好世" {
+		t.Fatalf("TruncateRunes() = %q, want %q", got, "你好世")
+	}
+	if !utf8.ValidString(got) {
+		t.Fatalf("TruncateRunes() returned invalid UTF-8: %q", got)
+	}
+}
+
+func TestTruncateRunesWithSuffix(t *testing.T) {
+	t.Parallel()
+
+	text := strings.Repeat("你", 10) + "abc"
+	got := TruncateRunesWithSuffix(text, 8, "...")
+	if utf8.RuneCountInString(got) != 8 {
+		t.Fatalf("TruncateRunesWithSuffix() rune count = %d, want 8", utf8.RuneCountInString(got))
+	}
+	if got != strings.Repeat("你", 5)+"..." {
+		t.Fatalf("TruncateRunesWithSuffix() = %q", got)
+	}
+	if !utf8.ValidString(got) {
+		t.Fatalf("TruncateRunesWithSuffix() returned invalid UTF-8: %q", got)
+	}
+}
+
+func TestTruncateRunesWithSuffixNoTruncation(t *testing.T) {
+	t.Parallel()
+
+	text := "你好世界"
+	if got := TruncateRunesWithSuffix(text, 8, "..."); got != text {
+		t.Fatalf("TruncateRunesWithSuffix() = %q, want %q", got, text)
+	}
+}
+
+func TestTruncateRunesWithSuffixKeepsInvalidUTF8Bytes(t *testing.T) {
+	t.Parallel()
+
+	text := "ab\xffcd"
+	got := TruncateRunesWithSuffix(text, 4, "...")
+	if got != "a..." {
+		t.Fatalf("TruncateRunesWithSuffix() = %q, want %q", got, "a...")
+	}
+}