mirror of
https://github.com/memohai/Memoh.git
synced 2026-04-25 07:00:48 +09:00
fix(text): avoid breaking UTF-8 during truncation
Use rune-aware truncation for user-facing text and log previews so multibyte content is not corrupted in memory context, Telegram messages, or diagnostics.
This commit is contained in:
@@ -1,7 +1,11 @@
|
|||||||
// Package common provides shared utilities for channel adapters.
|
// Package common provides shared utilities for channel adapters.
|
||||||
package common
|
package common
|
||||||
|
|
||||||
import "strings"
|
import (
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/memohai/memoh/internal/textutil"
|
||||||
|
)
|
||||||
|
|
||||||
// SummarizeText returns a truncated preview of the text, limited to 120 characters.
|
// SummarizeText returns a truncated preview of the text, limited to 120 characters.
|
||||||
func SummarizeText(text string) string {
|
func SummarizeText(text string) string {
|
||||||
@@ -10,8 +14,5 @@ func SummarizeText(text string) string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
const limit = 120
|
const limit = 120
|
||||||
if len(value) <= limit {
|
return textutil.TruncateRunesWithSuffix(value, limit, "...")
|
||||||
return value
|
|
||||||
}
|
|
||||||
return value[:limit] + "..."
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ import (
|
|||||||
"github.com/memohai/memoh/internal/channel"
|
"github.com/memohai/memoh/internal/channel"
|
||||||
"github.com/memohai/memoh/internal/channel/adapters/common"
|
"github.com/memohai/memoh/internal/channel/adapters/common"
|
||||||
"github.com/memohai/memoh/internal/media"
|
"github.com/memohai/memoh/internal/media"
|
||||||
|
"github.com/memohai/memoh/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -1507,16 +1508,7 @@ func sanitizeTelegramText(text string) string {
|
|||||||
// truncateTelegramText truncates text to telegramMaxMessageLength on a valid
|
// truncateTelegramText truncates text to telegramMaxMessageLength on a valid
|
||||||
// UTF-8 rune boundary, appending "..." when truncation occurs.
|
// UTF-8 rune boundary, appending "..." when truncation occurs.
|
||||||
func truncateTelegramText(text string) string {
|
func truncateTelegramText(text string) string {
|
||||||
if len(text) <= telegramMaxMessageLength {
|
return textutil.TruncateRunesWithSuffix(text, telegramMaxMessageLength, "...")
|
||||||
return text
|
|
||||||
}
|
|
||||||
const suffix = "..."
|
|
||||||
limit := telegramMaxMessageLength - len(suffix)
|
|
||||||
// Walk backwards to a rune boundary.
|
|
||||||
for limit > 0 && !utf8.RuneStart(text[limit]) {
|
|
||||||
limit--
|
|
||||||
}
|
|
||||||
return text[:limit] + suffix
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ProcessingStarted sends a "typing" chat action to indicate processing.
|
// ProcessingStarted sends a "typing" chat action to indicate processing.
|
||||||
|
|||||||
@@ -593,22 +593,25 @@ func TestTruncateTelegramText(t *testing.T) {
|
|||||||
// Over limit with ASCII.
|
// Over limit with ASCII.
|
||||||
over := strings.Repeat("a", telegramMaxMessageLength+100)
|
over := strings.Repeat("a", telegramMaxMessageLength+100)
|
||||||
got := truncateTelegramText(over)
|
got := truncateTelegramText(over)
|
||||||
if len(got) > telegramMaxMessageLength {
|
if utf8.RuneCountInString(got) > telegramMaxMessageLength {
|
||||||
t.Fatalf("truncated text should be <= %d bytes: got %d", telegramMaxMessageLength, len(got))
|
t.Fatalf("truncated text should be <= %d chars: got %d", telegramMaxMessageLength, utf8.RuneCountInString(got))
|
||||||
}
|
}
|
||||||
if !strings.HasSuffix(got, "...") {
|
if !strings.HasSuffix(got, "...") {
|
||||||
t.Fatalf("truncated text should end with '...': %q", got[len(got)-10:])
|
t.Fatalf("truncated text should end with '...': %q", got[len(got)-10:])
|
||||||
}
|
}
|
||||||
|
|
||||||
// Over limit with multi-byte characters (Chinese: 3 bytes each).
|
// Over limit with multi-byte characters (Chinese: 3 bytes each).
|
||||||
multi := strings.Repeat("\u4f60", telegramMaxMessageLength)
|
multi := strings.Repeat("\u4f60", telegramMaxMessageLength+1)
|
||||||
got = truncateTelegramText(multi)
|
got = truncateTelegramText(multi)
|
||||||
if len(got) > telegramMaxMessageLength {
|
if utf8.RuneCountInString(got) > telegramMaxMessageLength {
|
||||||
t.Fatalf("truncated multi-byte text should be <= %d bytes: got %d", telegramMaxMessageLength, len(got))
|
t.Fatalf("truncated multi-byte text should be <= %d chars: got %d", telegramMaxMessageLength, utf8.RuneCountInString(got))
|
||||||
}
|
}
|
||||||
if !strings.HasSuffix(got, "...") {
|
if !strings.HasSuffix(got, "...") {
|
||||||
t.Fatal("truncated multi-byte text should end with '...'")
|
t.Fatal("truncated multi-byte text should end with '...'")
|
||||||
}
|
}
|
||||||
|
if utf8.RuneCountInString(got) != telegramMaxMessageLength {
|
||||||
|
t.Fatalf("truncated multi-byte text should keep exact char budget: got %d", utf8.RuneCountInString(got))
|
||||||
|
}
|
||||||
// Verify no broken runes.
|
// Verify no broken runes.
|
||||||
trimmed := strings.TrimSuffix(got, "...")
|
trimmed := strings.TrimSuffix(got, "...")
|
||||||
for i := 0; i < len(trimmed); {
|
for i := 0; i < len(trimmed); {
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ import (
|
|||||||
"github.com/memohai/memoh/internal/models"
|
"github.com/memohai/memoh/internal/models"
|
||||||
"github.com/memohai/memoh/internal/schedule"
|
"github.com/memohai/memoh/internal/schedule"
|
||||||
"github.com/memohai/memoh/internal/settings"
|
"github.com/memohai/memoh/internal/settings"
|
||||||
|
"github.com/memohai/memoh/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -1972,10 +1973,7 @@ func nonNilModelMessages(m []conversation.ModelMessage) []conversation.ModelMess
|
|||||||
}
|
}
|
||||||
|
|
||||||
func truncate(s string, n int) string {
|
func truncate(s string, n int) string {
|
||||||
if len(s) <= n {
|
return textutil.TruncateRunesWithSuffix(s, n, "...")
|
||||||
return s
|
|
||||||
}
|
|
||||||
return s[:n] + "..."
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseResolverUUID(id string) (pgtype.UUID, error) {
|
func parseResolverUUID(id string) (pgtype.UUID, error) {
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ import (
|
|||||||
|
|
||||||
"github.com/memohai/memoh/internal/db"
|
"github.com/memohai/memoh/internal/db"
|
||||||
"github.com/memohai/memoh/internal/db/sqlc"
|
"github.com/memohai/memoh/internal/db/sqlc"
|
||||||
|
"github.com/memohai/memoh/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
// OAuthService manages OAuth flows for MCP connections.
|
// OAuthService manages OAuth flows for MCP connections.
|
||||||
@@ -667,10 +668,7 @@ func parseTokenResponse(body []byte) (*tokenResponse, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func truncate(s string, maxLen int) string {
|
func truncate(s string, maxLen int) string {
|
||||||
if len(s) <= maxLen {
|
return textutil.TruncateRunesWithSuffix(s, maxLen, "...")
|
||||||
return s
|
|
||||||
}
|
|
||||||
return s[:maxLen] + "..."
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *OAuthService) refreshToken(ctx context.Context, tokenEndpoint, refreshToken, clientID, resourceURI string) (*tokenResponse, error) {
|
func (s *OAuthService) refreshToken(ctx context.Context, tokenEndpoint, refreshToken, clientID, resourceURI string) (*tokenResponse, error) {
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
|
|
||||||
"github.com/memohai/memoh/internal/conversation"
|
"github.com/memohai/memoh/internal/conversation"
|
||||||
"github.com/memohai/memoh/internal/mcp"
|
"github.com/memohai/memoh/internal/mcp"
|
||||||
|
"github.com/memohai/memoh/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -379,10 +380,11 @@ func (p *BuiltinProvider) Usage(ctx context.Context, filters map[string]any) (Us
|
|||||||
|
|
||||||
func truncateSnippet(s string, n int) string {
|
func truncateSnippet(s string, n int) string {
|
||||||
trimmed := strings.TrimSpace(s)
|
trimmed := strings.TrimSpace(s)
|
||||||
if len(trimmed) <= n {
|
truncated := textutil.TruncateRunes(trimmed, n)
|
||||||
|
if truncated == trimmed {
|
||||||
return trimmed
|
return trimmed
|
||||||
}
|
}
|
||||||
return strings.TrimSpace(trimmed[:n]) + "..."
|
return strings.TrimSpace(truncated) + "..."
|
||||||
}
|
}
|
||||||
|
|
||||||
func deduplicateItems(items []MemoryItem) []MemoryItem {
|
func deduplicateItems(items []MemoryItem) []MemoryItem {
|
||||||
|
|||||||
@@ -0,0 +1,52 @@
|
|||||||
|
package textutil
|
||||||
|
|
||||||
|
import "unicode/utf8"
|
||||||
|
|
||||||
|
// TruncateRunes returns s truncated to at most maxRunes Unicode code points.
|
||||||
|
func TruncateRunes(s string, maxRunes int) string {
|
||||||
|
if maxRunes <= 0 || s == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
cut, truncated := byteIndexAfterRunes(s, maxRunes)
|
||||||
|
if !truncated {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return s[:cut]
|
||||||
|
}
|
||||||
|
|
||||||
|
// TruncateRunesWithSuffix returns s truncated to at most maxRunes Unicode code
// points, appending suffix when truncation occurs.
//
// The suffix counts against the budget: a truncated result holds
// maxRunes-len(suffix) code points of s followed by suffix, so the whole
// result is exactly maxRunes code points long. If suffix is empty, or is at
// least maxRunes code points long (leaving no room for any of s), the result
// is a plain truncation to maxRunes code points with no suffix.
//
// A non-positive maxRunes or an empty s yields "". A string that already fits
// within maxRunes is returned unchanged. As with any range over a string, each
// invalid UTF-8 byte counts as one code point and is never split.
func TruncateRunesWithSuffix(s string, maxRunes int, suffix string) string {
	if maxRunes <= 0 || s == "" {
		return ""
	}
	// Fast path: a string never has more code points than bytes, so a byte
	// length within the limit proves no truncation is needed without scanning.
	if len(s) <= maxRunes {
		return s
	}

	// keep is how many code points of s survive a truncation.
	keep := maxRunes
	if n := utf8.RuneCountInString(suffix); n > 0 && n < maxRunes {
		keep = maxRunes - n
	} else {
		// No suffix, or no room for it: fall back to a plain truncation.
		suffix = ""
	}

	// Single pass: remember the byte offset where the kept prefix ends, and
	// stop as soon as a code point beyond the limit proves truncation is needed.
	cut, count := 0, 0
	for i := range s {
		if count == keep {
			cut = i
		}
		if count == maxRunes {
			return s[:cut] + suffix
		}
		count++
	}
	// s ended at or before maxRunes code points.
	return s
}
|
||||||
|
|
||||||
|
// byteIndexAfterRunes reports the byte offset in s that follows its first
// maxRunes code points, and whether any of s lies beyond that offset.
//
// When s holds maxRunes code points or fewer, it returns (len(s), false).
// A non-positive maxRunes returns offset 0, with the flag set only if s is
// non-empty. Each invalid UTF-8 byte is counted as a single one-byte code point.
func byteIndexAfterRunes(s string, maxRunes int) (int, bool) {
	if s == "" {
		return 0, false
	}
	if maxRunes <= 0 {
		return 0, true
	}

	offset := 0
	for remaining := maxRunes; remaining > 0; remaining-- {
		if offset >= len(s) {
			// Ran out of input before using up the budget.
			return len(s), false
		}
		_, width := utf8.DecodeRuneInString(s[offset:])
		offset += width
	}
	return offset, offset < len(s)
}
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
package textutil
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestTruncateRunes(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
text := "你好世界"
|
||||||
|
got := TruncateRunes(text, 3)
|
||||||
|
if got != "你好世" {
|
||||||
|
t.Fatalf("TruncateRunes() = %q, want %q", got, "你好世")
|
||||||
|
}
|
||||||
|
if !utf8.ValidString(got) {
|
||||||
|
t.Fatalf("TruncateRunes() returned invalid UTF-8: %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTruncateRunesWithSuffix(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
text := strings.Repeat("你", 10) + "abc"
|
||||||
|
got := TruncateRunesWithSuffix(text, 8, "...")
|
||||||
|
if utf8.RuneCountInString(got) != 8 {
|
||||||
|
t.Fatalf("TruncateRunesWithSuffix() rune count = %d, want 8", utf8.RuneCountInString(got))
|
||||||
|
}
|
||||||
|
if got != strings.Repeat("你", 5)+"..." {
|
||||||
|
t.Fatalf("TruncateRunesWithSuffix() = %q", got)
|
||||||
|
}
|
||||||
|
if !utf8.ValidString(got) {
|
||||||
|
t.Fatalf("TruncateRunesWithSuffix() returned invalid UTF-8: %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTruncateRunesWithSuffixNoTruncation(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
text := "你好世界"
|
||||||
|
if got := TruncateRunesWithSuffix(text, 8, "..."); got != text {
|
||||||
|
t.Fatalf("TruncateRunesWithSuffix() = %q, want %q", got, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTruncateRunesWithSuffixKeepsInvalidUTF8Bytes(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
text := "ab\xffcd"
|
||||||
|
got := TruncateRunesWithSuffix(text, 4, "...")
|
||||||
|
if got != "a..." {
|
||||||
|
t.Fatalf("TruncateRunesWithSuffix() = %q, want %q", got, "a...")
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user